public inbox for libc-alpha@sourceware.org
* [PATCH v1 1/3] Bench: Expand bench-memchr.c
@ 2021-05-03  8:44 Noah Goldstein
  2021-05-03  8:44 ` [PATCH v1 2/3] x86: Optimize memchr-avx2.S Noah Goldstein
                   ` (2 more replies)
  0 siblings, 3 replies; 20+ messages in thread
From: Noah Goldstein @ 2021-05-03  8:44 UTC (permalink / raw)
  To: libc-alpha

No bug. This commit adds some additional cases to bench-memchr.c,
including medium sizes and short lengths with both an in-bounds match
and an out-of-bounds match.
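
For context, the two short-length scenarios can be pictured directly
in terms of memchr itself: one where the first occurrence of the byte
lies within the searched length, and one where the only occurrence
sits just past it, so memchr must return NULL. A minimal standalone
sketch of the two cases (illustration only, not part of the patch):

#include <assert.h>
#include <string.h>

int
main (void)
{
  char buf[64] = { 0 };

  /* In-bounds match: the byte sits before the length limit, so
     memchr must return its address.  */
  buf[10] = 23;
  assert (memchr (buf, 23, 16) == buf + 10);

  /* Out-of-bounds match: the only occurrence sits just past the
     length limit, so memchr must return NULL even though the byte
     is present in the buffer.  */
  memset (buf, 0, sizeof buf);
  buf[16] = 23;
  assert (memchr (buf, 23, 16) == NULL);

  return 0;
}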

Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
---    
 benchtests/bench-memchr.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/benchtests/bench-memchr.c b/benchtests/bench-memchr.c
index f5ced9d80d..5573f93312 100644
--- a/benchtests/bench-memchr.c
+++ b/benchtests/bench-memchr.c
@@ -135,12 +135,25 @@ test_main (void)
       do_test (i, i, 256, 0);
 #endif
     }
+  for (i = 1; i < 8; ++i)
+    {
+      do_test (i, i << 5, 192, 23);
+      do_test (i, i << 5, 192, 0);
+      do_test (i, i << 5, 256, 23);
+      do_test (i, i << 5, 256, 0);
+      do_test (i, i << 5, 512, 23);
+      do_test (i, i << 5, 512, 0);
+    }
   for (i = 1; i < 32; ++i)
     {
       do_test (0, i, i + 1, 23);
       do_test (0, i, i + 1, 0);
       do_test (i, i, i + 1, 23);
       do_test (i, i, i + 1, 0);
+      do_test (0, i, i - 1, 23);
+      do_test (0, i, i - 1, 0);
+      do_test (i, i, i - 1, 23);
+      do_test (i, i, i - 1, 0);
 #ifdef USE_AS_MEMRCHR
       /* Also test the position close to the beginning for memrchr.  */
       do_test (0, 1, i + 1, 23);
-- 
2.29.2



* [PATCH v1 2/3] x86: Optimize memchr-avx2.S
  2021-05-03  8:44 [PATCH v1 1/3] Bench: Expand bench-memchr.c Noah Goldstein
@ 2021-05-03  8:44 ` Noah Goldstein
  2021-05-03 18:50   ` H.J. Lu
                     ` (2 more replies)
  2021-05-03  8:44 ` [PATCH v1 3/3] x86: Optimize memchr-evex.S Noah Goldstein
  2021-05-03 17:17 ` [PATCH v1 1/3] Bench: Expand bench-memchr.c H.J. Lu
  2 siblings, 3 replies; 20+ messages in thread
From: Noah Goldstein @ 2021-05-03  8:44 UTC (permalink / raw)
  To: libc-alpha

No bug. This commit optimizes memchr-avx2.S. The optimizations include
replacing some branches with cmovcc, avoiding some branches entirely
in the less_4x_vec case, making the page cross logic less strict, and
saving a few instructions in the loop return path. test-memchr,
test-rawmemchr, and test-wmemchr are all passing.
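
As a rough C model of the "less strict" page cross check (a sketch of
the idea only, not the committed assembly): the first unaligned
32-byte load only needs the slow path when it would actually run off
the current 4096-byte page, rather than whenever the pointer falls in
the upper half of a 2 * VEC_SIZE window as before:

#include <stdbool.h>
#include <stdint.h>

#define VEC_SIZE  32
#define PAGE_SIZE 4096

/* Old check: conservative; fires for many addresses that are
   perfectly safe to load VEC_SIZE bytes from.  */
bool
may_cross_page_old (const void *p)
{
  return ((uintptr_t) p & (2 * VEC_SIZE - 1)) > VEC_SIZE;
}

/* New check: only fires when a VEC_SIZE-byte load starting at P
   would actually touch the next page.  */
bool
may_cross_page_new (const void *p)
{
  return ((uintptr_t) p & (PAGE_SIZE - 1)) > PAGE_SIZE - VEC_SIZE;
}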

Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
---
 sysdeps/x86_64/multiarch/memchr-avx2.S | 446 +++++++++++++++----------
 1 file changed, 262 insertions(+), 184 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
index 1fcb1c350f..8368fcd1e1 100644
--- a/sysdeps/x86_64/multiarch/memchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
@@ -26,8 +26,22 @@
 
 # ifdef USE_AS_WMEMCHR
 #  define VPCMPEQ	vpcmpeqd
+#  define VPBROADCAST	vpbroadcastd
+#  define CHAR_SIZE	4
 # else
 #  define VPCMPEQ	vpcmpeqb
+#  define VPBROADCAST	vpbroadcastb
+#  define CHAR_SIZE	1
+# endif
+
+# ifdef USE_AS_RAWMEMCHR
+#  define ERAW_PTR_REG	ecx
+#  define RRAW_PTR_REG	rcx
+#  define ALGN_PTR_REG	rdi
+# else
+#  define ERAW_PTR_REG	edi
+#  define RRAW_PTR_REG	rdi
+#  define ALGN_PTR_REG	rcx
 # endif
 
 # ifndef VZEROUPPER
@@ -39,303 +53,367 @@
 # endif
 
 # define VEC_SIZE 32
+# define PAGE_SIZE 4096
+
 
 	.section SECTION(.text),"ax",@progbits
-ENTRY (MEMCHR)
+ENTRY(MEMCHR)
 # ifndef USE_AS_RAWMEMCHR
 	/* Check for zero length.  */
 	test	%RDX_LP, %RDX_LP
 	jz	L(null)
 # endif
-	movl	%edi, %ecx
-	/* Broadcast CHAR to YMM0.  */
-	vmovd	%esi, %xmm0
 # ifdef USE_AS_WMEMCHR
 	shl	$2, %RDX_LP
-	vpbroadcastd %xmm0, %ymm0
 # else
 #  ifdef __ILP32__
 	/* Clear the upper 32 bits.  */
 	movl	%edx, %edx
 #  endif
-	vpbroadcastb %xmm0, %ymm0
 # endif
-	/* Check if we may cross page boundary with one vector load.  */
-	andl	$(2 * VEC_SIZE - 1), %ecx
-	cmpl	$VEC_SIZE, %ecx
-	ja	L(cros_page_boundary)
+	/* Broadcast CHAR to YMMMATCH.  */
+	vmovd	%esi, %xmm0
+	VPBROADCAST %xmm0, %ymm0
+	/* Check if we may cross page boundary with one
+	   vector load.  */
+	movl	%edi, %eax
+	andl	$(PAGE_SIZE - 1), %eax
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	ja	L(cross_page_boundary)
 
 	/* Check the first VEC_SIZE bytes.  */
-	VPCMPEQ (%rdi), %ymm0, %ymm1
+	VPCMPEQ	(%rdi), %ymm0, %ymm1
 	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-
 # ifndef USE_AS_RAWMEMCHR
-	jnz	L(first_vec_x0_check)
-	/* Adjust length and check the end of data.  */
-	subq	$VEC_SIZE, %rdx
-	jbe	L(zero)
-# else
-	jnz	L(first_vec_x0)
+	/* If length < CHAR_PER_VEC handle special.  */
+	cmpq	$VEC_SIZE, %rdx
+	jbe	L(first_vec_x0)
 # endif
-
-	/* Align data for aligned loads in the loop.  */
-	addq	$VEC_SIZE, %rdi
-	andl	$(VEC_SIZE - 1), %ecx
-	andq	$-VEC_SIZE, %rdi
+	testl	%eax, %eax
+	jz	L(aligned_more)
+	tzcntl	%eax, %eax
+	addq	%rdi, %rax
+	VZEROUPPER_RETURN
 
 # ifndef USE_AS_RAWMEMCHR
-	/* Adjust length.  */
-	addq	%rcx, %rdx
-
-	subq	$(VEC_SIZE * 4), %rdx
-	jbe	L(last_4x_vec_or_less)
+	.p2align 5
+L(first_vec_x0):
+	/* Check if first match was before length.  */
+	tzcntl	%eax, %eax
+	xorl	%ecx, %ecx
+	cmpl	%eax, %edx
+	leaq	(%rdi, %rax), %rax
+	cmovle	%rcx, %rax
+	VZEROUPPER_RETURN
+L(null):
+	xorl	%eax, %eax
+	ret
 # endif
-	jmp	L(more_4x_vec)
-
 	.p2align 4
-L(cros_page_boundary):
-	andl	$(VEC_SIZE - 1), %ecx
-	andq	$-VEC_SIZE, %rdi
-	VPCMPEQ (%rdi), %ymm0, %ymm1
+L(cross_page_boundary):
+	/* Save pointer before aligning as its original
+	   value is necessary for computing the return address if the
+	   byte is found, or for adjusting the length if it is not and
+	   this is memchr.  */
+	movq	%rdi, %rcx
+	/* Align data to VEC_SIZE - 1. ALGN_PTR_REG is
+	   rcx for memchr and rdi for rawmemchr.  */
+	orq	$(VEC_SIZE - 1), %ALGN_PTR_REG
+	VPCMPEQ	-(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1
 	vpmovmskb %ymm1, %eax
+# ifndef USE_AS_RAWMEMCHR
+	/* Calculate length until end of page (length
+	   checked for a match).  */
+	leaq	1(%ALGN_PTR_REG), %rsi
+	subq	%RRAW_PTR_REG, %rsi
+# endif
 	/* Remove the leading bytes.  */
-	sarl	%cl, %eax
-	testl	%eax, %eax
-	jz	L(aligned_more)
-	tzcntl	%eax, %eax
+	sarxl	%ERAW_PTR_REG, %eax, %eax
 # ifndef USE_AS_RAWMEMCHR
 	/* Check the end of data.  */
-	cmpq	%rax, %rdx
-	jbe	L(zero)
+	cmpq	%rsi, %rdx
+	jbe	L(first_vec_x0)
 # endif
-	addq	%rdi, %rax
-	addq	%rcx, %rax
+	testl	%eax, %eax
+	jz	L(cross_page_continue)
+	tzcntl	%eax, %eax
+	addq	%RRAW_PTR_REG, %rax
 L(return_vzeroupper):
 	ZERO_UPPER_VEC_REGISTERS_RETURN
 
 	.p2align 4
-L(aligned_more):
-# ifndef USE_AS_RAWMEMCHR
-        /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)"
-	   instead of "(rdx + rcx) - VEC_SIZE" to void possible addition
-	   overflow.  */
-	negq	%rcx
-	addq	$VEC_SIZE, %rcx
+L(first_vec_x1):
+	tzcntl	%eax, %eax
+	incq	%rdi
+	addq	%rdi, %rax
+	VZEROUPPER_RETURN
 
-	/* Check the end of data.  */
-	subq	%rcx, %rdx
-	jbe	L(zero)
-# endif
+	.p2align 4
+L(first_vec_x2):
+	tzcntl	%eax, %eax
+	addq	$(VEC_SIZE + 1), %rdi
+	addq	%rdi, %rax
+	VZEROUPPER_RETURN
 
-	addq	$VEC_SIZE, %rdi
+	.p2align 4
+L(first_vec_x3):
+	tzcntl	%eax, %eax
+	addq	$(VEC_SIZE * 2 + 1), %rdi
+	addq	%rdi, %rax
+	VZEROUPPER_RETURN
 
-# ifndef USE_AS_RAWMEMCHR
-	subq	$(VEC_SIZE * 4), %rdx
-	jbe	L(last_4x_vec_or_less)
-# endif
 
-L(more_4x_vec):
-	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
-	   since data is only aligned to VEC_SIZE.  */
-	VPCMPEQ (%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x0)
+	.p2align 4
+L(first_vec_x4):
+	tzcntl	%eax, %eax
+	addq	$(VEC_SIZE * 3 + 1), %rdi
+	addq	%rdi, %rax
+	VZEROUPPER_RETURN
 
-	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
+	.p2align 4
+L(aligned_more):
+	/* Check the first 4 * VEC_SIZE.  Only one
+	   VEC_SIZE at a time since data is only aligned to
+	   VEC_SIZE.  */
+
+# ifndef USE_AS_RAWMEMCHR
+L(cross_page_continue):
+	/* Align data to VEC_SIZE - 1.  */
+	xorl	%ecx, %ecx
+	subl	%edi, %ecx
+	orq	$(VEC_SIZE - 1), %rdi
+	/* esi is for adjusting length to see if near the
+	   end.  */
+	leal	(VEC_SIZE * 4 + 1)(%rdi, %rcx), %esi
+# else
+	orq	$(VEC_SIZE - 1), %rdi
+L(cross_page_continue):
+# endif
+	/* Load first VEC regardless.  */
+	VPCMPEQ	1(%rdi), %ymm0, %ymm1
 	vpmovmskb %ymm1, %eax
+# ifndef USE_AS_RAWMEMCHR
+	/* Adjust length. If near end handle specially.
+	 */
+	subq	%rsi, %rdx
+	jbe	L(last_4x_vec_or_less)
+# endif
 	testl	%eax, %eax
 	jnz	L(first_vec_x1)
 
-	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
+	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
 	vpmovmskb %ymm1, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x2)
 
-	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
+	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
 	vpmovmskb %ymm1, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x3)
 
-	addq	$(VEC_SIZE * 4), %rdi
+	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x4)
 
 # ifndef USE_AS_RAWMEMCHR
+	/* Check if at last VEC_SIZE * 4 length.  */
 	subq	$(VEC_SIZE * 4), %rdx
-	jbe	L(last_4x_vec_or_less)
-# endif
-
-	/* Align data to 4 * VEC_SIZE.  */
-	movq	%rdi, %rcx
-	andl	$(4 * VEC_SIZE - 1), %ecx
-	andq	$-(4 * VEC_SIZE), %rdi
-
-# ifndef USE_AS_RAWMEMCHR
-	/* Adjust length.  */
+	jbe	L(last_4x_vec_or_less_cmpeq)
+	/* Align data to VEC_SIZE * 4 - 1 for the  loop
+	   and readjust length.  */
+	incq	%rdi
+	movl	%edi, %ecx
+	orq	$(VEC_SIZE * 4 - 1), %rdi
+	andl	$(VEC_SIZE * 4 - 1), %ecx
 	addq	%rcx, %rdx
+# else
+	/* Align data to VEC_SIZE * 4 - 1 for loop.  */
+	incq	%rdi
+	orq	$(VEC_SIZE * 4 - 1), %rdi
 # endif
 
+	/* Compare 4 * VEC at a time forward.  */
 	.p2align 4
 L(loop_4x_vec):
-	/* Compare 4 * VEC at a time forward.  */
-	VPCMPEQ (%rdi), %ymm0, %ymm1
-	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm2
-	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm3
-	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm4
-
+	VPCMPEQ	1(%rdi), %ymm0, %ymm1
+	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm2
+	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm3
+	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm4
 	vpor	%ymm1, %ymm2, %ymm5
 	vpor	%ymm3, %ymm4, %ymm6
 	vpor	%ymm5, %ymm6, %ymm5
 
-	vpmovmskb %ymm5, %eax
-	testl	%eax, %eax
-	jnz	L(4x_vec_end)
-
-	addq	$(VEC_SIZE * 4), %rdi
-
+	vpmovmskb %ymm5, %ecx
 # ifdef USE_AS_RAWMEMCHR
-	jmp	L(loop_4x_vec)
+	subq	$-(VEC_SIZE * 4), %rdi
+	testl	%ecx, %ecx
+	jz	L(loop_4x_vec)
 # else
-	subq	$(VEC_SIZE * 4), %rdx
-	ja	L(loop_4x_vec)
+	testl	%ecx, %ecx
+	jnz	L(loop_4x_vec_end)
 
-L(last_4x_vec_or_less):
-	/* Less than 4 * VEC and aligned to VEC_SIZE.  */
-	addl	$(VEC_SIZE * 2), %edx
-	jle	L(last_2x_vec)
+	subq	$-(VEC_SIZE * 4), %rdi
 
-	VPCMPEQ (%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x0)
+	subq	$(VEC_SIZE * 4), %rdx
+	ja	L(loop_4x_vec)
 
-	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
+	/* Fall through into less than 4 remaining
+	   vectors of length case.  */
+	VPCMPEQ	(VEC_SIZE * 0 + 1)(%rdi), %ymm0, %ymm1
 	vpmovmskb %ymm1, %eax
+	.p2align 4
+L(last_4x_vec_or_less):
+	/* Check if first VEC contained match.  */
 	testl	%eax, %eax
-	jnz	L(first_vec_x1)
+	jnz	L(first_vec_x1_check)
 
-	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
+	/* If remaining length > VEC_SIZE * 2.  */
+	addl	$(VEC_SIZE * 2), %edx
+	jg	L(last_4x_vec)
 
-	jnz	L(first_vec_x2_check)
-	subl	$VEC_SIZE, %edx
-	jle	L(zero)
+L(last_2x_vec):
+	/* If remaining length < VEC_SIZE.  */
+	addl	$VEC_SIZE, %edx
+	jle	L(zero_end)
 
-	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
+	/* Check VEC2 and compare any match with
+	   remaining length.  */
+	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
 	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-
-	jnz	L(first_vec_x3_check)
-	xorl	%eax, %eax
+	tzcntl	%eax, %eax
+	cmpl	%eax, %edx
+	jbe	L(set_zero_end)
+	addq	$(VEC_SIZE + 1), %rdi
+	addq	%rdi, %rax
+L(zero_end):
 	VZEROUPPER_RETURN
 
 	.p2align 4
-L(last_2x_vec):
-	addl	$(VEC_SIZE * 2), %edx
-	VPCMPEQ (%rdi), %ymm0, %ymm1
+L(loop_4x_vec_end):
+# endif
+	/* rawmemchr will fall through into this if match
+	   was found in loop.  */
+
 	vpmovmskb %ymm1, %eax
 	testl	%eax, %eax
+	jnz	L(last_vec_x1_return)
 
-	jnz	L(first_vec_x0_check)
-	subl	$VEC_SIZE, %edx
-	jle	L(zero)
-
-	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
+	vpmovmskb %ymm2, %eax
 	testl	%eax, %eax
-	jnz	L(first_vec_x1_check)
-	xorl	%eax, %eax
-	VZEROUPPER_RETURN
+	jnz	L(last_vec_x2_return)
 
-	.p2align 4
-L(first_vec_x0_check):
-	tzcntl	%eax, %eax
-	/* Check the end of data.  */
-	cmpq	%rax, %rdx
-	jbe	L(zero)
+	vpmovmskb %ymm3, %eax
+	/* Combine VEC3 matches (eax) with VEC4 matches
+	   (ecx).  */
+	salq	$32, %rcx
+	orq	%rcx, %rax
+	tzcntq	%rax, %rax
+# ifdef USE_AS_RAWMEMCHR
+	subq	$(VEC_SIZE * 2 - 1), %rdi
+# else
+	subq	$-(VEC_SIZE * 2 + 1), %rdi
+# endif
 	addq	%rdi, %rax
 	VZEROUPPER_RETURN
+# ifndef USE_AS_RAWMEMCHR
 
 	.p2align 4
 L(first_vec_x1_check):
 	tzcntl	%eax, %eax
-	/* Check the end of data.  */
-	cmpq	%rax, %rdx
-	jbe	L(zero)
-	addq	$VEC_SIZE, %rax
+	/* Adjust length.  */
+	subl	$-(VEC_SIZE * 4), %edx
+	/* Check if match within remaining length.  */
+	cmpl	%eax, %edx
+	jbe	L(set_zero_end)
+	incq	%rdi
 	addq	%rdi, %rax
 	VZEROUPPER_RETURN
+	.p2align 4
+L(set_zero_end):
+	xorl	%eax, %eax
+	VZEROUPPER_RETURN
+# endif
 
 	.p2align 4
-L(first_vec_x2_check):
+L(last_vec_x1_return):
 	tzcntl	%eax, %eax
-	/* Check the end of data.  */
-	cmpq	%rax, %rdx
-	jbe	L(zero)
-	addq	$(VEC_SIZE * 2), %rax
+# ifdef USE_AS_RAWMEMCHR
+	subq	$(VEC_SIZE * 4 - 1), %rdi
+# else
+	incq	%rdi
+# endif
 	addq	%rdi, %rax
 	VZEROUPPER_RETURN
 
 	.p2align 4
-L(first_vec_x3_check):
+L(last_vec_x2_return):
 	tzcntl	%eax, %eax
-	/* Check the end of data.  */
-	cmpq	%rax, %rdx
-	jbe	L(zero)
-	addq	$(VEC_SIZE * 3), %rax
+# ifdef USE_AS_RAWMEMCHR
+	subq	$(VEC_SIZE * 3 - 1), %rdi
+# else
+	subq	$-(VEC_SIZE + 1), %rdi
+# endif
 	addq	%rdi, %rax
 	VZEROUPPER_RETURN
 
+# ifndef USE_AS_RAWMEMCHR
 	.p2align 4
-L(zero):
-	xorl	%eax, %eax
-	jmp     L(return_vzeroupper)
+L(last_4x_vec_or_less_cmpeq):
+	VPCMPEQ	(VEC_SIZE * 4 + 1)(%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
+	subq	$-(VEC_SIZE * 4), %rdi
+	/* Check first VEC regardless.  */
+	testl	%eax, %eax
+	jnz	L(first_vec_x1_check)
 
+	/* If remaining length <= CHAR_PER_VEC * 2.  */
+	addl	$(VEC_SIZE * 2), %edx
+	jle	L(last_2x_vec)
 	.p2align 4
-L(null):
-	xorl	%eax, %eax
-	ret
-# endif
+L(last_4x_vec):
+	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x2_return)
 
-	.p2align 4
-L(first_vec_x0):
-	tzcntl	%eax, %eax
-	addq	%rdi, %rax
-	VZEROUPPER_RETURN
+	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
 
-	.p2align 4
-L(first_vec_x1):
-	tzcntl	%eax, %eax
-	addq	$VEC_SIZE, %rax
-	addq	%rdi, %rax
-	VZEROUPPER_RETURN
+	/* Create mask for possible matches within
+	   remaining length.  */
+	movq	$-1, %rcx
+	bzhiq	%rdx, %rcx, %rcx
 
-	.p2align 4
-L(first_vec_x2):
+	/* Test matches in data against length match.  */
+	andl	%ecx, %eax
+	jnz	L(last_vec_x3)
+
+	/* If remaining length <= VEC_SIZE * 3 (note this
+	   is after remaining length was found to be > VEC_SIZE * 2).
+	 */
+	subl	$VEC_SIZE, %edx
+	jbe	L(zero_end2)
+
+	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
+	/* Shift remaining length mask for last VEC.  */
+	shrq	$32, %rcx
+	andl	%ecx, %eax
+	jz	L(zero_end2)
 	tzcntl	%eax, %eax
-	addq	$(VEC_SIZE * 2), %rax
+	addq	$(VEC_SIZE * 3 + 1), %rdi
 	addq	%rdi, %rax
+L(zero_end2):
 	VZEROUPPER_RETURN
 
 	.p2align 4
-L(4x_vec_end):
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x0)
-	vpmovmskb %ymm2, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x1)
-	vpmovmskb %ymm3, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x2)
-	vpmovmskb %ymm4, %eax
-	testl	%eax, %eax
-L(first_vec_x3):
+L(last_vec_x3):
 	tzcntl	%eax, %eax
-	addq	$(VEC_SIZE * 3), %rax
+	subq	$-(VEC_SIZE * 2 + 1), %rdi
 	addq	%rdi, %rax
 	VZEROUPPER_RETURN
+# endif
 
-END (MEMCHR)
+END(MEMCHR)
 #endif
-- 
2.29.2



* [PATCH v1 3/3] x86: Optimize memchr-evex.S
  2021-05-03  8:44 [PATCH v1 1/3] Bench: Expand bench-memchr.c Noah Goldstein
  2021-05-03  8:44 ` [PATCH v1 2/3] x86: Optimize memchr-avx2.S Noah Goldstein
@ 2021-05-03  8:44 ` Noah Goldstein
  2021-05-03 18:58   ` H.J. Lu
  2021-05-03 17:17 ` [PATCH v1 1/3] Bench: Expand bench-memchr.c H.J. Lu
  2 siblings, 1 reply; 20+ messages in thread
From: Noah Goldstein @ 2021-05-03  8:44 UTC (permalink / raw)
  To: libc-alpha

No bug. This commit optimizes memchr-evex.S. The optimizations include
replacing some branches with cmovcc, avoiding some branches entirely
in the less_4x_vec case, making the page cross logic less strict,
saving some ALU in the alignment process, and most importantly
increasing ILP in the 4x loop. test-memchr, test-rawmemchr, and
test-wmemchr are all passing.
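
The "increasing ILP in the 4x loop" point can be sketched in portable
C: four independent vector compares are issued per iteration and only
their OR-combined result feeds the loop-exit branch, so the compares
can execute in parallel instead of each one gating the next. The
check_vec helper below is hypothetical and merely stands in for
VPCMPEQ plus mask extraction; the real code uses EVEX/AVX2 vectors
and mask registers:

#include <stddef.h>

#define VEC_SIZE 32

/* Hypothetical stand-in for VPCMPEQ + mask extraction: bit I of the
   result is set iff P[I] == C.  */
static unsigned int
check_vec (const unsigned char *p, unsigned char c)
{
  unsigned int m = 0;
  for (size_t i = 0; i < VEC_SIZE; i++)
    m |= (unsigned int) (p[i] == c) << i;
  return m;
}

/* 4x loop sketch: all four compares are independent, and a single
   combined test decides whether to leave the loop.  Tail handling
   (the last < 4 * VEC_SIZE bytes) is omitted here.  */
static const unsigned char *
find_in_4x_loop (const unsigned char *p, unsigned char c, size_t n)
{
  while (n >= 4 * VEC_SIZE)
    {
      unsigned int m0 = check_vec (p + 0 * VEC_SIZE, c);
      unsigned int m1 = check_vec (p + 1 * VEC_SIZE, c);
      unsigned int m2 = check_vec (p + 2 * VEC_SIZE, c);
      unsigned int m3 = check_vec (p + 3 * VEC_SIZE, c);
      if (m0 | m1 | m2 | m3)
        {
          /* Find the first vector with a match, then the byte.  */
          const unsigned char *q = p;
          unsigned int m = m0;
          if (m == 0) { q += VEC_SIZE; m = m1; }
          if (m == 0) { q += VEC_SIZE; m = m2; }
          if (m == 0) { q += VEC_SIZE; m = m3; }
          return q + __builtin_ctz (m);
        }
      p += 4 * VEC_SIZE;
      n -= 4 * VEC_SIZE;
    }
  return NULL;
}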

Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
---
Tests were run on the following CPUs:

Tigerlake: https://ark.intel.com/content/www/us/en/ark/products/208921/intel-core-i7-1165g7-processor-12m-cache-up-to-4-70-ghz-with-ipu.html

Icelake: https://ark.intel.com/content/www/us/en/ark/products/196597/intel-core-i7-1065g7-processor-8m-cache-up-to-3-90-ghz.html

Skylake: https://ark.intel.com/content/www/us/en/ark/products/149091/intel-core-i7-8565u-processor-8m-cache-up-to-4-60-ghz.html

All times are the geometric mean of N=20. The unit of time is
seconds.

"Cur" refers to the current implementation
"New" refers to this patches implementation

Note: The numbers for size = [1, 32] are highly dependent on function
alignment. That being said, the new implementation, which uses cmovcc
instead of a branch for the [1, 32] case (largely because of the high
variance across alignments), is far more consistent and performs about
as well; it should only be a bigger improvement in cases where the
size / position are not 100% predictable.
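
A C model of the cmovcc idea for the [1, 32] case (a sketch of the
intent, not the committed assembly): the candidate match pointer is
computed unconditionally and then selected against NULL without a
data-dependent branch, so an unpredictable size/position no longer
costs a branch miss:

#include <stddef.h>

/* MATCH_IDX is the index of the first matching byte found in the
   loaded vector, LEN the caller-supplied length.  The match only
   counts if it lies within LEN; otherwise NULL must be returned.
   A compiler typically lowers the ternary below to lea + cmp +
   cmov on x86-64, avoiding a hard-to-predict branch.  */
static inline void *
select_match (char *base, unsigned int match_idx, size_t len)
{
  void *candidate = base + match_idx;
  return match_idx < len ? candidate : NULL;
}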

For memchr-evex the numbers are a near-universal improvement. The one
case where the current implementation is better is size = 0, and for
size = [1, 32] with pos < size the two implementations are about the
same. For size = [1, 32] with pos > size, for medium sizes, and for
large sizes, however, the new implementation is faster.

Results For Tigerlake memchr-evex
size  , algn  , Pos   , Cur T , New T , Win   , Dif   
2048  , 0     , , 32    5.58  , 5.22  , New   , 0.36  
256   , 1     , , 64    5.22  , 4.93  , New   , 0.29  
2048  , 0     , , 64    5.22  , 4.89  , New   , 0.33  
256   , 2     , , 64    5.14  , 4.81  , New   , 0.33  
2048  , 0     , , 128   6.3   , 5.67  , New   , 0.63  
256   , 3     , , 64    5.22  , 4.9   , New   , 0.32  
2048  , 0     , , 256   11.07 , 10.92 , New   , 0.15  
256   , 4     , , 64    5.16  , 4.86  , New   , 0.3   
2048  , 0     , , 512   15.66 , 14.81 , New   , 0.85  
256   , 5     , , 64    5.15  , 4.84  , New   , 0.31  
2048  , 0     , , 1024  25.7  , 23.02 , New   , 2.68  
256   , 6     , , 64    5.12  , 4.89  , New   , 0.23  
2048  , 0     , , 2048  42.34 , 37.71 , New   , 4.63  
256   , 7     , , 64    5.03  , 4.62  , New   , 0.41  
192   , 1     , , 32    4.96  , 4.28  , New   , 0.68  
256   , 1     , , 32    4.95  , 4.28  , New   , 0.67  
512   , 1     , , 32    4.94  , 4.29  , New   , 0.65  
192   , 2     , , 64    5.1   , 4.8   , New   , 0.3   
512   , 2     , , 64    5.12  , 4.72  , New   , 0.4   
192   , 3     , , 96    5.54  , 5.12  , New   , 0.42  
256   , 3     , , 96    5.52  , 5.15  , New   , 0.37  
512   , 3     , , 96    5.51  , 5.16  , New   , 0.35  
192   , 4     , , 128   6.1   , 5.53  , New   , 0.57  
256   , 4     , , 128   6.09  , 5.49  , New   , 0.6   
512   , 4     , , 128   6.08  , 5.48  , New   , 0.6   
192   , 5     , , 160   7.42  , 6.71  , New   , 0.71  
256   , 5     , , 160   6.86  , 6.71  , New   , 0.15  
512   , 5     , , 160   9.28  , 8.68  , New   , 0.6   
192   , 6     , , 192   7.94  , 7.47  , New   , 0.47  
256   , 6     , , 192   7.62  , 7.17  , New   , 0.45  
512   , 6     , , 192   9.2   , 9.16  , New   , 0.04  
192   , 7     , , 224   8.02  , 7.43  , New   , 0.59  
256   , 7     , , 224   8.34  , 7.85  , New   , 0.49  
512   , 7     , , 224   9.89  , 9.16  , New   , 0.73  
2     , 0     , , 1     3.0   , 3.0   , Eq    , 0.0
2     , 1     , , 1     3.0   , 3.0   , Eq    , 0.0
0     , 0     , , 1     3.01  , 3.6   , Cur   , 0.59  
0     , 1     , , 1     3.01  , 3.6   , Cur   , 0.59  
3     , 0     , , 2     3.0   , 3.0   , Eq    , 0.0
3     , 2     , , 2     3.0   , 3.0   , Eq    , 0.0
1     , 0     , , 2     3.6   , 3.0   , New   , 0.6   
1     , 2     , , 2     3.6   , 3.0   , New   , 0.6   
4     , 0     , , 3     3.01  , 3.01  , Eq    , 0.0
4     , 3     , , 3     3.01  , 3.01  , Eq    , 0.0
2     , 0     , , 3     3.62  , 3.02  , New   , 0.6   
2     , 3     , , 3     3.62  , 3.03  , New   , 0.59  
5     , 0     , , 4     3.02  , 3.03  , Cur   , 0.01  
5     , 4     , , 4     3.02  , 3.02  , Eq    , 0.0
3     , 0     , , 4     3.63  , 3.02  , New   , 0.61  
3     , 4     , , 4     3.63  , 3.04  , New   , 0.59  
6     , 0     , , 5     3.05  , 3.04  , New   , 0.01  
6     , 5     , , 5     3.02  , 3.02  , Eq    , 0.0
4     , 0     , , 5     3.63  , 3.02  , New   , 0.61  
4     , 5     , , 5     3.64  , 3.03  , New   , 0.61  
7     , 0     , , 6     3.03  , 3.03  , Eq    , 0.0
7     , 6     , , 6     3.02  , 3.02  , Eq    , 0.0
5     , 0     , , 6     3.64  , 3.01  , New   , 0.63  
5     , 6     , , 6     3.64  , 3.03  , New   , 0.61  
8     , 0     , , 7     3.03  , 3.04  , Cur   , 0.01  
8     , 7     , , 7     3.04  , 3.04  , Eq    , 0.0
6     , 0     , , 7     3.67  , 3.04  , New   , 0.63  
6     , 7     , , 7     3.65  , 3.05  , New   , 0.6   
9     , 0     , , 8     3.05  , 3.05  , Eq    , 0.0
7     , 0     , , 8     3.67  , 3.05  , New   , 0.62  
10    , 0     , , 9     3.06  , 3.06  , Eq    , 0.0
10    , 1     , , 9     3.06  , 3.06  , Eq    , 0.0
8     , 0     , , 9     3.67  , 3.06  , New   , 0.61  
8     , 1     , , 9     3.67  , 3.06  , New   , 0.61  
11    , 0     , , 10    3.06  , 3.06  , Eq    , 0.0
11    , 2     , , 10    3.07  , 3.06  , New   , 0.01  
9     , 0     , , 10    3.67  , 3.05  , New   , 0.62  
9     , 2     , , 10    3.67  , 3.06  , New   , 0.61  
12    , 0     , , 11    3.06  , 3.06  , Eq    , 0.0
12    , 3     , , 11    3.06  , 3.06  , Eq    , 0.0
10    , 0     , , 11    3.67  , 3.06  , New   , 0.61  
10    , 3     , , 11    3.67  , 3.06  , New   , 0.61  
13    , 0     , , 12    3.06  , 3.07  , Cur   , 0.01  
13    , 4     , , 12    3.06  , 3.07  , Cur   , 0.01  
11    , 0     , , 12    3.67  , 3.11  , New   , 0.56  
11    , 4     , , 12    3.68  , 3.12  , New   , 0.56  
14    , 0     , , 13    3.07  , 3.1   , Cur   , 0.03  
14    , 5     , , 13    3.06  , 3.07  , Cur   , 0.01  
12    , 0     , , 13    3.67  , 3.07  , New   , 0.6   
12    , 5     , , 13    3.67  , 3.08  , New   , 0.59  
15    , 0     , , 14    3.06  , 3.06  , Eq    , 0.0
15    , 6     , , 14    3.07  , 3.06  , New   , 0.01  
13    , 0     , , 14    3.67  , 3.06  , New   , 0.61  
13    , 6     , , 14    3.68  , 3.06  , New   , 0.62  
16    , 0     , , 15    3.06  , 3.06  , Eq    , 0.0
16    , 7     , , 15    3.06  , 3.05  , New   , 0.01  
14    , 0     , , 15    3.68  , 3.06  , New   , 0.62  
14    , 7     , , 15    3.67  , 3.06  , New   , 0.61  
17    , 0     , , 16    3.07  , 3.06  , New   , 0.01  
15    , 0     , , 16    3.68  , 3.06  , New   , 0.62  
18    , 0     , , 17    3.06  , 3.06  , Eq    , 0.0
18    , 1     , , 17    3.06  , 3.06  , Eq    , 0.0
16    , 0     , , 17    3.67  , 3.06  , New   , 0.61  
16    , 1     , , 17    3.67  , 3.05  , New   , 0.62  
19    , 0     , , 18    3.07  , 3.06  , New   , 0.01  
19    , 2     , , 18    3.06  , 3.06  , Eq    , 0.0
17    , 0     , , 18    3.68  , 3.08  , New   , 0.6   
17    , 2     , , 18    3.68  , 3.06  , New   , 0.62  
20    , 0     , , 19    3.06  , 3.06  , Eq    , 0.0
20    , 3     , , 19    3.06  , 3.06  , Eq    , 0.0
18    , 0     , , 19    3.68  , 3.06  , New   , 0.62  
18    , 3     , , 19    3.68  , 3.06  , New   , 0.62  
21    , 0     , , 20    3.06  , 3.06  , Eq    , 0.0
21    , 4     , , 20    3.06  , 3.06  , Eq    , 0.0
19    , 0     , , 20    3.67  , 3.06  , New   , 0.61  
19    , 4     , , 20    3.67  , 3.06  , New   , 0.61  
22    , 0     , , 21    3.06  , 3.06  , Eq    , 0.0
22    , 5     , , 21    3.06  , 3.06  , Eq    , 0.0
20    , 0     , , 21    3.67  , 3.05  , New   , 0.62  
20    , 5     , , 21    3.68  , 3.06  , New   , 0.62  
23    , 0     , , 22    3.07  , 3.06  , New   , 0.01  
23    , 6     , , 22    3.06  , 3.06  , Eq    , 0.0
21    , 0     , , 22    3.68  , 3.07  , New   , 0.61  
21    , 6     , , 22    3.67  , 3.06  , New   , 0.61  
24    , 0     , , 23    3.19  , 3.06  , New   , 0.13  
24    , 7     , , 23    3.08  , 3.06  , New   , 0.02  
22    , 0     , , 23    3.69  , 3.06  , New   , 0.63  
22    , 7     , , 23    3.68  , 3.06  , New   , 0.62  
25    , 0     , , 24    3.07  , 3.06  , New   , 0.01  
23    , 0     , , 24    3.68  , 3.06  , New   , 0.62  
26    , 0     , , 25    3.06  , 3.05  , New   , 0.01  
26    , 1     , , 25    3.07  , 3.06  , New   , 0.01  
24    , 0     , , 25    3.67  , 3.05  , New   , 0.62  
24    , 1     , , 25    3.68  , 3.06  , New   , 0.62  
27    , 0     , , 26    3.12  , 3.06  , New   , 0.06  
27    , 2     , , 26    3.08  , 3.06  , New   , 0.02  
25    , 0     , , 26    3.69  , 3.06  , New   , 0.63  
25    , 2     , , 26    3.67  , 3.06  , New   , 0.61  
28    , 0     , , 27    3.06  , 3.06  , Eq    , 0.0
28    , 3     , , 27    3.06  , 3.06  , Eq    , 0.0
26    , 0     , , 27    3.67  , 3.06  , New   , 0.61  
26    , 3     , , 27    3.67  , 3.06  , New   , 0.61  
29    , 0     , , 28    3.06  , 3.06  , Eq    , 0.0
29    , 4     , , 28    3.06  , 3.06  , Eq    , 0.0
27    , 0     , , 28    3.68  , 3.05  , New   , 0.63  
27    , 4     , , 28    3.67  , 3.06  , New   , 0.61  
30    , 0     , , 29    3.06  , 3.06  , Eq    , 0.0
30    , 5     , , 29    3.06  , 3.06  , Eq    , 0.0
28    , 0     , , 29    3.67  , 3.06  , New   , 0.61  
28    , 5     , , 29    3.68  , 3.06  , New   , 0.62  
31    , 0     , , 30    3.06  , 3.06  , Eq    , 0.0
31    , 6     , , 30    3.06  , 3.06  , Eq    , 0.0
29    , 0     , , 30    3.68  , 3.06  , New   , 0.62  
29    , 6     , , 30    3.7   , 3.06  , New   , 0.64  
32    , 0     , , 31    3.17  , 3.06  , New   , 0.11  
32    , 7     , , 31    3.12  , 3.06  , New   , 0.06  
30    , 0     , , 31    3.68  , 3.06  , New   , 0.62  
30    , 7     , , 31    3.68  , 3.06  , New   , 0.62

Results For Icelake memchr-evex
size  , algn  , Pos   , Cur T , New T , Win   , Dif   
2048  , 0     , , 32    4.94  , 4.26  , New   , 0.68  
256   , 1     , , 64    4.5   , 4.13  , New   , 0.37  
2048  , 0     , , 64    4.19  , 3.9   , New   , 0.29  
256   , 2     , , 64    4.19  , 3.87  , New   , 0.32  
2048  , 0     , , 128   4.96  , 4.53  , New   , 0.43  
256   , 3     , , 64    4.07  , 3.86  , New   , 0.21  
2048  , 0     , , 256   8.77  , 8.61  , New   , 0.16  
256   , 4     , , 64    4.08  , 3.87  , New   , 0.21  
2048  , 0     , , 512   12.22 , 11.67 , New   , 0.55  
256   , 5     , , 64    4.12  , 3.83  , New   , 0.29  
2048  , 0     , , 1024  20.06 , 18.09 , New   , 1.97  
256   , 6     , , 64    4.2   , 3.95  , New   , 0.25  
2048  , 0     , , 2048  33.83 , 30.62 , New   , 3.21  
256   , 7     , , 64    4.3   , 4.04  , New   , 0.26  
192   , 1     , , 32    4.2   , 3.71  , New   , 0.49  
256   , 1     , , 32    4.24  , 3.76  , New   , 0.48  
512   , 1     , , 32    4.29  , 3.74  , New   , 0.55  
192   , 2     , , 64    4.42  , 4.0   , New   , 0.42  
512   , 2     , , 64    4.17  , 3.83  , New   , 0.34  
192   , 3     , , 96    4.44  , 4.26  , New   , 0.18  
256   , 3     , , 96    4.45  , 4.14  , New   , 0.31  
512   , 3     , , 96    4.42  , 4.15  , New   , 0.27  
192   , 4     , , 128   4.93  , 4.45  , New   , 0.48  
256   , 4     , , 128   4.93  , 4.47  , New   , 0.46  
512   , 4     , , 128   4.95  , 4.47  , New   , 0.48  
192   , 5     , , 160   5.95  , 5.44  , New   , 0.51  
256   , 5     , , 160   5.59  , 5.47  , New   , 0.12  
512   , 5     , , 160   7.59  , 7.34  , New   , 0.25  
192   , 6     , , 192   6.53  , 6.08  , New   , 0.45  
256   , 6     , , 192   6.2   , 5.88  , New   , 0.32  
512   , 6     , , 192   7.53  , 7.62  , Cur   , 0.09  
192   , 7     , , 224   6.62  , 6.12  , New   , 0.5   
256   , 7     , , 224   6.79  , 6.51  , New   , 0.28  
512   , 7     , , 224   8.12  , 7.61  , New   , 0.51  
2     , 0     , , 1     2.5   , 2.54  , Cur   , 0.04  
2     , 1     , , 1     2.56  , 2.55  , New   , 0.01  
0     , 0     , , 1     2.57  , 3.12  , Cur   , 0.55  
0     , 1     , , 1     2.59  , 3.14  , Cur   , 0.55  
3     , 0     , , 2     2.62  , 2.63  , Cur   , 0.01  
3     , 2     , , 2     2.66  , 2.67  , Cur   , 0.01  
1     , 0     , , 2     3.24  , 2.72  , New   , 0.52  
1     , 2     , , 2     3.28  , 2.75  , New   , 0.53  
4     , 0     , , 3     2.78  , 2.8   , Cur   , 0.02  
4     , 3     , , 3     2.8   , 2.82  , Cur   , 0.02  
2     , 0     , , 3     3.38  , 2.86  , New   , 0.52  
2     , 3     , , 3     3.41  , 2.89  , New   , 0.52  
5     , 0     , , 4     2.88  , 2.91  , Cur   , 0.03  
5     , 4     , , 4     2.88  , 2.92  , Cur   , 0.04  
3     , 0     , , 4     3.48  , 2.93  , New   , 0.55  
3     , 4     , , 4     3.47  , 2.93  , New   , 0.54  
6     , 0     , , 5     2.95  , 2.94  , New   , 0.01  
6     , 5     , , 5     2.91  , 2.92  , Cur   , 0.01  
4     , 0     , , 5     3.47  , 2.9   , New   , 0.57  
4     , 5     , , 5     3.43  , 2.91  , New   , 0.52  
7     , 0     , , 6     2.87  , 2.9   , Cur   , 0.03  
7     , 6     , , 6     2.87  , 2.89  , Cur   , 0.02  
5     , 0     , , 6     3.44  , 2.88  , New   , 0.56  
5     , 6     , , 6     3.41  , 2.87  , New   , 0.54  
8     , 0     , , 7     2.86  , 2.87  , Cur   , 0.01  
8     , 7     , , 7     2.86  , 2.87  , Cur   , 0.01  
6     , 0     , , 7     3.43  , 2.87  , New   , 0.56  
6     , 7     , , 7     3.44  , 2.87  , New   , 0.57  
9     , 0     , , 8     2.86  , 2.88  , Cur   , 0.02  
7     , 0     , , 8     3.41  , 2.89  , New   , 0.52  
10    , 0     , , 9     2.83  , 2.87  , Cur   , 0.04  
10    , 1     , , 9     2.82  , 2.87  , Cur   , 0.05  
8     , 0     , , 9     3.4   , 2.89  , New   , 0.51  
8     , 1     , , 9     3.41  , 2.87  , New   , 0.54  
11    , 0     , , 10    2.83  , 2.88  , Cur   , 0.05  
11    , 2     , , 10    2.84  , 2.88  , Cur   , 0.04  
9     , 0     , , 10    3.41  , 2.87  , New   , 0.54  
9     , 2     , , 10    3.41  , 2.88  , New   , 0.53  
12    , 0     , , 11    2.83  , 2.89  , Cur   , 0.06  
12    , 3     , , 11    2.85  , 2.87  , Cur   , 0.02  
10    , 0     , , 11    3.41  , 2.87  , New   , 0.54  
10    , 3     , , 11    3.42  , 2.88  , New   , 0.54  
13    , 0     , , 12    2.86  , 2.87  , Cur   , 0.01  
13    , 4     , , 12    2.84  , 2.88  , Cur   , 0.04  
11    , 0     , , 12    3.43  , 2.87  , New   , 0.56  
11    , 4     , , 12    3.49  , 2.87  , New   , 0.62  
14    , 0     , , 13    2.85  , 2.86  , Cur   , 0.01  
14    , 5     , , 13    2.85  , 2.86  , Cur   , 0.01  
12    , 0     , , 13    3.41  , 2.86  , New   , 0.55  
12    , 5     , , 13    3.44  , 2.85  , New   , 0.59  
15    , 0     , , 14    2.83  , 2.87  , Cur   , 0.04  
15    , 6     , , 14    2.82  , 2.86  , Cur   , 0.04  
13    , 0     , , 14    3.41  , 2.86  , New   , 0.55  
13    , 6     , , 14    3.4   , 2.86  , New   , 0.54  
16    , 0     , , 15    2.84  , 2.86  , Cur   , 0.02  
16    , 7     , , 15    2.83  , 2.85  , Cur   , 0.02  
14    , 0     , , 15    3.41  , 2.85  , New   , 0.56  
14    , 7     , , 15    3.39  , 2.87  , New   , 0.52  
17    , 0     , , 16    2.83  , 2.87  , Cur   , 0.04  
15    , 0     , , 16    3.4   , 2.85  , New   , 0.55  
18    , 0     , , 17    2.83  , 2.86  , Cur   , 0.03  
18    , 1     , , 17    2.85  , 2.84  , New   , 0.01  
16    , 0     , , 17    3.41  , 2.85  , New   , 0.56  
16    , 1     , , 17    3.4   , 2.86  , New   , 0.54  
19    , 0     , , 18    2.8   , 2.84  , Cur   , 0.04  
19    , 2     , , 18    2.82  , 2.83  , Cur   , 0.01  
17    , 0     , , 18    3.39  , 2.86  , New   , 0.53  
17    , 2     , , 18    3.39  , 2.84  , New   , 0.55  
20    , 0     , , 19    2.85  , 2.87  , Cur   , 0.02  
20    , 3     , , 19    2.88  , 2.87  , New   , 0.01  
18    , 0     , , 19    3.38  , 2.85  , New   , 0.53  
18    , 3     , , 19    3.4   , 2.85  , New   , 0.55  
21    , 0     , , 20    2.83  , 2.85  , Cur   , 0.02  
21    , 4     , , 20    2.88  , 2.85  , New   , 0.03  
19    , 0     , , 20    3.39  , 2.84  , New   , 0.55  
19    , 4     , , 20    3.39  , 2.96  , New   , 0.43  
22    , 0     , , 21    2.84  , 2.9   , Cur   , 0.06  
22    , 5     , , 21    2.81  , 2.84  , Cur   , 0.03  
20    , 0     , , 21    3.41  , 2.81  , New   , 0.6   
20    , 5     , , 21    3.38  , 2.83  , New   , 0.55  
23    , 0     , , 22    2.8   , 2.82  , Cur   , 0.02  
23    , 6     , , 22    2.81  , 2.83  , Cur   , 0.02  
21    , 0     , , 22    3.35  , 2.81  , New   , 0.54  
21    , 6     , , 22    3.34  , 2.81  , New   , 0.53  
24    , 0     , , 23    2.77  , 2.84  , Cur   , 0.07  
24    , 7     , , 23    2.78  , 2.8   , Cur   , 0.02  
22    , 0     , , 23    3.34  , 2.79  , New   , 0.55  
22    , 7     , , 23    3.32  , 2.79  , New   , 0.53  
25    , 0     , , 24    2.77  , 2.8   , Cur   , 0.03  
23    , 0     , , 24    3.29  , 2.79  , New   , 0.5   
26    , 0     , , 25    2.73  , 2.78  , Cur   , 0.05  
26    , 1     , , 25    2.75  , 2.79  , Cur   , 0.04  
24    , 0     , , 25    3.27  , 2.79  , New   , 0.48  
24    , 1     , , 25    3.27  , 2.77  , New   , 0.5   
27    , 0     , , 26    2.72  , 2.78  , Cur   , 0.06  
27    , 2     , , 26    2.75  , 2.76  , Cur   , 0.01  
25    , 0     , , 26    3.29  , 2.73  , New   , 0.56  
25    , 2     , , 26    3.3   , 2.76  , New   , 0.54  
28    , 0     , , 27    2.75  , 2.79  , Cur   , 0.04  
28    , 3     , , 27    2.77  , 2.77  , Eq    , 0.0
26    , 0     , , 27    3.28  , 2.78  , New   , 0.5   
26    , 3     , , 27    3.29  , 2.78  , New   , 0.51  
29    , 0     , , 28    2.74  , 2.76  , Cur   , 0.02  
29    , 4     , , 28    2.74  , 2.77  , Cur   , 0.03  
27    , 0     , , 28    3.3   , 2.76  , New   , 0.54  
27    , 4     , , 28    3.3   , 2.74  , New   , 0.56  
30    , 0     , , 29    2.72  , 2.76  , Cur   , 0.04  
30    , 5     , , 29    2.74  , 2.75  , Cur   , 0.01  
28    , 0     , , 29    3.25  , 2.73  , New   , 0.52  
28    , 5     , , 29    3.3   , 2.73  , New   , 0.57  
31    , 0     , , 30    2.73  , 2.77  , Cur   , 0.04  
31    , 6     , , 30    2.74  , 2.76  , Cur   , 0.02  
29    , 0     , , 30    3.25  , 2.73  , New   , 0.52  
29    , 6     , , 30    3.26  , 2.74  , New   , 0.52  
32    , 0     , , 31    2.73  , 2.74  , Cur   , 0.01  
32    , 7     , , 31    2.73  , 2.75  , Cur   , 0.02  
30    , 0     , , 31    3.24  , 2.72  , New   , 0.52  
30    , 7     , , 31    3.24  , 2.72  , New   , 0.52

For memchr-avx2 the improvements are more modest, though again nearly
universal. The improvement is most significant for medium sizes and
for small sizes with pos > size. For small sizes with pos < size and
for large sizes the two implementations perform roughly the same.

Results For Tigerlake memchr-avx2
size  , algn  , Pos   , Cur T , New T , Win   , Dif   
2048  , 0     , , 32    6.15  , 6.27  , Cur   , 0.12  
256   , 1     , , 64    6.21  , 6.03  , New   , 0.18  
2048  , 0     , , 64    6.07  , 5.95  , New   , 0.12  
256   , 2     , , 64    6.01  , 5.8   , New   , 0.21  
2048  , 0     , , 128   7.05  , 6.55  , New   , 0.5   
256   , 3     , , 64    6.14  , 5.83  , New   , 0.31  
2048  , 0     , , 256   11.78 , 11.78 , Eq    , 0.0
256   , 4     , , 64    6.1   , 5.85  , New   , 0.25  
2048  , 0     , , 512   16.32 , 15.96 , New   , 0.36  
256   , 5     , , 64    6.1   , 5.77  , New   , 0.33  
2048  , 0     , , 1024  25.38 , 25.18 , New   , 0.2   
256   , 6     , , 64    6.08  , 5.88  , New   , 0.2   
2048  , 0     , , 2048  38.56 , 38.32 , New   , 0.24  
256   , 7     , , 64    5.93  , 5.68  , New   , 0.25  
192   , 1     , , 32    5.49  , 5.3   , New   , 0.19  
256   , 1     , , 32    5.5   , 5.28  , New   , 0.22  
512   , 1     , , 32    5.48  , 5.32  , New   , 0.16  
192   , 2     , , 64    6.1   , 5.73  , New   , 0.37  
512   , 2     , , 64    5.88  , 5.72  , New   , 0.16  
192   , 3     , , 96    6.31  , 5.93  , New   , 0.38  
256   , 3     , , 96    6.32  , 5.93  , New   , 0.39  
512   , 3     , , 96    6.2   , 5.94  , New   , 0.26  
192   , 4     , , 128   6.65  , 6.4   , New   , 0.25  
256   , 4     , , 128   6.6   , 6.37  , New   , 0.23  
512   , 4     , , 128   6.74  , 6.33  , New   , 0.41  
192   , 5     , , 160   7.78  , 7.4   , New   , 0.38  
256   , 5     , , 160   7.18  , 7.4   , Cur   , 0.22  
512   , 5     , , 160   9.81  , 9.44  , New   , 0.37  
192   , 6     , , 192   9.12  , 7.77  , New   , 1.35  
256   , 6     , , 192   7.97  , 7.66  , New   , 0.31  
512   , 6     , , 192   10.14 , 9.95  , New   , 0.19  
192   , 7     , , 224   8.96  , 7.78  , New   , 1.18  
256   , 7     , , 224   8.52  , 8.23  , New   , 0.29  
512   , 7     , , 224   10.33 , 9.98  , New   , 0.35  
2     , 0     , , 1     3.61  , 3.6   , New   , 0.01  
2     , 1     , , 1     3.6   , 3.6   , Eq    , 0.0
0     , 0     , , 1     3.02  , 3.0   , New   , 0.02  
0     , 1     , , 1     3.0   , 3.0   , Eq    , 0.0
3     , 0     , , 2     3.6   , 3.6   , Eq    , 0.0
3     , 2     , , 2     3.61  , 3.6   , New   , 0.01  
1     , 0     , , 2     4.82  , 3.6   , New   , 1.22  
1     , 2     , , 2     4.81  , 3.6   , New   , 1.21  
4     , 0     , , 3     3.61  , 3.61  , Eq    , 0.0
4     , 3     , , 3     3.62  , 3.61  , New   , 0.01  
2     , 0     , , 3     4.82  , 3.62  , New   , 1.2   
2     , 3     , , 3     4.83  , 3.63  , New   , 1.2   
5     , 0     , , 4     3.63  , 3.64  , Cur   , 0.01  
5     , 4     , , 4     3.63  , 3.62  , New   , 0.01  
3     , 0     , , 4     4.84  , 3.62  , New   , 1.22  
3     , 4     , , 4     4.84  , 3.64  , New   , 1.2   
6     , 0     , , 5     3.66  , 3.64  , New   , 0.02  
6     , 5     , , 5     3.65  , 3.62  , New   , 0.03  
4     , 0     , , 5     4.83  , 3.63  , New   , 1.2   
4     , 5     , , 5     4.85  , 3.64  , New   , 1.21  
7     , 0     , , 6     3.76  , 3.79  , Cur   , 0.03  
7     , 6     , , 6     3.76  , 3.72  , New   , 0.04  
5     , 0     , , 6     4.84  , 3.62  , New   , 1.22  
5     , 6     , , 6     4.85  , 3.64  , New   , 1.21  
8     , 0     , , 7     3.64  , 3.65  , Cur   , 0.01  
8     , 7     , , 7     3.65  , 3.65  , Eq    , 0.0
6     , 0     , , 7     4.88  , 3.64  , New   , 1.24  
6     , 7     , , 7     4.87  , 3.65  , New   , 1.22  
9     , 0     , , 8     3.66  , 3.66  , Eq    , 0.0
7     , 0     , , 8     4.89  , 3.66  , New   , 1.23  
10    , 0     , , 9     3.67  , 3.67  , Eq    , 0.0
10    , 1     , , 9     3.67  , 3.67  , Eq    , 0.0
8     , 0     , , 9     4.9   , 3.67  , New   , 1.23  
8     , 1     , , 9     4.9   , 3.67  , New   , 1.23  
11    , 0     , , 10    3.68  , 3.67  , New   , 0.01  
11    , 2     , , 10    3.69  , 3.67  , New   , 0.02  
9     , 0     , , 10    4.9   , 3.67  , New   , 1.23  
9     , 2     , , 10    4.9   , 3.67  , New   , 1.23  
12    , 0     , , 11    3.71  , 3.68  , New   , 0.03  
12    , 3     , , 11    3.71  , 3.67  , New   , 0.04  
10    , 0     , , 11    4.9   , 3.67  , New   , 1.23  
10    , 3     , , 11    4.9   , 3.67  , New   , 1.23  
13    , 0     , , 12    4.24  , 4.23  , New   , 0.01  
13    , 4     , , 12    4.23  , 4.23  , Eq    , 0.0
11    , 0     , , 12    4.9   , 3.7   , New   , 1.2   
11    , 4     , , 12    4.9   , 3.73  , New   , 1.17  
14    , 0     , , 13    3.99  , 4.01  , Cur   , 0.02  
14    , 5     , , 13    3.98  , 3.98  , Eq    , 0.0
12    , 0     , , 13    4.9   , 3.69  , New   , 1.21  
12    , 5     , , 13    4.9   , 3.69  , New   , 1.21  
15    , 0     , , 14    3.99  , 3.97  , New   , 0.02  
15    , 6     , , 14    4.0   , 4.0   , Eq    , 0.0
13    , 0     , , 14    4.9   , 3.67  , New   , 1.23  
13    , 6     , , 14    4.9   , 3.67  , New   , 1.23  
16    , 0     , , 15    3.99  , 4.02  , Cur   , 0.03  
16    , 7     , , 15    4.01  , 3.96  , New   , 0.05  
14    , 0     , , 15    4.93  , 3.67  , New   , 1.26  
14    , 7     , , 15    4.92  , 3.67  , New   , 1.25  
17    , 0     , , 16    4.04  , 3.99  , New   , 0.05  
15    , 0     , , 16    5.42  , 4.22  , New   , 1.2   
18    , 0     , , 17    4.01  , 3.97  , New   , 0.04  
18    , 1     , , 17    3.99  , 3.98  , New   , 0.01  
16    , 0     , , 17    5.22  , 3.98  , New   , 1.24  
16    , 1     , , 17    5.19  , 3.98  , New   , 1.21  
19    , 0     , , 18    4.0   , 3.99  , New   , 0.01  
19    , 2     , , 18    4.03  , 3.97  , New   , 0.06  
17    , 0     , , 18    5.18  , 3.99  , New   , 1.19  
17    , 2     , , 18    5.18  , 3.98  , New   , 1.2   
20    , 0     , , 19    4.02  , 3.98  , New   , 0.04  
20    , 3     , , 19    4.0   , 3.98  , New   , 0.02  
18    , 0     , , 19    5.19  , 3.97  , New   , 1.22  
18    , 3     , , 19    5.21  , 3.98  , New   , 1.23  
21    , 0     , , 20    3.98  , 4.0   , Cur   , 0.02  
21    , 4     , , 20    4.0   , 4.0   , Eq    , 0.0
19    , 0     , , 20    5.19  , 3.99  , New   , 1.2   
19    , 4     , , 20    5.17  , 3.99  , New   , 1.18  
22    , 0     , , 21    4.03  , 3.98  , New   , 0.05  
22    , 5     , , 21    4.01  , 3.95  , New   , 0.06  
20    , 0     , , 21    5.19  , 4.0   , New   , 1.19  
20    , 5     , , 21    5.21  , 3.99  , New   , 1.22  
23    , 0     , , 22    4.06  , 3.97  , New   , 0.09  
23    , 6     , , 22    4.02  , 3.98  , New   , 0.04  
21    , 0     , , 22    5.2   , 4.02  , New   , 1.18  
21    , 6     , , 22    5.22  , 4.0   , New   , 1.22  
24    , 0     , , 23    4.15  , 3.98  , New   , 0.17  
24    , 7     , , 23    4.0   , 4.01  , Cur   , 0.01  
22    , 0     , , 23    5.28  , 4.0   , New   , 1.28  
22    , 7     , , 23    5.22  , 3.99  , New   , 1.23  
25    , 0     , , 24    4.1   , 4.04  , New   , 0.06  
23    , 0     , , 24    5.23  , 4.04  , New   , 1.19  
26    , 0     , , 25    4.1   , 4.06  , New   , 0.04  
26    , 1     , , 25    4.07  , 3.99  , New   , 0.08  
24    , 0     , , 25    5.26  , 4.02  , New   , 1.24  
24    , 1     , , 25    5.21  , 4.0   , New   , 1.21  
27    , 0     , , 26    4.17  , 4.03  , New   , 0.14  
27    , 2     , , 26    4.09  , 4.03  , New   , 0.06  
25    , 0     , , 26    5.29  , 4.1   , New   , 1.19  
25    , 2     , , 26    5.25  , 4.0   , New   , 1.25  
28    , 0     , , 27    4.06  , 4.1   , Cur   , 0.04  
28    , 3     , , 27    4.09  , 4.04  , New   , 0.05  
26    , 0     , , 27    5.26  , 4.04  , New   , 1.22  
26    , 3     , , 27    5.28  , 4.01  , New   , 1.27  
29    , 0     , , 28    4.07  , 4.02  , New   , 0.05  
29    , 4     , , 28    4.07  , 4.05  , New   , 0.02  
27    , 0     , , 28    5.25  , 4.02  , New   , 1.23  
27    , 4     , , 28    5.25  , 4.03  , New   , 1.22  
30    , 0     , , 29    4.14  , 4.06  , New   , 0.08  
30    , 5     , , 29    4.08  , 4.04  , New   , 0.04  
28    , 0     , , 29    5.26  , 4.07  , New   , 1.19  
28    , 5     , , 29    5.28  , 4.04  , New   , 1.24  
31    , 0     , , 30    4.09  , 4.08  , New   , 0.01  
31    , 6     , , 30    4.1   , 4.08  , New   , 0.02  
29    , 0     , , 30    5.28  , 4.05  , New   , 1.23  
29    , 6     , , 30    5.24  , 4.07  , New   , 1.17  
32    , 0     , , 31    4.1   , 4.13  , Cur   , 0.03  
32    , 7     , , 31    4.16  , 4.09  , New   , 0.07  
30    , 0     , , 31    5.31  , 4.09  , New   , 1.22  
30    , 7     , , 31    5.28  , 4.08  , New   , 1.2

Results For Icelake memchr-avx2
size  , algn  , Pos   , Cur T , New T , Win   , Dif   
2048  , 0     , , 32    5.74  , 5.08  , New   , 0.66  
256   , 1     , , 64    5.16  , 4.93  , New   , 0.23  
2048  , 0     , , 64    4.86  , 4.69  , New   , 0.17  
256   , 2     , , 64    4.78  , 4.7   , New   , 0.08  
2048  , 0     , , 128   5.64  , 5.0   , New   , 0.64  
256   , 3     , , 64    4.64  , 4.59  , New   , 0.05  
2048  , 0     , , 256   9.07  , 9.17  , Cur   , 0.1   
256   , 4     , , 64    4.7   , 4.6   , New   , 0.1   
2048  , 0     , , 512   12.56 , 12.33 , New   , 0.23  
256   , 5     , , 64    4.72  , 4.61  , New   , 0.11  
2048  , 0     , , 1024  19.36 , 19.49 , Cur   , 0.13  
256   , 6     , , 64    4.82  , 4.69  , New   , 0.13  
2048  , 0     , , 2048  29.99 , 30.53 , Cur   , 0.54  
256   , 7     , , 64    4.9   , 4.85  , New   , 0.05  
192   , 1     , , 32    4.89  , 4.45  , New   , 0.44  
256   , 1     , , 32    4.93  , 4.44  , New   , 0.49  
512   , 1     , , 32    4.97  , 4.45  , New   , 0.52  
192   , 2     , , 64    5.04  , 4.65  , New   , 0.39  
512   , 2     , , 64    4.75  , 4.66  , New   , 0.09  
192   , 3     , , 96    5.14  , 4.66  , New   , 0.48  
256   , 3     , , 96    5.12  , 4.66  , New   , 0.46  
512   , 3     , , 96    5.13  , 4.62  , New   , 0.51  
192   , 4     , , 128   5.65  , 4.95  , New   , 0.7   
256   , 4     , , 128   5.63  , 4.95  , New   , 0.68  
512   , 4     , , 128   5.68  , 4.96  , New   , 0.72  
192   , 5     , , 160   6.1   , 5.84  , New   , 0.26  
256   , 5     , , 160   5.58  , 5.84  , Cur   , 0.26  
512   , 5     , , 160   7.95  , 7.74  , New   , 0.21  
192   , 6     , , 192   7.07  , 6.23  , New   , 0.84  
256   , 6     , , 192   6.34  , 6.09  , New   , 0.25  
512   , 6     , , 192   8.17  , 8.13  , New   , 0.04  
192   , 7     , , 224   7.06  , 6.23  , New   , 0.83  
256   , 7     , , 224   6.76  , 6.65  , New   , 0.11  
512   , 7     , , 224   8.29  , 8.08  , New   , 0.21  
2     , 0     , , 1     3.0   , 3.04  , Cur   , 0.04  
2     , 1     , , 1     3.06  , 3.07  , Cur   , 0.01  
0     , 0     , , 1     2.57  , 2.59  , Cur   , 0.02  
0     , 1     , , 1     2.6   , 2.61  , Cur   , 0.01  
3     , 0     , , 2     3.15  , 3.17  , Cur   , 0.02  
3     , 2     , , 2     3.19  , 3.21  , Cur   , 0.02  
1     , 0     , , 2     4.32  , 3.25  , New   , 1.07  
1     , 2     , , 2     4.36  , 3.31  , New   , 1.05  
4     , 0     , , 3     3.5   , 3.52  , Cur   , 0.02  
4     , 3     , , 3     3.52  , 3.54  , Cur   , 0.02  
2     , 0     , , 3     4.51  , 3.43  , New   , 1.08  
2     , 3     , , 3     4.56  , 3.47  , New   , 1.09  
5     , 0     , , 4     3.61  , 3.65  , Cur   , 0.04  
5     , 4     , , 4     3.63  , 3.67  , Cur   , 0.04  
3     , 0     , , 4     4.64  , 3.51  , New   , 1.13  
3     , 4     , , 4     4.7   , 3.51  , New   , 1.19  
6     , 0     , , 5     3.66  , 3.68  , Cur   , 0.02  
6     , 5     , , 5     3.69  , 3.65  , New   , 0.04  
4     , 0     , , 5     4.7   , 3.49  , New   , 1.21  
4     , 5     , , 5     4.58  , 3.48  , New   , 1.1   
7     , 0     , , 6     3.6   , 3.65  , Cur   , 0.05  
7     , 6     , , 6     3.59  , 3.64  , Cur   , 0.05  
5     , 0     , , 6     4.74  , 3.65  , New   , 1.09  
5     , 6     , , 6     4.73  , 3.64  , New   , 1.09  
8     , 0     , , 7     3.6   , 3.61  , Cur   , 0.01  
8     , 7     , , 7     3.6   , 3.61  , Cur   , 0.01  
6     , 0     , , 7     4.73  , 3.6   , New   , 1.13  
6     , 7     , , 7     4.73  , 3.62  , New   , 1.11  
9     , 0     , , 8     3.59  , 3.62  , Cur   , 0.03  
7     , 0     , , 8     4.72  , 3.64  , New   , 1.08  
10    , 0     , , 9     3.57  , 3.62  , Cur   , 0.05  
10    , 1     , , 9     3.56  , 3.61  , Cur   , 0.05  
8     , 0     , , 9     4.69  , 3.63  , New   , 1.06  
8     , 1     , , 9     4.71  , 3.61  , New   , 1.1   
11    , 0     , , 10    3.58  , 3.62  , Cur   , 0.04  
11    , 2     , , 10    3.59  , 3.63  , Cur   , 0.04  
9     , 0     , , 10    4.72  , 3.61  , New   , 1.11  
9     , 2     , , 10    4.7   , 3.61  , New   , 1.09  
12    , 0     , , 11    3.58  , 3.63  , Cur   , 0.05  
12    , 3     , , 11    3.58  , 3.62  , Cur   , 0.04  
10    , 0     , , 11    4.7   , 3.6   , New   , 1.1   
10    , 3     , , 11    4.73  , 3.64  , New   , 1.09  
13    , 0     , , 12    3.6   , 3.6   , Eq    , 0.0
13    , 4     , , 12    3.57  , 3.62  , Cur   , 0.05  
11    , 0     , , 12    4.73  , 3.62  , New   , 1.11  
11    , 4     , , 12    4.79  , 3.61  , New   , 1.18  
14    , 0     , , 13    3.61  , 3.62  , Cur   , 0.01  
14    , 5     , , 13    3.59  , 3.59  , Eq    , 0.0
12    , 0     , , 13    4.7   , 3.61  , New   , 1.09  
12    , 5     , , 13    4.75  , 3.58  , New   , 1.17  
15    , 0     , , 14    3.58  , 3.62  , Cur   , 0.04  
15    , 6     , , 14    3.59  , 3.62  , Cur   , 0.03  
13    , 0     , , 14    4.68  , 3.6   , New   , 1.08  
13    , 6     , , 14    4.68  , 3.63  , New   , 1.05  
16    , 0     , , 15    3.57  , 3.6   , Cur   , 0.03  
16    , 7     , , 15    3.55  , 3.59  , Cur   , 0.04  
14    , 0     , , 15    4.69  , 3.61  , New   , 1.08  
14    , 7     , , 15    4.69  , 3.61  , New   , 1.08  
17    , 0     , , 16    3.56  , 3.61  , Cur   , 0.05  
15    , 0     , , 16    4.71  , 3.58  , New   , 1.13  
18    , 0     , , 17    3.57  , 3.65  , Cur   , 0.08  
18    , 1     , , 17    3.58  , 3.59  , Cur   , 0.01  
16    , 0     , , 17    4.7   , 3.58  , New   , 1.12  
16    , 1     , , 17    4.68  , 3.59  , New   , 1.09  
19    , 0     , , 18    3.51  , 3.58  , Cur   , 0.07  
19    , 2     , , 18    3.55  , 3.58  , Cur   , 0.03  
17    , 0     , , 18    4.69  , 3.61  , New   , 1.08  
17    , 2     , , 18    4.68  , 3.61  , New   , 1.07  
20    , 0     , , 19    3.57  , 3.6   , Cur   , 0.03  
20    , 3     , , 19    3.59  , 3.59  , Eq    , 0.0
18    , 0     , , 19    4.68  , 3.59  , New   , 1.09  
18    , 3     , , 19    4.67  , 3.57  , New   , 1.1   
21    , 0     , , 20    3.61  , 3.58  , New   , 0.03  
21    , 4     , , 20    3.62  , 3.6   , New   , 0.02  
19    , 0     , , 20    4.74  , 3.57  , New   , 1.17  
19    , 4     , , 20    4.69  , 3.7   , New   , 0.99  
22    , 0     , , 21    3.57  , 3.64  , Cur   , 0.07  
22    , 5     , , 21    3.55  , 3.6   , Cur   , 0.05  
20    , 0     , , 21    4.72  , 3.55  , New   , 1.17  
20    , 5     , , 21    4.66  , 3.55  , New   , 1.11  
23    , 0     , , 22    3.56  , 3.56  , Eq    , 0.0
23    , 6     , , 22    3.54  , 3.56  , Cur   , 0.02  
21    , 0     , , 22    4.65  , 3.53  , New   , 1.12  
21    , 6     , , 22    4.62  , 3.56  , New   , 1.06  
24    , 0     , , 23    3.5   , 3.54  , Cur   , 0.04  
24    , 7     , , 23    3.52  , 3.53  , Cur   , 0.01  
22    , 0     , , 23    4.61  , 3.51  , New   , 1.1   
22    , 7     , , 23    4.6   , 3.51  , New   , 1.09  
25    , 0     , , 24    3.5   , 3.53  , Cur   , 0.03  
23    , 0     , , 24    4.54  , 3.5   , New   , 1.04  
26    , 0     , , 25    3.47  , 3.49  , Cur   , 0.02  
26    , 1     , , 25    3.46  , 3.51  , Cur   , 0.05  
24    , 0     , , 25    4.53  , 3.51  , New   , 1.02  
24    , 1     , , 25    4.51  , 3.51  , New   , 1.0   
27    , 0     , , 26    3.44  , 3.51  , Cur   , 0.07  
27    , 2     , , 26    3.51  , 3.52  , Cur   , 0.01  
25    , 0     , , 26    4.56  , 3.46  , New   , 1.1   
25    , 2     , , 26    4.55  , 3.47  , New   , 1.08  
28    , 0     , , 27    3.47  , 3.5   , Cur   , 0.03  
28    , 3     , , 27    3.48  , 3.47  , New   , 0.01  
26    , 0     , , 27    4.52  , 3.44  , New   , 1.08  
26    , 3     , , 27    4.55  , 3.46  , New   , 1.09  
29    , 0     , , 28    3.45  , 3.49  , Cur   , 0.04  
29    , 4     , , 28    3.5   , 3.5   , Eq    , 0.0
27    , 0     , , 28    4.56  , 3.49  , New   , 1.07  
27    , 4     , , 28    4.5   , 3.49  , New   , 1.01  
30    , 0     , , 29    3.44  , 3.48  , Cur   , 0.04  
30    , 5     , , 29    3.46  , 3.47  , Cur   , 0.01  
28    , 0     , , 29    4.49  , 3.43  , New   , 1.06  
28    , 5     , , 29    4.57  , 3.45  , New   , 1.12  
31    , 0     , , 30    3.48  , 3.48  , Eq    , 0.0
31    , 6     , , 30    3.46  , 3.49  , Cur   , 0.03  
29    , 0     , , 30    4.49  , 3.44  , New   , 1.05  
29    , 6     , , 30    4.53  , 3.44  , New   , 1.09  
32    , 0     , , 31    3.44  , 3.45  , Cur   , 0.01  
32    , 7     , , 31    3.46  , 3.51  , Cur   , 0.05  
30    , 0     , , 31    4.48  , 3.42  , New   , 1.06  
30    , 7     , , 31    4.48  , 3.44  , New   , 1.04


Results For Skylake memchr-avx2
size  , algn  , Pos   , Cur T , New T , Win   , Dif   
2048  , 0     , , 32    6.61  , 5.4   , New   , 1.21  
256   , 1     , , 64    6.52  , 5.68  , New   , 0.84  
2048  , 0     , , 64    6.03  , 5.47  , New   , 0.56  
256   , 2     , , 64    6.07  , 5.42  , New   , 0.65  
2048  , 0     , , 128   7.01  , 5.83  , New   , 1.18  
256   , 3     , , 64    6.24  , 5.68  , New   , 0.56  
2048  , 0     , , 256   11.03 , 9.86  , New   , 1.17  
256   , 4     , , 64    6.17  , 5.49  , New   , 0.68  
2048  , 0     , , 512   14.11 , 13.41 , New   , 0.7   
256   , 5     , , 64    6.03  , 5.45  , New   , 0.58  
2048  , 0     , , 1024  19.82 , 19.92 , Cur   , 0.1   
256   , 6     , , 64    6.14  , 5.7   , New   , 0.44  
2048  , 0     , , 2048  30.9  , 30.59 , New   , 0.31  
256   , 7     , , 64    6.05  , 5.64  , New   , 0.41  
192   , 1     , , 32    5.6   , 4.89  , New   , 0.71  
256   , 1     , , 32    5.59  , 5.07  , New   , 0.52  
512   , 1     , , 32    5.58  , 4.93  , New   , 0.65  
192   , 2     , , 64    6.14  , 5.46  , New   , 0.68  
512   , 2     , , 64    5.95  , 5.38  , New   , 0.57  
192   , 3     , , 96    6.6   , 5.74  , New   , 0.86  
256   , 3     , , 96    6.48  , 5.37  , New   , 1.11  
512   , 3     , , 96    6.56  , 5.44  , New   , 1.12  
192   , 4     , , 128   7.04  , 6.02  , New   , 1.02  
256   , 4     , , 128   6.96  , 5.89  , New   , 1.07  
512   , 4     , , 128   6.97  , 5.99  , New   , 0.98  
192   , 5     , , 160   8.49  , 7.07  , New   , 1.42  
256   , 5     , , 160   8.1   , 6.96  , New   , 1.14  
512   , 5     , , 160   10.48 , 9.14  , New   , 1.34  
192   , 6     , , 192   8.46  , 8.52  , Cur   , 0.06  
256   , 6     , , 192   8.53  , 7.58  , New   , 0.95  
512   , 6     , , 192   10.88 , 9.06  , New   , 1.82  
192   , 7     , , 224   8.59  , 8.35  , New   , 0.24  
256   , 7     , , 224   8.86  , 7.91  , New   , 0.95  
512   , 7     , , 224   10.89 , 8.98  , New   , 1.91  
2     , 0     , , 1     4.28  , 3.62  , New   , 0.66  
2     , 1     , , 1     4.32  , 3.75  , New   , 0.57  
0     , 0     , , 1     3.76  , 3.24  , New   , 0.52  
0     , 1     , , 1     3.7   , 3.19  , New   , 0.51  
3     , 0     , , 2     4.16  , 3.67  , New   , 0.49  
3     , 2     , , 2     4.21  , 3.68  , New   , 0.53  
1     , 0     , , 2     4.25  , 3.74  , New   , 0.51  
1     , 2     , , 2     4.4   , 3.82  , New   , 0.58  
4     , 0     , , 3     4.43  , 3.88  , New   , 0.55  
4     , 3     , , 3     4.34  , 3.8   , New   , 0.54  
2     , 0     , , 3     4.33  , 3.79  , New   , 0.54  
2     , 3     , , 3     4.37  , 3.84  , New   , 0.53  
5     , 0     , , 4     4.45  , 3.87  , New   , 0.58  
5     , 4     , , 4     4.41  , 3.84  , New   , 0.57  
3     , 0     , , 4     4.34  , 3.83  , New   , 0.51  
3     , 4     , , 4     4.35  , 3.82  , New   , 0.53  
6     , 0     , , 5     4.41  , 3.88  , New   , 0.53  
6     , 5     , , 5     4.41  , 3.88  , New   , 0.53  
4     , 0     , , 5     4.35  , 3.84  , New   , 0.51  
4     , 5     , , 5     4.37  , 3.85  , New   , 0.52  
7     , 0     , , 6     4.4   , 3.84  , New   , 0.56  
7     , 6     , , 6     4.39  , 3.83  , New   , 0.56  
5     , 0     , , 6     4.37  , 3.85  , New   , 0.52  
5     , 6     , , 6     4.4   , 3.86  , New   , 0.54  
8     , 0     , , 7     4.39  , 3.88  , New   , 0.51  
8     , 7     , , 7     4.4   , 3.83  , New   , 0.57  
6     , 0     , , 7     4.39  , 3.85  , New   , 0.54  
6     , 7     , , 7     4.38  , 3.87  , New   , 0.51  
9     , 0     , , 8     4.47  , 3.96  , New   , 0.51  
7     , 0     , , 8     4.37  , 3.85  , New   , 0.52  
10    , 0     , , 9     4.61  , 4.08  , New   , 0.53  
10    , 1     , , 9     4.61  , 4.09  , New   , 0.52  
8     , 0     , , 9     4.37  , 3.85  , New   , 0.52  
8     , 1     , , 9     4.37  , 3.85  , New   , 0.52  
11    , 0     , , 10    4.68  , 4.06  , New   , 0.62  
11    , 2     , , 10    4.56  , 4.1   , New   , 0.46  
9     , 0     , , 10    4.36  , 3.83  , New   , 0.53  
9     , 2     , , 10    4.37  , 3.83  , New   , 0.54  
12    , 0     , , 11    4.62  , 4.05  , New   , 0.57  
12    , 3     , , 11    4.63  , 4.06  , New   , 0.57  
10    , 0     , , 11    4.38  , 3.86  , New   , 0.52  
10    , 3     , , 11    4.41  , 3.86  , New   , 0.55  
13    , 0     , , 12    4.57  , 4.08  , New   , 0.49  
13    , 4     , , 12    4.59  , 4.12  , New   , 0.47  
11    , 0     , , 12    4.45  , 4.0   , New   , 0.45  
11    , 4     , , 12    4.51  , 4.04  , New   , 0.47  
14    , 0     , , 13    4.64  , 4.16  , New   , 0.48  
14    , 5     , , 13    4.67  , 4.1   , New   , 0.57  
12    , 0     , , 13    4.58  , 4.08  , New   , 0.5   
12    , 5     , , 13    4.6   , 4.1   , New   , 0.5   
15    , 0     , , 14    4.61  , 4.05  , New   , 0.56  
15    , 6     , , 14    4.59  , 4.06  , New   , 0.53  
13    , 0     , , 14    4.57  , 4.06  , New   , 0.51  
13    , 6     , , 14    4.57  , 4.05  , New   , 0.52  
16    , 0     , , 15    4.62  , 4.05  , New   , 0.57  
16    , 7     , , 15    4.63  , 4.06  , New   , 0.57  
14    , 0     , , 15    4.61  , 4.06  , New   , 0.55  
14    , 7     , , 15    4.59  , 4.05  , New   , 0.54  
17    , 0     , , 16    4.58  , 4.08  , New   , 0.5   
15    , 0     , , 16    4.64  , 4.06  , New   , 0.58  
18    , 0     , , 17    4.56  , 4.17  , New   , 0.39  
18    , 1     , , 17    4.59  , 4.09  , New   , 0.5   
16    , 0     , , 17    4.59  , 4.07  , New   , 0.52  
16    , 1     , , 17    4.58  , 4.04  , New   , 0.54  
19    , 0     , , 18    4.61  , 4.05  , New   , 0.56  
19    , 2     , , 18    4.6   , 4.08  , New   , 0.52  
17    , 0     , , 18    4.64  , 4.11  , New   , 0.53  
17    , 2     , , 18    4.56  , 4.13  , New   , 0.43  
20    , 0     , , 19    4.77  , 4.3   , New   , 0.47  
20    , 3     , , 19    4.6   , 4.14  , New   , 0.46  
18    , 0     , , 19    4.72  , 4.02  , New   , 0.7   
18    , 3     , , 19    4.53  , 4.01  , New   , 0.52  
21    , 0     , , 20    4.66  , 4.26  , New   , 0.4   
21    , 4     , , 20    4.74  , 4.07  , New   , 0.67  
19    , 0     , , 20    4.62  , 4.12  , New   , 0.5   
19    , 4     , , 20    4.57  , 4.04  , New   , 0.53  
22    , 0     , , 21    4.61  , 4.13  , New   , 0.48  
22    , 5     , , 21    4.64  , 4.08  , New   , 0.56  
20    , 0     , , 21    4.49  , 4.01  , New   , 0.48  
20    , 5     , , 21    4.58  , 4.06  , New   , 0.52  
23    , 0     , , 22    4.62  , 4.13  , New   , 0.49  
23    , 6     , , 22    4.72  , 4.27  , New   , 0.45  
21    , 0     , , 22    4.65  , 3.97  , New   , 0.68  
21    , 6     , , 22    4.5   , 4.02  , New   , 0.48  
24    , 0     , , 23    4.78  , 4.07  , New   , 0.71  
24    , 7     , , 23    4.67  , 4.23  , New   , 0.44  
22    , 0     , , 23    4.49  , 3.99  , New   , 0.5   
22    , 7     , , 23    4.56  , 4.03  , New   , 0.53  
25    , 0     , , 24    4.6   , 4.15  , New   , 0.45  
23    , 0     , , 24    4.57  , 4.06  , New   , 0.51  
26    , 0     , , 25    4.54  , 4.14  , New   , 0.4   
26    , 1     , , 25    4.72  , 4.1   , New   , 0.62  
24    , 0     , , 25    4.52  , 4.13  , New   , 0.39  
24    , 1     , , 25    4.55  , 4.0   , New   , 0.55  
27    , 0     , , 26    4.51  , 4.06  , New   , 0.45  
27    , 2     , , 26    4.53  , 4.16  , New   , 0.37  
25    , 0     , , 26    4.59  , 4.09  , New   , 0.5   
25    , 2     , , 26    4.55  , 4.01  , New   , 0.54  
28    , 0     , , 27    4.59  , 3.99  , New   , 0.6   
28    , 3     , , 27    4.57  , 3.95  , New   , 0.62  
26    , 0     , , 27    4.55  , 4.15  , New   , 0.4   
26    , 3     , , 27    4.57  , 3.99  , New   , 0.58  
29    , 0     , , 28    4.41  , 4.03  , New   , 0.38  
29    , 4     , , 28    4.59  , 4.02  , New   , 0.57  
27    , 0     , , 28    4.63  , 4.08  , New   , 0.55  
27    , 4     , , 28    4.44  , 4.02  , New   , 0.42  
30    , 0     , , 29    4.53  , 3.93  , New   , 0.6   
30    , 5     , , 29    4.55  , 3.88  , New   , 0.67  
28    , 0     , , 29    4.49  , 3.9   , New   , 0.59  
28    , 5     , , 29    4.44  , 3.94  , New   , 0.5   
31    , 0     , , 30    4.41  , 3.85  , New   , 0.56  
31    , 6     , , 30    4.48  , 3.86  , New   , 0.62  
29    , 0     , , 30    4.55  , 3.94  , New   , 0.61  
29    , 6     , , 30    4.32  , 3.95  , New   , 0.37  
32    , 0     , , 31    4.36  , 3.91  , New   , 0.45  
32    , 7     , , 31    4.37  , 3.89  , New   , 0.48  
30    , 0     , , 31    4.65  , 3.9   , New   , 0.75  
30    , 7     , , 31    4.42  , 3.93  , New   , 0.49  
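
The flatness of the small-size rows above comes mostly from the short-length
path returning through a conditional move rather than a branch, so the cost no
longer depends on whether the match lands in or out of bounds.  A rough C model
of that idea (illustrative only; the helper below is ours, not part of the
patch):

#include <stddef.h>

/* Compute the would-be match pointer unconditionally, then select a
   null result when the first match is not within the length.  tzcnt of
   a zero mask yields the operand width (32), which the length test then
   rejects; the ternary on match_bits below is only there because
   __builtin_ctz is undefined for zero.  */
static const char *
bounded_first_match (const char *base, unsigned int match_bits, size_t len)
{
  unsigned int pos = match_bits ? (unsigned int) __builtin_ctz (match_bits)
				: 32;
  const char *hit = base + pos;
  return len > pos ? hit : NULL;
}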

 sysdeps/x86_64/multiarch/memchr-evex.S | 580 +++++++++++++++----------
 1 file changed, 349 insertions(+), 231 deletions(-)
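
The relaxed page-cross entry test in the diff below compares the pointer's
offset within the page against PAGE_SIZE - VEC_SIZE instead of checking
alignment within a 2 * VEC_SIZE window, so the slow path is only taken when a
first-vector load could actually fault.  A rough C rendering of the two
conditions (illustrative only; the helper names are ours, not part of the
patch):

#include <stdbool.h>
#include <stdint.h>

#define VEC_SIZE  32
#define PAGE_SIZE 4096

/* Old check: conservative, also fires for pointers nowhere near a page
   end.  */
static bool
may_cross_page_old (const void *p)
{
  return ((uintptr_t) p & (2 * VEC_SIZE - 1)) > VEC_SIZE;
}

/* New check: fires only when a VEC_SIZE-byte load starting at p would
   run into the next page.  */
static bool
may_cross_page_new (const void *p)
{
  return ((uintptr_t) p & (PAGE_SIZE - 1)) > PAGE_SIZE - VEC_SIZE;
}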

diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S
index 6dd5d67b90..65c16ef8a4 100644
--- a/sysdeps/x86_64/multiarch/memchr-evex.S
+++ b/sysdeps/x86_64/multiarch/memchr-evex.S
@@ -26,14 +26,28 @@
 
 # ifdef USE_AS_WMEMCHR
 #  define VPBROADCAST	vpbroadcastd
-#  define VPCMP		vpcmpd
-#  define SHIFT_REG	r8d
+#  define VPMINU	vpminud
+#  define VPCMP	vpcmpd
+#  define VPCMPEQ	vpcmpeqd
+#  define CHAR_SIZE	4
 # else
 #  define VPBROADCAST	vpbroadcastb
-#  define VPCMP		vpcmpb
-#  define SHIFT_REG	ecx
+#  define VPMINU	vpminub
+#  define VPCMP	vpcmpb
+#  define VPCMPEQ	vpcmpeqb
+#  define CHAR_SIZE	1
 # endif
 
+# ifdef USE_AS_RAWMEMCHR
+#  define RAW_PTR_REG	rcx
+#  define ALGN_PTR_REG	rdi
+# else
+#  define RAW_PTR_REG	rdi
+#  define ALGN_PTR_REG	rcx
+# endif
+
+#define XZERO		xmm23
+#define YZERO		ymm23
 # define XMMMATCH	xmm16
 # define YMMMATCH	ymm16
 # define YMM1		ymm17
@@ -44,18 +58,16 @@
 # define YMM6		ymm22
 
 # define VEC_SIZE 32
+# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
+# define PAGE_SIZE 4096
 
 	.section .text.evex,"ax",@progbits
-ENTRY (MEMCHR)
+ENTRY(MEMCHR)
 # ifndef USE_AS_RAWMEMCHR
 	/* Check for zero length.  */
 	test	%RDX_LP, %RDX_LP
 	jz	L(zero)
-# endif
-	movl	%edi, %ecx
-# ifdef USE_AS_WMEMCHR
-	shl	$2, %RDX_LP
-# else
+
 #  ifdef __ILP32__
 	/* Clear the upper 32 bits.  */
 	movl	%edx, %edx
@@ -63,319 +75,425 @@ ENTRY (MEMCHR)
 # endif
 	/* Broadcast CHAR to YMMMATCH.  */
 	VPBROADCAST %esi, %YMMMATCH
-	/* Check if we may cross page boundary with one vector load.  */
-	andl	$(2 * VEC_SIZE - 1), %ecx
-	cmpl	$VEC_SIZE, %ecx
-	ja	L(cros_page_boundary)
+	/* Check if we may cross page boundary with one
+	   vector load.  */
+	movl	%edi, %eax
+	andl	$(PAGE_SIZE - 1), %eax
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	ja	L(cross_page_boundary)
 
 	/* Check the first VEC_SIZE bytes.  */
-	VPCMP	$0, (%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
-	testl	%eax, %eax
-
+	VPCMP	$0, (%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
 # ifndef USE_AS_RAWMEMCHR
-	jnz	L(first_vec_x0_check)
-	/* Adjust length and check the end of data.  */
-	subq	$VEC_SIZE, %rdx
-	jbe	L(zero)
+	/* If length < CHAR_PER_VEC handle specially.  */
+	cmpq	$CHAR_PER_VEC, %rdx
+	jbe	L(first_vec_x0)
+# endif
+	testl	%eax, %eax
+	jz	L(aligned_more)
+	tzcntl	%eax, %eax
+# ifdef USE_AS_WMEMCHR
+	/* NB: Multiply bytes by CHAR_SIZE to get the
+	   wchar_t count.  */
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
 # else
-	jnz	L(first_vec_x0)
+	addq	%rdi, %rax
 # endif
-
-	/* Align data for aligned loads in the loop.  */
-	addq	$VEC_SIZE, %rdi
-	andl	$(VEC_SIZE - 1), %ecx
-	andq	$-VEC_SIZE, %rdi
+	ret
 
 # ifndef USE_AS_RAWMEMCHR
-	/* Adjust length.  */
-	addq	%rcx, %rdx
-
-	subq	$(VEC_SIZE * 4), %rdx
-	jbe	L(last_4x_vec_or_less)
-# endif
-	jmp	L(more_4x_vec)
+L(zero):
+	xorl	%eax, %eax
+	ret
 
+	.p2align 5
+L(first_vec_x0):
+	/* Check if first match was before length.  */
+	tzcntl	%eax, %eax
+	xorl	%ecx, %ecx
+	cmpl	%eax, %edx
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+	cmovle	%rcx, %rax
+	ret
+# else
+	/* NB: first_vec_x0 is 17 bytes which will leave
+	   cross_page_boundary (which is relatively cold) close
+	   enough to ideal alignment. So only realign
+	   L(cross_page_boundary) if rawmemchr.  */
 	.p2align 4
-L(cros_page_boundary):
-	andl	$(VEC_SIZE - 1), %ecx
+# endif
+L(cross_page_boundary):
+	/* Save pointer before aligning as its original value is
+	   necessary for computing the return address if a byte is
+	   found, or for adjusting the length if it is not and this
+	   is memchr.  */
+	movq	%rdi, %rcx
+	/* Align data to VEC_SIZE. ALGN_PTR_REG is rcx
+	   for memchr and rdi for rawmemchr.  */
+	andq	$-VEC_SIZE, %ALGN_PTR_REG
+	VPCMP	$0, (%ALGN_PTR_REG), %YMMMATCH, %k0
+	kmovd	%k0, %r8d
 # ifdef USE_AS_WMEMCHR
-	/* NB: Divide shift count by 4 since each bit in K1 represent 4
-	   bytes.  */
-	movl	%ecx, %SHIFT_REG
-	sarl	$2, %SHIFT_REG
+	/* NB: Divide shift count by 4 since each bit in K0
+	   represents 4 bytes.  */
+	sarl	$2, %eax
+# endif
+# ifndef USE_AS_RAWMEMCHR
+	movl	$(PAGE_SIZE / CHAR_SIZE), %esi
+	subl	%eax, %esi
 # endif
-	andq	$-VEC_SIZE, %rdi
-	VPCMP	$0, (%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
-	/* Remove the leading bytes.  */
-	sarxl	%SHIFT_REG, %eax, %eax
-	testl	%eax, %eax
-	jz	L(aligned_more)
-	tzcntl	%eax, %eax
 # ifdef USE_AS_WMEMCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	sall	$2, %eax
+	andl	$(CHAR_PER_VEC - 1), %eax
 # endif
+	/* Remove the leading bytes.  */
+	sarxl	%eax, %r8d, %eax
 # ifndef USE_AS_RAWMEMCHR
 	/* Check the end of data.  */
-	cmpq	%rax, %rdx
-	jbe	L(zero)
+	cmpq	%rsi, %rdx
+	jbe	L(first_vec_x0)
+# endif
+	testl	%eax, %eax
+	jz	L(cross_page_continue)
+	tzcntl	%eax, %eax
+# ifdef USE_AS_WMEMCHR
+	/* NB: Multiply bytes by CHAR_SIZE to get the
+	   wchar_t count.  */
+	leaq	(%RAW_PTR_REG, %rax, CHAR_SIZE), %rax
+# else
+	addq	%RAW_PTR_REG, %rax
 # endif
-	addq	%rdi, %rax
-	addq	%rcx, %rax
 	ret
 
 	.p2align 4
-L(aligned_more):
-# ifndef USE_AS_RAWMEMCHR
-        /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)"
-	   instead of "(rdx + rcx) - VEC_SIZE" to void possible addition
-	   overflow.  */
-	negq	%rcx
-	addq	$VEC_SIZE, %rcx
+L(first_vec_x1):
+	tzcntl	%eax, %eax
+	leaq	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
+	ret
 
-	/* Check the end of data.  */
-	subq	%rcx, %rdx
-	jbe	L(zero)
-# endif
+	.p2align 4
+L(first_vec_x2):
+	tzcntl	%eax, %eax
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
 
-	addq	$VEC_SIZE, %rdi
+	.p2align 4
+L(first_vec_x3):
+	tzcntl	%eax, %eax
+	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
+
+	.p2align 4
+L(first_vec_x4):
+	tzcntl	%eax, %eax
+	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
+
+	.p2align 5
+L(aligned_more):
+	/* Check the first 4 * VEC_SIZE.  Only one
+	   VEC_SIZE at a time since data is only aligned to
+	   VEC_SIZE.  */
 
 # ifndef USE_AS_RAWMEMCHR
-	subq	$(VEC_SIZE * 4), %rdx
+	/* Align data to VEC_SIZE.  */
+L(cross_page_continue):
+	xorl	%ecx, %ecx
+	subl	%edi, %ecx
+	andq	$-VEC_SIZE, %rdi
+	/* esi is for adjusting length to see if near the
+	   end.  */
+	leal	(VEC_SIZE * 5)(%rdi, %rcx), %esi
+#  ifdef USE_AS_WMEMCHR
+	/* NB: Divide bytes by 4 to get the wchar_t
+	   count.  */
+	sarl	$2, %esi
+#  endif
+# else
+	andq	$-VEC_SIZE, %rdi
+L(cross_page_continue):
+# endif
+	/* Load first VEC regardless.  */
+	VPCMP	$0, (VEC_SIZE)(%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
+# ifndef USE_AS_RAWMEMCHR
+	/* Adjust length.  If near end handle
+	   specially.  */
+	subq	%rsi, %rdx
 	jbe	L(last_4x_vec_or_less)
 # endif
-
-L(more_4x_vec):
-	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
-	   since data is only aligned to VEC_SIZE.  */
-	VPCMP	$0, (%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x0)
-
-	VPCMP	$0, VEC_SIZE(%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x1)
 
-	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
+	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x2)
 
-	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
+	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x3)
 
-	addq	$(VEC_SIZE * 4), %rdi
+	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x4)
+
 
 # ifndef USE_AS_RAWMEMCHR
-	subq	$(VEC_SIZE * 4), %rdx
-	jbe	L(last_4x_vec_or_less)
-# endif
+	/* Check if at last CHAR_PER_VEC * 4 length.  */
+	subq	$(CHAR_PER_VEC * 4), %rdx
+	jbe	L(last_4x_vec_or_less_cmpeq)
+	addq	$VEC_SIZE, %rdi
 
-	/* Align data to 4 * VEC_SIZE.  */
-	movq	%rdi, %rcx
-	andl	$(4 * VEC_SIZE - 1), %ecx
+	/* Align data to VEC_SIZE * 4 for the loop and
+	   readjust length.  */
+#  ifdef USE_AS_WMEMCHR
+	movl	%edi, %ecx
 	andq	$-(4 * VEC_SIZE), %rdi
-
-# ifndef USE_AS_RAWMEMCHR
-	/* Adjust length.  */
+	andl	$(VEC_SIZE * 4 - 1), %ecx
+	/* NB: Divide bytes by 4 to get the wchar_t
+	   count.  */
+	sarl	$2, %ecx
 	addq	%rcx, %rdx
+#  else
+	addq	%rdi, %rdx
+	andq	$-(4 * VEC_SIZE), %rdi
+	subq	%rdi, %rdx
+#  endif
+# else
+	addq	$VEC_SIZE, %rdi
+	andq	$-(4 * VEC_SIZE), %rdi
 # endif
 
+	vpxorq	%XZERO, %XZERO, %XZERO
+
+	/* Compare 4 * VEC at a time forward.  */
 	.p2align 4
 L(loop_4x_vec):
-	/* Compare 4 * VEC at a time forward.  */
-	VPCMP	$0, (%rdi), %YMMMATCH, %k1
-	VPCMP	$0, VEC_SIZE(%rdi), %YMMMATCH, %k2
-	kord	%k1, %k2, %k5
-	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3
-	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4
-
-	kord	%k3, %k4, %k6
-	kortestd %k5, %k6
-	jnz	L(4x_vec_end)
-
-	addq	$(VEC_SIZE * 4), %rdi
-
+	/* It would be possible to save some instructions using 4x
+	   VPCMP but the bottleneck on port 5 makes it not worth
+	   it.  */
+	VPCMP	$4, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k1
+	/* xor will set bytes matching esi to zero.  */
+	vpxorq	(VEC_SIZE * 5)(%rdi), %YMMMATCH, %YMM2
+	vpxorq	(VEC_SIZE * 6)(%rdi), %YMMMATCH, %YMM3
+	VPCMP	$0, (VEC_SIZE * 7)(%rdi), %YMMMATCH, %k3
+	/* Reduce VEC2 / VEC3 with min and VEC1 with zero
+	   mask.  */
+	VPMINU	%YMM2, %YMM3, %YMM3 {%k1} {z}
+	VPCMP	$0, %YMM3, %YZERO, %k2
 # ifdef USE_AS_RAWMEMCHR
-	jmp	L(loop_4x_vec)
+	subq	$-(VEC_SIZE * 4), %rdi
+	kortestd %k2, %k3
+	jz	L(loop_4x_vec)
 # else
-	subq	$(VEC_SIZE * 4), %rdx
-	ja	L(loop_4x_vec)
+	kortestd %k2, %k3
+	jnz	L(loop_4x_vec_end)
 
-L(last_4x_vec_or_less):
-	/* Less than 4 * VEC and aligned to VEC_SIZE.  */
-	addl	$(VEC_SIZE * 2), %edx
-	jle	L(last_2x_vec)
+	subq	$-(VEC_SIZE * 4), %rdi
 
-	VPCMP	$0, (%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x0)
+	subq	$(CHAR_PER_VEC * 4), %rdx
+	ja	L(loop_4x_vec)
 
-	VPCMP	$0, VEC_SIZE(%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
+	/* Fall through into less than 4 remaining
+	   vectors of length case.  */
+	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
+	addq	$(VEC_SIZE * 3), %rdi
+	.p2align 4
+L(last_4x_vec_or_less):
+	/* Check if first VEC contained match.  */
 	testl	%eax, %eax
-	jnz	L(first_vec_x1)
+	jnz	L(first_vec_x1_check)
 
-	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
-	testl	%eax, %eax
+	/* If remaining length > CHAR_PER_VEC * 2.  */
+	addl	$(CHAR_PER_VEC * 2), %edx
+	jg	L(last_4x_vec)
 
-	jnz	L(first_vec_x2_check)
-	subl	$VEC_SIZE, %edx
-	jle	L(zero)
+L(last_2x_vec):
+	/* If remaining length < CHAR_PER_VEC.  */
+	addl	$CHAR_PER_VEC, %edx
+	jle	L(zero_end)
+
+	/* Check VEC2 and compare any match with
+	   remaining length.  */
+	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
+	tzcntl	%eax, %eax
+	cmpl	%eax, %edx
+	jbe	L(set_zero_end)
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+L(zero_end):
+	ret
 
-	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
-	testl	%eax, %eax
 
-	jnz	L(first_vec_x3_check)
+	.p2align 4
+L(first_vec_x1_check):
+	tzcntl	%eax, %eax
+	/* Adjust length.  */
+	subl	$-(CHAR_PER_VEC * 4), %edx
+	/* Check if match within remaining length.  */
+	cmpl	%eax, %edx
+	jbe	L(set_zero_end)
+	/* NB: Multiply bytes by CHAR_SIZE to get the
+	   wchar_t count.  */
+	leaq	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
+	ret
+L(set_zero_end):
 	xorl	%eax, %eax
 	ret
 
 	.p2align 4
-L(last_2x_vec):
-	addl	$(VEC_SIZE * 2), %edx
-	VPCMP	$0, (%rdi), %YMMMATCH, %k1
+L(loop_4x_vec_end):
+# endif
+	/* rawmemchr will fall through into this if match
+	   was found in loop.  */
+
+	/* k1 has the inverse (not-equal mask) of matches with VEC1.  */
 	kmovd	%k1, %eax
-	testl	%eax, %eax
+# ifdef USE_AS_WMEMCHR
+	subl	$((1 << CHAR_PER_VEC) - 1), %eax
+# else
+	incl	%eax
+# endif
+	jnz	L(last_vec_x1_return)
 
-	jnz	L(first_vec_x0_check)
-	subl	$VEC_SIZE, %edx
-	jle	L(zero)
+	VPCMP	$0, %YMM2, %YZERO, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x2_return)
 
-	VPCMP	$0, VEC_SIZE(%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
+	kmovd	%k2, %eax
 	testl	%eax, %eax
-	jnz	L(first_vec_x1_check)
-	xorl	%eax, %eax
-	ret
+	jnz	L(last_vec_x3_return)
 
-	.p2align 4
-L(first_vec_x0_check):
+	kmovd	%k3, %eax
 	tzcntl	%eax, %eax
-# ifdef USE_AS_WMEMCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	sall	$2, %eax
+# ifdef USE_AS_RAWMEMCHR
+	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+# else
+	leaq	(VEC_SIZE * 7)(%rdi, %rax, CHAR_SIZE), %rax
 # endif
-	/* Check the end of data.  */
-	cmpq	%rax, %rdx
-	jbe	L(zero)
-	addq	%rdi, %rax
 	ret
 
 	.p2align 4
-L(first_vec_x1_check):
+L(last_vec_x1_return):
 	tzcntl	%eax, %eax
-# ifdef USE_AS_WMEMCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	sall	$2, %eax
-# endif
-	/* Check the end of data.  */
-	cmpq	%rax, %rdx
-	jbe	L(zero)
-	addq	$VEC_SIZE, %rax
+# ifdef USE_AS_RAWMEMCHR
+#  ifdef USE_AS_WMEMCHR
+	/* NB: Multiply bytes by CHAR_SIZE to get the
+	   wchar_t count.  */
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+#  else
 	addq	%rdi, %rax
-	ret
-
-	.p2align 4
-L(first_vec_x2_check):
-	tzcntl	%eax, %eax
-# ifdef USE_AS_WMEMCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	sall	$2, %eax
+#  endif
+# else
+	/* NB: Multiply bytes by CHAR_SIZE to get the
+	   wchar_t count.  */
+	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
 # endif
-	/* Check the end of data.  */
-	cmpq	%rax, %rdx
-	jbe	L(zero)
-	addq	$(VEC_SIZE * 2), %rax
-	addq	%rdi, %rax
 	ret
 
 	.p2align 4
-L(first_vec_x3_check):
+L(last_vec_x2_return):
 	tzcntl	%eax, %eax
-# ifdef USE_AS_WMEMCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	sall	$2, %eax
+# ifdef USE_AS_RAWMEMCHR
+	/* NB: Multiply bytes by CHAR_SIZE to get the
+	   wchar_t count.  */
+	leaq	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
+# else
+	/* NB: Multiply bytes by CHAR_SIZE to get the
+	   wchar_t count.  */
+	leaq	(VEC_SIZE * 5)(%rdi, %rax, CHAR_SIZE), %rax
 # endif
-	/* Check the end of data.  */
-	cmpq	%rax, %rdx
-	jbe	L(zero)
-	addq	$(VEC_SIZE * 3), %rax
-	addq	%rdi, %rax
 	ret
 
 	.p2align 4
-L(zero):
-	xorl	%eax, %eax
-	ret
-# endif
-
-	.p2align 4
-L(first_vec_x0):
+L(last_vec_x3_return):
 	tzcntl	%eax, %eax
-# ifdef USE_AS_WMEMCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	leaq	(%rdi, %rax, 4), %rax
+# ifdef USE_AS_RAWMEMCHR
+	/* NB: Multiply bytes by CHAR_SIZE to get the
+	   wchar_t count.  */
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
 # else
-	addq	%rdi, %rax
+	/* NB: Multiply bytes by CHAR_SIZE to get the
+	   wchar_t count.  */
+	leaq	(VEC_SIZE * 6)(%rdi, %rax, CHAR_SIZE), %rax
 # endif
 	ret
 
+
+# ifndef USE_AS_RAWMEMCHR
+L(last_4x_vec_or_less_cmpeq):
+	VPCMP	$0, (VEC_SIZE * 5)(%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
+	subq	$-(VEC_SIZE * 4), %rdi
+	/* Check first VEC regardless.  */
+	testl	%eax, %eax
+	jnz	L(first_vec_x1_check)
+
+	/* If remaining length <= CHAR_PER_VEC * 2.  */
+	addl	$(CHAR_PER_VEC * 2), %edx
+	jle	L(last_2x_vec)
+
 	.p2align 4
-L(first_vec_x1):
+L(last_4x_vec):
+	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x2)
+
+
+	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
+	/* Create mask for possible matches within
+	   remaining length.  */
+#  ifdef USE_AS_WMEMCHR
+	movl	$((1 << (CHAR_PER_VEC * 2)) - 1), %ecx
+	bzhil	%edx, %ecx, %ecx
+#  else
+	movq	$-1, %rcx
+	bzhiq	%rdx, %rcx, %rcx
+#  endif
+	/* Test matches in data against length mask.  */
+	andl	%ecx, %eax
+	jnz	L(last_vec_x3)
+
+	/* If remaining length <= CHAR_PER_VEC * 3 (note this is
+	   after remaining length was found to be >
+	   CHAR_PER_VEC * 2).  */
+	subl	$CHAR_PER_VEC, %edx
+	jbe	L(zero_end2)
+
+
+	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
+	/* Shift remaining length mask for last VEC.  */
+#  ifdef USE_AS_WMEMCHR
+	shrl	$CHAR_PER_VEC, %ecx
+#  else
+	shrq	$CHAR_PER_VEC, %rcx
+#  endif
+	andl	%ecx, %eax
+	jz	L(zero_end2)
 	tzcntl	%eax, %eax
-# ifdef USE_AS_WMEMCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	leaq	VEC_SIZE(%rdi, %rax, 4), %rax
-# else
-	addq	$VEC_SIZE, %rax
-	addq	%rdi, %rax
-# endif
+	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
+L(zero_end2):
 	ret
 
-	.p2align 4
-L(first_vec_x2):
+L(last_vec_x2):
 	tzcntl	%eax, %eax
-# ifdef USE_AS_WMEMCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	leaq	(VEC_SIZE * 2)(%rdi, %rax, 4), %rax
-# else
-	addq	$(VEC_SIZE * 2), %rax
-	addq	%rdi, %rax
-# endif
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
 	ret
 
 	.p2align 4
-L(4x_vec_end):
-	kmovd	%k1, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x0)
-	kmovd	%k2, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x1)
-	kmovd	%k3, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x2)
-	kmovd	%k4, %eax
-	testl	%eax, %eax
-L(first_vec_x3):
+L(last_vec_x3):
 	tzcntl	%eax, %eax
-# ifdef USE_AS_WMEMCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	leaq	(VEC_SIZE * 3)(%rdi, %rax, 4), %rax
-# else
-	addq	$(VEC_SIZE * 3), %rax
-	addq	%rdi, %rax
-# endif
+	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
 	ret
+# endif
 
-END (MEMCHR)
+END(MEMCHR)
 #endif
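
One detail of the tail handling above that is easy to miss is the BZHI-based
length mask in L(last_4x_vec): matches are computed for the whole vector first
and anything past the remaining length is masked off instead of branched
around.  Roughly, in C (illustrative only; the helper name is ours, not part
of the patch):

#include <stdint.h>

/* 'matches' has one bit per character position that compared equal to
   the search character; 'remaining' is how many characters are still in
   bounds.  BZHI keeps the low 'remaining' bits of its source and clears
   the rest (bzhiq %rdx, %rcx, %rcx with %rcx = -1 in the code above),
   so the AND drops any match that lies past the end of the buffer.  */
static inline uint64_t
in_bounds_matches (uint64_t matches, unsigned int remaining)
{
  uint64_t keep = remaining >= 64 ? ~0ULL : ((1ULL << remaining) - 1);
  return matches & keep;
}
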
-- 
2.29.2


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v1 1/3] Bench: Expand bench-memchr.c
  2021-05-03  8:44 [PATCH v1 1/3] Bench: Expand bench-memchr.c Noah Goldstein
  2021-05-03  8:44 ` [PATCH v1 2/3] x86: Optimize memchr-avx2.S Noah Goldstein
  2021-05-03  8:44 ` [PATCH v1 3/3] x86: Optimize memchr-evex.S Noah Goldstein
@ 2021-05-03 17:17 ` H.J. Lu
  2021-05-03 19:51   ` Noah Goldstein
  2 siblings, 1 reply; 20+ messages in thread
From: H.J. Lu @ 2021-05-03 17:17 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell

On Mon, May 3, 2021 at 1:45 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> No bug. This commit adds some additional cases for bench-memchr.c
> including testing medium sizes and testing short length with both an
> inbound match and out of bound match.
>
> Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
> ---
>  benchtests/bench-memchr.c | 13 +++++++++++++
>  1 file changed, 13 insertions(+)
>
> diff --git a/benchtests/bench-memchr.c b/benchtests/bench-memchr.c
> index f5ced9d80d..5573f93312 100644
> --- a/benchtests/bench-memchr.c
> +++ b/benchtests/bench-memchr.c
> @@ -135,12 +135,25 @@ test_main (void)
>        do_test (i, i, 256, 0);
>  #endif
>      }
> +  for (i = 1; i < 8; ++i)
> +    {
> +      do_test (i, i << 5, 192, 23);
> +      do_test (i, i << 5, 192, 0);
> +      do_test (i, i << 5, 256, 23);
> +      do_test (i, i << 5, 256, 0);
> +      do_test (i, i << 5, 512, 23);
> +      do_test (i, i << 5, 512, 0);
> +    }
>    for (i = 1; i < 32; ++i)
>      {
>        do_test (0, i, i + 1, 23);
>        do_test (0, i, i + 1, 0);
>        do_test (i, i, i + 1, 23);
>        do_test (i, i, i + 1, 0);
> +      do_test (0, i, i - 1, 23);
> +      do_test (0, i, i - 1, 0);
> +      do_test (i, i, i - 1, 23);
> +      do_test (i, i, i - 1, 0);
>  #ifdef USE_AS_MEMRCHR
>        /* Also test the position close to the beginning for memrchr.  */
>        do_test (0, 1, i + 1, 23);
> --
> 2.29.2
>

LGTM.  I will check it in for you.

BTW, can you apply for an account on sourceware.org:

https://sourceware.org/

so that you can push your commits directly?  You can put me down
as your sponsor.

Thanks.

-- 
H.J.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v1 2/3] x86: Optimize memchr-avx2.S
  2021-05-03  8:44 ` [PATCH v1 2/3] x86: Optimize memchr-avx2.S Noah Goldstein
@ 2021-05-03 18:50   ` H.J. Lu
  2021-05-03 20:06     ` Noah Goldstein
  2021-05-03 20:06   ` [PATCH v2 " Noah Goldstein
  2021-05-03 22:58   ` [PATCH v3 " Noah Goldstein
  2 siblings, 1 reply; 20+ messages in thread
From: H.J. Lu @ 2021-05-03 18:50 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: libc-alpha, carlos, hjl.tools

On Mon, May 03, 2021 at 04:44:36AM -0400, Noah Goldstein wrote:
> No bug. This commit optimizes memchr-avx2.S. The optimizations include
> replacing some branches with cmovcc, avoiding some branches entirely
> in the less_4x_vec case, making the page cross logic less strict,
> asaving a few instructions the in loop return loop. test-memchr,
> test-rawmemchr, and test-wmemchr are all passing.
> 
> Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
> ---
>  sysdeps/x86_64/multiarch/memchr-avx2.S | 446 +++++++++++++++----------
>  1 file changed, 262 insertions(+), 184 deletions(-)
> 
> diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
> index 1fcb1c350f..8368fcd1e1 100644
> --- a/sysdeps/x86_64/multiarch/memchr-avx2.S
> +++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
> @@ -26,8 +26,22 @@
>  
>  # ifdef USE_AS_WMEMCHR
>  #  define VPCMPEQ	vpcmpeqd
> +#  define VPBROADCAST	vpbroadcastd
> +#  define CHAR_SIZE	4
>  # else
>  #  define VPCMPEQ	vpcmpeqb
> +#  define VPBROADCAST	vpbroadcastb
> +#  define CHAR_SIZE	1
> +# endif
> +
> +# ifdef USE_AS_RAWMEMCHR
> +#  define ERAW_PTR_REG	ecx
> +#  define RRAW_PTR_REG	rcx
> +#  define ALGN_PTR_REG	rdi
> +# else
> +#  define ERAW_PTR_REG	edi
> +#  define RRAW_PTR_REG	rdi
> +#  define ALGN_PTR_REG	rcx
>  # endif
>  
>  # ifndef VZEROUPPER
> @@ -39,303 +53,367 @@
>  # endif
>  
>  # define VEC_SIZE 32
> +# define PAGE_SIZE 4096
> +

Remove the extra line here.

>  
>  	.section SECTION(.text),"ax",@progbits
> -ENTRY (MEMCHR)
> +ENTRY(MEMCHR)

No need for this change.

>  # ifndef USE_AS_RAWMEMCHR
>  	/* Check for zero length.  */
>  	test	%RDX_LP, %RDX_LP
>  	jz	L(null)
>  # endif
> -	movl	%edi, %ecx
> -	/* Broadcast CHAR to YMM0.  */
> -	vmovd	%esi, %xmm0
>  # ifdef USE_AS_WMEMCHR
>  	shl	$2, %RDX_LP
> -	vpbroadcastd %xmm0, %ymm0
>  # else
>  #  ifdef __ILP32__
>  	/* Clear the upper 32 bits.  */
>  	movl	%edx, %edx
>  #  endif
> -	vpbroadcastb %xmm0, %ymm0
>  # endif
> -	/* Check if we may cross page boundary with one vector load.  */
> -	andl	$(2 * VEC_SIZE - 1), %ecx
> -	cmpl	$VEC_SIZE, %ecx
> -	ja	L(cros_page_boundary)
> +	/* Broadcast CHAR to YMMMATCH.  */
> +	vmovd	%esi, %xmm0
> +	VPBROADCAST %xmm0, %ymm0
> +	/* Check if we may cross page boundary with one
> +	   vector load.  */
> +	movl	%edi, %eax
> +	andl	$(PAGE_SIZE - 1), %eax
> +	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
> +	ja	L(cross_page_boundary)
>  
>  	/* Check the first VEC_SIZE bytes.  */
> -	VPCMPEQ (%rdi), %ymm0, %ymm1
> +	VPCMPEQ	(%rdi), %ymm0, %ymm1
>  	vpmovmskb %ymm1, %eax
> -	testl	%eax, %eax
> -
>  # ifndef USE_AS_RAWMEMCHR
> -	jnz	L(first_vec_x0_check)
> -	/* Adjust length and check the end of data.  */
> -	subq	$VEC_SIZE, %rdx
> -	jbe	L(zero)
> -# else
> -	jnz	L(first_vec_x0)
> +	/* If length < CHAR_PER_VEC handle special.  */
> +	cmpq	$VEC_SIZE, %rdx
> +	jbe	L(first_vec_x0)
>  # endif
> -
> -	/* Align data for aligned loads in the loop.  */
> -	addq	$VEC_SIZE, %rdi
> -	andl	$(VEC_SIZE - 1), %ecx
> -	andq	$-VEC_SIZE, %rdi
> +	testl	%eax, %eax
> +	jz	L(aligned_more)
> +	tzcntl	%eax, %eax
> +	addq	%rdi, %rax
> +	VZEROUPPER_RETURN
>  
>  # ifndef USE_AS_RAWMEMCHR
> -	/* Adjust length.  */
> -	addq	%rcx, %rdx
> -
> -	subq	$(VEC_SIZE * 4), %rdx
> -	jbe	L(last_4x_vec_or_less)
> +	.p2align 5
> +L(first_vec_x0):
> +	/* Check if first match was before length.  */
> +	tzcntl	%eax, %eax
> +	xorl	%ecx, %ecx
> +	cmpl	%eax, %edx
> +	leaq	(%rdi, %rax), %rax
> +	cmovle	%rcx, %rax
> +	VZEROUPPER_RETURN
> +L(null):
> +	xorl	%eax, %eax
> +	ret
>  # endif
> -	jmp	L(more_4x_vec)
> -
>  	.p2align 4
> -L(cros_page_boundary):
> -	andl	$(VEC_SIZE - 1), %ecx
> -	andq	$-VEC_SIZE, %rdi
> -	VPCMPEQ (%rdi), %ymm0, %ymm1
> +L(cross_page_boundary):
> +	/* Save pointer before aligning as its original
> +	   value is necessary for computer return address if byte is
> +	   found or adjusting length if it is not and this is

Fit comments to 72 columns.

> +	   memchr.  */
> +	movq	%rdi, %rcx
> +	/* Align data to VEC_SIZE - 1. ALGN_PTR_REG is
> +	   rcx for memchr and rdi for rawmemchr.  */
> +	orq	$(VEC_SIZE - 1), %ALGN_PTR_REG
> +	VPCMPEQ	-(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1
>  	vpmovmskb %ymm1, %eax
> +# ifndef USE_AS_RAWMEMCHR
> +	/* Calculate length until end of page (length
> +	   checked for a match).  */
> +	leaq	1(%ALGN_PTR_REG), %rsi
> +	subq	%RRAW_PTR_REG, %rsi
> +# endif
>  	/* Remove the leading bytes.  */
> -	sarl	%cl, %eax
> -	testl	%eax, %eax
> -	jz	L(aligned_more)
> -	tzcntl	%eax, %eax
> +	sarxl	%ERAW_PTR_REG, %eax, %eax
>  # ifndef USE_AS_RAWMEMCHR
>  	/* Check the end of data.  */
> -	cmpq	%rax, %rdx
> -	jbe	L(zero)
> +	cmpq	%rsi, %rdx
> +	jbe	L(first_vec_x0)
>  # endif
> -	addq	%rdi, %rax
> -	addq	%rcx, %rax
> +	testl	%eax, %eax
> +	jz	L(cross_page_continue)
> +	tzcntl	%eax, %eax
> +	addq	%RRAW_PTR_REG, %rax
>  L(return_vzeroupper):
>  	ZERO_UPPER_VEC_REGISTERS_RETURN
>  
>  	.p2align 4
> -L(aligned_more):
> -# ifndef USE_AS_RAWMEMCHR
> -        /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)"
> -	   instead of "(rdx + rcx) - VEC_SIZE" to void possible addition
> -	   overflow.  */
> -	negq	%rcx
> -	addq	$VEC_SIZE, %rcx
> +L(first_vec_x1):
> +	tzcntl	%eax, %eax
> +	incq	%rdi
> +	addq	%rdi, %rax
> +	VZEROUPPER_RETURN
>  
> -	/* Check the end of data.  */
> -	subq	%rcx, %rdx
> -	jbe	L(zero)
> -# endif
> +	.p2align 4
> +L(first_vec_x2):
> +	tzcntl	%eax, %eax
> +	addq	$(VEC_SIZE + 1), %rdi
> +	addq	%rdi, %rax
> +	VZEROUPPER_RETURN
>  
> -	addq	$VEC_SIZE, %rdi
> +	.p2align 4
> +L(first_vec_x3):
> +	tzcntl	%eax, %eax
> +	addq	$(VEC_SIZE * 2 + 1), %rdi
> +	addq	%rdi, %rax
> +	VZEROUPPER_RETURN
>  
> -# ifndef USE_AS_RAWMEMCHR
> -	subq	$(VEC_SIZE * 4), %rdx
> -	jbe	L(last_4x_vec_or_less)
> -# endif
>  
> -L(more_4x_vec):
> -	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
> -	   since data is only aligned to VEC_SIZE.  */
> -	VPCMPEQ (%rdi), %ymm0, %ymm1
> -	vpmovmskb %ymm1, %eax
> -	testl	%eax, %eax
> -	jnz	L(first_vec_x0)
> +	.p2align 4
> +L(first_vec_x4):
> +	tzcntl	%eax, %eax
> +	addq	$(VEC_SIZE * 3 + 1), %rdi
> +	addq	%rdi, %rax
> +	VZEROUPPER_RETURN
>  
> -	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
> +	.p2align 4
> +L(aligned_more):
> +	/* Check the first 4 * VEC_SIZE.  Only one
> +	   VEC_SIZE at a time since data is only aligned to
> +	   VEC_SIZE.  */

Fit comments to 72 columns.

> +
> +# ifndef USE_AS_RAWMEMCHR
> +L(cross_page_continue):
> +	/* Align data to VEC_SIZE - 1.  */
> +	xorl	%ecx, %ecx
> +	subl	%edi, %ecx
> +	orq	$(VEC_SIZE - 1), %rdi
> +	/* esi is for adjusting length to see if near the
> +	   end.  */

Fit comments to 72 columns.

> +	leal	(VEC_SIZE * 4 + 1)(%rdi, %rcx), %esi
> +# else
> +	orq	$(VEC_SIZE - 1), %rdi
> +L(cross_page_continue):
> +# endif
> +	/* Load first VEC regardless.  */
> +	VPCMPEQ	1(%rdi), %ymm0, %ymm1
>  	vpmovmskb %ymm1, %eax
> +# ifndef USE_AS_RAWMEMCHR
> +	/* Adjust length. If near end handle specially.
> +	 */

Put the comments on one line.

> +	subq	%rsi, %rdx
> +	jbe	L(last_4x_vec_or_less)
> +# endif
>  	testl	%eax, %eax
>  	jnz	L(first_vec_x1)
>  
> -	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
> +	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
>  	vpmovmskb %ymm1, %eax
>  	testl	%eax, %eax
>  	jnz	L(first_vec_x2)
>  
> -	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
> +	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
>  	vpmovmskb %ymm1, %eax
>  	testl	%eax, %eax
>  	jnz	L(first_vec_x3)
>  
> -	addq	$(VEC_SIZE * 4), %rdi
> +	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
> +	vpmovmskb %ymm1, %eax
> +	testl	%eax, %eax
> +	jnz	L(first_vec_x4)
>  
>  # ifndef USE_AS_RAWMEMCHR
> +	/* Check if at last VEC_SIZE * 4 length.  */
>  	subq	$(VEC_SIZE * 4), %rdx
> -	jbe	L(last_4x_vec_or_less)
> -# endif
> -
> -	/* Align data to 4 * VEC_SIZE.  */
> -	movq	%rdi, %rcx
> -	andl	$(4 * VEC_SIZE - 1), %ecx
> -	andq	$-(4 * VEC_SIZE), %rdi
> -
> -# ifndef USE_AS_RAWMEMCHR
> -	/* Adjust length.  */
> +	jbe	L(last_4x_vec_or_less_cmpeq)
> +	/* Align data to VEC_SIZE * 4 - 1 for the  loop
> +	   and readjust length.  */
> +	incq	%rdi
> +	movl	%edi, %ecx
> +	orq	$(VEC_SIZE * 4 - 1), %rdi
> +	andl	$(VEC_SIZE * 4 - 1), %ecx
>  	addq	%rcx, %rdx
> +# else
> +	/* Align data to VEC_SIZE * 4 - 1 for loop.  */
> +	incq	%rdi
> +	orq	$(VEC_SIZE * 4 - 1), %rdi
>  # endif
>  
> +	/* Compare 4 * VEC at a time forward.  */
>  	.p2align 4
>  L(loop_4x_vec):
> -	/* Compare 4 * VEC at a time forward.  */
> -	VPCMPEQ (%rdi), %ymm0, %ymm1
> -	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm2
> -	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm3
> -	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm4
> -
> +	VPCMPEQ	1(%rdi), %ymm0, %ymm1
> +	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm2
> +	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm3
> +	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm4
>  	vpor	%ymm1, %ymm2, %ymm5
>  	vpor	%ymm3, %ymm4, %ymm6
>  	vpor	%ymm5, %ymm6, %ymm5
>  
> -	vpmovmskb %ymm5, %eax
> -	testl	%eax, %eax
> -	jnz	L(4x_vec_end)
> -
> -	addq	$(VEC_SIZE * 4), %rdi
> -
> +	vpmovmskb %ymm5, %ecx
>  # ifdef USE_AS_RAWMEMCHR
> -	jmp	L(loop_4x_vec)
> +	subq	$-(VEC_SIZE * 4), %rdi
> +	testl	%ecx, %ecx
> +	jz	L(loop_4x_vec)
>  # else
> -	subq	$(VEC_SIZE * 4), %rdx
> -	ja	L(loop_4x_vec)
> +	testl	%ecx, %ecx
> +	jnz	L(loop_4x_vec_end)
>  
> -L(last_4x_vec_or_less):
> -	/* Less than 4 * VEC and aligned to VEC_SIZE.  */
> -	addl	$(VEC_SIZE * 2), %edx
> -	jle	L(last_2x_vec)
> +	subq	$-(VEC_SIZE * 4), %rdi
>  
> -	VPCMPEQ (%rdi), %ymm0, %ymm1
> -	vpmovmskb %ymm1, %eax
> -	testl	%eax, %eax
> -	jnz	L(first_vec_x0)
> +	subq	$(VEC_SIZE * 4), %rdx
> +	ja	L(loop_4x_vec)
>  
> -	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
> +	/* Fall through into less than 4 remaining
> +	   vectors of length case.  */

Fit comments to 72 columns.

> +	VPCMPEQ	(VEC_SIZE * 0 + 1)(%rdi), %ymm0, %ymm1
>  	vpmovmskb %ymm1, %eax
> +	.p2align 4
> +L(last_4x_vec_or_less):
> +	/* Check if first VEC contained match.  */
>  	testl	%eax, %eax
> -	jnz	L(first_vec_x1)
> +	jnz	L(first_vec_x1_check)
>  
> -	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
> -	vpmovmskb %ymm1, %eax
> -	testl	%eax, %eax
> +	/* If remaining length > VEC_SIZE * 2.  */
> +	addl	$(VEC_SIZE * 2), %edx
> +	jg	L(last_4x_vec)
>  
> -	jnz	L(first_vec_x2_check)
> -	subl	$VEC_SIZE, %edx
> -	jle	L(zero)
> +L(last_2x_vec):
> +	/* If remaining length < VEC_SIZE.  */
> +	addl	$VEC_SIZE, %edx
> +	jle	L(zero_end)
>  
> -	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
> +	/* Check VEC2 and compare any match with
> +	   remaining length.  */

Fit comments to 72 columns.

> +	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
>  	vpmovmskb %ymm1, %eax
> -	testl	%eax, %eax
> -
> -	jnz	L(first_vec_x3_check)
> -	xorl	%eax, %eax
> +	tzcntl	%eax, %eax
> +	cmpl	%eax, %edx
> +	jbe	L(set_zero_end)
> +	addq	$(VEC_SIZE + 1), %rdi
> +	addq	%rdi, %rax
> +L(zero_end):
>  	VZEROUPPER_RETURN
>  
>  	.p2align 4
> -L(last_2x_vec):
> -	addl	$(VEC_SIZE * 2), %edx
> -	VPCMPEQ (%rdi), %ymm0, %ymm1
> +L(loop_4x_vec_end):
> +# endif
> +	/* rawmemchr will fall through into this if match
> +	   was found in loop.  */

Fit comments to 72 columns.

> +
>  	vpmovmskb %ymm1, %eax
>  	testl	%eax, %eax
> +	jnz	L(last_vec_x1_return)
>  
> -	jnz	L(first_vec_x0_check)
> -	subl	$VEC_SIZE, %edx
> -	jle	L(zero)
> -
> -	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
> -	vpmovmskb %ymm1, %eax
> +	vpmovmskb %ymm2, %eax
>  	testl	%eax, %eax
> -	jnz	L(first_vec_x1_check)
> -	xorl	%eax, %eax
> -	VZEROUPPER_RETURN
> +	jnz	L(last_vec_x2_return)
>  
> -	.p2align 4
> -L(first_vec_x0_check):
> -	tzcntl	%eax, %eax
> -	/* Check the end of data.  */
> -	cmpq	%rax, %rdx
> -	jbe	L(zero)
> +	vpmovmskb %ymm3, %eax
> +	/* Combine VEC3 matches (eax) with VEC4 matches
> +	   (ecx).  */

Fit comments to 72 columns.

> +	salq	$32, %rcx
> +	orq	%rcx, %rax
> +	tzcntq	%rax, %rax
> +# ifdef USE_AS_RAWMEMCHR
> +	subq	$(VEC_SIZE * 2 - 1), %rdi
> +# else
> +	subq	$-(VEC_SIZE * 2 + 1), %rdi
> +# endif
>  	addq	%rdi, %rax
>  	VZEROUPPER_RETURN
> +# ifndef USE_AS_RAWMEMCHR
>  
>  	.p2align 4
>  L(first_vec_x1_check):
>  	tzcntl	%eax, %eax
> -	/* Check the end of data.  */
> -	cmpq	%rax, %rdx
> -	jbe	L(zero)
> -	addq	$VEC_SIZE, %rax
> +	/* Adjust length.  */
> +	subl	$-(VEC_SIZE * 4), %edx
> +	/* Check if match within remaining length.  */
> +	cmpl	%eax, %edx
> +	jbe	L(set_zero_end)
> +	incq	%rdi
>  	addq	%rdi, %rax
>  	VZEROUPPER_RETURN
> +	.p2align 4
> +L(set_zero_end):
> +	xorl	%eax, %eax
> +	VZEROUPPER_RETURN
> +# endif
>  
>  	.p2align 4
> -L(first_vec_x2_check):
> +L(last_vec_x1_return):
>  	tzcntl	%eax, %eax
> -	/* Check the end of data.  */
> -	cmpq	%rax, %rdx
> -	jbe	L(zero)
> -	addq	$(VEC_SIZE * 2), %rax
> +# ifdef USE_AS_RAWMEMCHR
> +	subq	$(VEC_SIZE * 4 - 1), %rdi
> +# else
> +	incq	%rdi
> +# endif
>  	addq	%rdi, %rax
>  	VZEROUPPER_RETURN
>  
>  	.p2align 4
> -L(first_vec_x3_check):
> +L(last_vec_x2_return):
>  	tzcntl	%eax, %eax
> -	/* Check the end of data.  */
> -	cmpq	%rax, %rdx
> -	jbe	L(zero)
> -	addq	$(VEC_SIZE * 3), %rax
> +# ifdef USE_AS_RAWMEMCHR
> +	subq	$(VEC_SIZE * 3 - 1), %rdi
> +# else
> +	subq	$-(VEC_SIZE + 1), %rdi
> +# endif
>  	addq	%rdi, %rax
>  	VZEROUPPER_RETURN
>  
> +# ifndef USE_AS_RAWMEMCHR
>  	.p2align 4
> -L(zero):
> -	xorl	%eax, %eax
> -	jmp     L(return_vzeroupper)
> +L(last_4x_vec_or_less_cmpeq):
> +	VPCMPEQ	(VEC_SIZE * 4 + 1)(%rdi), %ymm0, %ymm1
> +	vpmovmskb %ymm1, %eax
> +	subq	$-(VEC_SIZE * 4), %rdi
> +	/* Check first VEC regardless.  */
> +	testl	%eax, %eax
> +	jnz	L(first_vec_x1_check)
>  
> +	/* If remaining length <= CHAR_PER_VEC * 2.  */
> +	addl	$(VEC_SIZE * 2), %edx
> +	jle	L(last_2x_vec)
>  	.p2align 4
> -L(null):
> -	xorl	%eax, %eax
> -	ret
> -# endif
> +L(last_4x_vec):
> +	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> +	vpmovmskb %ymm1, %eax
> +	testl	%eax, %eax
> +	jnz	L(last_vec_x2_return)
>  
> -	.p2align 4
> -L(first_vec_x0):
> -	tzcntl	%eax, %eax
> -	addq	%rdi, %rax
> -	VZEROUPPER_RETURN
> +	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
> +	vpmovmskb %ymm1, %eax
>  
> -	.p2align 4
> -L(first_vec_x1):
> -	tzcntl	%eax, %eax
> -	addq	$VEC_SIZE, %rax
> -	addq	%rdi, %rax
> -	VZEROUPPER_RETURN
> +	/* Create mask for possible matches within
> +	   remaining length.  */

Fit comments to 72 columns.

> +	movq	$-1, %rcx
> +	bzhiq	%rdx, %rcx, %rcx
>  
> -	.p2align 4
> -L(first_vec_x2):
> +	/* Test matches in data against length match.  */
> +	andl	%ecx, %eax
> +	jnz	L(last_vec_x3)
> +
> +	/* if remaining length <= VEC_SIZE * 3 (Note this
> +	   is after remaining length was found to be > VEC_SIZE * 2.

Fit comments to 72 columns.

> +	 */
> +	subl	$VEC_SIZE, %edx
> +	jbe	L(zero_end2)
> +
> +	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
> +	vpmovmskb %ymm1, %eax
> +	/* Shift remaining length mask for last VEC.  */
> +	shrq	$32, %rcx
> +	andl	%ecx, %eax
> +	jz	L(zero_end2)
>  	tzcntl	%eax, %eax
> -	addq	$(VEC_SIZE * 2), %rax
> +	addq	$(VEC_SIZE * 3 + 1), %rdi
>  	addq	%rdi, %rax
> +L(zero_end2):
>  	VZEROUPPER_RETURN
>  
>  	.p2align 4
> -L(4x_vec_end):
> -	vpmovmskb %ymm1, %eax
> -	testl	%eax, %eax
> -	jnz	L(first_vec_x0)
> -	vpmovmskb %ymm2, %eax
> -	testl	%eax, %eax
> -	jnz	L(first_vec_x1)
> -	vpmovmskb %ymm3, %eax
> -	testl	%eax, %eax
> -	jnz	L(first_vec_x2)
> -	vpmovmskb %ymm4, %eax
> -	testl	%eax, %eax
> -L(first_vec_x3):
> +L(last_vec_x3):
>  	tzcntl	%eax, %eax
> -	addq	$(VEC_SIZE * 3), %rax
> +	subq	$-(VEC_SIZE * 2 + 1), %rdi
>  	addq	%rdi, %rax
>  	VZEROUPPER_RETURN
> +# endif
>  
> -END (MEMCHR)
> +END(MEMCHR)

No need for this change.

>  #endif
> -- 
> 2.29.2
> 

Thanks.

H.J.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v1 3/3] x86: Optimize memchr-evex.S
  2021-05-03  8:44 ` [PATCH v1 3/3] x86: Optimize memchr-evex.S Noah Goldstein
@ 2021-05-03 18:58   ` H.J. Lu
  2021-05-03 20:06     ` Noah Goldstein
  0 siblings, 1 reply; 20+ messages in thread
From: H.J. Lu @ 2021-05-03 18:58 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: libc-alpha, carlos, hjl.tools

On Mon, May 03, 2021 at 04:44:38AM -0400, Noah Goldstein wrote:
> No bug. This commit optimizes memchr-evex.S. The optimizations include
> replacing some branches with cmovcc, avoiding some branches entirely
> in the less_4x_vec case, making the page cross logic less strict,
> saving some ALU in the alignment process, and most importantly
> increasing ILP in the 4x loop. test-memchr, test-rawmemchr, and
> test-wmemchr are all passing.
> 
> Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
> ---
> Tests where run on the following CPUs:
> 
> Tigerlake: https://ark.intel.com/content/www/us/en/ark/products/208921/intel-core-i7-1165g7-processor-12m-cache-up-to-4-70-ghz-with-ipu.html
> 
> Icelake: https://ark.intel.com/content/www/us/en/ark/products/196597/intel-core-i7-1065g7-processor-8m-cache-up-to-3-90-ghz.html
> 
> Skylake: https://ark.intel.com/content/www/us/en/ark/products/149091/intel-core-i7-8565u-processor-8m-cache-up-to-4-60-ghz.html
> 
> All times are the geometric mean of N=20. The unit of time is
> seconds.
> 
> "Cur" refers to the current implementation
> "New" refers to this patches implementation
> 
> Note: The numbers for size = [1, 32] are highly dependent on function
> alignment. That being said the new implementation which uses cmovcc
> instead of a branch (mostly for the reason of high variance with
> different alignments) for the [1, 32] case is far more consistent and
> performs about as well (and should only be a bigger improvement in
> cases where the sizes / position are not 100% predictable).
> 
> For memchr-evex the numbers are a near universal improvement. The case
> where the current implement as better is for size = 0 and for size =
> [1, 32] with pos < size the two implementations are about the
> same. For size = [1, 32] with pos > size, for medium range sizes, and
> large size, however, the new implementation is faster.
> 
> Results For Tigerlake memchr-evex
> size  , algn  , Pos   , Cur T , New T , Win   , Dif   
> 2048  , 0     , , 32    5.58  , 5.22  , New   , 0.36  
> 256   , 1     , , 64    5.22  , 4.93  , New   , 0.29  
> 2048  , 0     , , 64    5.22  , 4.89  , New   , 0.33  
> 256   , 2     , , 64    5.14  , 4.81  , New   , 0.33  
> 2048  , 0     , , 128   6.3   , 5.67  , New   , 0.63  
> 256   , 3     , , 64    5.22  , 4.9   , New   , 0.32  
> 2048  , 0     , , 256   11.07 , 10.92 , New   , 0.15  
> 256   , 4     , , 64    5.16  , 4.86  , New   , 0.3   
> 2048  , 0     , , 512   15.66 , 14.81 , New   , 0.85  
> 256   , 5     , , 64    5.15  , 4.84  , New   , 0.31  
> 2048  , 0     , , 1024  25.7  , 23.02 , New   , 2.68  
> 256   , 6     , , 64    5.12  , 4.89  , New   , 0.23  
> 2048  , 0     , , 2048  42.34 , 37.71 , New   , 4.63  
> 256   , 7     , , 64    5.03  , 4.62  , New   , 0.41  
> 192   , 1     , , 32    4.96  , 4.28  , New   , 0.68  
> 256   , 1     , , 32    4.95  , 4.28  , New   , 0.67  
> 512   , 1     , , 32    4.94  , 4.29  , New   , 0.65  
> 192   , 2     , , 64    5.1   , 4.8   , New   , 0.3   
> 512   , 2     , , 64    5.12  , 4.72  , New   , 0.4   
> 192   , 3     , , 96    5.54  , 5.12  , New   , 0.42  
> 256   , 3     , , 96    5.52  , 5.15  , New   , 0.37  
> 512   , 3     , , 96    5.51  , 5.16  , New   , 0.35  
> 192   , 4     , , 128   6.1   , 5.53  , New   , 0.57  
> 256   , 4     , , 128   6.09  , 5.49  , New   , 0.6   
> 512   , 4     , , 128   6.08  , 5.48  , New   , 0.6   
> 192   , 5     , , 160   7.42  , 6.71  , New   , 0.71  
> 256   , 5     , , 160   6.86  , 6.71  , New   , 0.15  
> 512   , 5     , , 160   9.28  , 8.68  , New   , 0.6   
> 192   , 6     , , 192   7.94  , 7.47  , New   , 0.47  
> 256   , 6     , , 192   7.62  , 7.17  , New   , 0.45  
> 512   , 6     , , 192   9.2   , 9.16  , New   , 0.04  
> 192   , 7     , , 224   8.02  , 7.43  , New   , 0.59  
> 256   , 7     , , 224   8.34  , 7.85  , New   , 0.49  
> 512   , 7     , , 224   9.89  , 9.16  , New   , 0.73  
> 2     , 0     , , 1     3.0   , 3.0   , Eq    , 0.0
> 2     , 1     , , 1     3.0   , 3.0   , Eq    , 0.0
> 0     , 0     , , 1     3.01  , 3.6   , Cur   , 0.59  
> 0     , 1     , , 1     3.01  , 3.6   , Cur   , 0.59  
> 3     , 0     , , 2     3.0   , 3.0   , Eq    , 0.0
> 3     , 2     , , 2     3.0   , 3.0   , Eq    , 0.0
> 1     , 0     , , 2     3.6   , 3.0   , New   , 0.6   
> 1     , 2     , , 2     3.6   , 3.0   , New   , 0.6   
> 4     , 0     , , 3     3.01  , 3.01  , Eq    , 0.0
> 4     , 3     , , 3     3.01  , 3.01  , Eq    , 0.0
> 2     , 0     , , 3     3.62  , 3.02  , New   , 0.6   
> 2     , 3     , , 3     3.62  , 3.03  , New   , 0.59  
> 5     , 0     , , 4     3.02  , 3.03  , Cur   , 0.01  
> 5     , 4     , , 4     3.02  , 3.02  , Eq    , 0.0
> 3     , 0     , , 4     3.63  , 3.02  , New   , 0.61  
> 3     , 4     , , 4     3.63  , 3.04  , New   , 0.59  
> 6     , 0     , , 5     3.05  , 3.04  , New   , 0.01  
> 6     , 5     , , 5     3.02  , 3.02  , Eq    , 0.0
> 4     , 0     , , 5     3.63  , 3.02  , New   , 0.61  
> 4     , 5     , , 5     3.64  , 3.03  , New   , 0.61  
> 7     , 0     , , 6     3.03  , 3.03  , Eq    , 0.0
> 7     , 6     , , 6     3.02  , 3.02  , Eq    , 0.0
> 5     , 0     , , 6     3.64  , 3.01  , New   , 0.63  
> 5     , 6     , , 6     3.64  , 3.03  , New   , 0.61  
> 8     , 0     , , 7     3.03  , 3.04  , Cur   , 0.01  
> 8     , 7     , , 7     3.04  , 3.04  , Eq    , 0.0
> 6     , 0     , , 7     3.67  , 3.04  , New   , 0.63  
> 6     , 7     , , 7     3.65  , 3.05  , New   , 0.6   
> 9     , 0     , , 8     3.05  , 3.05  , Eq    , 0.0
> 7     , 0     , , 8     3.67  , 3.05  , New   , 0.62  
> 10    , 0     , , 9     3.06  , 3.06  , Eq    , 0.0
> 10    , 1     , , 9     3.06  , 3.06  , Eq    , 0.0
> 8     , 0     , , 9     3.67  , 3.06  , New   , 0.61  
> 8     , 1     , , 9     3.67  , 3.06  , New   , 0.61  
> 11    , 0     , , 10    3.06  , 3.06  , Eq    , 0.0
> 11    , 2     , , 10    3.07  , 3.06  , New   , 0.01  
> 9     , 0     , , 10    3.67  , 3.05  , New   , 0.62  
> 9     , 2     , , 10    3.67  , 3.06  , New   , 0.61  
> 12    , 0     , , 11    3.06  , 3.06  , Eq    , 0.0
> 12    , 3     , , 11    3.06  , 3.06  , Eq    , 0.0
> 10    , 0     , , 11    3.67  , 3.06  , New   , 0.61  
> 10    , 3     , , 11    3.67  , 3.06  , New   , 0.61  
> 13    , 0     , , 12    3.06  , 3.07  , Cur   , 0.01  
> 13    , 4     , , 12    3.06  , 3.07  , Cur   , 0.01  
> 11    , 0     , , 12    3.67  , 3.11  , New   , 0.56  
> 11    , 4     , , 12    3.68  , 3.12  , New   , 0.56  
> 14    , 0     , , 13    3.07  , 3.1   , Cur   , 0.03  
> 14    , 5     , , 13    3.06  , 3.07  , Cur   , 0.01  
> 12    , 0     , , 13    3.67  , 3.07  , New   , 0.6   
> 12    , 5     , , 13    3.67  , 3.08  , New   , 0.59  
> 15    , 0     , , 14    3.06  , 3.06  , Eq    , 0.0
> 15    , 6     , , 14    3.07  , 3.06  , New   , 0.01  
> 13    , 0     , , 14    3.67  , 3.06  , New   , 0.61  
> 13    , 6     , , 14    3.68  , 3.06  , New   , 0.62  
> 16    , 0     , , 15    3.06  , 3.06  , Eq    , 0.0
> 16    , 7     , , 15    3.06  , 3.05  , New   , 0.01  
> 14    , 0     , , 15    3.68  , 3.06  , New   , 0.62  
> 14    , 7     , , 15    3.67  , 3.06  , New   , 0.61  
> 17    , 0     , , 16    3.07  , 3.06  , New   , 0.01  
> 15    , 0     , , 16    3.68  , 3.06  , New   , 0.62  
> 18    , 0     , , 17    3.06  , 3.06  , Eq    , 0.0
> 18    , 1     , , 17    3.06  , 3.06  , Eq    , 0.0
> 16    , 0     , , 17    3.67  , 3.06  , New   , 0.61  
> 16    , 1     , , 17    3.67  , 3.05  , New   , 0.62  
> 19    , 0     , , 18    3.07  , 3.06  , New   , 0.01  
> 19    , 2     , , 18    3.06  , 3.06  , Eq    , 0.0
> 17    , 0     , , 18    3.68  , 3.08  , New   , 0.6   
> 17    , 2     , , 18    3.68  , 3.06  , New   , 0.62  
> 20    , 0     , , 19    3.06  , 3.06  , Eq    , 0.0
> 20    , 3     , , 19    3.06  , 3.06  , Eq    , 0.0
> 18    , 0     , , 19    3.68  , 3.06  , New   , 0.62  
> 18    , 3     , , 19    3.68  , 3.06  , New   , 0.62  
> 21    , 0     , , 20    3.06  , 3.06  , Eq    , 0.0
> 21    , 4     , , 20    3.06  , 3.06  , Eq    , 0.0
> 19    , 0     , , 20    3.67  , 3.06  , New   , 0.61  
> 19    , 4     , , 20    3.67  , 3.06  , New   , 0.61  
> 22    , 0     , , 21    3.06  , 3.06  , Eq    , 0.0
> 22    , 5     , , 21    3.06  , 3.06  , Eq    , 0.0
> 20    , 0     , , 21    3.67  , 3.05  , New   , 0.62  
> 20    , 5     , , 21    3.68  , 3.06  , New   , 0.62  
> 23    , 0     , , 22    3.07  , 3.06  , New   , 0.01  
> 23    , 6     , , 22    3.06  , 3.06  , Eq    , 0.0
> 21    , 0     , , 22    3.68  , 3.07  , New   , 0.61  
> 21    , 6     , , 22    3.67  , 3.06  , New   , 0.61  
> 24    , 0     , , 23    3.19  , 3.06  , New   , 0.13  
> 24    , 7     , , 23    3.08  , 3.06  , New   , 0.02  
> 22    , 0     , , 23    3.69  , 3.06  , New   , 0.63  
> 22    , 7     , , 23    3.68  , 3.06  , New   , 0.62  
> 25    , 0     , , 24    3.07  , 3.06  , New   , 0.01  
> 23    , 0     , , 24    3.68  , 3.06  , New   , 0.62  
> 26    , 0     , , 25    3.06  , 3.05  , New   , 0.01  
> 26    , 1     , , 25    3.07  , 3.06  , New   , 0.01  
> 24    , 0     , , 25    3.67  , 3.05  , New   , 0.62  
> 24    , 1     , , 25    3.68  , 3.06  , New   , 0.62  
> 27    , 0     , , 26    3.12  , 3.06  , New   , 0.06  
> 27    , 2     , , 26    3.08  , 3.06  , New   , 0.02  
> 25    , 0     , , 26    3.69  , 3.06  , New   , 0.63  
> 25    , 2     , , 26    3.67  , 3.06  , New   , 0.61  
> 28    , 0     , , 27    3.06  , 3.06  , Eq    , 0.0
> 28    , 3     , , 27    3.06  , 3.06  , Eq    , 0.0
> 26    , 0     , , 27    3.67  , 3.06  , New   , 0.61  
> 26    , 3     , , 27    3.67  , 3.06  , New   , 0.61  
> 29    , 0     , , 28    3.06  , 3.06  , Eq    , 0.0
> 29    , 4     , , 28    3.06  , 3.06  , Eq    , 0.0
> 27    , 0     , , 28    3.68  , 3.05  , New   , 0.63  
> 27    , 4     , , 28    3.67  , 3.06  , New   , 0.61  
> 30    , 0     , , 29    3.06  , 3.06  , Eq    , 0.0
> 30    , 5     , , 29    3.06  , 3.06  , Eq    , 0.0
> 28    , 0     , , 29    3.67  , 3.06  , New   , 0.61  
> 28    , 5     , , 29    3.68  , 3.06  , New   , 0.62  
> 31    , 0     , , 30    3.06  , 3.06  , Eq    , 0.0
> 31    , 6     , , 30    3.06  , 3.06  , Eq    , 0.0
> 29    , 0     , , 30    3.68  , 3.06  , New   , 0.62  
> 29    , 6     , , 30    3.7   , 3.06  , New   , 0.64  
> 32    , 0     , , 31    3.17  , 3.06  , New   , 0.11  
> 32    , 7     , , 31    3.12  , 3.06  , New   , 0.06  
> 30    , 0     , , 31    3.68  , 3.06  , New   , 0.62  
> 30    , 7     , , 31    3.68  , 3.06  , New   , 0.62
> 
> Results For Icelake memchr-evex
> size  , algn  , Pos   , Cur T , New T , Win   , Dif   
> 2048  , 0     , , 32    4.94  , 4.26  , New   , 0.68  
> 256   , 1     , , 64    4.5   , 4.13  , New   , 0.37  
> 2048  , 0     , , 64    4.19  , 3.9   , New   , 0.29  
> 256   , 2     , , 64    4.19  , 3.87  , New   , 0.32  
> 2048  , 0     , , 128   4.96  , 4.53  , New   , 0.43  
> 256   , 3     , , 64    4.07  , 3.86  , New   , 0.21  
> 2048  , 0     , , 256   8.77  , 8.61  , New   , 0.16  
> 256   , 4     , , 64    4.08  , 3.87  , New   , 0.21  
> 2048  , 0     , , 512   12.22 , 11.67 , New   , 0.55  
> 256   , 5     , , 64    4.12  , 3.83  , New   , 0.29  
> 2048  , 0     , , 1024  20.06 , 18.09 , New   , 1.97  
> 256   , 6     , , 64    4.2   , 3.95  , New   , 0.25  
> 2048  , 0     , , 2048  33.83 , 30.62 , New   , 3.21  
> 256   , 7     , , 64    4.3   , 4.04  , New   , 0.26  
> 192   , 1     , , 32    4.2   , 3.71  , New   , 0.49  
> 256   , 1     , , 32    4.24  , 3.76  , New   , 0.48  
> 512   , 1     , , 32    4.29  , 3.74  , New   , 0.55  
> 192   , 2     , , 64    4.42  , 4.0   , New   , 0.42  
> 512   , 2     , , 64    4.17  , 3.83  , New   , 0.34  
> 192   , 3     , , 96    4.44  , 4.26  , New   , 0.18  
> 256   , 3     , , 96    4.45  , 4.14  , New   , 0.31  
> 512   , 3     , , 96    4.42  , 4.15  , New   , 0.27  
> 192   , 4     , , 128   4.93  , 4.45  , New   , 0.48  
> 256   , 4     , , 128   4.93  , 4.47  , New   , 0.46  
> 512   , 4     , , 128   4.95  , 4.47  , New   , 0.48  
> 192   , 5     , , 160   5.95  , 5.44  , New   , 0.51  
> 256   , 5     , , 160   5.59  , 5.47  , New   , 0.12  
> 512   , 5     , , 160   7.59  , 7.34  , New   , 0.25  
> 192   , 6     , , 192   6.53  , 6.08  , New   , 0.45  
> 256   , 6     , , 192   6.2   , 5.88  , New   , 0.32  
> 512   , 6     , , 192   7.53  , 7.62  , Cur   , 0.09  
> 192   , 7     , , 224   6.62  , 6.12  , New   , 0.5   
> 256   , 7     , , 224   6.79  , 6.51  , New   , 0.28  
> 512   , 7     , , 224   8.12  , 7.61  , New   , 0.51  
> 2     , 0     , , 1     2.5   , 2.54  , Cur   , 0.04  
> 2     , 1     , , 1     2.56  , 2.55  , New   , 0.01  
> 0     , 0     , , 1     2.57  , 3.12  , Cur   , 0.55  
> 0     , 1     , , 1     2.59  , 3.14  , Cur   , 0.55  
> 3     , 0     , , 2     2.62  , 2.63  , Cur   , 0.01  
> 3     , 2     , , 2     2.66  , 2.67  , Cur   , 0.01  
> 1     , 0     , , 2     3.24  , 2.72  , New   , 0.52  
> 1     , 2     , , 2     3.28  , 2.75  , New   , 0.53  
> 4     , 0     , , 3     2.78  , 2.8   , Cur   , 0.02  
> 4     , 3     , , 3     2.8   , 2.82  , Cur   , 0.02  
> 2     , 0     , , 3     3.38  , 2.86  , New   , 0.52  
> 2     , 3     , , 3     3.41  , 2.89  , New   , 0.52  
> 5     , 0     , , 4     2.88  , 2.91  , Cur   , 0.03  
> 5     , 4     , , 4     2.88  , 2.92  , Cur   , 0.04  
> 3     , 0     , , 4     3.48  , 2.93  , New   , 0.55  
> 3     , 4     , , 4     3.47  , 2.93  , New   , 0.54  
> 6     , 0     , , 5     2.95  , 2.94  , New   , 0.01  
> 6     , 5     , , 5     2.91  , 2.92  , Cur   , 0.01  
> 4     , 0     , , 5     3.47  , 2.9   , New   , 0.57  
> 4     , 5     , , 5     3.43  , 2.91  , New   , 0.52  
> 7     , 0     , , 6     2.87  , 2.9   , Cur   , 0.03  
> 7     , 6     , , 6     2.87  , 2.89  , Cur   , 0.02  
> 5     , 0     , , 6     3.44  , 2.88  , New   , 0.56  
> 5     , 6     , , 6     3.41  , 2.87  , New   , 0.54  
> 8     , 0     , , 7     2.86  , 2.87  , Cur   , 0.01  
> 8     , 7     , , 7     2.86  , 2.87  , Cur   , 0.01  
> 6     , 0     , , 7     3.43  , 2.87  , New   , 0.56  
> 6     , 7     , , 7     3.44  , 2.87  , New   , 0.57  
> 9     , 0     , , 8     2.86  , 2.88  , Cur   , 0.02  
> 7     , 0     , , 8     3.41  , 2.89  , New   , 0.52  
> 10    , 0     , , 9     2.83  , 2.87  , Cur   , 0.04  
> 10    , 1     , , 9     2.82  , 2.87  , Cur   , 0.05  
> 8     , 0     , , 9     3.4   , 2.89  , New   , 0.51  
> 8     , 1     , , 9     3.41  , 2.87  , New   , 0.54  
> 11    , 0     , , 10    2.83  , 2.88  , Cur   , 0.05  
> 11    , 2     , , 10    2.84  , 2.88  , Cur   , 0.04  
> 9     , 0     , , 10    3.41  , 2.87  , New   , 0.54  
> 9     , 2     , , 10    3.41  , 2.88  , New   , 0.53  
> 12    , 0     , , 11    2.83  , 2.89  , Cur   , 0.06  
> 12    , 3     , , 11    2.85  , 2.87  , Cur   , 0.02  
> 10    , 0     , , 11    3.41  , 2.87  , New   , 0.54  
> 10    , 3     , , 11    3.42  , 2.88  , New   , 0.54  
> 13    , 0     , , 12    2.86  , 2.87  , Cur   , 0.01  
> 13    , 4     , , 12    2.84  , 2.88  , Cur   , 0.04  
> 11    , 0     , , 12    3.43  , 2.87  , New   , 0.56  
> 11    , 4     , , 12    3.49  , 2.87  , New   , 0.62  
> 14    , 0     , , 13    2.85  , 2.86  , Cur   , 0.01  
> 14    , 5     , , 13    2.85  , 2.86  , Cur   , 0.01  
> 12    , 0     , , 13    3.41  , 2.86  , New   , 0.55  
> 12    , 5     , , 13    3.44  , 2.85  , New   , 0.59  
> 15    , 0     , , 14    2.83  , 2.87  , Cur   , 0.04  
> 15    , 6     , , 14    2.82  , 2.86  , Cur   , 0.04  
> 13    , 0     , , 14    3.41  , 2.86  , New   , 0.55  
> 13    , 6     , , 14    3.4   , 2.86  , New   , 0.54  
> 16    , 0     , , 15    2.84  , 2.86  , Cur   , 0.02  
> 16    , 7     , , 15    2.83  , 2.85  , Cur   , 0.02  
> 14    , 0     , , 15    3.41  , 2.85  , New   , 0.56  
> 14    , 7     , , 15    3.39  , 2.87  , New   , 0.52  
> 17    , 0     , , 16    2.83  , 2.87  , Cur   , 0.04  
> 15    , 0     , , 16    3.4   , 2.85  , New   , 0.55  
> 18    , 0     , , 17    2.83  , 2.86  , Cur   , 0.03  
> 18    , 1     , , 17    2.85  , 2.84  , New   , 0.01  
> 16    , 0     , , 17    3.41  , 2.85  , New   , 0.56  
> 16    , 1     , , 17    3.4   , 2.86  , New   , 0.54  
> 19    , 0     , , 18    2.8   , 2.84  , Cur   , 0.04  
> 19    , 2     , , 18    2.82  , 2.83  , Cur   , 0.01  
> 17    , 0     , , 18    3.39  , 2.86  , New   , 0.53  
> 17    , 2     , , 18    3.39  , 2.84  , New   , 0.55  
> 20    , 0     , , 19    2.85  , 2.87  , Cur   , 0.02  
> 20    , 3     , , 19    2.88  , 2.87  , New   , 0.01  
> 18    , 0     , , 19    3.38  , 2.85  , New   , 0.53  
> 18    , 3     , , 19    3.4   , 2.85  , New   , 0.55  
> 21    , 0     , , 20    2.83  , 2.85  , Cur   , 0.02  
> 21    , 4     , , 20    2.88  , 2.85  , New   , 0.03  
> 19    , 0     , , 20    3.39  , 2.84  , New   , 0.55  
> 19    , 4     , , 20    3.39  , 2.96  , New   , 0.43  
> 22    , 0     , , 21    2.84  , 2.9   , Cur   , 0.06  
> 22    , 5     , , 21    2.81  , 2.84  , Cur   , 0.03  
> 20    , 0     , , 21    3.41  , 2.81  , New   , 0.6   
> 20    , 5     , , 21    3.38  , 2.83  , New   , 0.55  
> 23    , 0     , , 22    2.8   , 2.82  , Cur   , 0.02  
> 23    , 6     , , 22    2.81  , 2.83  , Cur   , 0.02  
> 21    , 0     , , 22    3.35  , 2.81  , New   , 0.54  
> 21    , 6     , , 22    3.34  , 2.81  , New   , 0.53  
> 24    , 0     , , 23    2.77  , 2.84  , Cur   , 0.07  
> 24    , 7     , , 23    2.78  , 2.8   , Cur   , 0.02  
> 22    , 0     , , 23    3.34  , 2.79  , New   , 0.55  
> 22    , 7     , , 23    3.32  , 2.79  , New   , 0.53  
> 25    , 0     , , 24    2.77  , 2.8   , Cur   , 0.03  
> 23    , 0     , , 24    3.29  , 2.79  , New   , 0.5   
> 26    , 0     , , 25    2.73  , 2.78  , Cur   , 0.05  
> 26    , 1     , , 25    2.75  , 2.79  , Cur   , 0.04  
> 24    , 0     , , 25    3.27  , 2.79  , New   , 0.48  
> 24    , 1     , , 25    3.27  , 2.77  , New   , 0.5   
> 27    , 0     , , 26    2.72  , 2.78  , Cur   , 0.06  
> 27    , 2     , , 26    2.75  , 2.76  , Cur   , 0.01  
> 25    , 0     , , 26    3.29  , 2.73  , New   , 0.56  
> 25    , 2     , , 26    3.3   , 2.76  , New   , 0.54  
> 28    , 0     , , 27    2.75  , 2.79  , Cur   , 0.04  
> 28    , 3     , , 27    2.77  , 2.77  , Eq    , 0.0
> 26    , 0     , , 27    3.28  , 2.78  , New   , 0.5   
> 26    , 3     , , 27    3.29  , 2.78  , New   , 0.51  
> 29    , 0     , , 28    2.74  , 2.76  , Cur   , 0.02  
> 29    , 4     , , 28    2.74  , 2.77  , Cur   , 0.03  
> 27    , 0     , , 28    3.3   , 2.76  , New   , 0.54  
> 27    , 4     , , 28    3.3   , 2.74  , New   , 0.56  
> 30    , 0     , , 29    2.72  , 2.76  , Cur   , 0.04  
> 30    , 5     , , 29    2.74  , 2.75  , Cur   , 0.01  
> 28    , 0     , , 29    3.25  , 2.73  , New   , 0.52  
> 28    , 5     , , 29    3.3   , 2.73  , New   , 0.57  
> 31    , 0     , , 30    2.73  , 2.77  , Cur   , 0.04  
> 31    , 6     , , 30    2.74  , 2.76  , Cur   , 0.02  
> 29    , 0     , , 30    3.25  , 2.73  , New   , 0.52  
> 29    , 6     , , 30    3.26  , 2.74  , New   , 0.52  
> 32    , 0     , , 31    2.73  , 2.74  , Cur   , 0.01  
> 32    , 7     , , 31    2.73  , 2.75  , Cur   , 0.02  
> 30    , 0     , , 31    3.24  , 2.72  , New   , 0.52  
> 30    , 7     , , 31    3.24  , 2.72  , New   , 0.52
> 
> For memchr-avx2 the improvements are more modest, though again nearly
> universal. The improvement is most significant for medium sizes and for
> small sizes with pos > size. For small sizes with pos < size and for
> large sizes the two implementations perform roughly the same.
> 
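As a point of reference, "pos > size" above means the first occurrence of
the seek character lies beyond the searched length, so memchr must report
no match; a minimal C sketch of the two cases (illustrative only, not part
of the patch or the benchmark harness):

	#include <assert.h>
	#include <string.h>

	int
	main (void)
	{
	  char buf[64] = { 0 };
	  buf[40] = 23;		/* First match is at position 40.  */
	  /* pos > size: only 32 bytes are searched, so no match.  */
	  assert (memchr (buf, 23, 32) == NULL);
	  /* pos < size: the match is in bounds and must be returned.  */
	  assert (memchr (buf, 23, 64) == buf + 40);
	  return 0;
	}
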
> Results For Tigerlake memchr-avx2
> size  , algn  , Pos   , Cur T , New T , Win   , Dif   
> 2048  , 0     , , 32    6.15  , 6.27  , Cur   , 0.12  
> 256   , 1     , , 64    6.21  , 6.03  , New   , 0.18  
> 2048  , 0     , , 64    6.07  , 5.95  , New   , 0.12  
> 256   , 2     , , 64    6.01  , 5.8   , New   , 0.21  
> 2048  , 0     , , 128   7.05  , 6.55  , New   , 0.5   
> 256   , 3     , , 64    6.14  , 5.83  , New   , 0.31  
> 2048  , 0     , , 256   11.78 , 11.78 , Eq    , 0.0
> 256   , 4     , , 64    6.1   , 5.85  , New   , 0.25  
> 2048  , 0     , , 512   16.32 , 15.96 , New   , 0.36  
> 256   , 5     , , 64    6.1   , 5.77  , New   , 0.33  
> 2048  , 0     , , 1024  25.38 , 25.18 , New   , 0.2   
> 256   , 6     , , 64    6.08  , 5.88  , New   , 0.2   
> 2048  , 0     , , 2048  38.56 , 38.32 , New   , 0.24  
> 256   , 7     , , 64    5.93  , 5.68  , New   , 0.25  
> 192   , 1     , , 32    5.49  , 5.3   , New   , 0.19  
> 256   , 1     , , 32    5.5   , 5.28  , New   , 0.22  
> 512   , 1     , , 32    5.48  , 5.32  , New   , 0.16  
> 192   , 2     , , 64    6.1   , 5.73  , New   , 0.37  
> 512   , 2     , , 64    5.88  , 5.72  , New   , 0.16  
> 192   , 3     , , 96    6.31  , 5.93  , New   , 0.38  
> 256   , 3     , , 96    6.32  , 5.93  , New   , 0.39  
> 512   , 3     , , 96    6.2   , 5.94  , New   , 0.26  
> 192   , 4     , , 128   6.65  , 6.4   , New   , 0.25  
> 256   , 4     , , 128   6.6   , 6.37  , New   , 0.23  
> 512   , 4     , , 128   6.74  , 6.33  , New   , 0.41  
> 192   , 5     , , 160   7.78  , 7.4   , New   , 0.38  
> 256   , 5     , , 160   7.18  , 7.4   , Cur   , 0.22  
> 512   , 5     , , 160   9.81  , 9.44  , New   , 0.37  
> 192   , 6     , , 192   9.12  , 7.77  , New   , 1.35  
> 256   , 6     , , 192   7.97  , 7.66  , New   , 0.31  
> 512   , 6     , , 192   10.14 , 9.95  , New   , 0.19  
> 192   , 7     , , 224   8.96  , 7.78  , New   , 1.18  
> 256   , 7     , , 224   8.52  , 8.23  , New   , 0.29  
> 512   , 7     , , 224   10.33 , 9.98  , New   , 0.35  
> 2     , 0     , , 1     3.61  , 3.6   , New   , 0.01  
> 2     , 1     , , 1     3.6   , 3.6   , Eq    , 0.0
> 0     , 0     , , 1     3.02  , 3.0   , New   , 0.02  
> 0     , 1     , , 1     3.0   , 3.0   , Eq    , 0.0
> 3     , 0     , , 2     3.6   , 3.6   , Eq    , 0.0
> 3     , 2     , , 2     3.61  , 3.6   , New   , 0.01  
> 1     , 0     , , 2     4.82  , 3.6   , New   , 1.22  
> 1     , 2     , , 2     4.81  , 3.6   , New   , 1.21  
> 4     , 0     , , 3     3.61  , 3.61  , Eq    , 0.0
> 4     , 3     , , 3     3.62  , 3.61  , New   , 0.01  
> 2     , 0     , , 3     4.82  , 3.62  , New   , 1.2   
> 2     , 3     , , 3     4.83  , 3.63  , New   , 1.2   
> 5     , 0     , , 4     3.63  , 3.64  , Cur   , 0.01  
> 5     , 4     , , 4     3.63  , 3.62  , New   , 0.01  
> 3     , 0     , , 4     4.84  , 3.62  , New   , 1.22  
> 3     , 4     , , 4     4.84  , 3.64  , New   , 1.2   
> 6     , 0     , , 5     3.66  , 3.64  , New   , 0.02  
> 6     , 5     , , 5     3.65  , 3.62  , New   , 0.03  
> 4     , 0     , , 5     4.83  , 3.63  , New   , 1.2   
> 4     , 5     , , 5     4.85  , 3.64  , New   , 1.21  
> 7     , 0     , , 6     3.76  , 3.79  , Cur   , 0.03  
> 7     , 6     , , 6     3.76  , 3.72  , New   , 0.04  
> 5     , 0     , , 6     4.84  , 3.62  , New   , 1.22  
> 5     , 6     , , 6     4.85  , 3.64  , New   , 1.21  
> 8     , 0     , , 7     3.64  , 3.65  , Cur   , 0.01  
> 8     , 7     , , 7     3.65  , 3.65  , Eq    , 0.0
> 6     , 0     , , 7     4.88  , 3.64  , New   , 1.24  
> 6     , 7     , , 7     4.87  , 3.65  , New   , 1.22  
> 9     , 0     , , 8     3.66  , 3.66  , Eq    , 0.0
> 7     , 0     , , 8     4.89  , 3.66  , New   , 1.23  
> 10    , 0     , , 9     3.67  , 3.67  , Eq    , 0.0
> 10    , 1     , , 9     3.67  , 3.67  , Eq    , 0.0
> 8     , 0     , , 9     4.9   , 3.67  , New   , 1.23  
> 8     , 1     , , 9     4.9   , 3.67  , New   , 1.23  
> 11    , 0     , , 10    3.68  , 3.67  , New   , 0.01  
> 11    , 2     , , 10    3.69  , 3.67  , New   , 0.02  
> 9     , 0     , , 10    4.9   , 3.67  , New   , 1.23  
> 9     , 2     , , 10    4.9   , 3.67  , New   , 1.23  
> 12    , 0     , , 11    3.71  , 3.68  , New   , 0.03  
> 12    , 3     , , 11    3.71  , 3.67  , New   , 0.04  
> 10    , 0     , , 11    4.9   , 3.67  , New   , 1.23  
> 10    , 3     , , 11    4.9   , 3.67  , New   , 1.23  
> 13    , 0     , , 12    4.24  , 4.23  , New   , 0.01  
> 13    , 4     , , 12    4.23  , 4.23  , Eq    , 0.0
> 11    , 0     , , 12    4.9   , 3.7   , New   , 1.2   
> 11    , 4     , , 12    4.9   , 3.73  , New   , 1.17  
> 14    , 0     , , 13    3.99  , 4.01  , Cur   , 0.02  
> 14    , 5     , , 13    3.98  , 3.98  , Eq    , 0.0
> 12    , 0     , , 13    4.9   , 3.69  , New   , 1.21  
> 12    , 5     , , 13    4.9   , 3.69  , New   , 1.21  
> 15    , 0     , , 14    3.99  , 3.97  , New   , 0.02  
> 15    , 6     , , 14    4.0   , 4.0   , Eq    , 0.0
> 13    , 0     , , 14    4.9   , 3.67  , New   , 1.23  
> 13    , 6     , , 14    4.9   , 3.67  , New   , 1.23  
> 16    , 0     , , 15    3.99  , 4.02  , Cur   , 0.03  
> 16    , 7     , , 15    4.01  , 3.96  , New   , 0.05  
> 14    , 0     , , 15    4.93  , 3.67  , New   , 1.26  
> 14    , 7     , , 15    4.92  , 3.67  , New   , 1.25  
> 17    , 0     , , 16    4.04  , 3.99  , New   , 0.05  
> 15    , 0     , , 16    5.42  , 4.22  , New   , 1.2   
> 18    , 0     , , 17    4.01  , 3.97  , New   , 0.04  
> 18    , 1     , , 17    3.99  , 3.98  , New   , 0.01  
> 16    , 0     , , 17    5.22  , 3.98  , New   , 1.24  
> 16    , 1     , , 17    5.19  , 3.98  , New   , 1.21  
> 19    , 0     , , 18    4.0   , 3.99  , New   , 0.01  
> 19    , 2     , , 18    4.03  , 3.97  , New   , 0.06  
> 17    , 0     , , 18    5.18  , 3.99  , New   , 1.19  
> 17    , 2     , , 18    5.18  , 3.98  , New   , 1.2   
> 20    , 0     , , 19    4.02  , 3.98  , New   , 0.04  
> 20    , 3     , , 19    4.0   , 3.98  , New   , 0.02  
> 18    , 0     , , 19    5.19  , 3.97  , New   , 1.22  
> 18    , 3     , , 19    5.21  , 3.98  , New   , 1.23  
> 21    , 0     , , 20    3.98  , 4.0   , Cur   , 0.02  
> 21    , 4     , , 20    4.0   , 4.0   , Eq    , 0.0
> 19    , 0     , , 20    5.19  , 3.99  , New   , 1.2   
> 19    , 4     , , 20    5.17  , 3.99  , New   , 1.18  
> 22    , 0     , , 21    4.03  , 3.98  , New   , 0.05  
> 22    , 5     , , 21    4.01  , 3.95  , New   , 0.06  
> 20    , 0     , , 21    5.19  , 4.0   , New   , 1.19  
> 20    , 5     , , 21    5.21  , 3.99  , New   , 1.22  
> 23    , 0     , , 22    4.06  , 3.97  , New   , 0.09  
> 23    , 6     , , 22    4.02  , 3.98  , New   , 0.04  
> 21    , 0     , , 22    5.2   , 4.02  , New   , 1.18  
> 21    , 6     , , 22    5.22  , 4.0   , New   , 1.22  
> 24    , 0     , , 23    4.15  , 3.98  , New   , 0.17  
> 24    , 7     , , 23    4.0   , 4.01  , Cur   , 0.01  
> 22    , 0     , , 23    5.28  , 4.0   , New   , 1.28  
> 22    , 7     , , 23    5.22  , 3.99  , New   , 1.23  
> 25    , 0     , , 24    4.1   , 4.04  , New   , 0.06  
> 23    , 0     , , 24    5.23  , 4.04  , New   , 1.19  
> 26    , 0     , , 25    4.1   , 4.06  , New   , 0.04  
> 26    , 1     , , 25    4.07  , 3.99  , New   , 0.08  
> 24    , 0     , , 25    5.26  , 4.02  , New   , 1.24  
> 24    , 1     , , 25    5.21  , 4.0   , New   , 1.21  
> 27    , 0     , , 26    4.17  , 4.03  , New   , 0.14  
> 27    , 2     , , 26    4.09  , 4.03  , New   , 0.06  
> 25    , 0     , , 26    5.29  , 4.1   , New   , 1.19  
> 25    , 2     , , 26    5.25  , 4.0   , New   , 1.25  
> 28    , 0     , , 27    4.06  , 4.1   , Cur   , 0.04  
> 28    , 3     , , 27    4.09  , 4.04  , New   , 0.05  
> 26    , 0     , , 27    5.26  , 4.04  , New   , 1.22  
> 26    , 3     , , 27    5.28  , 4.01  , New   , 1.27  
> 29    , 0     , , 28    4.07  , 4.02  , New   , 0.05  
> 29    , 4     , , 28    4.07  , 4.05  , New   , 0.02  
> 27    , 0     , , 28    5.25  , 4.02  , New   , 1.23  
> 27    , 4     , , 28    5.25  , 4.03  , New   , 1.22  
> 30    , 0     , , 29    4.14  , 4.06  , New   , 0.08  
> 30    , 5     , , 29    4.08  , 4.04  , New   , 0.04  
> 28    , 0     , , 29    5.26  , 4.07  , New   , 1.19  
> 28    , 5     , , 29    5.28  , 4.04  , New   , 1.24  
> 31    , 0     , , 30    4.09  , 4.08  , New   , 0.01  
> 31    , 6     , , 30    4.1   , 4.08  , New   , 0.02  
> 29    , 0     , , 30    5.28  , 4.05  , New   , 1.23  
> 29    , 6     , , 30    5.24  , 4.07  , New   , 1.17  
> 32    , 0     , , 31    4.1   , 4.13  , Cur   , 0.03  
> 32    , 7     , , 31    4.16  , 4.09  , New   , 0.07  
> 30    , 0     , , 31    5.31  , 4.09  , New   , 1.22  
> 30    , 7     , , 31    5.28  , 4.08  , New   , 1.2
> 
> Results For Icelake memchr-avx2
> size  , algn  , Pos   , Cur T , New T , Win   , Dif   
> 2048  , 0     , , 32    5.74  , 5.08  , New   , 0.66  
> 256   , 1     , , 64    5.16  , 4.93  , New   , 0.23  
> 2048  , 0     , , 64    4.86  , 4.69  , New   , 0.17  
> 256   , 2     , , 64    4.78  , 4.7   , New   , 0.08  
> 2048  , 0     , , 128   5.64  , 5.0   , New   , 0.64  
> 256   , 3     , , 64    4.64  , 4.59  , New   , 0.05  
> 2048  , 0     , , 256   9.07  , 9.17  , Cur   , 0.1   
> 256   , 4     , , 64    4.7   , 4.6   , New   , 0.1   
> 2048  , 0     , , 512   12.56 , 12.33 , New   , 0.23  
> 256   , 5     , , 64    4.72  , 4.61  , New   , 0.11  
> 2048  , 0     , , 1024  19.36 , 19.49 , Cur   , 0.13  
> 256   , 6     , , 64    4.82  , 4.69  , New   , 0.13  
> 2048  , 0     , , 2048  29.99 , 30.53 , Cur   , 0.54  
> 256   , 7     , , 64    4.9   , 4.85  , New   , 0.05  
> 192   , 1     , , 32    4.89  , 4.45  , New   , 0.44  
> 256   , 1     , , 32    4.93  , 4.44  , New   , 0.49  
> 512   , 1     , , 32    4.97  , 4.45  , New   , 0.52  
> 192   , 2     , , 64    5.04  , 4.65  , New   , 0.39  
> 512   , 2     , , 64    4.75  , 4.66  , New   , 0.09  
> 192   , 3     , , 96    5.14  , 4.66  , New   , 0.48  
> 256   , 3     , , 96    5.12  , 4.66  , New   , 0.46  
> 512   , 3     , , 96    5.13  , 4.62  , New   , 0.51  
> 192   , 4     , , 128   5.65  , 4.95  , New   , 0.7   
> 256   , 4     , , 128   5.63  , 4.95  , New   , 0.68  
> 512   , 4     , , 128   5.68  , 4.96  , New   , 0.72  
> 192   , 5     , , 160   6.1   , 5.84  , New   , 0.26  
> 256   , 5     , , 160   5.58  , 5.84  , Cur   , 0.26  
> 512   , 5     , , 160   7.95  , 7.74  , New   , 0.21  
> 192   , 6     , , 192   7.07  , 6.23  , New   , 0.84  
> 256   , 6     , , 192   6.34  , 6.09  , New   , 0.25  
> 512   , 6     , , 192   8.17  , 8.13  , New   , 0.04  
> 192   , 7     , , 224   7.06  , 6.23  , New   , 0.83  
> 256   , 7     , , 224   6.76  , 6.65  , New   , 0.11  
> 512   , 7     , , 224   8.29  , 8.08  , New   , 0.21  
> 2     , 0     , , 1     3.0   , 3.04  , Cur   , 0.04  
> 2     , 1     , , 1     3.06  , 3.07  , Cur   , 0.01  
> 0     , 0     , , 1     2.57  , 2.59  , Cur   , 0.02  
> 0     , 1     , , 1     2.6   , 2.61  , Cur   , 0.01  
> 3     , 0     , , 2     3.15  , 3.17  , Cur   , 0.02  
> 3     , 2     , , 2     3.19  , 3.21  , Cur   , 0.02  
> 1     , 0     , , 2     4.32  , 3.25  , New   , 1.07  
> 1     , 2     , , 2     4.36  , 3.31  , New   , 1.05  
> 4     , 0     , , 3     3.5   , 3.52  , Cur   , 0.02  
> 4     , 3     , , 3     3.52  , 3.54  , Cur   , 0.02  
> 2     , 0     , , 3     4.51  , 3.43  , New   , 1.08  
> 2     , 3     , , 3     4.56  , 3.47  , New   , 1.09  
> 5     , 0     , , 4     3.61  , 3.65  , Cur   , 0.04  
> 5     , 4     , , 4     3.63  , 3.67  , Cur   , 0.04  
> 3     , 0     , , 4     4.64  , 3.51  , New   , 1.13  
> 3     , 4     , , 4     4.7   , 3.51  , New   , 1.19  
> 6     , 0     , , 5     3.66  , 3.68  , Cur   , 0.02  
> 6     , 5     , , 5     3.69  , 3.65  , New   , 0.04  
> 4     , 0     , , 5     4.7   , 3.49  , New   , 1.21  
> 4     , 5     , , 5     4.58  , 3.48  , New   , 1.1   
> 7     , 0     , , 6     3.6   , 3.65  , Cur   , 0.05  
> 7     , 6     , , 6     3.59  , 3.64  , Cur   , 0.05  
> 5     , 0     , , 6     4.74  , 3.65  , New   , 1.09  
> 5     , 6     , , 6     4.73  , 3.64  , New   , 1.09  
> 8     , 0     , , 7     3.6   , 3.61  , Cur   , 0.01  
> 8     , 7     , , 7     3.6   , 3.61  , Cur   , 0.01  
> 6     , 0     , , 7     4.73  , 3.6   , New   , 1.13  
> 6     , 7     , , 7     4.73  , 3.62  , New   , 1.11  
> 9     , 0     , , 8     3.59  , 3.62  , Cur   , 0.03  
> 7     , 0     , , 8     4.72  , 3.64  , New   , 1.08  
> 10    , 0     , , 9     3.57  , 3.62  , Cur   , 0.05  
> 10    , 1     , , 9     3.56  , 3.61  , Cur   , 0.05  
> 8     , 0     , , 9     4.69  , 3.63  , New   , 1.06  
> 8     , 1     , , 9     4.71  , 3.61  , New   , 1.1   
> 11    , 0     , , 10    3.58  , 3.62  , Cur   , 0.04  
> 11    , 2     , , 10    3.59  , 3.63  , Cur   , 0.04  
> 9     , 0     , , 10    4.72  , 3.61  , New   , 1.11  
> 9     , 2     , , 10    4.7   , 3.61  , New   , 1.09  
> 12    , 0     , , 11    3.58  , 3.63  , Cur   , 0.05  
> 12    , 3     , , 11    3.58  , 3.62  , Cur   , 0.04  
> 10    , 0     , , 11    4.7   , 3.6   , New   , 1.1   
> 10    , 3     , , 11    4.73  , 3.64  , New   , 1.09  
> 13    , 0     , , 12    3.6   , 3.6   , Eq    , 0.0
> 13    , 4     , , 12    3.57  , 3.62  , Cur   , 0.05  
> 11    , 0     , , 12    4.73  , 3.62  , New   , 1.11  
> 11    , 4     , , 12    4.79  , 3.61  , New   , 1.18  
> 14    , 0     , , 13    3.61  , 3.62  , Cur   , 0.01  
> 14    , 5     , , 13    3.59  , 3.59  , Eq    , 0.0
> 12    , 0     , , 13    4.7   , 3.61  , New   , 1.09  
> 12    , 5     , , 13    4.75  , 3.58  , New   , 1.17  
> 15    , 0     , , 14    3.58  , 3.62  , Cur   , 0.04  
> 15    , 6     , , 14    3.59  , 3.62  , Cur   , 0.03  
> 13    , 0     , , 14    4.68  , 3.6   , New   , 1.08  
> 13    , 6     , , 14    4.68  , 3.63  , New   , 1.05  
> 16    , 0     , , 15    3.57  , 3.6   , Cur   , 0.03  
> 16    , 7     , , 15    3.55  , 3.59  , Cur   , 0.04  
> 14    , 0     , , 15    4.69  , 3.61  , New   , 1.08  
> 14    , 7     , , 15    4.69  , 3.61  , New   , 1.08  
> 17    , 0     , , 16    3.56  , 3.61  , Cur   , 0.05  
> 15    , 0     , , 16    4.71  , 3.58  , New   , 1.13  
> 18    , 0     , , 17    3.57  , 3.65  , Cur   , 0.08  
> 18    , 1     , , 17    3.58  , 3.59  , Cur   , 0.01  
> 16    , 0     , , 17    4.7   , 3.58  , New   , 1.12  
> 16    , 1     , , 17    4.68  , 3.59  , New   , 1.09  
> 19    , 0     , , 18    3.51  , 3.58  , Cur   , 0.07  
> 19    , 2     , , 18    3.55  , 3.58  , Cur   , 0.03  
> 17    , 0     , , 18    4.69  , 3.61  , New   , 1.08  
> 17    , 2     , , 18    4.68  , 3.61  , New   , 1.07  
> 20    , 0     , , 19    3.57  , 3.6   , Cur   , 0.03  
> 20    , 3     , , 19    3.59  , 3.59  , Eq    , 0.0
> 18    , 0     , , 19    4.68  , 3.59  , New   , 1.09  
> 18    , 3     , , 19    4.67  , 3.57  , New   , 1.1   
> 21    , 0     , , 20    3.61  , 3.58  , New   , 0.03  
> 21    , 4     , , 20    3.62  , 3.6   , New   , 0.02  
> 19    , 0     , , 20    4.74  , 3.57  , New   , 1.17  
> 19    , 4     , , 20    4.69  , 3.7   , New   , 0.99  
> 22    , 0     , , 21    3.57  , 3.64  , Cur   , 0.07  
> 22    , 5     , , 21    3.55  , 3.6   , Cur   , 0.05  
> 20    , 0     , , 21    4.72  , 3.55  , New   , 1.17  
> 20    , 5     , , 21    4.66  , 3.55  , New   , 1.11  
> 23    , 0     , , 22    3.56  , 3.56  , Eq    , 0.0
> 23    , 6     , , 22    3.54  , 3.56  , Cur   , 0.02  
> 21    , 0     , , 22    4.65  , 3.53  , New   , 1.12  
> 21    , 6     , , 22    4.62  , 3.56  , New   , 1.06  
> 24    , 0     , , 23    3.5   , 3.54  , Cur   , 0.04  
> 24    , 7     , , 23    3.52  , 3.53  , Cur   , 0.01  
> 22    , 0     , , 23    4.61  , 3.51  , New   , 1.1   
> 22    , 7     , , 23    4.6   , 3.51  , New   , 1.09  
> 25    , 0     , , 24    3.5   , 3.53  , Cur   , 0.03  
> 23    , 0     , , 24    4.54  , 3.5   , New   , 1.04  
> 26    , 0     , , 25    3.47  , 3.49  , Cur   , 0.02  
> 26    , 1     , , 25    3.46  , 3.51  , Cur   , 0.05  
> 24    , 0     , , 25    4.53  , 3.51  , New   , 1.02  
> 24    , 1     , , 25    4.51  , 3.51  , New   , 1.0   
> 27    , 0     , , 26    3.44  , 3.51  , Cur   , 0.07  
> 27    , 2     , , 26    3.51  , 3.52  , Cur   , 0.01  
> 25    , 0     , , 26    4.56  , 3.46  , New   , 1.1   
> 25    , 2     , , 26    4.55  , 3.47  , New   , 1.08  
> 28    , 0     , , 27    3.47  , 3.5   , Cur   , 0.03  
> 28    , 3     , , 27    3.48  , 3.47  , New   , 0.01  
> 26    , 0     , , 27    4.52  , 3.44  , New   , 1.08  
> 26    , 3     , , 27    4.55  , 3.46  , New   , 1.09  
> 29    , 0     , , 28    3.45  , 3.49  , Cur   , 0.04  
> 29    , 4     , , 28    3.5   , 3.5   , Eq    , 0.0
> 27    , 0     , , 28    4.56  , 3.49  , New   , 1.07  
> 27    , 4     , , 28    4.5   , 3.49  , New   , 1.01  
> 30    , 0     , , 29    3.44  , 3.48  , Cur   , 0.04  
> 30    , 5     , , 29    3.46  , 3.47  , Cur   , 0.01  
> 28    , 0     , , 29    4.49  , 3.43  , New   , 1.06  
> 28    , 5     , , 29    4.57  , 3.45  , New   , 1.12  
> 31    , 0     , , 30    3.48  , 3.48  , Eq    , 0.0
> 31    , 6     , , 30    3.46  , 3.49  , Cur   , 0.03  
> 29    , 0     , , 30    4.49  , 3.44  , New   , 1.05  
> 29    , 6     , , 30    4.53  , 3.44  , New   , 1.09  
> 32    , 0     , , 31    3.44  , 3.45  , Cur   , 0.01  
> 32    , 7     , , 31    3.46  , 3.51  , Cur   , 0.05  
> 30    , 0     , , 31    4.48  , 3.42  , New   , 1.06  
> 30    , 7     , , 31    4.48  , 3.44  , New   , 1.04
> 
> 
> Results For Skylake memchr-avx2
> size  , algn  , Pos   , Cur T , New T , Win   , Dif   
> 2048  , 0     , , 32    6.61  , 5.4   , New   , 1.21  
> 256   , 1     , , 64    6.52  , 5.68  , New   , 0.84  
> 2048  , 0     , , 64    6.03  , 5.47  , New   , 0.56  
> 256   , 2     , , 64    6.07  , 5.42  , New   , 0.65  
> 2048  , 0     , , 128   7.01  , 5.83  , New   , 1.18  
> 256   , 3     , , 64    6.24  , 5.68  , New   , 0.56  
> 2048  , 0     , , 256   11.03 , 9.86  , New   , 1.17  
> 256   , 4     , , 64    6.17  , 5.49  , New   , 0.68  
> 2048  , 0     , , 512   14.11 , 13.41 , New   , 0.7   
> 256   , 5     , , 64    6.03  , 5.45  , New   , 0.58  
> 2048  , 0     , , 1024  19.82 , 19.92 , Cur   , 0.1   
> 256   , 6     , , 64    6.14  , 5.7   , New   , 0.44  
> 2048  , 0     , , 2048  30.9  , 30.59 , New   , 0.31  
> 256   , 7     , , 64    6.05  , 5.64  , New   , 0.41  
> 192   , 1     , , 32    5.6   , 4.89  , New   , 0.71  
> 256   , 1     , , 32    5.59  , 5.07  , New   , 0.52  
> 512   , 1     , , 32    5.58  , 4.93  , New   , 0.65  
> 192   , 2     , , 64    6.14  , 5.46  , New   , 0.68  
> 512   , 2     , , 64    5.95  , 5.38  , New   , 0.57  
> 192   , 3     , , 96    6.6   , 5.74  , New   , 0.86  
> 256   , 3     , , 96    6.48  , 5.37  , New   , 1.11  
> 512   , 3     , , 96    6.56  , 5.44  , New   , 1.12  
> 192   , 4     , , 128   7.04  , 6.02  , New   , 1.02  
> 256   , 4     , , 128   6.96  , 5.89  , New   , 1.07  
> 512   , 4     , , 128   6.97  , 5.99  , New   , 0.98  
> 192   , 5     , , 160   8.49  , 7.07  , New   , 1.42  
> 256   , 5     , , 160   8.1   , 6.96  , New   , 1.14  
> 512   , 5     , , 160   10.48 , 9.14  , New   , 1.34  
> 192   , 6     , , 192   8.46  , 8.52  , Cur   , 0.06  
> 256   , 6     , , 192   8.53  , 7.58  , New   , 0.95  
> 512   , 6     , , 192   10.88 , 9.06  , New   , 1.82  
> 192   , 7     , , 224   8.59  , 8.35  , New   , 0.24  
> 256   , 7     , , 224   8.86  , 7.91  , New   , 0.95  
> 512   , 7     , , 224   10.89 , 8.98  , New   , 1.91  
> 2     , 0     , , 1     4.28  , 3.62  , New   , 0.66  
> 2     , 1     , , 1     4.32  , 3.75  , New   , 0.57  
> 0     , 0     , , 1     3.76  , 3.24  , New   , 0.52  
> 0     , 1     , , 1     3.7   , 3.19  , New   , 0.51  
> 3     , 0     , , 2     4.16  , 3.67  , New   , 0.49  
> 3     , 2     , , 2     4.21  , 3.68  , New   , 0.53  
> 1     , 0     , , 2     4.25  , 3.74  , New   , 0.51  
> 1     , 2     , , 2     4.4   , 3.82  , New   , 0.58  
> 4     , 0     , , 3     4.43  , 3.88  , New   , 0.55  
> 4     , 3     , , 3     4.34  , 3.8   , New   , 0.54  
> 2     , 0     , , 3     4.33  , 3.79  , New   , 0.54  
> 2     , 3     , , 3     4.37  , 3.84  , New   , 0.53  
> 5     , 0     , , 4     4.45  , 3.87  , New   , 0.58  
> 5     , 4     , , 4     4.41  , 3.84  , New   , 0.57  
> 3     , 0     , , 4     4.34  , 3.83  , New   , 0.51  
> 3     , 4     , , 4     4.35  , 3.82  , New   , 0.53  
> 6     , 0     , , 5     4.41  , 3.88  , New   , 0.53  
> 6     , 5     , , 5     4.41  , 3.88  , New   , 0.53  
> 4     , 0     , , 5     4.35  , 3.84  , New   , 0.51  
> 4     , 5     , , 5     4.37  , 3.85  , New   , 0.52  
> 7     , 0     , , 6     4.4   , 3.84  , New   , 0.56  
> 7     , 6     , , 6     4.39  , 3.83  , New   , 0.56  
> 5     , 0     , , 6     4.37  , 3.85  , New   , 0.52  
> 5     , 6     , , 6     4.4   , 3.86  , New   , 0.54  
> 8     , 0     , , 7     4.39  , 3.88  , New   , 0.51  
> 8     , 7     , , 7     4.4   , 3.83  , New   , 0.57  
> 6     , 0     , , 7     4.39  , 3.85  , New   , 0.54  
> 6     , 7     , , 7     4.38  , 3.87  , New   , 0.51  
> 9     , 0     , , 8     4.47  , 3.96  , New   , 0.51  
> 7     , 0     , , 8     4.37  , 3.85  , New   , 0.52  
> 10    , 0     , , 9     4.61  , 4.08  , New   , 0.53  
> 10    , 1     , , 9     4.61  , 4.09  , New   , 0.52  
> 8     , 0     , , 9     4.37  , 3.85  , New   , 0.52  
> 8     , 1     , , 9     4.37  , 3.85  , New   , 0.52  
> 11    , 0     , , 10    4.68  , 4.06  , New   , 0.62  
> 11    , 2     , , 10    4.56  , 4.1   , New   , 0.46  
> 9     , 0     , , 10    4.36  , 3.83  , New   , 0.53  
> 9     , 2     , , 10    4.37  , 3.83  , New   , 0.54  
> 12    , 0     , , 11    4.62  , 4.05  , New   , 0.57  
> 12    , 3     , , 11    4.63  , 4.06  , New   , 0.57  
> 10    , 0     , , 11    4.38  , 3.86  , New   , 0.52  
> 10    , 3     , , 11    4.41  , 3.86  , New   , 0.55  
> 13    , 0     , , 12    4.57  , 4.08  , New   , 0.49  
> 13    , 4     , , 12    4.59  , 4.12  , New   , 0.47  
> 11    , 0     , , 12    4.45  , 4.0   , New   , 0.45  
> 11    , 4     , , 12    4.51  , 4.04  , New   , 0.47  
> 14    , 0     , , 13    4.64  , 4.16  , New   , 0.48  
> 14    , 5     , , 13    4.67  , 4.1   , New   , 0.57  
> 12    , 0     , , 13    4.58  , 4.08  , New   , 0.5   
> 12    , 5     , , 13    4.6   , 4.1   , New   , 0.5   
> 15    , 0     , , 14    4.61  , 4.05  , New   , 0.56  
> 15    , 6     , , 14    4.59  , 4.06  , New   , 0.53  
> 13    , 0     , , 14    4.57  , 4.06  , New   , 0.51  
> 13    , 6     , , 14    4.57  , 4.05  , New   , 0.52  
> 16    , 0     , , 15    4.62  , 4.05  , New   , 0.57  
> 16    , 7     , , 15    4.63  , 4.06  , New   , 0.57  
> 14    , 0     , , 15    4.61  , 4.06  , New   , 0.55  
> 14    , 7     , , 15    4.59  , 4.05  , New   , 0.54  
> 17    , 0     , , 16    4.58  , 4.08  , New   , 0.5   
> 15    , 0     , , 16    4.64  , 4.06  , New   , 0.58  
> 18    , 0     , , 17    4.56  , 4.17  , New   , 0.39  
> 18    , 1     , , 17    4.59  , 4.09  , New   , 0.5   
> 16    , 0     , , 17    4.59  , 4.07  , New   , 0.52  
> 16    , 1     , , 17    4.58  , 4.04  , New   , 0.54  
> 19    , 0     , , 18    4.61  , 4.05  , New   , 0.56  
> 19    , 2     , , 18    4.6   , 4.08  , New   , 0.52  
> 17    , 0     , , 18    4.64  , 4.11  , New   , 0.53  
> 17    , 2     , , 18    4.56  , 4.13  , New   , 0.43  
> 20    , 0     , , 19    4.77  , 4.3   , New   , 0.47  
> 20    , 3     , , 19    4.6   , 4.14  , New   , 0.46  
> 18    , 0     , , 19    4.72  , 4.02  , New   , 0.7   
> 18    , 3     , , 19    4.53  , 4.01  , New   , 0.52  
> 21    , 0     , , 20    4.66  , 4.26  , New   , 0.4   
> 21    , 4     , , 20    4.74  , 4.07  , New   , 0.67  
> 19    , 0     , , 20    4.62  , 4.12  , New   , 0.5   
> 19    , 4     , , 20    4.57  , 4.04  , New   , 0.53  
> 22    , 0     , , 21    4.61  , 4.13  , New   , 0.48  
> 22    , 5     , , 21    4.64  , 4.08  , New   , 0.56  
> 20    , 0     , , 21    4.49  , 4.01  , New   , 0.48  
> 20    , 5     , , 21    4.58  , 4.06  , New   , 0.52  
> 23    , 0     , , 22    4.62  , 4.13  , New   , 0.49  
> 23    , 6     , , 22    4.72  , 4.27  , New   , 0.45  
> 21    , 0     , , 22    4.65  , 3.97  , New   , 0.68  
> 21    , 6     , , 22    4.5   , 4.02  , New   , 0.48  
> 24    , 0     , , 23    4.78  , 4.07  , New   , 0.71  
> 24    , 7     , , 23    4.67  , 4.23  , New   , 0.44  
> 22    , 0     , , 23    4.49  , 3.99  , New   , 0.5   
> 22    , 7     , , 23    4.56  , 4.03  , New   , 0.53  
> 25    , 0     , , 24    4.6   , 4.15  , New   , 0.45  
> 23    , 0     , , 24    4.57  , 4.06  , New   , 0.51  
> 26    , 0     , , 25    4.54  , 4.14  , New   , 0.4   
> 26    , 1     , , 25    4.72  , 4.1   , New   , 0.62  
> 24    , 0     , , 25    4.52  , 4.13  , New   , 0.39  
> 24    , 1     , , 25    4.55  , 4.0   , New   , 0.55  
> 27    , 0     , , 26    4.51  , 4.06  , New   , 0.45  
> 27    , 2     , , 26    4.53  , 4.16  , New   , 0.37  
> 25    , 0     , , 26    4.59  , 4.09  , New   , 0.5   
> 25    , 2     , , 26    4.55  , 4.01  , New   , 0.54  
> 28    , 0     , , 27    4.59  , 3.99  , New   , 0.6   
> 28    , 3     , , 27    4.57  , 3.95  , New   , 0.62  
> 26    , 0     , , 27    4.55  , 4.15  , New   , 0.4   
> 26    , 3     , , 27    4.57  , 3.99  , New   , 0.58  
> 29    , 0     , , 28    4.41  , 4.03  , New   , 0.38  
> 29    , 4     , , 28    4.59  , 4.02  , New   , 0.57  
> 27    , 0     , , 28    4.63  , 4.08  , New   , 0.55  
> 27    , 4     , , 28    4.44  , 4.02  , New   , 0.42  
> 30    , 0     , , 29    4.53  , 3.93  , New   , 0.6   
> 30    , 5     , , 29    4.55  , 3.88  , New   , 0.67  
> 28    , 0     , , 29    4.49  , 3.9   , New   , 0.59  
> 28    , 5     , , 29    4.44  , 3.94  , New   , 0.5   
> 31    , 0     , , 30    4.41  , 3.85  , New   , 0.56  
> 31    , 6     , , 30    4.48  , 3.86  , New   , 0.62  
> 29    , 0     , , 30    4.55  , 3.94  , New   , 0.61  
> 29    , 6     , , 30    4.32  , 3.95  , New   , 0.37  
> 32    , 0     , , 31    4.36  , 3.91  , New   , 0.45  
> 32    , 7     , , 31    4.37  , 3.89  , New   , 0.48  
> 30    , 0     , , 31    4.65  , 3.9   , New   , 0.75  
> 30    , 7     , , 31    4.42  , 3.93  , New   , 0.49  
> 
>  sysdeps/x86_64/multiarch/memchr-evex.S | 580 +++++++++++++++----------
>  1 file changed, 349 insertions(+), 231 deletions(-)
> 
> diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S
> index 6dd5d67b90..65c16ef8a4 100644
> --- a/sysdeps/x86_64/multiarch/memchr-evex.S
> +++ b/sysdeps/x86_64/multiarch/memchr-evex.S
> @@ -26,14 +26,28 @@
>  
>  # ifdef USE_AS_WMEMCHR
>  #  define VPBROADCAST	vpbroadcastd
> -#  define VPCMP		vpcmpd
> -#  define SHIFT_REG	r8d
> +#  define VPMINU	vpminud
> +#  define VPCMP	vpcmpd
> +#  define VPCMPEQ	vpcmpeqd
> +#  define CHAR_SIZE	4
>  # else
>  #  define VPBROADCAST	vpbroadcastb
> -#  define VPCMP		vpcmpb
> -#  define SHIFT_REG	ecx
> +#  define VPMINU	vpminub
> +#  define VPCMP	vpcmpb
> +#  define VPCMPEQ	vpcmpeqb
> +#  define CHAR_SIZE	1
>  # endif
>  
> +# ifdef USE_AS_RAWMEMCHR
> +#  define RAW_PTR_REG	rcx
> +#  define ALGN_PTR_REG	rdi
> +# else
> +#  define RAW_PTR_REG	rdi
> +#  define ALGN_PTR_REG	rcx
> +# endif
> +
> +#define XZERO		xmm23

Add a space before define.  Rename XZERO to XMMZERO.

> +#define YZERO		ymm23

Add a space before define.  Rename YZERO to YMMZERO.
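I.e. presumably:

# define XMMZERO	xmm23
# define YMMZERO	ymm23

which also matches the spacing of the XMMMATCH / YMMMATCH defines just
below.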

>  # define XMMMATCH	xmm16
>  # define YMMMATCH	ymm16
>  # define YMM1		ymm17
> @@ -44,18 +58,16 @@
>  # define YMM6		ymm22
>  
>  # define VEC_SIZE 32
> +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
> +# define PAGE_SIZE 4096
>  
>  	.section .text.evex,"ax",@progbits
> -ENTRY (MEMCHR)
> +ENTRY(MEMCHR)

No need for this change.

>  # ifndef USE_AS_RAWMEMCHR
>  	/* Check for zero length.  */
>  	test	%RDX_LP, %RDX_LP
>  	jz	L(zero)
> -# endif
> -	movl	%edi, %ecx
> -# ifdef USE_AS_WMEMCHR
> -	shl	$2, %RDX_LP
> -# else
> +
>  #  ifdef __ILP32__
>  	/* Clear the upper 32 bits.  */
>  	movl	%edx, %edx
> @@ -63,319 +75,425 @@ ENTRY (MEMCHR)
>  # endif
>  	/* Broadcast CHAR to YMMMATCH.  */
>  	VPBROADCAST %esi, %YMMMATCH
> -	/* Check if we may cross page boundary with one vector load.  */
> -	andl	$(2 * VEC_SIZE - 1), %ecx
> -	cmpl	$VEC_SIZE, %ecx
> -	ja	L(cros_page_boundary)
> +	/* Check if we may cross page boundary with one
> +	   vector load.  */

Fit comments to 72 columns.
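For instance, this first one can simply go back to the pre-patch single
line:

	/* Check if we may cross page boundary with one vector load.  */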

> +	movl	%edi, %eax
> +	andl	$(PAGE_SIZE - 1), %eax
> +	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
> +	ja	L(cross_page_boundary)
>  
>  	/* Check the first VEC_SIZE bytes.  */
> -	VPCMP	$0, (%rdi), %YMMMATCH, %k1
> -	kmovd	%k1, %eax
> -	testl	%eax, %eax
> -
> +	VPCMP	$0, (%rdi), %YMMMATCH, %k0
> +	kmovd	%k0, %eax
>  # ifndef USE_AS_RAWMEMCHR
> -	jnz	L(first_vec_x0_check)
> -	/* Adjust length and check the end of data.  */
> -	subq	$VEC_SIZE, %rdx
> -	jbe	L(zero)
> +	/* If length < CHAR_PER_VEC handle special.  */
> +	cmpq	$CHAR_PER_VEC, %rdx
> +	jbe	L(first_vec_x0)
> +# endif
> +	testl	%eax, %eax
> +	jz	L(aligned_more)
> +	tzcntl	%eax, %eax
> +# ifdef USE_AS_WMEMCHR
> +	/* NB: Multiply bytes by CHAR_SIZE to get the
> +	   wchar_t count.  */

Fit comments to 72 columns.

> +	leaq	(%rdi, %rax, CHAR_SIZE), %rax
>  # else
> -	jnz	L(first_vec_x0)
> +	addq	%rdi, %rax
>  # endif
> -
> -	/* Align data for aligned loads in the loop.  */
> -	addq	$VEC_SIZE, %rdi
> -	andl	$(VEC_SIZE - 1), %ecx
> -	andq	$-VEC_SIZE, %rdi
> +	ret
>  
>  # ifndef USE_AS_RAWMEMCHR
> -	/* Adjust length.  */
> -	addq	%rcx, %rdx
> -
> -	subq	$(VEC_SIZE * 4), %rdx
> -	jbe	L(last_4x_vec_or_less)
> -# endif
> -	jmp	L(more_4x_vec)
> +L(zero):
> +	xorl	%eax, %eax
> +	ret
>  
> +	.p2align 5
> +L(first_vec_x0):
> +	/* Check if first match was before length.  */
> +	tzcntl	%eax, %eax
> +	xorl	%ecx, %ecx
> +	cmpl	%eax, %edx
> +	leaq	(%rdi, %rax, CHAR_SIZE), %rax
> +	cmovle	%rcx, %rax
> +	ret
> +# else
> +	/* NB: first_vec_x0 is 17 bytes which will leave
> +	   cross_page_boundary (which is relatively cold) close
> +	   enough to ideal alignment. So only realign
> +	   L(cross_page_boundary) if rawmemchr.  */

Fit comments to 72 columns.

>  	.p2align 4
> -L(cros_page_boundary):
> -	andl	$(VEC_SIZE - 1), %ecx
> +# endif
> +L(cross_page_boundary):
> +	/* Save pointer before aligning as its original
> +	   value is necessary for computer return address if byte is
> +	   found or adjusting length if it is not and this is
> +	   memchr.  */

Fit comments to 72 columns.

> +	movq	%rdi, %rcx
> +	/* Align data to VEC_SIZE. ALGN_PTR_REG is rcx
> +	   for memchr and rdi for rawmemchr.  */

Fit comments to 72 columns.

> +	andq	$-VEC_SIZE, %ALGN_PTR_REG
> +	VPCMP	$0, (%ALGN_PTR_REG), %YMMMATCH, %k0
> +	kmovd	%k0, %r8d
>  # ifdef USE_AS_WMEMCHR
> -	/* NB: Divide shift count by 4 since each bit in K1 represent 4
> -	   bytes.  */
> -	movl	%ecx, %SHIFT_REG
> -	sarl	$2, %SHIFT_REG
> +	/* NB: Divide shift count by 4 since each bit in
> +	   K0 represent 4 bytes.  */
> +	sarl	$2, %eax
> +# endif
> +# ifndef USE_AS_RAWMEMCHR
> +	movl	$(PAGE_SIZE / CHAR_SIZE), %esi
> +	subl	%eax, %esi
>  # endif
> -	andq	$-VEC_SIZE, %rdi
> -	VPCMP	$0, (%rdi), %YMMMATCH, %k1
> -	kmovd	%k1, %eax
> -	/* Remove the leading bytes.  */
> -	sarxl	%SHIFT_REG, %eax, %eax
> -	testl	%eax, %eax
> -	jz	L(aligned_more)
> -	tzcntl	%eax, %eax
>  # ifdef USE_AS_WMEMCHR
> -	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
> -	sall	$2, %eax
> +	andl	$(CHAR_PER_VEC - 1), %eax
>  # endif
> +	/* Remove the leading bytes.  */
> +	sarxl	%eax, %r8d, %eax
>  # ifndef USE_AS_RAWMEMCHR
>  	/* Check the end of data.  */
> -	cmpq	%rax, %rdx
> -	jbe	L(zero)
> +	cmpq	%rsi, %rdx
> +	jbe	L(first_vec_x0)
> +# endif
> +	testl	%eax, %eax
> +	jz	L(cross_page_continue)
> +	tzcntl	%eax, %eax
> +# ifdef USE_AS_WMEMCHR
> +	/* NB: Multiply bytes by CHAR_SIZE to get the
> +	   wchar_t count.  */
> +	leaq	(%RAW_PTR_REG, %rax, CHAR_SIZE), %rax
> +# else
> +	addq	%RAW_PTR_REG, %rax
>  # endif
> -	addq	%rdi, %rax
> -	addq	%rcx, %rax
>  	ret
>  
>  	.p2align 4
> -L(aligned_more):
> -# ifndef USE_AS_RAWMEMCHR
> -        /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)"
> -	   instead of "(rdx + rcx) - VEC_SIZE" to void possible addition
> -	   overflow.  */
> -	negq	%rcx
> -	addq	$VEC_SIZE, %rcx
> +L(first_vec_x1):
> +	tzcntl	%eax, %eax
> +	leaq	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
> +	ret
>  
> -	/* Check the end of data.  */
> -	subq	%rcx, %rdx
> -	jbe	L(zero)
> -# endif
> +	.p2align 4
> +L(first_vec_x2):
> +	tzcntl	%eax, %eax
> +	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> +	ret
>  
> -	addq	$VEC_SIZE, %rdi
> +	.p2align 4
> +L(first_vec_x3):
> +	tzcntl	%eax, %eax
> +	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
> +	ret
> +
> +	.p2align 4
> +L(first_vec_x4):
> +	tzcntl	%eax, %eax
> +	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
> +	ret
> +
> +	.p2align 5
> +L(aligned_more):
> +	/* Check the first 4 * VEC_SIZE.  Only one
> +	   VEC_SIZE at a time since data is only aligned to
> +	   VEC_SIZE.  */

Fit comments to 72 columns.

>  
>  # ifndef USE_AS_RAWMEMCHR
> -	subq	$(VEC_SIZE * 4), %rdx
> +	/* Align data to VEC_SIZE.  */
> +L(cross_page_continue):
> +	xorl	%ecx, %ecx
> +	subl	%edi, %ecx
> +	andq	$-VEC_SIZE, %rdi
> +	/* esi is for adjusting length to see if near the
> +	   end.  */

Fit comments to 72 columns.

> +	leal	(VEC_SIZE * 5)(%rdi, %rcx), %esi
> +#  ifdef USE_AS_WMEMCHR
> +	/* NB: Divide bytes by 4 to get the wchar_t
> +	   count.  */
> +	sarl	$2, %esi
> +#  endif
> +# else
> +	andq	$-VEC_SIZE, %rdi
> +L(cross_page_continue):
> +# endif
> +	/* Load first VEC regardless.  */
> +	VPCMP	$0, (VEC_SIZE)(%rdi), %YMMMATCH, %k0
> +	kmovd	%k0, %eax
> +# ifndef USE_AS_RAWMEMCHR
> +	/* Adjust length. If near end handle specially.
> +	 */

Fit comments to 72 columns.

> +	subq	%rsi, %rdx
>  	jbe	L(last_4x_vec_or_less)
>  # endif
> -
> -L(more_4x_vec):
> -	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
> -	   since data is only aligned to VEC_SIZE.  */
> -	VPCMP	$0, (%rdi), %YMMMATCH, %k1
> -	kmovd	%k1, %eax
> -	testl	%eax, %eax
> -	jnz	L(first_vec_x0)
> -
> -	VPCMP	$0, VEC_SIZE(%rdi), %YMMMATCH, %k1
> -	kmovd	%k1, %eax
>  	testl	%eax, %eax
>  	jnz	L(first_vec_x1)
>  
> -	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
> -	kmovd	%k1, %eax
> +	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
> +	kmovd	%k0, %eax
>  	testl	%eax, %eax
>  	jnz	L(first_vec_x2)
>  
> -	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
> -	kmovd	%k1, %eax
> +	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
> +	kmovd	%k0, %eax
>  	testl	%eax, %eax
>  	jnz	L(first_vec_x3)
>  
> -	addq	$(VEC_SIZE * 4), %rdi
> +	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
> +	kmovd	%k0, %eax
> +	testl	%eax, %eax
> +	jnz	L(first_vec_x4)
> +
>  
>  # ifndef USE_AS_RAWMEMCHR
> -	subq	$(VEC_SIZE * 4), %rdx
> -	jbe	L(last_4x_vec_or_less)
> -# endif
> +	/* Check if at last CHAR_PER_VEC * 4 length.  */
> +	subq	$(CHAR_PER_VEC * 4), %rdx
> +	jbe	L(last_4x_vec_or_less_cmpeq)
> +	addq	$VEC_SIZE, %rdi
>  
> -	/* Align data to 4 * VEC_SIZE.  */
> -	movq	%rdi, %rcx
> -	andl	$(4 * VEC_SIZE - 1), %ecx
> +	/* Align data to VEC_SIZE * 4 for the loop and
> +	   readjust length.  */

Fit comments to 72 columns.

> +#  ifdef USE_AS_WMEMCHR
> +	movl	%edi, %ecx
>  	andq	$-(4 * VEC_SIZE), %rdi
> -
> -# ifndef USE_AS_RAWMEMCHR
> -	/* Adjust length.  */
> +	andl	$(VEC_SIZE * 4 - 1), %ecx
> +	/* NB: Divide bytes by 4 to get the wchar_t
> +	   count.  */

Fit comments to 72 columns.

> +	sarl	$2, %ecx
>  	addq	%rcx, %rdx
> +#  else
> +	addq	%rdi, %rdx
> +	andq	$-(4 * VEC_SIZE), %rdi
> +	subq	%rdi, %rdx
> +#  endif
> +# else
> +	addq	$VEC_SIZE, %rdi
> +	andq	$-(4 * VEC_SIZE), %rdi
>  # endif
>  
> +	vpxorq	%XZERO, %XZERO, %XZERO
> +
> +	/* Compare 4 * VEC at a time forward.  */
>  	.p2align 4
>  L(loop_4x_vec):
> -	/* Compare 4 * VEC at a time forward.  */
> -	VPCMP	$0, (%rdi), %YMMMATCH, %k1
> -	VPCMP	$0, VEC_SIZE(%rdi), %YMMMATCH, %k2
> -	kord	%k1, %k2, %k5
> -	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3
> -	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4
> -
> -	kord	%k3, %k4, %k6
> -	kortestd %k5, %k6
> -	jnz	L(4x_vec_end)
> -
> -	addq	$(VEC_SIZE * 4), %rdi
> -
> +	/* It would be possible to save some instructions
> +	   using 4x VPCMP but bottleneck on port 5 makes it not woth
> +	   it.  */

Fit comments to 72 columns.

> +	VPCMP	$4, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k1
> +	/* xor will set bytes match esi to zero.  */
> +	vpxorq	(VEC_SIZE * 5)(%rdi), %YMMMATCH, %YMM2
> +	vpxorq	(VEC_SIZE * 6)(%rdi), %YMMMATCH, %YMM3
> +	VPCMP	$0, (VEC_SIZE * 7)(%rdi), %YMMMATCH, %k3
> +	/* Reduce VEC2 / VEC3 with min and VEC1 with zero
> +	   mask.  */

Fit comments to 72 columns.

> +	VPMINU	%YMM2, %YMM3, %YMM3 {%k1} {z}
> +	VPCMP	$0, %YMM3, %YZERO, %k2
>  # ifdef USE_AS_RAWMEMCHR
> -	jmp	L(loop_4x_vec)
> +	subq	$-(VEC_SIZE * 4), %rdi
> +	kortestd %k2, %k3
> +	jz	L(loop_4x_vec)
>  # else
> -	subq	$(VEC_SIZE * 4), %rdx
> -	ja	L(loop_4x_vec)
> +	kortestd %k2, %k3
> +	jnz	L(loop_4x_vec_end)
>  
> -L(last_4x_vec_or_less):
> -	/* Less than 4 * VEC and aligned to VEC_SIZE.  */
> -	addl	$(VEC_SIZE * 2), %edx
> -	jle	L(last_2x_vec)
> +	subq	$-(VEC_SIZE * 4), %rdi
>  
> -	VPCMP	$0, (%rdi), %YMMMATCH, %k1
> -	kmovd	%k1, %eax
> -	testl	%eax, %eax
> -	jnz	L(first_vec_x0)
> +	subq	$(CHAR_PER_VEC * 4), %rdx
> +	ja	L(loop_4x_vec)
>  
> -	VPCMP	$0, VEC_SIZE(%rdi), %YMMMATCH, %k1
> -	kmovd	%k1, %eax
> +	/* Fall through into less than 4 remaining
> +	   vectors of length case.  */

Fit comments to 72 columns.

> +	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
> +	kmovd	%k0, %eax
> +	addq	$(VEC_SIZE * 3), %rdi
> +	.p2align 4
> +L(last_4x_vec_or_less):
> +	/* Check if first VEC contained match.  */
>  	testl	%eax, %eax
> -	jnz	L(first_vec_x1)
> +	jnz	L(first_vec_x1_check)
>  
> -	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
> -	kmovd	%k1, %eax
> -	testl	%eax, %eax
> +	/* If remaining length > CHAR_PER_VEC * 2.  */
> +	addl	$(CHAR_PER_VEC * 2), %edx
> +	jg	L(last_4x_vec)
>  
> -	jnz	L(first_vec_x2_check)
> -	subl	$VEC_SIZE, %edx
> -	jle	L(zero)
> +L(last_2x_vec):
> +	/* If remaining length < CHAR_PER_VEC.  */
> +	addl	$CHAR_PER_VEC, %edx
> +	jle	L(zero_end)
> +
> +	/* Check VEC2 and compare any match with
> +	   remaining length.  */

Fit comments to 72 columns.

> +	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
> +	kmovd	%k0, %eax
> +	tzcntl	%eax, %eax
> +	cmpl	%eax, %edx
> +	jbe	L(set_zero_end)
> +	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> +L(zero_end):
> +	ret
>  
> -	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
> -	kmovd	%k1, %eax
> -	testl	%eax, %eax
>  
> -	jnz	L(first_vec_x3_check)
> +	.p2align 4
> +L(first_vec_x1_check):
> +	tzcntl	%eax, %eax
> +	/* Adjust length.  */
> +	subl	$-(CHAR_PER_VEC * 4), %edx
> +	/* Check if match within remaining length.  */
> +	cmpl	%eax, %edx
> +	jbe	L(set_zero_end)
> +	/* NB: Multiply bytes by CHAR_SIZE to get the
> +	   wchar_t count.  */

Fit comments to 72 columns.

> +	leaq	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
> +	ret
> +L(set_zero_end):
>  	xorl	%eax, %eax
>  	ret
>  
>  	.p2align 4
> -L(last_2x_vec):
> -	addl	$(VEC_SIZE * 2), %edx
> -	VPCMP	$0, (%rdi), %YMMMATCH, %k1
> +L(loop_4x_vec_end):
> +# endif
> +	/* rawmemchr will fall through into this if match
> +	   was found in loop.  */

Fit comments to 72 columns.

> +
> +	/* k1 has not of matches with VEC1.  */
>  	kmovd	%k1, %eax
> -	testl	%eax, %eax
> +# ifdef USE_AS_WMEMCHR
> +	subl	$((1 << CHAR_PER_VEC) - 1), %eax
> +# else
> +	incl	%eax
> +# endif
> +	jnz	L(last_vec_x1_return)
>  
> -	jnz	L(first_vec_x0_check)
> -	subl	$VEC_SIZE, %edx
> -	jle	L(zero)
> +	VPCMP	$0, %YMM2, %YZERO, %k0
> +	kmovd	%k0, %eax
> +	testl	%eax, %eax
> +	jnz	L(last_vec_x2_return)
>  
> -	VPCMP	$0, VEC_SIZE(%rdi), %YMMMATCH, %k1
> -	kmovd	%k1, %eax
> +	kmovd	%k2, %eax
>  	testl	%eax, %eax
> -	jnz	L(first_vec_x1_check)
> -	xorl	%eax, %eax
> -	ret
> +	jnz	L(last_vec_x3_return)
>  
> -	.p2align 4
> -L(first_vec_x0_check):
> +	kmovd	%k3, %eax
>  	tzcntl	%eax, %eax
> -# ifdef USE_AS_WMEMCHR
> -	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
> -	sall	$2, %eax
> +# ifdef USE_AS_RAWMEMCHR
> +	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
> +# else
> +	leaq	(VEC_SIZE * 7)(%rdi, %rax, CHAR_SIZE), %rax
>  # endif
> -	/* Check the end of data.  */
> -	cmpq	%rax, %rdx
> -	jbe	L(zero)
> -	addq	%rdi, %rax
>  	ret
>  
>  	.p2align 4
> -L(first_vec_x1_check):
> +L(last_vec_x1_return):
>  	tzcntl	%eax, %eax
> -# ifdef USE_AS_WMEMCHR
> -	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
> -	sall	$2, %eax
> -# endif
> -	/* Check the end of data.  */
> -	cmpq	%rax, %rdx
> -	jbe	L(zero)
> -	addq	$VEC_SIZE, %rax
> +# ifdef USE_AS_RAWMEMCHR
> +#  ifdef USE_AS_WMEMCHR
> +	/* NB: Multiply bytes by CHAR_SIZE to get the
> +	   wchar_t count.  */

Fit comments to 72 columns.

> +	leaq	(%rdi, %rax, CHAR_SIZE), %rax
> +#  else
>  	addq	%rdi, %rax
> -	ret
> -
> -	.p2align 4
> -L(first_vec_x2_check):
> -	tzcntl	%eax, %eax
> -# ifdef USE_AS_WMEMCHR
> -	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
> -	sall	$2, %eax
> +#  endif
> +# else
> +	/* NB: Multiply bytes by CHAR_SIZE to get the
> +	   wchar_t count.  */

Fit comments to 72 columns.

> +	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
>  # endif
> -	/* Check the end of data.  */
> -	cmpq	%rax, %rdx
> -	jbe	L(zero)
> -	addq	$(VEC_SIZE * 2), %rax
> -	addq	%rdi, %rax
>  	ret
>  
>  	.p2align 4
> -L(first_vec_x3_check):
> +L(last_vec_x2_return):
>  	tzcntl	%eax, %eax
> -# ifdef USE_AS_WMEMCHR
> -	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
> -	sall	$2, %eax
> +# ifdef USE_AS_RAWMEMCHR
> +	/* NB: Multiply bytes by CHAR_SIZE to get the
> +	   wchar_t count.  */
> +	leaq	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
> +# else
> +	/* NB: Multiply bytes by CHAR_SIZE to get the
> +	   wchar_t count.  */
> +	leaq	(VEC_SIZE * 5)(%rdi, %rax, CHAR_SIZE), %rax
>  # endif
> -	/* Check the end of data.  */
> -	cmpq	%rax, %rdx
> -	jbe	L(zero)
> -	addq	$(VEC_SIZE * 3), %rax
> -	addq	%rdi, %rax
>  	ret
>  
>  	.p2align 4
> -L(zero):
> -	xorl	%eax, %eax
> -	ret
> -# endif
> -
> -	.p2align 4
> -L(first_vec_x0):
> +L(last_vec_x3_return):
>  	tzcntl	%eax, %eax
> -# ifdef USE_AS_WMEMCHR
> -	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
> -	leaq	(%rdi, %rax, 4), %rax
> +# ifdef USE_AS_RAWMEMCHR
> +	/* NB: Multiply bytes by CHAR_SIZE to get the
> +	   wchar_t count.  */

Fit comments to 72 columns.

> +	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
>  # else
> -	addq	%rdi, %rax
> +	/* NB: Multiply bytes by CHAR_SIZE to get the
> +	   wchar_t count.  */

Fit comments to 72 columns.

> +	leaq	(VEC_SIZE * 6)(%rdi, %rax, CHAR_SIZE), %rax
>  # endif
>  	ret
>  
> +
> +# ifndef USE_AS_RAWMEMCHR
> +L(last_4x_vec_or_less_cmpeq):
> +	VPCMP	$0, (VEC_SIZE * 5)(%rdi), %YMMMATCH, %k0
> +	kmovd	%k0, %eax
> +	subq	$-(VEC_SIZE * 4), %rdi
> +	/* Check first VEC regardless.  */
> +	testl	%eax, %eax
> +	jnz	L(first_vec_x1_check)
> +
> +	/* If remaining length <= CHAR_PER_VEC * 2.  */
> +	addl	$(CHAR_PER_VEC * 2), %edx
> +	jle	L(last_2x_vec)
> +
>  	.p2align 4
> -L(first_vec_x1):
> +L(last_4x_vec):
> +	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
> +	kmovd	%k0, %eax
> +	testl	%eax, %eax
> +	jnz	L(last_vec_x2)
> +
> +
> +	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
> +	kmovd	%k0, %eax
> +	/* Create mask for possible matches within
> +	   remaining length.  */

Fit comments to 72 columns.

> +#  ifdef USE_AS_WMEMCHR
> +	movl	$((1 << (CHAR_PER_VEC * 2)) - 1), %ecx
> +	bzhil	%edx, %ecx, %ecx
> +#  else
> +	movq	$-1, %rcx
> +	bzhiq	%rdx, %rcx, %rcx
> +#  endif
> +	/* Test matches in data against length match.  */
> +	andl	%ecx, %eax
> +	jnz	L(last_vec_x3)
> +
> +	/* if remaining length <= CHAR_PER_VEC * 3 (Note
> +	   this is after remaining length was found to be >
> +	   CHAR_PER_VEC * 2.  */

Fit comments to 72 columns.

> +	subl	$CHAR_PER_VEC, %edx
> +	jbe	L(zero_end2)
> +
> +
> +	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
> +	kmovd	%k0, %eax
> +	/* Shift remaining length mask for last VEC.  */
> +#  ifdef USE_AS_WMEMCHR
> +	shrl	$CHAR_PER_VEC, %ecx
> +#  else
> +	shrq	$CHAR_PER_VEC, %rcx
> +#  endif
> +	andl	%ecx, %eax
> +	jz	L(zero_end2)
>  	tzcntl	%eax, %eax
> -# ifdef USE_AS_WMEMCHR
> -	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
> -	leaq	VEC_SIZE(%rdi, %rax, 4), %rax
> -# else
> -	addq	$VEC_SIZE, %rax
> -	addq	%rdi, %rax
> -# endif
> +	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
> +L(zero_end2):
>  	ret
>  
> -	.p2align 4
> -L(first_vec_x2):
> +L(last_vec_x2):
>  	tzcntl	%eax, %eax
> -# ifdef USE_AS_WMEMCHR
> -	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
> -	leaq	(VEC_SIZE * 2)(%rdi, %rax, 4), %rax
> -# else
> -	addq	$(VEC_SIZE * 2), %rax
> -	addq	%rdi, %rax
> -# endif
> +	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
>  	ret
>  
>  	.p2align 4
> -L(4x_vec_end):
> -	kmovd	%k1, %eax
> -	testl	%eax, %eax
> -	jnz	L(first_vec_x0)
> -	kmovd	%k2, %eax
> -	testl	%eax, %eax
> -	jnz	L(first_vec_x1)
> -	kmovd	%k3, %eax
> -	testl	%eax, %eax
> -	jnz	L(first_vec_x2)
> -	kmovd	%k4, %eax
> -	testl	%eax, %eax
> -L(first_vec_x3):
> +L(last_vec_x3):
>  	tzcntl	%eax, %eax
> -# ifdef USE_AS_WMEMCHR
> -	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
> -	leaq	(VEC_SIZE * 3)(%rdi, %rax, 4), %rax
> -# else
> -	addq	$(VEC_SIZE * 3), %rax
> -	addq	%rdi, %rax
> -# endif
> +	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
>  	ret
> +# endif
>  
> -END (MEMCHR)
> +END(MEMCHR)

No need for this change.

>  #endif
> -- 
> 2.29.2
> 

Thanks.

H.J.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v1 1/3] Bench: Expand bench-memchr.c
  2021-05-03 17:17 ` [PATCH v1 1/3] Bench: Expand bench-memchr.c H.J. Lu
@ 2021-05-03 19:51   ` Noah Goldstein
  2021-05-03 20:59     ` H.J. Lu
  0 siblings, 1 reply; 20+ messages in thread
From: Noah Goldstein @ 2021-05-03 19:51 UTC (permalink / raw)
  To: H.J. Lu; +Cc: GNU C Library, Carlos O'Donell

On Mon, May 3, 2021 at 1:18 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Mon, May 3, 2021 at 1:45 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > No bug. This commit adds some additional cases for bench-memchr.c
> > including testing medium sizes and testing short length with both an
> > inbound match and out of bound match.
> >
> > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
> > ---
> >  benchtests/bench-memchr.c | 13 +++++++++++++
> >  1 file changed, 13 insertions(+)
> >
> > diff --git a/benchtests/bench-memchr.c b/benchtests/bench-memchr.c
> > index f5ced9d80d..5573f93312 100644
> > --- a/benchtests/bench-memchr.c
> > +++ b/benchtests/bench-memchr.c
> > @@ -135,12 +135,25 @@ test_main (void)
> >        do_test (i, i, 256, 0);
> >  #endif
> >      }
> > +  for (i = 1; i < 8; ++i)
> > +    {
> > +      do_test (i, i << 5, 192, 23);
> > +      do_test (i, i << 5, 192, 0);
> > +      do_test (i, i << 5, 256, 23);
> > +      do_test (i, i << 5, 256, 0);
> > +      do_test (i, i << 5, 512, 23);
> > +      do_test (i, i << 5, 512, 0);
> > +    }
> >    for (i = 1; i < 32; ++i)
> >      {
> >        do_test (0, i, i + 1, 23);
> >        do_test (0, i, i + 1, 0);
> >        do_test (i, i, i + 1, 23);
> >        do_test (i, i, i + 1, 0);
> > +      do_test (0, i, i - 1, 23);
> > +      do_test (0, i, i - 1, 0);
> > +      do_test (i, i, i - 1, 23);
> > +      do_test (i, i, i - 1, 0);
> >  #ifdef USE_AS_MEMRCHR
> >        /* Also test the position close to the beginning for memrchr.  */
> >        do_test (0, 1, i + 1, 23);
> > --
> > 2.29.2
> >
>
> LGTM.  I will check it in for you.

Thanks!

>
> BTW, can you apply for an account on sourceware.org:
>
> https://sourceware.org/
>
> so that you can push your commits directly?  You can put me down
> as your sponsor.

Done. Are there any wikis / manuals on how to properly use write access?
All I'm finding are resources on how to obtain it.

>
> Thanks.
>
> --
> H.J.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v1 2/3] x86: Optimize memchr-avx2.S
  2021-05-03 18:50   ` H.J. Lu
@ 2021-05-03 20:06     ` Noah Goldstein
  0 siblings, 0 replies; 20+ messages in thread
From: Noah Goldstein @ 2021-05-03 20:06 UTC (permalink / raw)
  To: H.J. Lu; +Cc: GNU C Library, Carlos O'Donell

On Mon, May 3, 2021 at 2:50 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Mon, May 03, 2021 at 04:44:36AM -0400, Noah Goldstein wrote:
> > No bug. This commit optimizes memchr-avx2.S. The optimizations include
> > replacing some branches with cmovcc, avoiding some branches entirely
> > in the less_4x_vec case, making the page cross logic less strict,
> > asaving a few instructions the in loop return loop. test-memchr,
> > test-rawmemchr, and test-wmemchr are all passing.
> >
> > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
> > ---
> >  sysdeps/x86_64/multiarch/memchr-avx2.S | 446 +++++++++++++++----------
> >  1 file changed, 262 insertions(+), 184 deletions(-)
> >
> > diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
> > index 1fcb1c350f..8368fcd1e1 100644
> > --- a/sysdeps/x86_64/multiarch/memchr-avx2.S
> > +++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
> > @@ -26,8 +26,22 @@
> >
> >  # ifdef USE_AS_WMEMCHR
> >  #  define VPCMPEQ    vpcmpeqd
> > +#  define VPBROADCAST        vpbroadcastd
> > +#  define CHAR_SIZE  4
> >  # else
> >  #  define VPCMPEQ    vpcmpeqb
> > +#  define VPBROADCAST        vpbroadcastb
> > +#  define CHAR_SIZE  1
> > +# endif
> > +
> > +# ifdef USE_AS_RAWMEMCHR
> > +#  define ERAW_PTR_REG       ecx
> > +#  define RRAW_PTR_REG       rcx
> > +#  define ALGN_PTR_REG       rdi
> > +# else
> > +#  define ERAW_PTR_REG       edi
> > +#  define RRAW_PTR_REG       rdi
> > +#  define ALGN_PTR_REG       rcx
> >  # endif
> >
> >  # ifndef VZEROUPPER
> > @@ -39,303 +53,367 @@
> >  # endif
> >
> >  # define VEC_SIZE 32
> > +# define PAGE_SIZE 4096
> > +
>
> Remove the extra line here.

Done.

>
> >
> >       .section SECTION(.text),"ax",@progbits
> > -ENTRY (MEMCHR)
> > +ENTRY(MEMCHR)
>
> No need for this change.

Fixed.

>
> >  # ifndef USE_AS_RAWMEMCHR
> >       /* Check for zero length.  */
> >       test    %RDX_LP, %RDX_LP
> >       jz      L(null)
> >  # endif
> > -     movl    %edi, %ecx
> > -     /* Broadcast CHAR to YMM0.  */
> > -     vmovd   %esi, %xmm0
> >  # ifdef USE_AS_WMEMCHR
> >       shl     $2, %RDX_LP
> > -     vpbroadcastd %xmm0, %ymm0
> >  # else
> >  #  ifdef __ILP32__
> >       /* Clear the upper 32 bits.  */
> >       movl    %edx, %edx
> >  #  endif
> > -     vpbroadcastb %xmm0, %ymm0
> >  # endif
> > -     /* Check if we may cross page boundary with one vector load.  */
> > -     andl    $(2 * VEC_SIZE - 1), %ecx
> > -     cmpl    $VEC_SIZE, %ecx
> > -     ja      L(cros_page_boundary)
> > +     /* Broadcast CHAR to YMMMATCH.  */
> > +     vmovd   %esi, %xmm0
> > +     VPBROADCAST %xmm0, %ymm0
> > +     /* Check if we may cross page boundary with one
> > +        vector load.  */
> > +     movl    %edi, %eax
> > +     andl    $(PAGE_SIZE - 1), %eax
> > +     cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
> > +     ja      L(cross_page_boundary)
> >
> >       /* Check the first VEC_SIZE bytes.  */
> > -     VPCMPEQ (%rdi), %ymm0, %ymm1
> > +     VPCMPEQ (%rdi), %ymm0, %ymm1
> >       vpmovmskb %ymm1, %eax
> > -     testl   %eax, %eax
> > -
> >  # ifndef USE_AS_RAWMEMCHR
> > -     jnz     L(first_vec_x0_check)
> > -     /* Adjust length and check the end of data.  */
> > -     subq    $VEC_SIZE, %rdx
> > -     jbe     L(zero)
> > -# else
> > -     jnz     L(first_vec_x0)
> > +     /* If length < CHAR_PER_VEC handle special.  */
> > +     cmpq    $VEC_SIZE, %rdx
> > +     jbe     L(first_vec_x0)
> >  # endif
> > -
> > -     /* Align data for aligned loads in the loop.  */
> > -     addq    $VEC_SIZE, %rdi
> > -     andl    $(VEC_SIZE - 1), %ecx
> > -     andq    $-VEC_SIZE, %rdi
> > +     testl   %eax, %eax
> > +     jz      L(aligned_more)
> > +     tzcntl  %eax, %eax
> > +     addq    %rdi, %rax
> > +     VZEROUPPER_RETURN
> >
> >  # ifndef USE_AS_RAWMEMCHR
> > -     /* Adjust length.  */
> > -     addq    %rcx, %rdx
> > -
> > -     subq    $(VEC_SIZE * 4), %rdx
> > -     jbe     L(last_4x_vec_or_less)
> > +     .p2align 5
> > +L(first_vec_x0):
> > +     /* Check if first match was before length.  */
> > +     tzcntl  %eax, %eax
> > +     xorl    %ecx, %ecx
> > +     cmpl    %eax, %edx
> > +     leaq    (%rdi, %rax), %rax
> > +     cmovle  %rcx, %rax
> > +     VZEROUPPER_RETURN
> > +L(null):
> > +     xorl    %eax, %eax
> > +     ret
> >  # endif
> > -     jmp     L(more_4x_vec)
> > -
> >       .p2align 4
> > -L(cros_page_boundary):
> > -     andl    $(VEC_SIZE - 1), %ecx
> > -     andq    $-VEC_SIZE, %rdi
> > -     VPCMPEQ (%rdi), %ymm0, %ymm1
> > +L(cross_page_boundary):
> > +     /* Save pointer before aligning as its original
> > +        value is necessary for computer return address if byte is
> > +        found or adjusting length if it is not and this is
>
> Fit comments to 72 columns.

Fixed. Still working out the kinks in my formatter.
For the 72-column fill, does a tab count as 1, 4, or 8 units?

>
> > +        memchr.  */
> > +     movq    %rdi, %rcx
> > +     /* Align data to VEC_SIZE - 1. ALGN_PTR_REG is
> > +        rcx for memchr and rdi for rawmemchr.  */
> > +     orq     $(VEC_SIZE - 1), %ALGN_PTR_REG
> > +     VPCMPEQ -(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1
> >       vpmovmskb %ymm1, %eax
> > +# ifndef USE_AS_RAWMEMCHR
> > +     /* Calculate length until end of page (length
> > +        checked for a match).  */
> > +     leaq    1(%ALGN_PTR_REG), %rsi
> > +     subq    %RRAW_PTR_REG, %rsi
> > +# endif
> >       /* Remove the leading bytes.  */
> > -     sarl    %cl, %eax
> > -     testl   %eax, %eax
> > -     jz      L(aligned_more)
> > -     tzcntl  %eax, %eax
> > +     sarxl   %ERAW_PTR_REG, %eax, %eax
> >  # ifndef USE_AS_RAWMEMCHR
> >       /* Check the end of data.  */
> > -     cmpq    %rax, %rdx
> > -     jbe     L(zero)
> > +     cmpq    %rsi, %rdx
> > +     jbe     L(first_vec_x0)
> >  # endif
> > -     addq    %rdi, %rax
> > -     addq    %rcx, %rax
> > +     testl   %eax, %eax
> > +     jz      L(cross_page_continue)
> > +     tzcntl  %eax, %eax
> > +     addq    %RRAW_PTR_REG, %rax
> >  L(return_vzeroupper):
> >       ZERO_UPPER_VEC_REGISTERS_RETURN
> >
> >       .p2align 4
> > -L(aligned_more):
> > -# ifndef USE_AS_RAWMEMCHR
> > -        /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)"
> > -        instead of "(rdx + rcx) - VEC_SIZE" to void possible addition
> > -        overflow.  */
> > -     negq    %rcx
> > -     addq    $VEC_SIZE, %rcx
> > +L(first_vec_x1):
> > +     tzcntl  %eax, %eax
> > +     incq    %rdi
> > +     addq    %rdi, %rax
> > +     VZEROUPPER_RETURN
> >
> > -     /* Check the end of data.  */
> > -     subq    %rcx, %rdx
> > -     jbe     L(zero)
> > -# endif
> > +     .p2align 4
> > +L(first_vec_x2):
> > +     tzcntl  %eax, %eax
> > +     addq    $(VEC_SIZE + 1), %rdi
> > +     addq    %rdi, %rax
> > +     VZEROUPPER_RETURN
> >
> > -     addq    $VEC_SIZE, %rdi
> > +     .p2align 4
> > +L(first_vec_x3):
> > +     tzcntl  %eax, %eax
> > +     addq    $(VEC_SIZE * 2 + 1), %rdi
> > +     addq    %rdi, %rax
> > +     VZEROUPPER_RETURN
> >
> > -# ifndef USE_AS_RAWMEMCHR
> > -     subq    $(VEC_SIZE * 4), %rdx
> > -     jbe     L(last_4x_vec_or_less)
> > -# endif
> >
> > -L(more_4x_vec):
> > -     /* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
> > -        since data is only aligned to VEC_SIZE.  */
> > -     VPCMPEQ (%rdi), %ymm0, %ymm1
> > -     vpmovmskb %ymm1, %eax
> > -     testl   %eax, %eax
> > -     jnz     L(first_vec_x0)
> > +     .p2align 4
> > +L(first_vec_x4):
> > +     tzcntl  %eax, %eax
> > +     addq    $(VEC_SIZE * 3 + 1), %rdi
> > +     addq    %rdi, %rax
> > +     VZEROUPPER_RETURN
> >
> > -     VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
> > +     .p2align 4
> > +L(aligned_more):
> > +     /* Check the first 4 * VEC_SIZE.  Only one
> > +        VEC_SIZE at a time since data is only aligned to
> > +        VEC_SIZE.  */
>
> Fit comments to 72 columns.

Adjusted closer. Hopefully fixed.
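
For anyone following the assembly, here is a rough C sketch of the
"align via OR" idiom used just below (orq $(VEC_SIZE - 1), %rdi
followed by loads at a +1 displacement). The helper name and types are
illustrative assumptions, not code from the patch:

    #include <stdint.h>

    #define VEC_SIZE 32

    /* Minimal sketch: OR-ing in the low bits moves p to the last byte
       of the VEC_SIZE block containing it, so p + 1 is the next
       VEC_SIZE-aligned address.  The loop can then keep a constant +1
       displacement on every load instead of re-aligning the pointer.  */
    static inline const unsigned char *
    align_to_last_byte_of_vec (const unsigned char *p)
    {
      return (const unsigned char *) ((uintptr_t) p | (VEC_SIZE - 1));
    }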

>
> > +
> > +# ifndef USE_AS_RAWMEMCHR
> > +L(cross_page_continue):
> > +     /* Align data to VEC_SIZE - 1.  */
> > +     xorl    %ecx, %ecx
> > +     subl    %edi, %ecx
> > +     orq     $(VEC_SIZE - 1), %rdi
> > +     /* esi is for adjusting length to see if near the
> > +        end.  */
>
> Fit comments to 72 columns.

Adjusted closer. Hopefully fixed.

>
> > +     leal    (VEC_SIZE * 4 + 1)(%rdi, %rcx), %esi
> > +# else
> > +     orq     $(VEC_SIZE - 1), %rdi
> > +L(cross_page_continue):
> > +# endif
> > +     /* Load first VEC regardless.  */
> > +     VPCMPEQ 1(%rdi), %ymm0, %ymm1
> >       vpmovmskb %ymm1, %eax
> > +# ifndef USE_AS_RAWMEMCHR
> > +     /* Adjust length. If near end handle specially.
> > +      */
>
> Put the comments on one line.

Fixed.

>
> > +     subq    %rsi, %rdx
> > +     jbe     L(last_4x_vec_or_less)
> > +# endif
> >       testl   %eax, %eax
> >       jnz     L(first_vec_x1)
> >
> > -     VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
> > +     VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> >       vpmovmskb %ymm1, %eax
> >       testl   %eax, %eax
> >       jnz     L(first_vec_x2)
> >
> > -     VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
> > +     VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
> >       vpmovmskb %ymm1, %eax
> >       testl   %eax, %eax
> >       jnz     L(first_vec_x3)
> >
> > -     addq    $(VEC_SIZE * 4), %rdi
> > +     VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
> > +     vpmovmskb %ymm1, %eax
> > +     testl   %eax, %eax
> > +     jnz     L(first_vec_x4)
> >
> >  # ifndef USE_AS_RAWMEMCHR
> > +     /* Check if at last VEC_SIZE * 4 length.  */
> >       subq    $(VEC_SIZE * 4), %rdx
> > -     jbe     L(last_4x_vec_or_less)
> > -# endif
> > -
> > -     /* Align data to 4 * VEC_SIZE.  */
> > -     movq    %rdi, %rcx
> > -     andl    $(4 * VEC_SIZE - 1), %ecx
> > -     andq    $-(4 * VEC_SIZE), %rdi
> > -
> > -# ifndef USE_AS_RAWMEMCHR
> > -     /* Adjust length.  */
> > +     jbe     L(last_4x_vec_or_less_cmpeq)
> > +     /* Align data to VEC_SIZE * 4 - 1 for the  loop
> > +        and readjust length.  */
> > +     incq    %rdi
> > +     movl    %edi, %ecx
> > +     orq     $(VEC_SIZE * 4 - 1), %rdi
> > +     andl    $(VEC_SIZE * 4 - 1), %ecx
> >       addq    %rcx, %rdx
> > +# else
> > +     /* Align data to VEC_SIZE * 4 - 1 for loop.  */
> > +     incq    %rdi
> > +     orq     $(VEC_SIZE * 4 - 1), %rdi
> >  # endif
> >
> > +     /* Compare 4 * VEC at a time forward.  */
> >       .p2align 4
> >  L(loop_4x_vec):
> > -     /* Compare 4 * VEC at a time forward.  */
> > -     VPCMPEQ (%rdi), %ymm0, %ymm1
> > -     VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm2
> > -     VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm3
> > -     VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm4
> > -
> > +     VPCMPEQ 1(%rdi), %ymm0, %ymm1
> > +     VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm2
> > +     VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm3
> > +     VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm4
> >       vpor    %ymm1, %ymm2, %ymm5
> >       vpor    %ymm3, %ymm4, %ymm6
> >       vpor    %ymm5, %ymm6, %ymm5
> >
> > -     vpmovmskb %ymm5, %eax
> > -     testl   %eax, %eax
> > -     jnz     L(4x_vec_end)
> > -
> > -     addq    $(VEC_SIZE * 4), %rdi
> > -
> > +     vpmovmskb %ymm5, %ecx
> >  # ifdef USE_AS_RAWMEMCHR
> > -     jmp     L(loop_4x_vec)
> > +     subq    $-(VEC_SIZE * 4), %rdi
> > +     testl   %ecx, %ecx
> > +     jz      L(loop_4x_vec)
> >  # else
> > -     subq    $(VEC_SIZE * 4), %rdx
> > -     ja      L(loop_4x_vec)
> > +     testl   %ecx, %ecx
> > +     jnz     L(loop_4x_vec_end)
> >
> > -L(last_4x_vec_or_less):
> > -     /* Less than 4 * VEC and aligned to VEC_SIZE.  */
> > -     addl    $(VEC_SIZE * 2), %edx
> > -     jle     L(last_2x_vec)
> > +     subq    $-(VEC_SIZE * 4), %rdi
> >
> > -     VPCMPEQ (%rdi), %ymm0, %ymm1
> > -     vpmovmskb %ymm1, %eax
> > -     testl   %eax, %eax
> > -     jnz     L(first_vec_x0)
> > +     subq    $(VEC_SIZE * 4), %rdx
> > +     ja      L(loop_4x_vec)
> >
> > -     VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
> > +     /* Fall through into less than 4 remaining
> > +        vectors of length case.  */
>
> Fit comments to 72 columns.

Adjusted closer. Hopefully fixed.

>
> > +     VPCMPEQ (VEC_SIZE * 0 + 1)(%rdi), %ymm0, %ymm1
> >       vpmovmskb %ymm1, %eax
> > +     .p2align 4
> > +L(last_4x_vec_or_less):
> > +     /* Check if first VEC contained match.  */
> >       testl   %eax, %eax
> > -     jnz     L(first_vec_x1)
> > +     jnz     L(first_vec_x1_check)
> >
> > -     VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
> > -     vpmovmskb %ymm1, %eax
> > -     testl   %eax, %eax
> > +     /* If remaining length > VEC_SIZE * 2.  */
> > +     addl    $(VEC_SIZE * 2), %edx
> > +     jg      L(last_4x_vec)
> >
> > -     jnz     L(first_vec_x2_check)
> > -     subl    $VEC_SIZE, %edx
> > -     jle     L(zero)
> > +L(last_2x_vec):
> > +     /* If remaining length < VEC_SIZE.  */
> > +     addl    $VEC_SIZE, %edx
> > +     jle     L(zero_end)
> >
> > -     VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
> > +     /* Check VEC2 and compare any match with
> > +        remaining length.  */
>
> Fit comments to 72 columns.

Adjusted closer. Hopefully fixed.
>
> > +     VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> >       vpmovmskb %ymm1, %eax
> > -     testl   %eax, %eax
> > -
> > -     jnz     L(first_vec_x3_check)
> > -     xorl    %eax, %eax
> > +     tzcntl  %eax, %eax
> > +     cmpl    %eax, %edx
> > +     jbe     L(set_zero_end)
> > +     addq    $(VEC_SIZE + 1), %rdi
> > +     addq    %rdi, %rax
> > +L(zero_end):
> >       VZEROUPPER_RETURN
> >
> >       .p2align 4
> > -L(last_2x_vec):
> > -     addl    $(VEC_SIZE * 2), %edx
> > -     VPCMPEQ (%rdi), %ymm0, %ymm1
> > +L(loop_4x_vec_end):
> > +# endif
> > +     /* rawmemchr will fall through into this if match
> > +        was found in loop.  */
>
> Fit comments to 72 columns.

Adjusted closer. Hopefully fixed.

>
> > +
> >       vpmovmskb %ymm1, %eax
> >       testl   %eax, %eax
> > +     jnz     L(last_vec_x1_return)
> >
> > -     jnz     L(first_vec_x0_check)
> > -     subl    $VEC_SIZE, %edx
> > -     jle     L(zero)
> > -
> > -     VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
> > -     vpmovmskb %ymm1, %eax
> > +     vpmovmskb %ymm2, %eax
> >       testl   %eax, %eax
> > -     jnz     L(first_vec_x1_check)
> > -     xorl    %eax, %eax
> > -     VZEROUPPER_RETURN
> > +     jnz     L(last_vec_x2_return)
> >
> > -     .p2align 4
> > -L(first_vec_x0_check):
> > -     tzcntl  %eax, %eax
> > -     /* Check the end of data.  */
> > -     cmpq    %rax, %rdx
> > -     jbe     L(zero)
> > +     vpmovmskb %ymm3, %eax
> > +     /* Combine VEC3 matches (eax) with VEC4 matches
> > +        (ecx).  */
>
> Fit comments to 72 columns.

Adjusted closer. Hopefully fixed.
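
In case the 64-bit trick below is not obvious, a hedged C equivalent
(function and parameter names are mine, not from the patch): the two
32-bit vpmovmskb results are packed into one 64-bit word so a single
tzcnt locates the first match across both vectors.

    #include <stdint.h>

    /* Sketch only: mask_lo holds the VEC3 matches, mask_hi the VEC4
       matches (one bit per byte).  Shifting the high mask up by 32 and
       OR-ing gives one word whose lowest set bit is the first match in
       either vector.  */
    static inline int
    first_match_across_two_vecs (uint32_t mask_lo, uint32_t mask_hi)
    {
      uint64_t combined = ((uint64_t) mask_hi << 32) | mask_lo;
      if (combined == 0)
        return -1;                        /* No match in either vector.  */
      return (int) __builtin_ctzll (combined);
    }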

>
> > +     salq    $32, %rcx
> > +     orq     %rcx, %rax
> > +     tzcntq  %rax, %rax
> > +# ifdef USE_AS_RAWMEMCHR
> > +     subq    $(VEC_SIZE * 2 - 1), %rdi
> > +# else
> > +     subq    $-(VEC_SIZE * 2 + 1), %rdi
> > +# endif
> >       addq    %rdi, %rax
> >       VZEROUPPER_RETURN
> > +# ifndef USE_AS_RAWMEMCHR
> >
> >       .p2align 4
> >  L(first_vec_x1_check):
> >       tzcntl  %eax, %eax
> > -     /* Check the end of data.  */
> > -     cmpq    %rax, %rdx
> > -     jbe     L(zero)
> > -     addq    $VEC_SIZE, %rax
> > +     /* Adjust length.  */
> > +     subl    $-(VEC_SIZE * 4), %edx
> > +     /* Check if match within remaining length.  */
> > +     cmpl    %eax, %edx
> > +     jbe     L(set_zero_end)
> > +     incq    %rdi
> >       addq    %rdi, %rax
> >       VZEROUPPER_RETURN
> > +     .p2align 4
> > +L(set_zero_end):
> > +     xorl    %eax, %eax
> > +     VZEROUPPER_RETURN
> > +# endif
> >
> >       .p2align 4
> > -L(first_vec_x2_check):
> > +L(last_vec_x1_return):
> >       tzcntl  %eax, %eax
> > -     /* Check the end of data.  */
> > -     cmpq    %rax, %rdx
> > -     jbe     L(zero)
> > -     addq    $(VEC_SIZE * 2), %rax
> > +# ifdef USE_AS_RAWMEMCHR
> > +     subq    $(VEC_SIZE * 4 - 1), %rdi
> > +# else
> > +     incq    %rdi
> > +# endif
> >       addq    %rdi, %rax
> >       VZEROUPPER_RETURN
> >
> >       .p2align 4
> > -L(first_vec_x3_check):
> > +L(last_vec_x2_return):
> >       tzcntl  %eax, %eax
> > -     /* Check the end of data.  */
> > -     cmpq    %rax, %rdx
> > -     jbe     L(zero)
> > -     addq    $(VEC_SIZE * 3), %rax
> > +# ifdef USE_AS_RAWMEMCHR
> > +     subq    $(VEC_SIZE * 3 - 1), %rdi
> > +# else
> > +     subq    $-(VEC_SIZE + 1), %rdi
> > +# endif
> >       addq    %rdi, %rax
> >       VZEROUPPER_RETURN
> >
> > +# ifndef USE_AS_RAWMEMCHR
> >       .p2align 4
> > -L(zero):
> > -     xorl    %eax, %eax
> > -     jmp     L(return_vzeroupper)
> > +L(last_4x_vec_or_less_cmpeq):
> > +     VPCMPEQ (VEC_SIZE * 4 + 1)(%rdi), %ymm0, %ymm1
> > +     vpmovmskb %ymm1, %eax
> > +     subq    $-(VEC_SIZE * 4), %rdi
> > +     /* Check first VEC regardless.  */
> > +     testl   %eax, %eax
> > +     jnz     L(first_vec_x1_check)
> >
> > +     /* If remaining length <= CHAR_PER_VEC * 2.  */
> > +     addl    $(VEC_SIZE * 2), %edx
> > +     jle     L(last_2x_vec)
> >       .p2align 4
> > -L(null):
> > -     xorl    %eax, %eax
> > -     ret
> > -# endif
> > +L(last_4x_vec):
> > +     VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> > +     vpmovmskb %ymm1, %eax
> > +     testl   %eax, %eax
> > +     jnz     L(last_vec_x2_return)
> >
> > -     .p2align 4
> > -L(first_vec_x0):
> > -     tzcntl  %eax, %eax
> > -     addq    %rdi, %rax
> > -     VZEROUPPER_RETURN
> > +     VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
> > +     vpmovmskb %ymm1, %eax
> >
> > -     .p2align 4
> > -L(first_vec_x1):
> > -     tzcntl  %eax, %eax
> > -     addq    $VEC_SIZE, %rax
> > -     addq    %rdi, %rax
> > -     VZEROUPPER_RETURN
> > +     /* Create mask for possible matches within
> > +        remaining length.  */
>
> Fit comments to 72 columns.

Adjusted closer. Hopefully fixed.
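
As a rough C model of what the movq $-1 / bzhiq pair below computes
(names are assumptions, not from the patch): bzhi keeps only the low
rdx bits of -1, giving a mask of byte positions still within the
remaining length.

    #include <stdint.h>

    /* Sketch: discard match bits that fall beyond the remaining length.
       For len >= 64 bzhi passes the source through unchanged, i.e. the
       mask is all ones.  */
    static inline uint64_t
    matches_within_length (uint64_t match_bits, unsigned int len)
    {
      uint64_t keep = len >= 64 ? ~0ULL : (((uint64_t) 1 << len) - 1);
      return match_bits & keep;
    }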

>
> > +     movq    $-1, %rcx
> > +     bzhiq   %rdx, %rcx, %rcx
> >
> > -     .p2align 4
> > -L(first_vec_x2):
> > +     /* Test matches in data against length match.  */
> > +     andl    %ecx, %eax
> > +     jnz     L(last_vec_x3)
> > +
> > +     /* If remaining length <= VEC_SIZE * 3 (Note this
> > +        is after remaining length was found to be > VEC_SIZE * 2).
>
> Fit comments to 72 columns.

Adjusted closer. Hopefully fixed.

>
> > +      */
> > +     subl    $VEC_SIZE, %edx
> > +     jbe     L(zero_end2)
> > +
> > +     VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
> > +     vpmovmskb %ymm1, %eax
> > +     /* Shift remaining length mask for last VEC.  */
> > +     shrq    $32, %rcx
> > +     andl    %ecx, %eax
> > +     jz      L(zero_end2)
> >       tzcntl  %eax, %eax
> > -     addq    $(VEC_SIZE * 2), %rax
> > +     addq    $(VEC_SIZE * 3 + 1), %rdi
> >       addq    %rdi, %rax
> > +L(zero_end2):
> >       VZEROUPPER_RETURN
> >
> >       .p2align 4
> > -L(4x_vec_end):
> > -     vpmovmskb %ymm1, %eax
> > -     testl   %eax, %eax
> > -     jnz     L(first_vec_x0)
> > -     vpmovmskb %ymm2, %eax
> > -     testl   %eax, %eax
> > -     jnz     L(first_vec_x1)
> > -     vpmovmskb %ymm3, %eax
> > -     testl   %eax, %eax
> > -     jnz     L(first_vec_x2)
> > -     vpmovmskb %ymm4, %eax
> > -     testl   %eax, %eax
> > -L(first_vec_x3):
> > +L(last_vec_x3):
> >       tzcntl  %eax, %eax
> > -     addq    $(VEC_SIZE * 3), %rax
> > +     subq    $-(VEC_SIZE * 2 + 1), %rdi
> >       addq    %rdi, %rax
> >       VZEROUPPER_RETURN
> > +# endif
> >
> > -END (MEMCHR)
> > +END(MEMCHR)
>
> No need for this change.

Fixed.

>
> >  #endif
> > --
> > 2.29.2
> >
>
> Thanks.
>
> H.J.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v1 3/3] x86: Optimize memchr-evex.S
  2021-05-03 18:58   ` H.J. Lu
@ 2021-05-03 20:06     ` Noah Goldstein
  0 siblings, 0 replies; 20+ messages in thread
From: Noah Goldstein @ 2021-05-03 20:06 UTC (permalink / raw)
  To: H.J. Lu; +Cc: GNU C Library, Carlos O'Donell

On Mon, May 3, 2021 at 2:58 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Mon, May 03, 2021 at 04:44:38AM -0400, Noah Goldstein wrote:
> > No bug. This commit optimizes memchr-evex.S. The optimizations include
> > replacing some branches with cmovcc, avoiding some branches entirely
> > in the less_4x_vec case, making the page cross logic less strict,
> > saving some ALU in the alignment process, and most importantly
> > increasing ILP in the 4x loop. test-memchr, test-rawmemchr, and
> > test-wmemchr are all passing.
> >
> > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
> > ---
> > Tests were run on the following CPUs:
> >
> > Tigerlake: https://ark.intel.com/content/www/us/en/ark/products/208921/intel-core-i7-1165g7-processor-12m-cache-up-to-4-70-ghz-with-ipu.html
> >
> > Icelake: https://ark.intel.com/content/www/us/en/ark/products/196597/intel-core-i7-1065g7-processor-8m-cache-up-to-3-90-ghz.html
> >
> > Skylake: https://ark.intel.com/content/www/us/en/ark/products/149091/intel-core-i7-8565u-processor-8m-cache-up-to-4-60-ghz.html
> >
> > All times are the geometric mean of N=20. The unit of time is
> > seconds.
> >
> > "Cur" refers to the current implementation
> > "New" refers to this patches implementation
> >
> > Note: The numbers for size = [1, 32] are highly dependent on function
> > alignment. That being said, the new implementation, which uses cmovcc
> > instead of a branch for the [1, 32] case (mostly because of the high
> > variance across alignments), is far more consistent and performs about
> > as well. It should only be a bigger improvement in cases where the
> > sizes / positions are not 100% predictable.
> >
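
To illustrate the cmovcc point above, here is a minimal C sketch of
the branchless short-size path, in the spirit of the L(first_vec_x0)
code in the avx2 patch earlier in this thread. The names are
illustrative and not taken from the patch; compilers typically lower
the conditional below to a cmov at -O2, which is what keeps the timing
stable across alignments.

    #include <stddef.h>

    /* Sketch: compute the would-be return value unconditionally, then
       select NULL when the first match lies at or past the requested
       length, instead of branching on the comparison.  */
    static inline const char *
    select_match_or_null (const char *base, unsigned int match_pos,
                          unsigned int len)
    {
      const char *hit = base + match_pos;
      return match_pos < len ? hit : NULL;
    }
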
> > For memchr-evex the numbers are a near universal improvement. The one
> > case where the current implementation is better is size = 0, and for
> > size = [1, 32] with pos < size the two implementations are about the
> > same. For size = [1, 32] with pos > size, for medium range sizes, and
> > for large sizes, however, the new implementation is faster.
> >
> > Results For Tigerlake memchr-evex
> > size  , algn  , Pos   , Cur T , New T , Win   , Dif
> > 2048  , 0     , , 32    5.58  , 5.22  , New   , 0.36
> > 256   , 1     , , 64    5.22  , 4.93  , New   , 0.29
> > 2048  , 0     , , 64    5.22  , 4.89  , New   , 0.33
> > 256   , 2     , , 64    5.14  , 4.81  , New   , 0.33
> > 2048  , 0     , , 128   6.3   , 5.67  , New   , 0.63
> > 256   , 3     , , 64    5.22  , 4.9   , New   , 0.32
> > 2048  , 0     , , 256   11.07 , 10.92 , New   , 0.15
> > 256   , 4     , , 64    5.16  , 4.86  , New   , 0.3
> > 2048  , 0     , , 512   15.66 , 14.81 , New   , 0.85
> > 256   , 5     , , 64    5.15  , 4.84  , New   , 0.31
> > 2048  , 0     , , 1024  25.7  , 23.02 , New   , 2.68
> > 256   , 6     , , 64    5.12  , 4.89  , New   , 0.23
> > 2048  , 0     , , 2048  42.34 , 37.71 , New   , 4.63
> > 256   , 7     , , 64    5.03  , 4.62  , New   , 0.41
> > 192   , 1     , , 32    4.96  , 4.28  , New   , 0.68
> > 256   , 1     , , 32    4.95  , 4.28  , New   , 0.67
> > 512   , 1     , , 32    4.94  , 4.29  , New   , 0.65
> > 192   , 2     , , 64    5.1   , 4.8   , New   , 0.3
> > 512   , 2     , , 64    5.12  , 4.72  , New   , 0.4
> > 192   , 3     , , 96    5.54  , 5.12  , New   , 0.42
> > 256   , 3     , , 96    5.52  , 5.15  , New   , 0.37
> > 512   , 3     , , 96    5.51  , 5.16  , New   , 0.35
> > 192   , 4     , , 128   6.1   , 5.53  , New   , 0.57
> > 256   , 4     , , 128   6.09  , 5.49  , New   , 0.6
> > 512   , 4     , , 128   6.08  , 5.48  , New   , 0.6
> > 192   , 5     , , 160   7.42  , 6.71  , New   , 0.71
> > 256   , 5     , , 160   6.86  , 6.71  , New   , 0.15
> > 512   , 5     , , 160   9.28  , 8.68  , New   , 0.6
> > 192   , 6     , , 192   7.94  , 7.47  , New   , 0.47
> > 256   , 6     , , 192   7.62  , 7.17  , New   , 0.45
> > 512   , 6     , , 192   9.2   , 9.16  , New   , 0.04
> > 192   , 7     , , 224   8.02  , 7.43  , New   , 0.59
> > 256   , 7     , , 224   8.34  , 7.85  , New   , 0.49
> > 512   , 7     , , 224   9.89  , 9.16  , New   , 0.73
> > 2     , 0     , , 1     3.0   , 3.0   , Eq    , 0.0
> > 2     , 1     , , 1     3.0   , 3.0   , Eq    , 0.0
> > 0     , 0     , , 1     3.01  , 3.6   , Cur   , 0.59
> > 0     , 1     , , 1     3.01  , 3.6   , Cur   , 0.59
> > 3     , 0     , , 2     3.0   , 3.0   , Eq    , 0.0
> > 3     , 2     , , 2     3.0   , 3.0   , Eq    , 0.0
> > 1     , 0     , , 2     3.6   , 3.0   , New   , 0.6
> > 1     , 2     , , 2     3.6   , 3.0   , New   , 0.6
> > 4     , 0     , , 3     3.01  , 3.01  , Eq    , 0.0
> > 4     , 3     , , 3     3.01  , 3.01  , Eq    , 0.0
> > 2     , 0     , , 3     3.62  , 3.02  , New   , 0.6
> > 2     , 3     , , 3     3.62  , 3.03  , New   , 0.59
> > 5     , 0     , , 4     3.02  , 3.03  , Cur   , 0.01
> > 5     , 4     , , 4     3.02  , 3.02  , Eq    , 0.0
> > 3     , 0     , , 4     3.63  , 3.02  , New   , 0.61
> > 3     , 4     , , 4     3.63  , 3.04  , New   , 0.59
> > 6     , 0     , , 5     3.05  , 3.04  , New   , 0.01
> > 6     , 5     , , 5     3.02  , 3.02  , Eq    , 0.0
> > 4     , 0     , , 5     3.63  , 3.02  , New   , 0.61
> > 4     , 5     , , 5     3.64  , 3.03  , New   , 0.61
> > 7     , 0     , , 6     3.03  , 3.03  , Eq    , 0.0
> > 7     , 6     , , 6     3.02  , 3.02  , Eq    , 0.0
> > 5     , 0     , , 6     3.64  , 3.01  , New   , 0.63
> > 5     , 6     , , 6     3.64  , 3.03  , New   , 0.61
> > 8     , 0     , , 7     3.03  , 3.04  , Cur   , 0.01
> > 8     , 7     , , 7     3.04  , 3.04  , Eq    , 0.0
> > 6     , 0     , , 7     3.67  , 3.04  , New   , 0.63
> > 6     , 7     , , 7     3.65  , 3.05  , New   , 0.6
> > 9     , 0     , , 8     3.05  , 3.05  , Eq    , 0.0
> > 7     , 0     , , 8     3.67  , 3.05  , New   , 0.62
> > 10    , 0     , , 9     3.06  , 3.06  , Eq    , 0.0
> > 10    , 1     , , 9     3.06  , 3.06  , Eq    , 0.0
> > 8     , 0     , , 9     3.67  , 3.06  , New   , 0.61
> > 8     , 1     , , 9     3.67  , 3.06  , New   , 0.61
> > 11    , 0     , , 10    3.06  , 3.06  , Eq    , 0.0
> > 11    , 2     , , 10    3.07  , 3.06  , New   , 0.01
> > 9     , 0     , , 10    3.67  , 3.05  , New   , 0.62
> > 9     , 2     , , 10    3.67  , 3.06  , New   , 0.61
> > 12    , 0     , , 11    3.06  , 3.06  , Eq    , 0.0
> > 12    , 3     , , 11    3.06  , 3.06  , Eq    , 0.0
> > 10    , 0     , , 11    3.67  , 3.06  , New   , 0.61
> > 10    , 3     , , 11    3.67  , 3.06  , New   , 0.61
> > 13    , 0     , , 12    3.06  , 3.07  , Cur   , 0.01
> > 13    , 4     , , 12    3.06  , 3.07  , Cur   , 0.01
> > 11    , 0     , , 12    3.67  , 3.11  , New   , 0.56
> > 11    , 4     , , 12    3.68  , 3.12  , New   , 0.56
> > 14    , 0     , , 13    3.07  , 3.1   , Cur   , 0.03
> > 14    , 5     , , 13    3.06  , 3.07  , Cur   , 0.01
> > 12    , 0     , , 13    3.67  , 3.07  , New   , 0.6
> > 12    , 5     , , 13    3.67  , 3.08  , New   , 0.59
> > 15    , 0     , , 14    3.06  , 3.06  , Eq    , 0.0
> > 15    , 6     , , 14    3.07  , 3.06  , New   , 0.01
> > 13    , 0     , , 14    3.67  , 3.06  , New   , 0.61
> > 13    , 6     , , 14    3.68  , 3.06  , New   , 0.62
> > 16    , 0     , , 15    3.06  , 3.06  , Eq    , 0.0
> > 16    , 7     , , 15    3.06  , 3.05  , New   , 0.01
> > 14    , 0     , , 15    3.68  , 3.06  , New   , 0.62
> > 14    , 7     , , 15    3.67  , 3.06  , New   , 0.61
> > 17    , 0     , , 16    3.07  , 3.06  , New   , 0.01
> > 15    , 0     , , 16    3.68  , 3.06  , New   , 0.62
> > 18    , 0     , , 17    3.06  , 3.06  , Eq    , 0.0
> > 18    , 1     , , 17    3.06  , 3.06  , Eq    , 0.0
> > 16    , 0     , , 17    3.67  , 3.06  , New   , 0.61
> > 16    , 1     , , 17    3.67  , 3.05  , New   , 0.62
> > 19    , 0     , , 18    3.07  , 3.06  , New   , 0.01
> > 19    , 2     , , 18    3.06  , 3.06  , Eq    , 0.0
> > 17    , 0     , , 18    3.68  , 3.08  , New   , 0.6
> > 17    , 2     , , 18    3.68  , 3.06  , New   , 0.62
> > 20    , 0     , , 19    3.06  , 3.06  , Eq    , 0.0
> > 20    , 3     , , 19    3.06  , 3.06  , Eq    , 0.0
> > 18    , 0     , , 19    3.68  , 3.06  , New   , 0.62
> > 18    , 3     , , 19    3.68  , 3.06  , New   , 0.62
> > 21    , 0     , , 20    3.06  , 3.06  , Eq    , 0.0
> > 21    , 4     , , 20    3.06  , 3.06  , Eq    , 0.0
> > 19    , 0     , , 20    3.67  , 3.06  , New   , 0.61
> > 19    , 4     , , 20    3.67  , 3.06  , New   , 0.61
> > 22    , 0     , , 21    3.06  , 3.06  , Eq    , 0.0
> > 22    , 5     , , 21    3.06  , 3.06  , Eq    , 0.0
> > 20    , 0     , , 21    3.67  , 3.05  , New   , 0.62
> > 20    , 5     , , 21    3.68  , 3.06  , New   , 0.62
> > 23    , 0     , , 22    3.07  , 3.06  , New   , 0.01
> > 23    , 6     , , 22    3.06  , 3.06  , Eq    , 0.0
> > 21    , 0     , , 22    3.68  , 3.07  , New   , 0.61
> > 21    , 6     , , 22    3.67  , 3.06  , New   , 0.61
> > 24    , 0     , , 23    3.19  , 3.06  , New   , 0.13
> > 24    , 7     , , 23    3.08  , 3.06  , New   , 0.02
> > 22    , 0     , , 23    3.69  , 3.06  , New   , 0.63
> > 22    , 7     , , 23    3.68  , 3.06  , New   , 0.62
> > 25    , 0     , , 24    3.07  , 3.06  , New   , 0.01
> > 23    , 0     , , 24    3.68  , 3.06  , New   , 0.62
> > 26    , 0     , , 25    3.06  , 3.05  , New   , 0.01
> > 26    , 1     , , 25    3.07  , 3.06  , New   , 0.01
> > 24    , 0     , , 25    3.67  , 3.05  , New   , 0.62
> > 24    , 1     , , 25    3.68  , 3.06  , New   , 0.62
> > 27    , 0     , , 26    3.12  , 3.06  , New   , 0.06
> > 27    , 2     , , 26    3.08  , 3.06  , New   , 0.02
> > 25    , 0     , , 26    3.69  , 3.06  , New   , 0.63
> > 25    , 2     , , 26    3.67  , 3.06  , New   , 0.61
> > 28    , 0     , , 27    3.06  , 3.06  , Eq    , 0.0
> > 28    , 3     , , 27    3.06  , 3.06  , Eq    , 0.0
> > 26    , 0     , , 27    3.67  , 3.06  , New   , 0.61
> > 26    , 3     , , 27    3.67  , 3.06  , New   , 0.61
> > 29    , 0     , , 28    3.06  , 3.06  , Eq    , 0.0
> > 29    , 4     , , 28    3.06  , 3.06  , Eq    , 0.0
> > 27    , 0     , , 28    3.68  , 3.05  , New   , 0.63
> > 27    , 4     , , 28    3.67  , 3.06  , New   , 0.61
> > 30    , 0     , , 29    3.06  , 3.06  , Eq    , 0.0
> > 30    , 5     , , 29    3.06  , 3.06  , Eq    , 0.0
> > 28    , 0     , , 29    3.67  , 3.06  , New   , 0.61
> > 28    , 5     , , 29    3.68  , 3.06  , New   , 0.62
> > 31    , 0     , , 30    3.06  , 3.06  , Eq    , 0.0
> > 31    , 6     , , 30    3.06  , 3.06  , Eq    , 0.0
> > 29    , 0     , , 30    3.68  , 3.06  , New   , 0.62
> > 29    , 6     , , 30    3.7   , 3.06  , New   , 0.64
> > 32    , 0     , , 31    3.17  , 3.06  , New   , 0.11
> > 32    , 7     , , 31    3.12  , 3.06  , New   , 0.06
> > 30    , 0     , , 31    3.68  , 3.06  , New   , 0.62
> > 30    , 7     , , 31    3.68  , 3.06  , New   , 0.62
> >
> > Results For Icelake memchr-evex
> > size  , algn  , Pos   , Cur T , New T , Win   , Dif
> > 2048  , 0     , , 32    4.94  , 4.26  , New   , 0.68
> > 256   , 1     , , 64    4.5   , 4.13  , New   , 0.37
> > 2048  , 0     , , 64    4.19  , 3.9   , New   , 0.29
> > 256   , 2     , , 64    4.19  , 3.87  , New   , 0.32
> > 2048  , 0     , , 128   4.96  , 4.53  , New   , 0.43
> > 256   , 3     , , 64    4.07  , 3.86  , New   , 0.21
> > 2048  , 0     , , 256   8.77  , 8.61  , New   , 0.16
> > 256   , 4     , , 64    4.08  , 3.87  , New   , 0.21
> > 2048  , 0     , , 512   12.22 , 11.67 , New   , 0.55
> > 256   , 5     , , 64    4.12  , 3.83  , New   , 0.29
> > 2048  , 0     , , 1024  20.06 , 18.09 , New   , 1.97
> > 256   , 6     , , 64    4.2   , 3.95  , New   , 0.25
> > 2048  , 0     , , 2048  33.83 , 30.62 , New   , 3.21
> > 256   , 7     , , 64    4.3   , 4.04  , New   , 0.26
> > 192   , 1     , , 32    4.2   , 3.71  , New   , 0.49
> > 256   , 1     , , 32    4.24  , 3.76  , New   , 0.48
> > 512   , 1     , , 32    4.29  , 3.74  , New   , 0.55
> > 192   , 2     , , 64    4.42  , 4.0   , New   , 0.42
> > 512   , 2     , , 64    4.17  , 3.83  , New   , 0.34
> > 192   , 3     , , 96    4.44  , 4.26  , New   , 0.18
> > 256   , 3     , , 96    4.45  , 4.14  , New   , 0.31
> > 512   , 3     , , 96    4.42  , 4.15  , New   , 0.27
> > 192   , 4     , , 128   4.93  , 4.45  , New   , 0.48
> > 256   , 4     , , 128   4.93  , 4.47  , New   , 0.46
> > 512   , 4     , , 128   4.95  , 4.47  , New   , 0.48
> > 192   , 5     , , 160   5.95  , 5.44  , New   , 0.51
> > 256   , 5     , , 160   5.59  , 5.47  , New   , 0.12
> > 512   , 5     , , 160   7.59  , 7.34  , New   , 0.25
> > 192   , 6     , , 192   6.53  , 6.08  , New   , 0.45
> > 256   , 6     , , 192   6.2   , 5.88  , New   , 0.32
> > 512   , 6     , , 192   7.53  , 7.62  , Cur   , 0.09
> > 192   , 7     , , 224   6.62  , 6.12  , New   , 0.5
> > 256   , 7     , , 224   6.79  , 6.51  , New   , 0.28
> > 512   , 7     , , 224   8.12  , 7.61  , New   , 0.51
> > 2     , 0     , , 1     2.5   , 2.54  , Cur   , 0.04
> > 2     , 1     , , 1     2.56  , 2.55  , New   , 0.01
> > 0     , 0     , , 1     2.57  , 3.12  , Cur   , 0.55
> > 0     , 1     , , 1     2.59  , 3.14  , Cur   , 0.55
> > 3     , 0     , , 2     2.62  , 2.63  , Cur   , 0.01
> > 3     , 2     , , 2     2.66  , 2.67  , Cur   , 0.01
> > 1     , 0     , , 2     3.24  , 2.72  , New   , 0.52
> > 1     , 2     , , 2     3.28  , 2.75  , New   , 0.53
> > 4     , 0     , , 3     2.78  , 2.8   , Cur   , 0.02
> > 4     , 3     , , 3     2.8   , 2.82  , Cur   , 0.02
> > 2     , 0     , , 3     3.38  , 2.86  , New   , 0.52
> > 2     , 3     , , 3     3.41  , 2.89  , New   , 0.52
> > 5     , 0     , , 4     2.88  , 2.91  , Cur   , 0.03
> > 5     , 4     , , 4     2.88  , 2.92  , Cur   , 0.04
> > 3     , 0     , , 4     3.48  , 2.93  , New   , 0.55
> > 3     , 4     , , 4     3.47  , 2.93  , New   , 0.54
> > 6     , 0     , , 5     2.95  , 2.94  , New   , 0.01
> > 6     , 5     , , 5     2.91  , 2.92  , Cur   , 0.01
> > 4     , 0     , , 5     3.47  , 2.9   , New   , 0.57
> > 4     , 5     , , 5     3.43  , 2.91  , New   , 0.52
> > 7     , 0     , , 6     2.87  , 2.9   , Cur   , 0.03
> > 7     , 6     , , 6     2.87  , 2.89  , Cur   , 0.02
> > 5     , 0     , , 6     3.44  , 2.88  , New   , 0.56
> > 5     , 6     , , 6     3.41  , 2.87  , New   , 0.54
> > 8     , 0     , , 7     2.86  , 2.87  , Cur   , 0.01
> > 8     , 7     , , 7     2.86  , 2.87  , Cur   , 0.01
> > 6     , 0     , , 7     3.43  , 2.87  , New   , 0.56
> > 6     , 7     , , 7     3.44  , 2.87  , New   , 0.57
> > 9     , 0     , , 8     2.86  , 2.88  , Cur   , 0.02
> > 7     , 0     , , 8     3.41  , 2.89  , New   , 0.52
> > 10    , 0     , , 9     2.83  , 2.87  , Cur   , 0.04
> > 10    , 1     , , 9     2.82  , 2.87  , Cur   , 0.05
> > 8     , 0     , , 9     3.4   , 2.89  , New   , 0.51
> > 8     , 1     , , 9     3.41  , 2.87  , New   , 0.54
> > 11    , 0     , , 10    2.83  , 2.88  , Cur   , 0.05
> > 11    , 2     , , 10    2.84  , 2.88  , Cur   , 0.04
> > 9     , 0     , , 10    3.41  , 2.87  , New   , 0.54
> > 9     , 2     , , 10    3.41  , 2.88  , New   , 0.53
> > 12    , 0     , , 11    2.83  , 2.89  , Cur   , 0.06
> > 12    , 3     , , 11    2.85  , 2.87  , Cur   , 0.02
> > 10    , 0     , , 11    3.41  , 2.87  , New   , 0.54
> > 10    , 3     , , 11    3.42  , 2.88  , New   , 0.54
> > 13    , 0     , , 12    2.86  , 2.87  , Cur   , 0.01
> > 13    , 4     , , 12    2.84  , 2.88  , Cur   , 0.04
> > 11    , 0     , , 12    3.43  , 2.87  , New   , 0.56
> > 11    , 4     , , 12    3.49  , 2.87  , New   , 0.62
> > 14    , 0     , , 13    2.85  , 2.86  , Cur   , 0.01
> > 14    , 5     , , 13    2.85  , 2.86  , Cur   , 0.01
> > 12    , 0     , , 13    3.41  , 2.86  , New   , 0.55
> > 12    , 5     , , 13    3.44  , 2.85  , New   , 0.59
> > 15    , 0     , , 14    2.83  , 2.87  , Cur   , 0.04
> > 15    , 6     , , 14    2.82  , 2.86  , Cur   , 0.04
> > 13    , 0     , , 14    3.41  , 2.86  , New   , 0.55
> > 13    , 6     , , 14    3.4   , 2.86  , New   , 0.54
> > 16    , 0     , , 15    2.84  , 2.86  , Cur   , 0.02
> > 16    , 7     , , 15    2.83  , 2.85  , Cur   , 0.02
> > 14    , 0     , , 15    3.41  , 2.85  , New   , 0.56
> > 14    , 7     , , 15    3.39  , 2.87  , New   , 0.52
> > 17    , 0     , , 16    2.83  , 2.87  , Cur   , 0.04
> > 15    , 0     , , 16    3.4   , 2.85  , New   , 0.55
> > 18    , 0     , , 17    2.83  , 2.86  , Cur   , 0.03
> > 18    , 1     , , 17    2.85  , 2.84  , New   , 0.01
> > 16    , 0     , , 17    3.41  , 2.85  , New   , 0.56
> > 16    , 1     , , 17    3.4   , 2.86  , New   , 0.54
> > 19    , 0     , , 18    2.8   , 2.84  , Cur   , 0.04
> > 19    , 2     , , 18    2.82  , 2.83  , Cur   , 0.01
> > 17    , 0     , , 18    3.39  , 2.86  , New   , 0.53
> > 17    , 2     , , 18    3.39  , 2.84  , New   , 0.55
> > 20    , 0     , , 19    2.85  , 2.87  , Cur   , 0.02
> > 20    , 3     , , 19    2.88  , 2.87  , New   , 0.01
> > 18    , 0     , , 19    3.38  , 2.85  , New   , 0.53
> > 18    , 3     , , 19    3.4   , 2.85  , New   , 0.55
> > 21    , 0     , , 20    2.83  , 2.85  , Cur   , 0.02
> > 21    , 4     , , 20    2.88  , 2.85  , New   , 0.03
> > 19    , 0     , , 20    3.39  , 2.84  , New   , 0.55
> > 19    , 4     , , 20    3.39  , 2.96  , New   , 0.43
> > 22    , 0     , , 21    2.84  , 2.9   , Cur   , 0.06
> > 22    , 5     , , 21    2.81  , 2.84  , Cur   , 0.03
> > 20    , 0     , , 21    3.41  , 2.81  , New   , 0.6
> > 20    , 5     , , 21    3.38  , 2.83  , New   , 0.55
> > 23    , 0     , , 22    2.8   , 2.82  , Cur   , 0.02
> > 23    , 6     , , 22    2.81  , 2.83  , Cur   , 0.02
> > 21    , 0     , , 22    3.35  , 2.81  , New   , 0.54
> > 21    , 6     , , 22    3.34  , 2.81  , New   , 0.53
> > 24    , 0     , , 23    2.77  , 2.84  , Cur   , 0.07
> > 24    , 7     , , 23    2.78  , 2.8   , Cur   , 0.02
> > 22    , 0     , , 23    3.34  , 2.79  , New   , 0.55
> > 22    , 7     , , 23    3.32  , 2.79  , New   , 0.53
> > 25    , 0     , , 24    2.77  , 2.8   , Cur   , 0.03
> > 23    , 0     , , 24    3.29  , 2.79  , New   , 0.5
> > 26    , 0     , , 25    2.73  , 2.78  , Cur   , 0.05
> > 26    , 1     , , 25    2.75  , 2.79  , Cur   , 0.04
> > 24    , 0     , , 25    3.27  , 2.79  , New   , 0.48
> > 24    , 1     , , 25    3.27  , 2.77  , New   , 0.5
> > 27    , 0     , , 26    2.72  , 2.78  , Cur   , 0.06
> > 27    , 2     , , 26    2.75  , 2.76  , Cur   , 0.01
> > 25    , 0     , , 26    3.29  , 2.73  , New   , 0.56
> > 25    , 2     , , 26    3.3   , 2.76  , New   , 0.54
> > 28    , 0     , , 27    2.75  , 2.79  , Cur   , 0.04
> > 28    , 3     , , 27    2.77  , 2.77  , Eq    , 0.0
> > 26    , 0     , , 27    3.28  , 2.78  , New   , 0.5
> > 26    , 3     , , 27    3.29  , 2.78  , New   , 0.51
> > 29    , 0     , , 28    2.74  , 2.76  , Cur   , 0.02
> > 29    , 4     , , 28    2.74  , 2.77  , Cur   , 0.03
> > 27    , 0     , , 28    3.3   , 2.76  , New   , 0.54
> > 27    , 4     , , 28    3.3   , 2.74  , New   , 0.56
> > 30    , 0     , , 29    2.72  , 2.76  , Cur   , 0.04
> > 30    , 5     , , 29    2.74  , 2.75  , Cur   , 0.01
> > 28    , 0     , , 29    3.25  , 2.73  , New   , 0.52
> > 28    , 5     , , 29    3.3   , 2.73  , New   , 0.57
> > 31    , 0     , , 30    2.73  , 2.77  , Cur   , 0.04
> > 31    , 6     , , 30    2.74  , 2.76  , Cur   , 0.02
> > 29    , 0     , , 30    3.25  , 2.73  , New   , 0.52
> > 29    , 6     , , 30    3.26  , 2.74  , New   , 0.52
> > 32    , 0     , , 31    2.73  , 2.74  , Cur   , 0.01
> > 32    , 7     , , 31    2.73  , 2.75  , Cur   , 0.02
> > 30    , 0     , , 31    3.24  , 2.72  , New   , 0.52
> > 30    , 7     , , 31    3.24  , 2.72  , New   , 0.52
> >
> > For memchr-avx2 the improvements are more modest though again near
> > universal. The improvement is most significant for medium sizes and
> > for small sizes with pos > size. For small sizes with pos < size and
> > for large sizes, the two implementations perform roughly the same.
> >
> > Results For Tigerlake memchr-avx2
> > size  , algn  , Pos   , Cur T , New T , Win   , Dif
> > 2048  , 0     , , 32    6.15  , 6.27  , Cur   , 0.12
> > 256   , 1     , , 64    6.21  , 6.03  , New   , 0.18
> > 2048  , 0     , , 64    6.07  , 5.95  , New   , 0.12
> > 256   , 2     , , 64    6.01  , 5.8   , New   , 0.21
> > 2048  , 0     , , 128   7.05  , 6.55  , New   , 0.5
> > 256   , 3     , , 64    6.14  , 5.83  , New   , 0.31
> > 2048  , 0     , , 256   11.78 , 11.78 , Eq    , 0.0
> > 256   , 4     , , 64    6.1   , 5.85  , New   , 0.25
> > 2048  , 0     , , 512   16.32 , 15.96 , New   , 0.36
> > 256   , 5     , , 64    6.1   , 5.77  , New   , 0.33
> > 2048  , 0     , , 1024  25.38 , 25.18 , New   , 0.2
> > 256   , 6     , , 64    6.08  , 5.88  , New   , 0.2
> > 2048  , 0     , , 2048  38.56 , 38.32 , New   , 0.24
> > 256   , 7     , , 64    5.93  , 5.68  , New   , 0.25
> > 192   , 1     , , 32    5.49  , 5.3   , New   , 0.19
> > 256   , 1     , , 32    5.5   , 5.28  , New   , 0.22
> > 512   , 1     , , 32    5.48  , 5.32  , New   , 0.16
> > 192   , 2     , , 64    6.1   , 5.73  , New   , 0.37
> > 512   , 2     , , 64    5.88  , 5.72  , New   , 0.16
> > 192   , 3     , , 96    6.31  , 5.93  , New   , 0.38
> > 256   , 3     , , 96    6.32  , 5.93  , New   , 0.39
> > 512   , 3     , , 96    6.2   , 5.94  , New   , 0.26
> > 192   , 4     , , 128   6.65  , 6.4   , New   , 0.25
> > 256   , 4     , , 128   6.6   , 6.37  , New   , 0.23
> > 512   , 4     , , 128   6.74  , 6.33  , New   , 0.41
> > 192   , 5     , , 160   7.78  , 7.4   , New   , 0.38
> > 256   , 5     , , 160   7.18  , 7.4   , Cur   , 0.22
> > 512   , 5     , , 160   9.81  , 9.44  , New   , 0.37
> > 192   , 6     , , 192   9.12  , 7.77  , New   , 1.35
> > 256   , 6     , , 192   7.97  , 7.66  , New   , 0.31
> > 512   , 6     , , 192   10.14 , 9.95  , New   , 0.19
> > 192   , 7     , , 224   8.96  , 7.78  , New   , 1.18
> > 256   , 7     , , 224   8.52  , 8.23  , New   , 0.29
> > 512   , 7     , , 224   10.33 , 9.98  , New   , 0.35
> > 2     , 0     , , 1     3.61  , 3.6   , New   , 0.01
> > 2     , 1     , , 1     3.6   , 3.6   , Eq    , 0.0
> > 0     , 0     , , 1     3.02  , 3.0   , New   , 0.02
> > 0     , 1     , , 1     3.0   , 3.0   , Eq    , 0.0
> > 3     , 0     , , 2     3.6   , 3.6   , Eq    , 0.0
> > 3     , 2     , , 2     3.61  , 3.6   , New   , 0.01
> > 1     , 0     , , 2     4.82  , 3.6   , New   , 1.22
> > 1     , 2     , , 2     4.81  , 3.6   , New   , 1.21
> > 4     , 0     , , 3     3.61  , 3.61  , Eq    , 0.0
> > 4     , 3     , , 3     3.62  , 3.61  , New   , 0.01
> > 2     , 0     , , 3     4.82  , 3.62  , New   , 1.2
> > 2     , 3     , , 3     4.83  , 3.63  , New   , 1.2
> > 5     , 0     , , 4     3.63  , 3.64  , Cur   , 0.01
> > 5     , 4     , , 4     3.63  , 3.62  , New   , 0.01
> > 3     , 0     , , 4     4.84  , 3.62  , New   , 1.22
> > 3     , 4     , , 4     4.84  , 3.64  , New   , 1.2
> > 6     , 0     , , 5     3.66  , 3.64  , New   , 0.02
> > 6     , 5     , , 5     3.65  , 3.62  , New   , 0.03
> > 4     , 0     , , 5     4.83  , 3.63  , New   , 1.2
> > 4     , 5     , , 5     4.85  , 3.64  , New   , 1.21
> > 7     , 0     , , 6     3.76  , 3.79  , Cur   , 0.03
> > 7     , 6     , , 6     3.76  , 3.72  , New   , 0.04
> > 5     , 0     , , 6     4.84  , 3.62  , New   , 1.22
> > 5     , 6     , , 6     4.85  , 3.64  , New   , 1.21
> > 8     , 0     , , 7     3.64  , 3.65  , Cur   , 0.01
> > 8     , 7     , , 7     3.65  , 3.65  , Eq    , 0.0
> > 6     , 0     , , 7     4.88  , 3.64  , New   , 1.24
> > 6     , 7     , , 7     4.87  , 3.65  , New   , 1.22
> > 9     , 0     , , 8     3.66  , 3.66  , Eq    , 0.0
> > 7     , 0     , , 8     4.89  , 3.66  , New   , 1.23
> > 10    , 0     , , 9     3.67  , 3.67  , Eq    , 0.0
> > 10    , 1     , , 9     3.67  , 3.67  , Eq    , 0.0
> > 8     , 0     , , 9     4.9   , 3.67  , New   , 1.23
> > 8     , 1     , , 9     4.9   , 3.67  , New   , 1.23
> > 11    , 0     , , 10    3.68  , 3.67  , New   , 0.01
> > 11    , 2     , , 10    3.69  , 3.67  , New   , 0.02
> > 9     , 0     , , 10    4.9   , 3.67  , New   , 1.23
> > 9     , 2     , , 10    4.9   , 3.67  , New   , 1.23
> > 12    , 0     , , 11    3.71  , 3.68  , New   , 0.03
> > 12    , 3     , , 11    3.71  , 3.67  , New   , 0.04
> > 10    , 0     , , 11    4.9   , 3.67  , New   , 1.23
> > 10    , 3     , , 11    4.9   , 3.67  , New   , 1.23
> > 13    , 0     , , 12    4.24  , 4.23  , New   , 0.01
> > 13    , 4     , , 12    4.23  , 4.23  , Eq    , 0.0
> > 11    , 0     , , 12    4.9   , 3.7   , New   , 1.2
> > 11    , 4     , , 12    4.9   , 3.73  , New   , 1.17
> > 14    , 0     , , 13    3.99  , 4.01  , Cur   , 0.02
> > 14    , 5     , , 13    3.98  , 3.98  , Eq    , 0.0
> > 12    , 0     , , 13    4.9   , 3.69  , New   , 1.21
> > 12    , 5     , , 13    4.9   , 3.69  , New   , 1.21
> > 15    , 0     , , 14    3.99  , 3.97  , New   , 0.02
> > 15    , 6     , , 14    4.0   , 4.0   , Eq    , 0.0
> > 13    , 0     , , 14    4.9   , 3.67  , New   , 1.23
> > 13    , 6     , , 14    4.9   , 3.67  , New   , 1.23
> > 16    , 0     , , 15    3.99  , 4.02  , Cur   , 0.03
> > 16    , 7     , , 15    4.01  , 3.96  , New   , 0.05
> > 14    , 0     , , 15    4.93  , 3.67  , New   , 1.26
> > 14    , 7     , , 15    4.92  , 3.67  , New   , 1.25
> > 17    , 0     , , 16    4.04  , 3.99  , New   , 0.05
> > 15    , 0     , , 16    5.42  , 4.22  , New   , 1.2
> > 18    , 0     , , 17    4.01  , 3.97  , New   , 0.04
> > 18    , 1     , , 17    3.99  , 3.98  , New   , 0.01
> > 16    , 0     , , 17    5.22  , 3.98  , New   , 1.24
> > 16    , 1     , , 17    5.19  , 3.98  , New   , 1.21
> > 19    , 0     , , 18    4.0   , 3.99  , New   , 0.01
> > 19    , 2     , , 18    4.03  , 3.97  , New   , 0.06
> > 17    , 0     , , 18    5.18  , 3.99  , New   , 1.19
> > 17    , 2     , , 18    5.18  , 3.98  , New   , 1.2
> > 20    , 0     , , 19    4.02  , 3.98  , New   , 0.04
> > 20    , 3     , , 19    4.0   , 3.98  , New   , 0.02
> > 18    , 0     , , 19    5.19  , 3.97  , New   , 1.22
> > 18    , 3     , , 19    5.21  , 3.98  , New   , 1.23
> > 21    , 0     , , 20    3.98  , 4.0   , Cur   , 0.02
> > 21    , 4     , , 20    4.0   , 4.0   , Eq    , 0.0
> > 19    , 0     , , 20    5.19  , 3.99  , New   , 1.2
> > 19    , 4     , , 20    5.17  , 3.99  , New   , 1.18
> > 22    , 0     , , 21    4.03  , 3.98  , New   , 0.05
> > 22    , 5     , , 21    4.01  , 3.95  , New   , 0.06
> > 20    , 0     , , 21    5.19  , 4.0   , New   , 1.19
> > 20    , 5     , , 21    5.21  , 3.99  , New   , 1.22
> > 23    , 0     , , 22    4.06  , 3.97  , New   , 0.09
> > 23    , 6     , , 22    4.02  , 3.98  , New   , 0.04
> > 21    , 0     , , 22    5.2   , 4.02  , New   , 1.18
> > 21    , 6     , , 22    5.22  , 4.0   , New   , 1.22
> > 24    , 0     , , 23    4.15  , 3.98  , New   , 0.17
> > 24    , 7     , , 23    4.0   , 4.01  , Cur   , 0.01
> > 22    , 0     , , 23    5.28  , 4.0   , New   , 1.28
> > 22    , 7     , , 23    5.22  , 3.99  , New   , 1.23
> > 25    , 0     , , 24    4.1   , 4.04  , New   , 0.06
> > 23    , 0     , , 24    5.23  , 4.04  , New   , 1.19
> > 26    , 0     , , 25    4.1   , 4.06  , New   , 0.04
> > 26    , 1     , , 25    4.07  , 3.99  , New   , 0.08
> > 24    , 0     , , 25    5.26  , 4.02  , New   , 1.24
> > 24    , 1     , , 25    5.21  , 4.0   , New   , 1.21
> > 27    , 0     , , 26    4.17  , 4.03  , New   , 0.14
> > 27    , 2     , , 26    4.09  , 4.03  , New   , 0.06
> > 25    , 0     , , 26    5.29  , 4.1   , New   , 1.19
> > 25    , 2     , , 26    5.25  , 4.0   , New   , 1.25
> > 28    , 0     , , 27    4.06  , 4.1   , Cur   , 0.04
> > 28    , 3     , , 27    4.09  , 4.04  , New   , 0.05
> > 26    , 0     , , 27    5.26  , 4.04  , New   , 1.22
> > 26    , 3     , , 27    5.28  , 4.01  , New   , 1.27
> > 29    , 0     , , 28    4.07  , 4.02  , New   , 0.05
> > 29    , 4     , , 28    4.07  , 4.05  , New   , 0.02
> > 27    , 0     , , 28    5.25  , 4.02  , New   , 1.23
> > 27    , 4     , , 28    5.25  , 4.03  , New   , 1.22
> > 30    , 0     , , 29    4.14  , 4.06  , New   , 0.08
> > 30    , 5     , , 29    4.08  , 4.04  , New   , 0.04
> > 28    , 0     , , 29    5.26  , 4.07  , New   , 1.19
> > 28    , 5     , , 29    5.28  , 4.04  , New   , 1.24
> > 31    , 0     , , 30    4.09  , 4.08  , New   , 0.01
> > 31    , 6     , , 30    4.1   , 4.08  , New   , 0.02
> > 29    , 0     , , 30    5.28  , 4.05  , New   , 1.23
> > 29    , 6     , , 30    5.24  , 4.07  , New   , 1.17
> > 32    , 0     , , 31    4.1   , 4.13  , Cur   , 0.03
> > 32    , 7     , , 31    4.16  , 4.09  , New   , 0.07
> > 30    , 0     , , 31    5.31  , 4.09  , New   , 1.22
> > 30    , 7     , , 31    5.28  , 4.08  , New   , 1.2
> >
> > Results For Icelake memchr-avx2
> > size  , algn  , Pos   , Cur T , New T , Win   , Dif
> > 2048  , 0     , , 32    5.74  , 5.08  , New   , 0.66
> > 256   , 1     , , 64    5.16  , 4.93  , New   , 0.23
> > 2048  , 0     , , 64    4.86  , 4.69  , New   , 0.17
> > 256   , 2     , , 64    4.78  , 4.7   , New   , 0.08
> > 2048  , 0     , , 128   5.64  , 5.0   , New   , 0.64
> > 256   , 3     , , 64    4.64  , 4.59  , New   , 0.05
> > 2048  , 0     , , 256   9.07  , 9.17  , Cur   , 0.1
> > 256   , 4     , , 64    4.7   , 4.6   , New   , 0.1
> > 2048  , 0     , , 512   12.56 , 12.33 , New   , 0.23
> > 256   , 5     , , 64    4.72  , 4.61  , New   , 0.11
> > 2048  , 0     , , 1024  19.36 , 19.49 , Cur   , 0.13
> > 256   , 6     , , 64    4.82  , 4.69  , New   , 0.13
> > 2048  , 0     , , 2048  29.99 , 30.53 , Cur   , 0.54
> > 256   , 7     , , 64    4.9   , 4.85  , New   , 0.05
> > 192   , 1     , , 32    4.89  , 4.45  , New   , 0.44
> > 256   , 1     , , 32    4.93  , 4.44  , New   , 0.49
> > 512   , 1     , , 32    4.97  , 4.45  , New   , 0.52
> > 192   , 2     , , 64    5.04  , 4.65  , New   , 0.39
> > 512   , 2     , , 64    4.75  , 4.66  , New   , 0.09
> > 192   , 3     , , 96    5.14  , 4.66  , New   , 0.48
> > 256   , 3     , , 96    5.12  , 4.66  , New   , 0.46
> > 512   , 3     , , 96    5.13  , 4.62  , New   , 0.51
> > 192   , 4     , , 128   5.65  , 4.95  , New   , 0.7
> > 256   , 4     , , 128   5.63  , 4.95  , New   , 0.68
> > 512   , 4     , , 128   5.68  , 4.96  , New   , 0.72
> > 192   , 5     , , 160   6.1   , 5.84  , New   , 0.26
> > 256   , 5     , , 160   5.58  , 5.84  , Cur   , 0.26
> > 512   , 5     , , 160   7.95  , 7.74  , New   , 0.21
> > 192   , 6     , , 192   7.07  , 6.23  , New   , 0.84
> > 256   , 6     , , 192   6.34  , 6.09  , New   , 0.25
> > 512   , 6     , , 192   8.17  , 8.13  , New   , 0.04
> > 192   , 7     , , 224   7.06  , 6.23  , New   , 0.83
> > 256   , 7     , , 224   6.76  , 6.65  , New   , 0.11
> > 512   , 7     , , 224   8.29  , 8.08  , New   , 0.21
> > 2     , 0     , , 1     3.0   , 3.04  , Cur   , 0.04
> > 2     , 1     , , 1     3.06  , 3.07  , Cur   , 0.01
> > 0     , 0     , , 1     2.57  , 2.59  , Cur   , 0.02
> > 0     , 1     , , 1     2.6   , 2.61  , Cur   , 0.01
> > 3     , 0     , , 2     3.15  , 3.17  , Cur   , 0.02
> > 3     , 2     , , 2     3.19  , 3.21  , Cur   , 0.02
> > 1     , 0     , , 2     4.32  , 3.25  , New   , 1.07
> > 1     , 2     , , 2     4.36  , 3.31  , New   , 1.05
> > 4     , 0     , , 3     3.5   , 3.52  , Cur   , 0.02
> > 4     , 3     , , 3     3.52  , 3.54  , Cur   , 0.02
> > 2     , 0     , , 3     4.51  , 3.43  , New   , 1.08
> > 2     , 3     , , 3     4.56  , 3.47  , New   , 1.09
> > 5     , 0     , , 4     3.61  , 3.65  , Cur   , 0.04
> > 5     , 4     , , 4     3.63  , 3.67  , Cur   , 0.04
> > 3     , 0     , , 4     4.64  , 3.51  , New   , 1.13
> > 3     , 4     , , 4     4.7   , 3.51  , New   , 1.19
> > 6     , 0     , , 5     3.66  , 3.68  , Cur   , 0.02
> > 6     , 5     , , 5     3.69  , 3.65  , New   , 0.04
> > 4     , 0     , , 5     4.7   , 3.49  , New   , 1.21
> > 4     , 5     , , 5     4.58  , 3.48  , New   , 1.1
> > 7     , 0     , , 6     3.6   , 3.65  , Cur   , 0.05
> > 7     , 6     , , 6     3.59  , 3.64  , Cur   , 0.05
> > 5     , 0     , , 6     4.74  , 3.65  , New   , 1.09
> > 5     , 6     , , 6     4.73  , 3.64  , New   , 1.09
> > 8     , 0     , , 7     3.6   , 3.61  , Cur   , 0.01
> > 8     , 7     , , 7     3.6   , 3.61  , Cur   , 0.01
> > 6     , 0     , , 7     4.73  , 3.6   , New   , 1.13
> > 6     , 7     , , 7     4.73  , 3.62  , New   , 1.11
> > 9     , 0     , , 8     3.59  , 3.62  , Cur   , 0.03
> > 7     , 0     , , 8     4.72  , 3.64  , New   , 1.08
> > 10    , 0     , , 9     3.57  , 3.62  , Cur   , 0.05
> > 10    , 1     , , 9     3.56  , 3.61  , Cur   , 0.05
> > 8     , 0     , , 9     4.69  , 3.63  , New   , 1.06
> > 8     , 1     , , 9     4.71  , 3.61  , New   , 1.1
> > 11    , 0     , , 10    3.58  , 3.62  , Cur   , 0.04
> > 11    , 2     , , 10    3.59  , 3.63  , Cur   , 0.04
> > 9     , 0     , , 10    4.72  , 3.61  , New   , 1.11
> > 9     , 2     , , 10    4.7   , 3.61  , New   , 1.09
> > 12    , 0     , , 11    3.58  , 3.63  , Cur   , 0.05
> > 12    , 3     , , 11    3.58  , 3.62  , Cur   , 0.04
> > 10    , 0     , , 11    4.7   , 3.6   , New   , 1.1
> > 10    , 3     , , 11    4.73  , 3.64  , New   , 1.09
> > 13    , 0     , , 12    3.6   , 3.6   , Eq    , 0.0
> > 13    , 4     , , 12    3.57  , 3.62  , Cur   , 0.05
> > 11    , 0     , , 12    4.73  , 3.62  , New   , 1.11
> > 11    , 4     , , 12    4.79  , 3.61  , New   , 1.18
> > 14    , 0     , , 13    3.61  , 3.62  , Cur   , 0.01
> > 14    , 5     , , 13    3.59  , 3.59  , Eq    , 0.0
> > 12    , 0     , , 13    4.7   , 3.61  , New   , 1.09
> > 12    , 5     , , 13    4.75  , 3.58  , New   , 1.17
> > 15    , 0     , , 14    3.58  , 3.62  , Cur   , 0.04
> > 15    , 6     , , 14    3.59  , 3.62  , Cur   , 0.03
> > 13    , 0     , , 14    4.68  , 3.6   , New   , 1.08
> > 13    , 6     , , 14    4.68  , 3.63  , New   , 1.05
> > 16    , 0     , , 15    3.57  , 3.6   , Cur   , 0.03
> > 16    , 7     , , 15    3.55  , 3.59  , Cur   , 0.04
> > 14    , 0     , , 15    4.69  , 3.61  , New   , 1.08
> > 14    , 7     , , 15    4.69  , 3.61  , New   , 1.08
> > 17    , 0     , , 16    3.56  , 3.61  , Cur   , 0.05
> > 15    , 0     , , 16    4.71  , 3.58  , New   , 1.13
> > 18    , 0     , , 17    3.57  , 3.65  , Cur   , 0.08
> > 18    , 1     , , 17    3.58  , 3.59  , Cur   , 0.01
> > 16    , 0     , , 17    4.7   , 3.58  , New   , 1.12
> > 16    , 1     , , 17    4.68  , 3.59  , New   , 1.09
> > 19    , 0     , , 18    3.51  , 3.58  , Cur   , 0.07
> > 19    , 2     , , 18    3.55  , 3.58  , Cur   , 0.03
> > 17    , 0     , , 18    4.69  , 3.61  , New   , 1.08
> > 17    , 2     , , 18    4.68  , 3.61  , New   , 1.07
> > 20    , 0     , , 19    3.57  , 3.6   , Cur   , 0.03
> > 20    , 3     , , 19    3.59  , 3.59  , Eq    , 0.0
> > 18    , 0     , , 19    4.68  , 3.59  , New   , 1.09
> > 18    , 3     , , 19    4.67  , 3.57  , New   , 1.1
> > 21    , 0     , , 20    3.61  , 3.58  , New   , 0.03
> > 21    , 4     , , 20    3.62  , 3.6   , New   , 0.02
> > 19    , 0     , , 20    4.74  , 3.57  , New   , 1.17
> > 19    , 4     , , 20    4.69  , 3.7   , New   , 0.99
> > 22    , 0     , , 21    3.57  , 3.64  , Cur   , 0.07
> > 22    , 5     , , 21    3.55  , 3.6   , Cur   , 0.05
> > 20    , 0     , , 21    4.72  , 3.55  , New   , 1.17
> > 20    , 5     , , 21    4.66  , 3.55  , New   , 1.11
> > 23    , 0     , , 22    3.56  , 3.56  , Eq    , 0.0
> > 23    , 6     , , 22    3.54  , 3.56  , Cur   , 0.02
> > 21    , 0     , , 22    4.65  , 3.53  , New   , 1.12
> > 21    , 6     , , 22    4.62  , 3.56  , New   , 1.06
> > 24    , 0     , , 23    3.5   , 3.54  , Cur   , 0.04
> > 24    , 7     , , 23    3.52  , 3.53  , Cur   , 0.01
> > 22    , 0     , , 23    4.61  , 3.51  , New   , 1.1
> > 22    , 7     , , 23    4.6   , 3.51  , New   , 1.09
> > 25    , 0     , , 24    3.5   , 3.53  , Cur   , 0.03
> > 23    , 0     , , 24    4.54  , 3.5   , New   , 1.04
> > 26    , 0     , , 25    3.47  , 3.49  , Cur   , 0.02
> > 26    , 1     , , 25    3.46  , 3.51  , Cur   , 0.05
> > 24    , 0     , , 25    4.53  , 3.51  , New   , 1.02
> > 24    , 1     , , 25    4.51  , 3.51  , New   , 1.0
> > 27    , 0     , , 26    3.44  , 3.51  , Cur   , 0.07
> > 27    , 2     , , 26    3.51  , 3.52  , Cur   , 0.01
> > 25    , 0     , , 26    4.56  , 3.46  , New   , 1.1
> > 25    , 2     , , 26    4.55  , 3.47  , New   , 1.08
> > 28    , 0     , , 27    3.47  , 3.5   , Cur   , 0.03
> > 28    , 3     , , 27    3.48  , 3.47  , New   , 0.01
> > 26    , 0     , , 27    4.52  , 3.44  , New   , 1.08
> > 26    , 3     , , 27    4.55  , 3.46  , New   , 1.09
> > 29    , 0     , , 28    3.45  , 3.49  , Cur   , 0.04
> > 29    , 4     , , 28    3.5   , 3.5   , Eq    , 0.0
> > 27    , 0     , , 28    4.56  , 3.49  , New   , 1.07
> > 27    , 4     , , 28    4.5   , 3.49  , New   , 1.01
> > 30    , 0     , , 29    3.44  , 3.48  , Cur   , 0.04
> > 30    , 5     , , 29    3.46  , 3.47  , Cur   , 0.01
> > 28    , 0     , , 29    4.49  , 3.43  , New   , 1.06
> > 28    , 5     , , 29    4.57  , 3.45  , New   , 1.12
> > 31    , 0     , , 30    3.48  , 3.48  , Eq    , 0.0
> > 31    , 6     , , 30    3.46  , 3.49  , Cur   , 0.03
> > 29    , 0     , , 30    4.49  , 3.44  , New   , 1.05
> > 29    , 6     , , 30    4.53  , 3.44  , New   , 1.09
> > 32    , 0     , , 31    3.44  , 3.45  , Cur   , 0.01
> > 32    , 7     , , 31    3.46  , 3.51  , Cur   , 0.05
> > 30    , 0     , , 31    4.48  , 3.42  , New   , 1.06
> > 30    , 7     , , 31    4.48  , 3.44  , New   , 1.04
> >
> >
> > Results For Skylake memchr-avx2
> > size  , algn  , Pos   , Cur T , New T , Win   , Dif
> > 2048  , 0     , , 32    6.61  , 5.4   , New   , 1.21
> > 256   , 1     , , 64    6.52  , 5.68  , New   , 0.84
> > 2048  , 0     , , 64    6.03  , 5.47  , New   , 0.56
> > 256   , 2     , , 64    6.07  , 5.42  , New   , 0.65
> > 2048  , 0     , , 128   7.01  , 5.83  , New   , 1.18
> > 256   , 3     , , 64    6.24  , 5.68  , New   , 0.56
> > 2048  , 0     , , 256   11.03 , 9.86  , New   , 1.17
> > 256   , 4     , , 64    6.17  , 5.49  , New   , 0.68
> > 2048  , 0     , , 512   14.11 , 13.41 , New   , 0.7
> > 256   , 5     , , 64    6.03  , 5.45  , New   , 0.58
> > 2048  , 0     , , 1024  19.82 , 19.92 , Cur   , 0.1
> > 256   , 6     , , 64    6.14  , 5.7   , New   , 0.44
> > 2048  , 0     , , 2048  30.9  , 30.59 , New   , 0.31
> > 256   , 7     , , 64    6.05  , 5.64  , New   , 0.41
> > 192   , 1     , , 32    5.6   , 4.89  , New   , 0.71
> > 256   , 1     , , 32    5.59  , 5.07  , New   , 0.52
> > 512   , 1     , , 32    5.58  , 4.93  , New   , 0.65
> > 192   , 2     , , 64    6.14  , 5.46  , New   , 0.68
> > 512   , 2     , , 64    5.95  , 5.38  , New   , 0.57
> > 192   , 3     , , 96    6.6   , 5.74  , New   , 0.86
> > 256   , 3     , , 96    6.48  , 5.37  , New   , 1.11
> > 512   , 3     , , 96    6.56  , 5.44  , New   , 1.12
> > 192   , 4     , , 128   7.04  , 6.02  , New   , 1.02
> > 256   , 4     , , 128   6.96  , 5.89  , New   , 1.07
> > 512   , 4     , , 128   6.97  , 5.99  , New   , 0.98
> > 192   , 5     , , 160   8.49  , 7.07  , New   , 1.42
> > 256   , 5     , , 160   8.1   , 6.96  , New   , 1.14
> > 512   , 5     , , 160   10.48 , 9.14  , New   , 1.34
> > 192   , 6     , , 192   8.46  , 8.52  , Cur   , 0.06
> > 256   , 6     , , 192   8.53  , 7.58  , New   , 0.95
> > 512   , 6     , , 192   10.88 , 9.06  , New   , 1.82
> > 192   , 7     , , 224   8.59  , 8.35  , New   , 0.24
> > 256   , 7     , , 224   8.86  , 7.91  , New   , 0.95
> > 512   , 7     , , 224   10.89 , 8.98  , New   , 1.91
> > 2     , 0     , , 1     4.28  , 3.62  , New   , 0.66
> > 2     , 1     , , 1     4.32  , 3.75  , New   , 0.57
> > 0     , 0     , , 1     3.76  , 3.24  , New   , 0.52
> > 0     , 1     , , 1     3.7   , 3.19  , New   , 0.51
> > 3     , 0     , , 2     4.16  , 3.67  , New   , 0.49
> > 3     , 2     , , 2     4.21  , 3.68  , New   , 0.53
> > 1     , 0     , , 2     4.25  , 3.74  , New   , 0.51
> > 1     , 2     , , 2     4.4   , 3.82  , New   , 0.58
> > 4     , 0     , , 3     4.43  , 3.88  , New   , 0.55
> > 4     , 3     , , 3     4.34  , 3.8   , New   , 0.54
> > 2     , 0     , , 3     4.33  , 3.79  , New   , 0.54
> > 2     , 3     , , 3     4.37  , 3.84  , New   , 0.53
> > 5     , 0     , , 4     4.45  , 3.87  , New   , 0.58
> > 5     , 4     , , 4     4.41  , 3.84  , New   , 0.57
> > 3     , 0     , , 4     4.34  , 3.83  , New   , 0.51
> > 3     , 4     , , 4     4.35  , 3.82  , New   , 0.53
> > 6     , 0     , , 5     4.41  , 3.88  , New   , 0.53
> > 6     , 5     , , 5     4.41  , 3.88  , New   , 0.53
> > 4     , 0     , , 5     4.35  , 3.84  , New   , 0.51
> > 4     , 5     , , 5     4.37  , 3.85  , New   , 0.52
> > 7     , 0     , , 6     4.4   , 3.84  , New   , 0.56
> > 7     , 6     , , 6     4.39  , 3.83  , New   , 0.56
> > 5     , 0     , , 6     4.37  , 3.85  , New   , 0.52
> > 5     , 6     , , 6     4.4   , 3.86  , New   , 0.54
> > 8     , 0     , , 7     4.39  , 3.88  , New   , 0.51
> > 8     , 7     , , 7     4.4   , 3.83  , New   , 0.57
> > 6     , 0     , , 7     4.39  , 3.85  , New   , 0.54
> > 6     , 7     , , 7     4.38  , 3.87  , New   , 0.51
> > 9     , 0     , , 8     4.47  , 3.96  , New   , 0.51
> > 7     , 0     , , 8     4.37  , 3.85  , New   , 0.52
> > 10    , 0     , , 9     4.61  , 4.08  , New   , 0.53
> > 10    , 1     , , 9     4.61  , 4.09  , New   , 0.52
> > 8     , 0     , , 9     4.37  , 3.85  , New   , 0.52
> > 8     , 1     , , 9     4.37  , 3.85  , New   , 0.52
> > 11    , 0     , , 10    4.68  , 4.06  , New   , 0.62
> > 11    , 2     , , 10    4.56  , 4.1   , New   , 0.46
> > 9     , 0     , , 10    4.36  , 3.83  , New   , 0.53
> > 9     , 2     , , 10    4.37  , 3.83  , New   , 0.54
> > 12    , 0     , , 11    4.62  , 4.05  , New   , 0.57
> > 12    , 3     , , 11    4.63  , 4.06  , New   , 0.57
> > 10    , 0     , , 11    4.38  , 3.86  , New   , 0.52
> > 10    , 3     , , 11    4.41  , 3.86  , New   , 0.55
> > 13    , 0     , , 12    4.57  , 4.08  , New   , 0.49
> > 13    , 4     , , 12    4.59  , 4.12  , New   , 0.47
> > 11    , 0     , , 12    4.45  , 4.0   , New   , 0.45
> > 11    , 4     , , 12    4.51  , 4.04  , New   , 0.47
> > 14    , 0     , , 13    4.64  , 4.16  , New   , 0.48
> > 14    , 5     , , 13    4.67  , 4.1   , New   , 0.57
> > 12    , 0     , , 13    4.58  , 4.08  , New   , 0.5
> > 12    , 5     , , 13    4.6   , 4.1   , New   , 0.5
> > 15    , 0     , , 14    4.61  , 4.05  , New   , 0.56
> > 15    , 6     , , 14    4.59  , 4.06  , New   , 0.53
> > 13    , 0     , , 14    4.57  , 4.06  , New   , 0.51
> > 13    , 6     , , 14    4.57  , 4.05  , New   , 0.52
> > 16    , 0     , , 15    4.62  , 4.05  , New   , 0.57
> > 16    , 7     , , 15    4.63  , 4.06  , New   , 0.57
> > 14    , 0     , , 15    4.61  , 4.06  , New   , 0.55
> > 14    , 7     , , 15    4.59  , 4.05  , New   , 0.54
> > 17    , 0     , , 16    4.58  , 4.08  , New   , 0.5
> > 15    , 0     , , 16    4.64  , 4.06  , New   , 0.58
> > 18    , 0     , , 17    4.56  , 4.17  , New   , 0.39
> > 18    , 1     , , 17    4.59  , 4.09  , New   , 0.5
> > 16    , 0     , , 17    4.59  , 4.07  , New   , 0.52
> > 16    , 1     , , 17    4.58  , 4.04  , New   , 0.54
> > 19    , 0     , , 18    4.61  , 4.05  , New   , 0.56
> > 19    , 2     , , 18    4.6   , 4.08  , New   , 0.52
> > 17    , 0     , , 18    4.64  , 4.11  , New   , 0.53
> > 17    , 2     , , 18    4.56  , 4.13  , New   , 0.43
> > 20    , 0     , , 19    4.77  , 4.3   , New   , 0.47
> > 20    , 3     , , 19    4.6   , 4.14  , New   , 0.46
> > 18    , 0     , , 19    4.72  , 4.02  , New   , 0.7
> > 18    , 3     , , 19    4.53  , 4.01  , New   , 0.52
> > 21    , 0     , , 20    4.66  , 4.26  , New   , 0.4
> > 21    , 4     , , 20    4.74  , 4.07  , New   , 0.67
> > 19    , 0     , , 20    4.62  , 4.12  , New   , 0.5
> > 19    , 4     , , 20    4.57  , 4.04  , New   , 0.53
> > 22    , 0     , , 21    4.61  , 4.13  , New   , 0.48
> > 22    , 5     , , 21    4.64  , 4.08  , New   , 0.56
> > 20    , 0     , , 21    4.49  , 4.01  , New   , 0.48
> > 20    , 5     , , 21    4.58  , 4.06  , New   , 0.52
> > 23    , 0     , , 22    4.62  , 4.13  , New   , 0.49
> > 23    , 6     , , 22    4.72  , 4.27  , New   , 0.45
> > 21    , 0     , , 22    4.65  , 3.97  , New   , 0.68
> > 21    , 6     , , 22    4.5   , 4.02  , New   , 0.48
> > 24    , 0     , , 23    4.78  , 4.07  , New   , 0.71
> > 24    , 7     , , 23    4.67  , 4.23  , New   , 0.44
> > 22    , 0     , , 23    4.49  , 3.99  , New   , 0.5
> > 22    , 7     , , 23    4.56  , 4.03  , New   , 0.53
> > 25    , 0     , , 24    4.6   , 4.15  , New   , 0.45
> > 23    , 0     , , 24    4.57  , 4.06  , New   , 0.51
> > 26    , 0     , , 25    4.54  , 4.14  , New   , 0.4
> > 26    , 1     , , 25    4.72  , 4.1   , New   , 0.62
> > 24    , 0     , , 25    4.52  , 4.13  , New   , 0.39
> > 24    , 1     , , 25    4.55  , 4.0   , New   , 0.55
> > 27    , 0     , , 26    4.51  , 4.06  , New   , 0.45
> > 27    , 2     , , 26    4.53  , 4.16  , New   , 0.37
> > 25    , 0     , , 26    4.59  , 4.09  , New   , 0.5
> > 25    , 2     , , 26    4.55  , 4.01  , New   , 0.54
> > 28    , 0     , , 27    4.59  , 3.99  , New   , 0.6
> > 28    , 3     , , 27    4.57  , 3.95  , New   , 0.62
> > 26    , 0     , , 27    4.55  , 4.15  , New   , 0.4
> > 26    , 3     , , 27    4.57  , 3.99  , New   , 0.58
> > 29    , 0     , , 28    4.41  , 4.03  , New   , 0.38
> > 29    , 4     , , 28    4.59  , 4.02  , New   , 0.57
> > 27    , 0     , , 28    4.63  , 4.08  , New   , 0.55
> > 27    , 4     , , 28    4.44  , 4.02  , New   , 0.42
> > 30    , 0     , , 29    4.53  , 3.93  , New   , 0.6
> > 30    , 5     , , 29    4.55  , 3.88  , New   , 0.67
> > 28    , 0     , , 29    4.49  , 3.9   , New   , 0.59
> > 28    , 5     , , 29    4.44  , 3.94  , New   , 0.5
> > 31    , 0     , , 30    4.41  , 3.85  , New   , 0.56
> > 31    , 6     , , 30    4.48  , 3.86  , New   , 0.62
> > 29    , 0     , , 30    4.55  , 3.94  , New   , 0.61
> > 29    , 6     , , 30    4.32  , 3.95  , New   , 0.37
> > 32    , 0     , , 31    4.36  , 3.91  , New   , 0.45
> > 32    , 7     , , 31    4.37  , 3.89  , New   , 0.48
> > 30    , 0     , , 31    4.65  , 3.9   , New   , 0.75
> > 30    , 7     , , 31    4.42  , 3.93  , New   , 0.49
> >
> >  sysdeps/x86_64/multiarch/memchr-evex.S | 580 +++++++++++++++----------
> >  1 file changed, 349 insertions(+), 231 deletions(-)
> >
> > diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S
> > index 6dd5d67b90..65c16ef8a4 100644
> > --- a/sysdeps/x86_64/multiarch/memchr-evex.S
> > +++ b/sysdeps/x86_64/multiarch/memchr-evex.S
> > @@ -26,14 +26,28 @@
> >
> >  # ifdef USE_AS_WMEMCHR
> >  #  define VPBROADCAST        vpbroadcastd
> > -#  define VPCMP              vpcmpd
> > -#  define SHIFT_REG  r8d
> > +#  define VPMINU     vpminud
> > +#  define VPCMP      vpcmpd
> > +#  define VPCMPEQ    vpcmpeqd
> > +#  define CHAR_SIZE  4
> >  # else
> >  #  define VPBROADCAST        vpbroadcastb
> > -#  define VPCMP              vpcmpb
> > -#  define SHIFT_REG  ecx
> > +#  define VPMINU     vpminub
> > +#  define VPCMP      vpcmpb
> > +#  define VPCMPEQ    vpcmpeqb
> > +#  define CHAR_SIZE  1
> >  # endif
> >
> > +# ifdef USE_AS_RAWMEMCHR
> > +#  define RAW_PTR_REG        rcx
> > +#  define ALGN_PTR_REG       rdi
> > +# else
> > +#  define RAW_PTR_REG        rdi
> > +#  define ALGN_PTR_REG       rcx
> > +# endif
> > +
> > +#define XZERO                xmm23
>
> Add a space before define.  Rename XZERO to XMMZERO.

Done.

>
> > +#define YZERO                ymm23
>
> Add a space before define.  Rename YZERO to YMMZERO.

Done.

>
> >  # define XMMMATCH    xmm16
> >  # define YMMMATCH    ymm16
> >  # define YMM1                ymm17
> > @@ -44,18 +58,16 @@
> >  # define YMM6                ymm22
> >
> >  # define VEC_SIZE 32
> > +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
> > +# define PAGE_SIZE 4096
> >
> >       .section .text.evex,"ax",@progbits
> > -ENTRY (MEMCHR)
> > +ENTRY(MEMCHR)
>
> No need for this change.

Fixed.

>
> >  # ifndef USE_AS_RAWMEMCHR
> >       /* Check for zero length.  */
> >       test    %RDX_LP, %RDX_LP
> >       jz      L(zero)
> > -# endif
> > -     movl    %edi, %ecx
> > -# ifdef USE_AS_WMEMCHR
> > -     shl     $2, %RDX_LP
> > -# else
> > +
> >  #  ifdef __ILP32__
> >       /* Clear the upper 32 bits.  */
> >       movl    %edx, %edx
> > @@ -63,319 +75,425 @@ ENTRY (MEMCHR)
> >  # endif
> >       /* Broadcast CHAR to YMMMATCH.  */
> >       VPBROADCAST %esi, %YMMMATCH
> > -     /* Check if we may cross page boundary with one vector load.  */
> > -     andl    $(2 * VEC_SIZE - 1), %ecx
> > -     cmpl    $VEC_SIZE, %ecx
> > -     ja      L(cros_page_boundary)
> > +     /* Check if we may cross page boundary with one
> > +        vector load.  */
>
> Fit comments to 72 columns.

Fixed.

>
> > +     movl    %edi, %eax
> > +     andl    $(PAGE_SIZE - 1), %eax
> > +     cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
> > +     ja      L(cross_page_boundary)
> >
> >       /* Check the first VEC_SIZE bytes.  */
> > -     VPCMP   $0, (%rdi), %YMMMATCH, %k1
> > -     kmovd   %k1, %eax
> > -     testl   %eax, %eax
> > -
> > +     VPCMP   $0, (%rdi), %YMMMATCH, %k0
> > +     kmovd   %k0, %eax
> >  # ifndef USE_AS_RAWMEMCHR
> > -     jnz     L(first_vec_x0_check)
> > -     /* Adjust length and check the end of data.  */
> > -     subq    $VEC_SIZE, %rdx
> > -     jbe     L(zero)
> > +     /* If length < CHAR_PER_VEC handle special.  */
> > +     cmpq    $CHAR_PER_VEC, %rdx
> > +     jbe     L(first_vec_x0)
> > +# endif
> > +     testl   %eax, %eax
> > +     jz      L(aligned_more)
> > +     tzcntl  %eax, %eax
> > +# ifdef USE_AS_WMEMCHR
> > +     /* NB: Multiply bytes by CHAR_SIZE to get the
> > +        wchar_t count.  */
>
> Fit comments to 72 columns.

Fixed.

>
> > +     leaq    (%rdi, %rax, CHAR_SIZE), %rax
> >  # else
> > -     jnz     L(first_vec_x0)
> > +     addq    %rdi, %rax
> >  # endif
> > -
> > -     /* Align data for aligned loads in the loop.  */
> > -     addq    $VEC_SIZE, %rdi
> > -     andl    $(VEC_SIZE - 1), %ecx
> > -     andq    $-VEC_SIZE, %rdi
> > +     ret
> >
> >  # ifndef USE_AS_RAWMEMCHR
> > -     /* Adjust length.  */
> > -     addq    %rcx, %rdx
> > -
> > -     subq    $(VEC_SIZE * 4), %rdx
> > -     jbe     L(last_4x_vec_or_less)
> > -# endif
> > -     jmp     L(more_4x_vec)
> > +L(zero):
> > +     xorl    %eax, %eax
> > +     ret
> >
> > +     .p2align 5
> > +L(first_vec_x0):
> > +     /* Check if first match was before length.  */
> > +     tzcntl  %eax, %eax
> > +     xorl    %ecx, %ecx
> > +     cmpl    %eax, %edx
> > +     leaq    (%rdi, %rax, CHAR_SIZE), %rax
> > +     cmovle  %rcx, %rax
> > +     ret
> > +# else
> > +     /* NB: first_vec_x0 is 17 bytes which will leave
> > +        cross_page_boundary (which is relatively cold) close
> > +        enough to ideal alignment. So only realign
> > +        L(cross_page_boundary) if rawmemchr.  */
>
> Fit comments to 72 columns.

Fixed.

>
> >       .p2align 4
> > -L(cros_page_boundary):
> > -     andl    $(VEC_SIZE - 1), %ecx
> > +# endif
> > +L(cross_page_boundary):
> > +     /* Save pointer before aligning as its original
> > +        value is necessary for computer return address if byte is
> > +        found or adjusting length if it is not and this is
> > +        memchr.  */
>
> Fit comments to 72 columns.

Fixed.

>
> > +     movq    %rdi, %rcx
> > +     /* Align data to VEC_SIZE. ALGN_PTR_REG is rcx
> > +        for memchr and rdi for rawmemchr.  */
>
> Fit comments to 72 columns.

Fixed.

>
> > +     andq    $-VEC_SIZE, %ALGN_PTR_REG
> > +     VPCMP   $0, (%ALGN_PTR_REG), %YMMMATCH, %k0
> > +     kmovd   %k0, %r8d
> >  # ifdef USE_AS_WMEMCHR
> > -     /* NB: Divide shift count by 4 since each bit in K1 represent 4
> > -        bytes.  */
> > -     movl    %ecx, %SHIFT_REG
> > -     sarl    $2, %SHIFT_REG
> > +     /* NB: Divide shift count by 4 since each bit in
> > +        K0 represent 4 bytes.  */
> > +     sarl    $2, %eax
> > +# endif
> > +# ifndef USE_AS_RAWMEMCHR
> > +     movl    $(PAGE_SIZE / CHAR_SIZE), %esi
> > +     subl    %eax, %esi
> >  # endif
> > -     andq    $-VEC_SIZE, %rdi
> > -     VPCMP   $0, (%rdi), %YMMMATCH, %k1
> > -     kmovd   %k1, %eax
> > -     /* Remove the leading bytes.  */
> > -     sarxl   %SHIFT_REG, %eax, %eax
> > -     testl   %eax, %eax
> > -     jz      L(aligned_more)
> > -     tzcntl  %eax, %eax
> >  # ifdef USE_AS_WMEMCHR
> > -     /* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
> > -     sall    $2, %eax
> > +     andl    $(CHAR_PER_VEC - 1), %eax
> >  # endif
> > +     /* Remove the leading bytes.  */
> > +     sarxl   %eax, %r8d, %eax
> >  # ifndef USE_AS_RAWMEMCHR
> >       /* Check the end of data.  */
> > -     cmpq    %rax, %rdx
> > -     jbe     L(zero)
> > +     cmpq    %rsi, %rdx
> > +     jbe     L(first_vec_x0)
> > +# endif
> > +     testl   %eax, %eax
> > +     jz      L(cross_page_continue)
> > +     tzcntl  %eax, %eax
> > +# ifdef USE_AS_WMEMCHR
> > +     /* NB: Multiply bytes by CHAR_SIZE to get the
> > +        wchar_t count.  */
> > +     leaq    (%RAW_PTR_REG, %rax, CHAR_SIZE), %rax
> > +# else
> > +     addq    %RAW_PTR_REG, %rax
> >  # endif
> > -     addq    %rdi, %rax
> > -     addq    %rcx, %rax
> >       ret
> >
> >       .p2align 4
> > -L(aligned_more):
> > -# ifndef USE_AS_RAWMEMCHR
> > -        /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)"
> > -        instead of "(rdx + rcx) - VEC_SIZE" to void possible addition
> > -        overflow.  */
> > -     negq    %rcx
> > -     addq    $VEC_SIZE, %rcx
> > +L(first_vec_x1):
> > +     tzcntl  %eax, %eax
> > +     leaq    VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
> > +     ret
> >
> > -     /* Check the end of data.  */
> > -     subq    %rcx, %rdx
> > -     jbe     L(zero)
> > -# endif
> > +     .p2align 4
> > +L(first_vec_x2):
> > +     tzcntl  %eax, %eax
> > +     leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> > +     ret
> >
> > -     addq    $VEC_SIZE, %rdi
> > +     .p2align 4
> > +L(first_vec_x3):
> > +     tzcntl  %eax, %eax
> > +     leaq    (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
> > +     ret
> > +
> > +     .p2align 4
> > +L(first_vec_x4):
> > +     tzcntl  %eax, %eax
> > +     leaq    (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
> > +     ret
> > +
> > +     .p2align 5
> > +L(aligned_more):
> > +     /* Check the first 4 * VEC_SIZE.  Only one
> > +        VEC_SIZE at a time since data is only aligned to
> > +        VEC_SIZE.  */
>
> Fit comments to 72 columns.

Fixed.

>
> >
> >  # ifndef USE_AS_RAWMEMCHR
> > -     subq    $(VEC_SIZE * 4), %rdx
> > +     /* Align data to VEC_SIZE.  */
> > +L(cross_page_continue):
> > +     xorl    %ecx, %ecx
> > +     subl    %edi, %ecx
> > +     andq    $-VEC_SIZE, %rdi
> > +     /* esi is for adjusting length to see if near the
> > +        end.  */
>
> Fit comments to 72 columns.

Fixed.

>
> > +     leal    (VEC_SIZE * 5)(%rdi, %rcx), %esi
> > +#  ifdef USE_AS_WMEMCHR
> > +     /* NB: Divide bytes by 4 to get the wchar_t
> > +        count.  */
> > +     sarl    $2, %esi
> > +#  endif
> > +# else
> > +     andq    $-VEC_SIZE, %rdi
> > +L(cross_page_continue):
> > +# endif
> > +     /* Load first VEC regardless.  */
> > +     VPCMP   $0, (VEC_SIZE)(%rdi), %YMMMATCH, %k0
> > +     kmovd   %k0, %eax
> > +# ifndef USE_AS_RAWMEMCHR
> > +     /* Adjust length. If near end handle specially.
> > +      */
>
> Fit comments to 72 columns.

Fixed.

>
> > +     subq    %rsi, %rdx
> >       jbe     L(last_4x_vec_or_less)
> >  # endif
> > -
> > -L(more_4x_vec):
> > -     /* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
> > -        since data is only aligned to VEC_SIZE.  */
> > -     VPCMP   $0, (%rdi), %YMMMATCH, %k1
> > -     kmovd   %k1, %eax
> > -     testl   %eax, %eax
> > -     jnz     L(first_vec_x0)
> > -
> > -     VPCMP   $0, VEC_SIZE(%rdi), %YMMMATCH, %k1
> > -     kmovd   %k1, %eax
> >       testl   %eax, %eax
> >       jnz     L(first_vec_x1)
> >
> > -     VPCMP   $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
> > -     kmovd   %k1, %eax
> > +     VPCMP   $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
> > +     kmovd   %k0, %eax
> >       testl   %eax, %eax
> >       jnz     L(first_vec_x2)
> >
> > -     VPCMP   $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
> > -     kmovd   %k1, %eax
> > +     VPCMP   $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
> > +     kmovd   %k0, %eax
> >       testl   %eax, %eax
> >       jnz     L(first_vec_x3)
> >
> > -     addq    $(VEC_SIZE * 4), %rdi
> > +     VPCMP   $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
> > +     kmovd   %k0, %eax
> > +     testl   %eax, %eax
> > +     jnz     L(first_vec_x4)
> > +
> >
> >  # ifndef USE_AS_RAWMEMCHR
> > -     subq    $(VEC_SIZE * 4), %rdx
> > -     jbe     L(last_4x_vec_or_less)
> > -# endif
> > +     /* Check if at last CHAR_PER_VEC * 4 length.  */
> > +     subq    $(CHAR_PER_VEC * 4), %rdx
> > +     jbe     L(last_4x_vec_or_less_cmpeq)
> > +     addq    $VEC_SIZE, %rdi
> >
> > -     /* Align data to 4 * VEC_SIZE.  */
> > -     movq    %rdi, %rcx
> > -     andl    $(4 * VEC_SIZE - 1), %ecx
> > +     /* Align data to VEC_SIZE * 4 for the loop and
> > +        readjust length.  */
>
> Fit comments to 72 columns.

Fixed.

>
> > +#  ifdef USE_AS_WMEMCHR
> > +     movl    %edi, %ecx
> >       andq    $-(4 * VEC_SIZE), %rdi
> > -
> > -# ifndef USE_AS_RAWMEMCHR
> > -     /* Adjust length.  */
> > +     andl    $(VEC_SIZE * 4 - 1), %ecx
> > +     /* NB: Divide bytes by 4 to get the wchar_t
> > +        count.  */
>
> Fit comments to 72 columns.

Fixed.

>
> > +     sarl    $2, %ecx
> >       addq    %rcx, %rdx
> > +#  else
> > +     addq    %rdi, %rdx
> > +     andq    $-(4 * VEC_SIZE), %rdi
> > +     subq    %rdi, %rdx
> > +#  endif
> > +# else
> > +     addq    $VEC_SIZE, %rdi
> > +     andq    $-(4 * VEC_SIZE), %rdi
> >  # endif
> >
> > +     vpxorq  %XZERO, %XZERO, %XZERO
> > +
> > +     /* Compare 4 * VEC at a time forward.  */
> >       .p2align 4
> >  L(loop_4x_vec):
> > -     /* Compare 4 * VEC at a time forward.  */
> > -     VPCMP   $0, (%rdi), %YMMMATCH, %k1
> > -     VPCMP   $0, VEC_SIZE(%rdi), %YMMMATCH, %k2
> > -     kord    %k1, %k2, %k5
> > -     VPCMP   $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3
> > -     VPCMP   $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4
> > -
> > -     kord    %k3, %k4, %k6
> > -     kortestd %k5, %k6
> > -     jnz     L(4x_vec_end)
> > -
> > -     addq    $(VEC_SIZE * 4), %rdi
> > -
> > +     /* It would be possible to save some instructions
> > +        using 4x VPCMP but bottleneck on port 5 makes it not woth
> > +        it.  */
>
> Fit comments to 72 columns.

Fixed.

>
> > +     VPCMP   $4, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k1
> > +     /* xor will set bytes match esi to zero.  */
> > +     vpxorq  (VEC_SIZE * 5)(%rdi), %YMMMATCH, %YMM2
> > +     vpxorq  (VEC_SIZE * 6)(%rdi), %YMMMATCH, %YMM3
> > +     VPCMP   $0, (VEC_SIZE * 7)(%rdi), %YMMMATCH, %k3
> > +     /* Reduce VEC2 / VEC3 with min and VEC1 with zero
> > +        mask.  */
>
> Fit comments to 72 columns.

Fixed.

>
> > +     VPMINU  %YMM2, %YMM3, %YMM3 {%k1} {z}
> > +     VPCMP   $0, %YMM3, %YZERO, %k2
> >  # ifdef USE_AS_RAWMEMCHR
> > -     jmp     L(loop_4x_vec)
> > +     subq    $-(VEC_SIZE * 4), %rdi
> > +     kortestd %k2, %k3
> > +     jz      L(loop_4x_vec)
> >  # else
> > -     subq    $(VEC_SIZE * 4), %rdx
> > -     ja      L(loop_4x_vec)
> > +     kortestd %k2, %k3
> > +     jnz     L(loop_4x_vec_end)
> >
> > -L(last_4x_vec_or_less):
> > -     /* Less than 4 * VEC and aligned to VEC_SIZE.  */
> > -     addl    $(VEC_SIZE * 2), %edx
> > -     jle     L(last_2x_vec)
> > +     subq    $-(VEC_SIZE * 4), %rdi
> >
> > -     VPCMP   $0, (%rdi), %YMMMATCH, %k1
> > -     kmovd   %k1, %eax
> > -     testl   %eax, %eax
> > -     jnz     L(first_vec_x0)
> > +     subq    $(CHAR_PER_VEC * 4), %rdx
> > +     ja      L(loop_4x_vec)
> >
> > -     VPCMP   $0, VEC_SIZE(%rdi), %YMMMATCH, %k1
> > -     kmovd   %k1, %eax
> > +     /* Fall through into less than 4 remaining
> > +        vectors of length case.  */
>
> Fit comments to 72 columns.

Fixed.

>
> > +     VPCMP   $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
> > +     kmovd   %k0, %eax
> > +     addq    $(VEC_SIZE * 3), %rdi
> > +     .p2align 4
> > +L(last_4x_vec_or_less):
> > +     /* Check if first VEC contained match.  */
> >       testl   %eax, %eax
> > -     jnz     L(first_vec_x1)
> > +     jnz     L(first_vec_x1_check)
> >
> > -     VPCMP   $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
> > -     kmovd   %k1, %eax
> > -     testl   %eax, %eax
> > +     /* If remaining length > CHAR_PER_VEC * 2.  */
> > +     addl    $(CHAR_PER_VEC * 2), %edx
> > +     jg      L(last_4x_vec)
> >
> > -     jnz     L(first_vec_x2_check)
> > -     subl    $VEC_SIZE, %edx
> > -     jle     L(zero)
> > +L(last_2x_vec):
> > +     /* If remaining length < CHAR_PER_VEC.  */
> > +     addl    $CHAR_PER_VEC, %edx
> > +     jle     L(zero_end)
> > +
> > +     /* Check VEC2 and compare any match with
> > +        remaining length.  */
>
> Fit comments to 72 columns.

Fixed.

>
> > +     VPCMP   $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
> > +     kmovd   %k0, %eax
> > +     tzcntl  %eax, %eax
> > +     cmpl    %eax, %edx
> > +     jbe     L(set_zero_end)
> > +     leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> > +L(zero_end):
> > +     ret
> >
> > -     VPCMP   $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
> > -     kmovd   %k1, %eax
> > -     testl   %eax, %eax
> >
> > -     jnz     L(first_vec_x3_check)
> > +     .p2align 4
> > +L(first_vec_x1_check):
> > +     tzcntl  %eax, %eax
> > +     /* Adjust length.  */
> > +     subl    $-(CHAR_PER_VEC * 4), %edx
> > +     /* Check if match within remaining length.  */
> > +     cmpl    %eax, %edx
> > +     jbe     L(set_zero_end)
> > +     /* NB: Multiply bytes by CHAR_SIZE to get the
> > +        wchar_t count.  */
>
> Fit comments to 72 columns.

Fixed.

>
> > +     leaq    VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
> > +     ret
> > +L(set_zero_end):
> >       xorl    %eax, %eax
> >       ret
> >
> >       .p2align 4
> > -L(last_2x_vec):
> > -     addl    $(VEC_SIZE * 2), %edx
> > -     VPCMP   $0, (%rdi), %YMMMATCH, %k1
> > +L(loop_4x_vec_end):
> > +# endif
> > +     /* rawmemchr will fall through into this if match
> > +        was found in loop.  */
>
> Fit comments to 72 columns.

Fixed.

>
> > +
> > +     /* k1 has not of matches with VEC1.  */
> >       kmovd   %k1, %eax
> > -     testl   %eax, %eax
> > +# ifdef USE_AS_WMEMCHR
> > +     subl    $((1 << CHAR_PER_VEC) - 1), %eax
> > +# else
> > +     incl    %eax
> > +# endif
> > +     jnz     L(last_vec_x1_return)
> >
> > -     jnz     L(first_vec_x0_check)
> > -     subl    $VEC_SIZE, %edx
> > -     jle     L(zero)
> > +     VPCMP   $0, %YMM2, %YZERO, %k0
> > +     kmovd   %k0, %eax
> > +     testl   %eax, %eax
> > +     jnz     L(last_vec_x2_return)
> >
> > -     VPCMP   $0, VEC_SIZE(%rdi), %YMMMATCH, %k1
> > -     kmovd   %k1, %eax
> > +     kmovd   %k2, %eax
> >       testl   %eax, %eax
> > -     jnz     L(first_vec_x1_check)
> > -     xorl    %eax, %eax
> > -     ret
> > +     jnz     L(last_vec_x3_return)
> >
> > -     .p2align 4
> > -L(first_vec_x0_check):
> > +     kmovd   %k3, %eax
> >       tzcntl  %eax, %eax
> > -# ifdef USE_AS_WMEMCHR
> > -     /* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
> > -     sall    $2, %eax
> > +# ifdef USE_AS_RAWMEMCHR
> > +     leaq    (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
> > +# else
> > +     leaq    (VEC_SIZE * 7)(%rdi, %rax, CHAR_SIZE), %rax
> >  # endif
> > -     /* Check the end of data.  */
> > -     cmpq    %rax, %rdx
> > -     jbe     L(zero)
> > -     addq    %rdi, %rax
> >       ret
> >
> >       .p2align 4
> > -L(first_vec_x1_check):
> > +L(last_vec_x1_return):
> >       tzcntl  %eax, %eax
> > -# ifdef USE_AS_WMEMCHR
> > -     /* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
> > -     sall    $2, %eax
> > -# endif
> > -     /* Check the end of data.  */
> > -     cmpq    %rax, %rdx
> > -     jbe     L(zero)
> > -     addq    $VEC_SIZE, %rax
> > +# ifdef USE_AS_RAWMEMCHR
> > +#  ifdef USE_AS_WMEMCHR
> > +     /* NB: Multiply bytes by CHAR_SIZE to get the
> > +        wchar_t count.  */
>
> Fit comments to 72 columns.
Fixed.
>
> > +     leaq    (%rdi, %rax, CHAR_SIZE), %rax
> > +#  else
> >       addq    %rdi, %rax
> > -     ret
> > -
> > -     .p2align 4
> > -L(first_vec_x2_check):
> > -     tzcntl  %eax, %eax
> > -# ifdef USE_AS_WMEMCHR
> > -     /* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
> > -     sall    $2, %eax
> > +#  endif
> > +# else
> > +     /* NB: Multiply bytes by CHAR_SIZE to get the
> > +        wchar_t count.  */
>
> Fit comments to 72 columns.
Fixed.
>
> > +     leaq    (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
> >  # endif
> > -     /* Check the end of data.  */
> > -     cmpq    %rax, %rdx
> > -     jbe     L(zero)
> > -     addq    $(VEC_SIZE * 2), %rax
> > -     addq    %rdi, %rax
> >       ret
> >
> >       .p2align 4
> > -L(first_vec_x3_check):
> > +L(last_vec_x2_return):
> >       tzcntl  %eax, %eax
> > -# ifdef USE_AS_WMEMCHR
> > -     /* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
> > -     sall    $2, %eax
> > +# ifdef USE_AS_RAWMEMCHR
> > +     /* NB: Multiply bytes by CHAR_SIZE to get the
> > +        wchar_t count.  */
> > +     leaq    VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
> > +# else
> > +     /* NB: Multiply bytes by CHAR_SIZE to get the
> > +        wchar_t count.  */
> > +     leaq    (VEC_SIZE * 5)(%rdi, %rax, CHAR_SIZE), %rax
> >  # endif
> > -     /* Check the end of data.  */
> > -     cmpq    %rax, %rdx
> > -     jbe     L(zero)
> > -     addq    $(VEC_SIZE * 3), %rax
> > -     addq    %rdi, %rax
> >       ret
> >
> >       .p2align 4
> > -L(zero):
> > -     xorl    %eax, %eax
> > -     ret
> > -# endif
> > -
> > -     .p2align 4
> > -L(first_vec_x0):
> > +L(last_vec_x3_return):
> >       tzcntl  %eax, %eax
> > -# ifdef USE_AS_WMEMCHR
> > -     /* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
> > -     leaq    (%rdi, %rax, 4), %rax
> > +# ifdef USE_AS_RAWMEMCHR
> > +     /* NB: Multiply bytes by CHAR_SIZE to get the
> > +        wchar_t count.  */
>
> Fit comments to 72 columns.
Fixed.
>
> > +     leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> >  # else
> > -     addq    %rdi, %rax
> > +     /* NB: Multiply bytes by CHAR_SIZE to get the
> > +        wchar_t count.  */
>
> Fit comments to 72 columns.
Fixed.
>
> > +     leaq    (VEC_SIZE * 6)(%rdi, %rax, CHAR_SIZE), %rax
> >  # endif
> >       ret
> >
> > +
> > +# ifndef USE_AS_RAWMEMCHR
> > +L(last_4x_vec_or_less_cmpeq):
> > +     VPCMP   $0, (VEC_SIZE * 5)(%rdi), %YMMMATCH, %k0
> > +     kmovd   %k0, %eax
> > +     subq    $-(VEC_SIZE * 4), %rdi
> > +     /* Check first VEC regardless.  */
> > +     testl   %eax, %eax
> > +     jnz     L(first_vec_x1_check)
> > +
> > +     /* If remaining length <= CHAR_PER_VEC * 2.  */
> > +     addl    $(CHAR_PER_VEC * 2), %edx
> > +     jle     L(last_2x_vec)
> > +
> >       .p2align 4
> > -L(first_vec_x1):
> > +L(last_4x_vec):
> > +     VPCMP   $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
> > +     kmovd   %k0, %eax
> > +     testl   %eax, %eax
> > +     jnz     L(last_vec_x2)
> > +
> > +
> > +     VPCMP   $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
> > +     kmovd   %k0, %eax
> > +     /* Create mask for possible matches within
> > +        remaining length.  */
>
> Fit comments to 72 columns.
Fixed.
>
> > +#  ifdef USE_AS_WMEMCHR
> > +     movl    $((1 << (CHAR_PER_VEC * 2)) - 1), %ecx
> > +     bzhil   %edx, %ecx, %ecx
> > +#  else
> > +     movq    $-1, %rcx
> > +     bzhiq   %rdx, %rcx, %rcx
> > +#  endif
> > +     /* Test matches in data against length match.  */
> > +     andl    %ecx, %eax
> > +     jnz     L(last_vec_x3)
> > +
> > +     /* if remaining length <= CHAR_PER_VEC * 3 (Note
> > +        this is after remaining length was found to be >
> > +        CHAR_PER_VEC * 2.  */
>
> Fit comments to 72 columns.
Fixed.
>
> > +     subl    $CHAR_PER_VEC, %edx
> > +     jbe     L(zero_end2)
> > +
> > +
> > +     VPCMP   $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
> > +     kmovd   %k0, %eax
> > +     /* Shift remaining length mask for last VEC.  */
> > +#  ifdef USE_AS_WMEMCHR
> > +     shrl    $CHAR_PER_VEC, %ecx
> > +#  else
> > +     shrq    $CHAR_PER_VEC, %rcx
> > +#  endif
> > +     andl    %ecx, %eax
> > +     jz      L(zero_end2)
> >       tzcntl  %eax, %eax
> > -# ifdef USE_AS_WMEMCHR
> > -     /* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
> > -     leaq    VEC_SIZE(%rdi, %rax, 4), %rax
> > -# else
> > -     addq    $VEC_SIZE, %rax
> > -     addq    %rdi, %rax
> > -# endif
> > +     leaq    (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
> > +L(zero_end2):
> >       ret
> >
> > -     .p2align 4
> > -L(first_vec_x2):
> > +L(last_vec_x2):
> >       tzcntl  %eax, %eax
> > -# ifdef USE_AS_WMEMCHR
> > -     /* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
> > -     leaq    (VEC_SIZE * 2)(%rdi, %rax, 4), %rax
> > -# else
> > -     addq    $(VEC_SIZE * 2), %rax
> > -     addq    %rdi, %rax
> > -# endif
> > +     leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> >       ret
> >
> >       .p2align 4
> > -L(4x_vec_end):
> > -     kmovd   %k1, %eax
> > -     testl   %eax, %eax
> > -     jnz     L(first_vec_x0)
> > -     kmovd   %k2, %eax
> > -     testl   %eax, %eax
> > -     jnz     L(first_vec_x1)
> > -     kmovd   %k3, %eax
> > -     testl   %eax, %eax
> > -     jnz     L(first_vec_x2)
> > -     kmovd   %k4, %eax
> > -     testl   %eax, %eax
> > -L(first_vec_x3):
> > +L(last_vec_x3):
> >       tzcntl  %eax, %eax
> > -# ifdef USE_AS_WMEMCHR
> > -     /* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
> > -     leaq    (VEC_SIZE * 3)(%rdi, %rax, 4), %rax
> > -# else
> > -     addq    $(VEC_SIZE * 3), %rax
> > -     addq    %rdi, %rax
> > -# endif
> > +     leaq    (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
> >       ret
> > +# endif
> >
> > -END (MEMCHR)
> > +END(MEMCHR)
>
> No need for this change.

Fixed.

>
> >  #endif
> > --
> > 2.29.2
> >
>
> Thanks.
>
> H.J.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* [PATCH v2 2/3] x86: Optimize memchr-avx2.S
  2021-05-03  8:44 ` [PATCH v1 2/3] x86: Optimize memchr-avx2.S Noah Goldstein
  2021-05-03 18:50   ` H.J. Lu
@ 2021-05-03 20:06   ` Noah Goldstein
  2021-05-03 20:06     ` [PATCH v2 3/3] x86: Optimize memchr-evex.S Noah Goldstein
  2021-05-03 22:25     ` [PATCH v2 2/3] x86: Optimize memchr-avx2.S H.J. Lu
  2021-05-03 22:58   ` [PATCH v3 " Noah Goldstein
  2 siblings, 2 replies; 20+ messages in thread
From: Noah Goldstein @ 2021-05-03 20:06 UTC (permalink / raw)
  To: libc-alpha

No bug. This commit optimizes memchr-avx2.S. The optimizations include
replacing some branches with cmovcc, avoiding some branches entirely
in the less_4x_vec case, making the page cross logic less strict,
saving a few instructions in the loop return path. test-memchr,
test-rawmemchr, and test-wmemchr are all passing.

Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
---
 sysdeps/x86_64/multiarch/memchr-avx2.S | 426 ++++++++++++++-----------
 1 file changed, 247 insertions(+), 179 deletions(-)
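
A minimal C sketch of the relaxed page-cross test used by the new
prologue (illustrative only, not part of the patch; the helper name
may_cross_page is made up for this note):

#include <stdint.h>

#define PAGE_SIZE 4096
#define VEC_SIZE  32

/* Nonzero when a single VEC_SIZE-byte load starting at s could touch
   the next page.  Only the last VEC_SIZE - 1 offsets within a page are
   unsafe, which is what the andl/cmpl/ja sequence against
   PAGE_SIZE - VEC_SIZE computes.  */
static inline int
may_cross_page (const void *s)
{
  return ((uintptr_t) s & (PAGE_SIZE - 1)) > (PAGE_SIZE - VEC_SIZE);
}

The old prologue instead took the slow path whenever the pointer's
offset within a 2 * VEC_SIZE window exceeded VEC_SIZE, so far more
inputs ended up in the cross-page code than strictly needed to.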

diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
index 1fcb1c350f..8b862fb9d1 100644
--- a/sysdeps/x86_64/multiarch/memchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
@@ -26,8 +26,22 @@
 
 # ifdef USE_AS_WMEMCHR
 #  define VPCMPEQ	vpcmpeqd
+#  define VPBROADCAST	vpbroadcastd
+#  define CHAR_SIZE	4
 # else
 #  define VPCMPEQ	vpcmpeqb
+#  define VPBROADCAST	vpbroadcastb
+#  define CHAR_SIZE	1
+# endif
+
+# ifdef USE_AS_RAWMEMCHR
+#  define ERAW_PTR_REG	ecx
+#  define RRAW_PTR_REG	rcx
+#  define ALGN_PTR_REG	rdi
+# else
+#  define ERAW_PTR_REG	edi
+#  define RRAW_PTR_REG	rdi
+#  define ALGN_PTR_REG	rcx
 # endif
 
 # ifndef VZEROUPPER
@@ -39,6 +53,7 @@
 # endif
 
 # define VEC_SIZE 32
+# define PAGE_SIZE 4096
 
 	.section SECTION(.text),"ax",@progbits
 ENTRY (MEMCHR)
@@ -47,295 +62,348 @@ ENTRY (MEMCHR)
 	test	%RDX_LP, %RDX_LP
 	jz	L(null)
 # endif
-	movl	%edi, %ecx
-	/* Broadcast CHAR to YMM0.  */
-	vmovd	%esi, %xmm0
 # ifdef USE_AS_WMEMCHR
 	shl	$2, %RDX_LP
-	vpbroadcastd %xmm0, %ymm0
 # else
 #  ifdef __ILP32__
 	/* Clear the upper 32 bits.  */
 	movl	%edx, %edx
 #  endif
-	vpbroadcastb %xmm0, %ymm0
 # endif
+	/* Broadcast CHAR to YMMMATCH.  */
+	vmovd	%esi, %xmm0
+	VPBROADCAST %xmm0, %ymm0
 	/* Check if we may cross page boundary with one vector load.  */
-	andl	$(2 * VEC_SIZE - 1), %ecx
-	cmpl	$VEC_SIZE, %ecx
-	ja	L(cros_page_boundary)
+	movl	%edi, %eax
+	andl	$(PAGE_SIZE - 1), %eax
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	ja	L(cross_page_boundary)
 
 	/* Check the first VEC_SIZE bytes.  */
-	VPCMPEQ (%rdi), %ymm0, %ymm1
+	VPCMPEQ	(%rdi), %ymm0, %ymm1
 	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-
 # ifndef USE_AS_RAWMEMCHR
-	jnz	L(first_vec_x0_check)
-	/* Adjust length and check the end of data.  */
-	subq	$VEC_SIZE, %rdx
-	jbe	L(zero)
-# else
-	jnz	L(first_vec_x0)
+	/* If length < CHAR_PER_VEC handle special.  */
+	cmpq	$VEC_SIZE, %rdx
+	jbe	L(first_vec_x0)
 # endif
-
-	/* Align data for aligned loads in the loop.  */
-	addq	$VEC_SIZE, %rdi
-	andl	$(VEC_SIZE - 1), %ecx
-	andq	$-VEC_SIZE, %rdi
+	testl	%eax, %eax
+	jz	L(aligned_more)
+	tzcntl	%eax, %eax
+	addq	%rdi, %rax
+	VZEROUPPER_RETURN
 
 # ifndef USE_AS_RAWMEMCHR
-	/* Adjust length.  */
-	addq	%rcx, %rdx
-
-	subq	$(VEC_SIZE * 4), %rdx
-	jbe	L(last_4x_vec_or_less)
+	.p2align 5
+L(first_vec_x0):
+	/* Check if first match was before length.  */
+	tzcntl	%eax, %eax
+	xorl	%ecx, %ecx
+	cmpl	%eax, %edx
+	leaq	(%rdi, %rax), %rax
+	cmovle	%rcx, %rax
+	VZEROUPPER_RETURN
+L(null):
+	xorl	%eax, %eax
+	ret
 # endif
-	jmp	L(more_4x_vec)
-
 	.p2align 4
-L(cros_page_boundary):
-	andl	$(VEC_SIZE - 1), %ecx
-	andq	$-VEC_SIZE, %rdi
-	VPCMPEQ (%rdi), %ymm0, %ymm1
+L(cross_page_boundary):
+	/* Save pointer before aligning as its original value is
+	   necessary for computing the return address if byte is found
+	   or adjusting length if it is not and this is memchr.  */
+	movq	%rdi, %rcx
+	/* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr and
+	   rdi for rawmemchr.  */
+	orq	$(VEC_SIZE - 1), %ALGN_PTR_REG
+	VPCMPEQ	-(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1
 	vpmovmskb %ymm1, %eax
+# ifndef USE_AS_RAWMEMCHR
+	/* Calculate length until end of page (length checked for a
+	   match).  */
+	leaq	1(%ALGN_PTR_REG), %rsi
+	subq	%RRAW_PTR_REG, %rsi
+# endif
 	/* Remove the leading bytes.  */
-	sarl	%cl, %eax
-	testl	%eax, %eax
-	jz	L(aligned_more)
-	tzcntl	%eax, %eax
+	sarxl	%ERAW_PTR_REG, %eax, %eax
 # ifndef USE_AS_RAWMEMCHR
 	/* Check the end of data.  */
-	cmpq	%rax, %rdx
-	jbe	L(zero)
+	cmpq	%rsi, %rdx
+	jbe	L(first_vec_x0)
 # endif
-	addq	%rdi, %rax
-	addq	%rcx, %rax
+	testl	%eax, %eax
+	jz	L(cross_page_continue)
+	tzcntl	%eax, %eax
+	addq	%RRAW_PTR_REG, %rax
 L(return_vzeroupper):
 	ZERO_UPPER_VEC_REGISTERS_RETURN
 
 	.p2align 4
-L(aligned_more):
-# ifndef USE_AS_RAWMEMCHR
-        /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)"
-	   instead of "(rdx + rcx) - VEC_SIZE" to void possible addition
-	   overflow.  */
-	negq	%rcx
-	addq	$VEC_SIZE, %rcx
+L(first_vec_x1):
+	tzcntl	%eax, %eax
+	incq	%rdi
+	addq	%rdi, %rax
+	VZEROUPPER_RETURN
 
-	/* Check the end of data.  */
-	subq	%rcx, %rdx
-	jbe	L(zero)
-# endif
+	.p2align 4
+L(first_vec_x2):
+	tzcntl	%eax, %eax
+	addq	$(VEC_SIZE + 1), %rdi
+	addq	%rdi, %rax
+	VZEROUPPER_RETURN
 
-	addq	$VEC_SIZE, %rdi
+	.p2align 4
+L(first_vec_x3):
+	tzcntl	%eax, %eax
+	addq	$(VEC_SIZE * 2 + 1), %rdi
+	addq	%rdi, %rax
+	VZEROUPPER_RETURN
 
-# ifndef USE_AS_RAWMEMCHR
-	subq	$(VEC_SIZE * 4), %rdx
-	jbe	L(last_4x_vec_or_less)
-# endif
 
-L(more_4x_vec):
+	.p2align 4
+L(first_vec_x4):
+	tzcntl	%eax, %eax
+	addq	$(VEC_SIZE * 3 + 1), %rdi
+	addq	%rdi, %rax
+	VZEROUPPER_RETURN
+
+	.p2align 4
+L(aligned_more):
 	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
 	   since data is only aligned to VEC_SIZE.  */
-	VPCMPEQ (%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x0)
 
-	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
+# ifndef USE_AS_RAWMEMCHR
+L(cross_page_continue):
+	/* Align data to VEC_SIZE - 1.  */
+	xorl	%ecx, %ecx
+	subl	%edi, %ecx
+	orq	$(VEC_SIZE - 1), %rdi
+	/* esi is for adjusting length to see if near the end.  */
+	leal	(VEC_SIZE * 4 + 1)(%rdi, %rcx), %esi
+# else
+	orq	$(VEC_SIZE - 1), %rdi
+L(cross_page_continue):
+# endif
+	/* Load first VEC regardless.  */
+	VPCMPEQ	1(%rdi), %ymm0, %ymm1
 	vpmovmskb %ymm1, %eax
+# ifndef USE_AS_RAWMEMCHR
+	/* Adjust length. If near end handle specially.  */
+	subq	%rsi, %rdx
+	jbe	L(last_4x_vec_or_less)
+# endif
 	testl	%eax, %eax
 	jnz	L(first_vec_x1)
 
-	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
+	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
 	vpmovmskb %ymm1, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x2)
 
-	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
+	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
 	vpmovmskb %ymm1, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x3)
 
-	addq	$(VEC_SIZE * 4), %rdi
+	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x4)
 
 # ifndef USE_AS_RAWMEMCHR
+	/* Check if at last VEC_SIZE * 4 length.  */
 	subq	$(VEC_SIZE * 4), %rdx
-	jbe	L(last_4x_vec_or_less)
-# endif
-
-	/* Align data to 4 * VEC_SIZE.  */
-	movq	%rdi, %rcx
-	andl	$(4 * VEC_SIZE - 1), %ecx
-	andq	$-(4 * VEC_SIZE), %rdi
-
-# ifndef USE_AS_RAWMEMCHR
-	/* Adjust length.  */
+	jbe	L(last_4x_vec_or_less_cmpeq)
+	/* Align data to VEC_SIZE * 4 - 1 for the loop and readjust
+	   length.  */
+	incq	%rdi
+	movl	%edi, %ecx
+	orq	$(VEC_SIZE * 4 - 1), %rdi
+	andl	$(VEC_SIZE * 4 - 1), %ecx
 	addq	%rcx, %rdx
+# else
+	/* Align data to VEC_SIZE * 4 - 1 for loop.  */
+	incq	%rdi
+	orq	$(VEC_SIZE * 4 - 1), %rdi
 # endif
 
+	/* Compare 4 * VEC at a time forward.  */
 	.p2align 4
 L(loop_4x_vec):
-	/* Compare 4 * VEC at a time forward.  */
-	VPCMPEQ (%rdi), %ymm0, %ymm1
-	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm2
-	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm3
-	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm4
-
+	VPCMPEQ	1(%rdi), %ymm0, %ymm1
+	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm2
+	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm3
+	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm4
 	vpor	%ymm1, %ymm2, %ymm5
 	vpor	%ymm3, %ymm4, %ymm6
 	vpor	%ymm5, %ymm6, %ymm5
 
-	vpmovmskb %ymm5, %eax
-	testl	%eax, %eax
-	jnz	L(4x_vec_end)
-
-	addq	$(VEC_SIZE * 4), %rdi
-
+	vpmovmskb %ymm5, %ecx
 # ifdef USE_AS_RAWMEMCHR
-	jmp	L(loop_4x_vec)
+	subq	$-(VEC_SIZE * 4), %rdi
+	testl	%ecx, %ecx
+	jz	L(loop_4x_vec)
 # else
-	subq	$(VEC_SIZE * 4), %rdx
-	ja	L(loop_4x_vec)
+	testl	%ecx, %ecx
+	jnz	L(loop_4x_vec_end)
 
-L(last_4x_vec_or_less):
-	/* Less than 4 * VEC and aligned to VEC_SIZE.  */
-	addl	$(VEC_SIZE * 2), %edx
-	jle	L(last_2x_vec)
+	subq	$-(VEC_SIZE * 4), %rdi
 
-	VPCMPEQ (%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x0)
+	subq	$(VEC_SIZE * 4), %rdx
+	ja	L(loop_4x_vec)
 
-	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
+	/* Fall through into less than 4 remaining vectors of length case.
+	 */
+	VPCMPEQ	(VEC_SIZE * 0 + 1)(%rdi), %ymm0, %ymm1
 	vpmovmskb %ymm1, %eax
+	.p2align 4
+L(last_4x_vec_or_less):
+	/* Check if first VEC contained match.  */
 	testl	%eax, %eax
-	jnz	L(first_vec_x1)
+	jnz	L(first_vec_x1_check)
 
-	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
+	/* If remaining length > VEC_SIZE * 2.  */
+	addl	$(VEC_SIZE * 2), %edx
+	jg	L(last_4x_vec)
 
-	jnz	L(first_vec_x2_check)
-	subl	$VEC_SIZE, %edx
-	jle	L(zero)
+L(last_2x_vec):
+	/* If remaining length < VEC_SIZE.  */
+	addl	$VEC_SIZE, %edx
+	jle	L(zero_end)
 
-	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
+	/* Check VEC2 and compare any match with remaining length.  */
+	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
 	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-
-	jnz	L(first_vec_x3_check)
-	xorl	%eax, %eax
+	tzcntl	%eax, %eax
+	cmpl	%eax, %edx
+	jbe	L(set_zero_end)
+	addq	$(VEC_SIZE + 1), %rdi
+	addq	%rdi, %rax
+L(zero_end):
 	VZEROUPPER_RETURN
 
 	.p2align 4
-L(last_2x_vec):
-	addl	$(VEC_SIZE * 2), %edx
-	VPCMPEQ (%rdi), %ymm0, %ymm1
+L(loop_4x_vec_end):
+# endif
+	/* rawmemchr will fall through into this if match was found in
+	   loop.  */
+
 	vpmovmskb %ymm1, %eax
 	testl	%eax, %eax
+	jnz	L(last_vec_x1_return)
 
-	jnz	L(first_vec_x0_check)
-	subl	$VEC_SIZE, %edx
-	jle	L(zero)
-
-	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
+	vpmovmskb %ymm2, %eax
 	testl	%eax, %eax
-	jnz	L(first_vec_x1_check)
-	xorl	%eax, %eax
-	VZEROUPPER_RETURN
+	jnz	L(last_vec_x2_return)
 
-	.p2align 4
-L(first_vec_x0_check):
-	tzcntl	%eax, %eax
-	/* Check the end of data.  */
-	cmpq	%rax, %rdx
-	jbe	L(zero)
+	vpmovmskb %ymm3, %eax
+	/* Combine VEC3 matches (eax) with VEC4 matches (ecx).  */
+	salq	$32, %rcx
+	orq	%rcx, %rax
+	tzcntq	%rax, %rax
+# ifdef USE_AS_RAWMEMCHR
+	subq	$(VEC_SIZE * 2 - 1), %rdi
+# else
+	subq	$-(VEC_SIZE * 2 + 1), %rdi
+# endif
 	addq	%rdi, %rax
 	VZEROUPPER_RETURN
+# ifndef USE_AS_RAWMEMCHR
 
 	.p2align 4
 L(first_vec_x1_check):
 	tzcntl	%eax, %eax
-	/* Check the end of data.  */
-	cmpq	%rax, %rdx
-	jbe	L(zero)
-	addq	$VEC_SIZE, %rax
+	/* Adjust length.  */
+	subl	$-(VEC_SIZE * 4), %edx
+	/* Check if match within remaining length.  */
+	cmpl	%eax, %edx
+	jbe	L(set_zero_end)
+	incq	%rdi
 	addq	%rdi, %rax
 	VZEROUPPER_RETURN
+	.p2align 4
+L(set_zero_end):
+	xorl	%eax, %eax
+	VZEROUPPER_RETURN
+# endif
 
 	.p2align 4
-L(first_vec_x2_check):
+L(last_vec_x1_return):
 	tzcntl	%eax, %eax
-	/* Check the end of data.  */
-	cmpq	%rax, %rdx
-	jbe	L(zero)
-	addq	$(VEC_SIZE * 2), %rax
+# ifdef USE_AS_RAWMEMCHR
+	subq	$(VEC_SIZE * 4 - 1), %rdi
+# else
+	incq	%rdi
+# endif
 	addq	%rdi, %rax
 	VZEROUPPER_RETURN
 
 	.p2align 4
-L(first_vec_x3_check):
+L(last_vec_x2_return):
 	tzcntl	%eax, %eax
-	/* Check the end of data.  */
-	cmpq	%rax, %rdx
-	jbe	L(zero)
-	addq	$(VEC_SIZE * 3), %rax
+# ifdef USE_AS_RAWMEMCHR
+	subq	$(VEC_SIZE * 3 - 1), %rdi
+# else
+	subq	$-(VEC_SIZE + 1), %rdi
+# endif
 	addq	%rdi, %rax
 	VZEROUPPER_RETURN
 
+# ifndef USE_AS_RAWMEMCHR
 	.p2align 4
-L(zero):
-	xorl	%eax, %eax
-	jmp     L(return_vzeroupper)
+L(last_4x_vec_or_less_cmpeq):
+	VPCMPEQ	(VEC_SIZE * 4 + 1)(%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
+	subq	$-(VEC_SIZE * 4), %rdi
+	/* Check first VEC regardless.  */
+	testl	%eax, %eax
+	jnz	L(first_vec_x1_check)
 
+	/* If remaining length <= CHAR_PER_VEC * 2.  */
+	addl	$(VEC_SIZE * 2), %edx
+	jle	L(last_2x_vec)
 	.p2align 4
-L(null):
-	xorl	%eax, %eax
-	ret
-# endif
+L(last_4x_vec):
+	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x2_return)
 
-	.p2align 4
-L(first_vec_x0):
-	tzcntl	%eax, %eax
-	addq	%rdi, %rax
-	VZEROUPPER_RETURN
+	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
 
-	.p2align 4
-L(first_vec_x1):
-	tzcntl	%eax, %eax
-	addq	$VEC_SIZE, %rax
-	addq	%rdi, %rax
-	VZEROUPPER_RETURN
+	/* Create mask for possible matches within remaining length.  */
+	movq	$-1, %rcx
+	bzhiq	%rdx, %rcx, %rcx
 
-	.p2align 4
-L(first_vec_x2):
+	/* Test matches in data against length match.  */
+	andl	%ecx, %eax
+	jnz	L(last_vec_x3)
+
+	/* If remaining length <= VEC_SIZE * 3 (note this is after
+	   remaining length was found to be > VEC_SIZE * 2).  */
+	subl	$VEC_SIZE, %edx
+	jbe	L(zero_end2)
+
+	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
+	/* Shift remaining length mask for last VEC.  */
+	shrq	$32, %rcx
+	andl	%ecx, %eax
+	jz	L(zero_end2)
 	tzcntl	%eax, %eax
-	addq	$(VEC_SIZE * 2), %rax
+	addq	$(VEC_SIZE * 3 + 1), %rdi
 	addq	%rdi, %rax
+L(zero_end2):
 	VZEROUPPER_RETURN
 
 	.p2align 4
-L(4x_vec_end):
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x0)
-	vpmovmskb %ymm2, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x1)
-	vpmovmskb %ymm3, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x2)
-	vpmovmskb %ymm4, %eax
-	testl	%eax, %eax
-L(first_vec_x3):
+L(last_vec_x3):
 	tzcntl	%eax, %eax
-	addq	$(VEC_SIZE * 3), %rax
+	subq	$-(VEC_SIZE * 2 + 1), %rdi
 	addq	%rdi, %rax
 	VZEROUPPER_RETURN
+# endif
 
 END (MEMCHR)
 #endif
-- 
2.29.2


^ permalink raw reply	[flat|nested] 20+ messages in thread

* [PATCH v2 3/3] x86: Optimize memchr-evex.S
  2021-05-03 20:06   ` [PATCH v2 " Noah Goldstein
@ 2021-05-03 20:06     ` Noah Goldstein
  2021-05-03 22:26       ` H.J. Lu
  2021-05-03 22:25     ` [PATCH v2 2/3] x86: Optimize memchr-avx2.S H.J. Lu
  1 sibling, 1 reply; 20+ messages in thread
From: Noah Goldstein @ 2021-05-03 20:06 UTC (permalink / raw)
  To: libc-alpha

No bug. This commit optimizes memchr-evex.S. The optimizations include
replacing some branches with cmovcc, avoiding some branches entirely
in the less_4x_vec case, making the page cross logic less strict,
saving some ALU in the alignment process, and most importantly
increasing ILP in the 4x loop. test-memchr, test-rawmemchr, and
test-wmemchr are all passing.

Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
---
 sysdeps/x86_64/multiarch/memchr-evex.S | 547 +++++++++++++++----------
 1 file changed, 322 insertions(+), 225 deletions(-)
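
As a rough C model of the cmovcc change (illustrative only; the helper
name first_vec_result is made up, and this shows the byte variant,
wmemchr additionally scales the offset by CHAR_SIZE):

#include <stddef.h>

/* Branchless model of L(first_vec_x0): match_off is the tzcnt of the
   first vector's compare mask and len is the remaining length in
   characters.  The cmp/cmovle pair selects a zeroed register (NULL)
   when the first match falls at or beyond len, instead of branching
   to a separate "return zero" label as the old code did.  */
static inline const char *
first_vec_result (const char *s, unsigned int match_off, size_t len)
{
  const char *hit = s + match_off;
  return match_off < len ? hit : NULL;
}

With no match the mask is zero and tzcnt returns the operand width,
which is never below len on this path, so that case also yields NULL.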

diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S
index 6dd5d67b90..147d7aa8ee 100644
--- a/sysdeps/x86_64/multiarch/memchr-evex.S
+++ b/sysdeps/x86_64/multiarch/memchr-evex.S
@@ -26,14 +26,28 @@
 
 # ifdef USE_AS_WMEMCHR
 #  define VPBROADCAST	vpbroadcastd
-#  define VPCMP		vpcmpd
-#  define SHIFT_REG	r8d
+#  define VPMINU	vpminud
+#  define VPCMP	vpcmpd
+#  define VPCMPEQ	vpcmpeqd
+#  define CHAR_SIZE	4
 # else
 #  define VPBROADCAST	vpbroadcastb
-#  define VPCMP		vpcmpb
-#  define SHIFT_REG	ecx
+#  define VPMINU	vpminub
+#  define VPCMP	vpcmpb
+#  define VPCMPEQ	vpcmpeqb
+#  define CHAR_SIZE	1
 # endif
 
+# ifdef USE_AS_RAWMEMCHR
+#  define RAW_PTR_REG	rcx
+#  define ALGN_PTR_REG	rdi
+# else
+#  define RAW_PTR_REG	rdi
+#  define ALGN_PTR_REG	rcx
+# endif
+
+# define XZERO		xmm23
+# define YZERO		ymm23
 # define XMMMATCH	xmm16
 # define YMMMATCH	ymm16
 # define YMM1		ymm17
@@ -44,6 +58,8 @@
 # define YMM6		ymm22
 
 # define VEC_SIZE 32
+# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
+# define PAGE_SIZE 4096
 
 	.section .text.evex,"ax",@progbits
 ENTRY (MEMCHR)
@@ -51,11 +67,7 @@ ENTRY (MEMCHR)
 	/* Check for zero length.  */
 	test	%RDX_LP, %RDX_LP
 	jz	L(zero)
-# endif
-	movl	%edi, %ecx
-# ifdef USE_AS_WMEMCHR
-	shl	$2, %RDX_LP
-# else
+
 #  ifdef __ILP32__
 	/* Clear the upper 32 bits.  */
 	movl	%edx, %edx
@@ -64,318 +76,403 @@ ENTRY (MEMCHR)
 	/* Broadcast CHAR to YMMMATCH.  */
 	VPBROADCAST %esi, %YMMMATCH
 	/* Check if we may cross page boundary with one vector load.  */
-	andl	$(2 * VEC_SIZE - 1), %ecx
-	cmpl	$VEC_SIZE, %ecx
-	ja	L(cros_page_boundary)
+	movl	%edi, %eax
+	andl	$(PAGE_SIZE - 1), %eax
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	ja	L(cross_page_boundary)
 
 	/* Check the first VEC_SIZE bytes.  */
-	VPCMP	$0, (%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
-	testl	%eax, %eax
-
+	VPCMP	$0, (%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
 # ifndef USE_AS_RAWMEMCHR
-	jnz	L(first_vec_x0_check)
-	/* Adjust length and check the end of data.  */
-	subq	$VEC_SIZE, %rdx
-	jbe	L(zero)
+	/* If length < CHAR_PER_VEC handle special.  */
+	cmpq	$CHAR_PER_VEC, %rdx
+	jbe	L(first_vec_x0)
+# endif
+	testl	%eax, %eax
+	jz	L(aligned_more)
+	tzcntl	%eax, %eax
+# ifdef USE_AS_WMEMCHR
+	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
 # else
-	jnz	L(first_vec_x0)
+	addq	%rdi, %rax
 # endif
-
-	/* Align data for aligned loads in the loop.  */
-	addq	$VEC_SIZE, %rdi
-	andl	$(VEC_SIZE - 1), %ecx
-	andq	$-VEC_SIZE, %rdi
+	ret
 
 # ifndef USE_AS_RAWMEMCHR
-	/* Adjust length.  */
-	addq	%rcx, %rdx
-
-	subq	$(VEC_SIZE * 4), %rdx
-	jbe	L(last_4x_vec_or_less)
-# endif
-	jmp	L(more_4x_vec)
+L(zero):
+	xorl	%eax, %eax
+	ret
 
+	.p2align 5
+L(first_vec_x0):
+	/* Check if first match was before length.  */
+	tzcntl	%eax, %eax
+	xorl	%ecx, %ecx
+	cmpl	%eax, %edx
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+	cmovle	%rcx, %rax
+	ret
+# else
+	/* NB: first_vec_x0 is 17 bytes which will leave
+	   cross_page_boundary (which is relatively cold) close enough
+	   to ideal alignment. So only realign L(cross_page_boundary) if
+	   rawmemchr.  */
 	.p2align 4
-L(cros_page_boundary):
-	andl	$(VEC_SIZE - 1), %ecx
+# endif
+L(cross_page_boundary):
+	/* Save pointer before aligning as its original value is
+	   necessary for computing the return address if byte is found
+	   or adjusting length if it is not and this is memchr.  */
+	movq	%rdi, %rcx
+	/* Align data to VEC_SIZE. ALGN_PTR_REG is rcx for memchr and rdi
+	   for rawmemchr.  */
+	andq	$-VEC_SIZE, %ALGN_PTR_REG
+	VPCMP	$0, (%ALGN_PTR_REG), %YMMMATCH, %k0
+	kmovd	%k0, %r8d
 # ifdef USE_AS_WMEMCHR
-	/* NB: Divide shift count by 4 since each bit in K1 represent 4
+	/* NB: Divide shift count by 4 since each bit in K0 represent 4
 	   bytes.  */
-	movl	%ecx, %SHIFT_REG
-	sarl	$2, %SHIFT_REG
+	sarl	$2, %eax
+# endif
+# ifndef USE_AS_RAWMEMCHR
+	movl	$(PAGE_SIZE / CHAR_SIZE), %esi
+	subl	%eax, %esi
 # endif
-	andq	$-VEC_SIZE, %rdi
-	VPCMP	$0, (%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
-	/* Remove the leading bytes.  */
-	sarxl	%SHIFT_REG, %eax, %eax
-	testl	%eax, %eax
-	jz	L(aligned_more)
-	tzcntl	%eax, %eax
 # ifdef USE_AS_WMEMCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	sall	$2, %eax
+	andl	$(CHAR_PER_VEC - 1), %eax
 # endif
+	/* Remove the leading bytes.  */
+	sarxl	%eax, %r8d, %eax
 # ifndef USE_AS_RAWMEMCHR
 	/* Check the end of data.  */
-	cmpq	%rax, %rdx
-	jbe	L(zero)
+	cmpq	%rsi, %rdx
+	jbe	L(first_vec_x0)
+# endif
+	testl	%eax, %eax
+	jz	L(cross_page_continue)
+	tzcntl	%eax, %eax
+# ifdef USE_AS_WMEMCHR
+	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
+	leaq	(%RAW_PTR_REG, %rax, CHAR_SIZE), %rax
+# else
+	addq	%RAW_PTR_REG, %rax
 # endif
-	addq	%rdi, %rax
-	addq	%rcx, %rax
 	ret
 
 	.p2align 4
-L(aligned_more):
-# ifndef USE_AS_RAWMEMCHR
-        /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)"
-	   instead of "(rdx + rcx) - VEC_SIZE" to void possible addition
-	   overflow.  */
-	negq	%rcx
-	addq	$VEC_SIZE, %rcx
+L(first_vec_x1):
+	tzcntl	%eax, %eax
+	leaq	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
+	ret
 
-	/* Check the end of data.  */
-	subq	%rcx, %rdx
-	jbe	L(zero)
-# endif
+	.p2align 4
+L(first_vec_x2):
+	tzcntl	%eax, %eax
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
 
-	addq	$VEC_SIZE, %rdi
+	.p2align 4
+L(first_vec_x3):
+	tzcntl	%eax, %eax
+	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
 
-# ifndef USE_AS_RAWMEMCHR
-	subq	$(VEC_SIZE * 4), %rdx
-	jbe	L(last_4x_vec_or_less)
-# endif
+	.p2align 4
+L(first_vec_x4):
+	tzcntl	%eax, %eax
+	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
 
-L(more_4x_vec):
+	.p2align 5
+L(aligned_more):
 	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
 	   since data is only aligned to VEC_SIZE.  */
-	VPCMP	$0, (%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x0)
 
-	VPCMP	$0, VEC_SIZE(%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
+# ifndef USE_AS_RAWMEMCHR
+	/* Align data to VEC_SIZE.  */
+L(cross_page_continue):
+	xorl	%ecx, %ecx
+	subl	%edi, %ecx
+	andq	$-VEC_SIZE, %rdi
+	/* esi is for adjusting length to see if near the end.  */
+	leal	(VEC_SIZE * 5)(%rdi, %rcx), %esi
+#  ifdef USE_AS_WMEMCHR
+	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+	sarl	$2, %esi
+#  endif
+# else
+	andq	$-VEC_SIZE, %rdi
+L(cross_page_continue):
+# endif
+	/* Load first VEC regardless.  */
+	VPCMP	$0, (VEC_SIZE)(%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
+# ifndef USE_AS_RAWMEMCHR
+	/* Adjust length. If near end handle specially.  */
+	subq	%rsi, %rdx
+	jbe	L(last_4x_vec_or_less)
+# endif
 	testl	%eax, %eax
 	jnz	L(first_vec_x1)
 
-	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
+	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x2)
 
-	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
+	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x3)
 
-	addq	$(VEC_SIZE * 4), %rdi
+	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x4)
+
 
 # ifndef USE_AS_RAWMEMCHR
-	subq	$(VEC_SIZE * 4), %rdx
-	jbe	L(last_4x_vec_or_less)
-# endif
+	/* Check if at last CHAR_PER_VEC * 4 length.  */
+	subq	$(CHAR_PER_VEC * 4), %rdx
+	jbe	L(last_4x_vec_or_less_cmpeq)
+	addq	$VEC_SIZE, %rdi
 
-	/* Align data to 4 * VEC_SIZE.  */
-	movq	%rdi, %rcx
-	andl	$(4 * VEC_SIZE - 1), %ecx
+	/* Align data to VEC_SIZE * 4 for the loop and readjust length.
+	 */
+#  ifdef USE_AS_WMEMCHR
+	movl	%edi, %ecx
 	andq	$-(4 * VEC_SIZE), %rdi
-
-# ifndef USE_AS_RAWMEMCHR
-	/* Adjust length.  */
+	andl	$(VEC_SIZE * 4 - 1), %ecx
+	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+	sarl	$2, %ecx
 	addq	%rcx, %rdx
+#  else
+	addq	%rdi, %rdx
+	andq	$-(4 * VEC_SIZE), %rdi
+	subq	%rdi, %rdx
+#  endif
+# else
+	addq	$VEC_SIZE, %rdi
+	andq	$-(4 * VEC_SIZE), %rdi
 # endif
 
+	vpxorq	%XZERO, %XZERO, %XZERO
+
+	/* Compare 4 * VEC at a time forward.  */
 	.p2align 4
 L(loop_4x_vec):
-	/* Compare 4 * VEC at a time forward.  */
-	VPCMP	$0, (%rdi), %YMMMATCH, %k1
-	VPCMP	$0, VEC_SIZE(%rdi), %YMMMATCH, %k2
-	kord	%k1, %k2, %k5
-	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3
-	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4
-
-	kord	%k3, %k4, %k6
-	kortestd %k5, %k6
-	jnz	L(4x_vec_end)
-
-	addq	$(VEC_SIZE * 4), %rdi
-
+	/* It would be possible to save some instructions using 4x VPCMP
+	   but bottleneck on port 5 makes it not worth it.  */
+	VPCMP	$4, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k1
+	/* xor will set bytes that match esi to zero.  */
+	vpxorq	(VEC_SIZE * 5)(%rdi), %YMMMATCH, %YMM2
+	vpxorq	(VEC_SIZE * 6)(%rdi), %YMMMATCH, %YMM3
+	VPCMP	$0, (VEC_SIZE * 7)(%rdi), %YMMMATCH, %k3
+	/* Reduce VEC2 / VEC3 with min and VEC1 with zero mask.  */
+	VPMINU	%YMM2, %YMM3, %YMM3 {%k1} {z}
+	VPCMP	$0, %YMM3, %YZERO, %k2
 # ifdef USE_AS_RAWMEMCHR
-	jmp	L(loop_4x_vec)
+	subq	$-(VEC_SIZE * 4), %rdi
+	kortestd %k2, %k3
+	jz	L(loop_4x_vec)
 # else
-	subq	$(VEC_SIZE * 4), %rdx
+	kortestd %k2, %k3
+	jnz	L(loop_4x_vec_end)
+
+	subq	$-(VEC_SIZE * 4), %rdi
+
+	subq	$(CHAR_PER_VEC * 4), %rdx
 	ja	L(loop_4x_vec)
 
+	/* Fall through into less than 4 remaining vectors of length case.
+	 */
+	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
+	addq	$(VEC_SIZE * 3), %rdi
+	.p2align 4
 L(last_4x_vec_or_less):
-	/* Less than 4 * VEC and aligned to VEC_SIZE.  */
-	addl	$(VEC_SIZE * 2), %edx
-	jle	L(last_2x_vec)
-
-	VPCMP	$0, (%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
+	/* Check if first VEC contained match.  */
 	testl	%eax, %eax
-	jnz	L(first_vec_x0)
+	jnz	L(first_vec_x1_check)
 
-	VPCMP	$0, VEC_SIZE(%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x1)
+	/* If remaining length > CHAR_PER_VEC * 2.  */
+	addl	$(CHAR_PER_VEC * 2), %edx
+	jg	L(last_4x_vec)
 
-	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
-	testl	%eax, %eax
+L(last_2x_vec):
+	/* If remaining length < CHAR_PER_VEC.  */
+	addl	$CHAR_PER_VEC, %edx
+	jle	L(zero_end)
 
-	jnz	L(first_vec_x2_check)
-	subl	$VEC_SIZE, %edx
-	jle	L(zero)
+	/* Check VEC2 and compare any match with remaining length.  */
+	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
+	tzcntl	%eax, %eax
+	cmpl	%eax, %edx
+	jbe	L(set_zero_end)
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+L(zero_end):
+	ret
 
-	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
-	testl	%eax, %eax
 
-	jnz	L(first_vec_x3_check)
+	.p2align 4
+L(first_vec_x1_check):
+	tzcntl	%eax, %eax
+	/* Adjust length.  */
+	subl	$-(CHAR_PER_VEC * 4), %edx
+	/* Check if match within remaining length.  */
+	cmpl	%eax, %edx
+	jbe	L(set_zero_end)
+	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
+	leaq	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
+	ret
+L(set_zero_end):
 	xorl	%eax, %eax
 	ret
 
 	.p2align 4
-L(last_2x_vec):
-	addl	$(VEC_SIZE * 2), %edx
-	VPCMP	$0, (%rdi), %YMMMATCH, %k1
+L(loop_4x_vec_end):
+# endif
+	/* rawmemchr will fall through into this if match was found in
+	   loop.  */
+
+	/* k1 has the NOT of the matches with VEC1.  */
 	kmovd	%k1, %eax
-	testl	%eax, %eax
+# ifdef USE_AS_WMEMCHR
+	subl	$((1 << CHAR_PER_VEC) - 1), %eax
+# else
+	incl	%eax
+# endif
+	jnz	L(last_vec_x1_return)
 
-	jnz	L(first_vec_x0_check)
-	subl	$VEC_SIZE, %edx
-	jle	L(zero)
+	VPCMP	$0, %YMM2, %YZERO, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x2_return)
 
-	VPCMP	$0, VEC_SIZE(%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
+	kmovd	%k2, %eax
 	testl	%eax, %eax
-	jnz	L(first_vec_x1_check)
-	xorl	%eax, %eax
-	ret
+	jnz	L(last_vec_x3_return)
 
-	.p2align 4
-L(first_vec_x0_check):
+	kmovd	%k3, %eax
 	tzcntl	%eax, %eax
-# ifdef USE_AS_WMEMCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	sall	$2, %eax
+# ifdef USE_AS_RAWMEMCHR
+	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+# else
+	leaq	(VEC_SIZE * 7)(%rdi, %rax, CHAR_SIZE), %rax
 # endif
-	/* Check the end of data.  */
-	cmpq	%rax, %rdx
-	jbe	L(zero)
-	addq	%rdi, %rax
 	ret
 
 	.p2align 4
-L(first_vec_x1_check):
+L(last_vec_x1_return):
 	tzcntl	%eax, %eax
-# ifdef USE_AS_WMEMCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	sall	$2, %eax
-# endif
-	/* Check the end of data.  */
-	cmpq	%rax, %rdx
-	jbe	L(zero)
-	addq	$VEC_SIZE, %rax
+# ifdef USE_AS_RAWMEMCHR
+#  ifdef USE_AS_WMEMCHR
+	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+#  else
 	addq	%rdi, %rax
-	ret
-
-	.p2align 4
-L(first_vec_x2_check):
-	tzcntl	%eax, %eax
-# ifdef USE_AS_WMEMCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	sall	$2, %eax
+#  endif
+# else
+	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
+	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
 # endif
-	/* Check the end of data.  */
-	cmpq	%rax, %rdx
-	jbe	L(zero)
-	addq	$(VEC_SIZE * 2), %rax
-	addq	%rdi, %rax
 	ret
 
 	.p2align 4
-L(first_vec_x3_check):
+L(last_vec_x2_return):
 	tzcntl	%eax, %eax
-# ifdef USE_AS_WMEMCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	sall	$2, %eax
+# ifdef USE_AS_RAWMEMCHR
+	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
+	leaq	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
+# else
+	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
+	leaq	(VEC_SIZE * 5)(%rdi, %rax, CHAR_SIZE), %rax
 # endif
-	/* Check the end of data.  */
-	cmpq	%rax, %rdx
-	jbe	L(zero)
-	addq	$(VEC_SIZE * 3), %rax
-	addq	%rdi, %rax
 	ret
 
 	.p2align 4
-L(zero):
-	xorl	%eax, %eax
-	ret
-# endif
-
-	.p2align 4
-L(first_vec_x0):
+L(last_vec_x3_return):
 	tzcntl	%eax, %eax
-# ifdef USE_AS_WMEMCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	leaq	(%rdi, %rax, 4), %rax
+# ifdef USE_AS_RAWMEMCHR
+	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
 # else
-	addq	%rdi, %rax
+	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
+	leaq	(VEC_SIZE * 6)(%rdi, %rax, CHAR_SIZE), %rax
 # endif
 	ret
 
+
+# ifndef USE_AS_RAWMEMCHR
+L(last_4x_vec_or_less_cmpeq):
+	VPCMP	$0, (VEC_SIZE * 5)(%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
+	subq	$-(VEC_SIZE * 4), %rdi
+	/* Check first VEC regardless.  */
+	testl	%eax, %eax
+	jnz	L(first_vec_x1_check)
+
+	/* If remaining length <= CHAR_PER_VEC * 2.  */
+	addl	$(CHAR_PER_VEC * 2), %edx
+	jle	L(last_2x_vec)
+
 	.p2align 4
-L(first_vec_x1):
+L(last_4x_vec):
+	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x2)
+
+
+	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
+	/* Create mask for possible matches within remaining length.  */
+#  ifdef USE_AS_WMEMCHR
+	movl	$((1 << (CHAR_PER_VEC * 2)) - 1), %ecx
+	bzhil	%edx, %ecx, %ecx
+#  else
+	movq	$-1, %rcx
+	bzhiq	%rdx, %rcx, %rcx
+#  endif
+	/* Test matches in data against length match.  */
+	andl	%ecx, %eax
+	jnz	L(last_vec_x3)
+
+	/* If remaining length <= CHAR_PER_VEC * 3 (note this is after
+	   remaining length was found to be > CHAR_PER_VEC * 2).  */
+	subl	$CHAR_PER_VEC, %edx
+	jbe	L(zero_end2)
+
+
+	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
+	/* Shift remaining length mask for last VEC.  */
+#  ifdef USE_AS_WMEMCHR
+	shrl	$CHAR_PER_VEC, %ecx
+#  else
+	shrq	$CHAR_PER_VEC, %rcx
+#  endif
+	andl	%ecx, %eax
+	jz	L(zero_end2)
 	tzcntl	%eax, %eax
-# ifdef USE_AS_WMEMCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	leaq	VEC_SIZE(%rdi, %rax, 4), %rax
-# else
-	addq	$VEC_SIZE, %rax
-	addq	%rdi, %rax
-# endif
+	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
+L(zero_end2):
 	ret
 
-	.p2align 4
-L(first_vec_x2):
+L(last_vec_x2):
 	tzcntl	%eax, %eax
-# ifdef USE_AS_WMEMCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	leaq	(VEC_SIZE * 2)(%rdi, %rax, 4), %rax
-# else
-	addq	$(VEC_SIZE * 2), %rax
-	addq	%rdi, %rax
-# endif
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
 	ret
 
 	.p2align 4
-L(4x_vec_end):
-	kmovd	%k1, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x0)
-	kmovd	%k2, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x1)
-	kmovd	%k3, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x2)
-	kmovd	%k4, %eax
-	testl	%eax, %eax
-L(first_vec_x3):
+L(last_vec_x3):
 	tzcntl	%eax, %eax
-# ifdef USE_AS_WMEMCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	leaq	(VEC_SIZE * 3)(%rdi, %rax, 4), %rax
-# else
-	addq	$(VEC_SIZE * 3), %rax
-	addq	%rdi, %rax
-# endif
+	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
 	ret
+# endif
 
 END (MEMCHR)
 #endif
-- 
2.29.2
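
A note on the L(last_4x_vec) tail above: the movq $-1 / bzhiq pair builds a
mask of the low "remaining length" bits, and the later shrq by CHAR_PER_VEC
reuses the same mask for the next vector. In C the idea is roughly the sketch
below; it is only an illustration (the function and variable names are
invented, and it is not a drop-in equivalent of the assembly):

  /* Sketch: mask per-vector match bits against the remaining length.
     'matches' stands for the kmovd result of one vector, 'remaining' for
     the characters left to search (rdx).  */
  static inline int
  first_match_within_length (unsigned int matches, unsigned long remaining)
  {
    /* movq $-1 + bzhiq: mask with the low 'remaining' bits set (bzhi
       leaves the source unchanged for counts >= 64).  */
    unsigned long mask = remaining < 64 ? (1UL << remaining) - 1 : ~0UL;
    matches &= mask;                   /* andl %ecx, %eax  */
    if (matches == 0)
      return -1;                       /* jz L(zero_end2)  */
    return __builtin_ctz (matches);    /* tzcntl  */
  }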


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v1 1/3] Bench: Expand bench-memchr.c
  2021-05-03 19:51   ` Noah Goldstein
@ 2021-05-03 20:59     ` H.J. Lu
  0 siblings, 0 replies; 20+ messages in thread
From: H.J. Lu @ 2021-05-03 20:59 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell

On Mon, May 3, 2021 at 12:52 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Mon, May 3, 2021 at 1:18 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Mon, May 3, 2021 at 1:45 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > >
> > > No bug. This commit adds some additional cases for bench-memchr.c
> > > including testing medium sizes and testing short length with both an
> > > inbound match and out of bound match.
> > >
> > > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
> > > ---
> > >  benchtests/bench-memchr.c | 13 +++++++++++++
> > >  1 file changed, 13 insertions(+)
> > >
> > > diff --git a/benchtests/bench-memchr.c b/benchtests/bench-memchr.c
> > > index f5ced9d80d..5573f93312 100644
> > > --- a/benchtests/bench-memchr.c
> > > +++ b/benchtests/bench-memchr.c
> > > @@ -135,12 +135,25 @@ test_main (void)
> > >        do_test (i, i, 256, 0);
> > >  #endif
> > >      }
> > > +  for (i = 1; i < 8; ++i)
> > > +    {
> > > +      do_test (i, i << 5, 192, 23);
> > > +      do_test (i, i << 5, 192, 0);
> > > +      do_test (i, i << 5, 256, 23);
> > > +      do_test (i, i << 5, 256, 0);
> > > +      do_test (i, i << 5, 512, 23);
> > > +      do_test (i, i << 5, 512, 0);
> > > +    }
> > >    for (i = 1; i < 32; ++i)
> > >      {
> > >        do_test (0, i, i + 1, 23);
> > >        do_test (0, i, i + 1, 0);
> > >        do_test (i, i, i + 1, 23);
> > >        do_test (i, i, i + 1, 0);
> > > +      do_test (0, i, i - 1, 23);
> > > +      do_test (0, i, i - 1, 0);
> > > +      do_test (i, i, i - 1, 23);
> > > +      do_test (i, i, i - 1, 0);
> > >  #ifdef USE_AS_MEMRCHR
> > >        /* Also test the position close to the beginning for memrchr.  */
> > >        do_test (0, 1, i + 1, 23);
> > > --
> > > 2.29.2
> > >
> >
> > LGTM.  I will check it in for you.
>
> Thanks!
>
> >
> > BTW, can you apply for an account on sourceware.org:
> >
> > https://sourceware.org/
> >
> > so that you can push your commits directly?  You can put me down
> > as your sponsor.
>
> Done. Are there any wikis / manuals on how to properly use write access?

https://sourceware.org/glibc/wiki/HomePage

has a lot of good information.

> All I'm finding are resources on how to obtain it.
>
> >
> > Thanks.
> >
> > --
> > H.J.



-- 
H.J.

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v2 2/3] x86: Optimize memchr-avx2.S
  2021-05-03 20:06   ` [PATCH v2 " Noah Goldstein
  2021-05-03 20:06     ` [PATCH v2 3/3] x86: Optimize memchr-evex.S Noah Goldstein
@ 2021-05-03 22:25     ` H.J. Lu
  2021-05-03 22:58       ` Noah Goldstein
  1 sibling, 1 reply; 20+ messages in thread
From: H.J. Lu @ 2021-05-03 22:25 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: libc-alpha, carlos, hjl.tools

On Mon, May 03, 2021 at 04:06:54PM -0400, Noah Goldstein wrote:
> No bug. This commit optimizes memchr-avx2.S. The optimizations include
> replacing some branches with cmovcc, avoiding some branches entirely
> in the less_4x_vec case, making the page cross logic less strict,
> and saving a few instructions in the loop return path. test-memchr,
> test-rawmemchr, and test-wmemchr are all passing.
> 
> Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
> ---
>  sysdeps/x86_64/multiarch/memchr-avx2.S | 426 ++++++++++++++-----------
>  1 file changed, 247 insertions(+), 179 deletions(-)
> 
> diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
> index 1fcb1c350f..8b862fb9d1 100644
> --- a/sysdeps/x86_64/multiarch/memchr-avx2.S
> +++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
> @@ -26,8 +26,22 @@
>  
>  # ifdef USE_AS_WMEMCHR
>  #  define VPCMPEQ	vpcmpeqd
> +#  define VPBROADCAST	vpbroadcastd
> +#  define CHAR_SIZE	4
>  # else
>  #  define VPCMPEQ	vpcmpeqb
> +#  define VPBROADCAST	vpbroadcastb
> +#  define CHAR_SIZE	1
> +# endif
> +
> +# ifdef USE_AS_RAWMEMCHR
> +#  define ERAW_PTR_REG	ecx
> +#  define RRAW_PTR_REG	rcx
> +#  define ALGN_PTR_REG	rdi
> +# else
> +#  define ERAW_PTR_REG	edi
> +#  define RRAW_PTR_REG	rdi
> +#  define ALGN_PTR_REG	rcx
>  # endif
>  
>  # ifndef VZEROUPPER
> @@ -39,6 +53,7 @@
>  # endif
>  
>  # define VEC_SIZE 32
> +# define PAGE_SIZE 4096
>  
>  	.section SECTION(.text),"ax",@progbits
>  ENTRY (MEMCHR)
> @@ -47,295 +62,348 @@ ENTRY (MEMCHR)
>  	test	%RDX_LP, %RDX_LP
>  	jz	L(null)
>  # endif
> -	movl	%edi, %ecx
> -	/* Broadcast CHAR to YMM0.  */
> -	vmovd	%esi, %xmm0
>  # ifdef USE_AS_WMEMCHR
>  	shl	$2, %RDX_LP
> -	vpbroadcastd %xmm0, %ymm0
>  # else
>  #  ifdef __ILP32__
>  	/* Clear the upper 32 bits.  */
>  	movl	%edx, %edx
>  #  endif
> -	vpbroadcastb %xmm0, %ymm0
>  # endif
> +	/* Broadcast CHAR to YMMMATCH.  */
> +	vmovd	%esi, %xmm0
> +	VPBROADCAST %xmm0, %ymm0
>  	/* Check if we may cross page boundary with one vector load.  */
> -	andl	$(2 * VEC_SIZE - 1), %ecx
> -	cmpl	$VEC_SIZE, %ecx
> -	ja	L(cros_page_boundary)
> +	movl	%edi, %eax
> +	andl	$(PAGE_SIZE - 1), %eax
> +	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
> +	ja	L(cross_page_boundary)
>  
>  	/* Check the first VEC_SIZE bytes.  */
> -	VPCMPEQ (%rdi), %ymm0, %ymm1
> +	VPCMPEQ	(%rdi), %ymm0, %ymm1
>  	vpmovmskb %ymm1, %eax
> -	testl	%eax, %eax
> -
>  # ifndef USE_AS_RAWMEMCHR
> -	jnz	L(first_vec_x0_check)
> -	/* Adjust length and check the end of data.  */
> -	subq	$VEC_SIZE, %rdx
> -	jbe	L(zero)
> -# else
> -	jnz	L(first_vec_x0)
> +	/* If length < CHAR_PER_VEC handle special.  */
> +	cmpq	$VEC_SIZE, %rdx
> +	jbe	L(first_vec_x0)
>  # endif
> -
> -	/* Align data for aligned loads in the loop.  */
> -	addq	$VEC_SIZE, %rdi
> -	andl	$(VEC_SIZE - 1), %ecx
> -	andq	$-VEC_SIZE, %rdi
> +	testl	%eax, %eax
> +	jz	L(aligned_more)
> +	tzcntl	%eax, %eax
> +	addq	%rdi, %rax
> +	VZEROUPPER_RETURN
>  
>  # ifndef USE_AS_RAWMEMCHR
> -	/* Adjust length.  */
> -	addq	%rcx, %rdx
> -
> -	subq	$(VEC_SIZE * 4), %rdx
> -	jbe	L(last_4x_vec_or_less)
> +	.p2align 5
> +L(first_vec_x0):
> +	/* Check if first match was before length.  */
> +	tzcntl	%eax, %eax
> +	xorl	%ecx, %ecx
> +	cmpl	%eax, %edx
> +	leaq	(%rdi, %rax), %rax
> +	cmovle	%rcx, %rax
> +	VZEROUPPER_RETURN

Please add a blank line here to indicate this begins a new block.
OK with this change.  You should be able to push it yourself now.

Thanks.
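
For reference, the cmovle sequence quoted above is the branchless form of
"return NULL if the first match lies past the length". In C the idea is
roughly the following sketch (illustrative only, with invented names; it is
not the glibc code itself):

  #include <stddef.h>

  /* 'matches' is the vpmovmskb result for the first vector, 'len' the
     remaining byte count in rdx.  Only reached when len <= VEC_SIZE.  */
  static inline char *
  first_vec_check (char *s, unsigned int matches, long len)
  {
    /* tzcntl: index of the first match.  The hardware instruction yields
       32 for an all-zero mask, which the len <= idx test also rejects;
       __builtin_ctz (0) is undefined in C, so handle it explicitly.  */
    long idx = matches ? __builtin_ctz (matches) : 32;
    char *hit = s + idx;               /* leaq (%rdi, %rax), %rax  */
    return len <= idx ? NULL : hit;    /* cmovle %rcx, %rax        */
  }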

> +L(null):
> +	xorl	%eax, %eax
> +	ret
>  # endif
> -	jmp	L(more_4x_vec)
> -
>  	.p2align 4
> -L(cros_page_boundary):
> -	andl	$(VEC_SIZE - 1), %ecx
> -	andq	$-VEC_SIZE, %rdi
> -	VPCMPEQ (%rdi), %ymm0, %ymm1
> +L(cross_page_boundary):
> +	/* Save pointer before aligning as its original value is necessary
> +	   for computing the return address if byte is found or adjusting length
> +	   if it is not and this is memchr.  */
> +	movq	%rdi, %rcx
> +	/* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr and
> +	   rdi for rawmemchr.  */
> +	orq	$(VEC_SIZE - 1), %ALGN_PTR_REG
> +	VPCMPEQ	-(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1
>  	vpmovmskb %ymm1, %eax
> +# ifndef USE_AS_RAWMEMCHR
> +	/* Calculate length until end of page (length checked for a
> +	   match).  */
> +	leaq	1(%ALGN_PTR_REG), %rsi
> +	subq	%RRAW_PTR_REG, %rsi
> +# endif
>  	/* Remove the leading bytes.  */
> -	sarl	%cl, %eax
> -	testl	%eax, %eax
> -	jz	L(aligned_more)
> -	tzcntl	%eax, %eax
> +	sarxl	%ERAW_PTR_REG, %eax, %eax
>  # ifndef USE_AS_RAWMEMCHR
>  	/* Check the end of data.  */
> -	cmpq	%rax, %rdx
> -	jbe	L(zero)
> +	cmpq	%rsi, %rdx
> +	jbe	L(first_vec_x0)
>  # endif
> -	addq	%rdi, %rax
> -	addq	%rcx, %rax
> +	testl	%eax, %eax
> +	jz	L(cross_page_continue)
> +	tzcntl	%eax, %eax
> +	addq	%RRAW_PTR_REG, %rax
>  L(return_vzeroupper):
>  	ZERO_UPPER_VEC_REGISTERS_RETURN
>  
>  	.p2align 4
> -L(aligned_more):
> -# ifndef USE_AS_RAWMEMCHR
> -        /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)"
> -	   instead of "(rdx + rcx) - VEC_SIZE" to void possible addition
> -	   overflow.  */
> -	negq	%rcx
> -	addq	$VEC_SIZE, %rcx
> +L(first_vec_x1):
> +	tzcntl	%eax, %eax
> +	incq	%rdi
> +	addq	%rdi, %rax
> +	VZEROUPPER_RETURN
>  
> -	/* Check the end of data.  */
> -	subq	%rcx, %rdx
> -	jbe	L(zero)
> -# endif
> +	.p2align 4
> +L(first_vec_x2):
> +	tzcntl	%eax, %eax
> +	addq	$(VEC_SIZE + 1), %rdi
> +	addq	%rdi, %rax
> +	VZEROUPPER_RETURN
>  
> -	addq	$VEC_SIZE, %rdi
> +	.p2align 4
> +L(first_vec_x3):
> +	tzcntl	%eax, %eax
> +	addq	$(VEC_SIZE * 2 + 1), %rdi
> +	addq	%rdi, %rax
> +	VZEROUPPER_RETURN
>  
> -# ifndef USE_AS_RAWMEMCHR
> -	subq	$(VEC_SIZE * 4), %rdx
> -	jbe	L(last_4x_vec_or_less)
> -# endif
>  
> -L(more_4x_vec):
> +	.p2align 4
> +L(first_vec_x4):
> +	tzcntl	%eax, %eax
> +	addq	$(VEC_SIZE * 3 + 1), %rdi
> +	addq	%rdi, %rax
> +	VZEROUPPER_RETURN
> +
> +	.p2align 4
> +L(aligned_more):
>  	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
>  	   since data is only aligned to VEC_SIZE.  */
> -	VPCMPEQ (%rdi), %ymm0, %ymm1
> -	vpmovmskb %ymm1, %eax
> -	testl	%eax, %eax
> -	jnz	L(first_vec_x0)
>  
> -	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
> +# ifndef USE_AS_RAWMEMCHR
> +L(cross_page_continue):
> +	/* Align data to VEC_SIZE - 1.  */
> +	xorl	%ecx, %ecx
> +	subl	%edi, %ecx
> +	orq	$(VEC_SIZE - 1), %rdi
> +	/* esi is for adjusting length to see if near the end.  */
> +	leal	(VEC_SIZE * 4 + 1)(%rdi, %rcx), %esi
> +# else
> +	orq	$(VEC_SIZE - 1), %rdi
> +L(cross_page_continue):
> +# endif
> +	/* Load first VEC regardless.  */
> +	VPCMPEQ	1(%rdi), %ymm0, %ymm1
>  	vpmovmskb %ymm1, %eax
> +# ifndef USE_AS_RAWMEMCHR
> +	/* Adjust length. If near end handle specially.  */
> +	subq	%rsi, %rdx
> +	jbe	L(last_4x_vec_or_less)
> +# endif
>  	testl	%eax, %eax
>  	jnz	L(first_vec_x1)
>  
> -	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
> +	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
>  	vpmovmskb %ymm1, %eax
>  	testl	%eax, %eax
>  	jnz	L(first_vec_x2)
>  
> -	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
> +	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
>  	vpmovmskb %ymm1, %eax
>  	testl	%eax, %eax
>  	jnz	L(first_vec_x3)
>  
> -	addq	$(VEC_SIZE * 4), %rdi
> +	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
> +	vpmovmskb %ymm1, %eax
> +	testl	%eax, %eax
> +	jnz	L(first_vec_x4)
>  
>  # ifndef USE_AS_RAWMEMCHR
> +	/* Check if at last VEC_SIZE * 4 length.  */
>  	subq	$(VEC_SIZE * 4), %rdx
> -	jbe	L(last_4x_vec_or_less)
> -# endif
> -
> -	/* Align data to 4 * VEC_SIZE.  */
> -	movq	%rdi, %rcx
> -	andl	$(4 * VEC_SIZE - 1), %ecx
> -	andq	$-(4 * VEC_SIZE), %rdi
> -
> -# ifndef USE_AS_RAWMEMCHR
> -	/* Adjust length.  */
> +	jbe	L(last_4x_vec_or_less_cmpeq)
> +	/* Align data to VEC_SIZE * 4 - 1 for the loop and readjust
> +	   length.  */
> +	incq	%rdi
> +	movl	%edi, %ecx
> +	orq	$(VEC_SIZE * 4 - 1), %rdi
> +	andl	$(VEC_SIZE * 4 - 1), %ecx
>  	addq	%rcx, %rdx
> +# else
> +	/* Align data to VEC_SIZE * 4 - 1 for loop.  */
> +	incq	%rdi
> +	orq	$(VEC_SIZE * 4 - 1), %rdi
>  # endif
>  
> +	/* Compare 4 * VEC at a time forward.  */
>  	.p2align 4
>  L(loop_4x_vec):
> -	/* Compare 4 * VEC at a time forward.  */
> -	VPCMPEQ (%rdi), %ymm0, %ymm1
> -	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm2
> -	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm3
> -	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm4
> -
> +	VPCMPEQ	1(%rdi), %ymm0, %ymm1
> +	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm2
> +	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm3
> +	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm4
>  	vpor	%ymm1, %ymm2, %ymm5
>  	vpor	%ymm3, %ymm4, %ymm6
>  	vpor	%ymm5, %ymm6, %ymm5
>  
> -	vpmovmskb %ymm5, %eax
> -	testl	%eax, %eax
> -	jnz	L(4x_vec_end)
> -
> -	addq	$(VEC_SIZE * 4), %rdi
> -
> +	vpmovmskb %ymm5, %ecx
>  # ifdef USE_AS_RAWMEMCHR
> -	jmp	L(loop_4x_vec)
> +	subq	$-(VEC_SIZE * 4), %rdi
> +	testl	%ecx, %ecx
> +	jz	L(loop_4x_vec)
>  # else
> -	subq	$(VEC_SIZE * 4), %rdx
> -	ja	L(loop_4x_vec)
> +	testl	%ecx, %ecx
> +	jnz	L(loop_4x_vec_end)
>  
> -L(last_4x_vec_or_less):
> -	/* Less than 4 * VEC and aligned to VEC_SIZE.  */
> -	addl	$(VEC_SIZE * 2), %edx
> -	jle	L(last_2x_vec)
> +	subq	$-(VEC_SIZE * 4), %rdi
>  
> -	VPCMPEQ (%rdi), %ymm0, %ymm1
> -	vpmovmskb %ymm1, %eax
> -	testl	%eax, %eax
> -	jnz	L(first_vec_x0)
> +	subq	$(VEC_SIZE * 4), %rdx
> +	ja	L(loop_4x_vec)
>  
> -	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
> +	/* Fall through into less than 4 remaining vectors of length case.
> +	 */
> +	VPCMPEQ	(VEC_SIZE * 0 + 1)(%rdi), %ymm0, %ymm1
>  	vpmovmskb %ymm1, %eax
> +	.p2align 4
> +L(last_4x_vec_or_less):
> +	/* Check if first VEC contained match.  */
>  	testl	%eax, %eax
> -	jnz	L(first_vec_x1)
> +	jnz	L(first_vec_x1_check)
>  
> -	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
> -	vpmovmskb %ymm1, %eax
> -	testl	%eax, %eax
> +	/* If remaining length > VEC_SIZE * 2.  */
> +	addl	$(VEC_SIZE * 2), %edx
> +	jg	L(last_4x_vec)
>  
> -	jnz	L(first_vec_x2_check)
> -	subl	$VEC_SIZE, %edx
> -	jle	L(zero)
> +L(last_2x_vec):
> +	/* If remaining length < VEC_SIZE.  */
> +	addl	$VEC_SIZE, %edx
> +	jle	L(zero_end)
>  
> -	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
> +	/* Check VEC2 and compare any match with remaining length.  */
> +	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
>  	vpmovmskb %ymm1, %eax
> -	testl	%eax, %eax
> -
> -	jnz	L(first_vec_x3_check)
> -	xorl	%eax, %eax
> +	tzcntl	%eax, %eax
> +	cmpl	%eax, %edx
> +	jbe	L(set_zero_end)
> +	addq	$(VEC_SIZE + 1), %rdi
> +	addq	%rdi, %rax
> +L(zero_end):
>  	VZEROUPPER_RETURN
>  
>  	.p2align 4
> -L(last_2x_vec):
> -	addl	$(VEC_SIZE * 2), %edx
> -	VPCMPEQ (%rdi), %ymm0, %ymm1
> +L(loop_4x_vec_end):
> +# endif
> +	/* rawmemchr will fall through into this if match was found in
> +	   loop.  */
> +
>  	vpmovmskb %ymm1, %eax
>  	testl	%eax, %eax
> +	jnz	L(last_vec_x1_return)
>  
> -	jnz	L(first_vec_x0_check)
> -	subl	$VEC_SIZE, %edx
> -	jle	L(zero)
> -
> -	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
> -	vpmovmskb %ymm1, %eax
> +	vpmovmskb %ymm2, %eax
>  	testl	%eax, %eax
> -	jnz	L(first_vec_x1_check)
> -	xorl	%eax, %eax
> -	VZEROUPPER_RETURN
> +	jnz	L(last_vec_x2_return)
>  
> -	.p2align 4
> -L(first_vec_x0_check):
> -	tzcntl	%eax, %eax
> -	/* Check the end of data.  */
> -	cmpq	%rax, %rdx
> -	jbe	L(zero)
> +	vpmovmskb %ymm3, %eax
> +	/* Combine VEC3 matches (eax) with VEC4 matches (ecx).  */
> +	salq	$32, %rcx
> +	orq	%rcx, %rax
> +	tzcntq	%rax, %rax
> +# ifdef USE_AS_RAWMEMCHR
> +	subq	$(VEC_SIZE * 2 - 1), %rdi
> +# else
> +	subq	$-(VEC_SIZE * 2 + 1), %rdi
> +# endif
>  	addq	%rdi, %rax
>  	VZEROUPPER_RETURN
> +# ifndef USE_AS_RAWMEMCHR
>  
>  	.p2align 4
>  L(first_vec_x1_check):
>  	tzcntl	%eax, %eax
> -	/* Check the end of data.  */
> -	cmpq	%rax, %rdx
> -	jbe	L(zero)
> -	addq	$VEC_SIZE, %rax
> +	/* Adjust length.  */
> +	subl	$-(VEC_SIZE * 4), %edx
> +	/* Check if match within remaining length.  */
> +	cmpl	%eax, %edx
> +	jbe	L(set_zero_end)
> +	incq	%rdi
>  	addq	%rdi, %rax
>  	VZEROUPPER_RETURN
> +	.p2align 4
> +L(set_zero_end):
> +	xorl	%eax, %eax
> +	VZEROUPPER_RETURN
> +# endif
>  
>  	.p2align 4
> -L(first_vec_x2_check):
> +L(last_vec_x1_return):
>  	tzcntl	%eax, %eax
> -	/* Check the end of data.  */
> -	cmpq	%rax, %rdx
> -	jbe	L(zero)
> -	addq	$(VEC_SIZE * 2), %rax
> +# ifdef USE_AS_RAWMEMCHR
> +	subq	$(VEC_SIZE * 4 - 1), %rdi
> +# else
> +	incq	%rdi
> +# endif
>  	addq	%rdi, %rax
>  	VZEROUPPER_RETURN
>  
>  	.p2align 4
> -L(first_vec_x3_check):
> +L(last_vec_x2_return):
>  	tzcntl	%eax, %eax
> -	/* Check the end of data.  */
> -	cmpq	%rax, %rdx
> -	jbe	L(zero)
> -	addq	$(VEC_SIZE * 3), %rax
> +# ifdef USE_AS_RAWMEMCHR
> +	subq	$(VEC_SIZE * 3 - 1), %rdi
> +# else
> +	subq	$-(VEC_SIZE + 1), %rdi
> +# endif
>  	addq	%rdi, %rax
>  	VZEROUPPER_RETURN
>  
> +# ifndef USE_AS_RAWMEMCHR
>  	.p2align 4
> -L(zero):
> -	xorl	%eax, %eax
> -	jmp     L(return_vzeroupper)
> +L(last_4x_vec_or_less_cmpeq):
> +	VPCMPEQ	(VEC_SIZE * 4 + 1)(%rdi), %ymm0, %ymm1
> +	vpmovmskb %ymm1, %eax
> +	subq	$-(VEC_SIZE * 4), %rdi
> +	/* Check first VEC regardless.  */
> +	testl	%eax, %eax
> +	jnz	L(first_vec_x1_check)
>  
> +	/* If remaining length <= CHAR_PER_VEC * 2.  */
> +	addl	$(VEC_SIZE * 2), %edx
> +	jle	L(last_2x_vec)
>  	.p2align 4
> -L(null):
> -	xorl	%eax, %eax
> -	ret
> -# endif
> +L(last_4x_vec):
> +	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> +	vpmovmskb %ymm1, %eax
> +	testl	%eax, %eax
> +	jnz	L(last_vec_x2_return)
>  
> -	.p2align 4
> -L(first_vec_x0):
> -	tzcntl	%eax, %eax
> -	addq	%rdi, %rax
> -	VZEROUPPER_RETURN
> +	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
> +	vpmovmskb %ymm1, %eax
>  
> -	.p2align 4
> -L(first_vec_x1):
> -	tzcntl	%eax, %eax
> -	addq	$VEC_SIZE, %rax
> -	addq	%rdi, %rax
> -	VZEROUPPER_RETURN
> +	/* Create mask for possible matches within remaining length.  */
> +	movq	$-1, %rcx
> +	bzhiq	%rdx, %rcx, %rcx
>  
> -	.p2align 4
> -L(first_vec_x2):
> +	/* Test matches in data against length match.  */
> +	andl	%ecx, %eax
> +	jnz	L(last_vec_x3)
> +
> +	/* if remaining length <= VEC_SIZE * 3 (Note this is after
> +	   remaining length was found to be > VEC_SIZE * 2.  */
> +	subl	$VEC_SIZE, %edx
> +	jbe	L(zero_end2)
> +
> +	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
> +	vpmovmskb %ymm1, %eax
> +	/* Shift remaining length mask for last VEC.  */
> +	shrq	$32, %rcx
> +	andl	%ecx, %eax
> +	jz	L(zero_end2)
>  	tzcntl	%eax, %eax
> -	addq	$(VEC_SIZE * 2), %rax
> +	addq	$(VEC_SIZE * 3 + 1), %rdi
>  	addq	%rdi, %rax
> +L(zero_end2):
>  	VZEROUPPER_RETURN
>  
>  	.p2align 4
> -L(4x_vec_end):
> -	vpmovmskb %ymm1, %eax
> -	testl	%eax, %eax
> -	jnz	L(first_vec_x0)
> -	vpmovmskb %ymm2, %eax
> -	testl	%eax, %eax
> -	jnz	L(first_vec_x1)
> -	vpmovmskb %ymm3, %eax
> -	testl	%eax, %eax
> -	jnz	L(first_vec_x2)
> -	vpmovmskb %ymm4, %eax
> -	testl	%eax, %eax
> -L(first_vec_x3):
> +L(last_vec_x3):
>  	tzcntl	%eax, %eax
> -	addq	$(VEC_SIZE * 3), %rax
> +	subq	$-(VEC_SIZE * 2 + 1), %rdi
>  	addq	%rdi, %rax
>  	VZEROUPPER_RETURN
> +# endif
>  
>  END (MEMCHR)
>  #endif
> -- 
> 2.29.2
> 
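
One more detail from the loop-return path in the patch above: after VEC1 and
VEC2 are found empty, the VEC3 matches in eax are combined with the mask
already sitting in ecx, shifted into the high 32 bits (salq $32 / orq), so
that a single tzcntq locates the first match across both VEC3 and VEC4.
Conceptually it is the following C sketch (invented names; the assembly
actually shifts the OR of all four per-vector masks, which gives the same
answer at that point):

  /* Fold two 32-byte movemasks so one tzcnt picks the earliest match.
     Called only when at least one of the two masks is non-zero.  */
  static inline unsigned int
  first_match_vec3_vec4 (unsigned int mask_vec3, unsigned int mask_vec4)
  {
    unsigned long combined =
      ((unsigned long) mask_vec4 << 32) | mask_vec3;
    return __builtin_ctzll (combined);   /* tzcntq %rax, %rax  */
  }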

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v2 3/3] x86: Optimize memchr-evex.S
  2021-05-03 20:06     ` [PATCH v2 3/3] x86: Optimize memchr-evex.S Noah Goldstein
@ 2021-05-03 22:26       ` H.J. Lu
  2021-05-03 22:58         ` Noah Goldstein
  0 siblings, 1 reply; 20+ messages in thread
From: H.J. Lu @ 2021-05-03 22:26 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: libc-alpha, carlos, hjl.tools

On Mon, May 03, 2021 at 04:06:55PM -0400, Noah Goldstein wrote:
> No bug. This commit optimizes memchr-evex.S. The optimizations include
> replacing some branches with cmovcc, avoiding some branches entirely
> in the less_4x_vec case, making the page cross logic less strict,
> saving some ALU in the alignment process, and most importantly
> increasing ILP in the 4x loop. test-memchr, test-rawmemchr, and
> test-wmemchr are all passing.
> 
> Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
> ---
>  sysdeps/x86_64/multiarch/memchr-evex.S | 547 +++++++++++++++----------
>  1 file changed, 322 insertions(+), 225 deletions(-)
> 
> diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S
> index 6dd5d67b90..147d7aa8ee 100644
> --- a/sysdeps/x86_64/multiarch/memchr-evex.S
> +++ b/sysdeps/x86_64/multiarch/memchr-evex.S
> @@ -26,14 +26,28 @@
>  
>  # ifdef USE_AS_WMEMCHR
>  #  define VPBROADCAST	vpbroadcastd
> -#  define VPCMP		vpcmpd
> -#  define SHIFT_REG	r8d
> +#  define VPMINU	vpminud
> +#  define VPCMP	vpcmpd
> +#  define VPCMPEQ	vpcmpeqd
> +#  define CHAR_SIZE	4
>  # else
>  #  define VPBROADCAST	vpbroadcastb
> -#  define VPCMP		vpcmpb
> -#  define SHIFT_REG	ecx
> +#  define VPMINU	vpminub
> +#  define VPCMP	vpcmpb
> +#  define VPCMPEQ	vpcmpeqb
> +#  define CHAR_SIZE	1
>  # endif
>  
> +# ifdef USE_AS_RAWMEMCHR
> +#  define RAW_PTR_REG	rcx
> +#  define ALGN_PTR_REG	rdi
> +# else
> +#  define RAW_PTR_REG	rdi
> +#  define ALGN_PTR_REG	rcx
> +# endif
> +
> +# define XZERO		xmm23
> +# define YZERO		ymm23

Please rename XZERO/YZERO to XMMZERO/YMMZERO.  OK with this change.

Thanks.

>  # define XMMMATCH	xmm16
>  # define YMMMATCH	ymm16
>  # define YMM1		ymm17
> @@ -44,6 +58,8 @@
>  # define YMM6		ymm22
>  
>  # define VEC_SIZE 32
> +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
> +# define PAGE_SIZE 4096
>  
>  	.section .text.evex,"ax",@progbits
>  ENTRY (MEMCHR)
> @@ -51,11 +67,7 @@ ENTRY (MEMCHR)
>  	/* Check for zero length.  */
>  	test	%RDX_LP, %RDX_LP
>  	jz	L(zero)
> -# endif
> -	movl	%edi, %ecx
> -# ifdef USE_AS_WMEMCHR
> -	shl	$2, %RDX_LP
> -# else
> +
>  #  ifdef __ILP32__
>  	/* Clear the upper 32 bits.  */
>  	movl	%edx, %edx
> @@ -64,318 +76,403 @@ ENTRY (MEMCHR)
>  	/* Broadcast CHAR to YMMMATCH.  */
>  	VPBROADCAST %esi, %YMMMATCH
>  	/* Check if we may cross page boundary with one vector load.  */
> -	andl	$(2 * VEC_SIZE - 1), %ecx
> -	cmpl	$VEC_SIZE, %ecx
> -	ja	L(cros_page_boundary)
> +	movl	%edi, %eax
> +	andl	$(PAGE_SIZE - 1), %eax
> +	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
> +	ja	L(cross_page_boundary)
>  
>  	/* Check the first VEC_SIZE bytes.  */
> -	VPCMP	$0, (%rdi), %YMMMATCH, %k1
> -	kmovd	%k1, %eax
> -	testl	%eax, %eax
> -
> +	VPCMP	$0, (%rdi), %YMMMATCH, %k0
> +	kmovd	%k0, %eax
>  # ifndef USE_AS_RAWMEMCHR
> -	jnz	L(first_vec_x0_check)
> -	/* Adjust length and check the end of data.  */
> -	subq	$VEC_SIZE, %rdx
> -	jbe	L(zero)
> +	/* If length < CHAR_PER_VEC handle special.  */
> +	cmpq	$CHAR_PER_VEC, %rdx
> +	jbe	L(first_vec_x0)
> +# endif
> +	testl	%eax, %eax
> +	jz	L(aligned_more)
> +	tzcntl	%eax, %eax
> +# ifdef USE_AS_WMEMCHR
> +	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
> +	leaq	(%rdi, %rax, CHAR_SIZE), %rax
>  # else
> -	jnz	L(first_vec_x0)
> +	addq	%rdi, %rax
>  # endif
> -
> -	/* Align data for aligned loads in the loop.  */
> -	addq	$VEC_SIZE, %rdi
> -	andl	$(VEC_SIZE - 1), %ecx
> -	andq	$-VEC_SIZE, %rdi
> +	ret
>  
>  # ifndef USE_AS_RAWMEMCHR
> -	/* Adjust length.  */
> -	addq	%rcx, %rdx
> -
> -	subq	$(VEC_SIZE * 4), %rdx
> -	jbe	L(last_4x_vec_or_less)
> -# endif
> -	jmp	L(more_4x_vec)
> +L(zero):
> +	xorl	%eax, %eax
> +	ret
>  
> +	.p2align 5
> +L(first_vec_x0):
> +	/* Check if first match was before length.  */
> +	tzcntl	%eax, %eax
> +	xorl	%ecx, %ecx
> +	cmpl	%eax, %edx
> +	leaq	(%rdi, %rax, CHAR_SIZE), %rax
> +	cmovle	%rcx, %rax
> +	ret
> +# else
> +	/* NB: first_vec_x0 is 17 bytes which will leave
> +	   cross_page_boundary (which is relatively cold) close enough
> +	   to ideal alignment. So only realign L(cross_page_boundary) if
> +	   rawmemchr.  */
>  	.p2align 4
> -L(cros_page_boundary):
> -	andl	$(VEC_SIZE - 1), %ecx
> +# endif
> +L(cross_page_boundary):
> +	/* Save pointer before aligning as its original value is
> +	   necessary for computing the return address if byte is found or
> +	   adjusting length if it is not and this is memchr.  */
> +	movq	%rdi, %rcx
> +	/* Align data to VEC_SIZE. ALGN_PTR_REG is rcx for memchr and rdi
> +	   for rawmemchr.  */
> +	andq	$-VEC_SIZE, %ALGN_PTR_REG
> +	VPCMP	$0, (%ALGN_PTR_REG), %YMMMATCH, %k0
> +	kmovd	%k0, %r8d
>  # ifdef USE_AS_WMEMCHR
> -	/* NB: Divide shift count by 4 since each bit in K1 represent 4
> +	/* NB: Divide shift count by 4 since each bit in K0 represent 4
>  	   bytes.  */
> -	movl	%ecx, %SHIFT_REG
> -	sarl	$2, %SHIFT_REG
> +	sarl	$2, %eax
> +# endif
> +# ifndef USE_AS_RAWMEMCHR
> +	movl	$(PAGE_SIZE / CHAR_SIZE), %esi
> +	subl	%eax, %esi
>  # endif
> -	andq	$-VEC_SIZE, %rdi
> -	VPCMP	$0, (%rdi), %YMMMATCH, %k1
> -	kmovd	%k1, %eax
> -	/* Remove the leading bytes.  */
> -	sarxl	%SHIFT_REG, %eax, %eax
> -	testl	%eax, %eax
> -	jz	L(aligned_more)
> -	tzcntl	%eax, %eax
>  # ifdef USE_AS_WMEMCHR
> -	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
> -	sall	$2, %eax
> +	andl	$(CHAR_PER_VEC - 1), %eax
>  # endif
> +	/* Remove the leading bytes.  */
> +	sarxl	%eax, %r8d, %eax
>  # ifndef USE_AS_RAWMEMCHR
>  	/* Check the end of data.  */
> -	cmpq	%rax, %rdx
> -	jbe	L(zero)
> +	cmpq	%rsi, %rdx
> +	jbe	L(first_vec_x0)
> +# endif
> +	testl	%eax, %eax
> +	jz	L(cross_page_continue)
> +	tzcntl	%eax, %eax
> +# ifdef USE_AS_WMEMCHR
> +	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
> +	leaq	(%RAW_PTR_REG, %rax, CHAR_SIZE), %rax
> +# else
> +	addq	%RAW_PTR_REG, %rax
>  # endif
> -	addq	%rdi, %rax
> -	addq	%rcx, %rax
>  	ret
>  
>  	.p2align 4
> -L(aligned_more):
> -# ifndef USE_AS_RAWMEMCHR
> -        /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)"
> -	   instead of "(rdx + rcx) - VEC_SIZE" to void possible addition
> -	   overflow.  */
> -	negq	%rcx
> -	addq	$VEC_SIZE, %rcx
> +L(first_vec_x1):
> +	tzcntl	%eax, %eax
> +	leaq	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
> +	ret
>  
> -	/* Check the end of data.  */
> -	subq	%rcx, %rdx
> -	jbe	L(zero)
> -# endif
> +	.p2align 4
> +L(first_vec_x2):
> +	tzcntl	%eax, %eax
> +	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> +	ret
>  
> -	addq	$VEC_SIZE, %rdi
> +	.p2align 4
> +L(first_vec_x3):
> +	tzcntl	%eax, %eax
> +	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
> +	ret
>  
> -# ifndef USE_AS_RAWMEMCHR
> -	subq	$(VEC_SIZE * 4), %rdx
> -	jbe	L(last_4x_vec_or_less)
> -# endif
> +	.p2align 4
> +L(first_vec_x4):
> +	tzcntl	%eax, %eax
> +	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
> +	ret
>  
> -L(more_4x_vec):
> +	.p2align 5
> +L(aligned_more):
>  	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
>  	   since data is only aligned to VEC_SIZE.  */
> -	VPCMP	$0, (%rdi), %YMMMATCH, %k1
> -	kmovd	%k1, %eax
> -	testl	%eax, %eax
> -	jnz	L(first_vec_x0)
>  
> -	VPCMP	$0, VEC_SIZE(%rdi), %YMMMATCH, %k1
> -	kmovd	%k1, %eax
> +# ifndef USE_AS_RAWMEMCHR
> +	/* Align data to VEC_SIZE.  */
> +L(cross_page_continue):
> +	xorl	%ecx, %ecx
> +	subl	%edi, %ecx
> +	andq	$-VEC_SIZE, %rdi
> +	/* esi is for adjusting length to see if near the end.  */
> +	leal	(VEC_SIZE * 5)(%rdi, %rcx), %esi
> +#  ifdef USE_AS_WMEMCHR
> +	/* NB: Divide bytes by 4 to get the wchar_t count.  */
> +	sarl	$2, %esi
> +#  endif
> +# else
> +	andq	$-VEC_SIZE, %rdi
> +L(cross_page_continue):
> +# endif
> +	/* Load first VEC regardless.  */
> +	VPCMP	$0, (VEC_SIZE)(%rdi), %YMMMATCH, %k0
> +	kmovd	%k0, %eax
> +# ifndef USE_AS_RAWMEMCHR
> +	/* Adjust length. If near end handle specially.  */
> +	subq	%rsi, %rdx
> +	jbe	L(last_4x_vec_or_less)
> +# endif
>  	testl	%eax, %eax
>  	jnz	L(first_vec_x1)
>  
> -	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
> -	kmovd	%k1, %eax
> +	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
> +	kmovd	%k0, %eax
>  	testl	%eax, %eax
>  	jnz	L(first_vec_x2)
>  
> -	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
> -	kmovd	%k1, %eax
> +	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
> +	kmovd	%k0, %eax
>  	testl	%eax, %eax
>  	jnz	L(first_vec_x3)
>  
> -	addq	$(VEC_SIZE * 4), %rdi
> +	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
> +	kmovd	%k0, %eax
> +	testl	%eax, %eax
> +	jnz	L(first_vec_x4)
> +
>  
>  # ifndef USE_AS_RAWMEMCHR
> -	subq	$(VEC_SIZE * 4), %rdx
> -	jbe	L(last_4x_vec_or_less)
> -# endif
> +	/* Check if at last CHAR_PER_VEC * 4 length.  */
> +	subq	$(CHAR_PER_VEC * 4), %rdx
> +	jbe	L(last_4x_vec_or_less_cmpeq)
> +	addq	$VEC_SIZE, %rdi
>  
> -	/* Align data to 4 * VEC_SIZE.  */
> -	movq	%rdi, %rcx
> -	andl	$(4 * VEC_SIZE - 1), %ecx
> +	/* Align data to VEC_SIZE * 4 for the loop and readjust length.
> +	 */
> +#  ifdef USE_AS_WMEMCHR
> +	movl	%edi, %ecx
>  	andq	$-(4 * VEC_SIZE), %rdi
> -
> -# ifndef USE_AS_RAWMEMCHR
> -	/* Adjust length.  */
> +	andl	$(VEC_SIZE * 4 - 1), %ecx
> +	/* NB: Divide bytes by 4 to get the wchar_t count.  */
> +	sarl	$2, %ecx
>  	addq	%rcx, %rdx
> +#  else
> +	addq	%rdi, %rdx
> +	andq	$-(4 * VEC_SIZE), %rdi
> +	subq	%rdi, %rdx
> +#  endif
> +# else
> +	addq	$VEC_SIZE, %rdi
> +	andq	$-(4 * VEC_SIZE), %rdi
>  # endif
>  
> +	vpxorq	%XZERO, %XZERO, %XZERO
> +
> +	/* Compare 4 * VEC at a time forward.  */
>  	.p2align 4
>  L(loop_4x_vec):
> -	/* Compare 4 * VEC at a time forward.  */
> -	VPCMP	$0, (%rdi), %YMMMATCH, %k1
> -	VPCMP	$0, VEC_SIZE(%rdi), %YMMMATCH, %k2
> -	kord	%k1, %k2, %k5
> -	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3
> -	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4
> -
> -	kord	%k3, %k4, %k6
> -	kortestd %k5, %k6
> -	jnz	L(4x_vec_end)
> -
> -	addq	$(VEC_SIZE * 4), %rdi
> -
> +	/* It would be possible to save some instructions using 4x VPCMP
> +	   but bottleneck on port 5 makes it not worth it.  */
> +	VPCMP	$4, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k1
> +	/* xor will set bytes that match esi to zero.  */
> +	vpxorq	(VEC_SIZE * 5)(%rdi), %YMMMATCH, %YMM2
> +	vpxorq	(VEC_SIZE * 6)(%rdi), %YMMMATCH, %YMM3
> +	VPCMP	$0, (VEC_SIZE * 7)(%rdi), %YMMMATCH, %k3
> +	/* Reduce VEC2 / VEC3 with min and VEC1 with zero mask.  */
> +	VPMINU	%YMM2, %YMM3, %YMM3 {%k1} {z}
> +	VPCMP	$0, %YMM3, %YZERO, %k2
>  # ifdef USE_AS_RAWMEMCHR
> -	jmp	L(loop_4x_vec)
> +	subq	$-(VEC_SIZE * 4), %rdi
> +	kortestd %k2, %k3
> +	jz	L(loop_4x_vec)
>  # else
> -	subq	$(VEC_SIZE * 4), %rdx
> +	kortestd %k2, %k3
> +	jnz	L(loop_4x_vec_end)
> +
> +	subq	$-(VEC_SIZE * 4), %rdi
> +
> +	subq	$(CHAR_PER_VEC * 4), %rdx
>  	ja	L(loop_4x_vec)
>  
> +	/* Fall through into less than 4 remaining vectors of length case.
> +	 */
> +	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
> +	kmovd	%k0, %eax
> +	addq	$(VEC_SIZE * 3), %rdi
> +	.p2align 4
>  L(last_4x_vec_or_less):
> -	/* Less than 4 * VEC and aligned to VEC_SIZE.  */
> -	addl	$(VEC_SIZE * 2), %edx
> -	jle	L(last_2x_vec)
> -
> -	VPCMP	$0, (%rdi), %YMMMATCH, %k1
> -	kmovd	%k1, %eax
> +	/* Check if first VEC contained match.  */
>  	testl	%eax, %eax
> -	jnz	L(first_vec_x0)
> +	jnz	L(first_vec_x1_check)
>  
> -	VPCMP	$0, VEC_SIZE(%rdi), %YMMMATCH, %k1
> -	kmovd	%k1, %eax
> -	testl	%eax, %eax
> -	jnz	L(first_vec_x1)
> +	/* If remaining length > CHAR_PER_VEC * 2.  */
> +	addl	$(CHAR_PER_VEC * 2), %edx
> +	jg	L(last_4x_vec)
>  
> -	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
> -	kmovd	%k1, %eax
> -	testl	%eax, %eax
> +L(last_2x_vec):
> +	/* If remaining length < CHAR_PER_VEC.  */
> +	addl	$CHAR_PER_VEC, %edx
> +	jle	L(zero_end)
>  
> -	jnz	L(first_vec_x2_check)
> -	subl	$VEC_SIZE, %edx
> -	jle	L(zero)
> +	/* Check VEC2 and compare any match with remaining length.  */
> +	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
> +	kmovd	%k0, %eax
> +	tzcntl	%eax, %eax
> +	cmpl	%eax, %edx
> +	jbe	L(set_zero_end)
> +	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> +L(zero_end):
> +	ret
>  
> -	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
> -	kmovd	%k1, %eax
> -	testl	%eax, %eax
>  
> -	jnz	L(first_vec_x3_check)
> +	.p2align 4
> +L(first_vec_x1_check):
> +	tzcntl	%eax, %eax
> +	/* Adjust length.  */
> +	subl	$-(CHAR_PER_VEC * 4), %edx
> +	/* Check if match within remaining length.  */
> +	cmpl	%eax, %edx
> +	jbe	L(set_zero_end)
> +	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
> +	leaq	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
> +	ret
> +L(set_zero_end):
>  	xorl	%eax, %eax
>  	ret
>  
>  	.p2align 4
> -L(last_2x_vec):
> -	addl	$(VEC_SIZE * 2), %edx
> -	VPCMP	$0, (%rdi), %YMMMATCH, %k1
> +L(loop_4x_vec_end):
> +# endif
> +	/* rawmemchr will fall through into this if match was found in
> +	   loop.  */
> +
> +	/* k1 has the NOT of matches with VEC1.  */
>  	kmovd	%k1, %eax
> -	testl	%eax, %eax
> +# ifdef USE_AS_WMEMCHR
> +	subl	$((1 << CHAR_PER_VEC) - 1), %eax
> +# else
> +	incl	%eax
> +# endif
> +	jnz	L(last_vec_x1_return)
>  
> -	jnz	L(first_vec_x0_check)
> -	subl	$VEC_SIZE, %edx
> -	jle	L(zero)
> +	VPCMP	$0, %YMM2, %YZERO, %k0
> +	kmovd	%k0, %eax
> +	testl	%eax, %eax
> +	jnz	L(last_vec_x2_return)
>  
> -	VPCMP	$0, VEC_SIZE(%rdi), %YMMMATCH, %k1
> -	kmovd	%k1, %eax
> +	kmovd	%k2, %eax
>  	testl	%eax, %eax
> -	jnz	L(first_vec_x1_check)
> -	xorl	%eax, %eax
> -	ret
> +	jnz	L(last_vec_x3_return)
>  
> -	.p2align 4
> -L(first_vec_x0_check):
> +	kmovd	%k3, %eax
>  	tzcntl	%eax, %eax
> -# ifdef USE_AS_WMEMCHR
> -	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
> -	sall	$2, %eax
> +# ifdef USE_AS_RAWMEMCHR
> +	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
> +# else
> +	leaq	(VEC_SIZE * 7)(%rdi, %rax, CHAR_SIZE), %rax
>  # endif
> -	/* Check the end of data.  */
> -	cmpq	%rax, %rdx
> -	jbe	L(zero)
> -	addq	%rdi, %rax
>  	ret
>  
>  	.p2align 4
> -L(first_vec_x1_check):
> +L(last_vec_x1_return):
>  	tzcntl	%eax, %eax
> -# ifdef USE_AS_WMEMCHR
> -	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
> -	sall	$2, %eax
> -# endif
> -	/* Check the end of data.  */
> -	cmpq	%rax, %rdx
> -	jbe	L(zero)
> -	addq	$VEC_SIZE, %rax
> +# ifdef USE_AS_RAWMEMCHR
> +#  ifdef USE_AS_WMEMCHR
> +	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
> +	leaq	(%rdi, %rax, CHAR_SIZE), %rax
> +#  else
>  	addq	%rdi, %rax
> -	ret
> -
> -	.p2align 4
> -L(first_vec_x2_check):
> -	tzcntl	%eax, %eax
> -# ifdef USE_AS_WMEMCHR
> -	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
> -	sall	$2, %eax
> +#  endif
> +# else
> +	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
> +	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
>  # endif
> -	/* Check the end of data.  */
> -	cmpq	%rax, %rdx
> -	jbe	L(zero)
> -	addq	$(VEC_SIZE * 2), %rax
> -	addq	%rdi, %rax
>  	ret
>  
>  	.p2align 4
> -L(first_vec_x3_check):
> +L(last_vec_x2_return):
>  	tzcntl	%eax, %eax
> -# ifdef USE_AS_WMEMCHR
> -	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
> -	sall	$2, %eax
> +# ifdef USE_AS_RAWMEMCHR
> +	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
> +	leaq	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
> +# else
> +	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
> +	leaq	(VEC_SIZE * 5)(%rdi, %rax, CHAR_SIZE), %rax
>  # endif
> -	/* Check the end of data.  */
> -	cmpq	%rax, %rdx
> -	jbe	L(zero)
> -	addq	$(VEC_SIZE * 3), %rax
> -	addq	%rdi, %rax
>  	ret
>  
>  	.p2align 4
> -L(zero):
> -	xorl	%eax, %eax
> -	ret
> -# endif
> -
> -	.p2align 4
> -L(first_vec_x0):
> +L(last_vec_x3_return):
>  	tzcntl	%eax, %eax
> -# ifdef USE_AS_WMEMCHR
> -	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
> -	leaq	(%rdi, %rax, 4), %rax
> +# ifdef USE_AS_RAWMEMCHR
> +	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
> +	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
>  # else
> -	addq	%rdi, %rax
> +	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
> +	leaq	(VEC_SIZE * 6)(%rdi, %rax, CHAR_SIZE), %rax
>  # endif
>  	ret
>  
> +
> +# ifndef USE_AS_RAWMEMCHR
> +L(last_4x_vec_or_less_cmpeq):
> +	VPCMP	$0, (VEC_SIZE * 5)(%rdi), %YMMMATCH, %k0
> +	kmovd	%k0, %eax
> +	subq	$-(VEC_SIZE * 4), %rdi
> +	/* Check first VEC regardless.  */
> +	testl	%eax, %eax
> +	jnz	L(first_vec_x1_check)
> +
> +	/* If remaining length <= CHAR_PER_VEC * 2.  */
> +	addl	$(CHAR_PER_VEC * 2), %edx
> +	jle	L(last_2x_vec)
> +
>  	.p2align 4
> -L(first_vec_x1):
> +L(last_4x_vec):
> +	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
> +	kmovd	%k0, %eax
> +	testl	%eax, %eax
> +	jnz	L(last_vec_x2)
> +
> +
> +	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
> +	kmovd	%k0, %eax
> +	/* Create mask for possible matches within remaining length.  */
> +#  ifdef USE_AS_WMEMCHR
> +	movl	$((1 << (CHAR_PER_VEC * 2)) - 1), %ecx
> +	bzhil	%edx, %ecx, %ecx
> +#  else
> +	movq	$-1, %rcx
> +	bzhiq	%rdx, %rcx, %rcx
> +#  endif
> +	/* Test matches in data against length match.  */
> +	andl	%ecx, %eax
> +	jnz	L(last_vec_x3)
> +
> +	/* if remaining length <= CHAR_PER_VEC * 3 (Note this is after
> +	   remaining length was found to be > CHAR_PER_VEC * 2.  */
> +	subl	$CHAR_PER_VEC, %edx
> +	jbe	L(zero_end2)
> +
> +
> +	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
> +	kmovd	%k0, %eax
> +	/* Shift remaining length mask for last VEC.  */
> +#  ifdef USE_AS_WMEMCHR
> +	shrl	$CHAR_PER_VEC, %ecx
> +#  else
> +	shrq	$CHAR_PER_VEC, %rcx
> +#  endif
> +	andl	%ecx, %eax
> +	jz	L(zero_end2)
>  	tzcntl	%eax, %eax
> -# ifdef USE_AS_WMEMCHR
> -	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
> -	leaq	VEC_SIZE(%rdi, %rax, 4), %rax
> -# else
> -	addq	$VEC_SIZE, %rax
> -	addq	%rdi, %rax
> -# endif
> +	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
> +L(zero_end2):
>  	ret
>  
> -	.p2align 4
> -L(first_vec_x2):
> +L(last_vec_x2):
>  	tzcntl	%eax, %eax
> -# ifdef USE_AS_WMEMCHR
> -	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
> -	leaq	(VEC_SIZE * 2)(%rdi, %rax, 4), %rax
> -# else
> -	addq	$(VEC_SIZE * 2), %rax
> -	addq	%rdi, %rax
> -# endif
> +	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
>  	ret
>  
>  	.p2align 4
> -L(4x_vec_end):
> -	kmovd	%k1, %eax
> -	testl	%eax, %eax
> -	jnz	L(first_vec_x0)
> -	kmovd	%k2, %eax
> -	testl	%eax, %eax
> -	jnz	L(first_vec_x1)
> -	kmovd	%k3, %eax
> -	testl	%eax, %eax
> -	jnz	L(first_vec_x2)
> -	kmovd	%k4, %eax
> -	testl	%eax, %eax
> -L(first_vec_x3):
> +L(last_vec_x3):
>  	tzcntl	%eax, %eax
> -# ifdef USE_AS_WMEMCHR
> -	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
> -	leaq	(VEC_SIZE * 3)(%rdi, %rax, 4), %rax
> -# else
> -	addq	$(VEC_SIZE * 3), %rax
> -	addq	%rdi, %rax
> -# endif
> +	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
>  	ret
> +# endif
>  
>  END (MEMCHR)
>  #endif
> -- 
> 2.29.2
> 
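
The "increasing ILP in the 4x loop" part of this patch is the most subtle
piece: instead of four VPCMP results that all compete for mask-register
ports, VEC2 and VEC3 are compared by XOR (equal bytes become zero), reduced
with an unsigned minimum, and the zero-mask from the VEC1 not-equal compare
clears any lane where VEC1 already matched, so a single kortestd covers all
four vectors. Per lane, the logic is roughly this C sketch (illustrative
only; the names are invented and it models one byte lane, not the vector
code):

  /* c is the broadcast search char, v1..v4 the bytes of the four vectors
     at the same lane.  Returns non-zero if any of them matches.  */
  static inline int
  lane_has_match (unsigned char c, unsigned char v1, unsigned char v2,
                  unsigned char v3, unsigned char v4)
  {
    unsigned char x2 = v2 ^ c;                  /* vpxorq: 0 iff v2 == c  */
    unsigned char x3 = v3 ^ c;                  /* vpxorq: 0 iff v3 == c  */
    /* VPMINU %YMM2, %YMM3, %YMM3 {%k1}{z}: keep min(x2, x3) where v1 did
       not match, force the lane to 0 where it did.  */
    unsigned char m = (v1 != c) ? (x2 < x3 ? x2 : x3) : 0;
    /* k2 tests m == 0, k3 tests v4 == c; kortestd %k2, %k3 ORs them.  */
    return m == 0 || v4 == c;
  }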

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v2 2/3] x86: Optimize memchr-avx2.S
  2021-05-03 22:25     ` [PATCH v2 2/3] x86: Optimize memchr-avx2.S H.J. Lu
@ 2021-05-03 22:58       ` Noah Goldstein
  0 siblings, 0 replies; 20+ messages in thread
From: Noah Goldstein @ 2021-05-03 22:58 UTC (permalink / raw)
  To: H.J. Lu; +Cc: GNU C Library, Carlos O'Donell

On Mon, May 3, 2021 at 6:25 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Mon, May 03, 2021 at 04:06:54PM -0400, Noah Goldstein wrote:
> > No bug. This commit optimizes memchr-avx2.S. The optimizations include
> > replacing some branches with cmovcc, avoiding some branches entirely
> > in the less_4x_vec case, making the page cross logic less strict,
> > and saving a few instructions in the loop return path. test-memchr,
> > test-rawmemchr, and test-wmemchr are all passing.
> >
> > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
> > ---
> >  sysdeps/x86_64/multiarch/memchr-avx2.S | 426 ++++++++++++++-----------
> >  1 file changed, 247 insertions(+), 179 deletions(-)
> >
> > diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
> > index 1fcb1c350f..8b862fb9d1 100644
> > --- a/sysdeps/x86_64/multiarch/memchr-avx2.S
> > +++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
> > @@ -26,8 +26,22 @@
> >
> >  # ifdef USE_AS_WMEMCHR
> >  #  define VPCMPEQ    vpcmpeqd
> > +#  define VPBROADCAST        vpbroadcastd
> > +#  define CHAR_SIZE  4
> >  # else
> >  #  define VPCMPEQ    vpcmpeqb
> > +#  define VPBROADCAST        vpbroadcastb
> > +#  define CHAR_SIZE  1
> > +# endif
> > +
> > +# ifdef USE_AS_RAWMEMCHR
> > +#  define ERAW_PTR_REG       ecx
> > +#  define RRAW_PTR_REG       rcx
> > +#  define ALGN_PTR_REG       rdi
> > +# else
> > +#  define ERAW_PTR_REG       edi
> > +#  define RRAW_PTR_REG       rdi
> > +#  define ALGN_PTR_REG       rcx
> >  # endif
> >
> >  # ifndef VZEROUPPER
> > @@ -39,6 +53,7 @@
> >  # endif
> >
> >  # define VEC_SIZE 32
> > +# define PAGE_SIZE 4096
> >
> >       .section SECTION(.text),"ax",@progbits
> >  ENTRY (MEMCHR)
> > @@ -47,295 +62,348 @@ ENTRY (MEMCHR)
> >       test    %RDX_LP, %RDX_LP
> >       jz      L(null)
> >  # endif
> > -     movl    %edi, %ecx
> > -     /* Broadcast CHAR to YMM0.  */
> > -     vmovd   %esi, %xmm0
> >  # ifdef USE_AS_WMEMCHR
> >       shl     $2, %RDX_LP
> > -     vpbroadcastd %xmm0, %ymm0
> >  # else
> >  #  ifdef __ILP32__
> >       /* Clear the upper 32 bits.  */
> >       movl    %edx, %edx
> >  #  endif
> > -     vpbroadcastb %xmm0, %ymm0
> >  # endif
> > +     /* Broadcast CHAR to YMMMATCH.  */
> > +     vmovd   %esi, %xmm0
> > +     VPBROADCAST %xmm0, %ymm0
> >       /* Check if we may cross page boundary with one vector load.  */
> > -     andl    $(2 * VEC_SIZE - 1), %ecx
> > -     cmpl    $VEC_SIZE, %ecx
> > -     ja      L(cros_page_boundary)
> > +     movl    %edi, %eax
> > +     andl    $(PAGE_SIZE - 1), %eax
> > +     cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
> > +     ja      L(cross_page_boundary)
> >
> >       /* Check the first VEC_SIZE bytes.  */
> > -     VPCMPEQ (%rdi), %ymm0, %ymm1
> > +     VPCMPEQ (%rdi), %ymm0, %ymm1
> >       vpmovmskb %ymm1, %eax
> > -     testl   %eax, %eax
> > -
> >  # ifndef USE_AS_RAWMEMCHR
> > -     jnz     L(first_vec_x0_check)
> > -     /* Adjust length and check the end of data.  */
> > -     subq    $VEC_SIZE, %rdx
> > -     jbe     L(zero)
> > -# else
> > -     jnz     L(first_vec_x0)
> > +     /* If length < CHAR_PER_VEC handle special.  */
> > +     cmpq    $VEC_SIZE, %rdx
> > +     jbe     L(first_vec_x0)
> >  # endif
> > -
> > -     /* Align data for aligned loads in the loop.  */
> > -     addq    $VEC_SIZE, %rdi
> > -     andl    $(VEC_SIZE - 1), %ecx
> > -     andq    $-VEC_SIZE, %rdi
> > +     testl   %eax, %eax
> > +     jz      L(aligned_more)
> > +     tzcntl  %eax, %eax
> > +     addq    %rdi, %rax
> > +     VZEROUPPER_RETURN
> >
> >  # ifndef USE_AS_RAWMEMCHR
> > -     /* Adjust length.  */
> > -     addq    %rcx, %rdx
> > -
> > -     subq    $(VEC_SIZE * 4), %rdx
> > -     jbe     L(last_4x_vec_or_less)
> > +     .p2align 5
> > +L(first_vec_x0):
> > +     /* Check if first match was before length.  */
> > +     tzcntl  %eax, %eax
> > +     xorl    %ecx, %ecx
> > +     cmpl    %eax, %edx
> > +     leaq    (%rdi, %rax), %rax
> > +     cmovle  %rcx, %rax
> > +     VZEROUPPER_RETURN
>
> Please add a blank line here to indicate this begins a new block.

Done.

> OK with this change.  You should be able to push it yourself now.

Will do. Thanks!


>
> Thanks.
>
> > +L(null):
> > +     xorl    %eax, %eax
> > +     ret
> >  # endif
> > -     jmp     L(more_4x_vec)
> > -
> >       .p2align 4
> > -L(cros_page_boundary):
> > -     andl    $(VEC_SIZE - 1), %ecx
> > -     andq    $-VEC_SIZE, %rdi
> > -     VPCMPEQ (%rdi), %ymm0, %ymm1
> > +L(cross_page_boundary):
> > +     /* Save pointer before aligning as its original value is necessary
> > +        for computing the return address if byte is found or adjusting length
> > +        if it is not and this is memchr.  */
> > +     movq    %rdi, %rcx
> > +     /* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr and
> > +        rdi for rawmemchr.  */
> > +     orq     $(VEC_SIZE - 1), %ALGN_PTR_REG
> > +     VPCMPEQ -(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1
> >       vpmovmskb %ymm1, %eax
> > +# ifndef USE_AS_RAWMEMCHR
> > +     /* Calculate length until end of page (length checked for a
> > +        match).  */
> > +     leaq    1(%ALGN_PTR_REG), %rsi
> > +     subq    %RRAW_PTR_REG, %rsi
> > +# endif
> >       /* Remove the leading bytes.  */
> > -     sarl    %cl, %eax
> > -     testl   %eax, %eax
> > -     jz      L(aligned_more)
> > -     tzcntl  %eax, %eax
> > +     sarxl   %ERAW_PTR_REG, %eax, %eax
> >  # ifndef USE_AS_RAWMEMCHR
> >       /* Check the end of data.  */
> > -     cmpq    %rax, %rdx
> > -     jbe     L(zero)
> > +     cmpq    %rsi, %rdx
> > +     jbe     L(first_vec_x0)
> >  # endif
> > -     addq    %rdi, %rax
> > -     addq    %rcx, %rax
> > +     testl   %eax, %eax
> > +     jz      L(cross_page_continue)
> > +     tzcntl  %eax, %eax
> > +     addq    %RRAW_PTR_REG, %rax
> >  L(return_vzeroupper):
> >       ZERO_UPPER_VEC_REGISTERS_RETURN
> >
> >       .p2align 4
> > -L(aligned_more):
> > -# ifndef USE_AS_RAWMEMCHR
> > -        /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)"
> > -        instead of "(rdx + rcx) - VEC_SIZE" to void possible addition
> > -        overflow.  */
> > -     negq    %rcx
> > -     addq    $VEC_SIZE, %rcx
> > +L(first_vec_x1):
> > +     tzcntl  %eax, %eax
> > +     incq    %rdi
> > +     addq    %rdi, %rax
> > +     VZEROUPPER_RETURN
> >
> > -     /* Check the end of data.  */
> > -     subq    %rcx, %rdx
> > -     jbe     L(zero)
> > -# endif
> > +     .p2align 4
> > +L(first_vec_x2):
> > +     tzcntl  %eax, %eax
> > +     addq    $(VEC_SIZE + 1), %rdi
> > +     addq    %rdi, %rax
> > +     VZEROUPPER_RETURN
> >
> > -     addq    $VEC_SIZE, %rdi
> > +     .p2align 4
> > +L(first_vec_x3):
> > +     tzcntl  %eax, %eax
> > +     addq    $(VEC_SIZE * 2 + 1), %rdi
> > +     addq    %rdi, %rax
> > +     VZEROUPPER_RETURN
> >
> > -# ifndef USE_AS_RAWMEMCHR
> > -     subq    $(VEC_SIZE * 4), %rdx
> > -     jbe     L(last_4x_vec_or_less)
> > -# endif
> >
> > -L(more_4x_vec):
> > +     .p2align 4
> > +L(first_vec_x4):
> > +     tzcntl  %eax, %eax
> > +     addq    $(VEC_SIZE * 3 + 1), %rdi
> > +     addq    %rdi, %rax
> > +     VZEROUPPER_RETURN
> > +
> > +     .p2align 4
> > +L(aligned_more):
> >       /* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
> >          since data is only aligned to VEC_SIZE.  */
> > -     VPCMPEQ (%rdi), %ymm0, %ymm1
> > -     vpmovmskb %ymm1, %eax
> > -     testl   %eax, %eax
> > -     jnz     L(first_vec_x0)
> >
> > -     VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
> > +# ifndef USE_AS_RAWMEMCHR
> > +L(cross_page_continue):
> > +     /* Align data to VEC_SIZE - 1.  */
> > +     xorl    %ecx, %ecx
> > +     subl    %edi, %ecx
> > +     orq     $(VEC_SIZE - 1), %rdi
> > +     /* esi is for adjusting length to see if near the end.  */
> > +     leal    (VEC_SIZE * 4 + 1)(%rdi, %rcx), %esi
> > +# else
> > +     orq     $(VEC_SIZE - 1), %rdi
> > +L(cross_page_continue):
> > +# endif
> > +     /* Load first VEC regardless.  */
> > +     VPCMPEQ 1(%rdi), %ymm0, %ymm1
> >       vpmovmskb %ymm1, %eax
> > +# ifndef USE_AS_RAWMEMCHR
> > +     /* Adjust length. If near end handle specially.  */
> > +     subq    %rsi, %rdx
> > +     jbe     L(last_4x_vec_or_less)
> > +# endif
> >       testl   %eax, %eax
> >       jnz     L(first_vec_x1)
> >
> > -     VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
> > +     VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> >       vpmovmskb %ymm1, %eax
> >       testl   %eax, %eax
> >       jnz     L(first_vec_x2)
> >
> > -     VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
> > +     VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
> >       vpmovmskb %ymm1, %eax
> >       testl   %eax, %eax
> >       jnz     L(first_vec_x3)
> >
> > -     addq    $(VEC_SIZE * 4), %rdi
> > +     VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
> > +     vpmovmskb %ymm1, %eax
> > +     testl   %eax, %eax
> > +     jnz     L(first_vec_x4)
> >
> >  # ifndef USE_AS_RAWMEMCHR
> > +     /* Check if at last VEC_SIZE * 4 length.  */
> >       subq    $(VEC_SIZE * 4), %rdx
> > -     jbe     L(last_4x_vec_or_less)
> > -# endif
> > -
> > -     /* Align data to 4 * VEC_SIZE.  */
> > -     movq    %rdi, %rcx
> > -     andl    $(4 * VEC_SIZE - 1), %ecx
> > -     andq    $-(4 * VEC_SIZE), %rdi
> > -
> > -# ifndef USE_AS_RAWMEMCHR
> > -     /* Adjust length.  */
> > +     jbe     L(last_4x_vec_or_less_cmpeq)
> > +     /* Align data to VEC_SIZE * 4 - 1 for the loop and readjust
> > +        length.  */
> > +     incq    %rdi
> > +     movl    %edi, %ecx
> > +     orq     $(VEC_SIZE * 4 - 1), %rdi
> > +     andl    $(VEC_SIZE * 4 - 1), %ecx
> >       addq    %rcx, %rdx
> > +# else
> > +     /* Align data to VEC_SIZE * 4 - 1 for loop.  */
> > +     incq    %rdi
> > +     orq     $(VEC_SIZE * 4 - 1), %rdi
> >  # endif
> >
> > +     /* Compare 4 * VEC at a time forward.  */
> >       .p2align 4
> >  L(loop_4x_vec):
> > -     /* Compare 4 * VEC at a time forward.  */
> > -     VPCMPEQ (%rdi), %ymm0, %ymm1
> > -     VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm2
> > -     VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm3
> > -     VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm4
> > -
> > +     VPCMPEQ 1(%rdi), %ymm0, %ymm1
> > +     VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm2
> > +     VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm3
> > +     VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm4
> >       vpor    %ymm1, %ymm2, %ymm5
> >       vpor    %ymm3, %ymm4, %ymm6
> >       vpor    %ymm5, %ymm6, %ymm5
> >
> > -     vpmovmskb %ymm5, %eax
> > -     testl   %eax, %eax
> > -     jnz     L(4x_vec_end)
> > -
> > -     addq    $(VEC_SIZE * 4), %rdi
> > -
> > +     vpmovmskb %ymm5, %ecx
> >  # ifdef USE_AS_RAWMEMCHR
> > -     jmp     L(loop_4x_vec)
> > +     subq    $-(VEC_SIZE * 4), %rdi
> > +     testl   %ecx, %ecx
> > +     jz      L(loop_4x_vec)
> >  # else
> > -     subq    $(VEC_SIZE * 4), %rdx
> > -     ja      L(loop_4x_vec)
> > +     testl   %ecx, %ecx
> > +     jnz     L(loop_4x_vec_end)
> >
> > -L(last_4x_vec_or_less):
> > -     /* Less than 4 * VEC and aligned to VEC_SIZE.  */
> > -     addl    $(VEC_SIZE * 2), %edx
> > -     jle     L(last_2x_vec)
> > +     subq    $-(VEC_SIZE * 4), %rdi
> >
> > -     VPCMPEQ (%rdi), %ymm0, %ymm1
> > -     vpmovmskb %ymm1, %eax
> > -     testl   %eax, %eax
> > -     jnz     L(first_vec_x0)
> > +     subq    $(VEC_SIZE * 4), %rdx
> > +     ja      L(loop_4x_vec)
> >
> > -     VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
> > +     /* Fall through into less than 4 remaining vectors of length case.
> > +      */
> > +     VPCMPEQ (VEC_SIZE * 0 + 1)(%rdi), %ymm0, %ymm1
> >       vpmovmskb %ymm1, %eax
> > +     .p2align 4
> > +L(last_4x_vec_or_less):
> > +     /* Check if first VEC contained match.  */
> >       testl   %eax, %eax
> > -     jnz     L(first_vec_x1)
> > +     jnz     L(first_vec_x1_check)
> >
> > -     VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
> > -     vpmovmskb %ymm1, %eax
> > -     testl   %eax, %eax
> > +     /* If remaining length > VEC_SIZE * 2.  */
> > +     addl    $(VEC_SIZE * 2), %edx
> > +     jg      L(last_4x_vec)
> >
> > -     jnz     L(first_vec_x2_check)
> > -     subl    $VEC_SIZE, %edx
> > -     jle     L(zero)
> > +L(last_2x_vec):
> > +     /* If remaining length < VEC_SIZE.  */
> > +     addl    $VEC_SIZE, %edx
> > +     jle     L(zero_end)
> >
> > -     VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
> > +     /* Check VEC2 and compare any match with remaining length.  */
> > +     VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> >       vpmovmskb %ymm1, %eax
> > -     testl   %eax, %eax
> > -
> > -     jnz     L(first_vec_x3_check)
> > -     xorl    %eax, %eax
> > +     tzcntl  %eax, %eax
> > +     cmpl    %eax, %edx
> > +     jbe     L(set_zero_end)
> > +     addq    $(VEC_SIZE + 1), %rdi
> > +     addq    %rdi, %rax
> > +L(zero_end):
> >       VZEROUPPER_RETURN
> >
> >       .p2align 4
> > -L(last_2x_vec):
> > -     addl    $(VEC_SIZE * 2), %edx
> > -     VPCMPEQ (%rdi), %ymm0, %ymm1
> > +L(loop_4x_vec_end):
> > +# endif
> > +     /* rawmemchr will fall through into this if match was found in
> > +        loop.  */
> > +
> >       vpmovmskb %ymm1, %eax
> >       testl   %eax, %eax
> > +     jnz     L(last_vec_x1_return)
> >
> > -     jnz     L(first_vec_x0_check)
> > -     subl    $VEC_SIZE, %edx
> > -     jle     L(zero)
> > -
> > -     VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
> > -     vpmovmskb %ymm1, %eax
> > +     vpmovmskb %ymm2, %eax
> >       testl   %eax, %eax
> > -     jnz     L(first_vec_x1_check)
> > -     xorl    %eax, %eax
> > -     VZEROUPPER_RETURN
> > +     jnz     L(last_vec_x2_return)
> >
> > -     .p2align 4
> > -L(first_vec_x0_check):
> > -     tzcntl  %eax, %eax
> > -     /* Check the end of data.  */
> > -     cmpq    %rax, %rdx
> > -     jbe     L(zero)
> > +     vpmovmskb %ymm3, %eax
> > +     /* Combine VEC3 matches (eax) with VEC4 matches (ecx).  */
> > +     salq    $32, %rcx
> > +     orq     %rcx, %rax
> > +     tzcntq  %rax, %rax
> > +# ifdef USE_AS_RAWMEMCHR
> > +     subq    $(VEC_SIZE * 2 - 1), %rdi
> > +# else
> > +     subq    $-(VEC_SIZE * 2 + 1), %rdi
> > +# endif
> >       addq    %rdi, %rax
> >       VZEROUPPER_RETURN
> > +# ifndef USE_AS_RAWMEMCHR
> >
> >       .p2align 4
> >  L(first_vec_x1_check):
> >       tzcntl  %eax, %eax
> > -     /* Check the end of data.  */
> > -     cmpq    %rax, %rdx
> > -     jbe     L(zero)
> > -     addq    $VEC_SIZE, %rax
> > +     /* Adjust length.  */
> > +     subl    $-(VEC_SIZE * 4), %edx
> > +     /* Check if match within remaining length.  */
> > +     cmpl    %eax, %edx
> > +     jbe     L(set_zero_end)
> > +     incq    %rdi
> >       addq    %rdi, %rax
> >       VZEROUPPER_RETURN
> > +     .p2align 4
> > +L(set_zero_end):
> > +     xorl    %eax, %eax
> > +     VZEROUPPER_RETURN
> > +# endif
> >
> >       .p2align 4
> > -L(first_vec_x2_check):
> > +L(last_vec_x1_return):
> >       tzcntl  %eax, %eax
> > -     /* Check the end of data.  */
> > -     cmpq    %rax, %rdx
> > -     jbe     L(zero)
> > -     addq    $(VEC_SIZE * 2), %rax
> > +# ifdef USE_AS_RAWMEMCHR
> > +     subq    $(VEC_SIZE * 4 - 1), %rdi
> > +# else
> > +     incq    %rdi
> > +# endif
> >       addq    %rdi, %rax
> >       VZEROUPPER_RETURN
> >
> >       .p2align 4
> > -L(first_vec_x3_check):
> > +L(last_vec_x2_return):
> >       tzcntl  %eax, %eax
> > -     /* Check the end of data.  */
> > -     cmpq    %rax, %rdx
> > -     jbe     L(zero)
> > -     addq    $(VEC_SIZE * 3), %rax
> > +# ifdef USE_AS_RAWMEMCHR
> > +     subq    $(VEC_SIZE * 3 - 1), %rdi
> > +# else
> > +     subq    $-(VEC_SIZE + 1), %rdi
> > +# endif
> >       addq    %rdi, %rax
> >       VZEROUPPER_RETURN
> >
> > +# ifndef USE_AS_RAWMEMCHR
> >       .p2align 4
> > -L(zero):
> > -     xorl    %eax, %eax
> > -     jmp     L(return_vzeroupper)
> > +L(last_4x_vec_or_less_cmpeq):
> > +     VPCMPEQ (VEC_SIZE * 4 + 1)(%rdi), %ymm0, %ymm1
> > +     vpmovmskb %ymm1, %eax
> > +     subq    $-(VEC_SIZE * 4), %rdi
> > +     /* Check first VEC regardless.  */
> > +     testl   %eax, %eax
> > +     jnz     L(first_vec_x1_check)
> >
> > +     /* If remaining length <= CHAR_PER_VEC * 2.  */
> > +     addl    $(VEC_SIZE * 2), %edx
> > +     jle     L(last_2x_vec)
> >       .p2align 4
> > -L(null):
> > -     xorl    %eax, %eax
> > -     ret
> > -# endif
> > +L(last_4x_vec):
> > +     VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> > +     vpmovmskb %ymm1, %eax
> > +     testl   %eax, %eax
> > +     jnz     L(last_vec_x2_return)
> >
> > -     .p2align 4
> > -L(first_vec_x0):
> > -     tzcntl  %eax, %eax
> > -     addq    %rdi, %rax
> > -     VZEROUPPER_RETURN
> > +     VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
> > +     vpmovmskb %ymm1, %eax
> >
> > -     .p2align 4
> > -L(first_vec_x1):
> > -     tzcntl  %eax, %eax
> > -     addq    $VEC_SIZE, %rax
> > -     addq    %rdi, %rax
> > -     VZEROUPPER_RETURN
> > +     /* Create mask for possible matches within remaining length.  */
> > +     movq    $-1, %rcx
> > +     bzhiq   %rdx, %rcx, %rcx
> >
> > -     .p2align 4
> > -L(first_vec_x2):
> > +     /* Test matches in data against length match.  */
> > +     andl    %ecx, %eax
> > +     jnz     L(last_vec_x3)
> > +
> > +     /* if remaining length <= VEC_SIZE * 3 (Note this is after
> > +        remaining length was found to be > VEC_SIZE * 2.  */
> > +     subl    $VEC_SIZE, %edx
> > +     jbe     L(zero_end2)
> > +
> > +     VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
> > +     vpmovmskb %ymm1, %eax
> > +     /* Shift remaining length mask for last VEC.  */
> > +     shrq    $32, %rcx
> > +     andl    %ecx, %eax
> > +     jz      L(zero_end2)
> >       tzcntl  %eax, %eax
> > -     addq    $(VEC_SIZE * 2), %rax
> > +     addq    $(VEC_SIZE * 3 + 1), %rdi
> >       addq    %rdi, %rax
> > +L(zero_end2):
> >       VZEROUPPER_RETURN
> >
> >       .p2align 4
> > -L(4x_vec_end):
> > -     vpmovmskb %ymm1, %eax
> > -     testl   %eax, %eax
> > -     jnz     L(first_vec_x0)
> > -     vpmovmskb %ymm2, %eax
> > -     testl   %eax, %eax
> > -     jnz     L(first_vec_x1)
> > -     vpmovmskb %ymm3, %eax
> > -     testl   %eax, %eax
> > -     jnz     L(first_vec_x2)
> > -     vpmovmskb %ymm4, %eax
> > -     testl   %eax, %eax
> > -L(first_vec_x3):
> > +L(last_vec_x3):
> >       tzcntl  %eax, %eax
> > -     addq    $(VEC_SIZE * 3), %rax
> > +     subq    $-(VEC_SIZE * 2 + 1), %rdi
> >       addq    %rdi, %rax
> >       VZEROUPPER_RETURN
> > +# endif
> >
> >  END (MEMCHR)
> >  #endif
> > --
> > 2.29.2
> >

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v2 3/3] x86: Optimize memchr-evex.S
  2021-05-03 22:26       ` H.J. Lu
@ 2021-05-03 22:58         ` Noah Goldstein
  0 siblings, 0 replies; 20+ messages in thread
From: Noah Goldstein @ 2021-05-03 22:58 UTC (permalink / raw)
  To: H.J. Lu; +Cc: GNU C Library, Carlos O'Donell

On Mon, May 3, 2021 at 6:26 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Mon, May 03, 2021 at 04:06:55PM -0400, Noah Goldstein wrote:
> > No bug. This commit optimizes memchr-evex.S. The optimizations include
> > replacing some branches with cmovcc, avoiding some branches entirely
> > in the less_4x_vec case, making the page cross logic less strict,
> > saving some ALU in the alignment process, and most importantly
> > increasing ILP in the 4x loop. test-memchr, test-rawmemchr, and
> > test-wmemchr are all passing.
> >
> > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
> > ---
> >  sysdeps/x86_64/multiarch/memchr-evex.S | 547 +++++++++++++++----------
> >  1 file changed, 322 insertions(+), 225 deletions(-)
> >
> > diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S
> > index 6dd5d67b90..147d7aa8ee 100644
> > --- a/sysdeps/x86_64/multiarch/memchr-evex.S
> > +++ b/sysdeps/x86_64/multiarch/memchr-evex.S
> > @@ -26,14 +26,28 @@
> >
> >  # ifdef USE_AS_WMEMCHR
> >  #  define VPBROADCAST        vpbroadcastd
> > -#  define VPCMP              vpcmpd
> > -#  define SHIFT_REG  r8d
> > +#  define VPMINU     vpminud
> > +#  define VPCMP      vpcmpd
> > +#  define VPCMPEQ    vpcmpeqd
> > +#  define CHAR_SIZE  4
> >  # else
> >  #  define VPBROADCAST        vpbroadcastb
> > -#  define VPCMP              vpcmpb
> > -#  define SHIFT_REG  ecx
> > +#  define VPMINU     vpminub
> > +#  define VPCMP      vpcmpb
> > +#  define VPCMPEQ    vpcmpeqb
> > +#  define CHAR_SIZE  1
> >  # endif
> >
> > +# ifdef USE_AS_RAWMEMCHR
> > +#  define RAW_PTR_REG        rcx
> > +#  define ALGN_PTR_REG       rdi
> > +# else
> > +#  define RAW_PTR_REG        rdi
> > +#  define ALGN_PTR_REG       rcx
> > +# endif
> > +
> > +# define XZERO               xmm23
> > +# define YZERO               ymm23
>
> Please rename XZERO/YZERO to XMMZERO/YMMZERO.  OK with this change.

Done and thanks!

>
> Thanks.
>
> >  # define XMMMATCH    xmm16
> >  # define YMMMATCH    ymm16
> >  # define YMM1                ymm17
> > @@ -44,6 +58,8 @@
> >  # define YMM6                ymm22
> >
> >  # define VEC_SIZE 32
> > +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
> > +# define PAGE_SIZE 4096
> >
> >       .section .text.evex,"ax",@progbits
> >  ENTRY (MEMCHR)
> > @@ -51,11 +67,7 @@ ENTRY (MEMCHR)
> >       /* Check for zero length.  */
> >       test    %RDX_LP, %RDX_LP
> >       jz      L(zero)
> > -# endif
> > -     movl    %edi, %ecx
> > -# ifdef USE_AS_WMEMCHR
> > -     shl     $2, %RDX_LP
> > -# else
> > +
> >  #  ifdef __ILP32__
> >       /* Clear the upper 32 bits.  */
> >       movl    %edx, %edx
> > @@ -64,318 +76,403 @@ ENTRY (MEMCHR)
> >       /* Broadcast CHAR to YMMMATCH.  */
> >       VPBROADCAST %esi, %YMMMATCH
> >       /* Check if we may cross page boundary with one vector load.  */
> > -     andl    $(2 * VEC_SIZE - 1), %ecx
> > -     cmpl    $VEC_SIZE, %ecx
> > -     ja      L(cros_page_boundary)
> > +     movl    %edi, %eax
> > +     andl    $(PAGE_SIZE - 1), %eax
> > +     cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
> > +     ja      L(cross_page_boundary)
> >
> >       /* Check the first VEC_SIZE bytes.  */
> > -     VPCMP   $0, (%rdi), %YMMMATCH, %k1
> > -     kmovd   %k1, %eax
> > -     testl   %eax, %eax
> > -
> > +     VPCMP   $0, (%rdi), %YMMMATCH, %k0
> > +     kmovd   %k0, %eax
> >  # ifndef USE_AS_RAWMEMCHR
> > -     jnz     L(first_vec_x0_check)
> > -     /* Adjust length and check the end of data.  */
> > -     subq    $VEC_SIZE, %rdx
> > -     jbe     L(zero)
> > +     /* If length < CHAR_PER_VEC handle special.  */
> > +     cmpq    $CHAR_PER_VEC, %rdx
> > +     jbe     L(first_vec_x0)
> > +# endif
> > +     testl   %eax, %eax
> > +     jz      L(aligned_more)
> > +     tzcntl  %eax, %eax
> > +# ifdef USE_AS_WMEMCHR
> > +     /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
> > +     leaq    (%rdi, %rax, CHAR_SIZE), %rax
> >  # else
> > -     jnz     L(first_vec_x0)
> > +     addq    %rdi, %rax
> >  # endif
> > -
> > -     /* Align data for aligned loads in the loop.  */
> > -     addq    $VEC_SIZE, %rdi
> > -     andl    $(VEC_SIZE - 1), %ecx
> > -     andq    $-VEC_SIZE, %rdi
> > +     ret
> >
> >  # ifndef USE_AS_RAWMEMCHR
> > -     /* Adjust length.  */
> > -     addq    %rcx, %rdx
> > -
> > -     subq    $(VEC_SIZE * 4), %rdx
> > -     jbe     L(last_4x_vec_or_less)
> > -# endif
> > -     jmp     L(more_4x_vec)
> > +L(zero):
> > +     xorl    %eax, %eax
> > +     ret
> >
> > +     .p2align 5
> > +L(first_vec_x0):
> > +     /* Check if first match was before length.  */
> > +     tzcntl  %eax, %eax
> > +     xorl    %ecx, %ecx
> > +     cmpl    %eax, %edx
> > +     leaq    (%rdi, %rax, CHAR_SIZE), %rax
> > +     cmovle  %rcx, %rax
> > +     ret
> > +# else
> > +     /* NB: first_vec_x0 is 17 bytes which will leave
> > +        cross_page_boundary (which is relatively cold) close enough
> > +        to ideal alignment. So only realign L(cross_page_boundary) if
> > +        rawmemchr.  */
> >       .p2align 4
> > -L(cros_page_boundary):
> > -     andl    $(VEC_SIZE - 1), %ecx
> > +# endif
> > +L(cross_page_boundary):
> > +     /* Save pointer before aligning as its original value is
> > +        necessary for computing the return address if a byte is found
> > +        or for adjusting the length if it is not and this is memchr.  */
> > +     movq    %rdi, %rcx
> > +     /* Align data to VEC_SIZE. ALGN_PTR_REG is rcx for memchr and rdi
> > +        for rawmemchr.  */
> > +     andq    $-VEC_SIZE, %ALGN_PTR_REG
> > +     VPCMP   $0, (%ALGN_PTR_REG), %YMMMATCH, %k0
> > +     kmovd   %k0, %r8d
> >  # ifdef USE_AS_WMEMCHR
> > -     /* NB: Divide shift count by 4 since each bit in K1 represent 4
> > +     /* NB: Divide shift count by 4 since each bit in K0 represent 4
> >          bytes.  */
> > -     movl    %ecx, %SHIFT_REG
> > -     sarl    $2, %SHIFT_REG
> > +     sarl    $2, %eax
> > +# endif
> > +# ifndef USE_AS_RAWMEMCHR
> > +     movl    $(PAGE_SIZE / CHAR_SIZE), %esi
> > +     subl    %eax, %esi
> >  # endif
> > -     andq    $-VEC_SIZE, %rdi
> > -     VPCMP   $0, (%rdi), %YMMMATCH, %k1
> > -     kmovd   %k1, %eax
> > -     /* Remove the leading bytes.  */
> > -     sarxl   %SHIFT_REG, %eax, %eax
> > -     testl   %eax, %eax
> > -     jz      L(aligned_more)
> > -     tzcntl  %eax, %eax
> >  # ifdef USE_AS_WMEMCHR
> > -     /* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
> > -     sall    $2, %eax
> > +     andl    $(CHAR_PER_VEC - 1), %eax
> >  # endif
> > +     /* Remove the leading bytes.  */
> > +     sarxl   %eax, %r8d, %eax
> >  # ifndef USE_AS_RAWMEMCHR
> >       /* Check the end of data.  */
> > -     cmpq    %rax, %rdx
> > -     jbe     L(zero)
> > +     cmpq    %rsi, %rdx
> > +     jbe     L(first_vec_x0)
> > +# endif
> > +     testl   %eax, %eax
> > +     jz      L(cross_page_continue)
> > +     tzcntl  %eax, %eax
> > +# ifdef USE_AS_WMEMCHR
> > +     /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
> > +     leaq    (%RAW_PTR_REG, %rax, CHAR_SIZE), %rax
> > +# else
> > +     addq    %RAW_PTR_REG, %rax
> >  # endif
> > -     addq    %rdi, %rax
> > -     addq    %rcx, %rax
> >       ret
> >
> >       .p2align 4
> > -L(aligned_more):
> > -# ifndef USE_AS_RAWMEMCHR
> > -        /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)"
> > -        instead of "(rdx + rcx) - VEC_SIZE" to void possible addition
> > -        overflow.  */
> > -     negq    %rcx
> > -     addq    $VEC_SIZE, %rcx
> > +L(first_vec_x1):
> > +     tzcntl  %eax, %eax
> > +     leaq    VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
> > +     ret
> >
> > -     /* Check the end of data.  */
> > -     subq    %rcx, %rdx
> > -     jbe     L(zero)
> > -# endif
> > +     .p2align 4
> > +L(first_vec_x2):
> > +     tzcntl  %eax, %eax
> > +     leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> > +     ret
> >
> > -     addq    $VEC_SIZE, %rdi
> > +     .p2align 4
> > +L(first_vec_x3):
> > +     tzcntl  %eax, %eax
> > +     leaq    (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
> > +     ret
> >
> > -# ifndef USE_AS_RAWMEMCHR
> > -     subq    $(VEC_SIZE * 4), %rdx
> > -     jbe     L(last_4x_vec_or_less)
> > -# endif
> > +     .p2align 4
> > +L(first_vec_x4):
> > +     tzcntl  %eax, %eax
> > +     leaq    (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
> > +     ret
> >
> > -L(more_4x_vec):
> > +     .p2align 5
> > +L(aligned_more):
> >       /* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
> >          since data is only aligned to VEC_SIZE.  */
> > -     VPCMP   $0, (%rdi), %YMMMATCH, %k1
> > -     kmovd   %k1, %eax
> > -     testl   %eax, %eax
> > -     jnz     L(first_vec_x0)
> >
> > -     VPCMP   $0, VEC_SIZE(%rdi), %YMMMATCH, %k1
> > -     kmovd   %k1, %eax
> > +# ifndef USE_AS_RAWMEMCHR
> > +     /* Align data to VEC_SIZE.  */
> > +L(cross_page_continue):
> > +     xorl    %ecx, %ecx
> > +     subl    %edi, %ecx
> > +     andq    $-VEC_SIZE, %rdi
> > +     /* esi is for adjusting length to see if near the end.  */
> > +     leal    (VEC_SIZE * 5)(%rdi, %rcx), %esi
> > +#  ifdef USE_AS_WMEMCHR
> > +     /* NB: Divide bytes by 4 to get the wchar_t count.  */
> > +     sarl    $2, %esi
> > +#  endif
> > +# else
> > +     andq    $-VEC_SIZE, %rdi
> > +L(cross_page_continue):
> > +# endif
> > +     /* Load first VEC regardless.  */
> > +     VPCMP   $0, (VEC_SIZE)(%rdi), %YMMMATCH, %k0
> > +     kmovd   %k0, %eax
> > +# ifndef USE_AS_RAWMEMCHR
> > +     /* Adjust length. If near end handle specially.  */
> > +     subq    %rsi, %rdx
> > +     jbe     L(last_4x_vec_or_less)
> > +# endif
> >       testl   %eax, %eax
> >       jnz     L(first_vec_x1)
> >
> > -     VPCMP   $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
> > -     kmovd   %k1, %eax
> > +     VPCMP   $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
> > +     kmovd   %k0, %eax
> >       testl   %eax, %eax
> >       jnz     L(first_vec_x2)
> >
> > -     VPCMP   $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
> > -     kmovd   %k1, %eax
> > +     VPCMP   $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
> > +     kmovd   %k0, %eax
> >       testl   %eax, %eax
> >       jnz     L(first_vec_x3)
> >
> > -     addq    $(VEC_SIZE * 4), %rdi
> > +     VPCMP   $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
> > +     kmovd   %k0, %eax
> > +     testl   %eax, %eax
> > +     jnz     L(first_vec_x4)
> > +
> >
> >  # ifndef USE_AS_RAWMEMCHR
> > -     subq    $(VEC_SIZE * 4), %rdx
> > -     jbe     L(last_4x_vec_or_less)
> > -# endif
> > +     /* Check if at last CHAR_PER_VEC * 4 length.  */
> > +     subq    $(CHAR_PER_VEC * 4), %rdx
> > +     jbe     L(last_4x_vec_or_less_cmpeq)
> > +     addq    $VEC_SIZE, %rdi
> >
> > -     /* Align data to 4 * VEC_SIZE.  */
> > -     movq    %rdi, %rcx
> > -     andl    $(4 * VEC_SIZE - 1), %ecx
> > +     /* Align data to VEC_SIZE * 4 for the loop and readjust length.
> > +      */
> > +#  ifdef USE_AS_WMEMCHR
> > +     movl    %edi, %ecx
> >       andq    $-(4 * VEC_SIZE), %rdi
> > -
> > -# ifndef USE_AS_RAWMEMCHR
> > -     /* Adjust length.  */
> > +     andl    $(VEC_SIZE * 4 - 1), %ecx
> > +     /* NB: Divide bytes by 4 to get the wchar_t count.  */
> > +     sarl    $2, %ecx
> >       addq    %rcx, %rdx
> > +#  else
> > +     addq    %rdi, %rdx
> > +     andq    $-(4 * VEC_SIZE), %rdi
> > +     subq    %rdi, %rdx
> > +#  endif
> > +# else
> > +     addq    $VEC_SIZE, %rdi
> > +     andq    $-(4 * VEC_SIZE), %rdi
> >  # endif
> >
> > +     vpxorq  %XZERO, %XZERO, %XZERO
> > +
> > +     /* Compare 4 * VEC at a time forward.  */
> >       .p2align 4
> >  L(loop_4x_vec):
> > -     /* Compare 4 * VEC at a time forward.  */
> > -     VPCMP   $0, (%rdi), %YMMMATCH, %k1
> > -     VPCMP   $0, VEC_SIZE(%rdi), %YMMMATCH, %k2
> > -     kord    %k1, %k2, %k5
> > -     VPCMP   $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3
> > -     VPCMP   $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4
> > -
> > -     kord    %k3, %k4, %k6
> > -     kortestd %k5, %k6
> > -     jnz     L(4x_vec_end)
> > -
> > -     addq    $(VEC_SIZE * 4), %rdi
> > -
> > +     /* It would be possible to save some instructions using 4x VPCMP
> > +        but the bottleneck on port 5 makes it not worth it.  */
> > +     VPCMP   $4, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k1
> > +     /* xor will set bytes that match esi to zero.  */
> > +     vpxorq  (VEC_SIZE * 5)(%rdi), %YMMMATCH, %YMM2
> > +     vpxorq  (VEC_SIZE * 6)(%rdi), %YMMMATCH, %YMM3
> > +     VPCMP   $0, (VEC_SIZE * 7)(%rdi), %YMMMATCH, %k3
> > +     /* Reduce VEC2 / VEC3 with min and VEC1 with zero mask.  */
> > +     VPMINU  %YMM2, %YMM3, %YMM3 {%k1} {z}
> > +     VPCMP   $0, %YMM3, %YZERO, %k2
> >  # ifdef USE_AS_RAWMEMCHR
> > -     jmp     L(loop_4x_vec)
> > +     subq    $-(VEC_SIZE * 4), %rdi
> > +     kortestd %k2, %k3
> > +     jz      L(loop_4x_vec)
> >  # else
> > -     subq    $(VEC_SIZE * 4), %rdx
> > +     kortestd %k2, %k3
> > +     jnz     L(loop_4x_vec_end)
> > +
> > +     subq    $-(VEC_SIZE * 4), %rdi
> > +
> > +     subq    $(CHAR_PER_VEC * 4), %rdx
> >       ja      L(loop_4x_vec)
> >
> > +     /* Fall through into the case where fewer than 4 vectors of
> > +        length remain.  */
> > +     VPCMP   $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
> > +     kmovd   %k0, %eax
> > +     addq    $(VEC_SIZE * 3), %rdi
> > +     .p2align 4
> >  L(last_4x_vec_or_less):
> > -     /* Less than 4 * VEC and aligned to VEC_SIZE.  */
> > -     addl    $(VEC_SIZE * 2), %edx
> > -     jle     L(last_2x_vec)
> > -
> > -     VPCMP   $0, (%rdi), %YMMMATCH, %k1
> > -     kmovd   %k1, %eax
> > +     /* Check if first VEC contained match.  */
> >       testl   %eax, %eax
> > -     jnz     L(first_vec_x0)
> > +     jnz     L(first_vec_x1_check)
> >
> > -     VPCMP   $0, VEC_SIZE(%rdi), %YMMMATCH, %k1
> > -     kmovd   %k1, %eax
> > -     testl   %eax, %eax
> > -     jnz     L(first_vec_x1)
> > +     /* If remaining length > CHAR_PER_VEC * 2.  */
> > +     addl    $(CHAR_PER_VEC * 2), %edx
> > +     jg      L(last_4x_vec)
> >
> > -     VPCMP   $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
> > -     kmovd   %k1, %eax
> > -     testl   %eax, %eax
> > +L(last_2x_vec):
> > +     /* If remaining length < CHAR_PER_VEC.  */
> > +     addl    $CHAR_PER_VEC, %edx
> > +     jle     L(zero_end)
> >
> > -     jnz     L(first_vec_x2_check)
> > -     subl    $VEC_SIZE, %edx
> > -     jle     L(zero)
> > +     /* Check VEC2 and compare any match with remaining length.  */
> > +     VPCMP   $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
> > +     kmovd   %k0, %eax
> > +     tzcntl  %eax, %eax
> > +     cmpl    %eax, %edx
> > +     jbe     L(set_zero_end)
> > +     leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> > +L(zero_end):
> > +     ret
> >
> > -     VPCMP   $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
> > -     kmovd   %k1, %eax
> > -     testl   %eax, %eax
> >
> > -     jnz     L(first_vec_x3_check)
> > +     .p2align 4
> > +L(first_vec_x1_check):
> > +     tzcntl  %eax, %eax
> > +     /* Adjust length.  */
> > +     subl    $-(CHAR_PER_VEC * 4), %edx
> > +     /* Check if match within remaining length.  */
> > +     cmpl    %eax, %edx
> > +     jbe     L(set_zero_end)
> > +     /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
> > +     leaq    VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
> > +     ret
> > +L(set_zero_end):
> >       xorl    %eax, %eax
> >       ret
> >
> >       .p2align 4
> > -L(last_2x_vec):
> > -     addl    $(VEC_SIZE * 2), %edx
> > -     VPCMP   $0, (%rdi), %YMMMATCH, %k1
> > +L(loop_4x_vec_end):
> > +# endif
> > +     /* rawmemchr will fall through into this if match was found in
> > +        loop.  */
> > +
> > +     /* k1 holds the not-equal (inverted match) mask for VEC1.  */
> >       kmovd   %k1, %eax
> > -     testl   %eax, %eax
> > +# ifdef USE_AS_WMEMCHR
> > +     subl    $((1 << CHAR_PER_VEC) - 1), %eax
> > +# else
> > +     incl    %eax
> > +# endif
> > +     jnz     L(last_vec_x1_return)
> >
> > -     jnz     L(first_vec_x0_check)
> > -     subl    $VEC_SIZE, %edx
> > -     jle     L(zero)
> > +     VPCMP   $0, %YMM2, %YZERO, %k0
> > +     kmovd   %k0, %eax
> > +     testl   %eax, %eax
> > +     jnz     L(last_vec_x2_return)
> >
> > -     VPCMP   $0, VEC_SIZE(%rdi), %YMMMATCH, %k1
> > -     kmovd   %k1, %eax
> > +     kmovd   %k2, %eax
> >       testl   %eax, %eax
> > -     jnz     L(first_vec_x1_check)
> > -     xorl    %eax, %eax
> > -     ret
> > +     jnz     L(last_vec_x3_return)
> >
> > -     .p2align 4
> > -L(first_vec_x0_check):
> > +     kmovd   %k3, %eax
> >       tzcntl  %eax, %eax
> > -# ifdef USE_AS_WMEMCHR
> > -     /* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
> > -     sall    $2, %eax
> > +# ifdef USE_AS_RAWMEMCHR
> > +     leaq    (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
> > +# else
> > +     leaq    (VEC_SIZE * 7)(%rdi, %rax, CHAR_SIZE), %rax
> >  # endif
> > -     /* Check the end of data.  */
> > -     cmpq    %rax, %rdx
> > -     jbe     L(zero)
> > -     addq    %rdi, %rax
> >       ret
> >
> >       .p2align 4
> > -L(first_vec_x1_check):
> > +L(last_vec_x1_return):
> >       tzcntl  %eax, %eax
> > -# ifdef USE_AS_WMEMCHR
> > -     /* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
> > -     sall    $2, %eax
> > -# endif
> > -     /* Check the end of data.  */
> > -     cmpq    %rax, %rdx
> > -     jbe     L(zero)
> > -     addq    $VEC_SIZE, %rax
> > +# ifdef USE_AS_RAWMEMCHR
> > +#  ifdef USE_AS_WMEMCHR
> > +     /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
> > +     leaq    (%rdi, %rax, CHAR_SIZE), %rax
> > +#  else
> >       addq    %rdi, %rax
> > -     ret
> > -
> > -     .p2align 4
> > -L(first_vec_x2_check):
> > -     tzcntl  %eax, %eax
> > -# ifdef USE_AS_WMEMCHR
> > -     /* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
> > -     sall    $2, %eax
> > +#  endif
> > +# else
> > +     /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
> > +     leaq    (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
> >  # endif
> > -     /* Check the end of data.  */
> > -     cmpq    %rax, %rdx
> > -     jbe     L(zero)
> > -     addq    $(VEC_SIZE * 2), %rax
> > -     addq    %rdi, %rax
> >       ret
> >
> >       .p2align 4
> > -L(first_vec_x3_check):
> > +L(last_vec_x2_return):
> >       tzcntl  %eax, %eax
> > -# ifdef USE_AS_WMEMCHR
> > -     /* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
> > -     sall    $2, %eax
> > +# ifdef USE_AS_RAWMEMCHR
> > +     /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
> > +     leaq    VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
> > +# else
> > +     /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
> > +     leaq    (VEC_SIZE * 5)(%rdi, %rax, CHAR_SIZE), %rax
> >  # endif
> > -     /* Check the end of data.  */
> > -     cmpq    %rax, %rdx
> > -     jbe     L(zero)
> > -     addq    $(VEC_SIZE * 3), %rax
> > -     addq    %rdi, %rax
> >       ret
> >
> >       .p2align 4
> > -L(zero):
> > -     xorl    %eax, %eax
> > -     ret
> > -# endif
> > -
> > -     .p2align 4
> > -L(first_vec_x0):
> > +L(last_vec_x3_return):
> >       tzcntl  %eax, %eax
> > -# ifdef USE_AS_WMEMCHR
> > -     /* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
> > -     leaq    (%rdi, %rax, 4), %rax
> > +# ifdef USE_AS_RAWMEMCHR
> > +     /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
> > +     leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> >  # else
> > -     addq    %rdi, %rax
> > +     /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
> > +     leaq    (VEC_SIZE * 6)(%rdi, %rax, CHAR_SIZE), %rax
> >  # endif
> >       ret
> >
> > +
> > +# ifndef USE_AS_RAWMEMCHR
> > +L(last_4x_vec_or_less_cmpeq):
> > +     VPCMP   $0, (VEC_SIZE * 5)(%rdi), %YMMMATCH, %k0
> > +     kmovd   %k0, %eax
> > +     subq    $-(VEC_SIZE * 4), %rdi
> > +     /* Check first VEC regardless.  */
> > +     testl   %eax, %eax
> > +     jnz     L(first_vec_x1_check)
> > +
> > +     /* If remaining length <= CHAR_PER_VEC * 2.  */
> > +     addl    $(CHAR_PER_VEC * 2), %edx
> > +     jle     L(last_2x_vec)
> > +
> >       .p2align 4
> > -L(first_vec_x1):
> > +L(last_4x_vec):
> > +     VPCMP   $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
> > +     kmovd   %k0, %eax
> > +     testl   %eax, %eax
> > +     jnz     L(last_vec_x2)
> > +
> > +
> > +     VPCMP   $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
> > +     kmovd   %k0, %eax
> > +     /* Create mask for possible matches within remaining length.  */
> > +#  ifdef USE_AS_WMEMCHR
> > +     movl    $((1 << (CHAR_PER_VEC * 2)) - 1), %ecx
> > +     bzhil   %edx, %ecx, %ecx
> > +#  else
> > +     movq    $-1, %rcx
> > +     bzhiq   %rdx, %rcx, %rcx
> > +#  endif
> > +     /* Test matches in data against length match.  */
> > +     andl    %ecx, %eax
> > +     jnz     L(last_vec_x3)
> > +
> > +     /* If remaining length <= CHAR_PER_VEC * 3 (Note this is after
> > +        remaining length was found to be > CHAR_PER_VEC * 2).  */
> > +     subl    $CHAR_PER_VEC, %edx
> > +     jbe     L(zero_end2)
> > +
> > +
> > +     VPCMP   $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
> > +     kmovd   %k0, %eax
> > +     /* Shift remaining length mask for last VEC.  */
> > +#  ifdef USE_AS_WMEMCHR
> > +     shrl    $CHAR_PER_VEC, %ecx
> > +#  else
> > +     shrq    $CHAR_PER_VEC, %rcx
> > +#  endif
> > +     andl    %ecx, %eax
> > +     jz      L(zero_end2)
> >       tzcntl  %eax, %eax
> > -# ifdef USE_AS_WMEMCHR
> > -     /* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
> > -     leaq    VEC_SIZE(%rdi, %rax, 4), %rax
> > -# else
> > -     addq    $VEC_SIZE, %rax
> > -     addq    %rdi, %rax
> > -# endif
> > +     leaq    (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
> > +L(zero_end2):
> >       ret
> >
> > -     .p2align 4
> > -L(first_vec_x2):
> > +L(last_vec_x2):
> >       tzcntl  %eax, %eax
> > -# ifdef USE_AS_WMEMCHR
> > -     /* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
> > -     leaq    (VEC_SIZE * 2)(%rdi, %rax, 4), %rax
> > -# else
> > -     addq    $(VEC_SIZE * 2), %rax
> > -     addq    %rdi, %rax
> > -# endif
> > +     leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> >       ret
> >
> >       .p2align 4
> > -L(4x_vec_end):
> > -     kmovd   %k1, %eax
> > -     testl   %eax, %eax
> > -     jnz     L(first_vec_x0)
> > -     kmovd   %k2, %eax
> > -     testl   %eax, %eax
> > -     jnz     L(first_vec_x1)
> > -     kmovd   %k3, %eax
> > -     testl   %eax, %eax
> > -     jnz     L(first_vec_x2)
> > -     kmovd   %k4, %eax
> > -     testl   %eax, %eax
> > -L(first_vec_x3):
> > +L(last_vec_x3):
> >       tzcntl  %eax, %eax
> > -# ifdef USE_AS_WMEMCHR
> > -     /* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
> > -     leaq    (VEC_SIZE * 3)(%rdi, %rax, 4), %rax
> > -# else
> > -     addq    $(VEC_SIZE * 3), %rax
> > -     addq    %rdi, %rax
> > -# endif
> > +     leaq    (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
> >       ret
> > +# endif
> >
> >  END (MEMCHR)
> >  #endif
> > --
> > 2.29.2
> >

^ permalink raw reply	[flat|nested] 20+ messages in thread

* [PATCH v3 2/3] x86: Optimize memchr-avx2.S
  2021-05-03  8:44 ` [PATCH v1 2/3] x86: Optimize memchr-avx2.S Noah Goldstein
  2021-05-03 18:50   ` H.J. Lu
  2021-05-03 20:06   ` [PATCH v2 " Noah Goldstein
@ 2021-05-03 22:58   ` Noah Goldstein
  2021-05-03 22:58     ` [PATCH v3 3/3] x86: Optimize memchr-evex.S Noah Goldstein
  2021-05-03 22:59     ` [PATCH v3 2/3] x86: Optimize memchr-avx2.S Noah Goldstein
  2 siblings, 2 replies; 20+ messages in thread
From: Noah Goldstein @ 2021-05-03 22:58 UTC (permalink / raw)
  To: libc-alpha

No bug. This commit optimizes memchr-avx2.S. The optimizations include
replacing some branches with cmovcc, avoiding some branches entirely
in the less_4x_vec case, making the page cross logic less strict,
saving a few instructions in the loop return path. test-memchr,
test-rawmemchr, and test-wmemchr are all passing.
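
As a concrete illustration of the cmovcc change (the sketch below is not
part of the patch; its function and parameter names are invented for
exposition), the short-length return path in L(first_vec_x0) behaves
roughly like the following C, where the conditional select is expected to
compile to a conditional move instead of a branch:

    #include <stddef.h>

    /* MASK has one bit per byte of the VEC_SIZE (32) byte compare; LEN is
       the remaining length in bytes.  Illustrative sketch only.  */
    static inline void *
    short_length_return (const unsigned char *s, unsigned int mask,
                         size_t len)
    {
      /* tzcnt returns 32 for a zero mask; model that explicitly.  */
      size_t pos = mask ? (size_t) __builtin_ctz (mask) : 32;
      /* Select instead of branch; maps to the cmovle in the assembly.  */
      return pos < len ? (void *) (s + pos) : NULL;
    }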

Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
---
 sysdeps/x86_64/multiarch/memchr-avx2.S | 425 ++++++++++++++-----------
 1 file changed, 247 insertions(+), 178 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
index 1fcb1c350f..0d8758e3e7 100644
--- a/sysdeps/x86_64/multiarch/memchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
@@ -26,8 +26,22 @@
 
 # ifdef USE_AS_WMEMCHR
 #  define VPCMPEQ	vpcmpeqd
+#  define VPBROADCAST	vpbroadcastd
+#  define CHAR_SIZE	4
 # else
 #  define VPCMPEQ	vpcmpeqb
+#  define VPBROADCAST	vpbroadcastb
+#  define CHAR_SIZE	1
+# endif
+
+# ifdef USE_AS_RAWMEMCHR
+#  define ERAW_PTR_REG	ecx
+#  define RRAW_PTR_REG	rcx
+#  define ALGN_PTR_REG	rdi
+# else
+#  define ERAW_PTR_REG	edi
+#  define RRAW_PTR_REG	rdi
+#  define ALGN_PTR_REG	rcx
 # endif
 
 # ifndef VZEROUPPER
@@ -39,6 +53,7 @@
 # endif
 
 # define VEC_SIZE 32
+# define PAGE_SIZE 4096
 
 	.section SECTION(.text),"ax",@progbits
 ENTRY (MEMCHR)
@@ -47,295 +62,349 @@ ENTRY (MEMCHR)
 	test	%RDX_LP, %RDX_LP
 	jz	L(null)
 # endif
-	movl	%edi, %ecx
-	/* Broadcast CHAR to YMM0.  */
-	vmovd	%esi, %xmm0
 # ifdef USE_AS_WMEMCHR
 	shl	$2, %RDX_LP
-	vpbroadcastd %xmm0, %ymm0
 # else
 #  ifdef __ILP32__
 	/* Clear the upper 32 bits.  */
 	movl	%edx, %edx
 #  endif
-	vpbroadcastb %xmm0, %ymm0
 # endif
+	/* Broadcast CHAR to YMMMATCH.  */
+	vmovd	%esi, %xmm0
+	VPBROADCAST %xmm0, %ymm0
 	/* Check if we may cross page boundary with one vector load.  */
-	andl	$(2 * VEC_SIZE - 1), %ecx
-	cmpl	$VEC_SIZE, %ecx
-	ja	L(cros_page_boundary)
+	movl	%edi, %eax
+	andl	$(PAGE_SIZE - 1), %eax
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	ja	L(cross_page_boundary)
 
 	/* Check the first VEC_SIZE bytes.  */
-	VPCMPEQ (%rdi), %ymm0, %ymm1
+	VPCMPEQ	(%rdi), %ymm0, %ymm1
 	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-
 # ifndef USE_AS_RAWMEMCHR
-	jnz	L(first_vec_x0_check)
-	/* Adjust length and check the end of data.  */
-	subq	$VEC_SIZE, %rdx
-	jbe	L(zero)
-# else
-	jnz	L(first_vec_x0)
+	/* If length < CHAR_PER_VEC handle special.  */
+	cmpq	$VEC_SIZE, %rdx
+	jbe	L(first_vec_x0)
 # endif
-
-	/* Align data for aligned loads in the loop.  */
-	addq	$VEC_SIZE, %rdi
-	andl	$(VEC_SIZE - 1), %ecx
-	andq	$-VEC_SIZE, %rdi
+	testl	%eax, %eax
+	jz	L(aligned_more)
+	tzcntl	%eax, %eax
+	addq	%rdi, %rax
+	VZEROUPPER_RETURN
 
 # ifndef USE_AS_RAWMEMCHR
-	/* Adjust length.  */
-	addq	%rcx, %rdx
+	.p2align 5
+L(first_vec_x0):
+	/* Check if first match was before length.  */
+	tzcntl	%eax, %eax
+	xorl	%ecx, %ecx
+	cmpl	%eax, %edx
+	leaq	(%rdi, %rax), %rax
+	cmovle	%rcx, %rax
+	VZEROUPPER_RETURN
 
-	subq	$(VEC_SIZE * 4), %rdx
-	jbe	L(last_4x_vec_or_less)
+L(null):
+	xorl	%eax, %eax
+	ret
 # endif
-	jmp	L(more_4x_vec)
-
 	.p2align 4
-L(cros_page_boundary):
-	andl	$(VEC_SIZE - 1), %ecx
-	andq	$-VEC_SIZE, %rdi
-	VPCMPEQ (%rdi), %ymm0, %ymm1
+L(cross_page_boundary):
+	/* Save pointer before aligning as its original value is necessary
+	   for computing the return address if a byte is found, or for
+	   adjusting the length if it is not and this is memchr.  */
+	movq	%rdi, %rcx
+	/* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr and
+	   rdi for rawmemchr.  */
+	orq	$(VEC_SIZE - 1), %ALGN_PTR_REG
+	VPCMPEQ	-(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1
 	vpmovmskb %ymm1, %eax
+# ifndef USE_AS_RAWMEMCHR
+	/* Calculate length until end of page (length checked for a
+	   match).  */
+	leaq	1(%ALGN_PTR_REG), %rsi
+	subq	%RRAW_PTR_REG, %rsi
+# endif
 	/* Remove the leading bytes.  */
-	sarl	%cl, %eax
-	testl	%eax, %eax
-	jz	L(aligned_more)
-	tzcntl	%eax, %eax
+	sarxl	%ERAW_PTR_REG, %eax, %eax
 # ifndef USE_AS_RAWMEMCHR
 	/* Check the end of data.  */
-	cmpq	%rax, %rdx
-	jbe	L(zero)
+	cmpq	%rsi, %rdx
+	jbe	L(first_vec_x0)
 # endif
-	addq	%rdi, %rax
-	addq	%rcx, %rax
+	testl	%eax, %eax
+	jz	L(cross_page_continue)
+	tzcntl	%eax, %eax
+	addq	%RRAW_PTR_REG, %rax
 L(return_vzeroupper):
 	ZERO_UPPER_VEC_REGISTERS_RETURN
 
 	.p2align 4
-L(aligned_more):
-# ifndef USE_AS_RAWMEMCHR
-        /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)"
-	   instead of "(rdx + rcx) - VEC_SIZE" to void possible addition
-	   overflow.  */
-	negq	%rcx
-	addq	$VEC_SIZE, %rcx
+L(first_vec_x1):
+	tzcntl	%eax, %eax
+	incq	%rdi
+	addq	%rdi, %rax
+	VZEROUPPER_RETURN
 
-	/* Check the end of data.  */
-	subq	%rcx, %rdx
-	jbe	L(zero)
-# endif
+	.p2align 4
+L(first_vec_x2):
+	tzcntl	%eax, %eax
+	addq	$(VEC_SIZE + 1), %rdi
+	addq	%rdi, %rax
+	VZEROUPPER_RETURN
+
+	.p2align 4
+L(first_vec_x3):
+	tzcntl	%eax, %eax
+	addq	$(VEC_SIZE * 2 + 1), %rdi
+	addq	%rdi, %rax
+	VZEROUPPER_RETURN
 
-	addq	$VEC_SIZE, %rdi
 
-# ifndef USE_AS_RAWMEMCHR
-	subq	$(VEC_SIZE * 4), %rdx
-	jbe	L(last_4x_vec_or_less)
-# endif
+	.p2align 4
+L(first_vec_x4):
+	tzcntl	%eax, %eax
+	addq	$(VEC_SIZE * 3 + 1), %rdi
+	addq	%rdi, %rax
+	VZEROUPPER_RETURN
 
-L(more_4x_vec):
+	.p2align 4
+L(aligned_more):
 	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
 	   since data is only aligned to VEC_SIZE.  */
-	VPCMPEQ (%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x0)
 
-	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
+# ifndef USE_AS_RAWMEMCHR
+L(cross_page_continue):
+	/* Align data to VEC_SIZE - 1.  */
+	xorl	%ecx, %ecx
+	subl	%edi, %ecx
+	orq	$(VEC_SIZE - 1), %rdi
+	/* esi is for adjusting length to see if near the end.  */
+	leal	(VEC_SIZE * 4 + 1)(%rdi, %rcx), %esi
+# else
+	orq	$(VEC_SIZE - 1), %rdi
+L(cross_page_continue):
+# endif
+	/* Load first VEC regardless.  */
+	VPCMPEQ	1(%rdi), %ymm0, %ymm1
 	vpmovmskb %ymm1, %eax
+# ifndef USE_AS_RAWMEMCHR
+	/* Adjust length. If near end handle specially.  */
+	subq	%rsi, %rdx
+	jbe	L(last_4x_vec_or_less)
+# endif
 	testl	%eax, %eax
 	jnz	L(first_vec_x1)
 
-	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
+	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
 	vpmovmskb %ymm1, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x2)
 
-	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
+	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
 	vpmovmskb %ymm1, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x3)
 
-	addq	$(VEC_SIZE * 4), %rdi
+	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x4)
 
 # ifndef USE_AS_RAWMEMCHR
+	/* Check if at last VEC_SIZE * 4 length.  */
 	subq	$(VEC_SIZE * 4), %rdx
-	jbe	L(last_4x_vec_or_less)
-# endif
-
-	/* Align data to 4 * VEC_SIZE.  */
-	movq	%rdi, %rcx
-	andl	$(4 * VEC_SIZE - 1), %ecx
-	andq	$-(4 * VEC_SIZE), %rdi
-
-# ifndef USE_AS_RAWMEMCHR
-	/* Adjust length.  */
+	jbe	L(last_4x_vec_or_less_cmpeq)
+	/* Align data to VEC_SIZE * 4 - 1 for the loop and readjust
+	   length.  */
+	incq	%rdi
+	movl	%edi, %ecx
+	orq	$(VEC_SIZE * 4 - 1), %rdi
+	andl	$(VEC_SIZE * 4 - 1), %ecx
 	addq	%rcx, %rdx
+# else
+	/* Align data to VEC_SIZE * 4 - 1 for loop.  */
+	incq	%rdi
+	orq	$(VEC_SIZE * 4 - 1), %rdi
 # endif
 
+	/* Compare 4 * VEC at a time forward.  */
 	.p2align 4
 L(loop_4x_vec):
-	/* Compare 4 * VEC at a time forward.  */
-	VPCMPEQ (%rdi), %ymm0, %ymm1
-	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm2
-	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm3
-	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm4
-
+	VPCMPEQ	1(%rdi), %ymm0, %ymm1
+	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm2
+	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm3
+	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm4
 	vpor	%ymm1, %ymm2, %ymm5
 	vpor	%ymm3, %ymm4, %ymm6
 	vpor	%ymm5, %ymm6, %ymm5
 
-	vpmovmskb %ymm5, %eax
-	testl	%eax, %eax
-	jnz	L(4x_vec_end)
-
-	addq	$(VEC_SIZE * 4), %rdi
-
+	vpmovmskb %ymm5, %ecx
 # ifdef USE_AS_RAWMEMCHR
-	jmp	L(loop_4x_vec)
+	subq	$-(VEC_SIZE * 4), %rdi
+	testl	%ecx, %ecx
+	jz	L(loop_4x_vec)
 # else
-	subq	$(VEC_SIZE * 4), %rdx
-	ja	L(loop_4x_vec)
+	testl	%ecx, %ecx
+	jnz	L(loop_4x_vec_end)
 
-L(last_4x_vec_or_less):
-	/* Less than 4 * VEC and aligned to VEC_SIZE.  */
-	addl	$(VEC_SIZE * 2), %edx
-	jle	L(last_2x_vec)
+	subq	$-(VEC_SIZE * 4), %rdi
 
-	VPCMPEQ (%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x0)
+	subq	$(VEC_SIZE * 4), %rdx
+	ja	L(loop_4x_vec)
 
-	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
+	/* Fall through into the case where fewer than 4 vectors of
+	   length remain.  */
+	VPCMPEQ	(VEC_SIZE * 0 + 1)(%rdi), %ymm0, %ymm1
 	vpmovmskb %ymm1, %eax
+	.p2align 4
+L(last_4x_vec_or_less):
+	/* Check if first VEC contained match.  */
 	testl	%eax, %eax
-	jnz	L(first_vec_x1)
+	jnz	L(first_vec_x1_check)
 
-	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
+	/* If remaining length > VEC_SIZE * 2.  */
+	addl	$(VEC_SIZE * 2), %edx
+	jg	L(last_4x_vec)
 
-	jnz	L(first_vec_x2_check)
-	subl	$VEC_SIZE, %edx
-	jle	L(zero)
+L(last_2x_vec):
+	/* If remaining length < VEC_SIZE.  */
+	addl	$VEC_SIZE, %edx
+	jle	L(zero_end)
 
-	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
+	/* Check VEC2 and compare any match with remaining length.  */
+	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
 	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-
-	jnz	L(first_vec_x3_check)
-	xorl	%eax, %eax
+	tzcntl	%eax, %eax
+	cmpl	%eax, %edx
+	jbe	L(set_zero_end)
+	addq	$(VEC_SIZE + 1), %rdi
+	addq	%rdi, %rax
+L(zero_end):
 	VZEROUPPER_RETURN
 
 	.p2align 4
-L(last_2x_vec):
-	addl	$(VEC_SIZE * 2), %edx
-	VPCMPEQ (%rdi), %ymm0, %ymm1
+L(loop_4x_vec_end):
+# endif
+	/* rawmemchr will fall through into this if match was found in
+	   loop.  */
+
 	vpmovmskb %ymm1, %eax
 	testl	%eax, %eax
+	jnz	L(last_vec_x1_return)
 
-	jnz	L(first_vec_x0_check)
-	subl	$VEC_SIZE, %edx
-	jle	L(zero)
-
-	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
+	vpmovmskb %ymm2, %eax
 	testl	%eax, %eax
-	jnz	L(first_vec_x1_check)
-	xorl	%eax, %eax
-	VZEROUPPER_RETURN
+	jnz	L(last_vec_x2_return)
 
-	.p2align 4
-L(first_vec_x0_check):
-	tzcntl	%eax, %eax
-	/* Check the end of data.  */
-	cmpq	%rax, %rdx
-	jbe	L(zero)
+	vpmovmskb %ymm3, %eax
+	/* Combine VEC3 matches (eax) with VEC4 matches (ecx).  */
+	salq	$32, %rcx
+	orq	%rcx, %rax
+	tzcntq	%rax, %rax
+# ifdef USE_AS_RAWMEMCHR
+	subq	$(VEC_SIZE * 2 - 1), %rdi
+# else
+	subq	$-(VEC_SIZE * 2 + 1), %rdi
+# endif
 	addq	%rdi, %rax
 	VZEROUPPER_RETURN
+# ifndef USE_AS_RAWMEMCHR
 
 	.p2align 4
 L(first_vec_x1_check):
 	tzcntl	%eax, %eax
-	/* Check the end of data.  */
-	cmpq	%rax, %rdx
-	jbe	L(zero)
-	addq	$VEC_SIZE, %rax
+	/* Adjust length.  */
+	subl	$-(VEC_SIZE * 4), %edx
+	/* Check if match within remaining length.  */
+	cmpl	%eax, %edx
+	jbe	L(set_zero_end)
+	incq	%rdi
 	addq	%rdi, %rax
 	VZEROUPPER_RETURN
+	.p2align 4
+L(set_zero_end):
+	xorl	%eax, %eax
+	VZEROUPPER_RETURN
+# endif
 
 	.p2align 4
-L(first_vec_x2_check):
+L(last_vec_x1_return):
 	tzcntl	%eax, %eax
-	/* Check the end of data.  */
-	cmpq	%rax, %rdx
-	jbe	L(zero)
-	addq	$(VEC_SIZE * 2), %rax
+# ifdef USE_AS_RAWMEMCHR
+	subq	$(VEC_SIZE * 4 - 1), %rdi
+# else
+	incq	%rdi
+# endif
 	addq	%rdi, %rax
 	VZEROUPPER_RETURN
 
 	.p2align 4
-L(first_vec_x3_check):
+L(last_vec_x2_return):
 	tzcntl	%eax, %eax
-	/* Check the end of data.  */
-	cmpq	%rax, %rdx
-	jbe	L(zero)
-	addq	$(VEC_SIZE * 3), %rax
+# ifdef USE_AS_RAWMEMCHR
+	subq	$(VEC_SIZE * 3 - 1), %rdi
+# else
+	subq	$-(VEC_SIZE + 1), %rdi
+# endif
 	addq	%rdi, %rax
 	VZEROUPPER_RETURN
 
+# ifndef USE_AS_RAWMEMCHR
 	.p2align 4
-L(zero):
-	xorl	%eax, %eax
-	jmp     L(return_vzeroupper)
+L(last_4x_vec_or_less_cmpeq):
+	VPCMPEQ	(VEC_SIZE * 4 + 1)(%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
+	subq	$-(VEC_SIZE * 4), %rdi
+	/* Check first VEC regardless.  */
+	testl	%eax, %eax
+	jnz	L(first_vec_x1_check)
 
+	/* If remaining length <= VEC_SIZE * 2.  */
+	addl	$(VEC_SIZE * 2), %edx
+	jle	L(last_2x_vec)
 	.p2align 4
-L(null):
-	xorl	%eax, %eax
-	ret
-# endif
+L(last_4x_vec):
+	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x2_return)
 
-	.p2align 4
-L(first_vec_x0):
-	tzcntl	%eax, %eax
-	addq	%rdi, %rax
-	VZEROUPPER_RETURN
+	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
 
-	.p2align 4
-L(first_vec_x1):
-	tzcntl	%eax, %eax
-	addq	$VEC_SIZE, %rax
-	addq	%rdi, %rax
-	VZEROUPPER_RETURN
+	/* Create mask for possible matches within remaining length.  */
+	movq	$-1, %rcx
+	bzhiq	%rdx, %rcx, %rcx
 
-	.p2align 4
-L(first_vec_x2):
+	/* Test matches in data against the length mask.  */
+	andl	%ecx, %eax
+	jnz	L(last_vec_x3)
+
+	/* If remaining length <= VEC_SIZE * 3 (Note this is after
+	   remaining length was found to be > VEC_SIZE * 2).  */
+	subl	$VEC_SIZE, %edx
+	jbe	L(zero_end2)
+
+	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
+	/* Shift remaining length mask for last VEC.  */
+	shrq	$32, %rcx
+	andl	%ecx, %eax
+	jz	L(zero_end2)
 	tzcntl	%eax, %eax
-	addq	$(VEC_SIZE * 2), %rax
+	addq	$(VEC_SIZE * 3 + 1), %rdi
 	addq	%rdi, %rax
+L(zero_end2):
 	VZEROUPPER_RETURN
 
 	.p2align 4
-L(4x_vec_end):
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x0)
-	vpmovmskb %ymm2, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x1)
-	vpmovmskb %ymm3, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x2)
-	vpmovmskb %ymm4, %eax
-	testl	%eax, %eax
-L(first_vec_x3):
+L(last_vec_x3):
 	tzcntl	%eax, %eax
-	addq	$(VEC_SIZE * 3), %rax
+	subq	$-(VEC_SIZE * 2 + 1), %rdi
 	addq	%rdi, %rax
 	VZEROUPPER_RETURN
+# endif
 
 END (MEMCHR)
 #endif
-- 
2.29.2


^ permalink raw reply	[flat|nested] 20+ messages in thread

* [PATCH v3 3/3] x86: Optimize memchr-evex.S
  2021-05-03 22:58   ` [PATCH v3 " Noah Goldstein
@ 2021-05-03 22:58     ` Noah Goldstein
  2021-05-03 22:59       ` Noah Goldstein
  2021-05-03 22:59     ` [PATCH v3 2/3] x86: Optimize memchr-avx2.S Noah Goldstein
  1 sibling, 1 reply; 20+ messages in thread
From: Noah Goldstein @ 2021-05-03 22:58 UTC (permalink / raw)
  To: libc-alpha

No bug. This commit optimizes memchr-evex.S. The optimizations include
replacing some branches with cmovcc, avoiding some branches entirely
in the less_4x_vec case, making the page cross logic less strict,
saving some ALU in the alignment process, and most importantly
increasing ILP in the 4x loop. test-memchr, test-rawmemchr, and
test-wmemchr are all passing.
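
To make the ILP point concrete, here is a rough scalar model of the new
reduction step in the 4x loop (illustrative only; the helper below and its
names are not part of the patch).  XOR turns a matching byte into zero,
VPMINU keeps the smaller value per lane, and the {%k1}{z} zero-masking
folds the first vector's result in, so a single compare against zero
covers three of the four vectors instead of chaining kord/kortest:

    #include <stdbool.h>
    #include <stdint.h>

    #define LANES 32

    /* Returns true iff C occurs in any of the three LANES-byte "vectors".
       Scalar stand-in for vpxorq + VPMINU {%k1}{z} + VPCMP $0.  */
    static bool
    reduce_three_vectors (const uint8_t v1[LANES], const uint8_t v2[LANES],
                          const uint8_t v3[LANES], uint8_t c)
    {
      bool any_zero = false;
      for (int i = 0; i < LANES; i++)
        {
          uint8_t a = v2[i] ^ c;      /* 0 iff v2[i] == c (vpxorq).  */
          uint8_t b = v3[i] ^ c;      /* 0 iff v3[i] == c (vpxorq).  */
          uint8_t m = a < b ? a : b;  /* Per-lane minimum (VPMINU).  */
          if (v1[i] == c)             /* Lane zeroed where VEC1 matched.  */
            m = 0;
          any_zero |= (m == 0);       /* One VPCMP $0 against zero.  */
        }
      return any_zero;
    }

The fourth vector still gets its own mask compare, but the dependent
kord/kortest chain of the old loop is gone, which is where the extra
instruction-level parallelism comes from.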

Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
---
 sysdeps/x86_64/multiarch/memchr-evex.S | 547 +++++++++++++++----------
 1 file changed, 322 insertions(+), 225 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S
index 6dd5d67b90..81d5cd6486 100644
--- a/sysdeps/x86_64/multiarch/memchr-evex.S
+++ b/sysdeps/x86_64/multiarch/memchr-evex.S
@@ -26,14 +26,28 @@
 
 # ifdef USE_AS_WMEMCHR
 #  define VPBROADCAST	vpbroadcastd
-#  define VPCMP		vpcmpd
-#  define SHIFT_REG	r8d
+#  define VPMINU	vpminud
+#  define VPCMP	vpcmpd
+#  define VPCMPEQ	vpcmpeqd
+#  define CHAR_SIZE	4
 # else
 #  define VPBROADCAST	vpbroadcastb
-#  define VPCMP		vpcmpb
-#  define SHIFT_REG	ecx
+#  define VPMINU	vpminub
+#  define VPCMP	vpcmpb
+#  define VPCMPEQ	vpcmpeqb
+#  define CHAR_SIZE	1
 # endif
 
+# ifdef USE_AS_RAWMEMCHR
+#  define RAW_PTR_REG	rcx
+#  define ALGN_PTR_REG	rdi
+# else
+#  define RAW_PTR_REG	rdi
+#  define ALGN_PTR_REG	rcx
+# endif
+
+# define XMMZERO	xmm23
+# define YMMZERO	ymm23
 # define XMMMATCH	xmm16
 # define YMMMATCH	ymm16
 # define YMM1		ymm17
@@ -44,6 +58,8 @@
 # define YMM6		ymm22
 
 # define VEC_SIZE 32
+# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
+# define PAGE_SIZE 4096
 
 	.section .text.evex,"ax",@progbits
 ENTRY (MEMCHR)
@@ -51,11 +67,7 @@ ENTRY (MEMCHR)
 	/* Check for zero length.  */
 	test	%RDX_LP, %RDX_LP
 	jz	L(zero)
-# endif
-	movl	%edi, %ecx
-# ifdef USE_AS_WMEMCHR
-	shl	$2, %RDX_LP
-# else
+
 #  ifdef __ILP32__
 	/* Clear the upper 32 bits.  */
 	movl	%edx, %edx
@@ -64,318 +76,403 @@ ENTRY (MEMCHR)
 	/* Broadcast CHAR to YMMMATCH.  */
 	VPBROADCAST %esi, %YMMMATCH
 	/* Check if we may cross page boundary with one vector load.  */
-	andl	$(2 * VEC_SIZE - 1), %ecx
-	cmpl	$VEC_SIZE, %ecx
-	ja	L(cros_page_boundary)
+	movl	%edi, %eax
+	andl	$(PAGE_SIZE - 1), %eax
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	ja	L(cross_page_boundary)
 
 	/* Check the first VEC_SIZE bytes.  */
-	VPCMP	$0, (%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
-	testl	%eax, %eax
-
+	VPCMP	$0, (%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
 # ifndef USE_AS_RAWMEMCHR
-	jnz	L(first_vec_x0_check)
-	/* Adjust length and check the end of data.  */
-	subq	$VEC_SIZE, %rdx
-	jbe	L(zero)
+	/* If length < CHAR_PER_VEC handle special.  */
+	cmpq	$CHAR_PER_VEC, %rdx
+	jbe	L(first_vec_x0)
+# endif
+	testl	%eax, %eax
+	jz	L(aligned_more)
+	tzcntl	%eax, %eax
+# ifdef USE_AS_WMEMCHR
+	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
 # else
-	jnz	L(first_vec_x0)
+	addq	%rdi, %rax
 # endif
-
-	/* Align data for aligned loads in the loop.  */
-	addq	$VEC_SIZE, %rdi
-	andl	$(VEC_SIZE - 1), %ecx
-	andq	$-VEC_SIZE, %rdi
+	ret
 
 # ifndef USE_AS_RAWMEMCHR
-	/* Adjust length.  */
-	addq	%rcx, %rdx
-
-	subq	$(VEC_SIZE * 4), %rdx
-	jbe	L(last_4x_vec_or_less)
-# endif
-	jmp	L(more_4x_vec)
+L(zero):
+	xorl	%eax, %eax
+	ret
 
+	.p2align 5
+L(first_vec_x0):
+	/* Check if first match was before length.  */
+	tzcntl	%eax, %eax
+	xorl	%ecx, %ecx
+	cmpl	%eax, %edx
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+	cmovle	%rcx, %rax
+	ret
+# else
+	/* NB: first_vec_x0 is 17 bytes which will leave
+	   cross_page_boundary (which is relatively cold) close enough
+	   to ideal alignment. So only realign L(cross_page_boundary) if
+	   rawmemchr.  */
 	.p2align 4
-L(cros_page_boundary):
-	andl	$(VEC_SIZE - 1), %ecx
+# endif
+L(cross_page_boundary):
+	/* Save pointer before aligning as its original value is
+	   necessary for computing the return address if a byte is found
+	   or for adjusting the length if it is not and this is memchr.  */
+	movq	%rdi, %rcx
+	/* Align data to VEC_SIZE. ALGN_PTR_REG is rcx for memchr and rdi
+	   for rawmemchr.  */
+	andq	$-VEC_SIZE, %ALGN_PTR_REG
+	VPCMP	$0, (%ALGN_PTR_REG), %YMMMATCH, %k0
+	kmovd	%k0, %r8d
 # ifdef USE_AS_WMEMCHR
-	/* NB: Divide shift count by 4 since each bit in K1 represent 4
+	/* NB: Divide shift count by 4 since each bit in K0 represents 4
 	   bytes.  */
-	movl	%ecx, %SHIFT_REG
-	sarl	$2, %SHIFT_REG
+	sarl	$2, %eax
+# endif
+# ifndef USE_AS_RAWMEMCHR
+	movl	$(PAGE_SIZE / CHAR_SIZE), %esi
+	subl	%eax, %esi
 # endif
-	andq	$-VEC_SIZE, %rdi
-	VPCMP	$0, (%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
-	/* Remove the leading bytes.  */
-	sarxl	%SHIFT_REG, %eax, %eax
-	testl	%eax, %eax
-	jz	L(aligned_more)
-	tzcntl	%eax, %eax
 # ifdef USE_AS_WMEMCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	sall	$2, %eax
+	andl	$(CHAR_PER_VEC - 1), %eax
 # endif
+	/* Remove the leading bytes.  */
+	sarxl	%eax, %r8d, %eax
 # ifndef USE_AS_RAWMEMCHR
 	/* Check the end of data.  */
-	cmpq	%rax, %rdx
-	jbe	L(zero)
+	cmpq	%rsi, %rdx
+	jbe	L(first_vec_x0)
+# endif
+	testl	%eax, %eax
+	jz	L(cross_page_continue)
+	tzcntl	%eax, %eax
+# ifdef USE_AS_WMEMCHR
+	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
+	leaq	(%RAW_PTR_REG, %rax, CHAR_SIZE), %rax
+# else
+	addq	%RAW_PTR_REG, %rax
 # endif
-	addq	%rdi, %rax
-	addq	%rcx, %rax
 	ret
 
 	.p2align 4
-L(aligned_more):
-# ifndef USE_AS_RAWMEMCHR
-        /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)"
-	   instead of "(rdx + rcx) - VEC_SIZE" to void possible addition
-	   overflow.  */
-	negq	%rcx
-	addq	$VEC_SIZE, %rcx
+L(first_vec_x1):
+	tzcntl	%eax, %eax
+	leaq	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
+	ret
 
-	/* Check the end of data.  */
-	subq	%rcx, %rdx
-	jbe	L(zero)
-# endif
+	.p2align 4
+L(first_vec_x2):
+	tzcntl	%eax, %eax
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
 
-	addq	$VEC_SIZE, %rdi
+	.p2align 4
+L(first_vec_x3):
+	tzcntl	%eax, %eax
+	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
 
-# ifndef USE_AS_RAWMEMCHR
-	subq	$(VEC_SIZE * 4), %rdx
-	jbe	L(last_4x_vec_or_less)
-# endif
+	.p2align 4
+L(first_vec_x4):
+	tzcntl	%eax, %eax
+	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
 
-L(more_4x_vec):
+	.p2align 5
+L(aligned_more):
 	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
 	   since data is only aligned to VEC_SIZE.  */
-	VPCMP	$0, (%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x0)
 
-	VPCMP	$0, VEC_SIZE(%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
+# ifndef USE_AS_RAWMEMCHR
+	/* Align data to VEC_SIZE.  */
+L(cross_page_continue):
+	xorl	%ecx, %ecx
+	subl	%edi, %ecx
+	andq	$-VEC_SIZE, %rdi
+	/* esi is for adjusting length to see if near the end.  */
+	leal	(VEC_SIZE * 5)(%rdi, %rcx), %esi
+#  ifdef USE_AS_WMEMCHR
+	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+	sarl	$2, %esi
+#  endif
+# else
+	andq	$-VEC_SIZE, %rdi
+L(cross_page_continue):
+# endif
+	/* Load first VEC regardless.  */
+	VPCMP	$0, (VEC_SIZE)(%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
+# ifndef USE_AS_RAWMEMCHR
+	/* Adjust length. If near end handle specially.  */
+	subq	%rsi, %rdx
+	jbe	L(last_4x_vec_or_less)
+# endif
 	testl	%eax, %eax
 	jnz	L(first_vec_x1)
 
-	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
+	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x2)
 
-	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
+	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x3)
 
-	addq	$(VEC_SIZE * 4), %rdi
+	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x4)
+
 
 # ifndef USE_AS_RAWMEMCHR
-	subq	$(VEC_SIZE * 4), %rdx
-	jbe	L(last_4x_vec_or_less)
-# endif
+	/* Check if at last CHAR_PER_VEC * 4 length.  */
+	subq	$(CHAR_PER_VEC * 4), %rdx
+	jbe	L(last_4x_vec_or_less_cmpeq)
+	addq	$VEC_SIZE, %rdi
 
-	/* Align data to 4 * VEC_SIZE.  */
-	movq	%rdi, %rcx
-	andl	$(4 * VEC_SIZE - 1), %ecx
+	/* Align data to VEC_SIZE * 4 for the loop and readjust length.
+	 */
+#  ifdef USE_AS_WMEMCHR
+	movl	%edi, %ecx
 	andq	$-(4 * VEC_SIZE), %rdi
-
-# ifndef USE_AS_RAWMEMCHR
-	/* Adjust length.  */
+	andl	$(VEC_SIZE * 4 - 1), %ecx
+	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+	sarl	$2, %ecx
 	addq	%rcx, %rdx
+#  else
+	addq	%rdi, %rdx
+	andq	$-(4 * VEC_SIZE), %rdi
+	subq	%rdi, %rdx
+#  endif
+# else
+	addq	$VEC_SIZE, %rdi
+	andq	$-(4 * VEC_SIZE), %rdi
 # endif
 
+	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
+
+	/* Compare 4 * VEC at a time forward.  */
 	.p2align 4
 L(loop_4x_vec):
-	/* Compare 4 * VEC at a time forward.  */
-	VPCMP	$0, (%rdi), %YMMMATCH, %k1
-	VPCMP	$0, VEC_SIZE(%rdi), %YMMMATCH, %k2
-	kord	%k1, %k2, %k5
-	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3
-	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4
-
-	kord	%k3, %k4, %k6
-	kortestd %k5, %k6
-	jnz	L(4x_vec_end)
-
-	addq	$(VEC_SIZE * 4), %rdi
-
+	/* It would be possible to save some instructions using 4x VPCMP
+	   but the bottleneck on port 5 makes it not worth it.  */
+	VPCMP	$4, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k1
+	/* xor will set bytes that match esi to zero.  */
+	vpxorq	(VEC_SIZE * 5)(%rdi), %YMMMATCH, %YMM2
+	vpxorq	(VEC_SIZE * 6)(%rdi), %YMMMATCH, %YMM3
+	VPCMP	$0, (VEC_SIZE * 7)(%rdi), %YMMMATCH, %k3
+	/* Reduce VEC2 / VEC3 with min and VEC1 with zero mask.  */
+	VPMINU	%YMM2, %YMM3, %YMM3 {%k1} {z}
+	VPCMP	$0, %YMM3, %YMMZERO, %k2
 # ifdef USE_AS_RAWMEMCHR
-	jmp	L(loop_4x_vec)
+	subq	$-(VEC_SIZE * 4), %rdi
+	kortestd %k2, %k3
+	jz	L(loop_4x_vec)
 # else
-	subq	$(VEC_SIZE * 4), %rdx
+	kortestd %k2, %k3
+	jnz	L(loop_4x_vec_end)
+
+	subq	$-(VEC_SIZE * 4), %rdi
+
+	subq	$(CHAR_PER_VEC * 4), %rdx
 	ja	L(loop_4x_vec)
 
+	/* Fall through into the case where fewer than 4 vectors of
+	   length remain.  */
+	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
+	addq	$(VEC_SIZE * 3), %rdi
+	.p2align 4
 L(last_4x_vec_or_less):
-	/* Less than 4 * VEC and aligned to VEC_SIZE.  */
-	addl	$(VEC_SIZE * 2), %edx
-	jle	L(last_2x_vec)
-
-	VPCMP	$0, (%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
+	/* Check if first VEC contained match.  */
 	testl	%eax, %eax
-	jnz	L(first_vec_x0)
+	jnz	L(first_vec_x1_check)
 
-	VPCMP	$0, VEC_SIZE(%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x1)
+	/* If remaining length > CHAR_PER_VEC * 2.  */
+	addl	$(CHAR_PER_VEC * 2), %edx
+	jg	L(last_4x_vec)
 
-	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
-	testl	%eax, %eax
+L(last_2x_vec):
+	/* If remaining length < CHAR_PER_VEC.  */
+	addl	$CHAR_PER_VEC, %edx
+	jle	L(zero_end)
 
-	jnz	L(first_vec_x2_check)
-	subl	$VEC_SIZE, %edx
-	jle	L(zero)
+	/* Check VEC2 and compare any match with remaining length.  */
+	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
+	tzcntl	%eax, %eax
+	cmpl	%eax, %edx
+	jbe	L(set_zero_end)
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+L(zero_end):
+	ret
 
-	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
-	testl	%eax, %eax
 
-	jnz	L(first_vec_x3_check)
+	.p2align 4
+L(first_vec_x1_check):
+	tzcntl	%eax, %eax
+	/* Adjust length.  */
+	subl	$-(CHAR_PER_VEC * 4), %edx
+	/* Check if match within remaining length.  */
+	cmpl	%eax, %edx
+	jbe	L(set_zero_end)
+	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
+	leaq	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
+	ret
+L(set_zero_end):
 	xorl	%eax, %eax
 	ret
 
 	.p2align 4
-L(last_2x_vec):
-	addl	$(VEC_SIZE * 2), %edx
-	VPCMP	$0, (%rdi), %YMMMATCH, %k1
+L(loop_4x_vec_end):
+# endif
+	/* rawmemchr will fall through into this if match was found in
+	   loop.  */
+
+	/* k1 holds the not-equal (inverted match) mask for VEC1.  */
 	kmovd	%k1, %eax
-	testl	%eax, %eax
+# ifdef USE_AS_WMEMCHR
+	subl	$((1 << CHAR_PER_VEC) - 1), %eax
+# else
+	incl	%eax
+# endif
+	jnz	L(last_vec_x1_return)
 
-	jnz	L(first_vec_x0_check)
-	subl	$VEC_SIZE, %edx
-	jle	L(zero)
+	VPCMP	$0, %YMM2, %YMMZERO, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x2_return)
 
-	VPCMP	$0, VEC_SIZE(%rdi), %YMMMATCH, %k1
-	kmovd	%k1, %eax
+	kmovd	%k2, %eax
 	testl	%eax, %eax
-	jnz	L(first_vec_x1_check)
-	xorl	%eax, %eax
-	ret
+	jnz	L(last_vec_x3_return)
 
-	.p2align 4
-L(first_vec_x0_check):
+	kmovd	%k3, %eax
 	tzcntl	%eax, %eax
-# ifdef USE_AS_WMEMCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	sall	$2, %eax
+# ifdef USE_AS_RAWMEMCHR
+	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+# else
+	leaq	(VEC_SIZE * 7)(%rdi, %rax, CHAR_SIZE), %rax
 # endif
-	/* Check the end of data.  */
-	cmpq	%rax, %rdx
-	jbe	L(zero)
-	addq	%rdi, %rax
 	ret
 
 	.p2align 4
-L(first_vec_x1_check):
+L(last_vec_x1_return):
 	tzcntl	%eax, %eax
-# ifdef USE_AS_WMEMCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	sall	$2, %eax
-# endif
-	/* Check the end of data.  */
-	cmpq	%rax, %rdx
-	jbe	L(zero)
-	addq	$VEC_SIZE, %rax
+# ifdef USE_AS_RAWMEMCHR
+#  ifdef USE_AS_WMEMCHR
+	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+#  else
 	addq	%rdi, %rax
-	ret
-
-	.p2align 4
-L(first_vec_x2_check):
-	tzcntl	%eax, %eax
-# ifdef USE_AS_WMEMCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	sall	$2, %eax
+#  endif
+# else
+	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
+	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
 # endif
-	/* Check the end of data.  */
-	cmpq	%rax, %rdx
-	jbe	L(zero)
-	addq	$(VEC_SIZE * 2), %rax
-	addq	%rdi, %rax
 	ret
 
 	.p2align 4
-L(first_vec_x3_check):
+L(last_vec_x2_return):
 	tzcntl	%eax, %eax
-# ifdef USE_AS_WMEMCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	sall	$2, %eax
+# ifdef USE_AS_RAWMEMCHR
+	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
+	leaq	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
+# else
+	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
+	leaq	(VEC_SIZE * 5)(%rdi, %rax, CHAR_SIZE), %rax
 # endif
-	/* Check the end of data.  */
-	cmpq	%rax, %rdx
-	jbe	L(zero)
-	addq	$(VEC_SIZE * 3), %rax
-	addq	%rdi, %rax
 	ret
 
 	.p2align 4
-L(zero):
-	xorl	%eax, %eax
-	ret
-# endif
-
-	.p2align 4
-L(first_vec_x0):
+L(last_vec_x3_return):
 	tzcntl	%eax, %eax
-# ifdef USE_AS_WMEMCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	leaq	(%rdi, %rax, 4), %rax
+# ifdef USE_AS_RAWMEMCHR
+	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
 # else
-	addq	%rdi, %rax
+	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
+	leaq	(VEC_SIZE * 6)(%rdi, %rax, CHAR_SIZE), %rax
 # endif
 	ret
 
+
+# ifndef USE_AS_RAWMEMCHR
+L(last_4x_vec_or_less_cmpeq):
+	VPCMP	$0, (VEC_SIZE * 5)(%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
+	subq	$-(VEC_SIZE * 4), %rdi
+	/* Check first VEC regardless.  */
+	testl	%eax, %eax
+	jnz	L(first_vec_x1_check)
+
+	/* If remaining length <= CHAR_PER_VEC * 2.  */
+	addl	$(CHAR_PER_VEC * 2), %edx
+	jle	L(last_2x_vec)
+
 	.p2align 4
-L(first_vec_x1):
+L(last_4x_vec):
+	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x2)
+
+
+	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
+	/* Create mask for possible matches within remaining length.  */
+#  ifdef USE_AS_WMEMCHR
+	movl	$((1 << (CHAR_PER_VEC * 2)) - 1), %ecx
+	bzhil	%edx, %ecx, %ecx
+#  else
+	movq	$-1, %rcx
+	bzhiq	%rdx, %rcx, %rcx
+#  endif
+	/* Test matches in data against the length mask.  */
+	andl	%ecx, %eax
+	jnz	L(last_vec_x3)
+
+	/* If remaining length <= CHAR_PER_VEC * 3 (Note this is after
+	   remaining length was found to be > CHAR_PER_VEC * 2).  */
+	subl	$CHAR_PER_VEC, %edx
+	jbe	L(zero_end2)
+
+
+	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
+	/* Shift remaining length mask for last VEC.  */
+#  ifdef USE_AS_WMEMCHR
+	shrl	$CHAR_PER_VEC, %ecx
+#  else
+	shrq	$CHAR_PER_VEC, %rcx
+#  endif
+	andl	%ecx, %eax
+	jz	L(zero_end2)
 	tzcntl	%eax, %eax
-# ifdef USE_AS_WMEMCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	leaq	VEC_SIZE(%rdi, %rax, 4), %rax
-# else
-	addq	$VEC_SIZE, %rax
-	addq	%rdi, %rax
-# endif
+	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
+L(zero_end2):
 	ret
 
-	.p2align 4
-L(first_vec_x2):
+L(last_vec_x2):
 	tzcntl	%eax, %eax
-# ifdef USE_AS_WMEMCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	leaq	(VEC_SIZE * 2)(%rdi, %rax, 4), %rax
-# else
-	addq	$(VEC_SIZE * 2), %rax
-	addq	%rdi, %rax
-# endif
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
 	ret
 
 	.p2align 4
-L(4x_vec_end):
-	kmovd	%k1, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x0)
-	kmovd	%k2, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x1)
-	kmovd	%k3, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x2)
-	kmovd	%k4, %eax
-	testl	%eax, %eax
-L(first_vec_x3):
+L(last_vec_x3):
 	tzcntl	%eax, %eax
-# ifdef USE_AS_WMEMCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	leaq	(VEC_SIZE * 3)(%rdi, %rax, 4), %rax
-# else
-	addq	$(VEC_SIZE * 3), %rax
-	addq	%rdi, %rax
-# endif
+	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
 	ret
+# endif
 
 END (MEMCHR)
 #endif
-- 
2.29.2


^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v3 2/3] x86: Optimize memchr-avx2.S
  2021-05-03 22:58   ` [PATCH v3 " Noah Goldstein
  2021-05-03 22:58     ` [PATCH v3 3/3] x86: Optimize memchr-evex.S Noah Goldstein
@ 2021-05-03 22:59     ` Noah Goldstein
  1 sibling, 0 replies; 20+ messages in thread
From: Noah Goldstein @ 2021-05-03 22:59 UTC (permalink / raw)
  To: GNU C Library

On Mon, May 3, 2021 at 6:58 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> No bug. This commit optimizes memchr-avx2.S. The optimizations include
> replacing some branches with cmovcc, avoiding some branches entirely
> in the less_4x_vec case, making the page cross logic less strict,
> saving a few instructions in the loop return path. test-memchr,
> test-rawmemchr, and test-wmemchr are all passing.
>
> Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
> ---
>  sysdeps/x86_64/multiarch/memchr-avx2.S | 425 ++++++++++++++-----------
>  1 file changed, 247 insertions(+), 178 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
> index 1fcb1c350f..0d8758e3e7 100644
> --- a/sysdeps/x86_64/multiarch/memchr-avx2.S
> +++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
> @@ -26,8 +26,22 @@
>
>  # ifdef USE_AS_WMEMCHR
>  #  define VPCMPEQ      vpcmpeqd
> +#  define VPBROADCAST  vpbroadcastd
> +#  define CHAR_SIZE    4
>  # else
>  #  define VPCMPEQ      vpcmpeqb
> +#  define VPBROADCAST  vpbroadcastb
> +#  define CHAR_SIZE    1
> +# endif
> +
> +# ifdef USE_AS_RAWMEMCHR
> +#  define ERAW_PTR_REG ecx
> +#  define RRAW_PTR_REG rcx
> +#  define ALGN_PTR_REG rdi
> +# else
> +#  define ERAW_PTR_REG edi
> +#  define RRAW_PTR_REG rdi
> +#  define ALGN_PTR_REG rcx
>  # endif
>
>  # ifndef VZEROUPPER
> @@ -39,6 +53,7 @@
>  # endif
>
>  # define VEC_SIZE 32
> +# define PAGE_SIZE 4096
>
>         .section SECTION(.text),"ax",@progbits
>  ENTRY (MEMCHR)
> @@ -47,295 +62,349 @@ ENTRY (MEMCHR)
>         test    %RDX_LP, %RDX_LP
>         jz      L(null)
>  # endif
> -       movl    %edi, %ecx
> -       /* Broadcast CHAR to YMM0.  */
> -       vmovd   %esi, %xmm0
>  # ifdef USE_AS_WMEMCHR
>         shl     $2, %RDX_LP
> -       vpbroadcastd %xmm0, %ymm0
>  # else
>  #  ifdef __ILP32__
>         /* Clear the upper 32 bits.  */
>         movl    %edx, %edx
>  #  endif
> -       vpbroadcastb %xmm0, %ymm0
>  # endif
> +       /* Broadcast CHAR to YMMMATCH.  */
> +       vmovd   %esi, %xmm0
> +       VPBROADCAST %xmm0, %ymm0
>         /* Check if we may cross page boundary with one vector load.  */
> -       andl    $(2 * VEC_SIZE - 1), %ecx
> -       cmpl    $VEC_SIZE, %ecx
> -       ja      L(cros_page_boundary)
> +       movl    %edi, %eax
> +       andl    $(PAGE_SIZE - 1), %eax
> +       cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
> +       ja      L(cross_page_boundary)
>
>         /* Check the first VEC_SIZE bytes.  */
> -       VPCMPEQ (%rdi), %ymm0, %ymm1
> +       VPCMPEQ (%rdi), %ymm0, %ymm1
>         vpmovmskb %ymm1, %eax
> -       testl   %eax, %eax
> -
>  # ifndef USE_AS_RAWMEMCHR
> -       jnz     L(first_vec_x0_check)
> -       /* Adjust length and check the end of data.  */
> -       subq    $VEC_SIZE, %rdx
> -       jbe     L(zero)
> -# else
> -       jnz     L(first_vec_x0)
> +       /* If length < CHAR_PER_VEC handle special.  */
> +       cmpq    $VEC_SIZE, %rdx
> +       jbe     L(first_vec_x0)
>  # endif
> -
> -       /* Align data for aligned loads in the loop.  */
> -       addq    $VEC_SIZE, %rdi
> -       andl    $(VEC_SIZE - 1), %ecx
> -       andq    $-VEC_SIZE, %rdi
> +       testl   %eax, %eax
> +       jz      L(aligned_more)
> +       tzcntl  %eax, %eax
> +       addq    %rdi, %rax
> +       VZEROUPPER_RETURN
>
>  # ifndef USE_AS_RAWMEMCHR
> -       /* Adjust length.  */
> -       addq    %rcx, %rdx
> +       .p2align 5
> +L(first_vec_x0):
> +       /* Check if first match was before length.  */
> +       tzcntl  %eax, %eax
> +       xorl    %ecx, %ecx
> +       cmpl    %eax, %edx
> +       leaq    (%rdi, %rax), %rax
> +       cmovle  %rcx, %rax
> +       VZEROUPPER_RETURN
>
> -       subq    $(VEC_SIZE * 4), %rdx
> -       jbe     L(last_4x_vec_or_less)
> +L(null):
> +       xorl    %eax, %eax
> +       ret
>  # endif
> -       jmp     L(more_4x_vec)
> -
>         .p2align 4
> -L(cros_page_boundary):
> -       andl    $(VEC_SIZE - 1), %ecx
> -       andq    $-VEC_SIZE, %rdi
> -       VPCMPEQ (%rdi), %ymm0, %ymm1
> +L(cross_page_boundary):
> +       /* Save pointer before aligning as its original value is necessary
> +          for computing return address if byte is found or adjusting length
> +          if it is not and this is memchr.  */
> +       movq    %rdi, %rcx
> +       /* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr and
> +          rdi for rawmemchr.  */
> +       orq     $(VEC_SIZE - 1), %ALGN_PTR_REG
> +       VPCMPEQ -(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1
>         vpmovmskb %ymm1, %eax
> +# ifndef USE_AS_RAWMEMCHR
> +       /* Calculate length until end of page (length checked for a
> +          match).  */
> +       leaq    1(%ALGN_PTR_REG), %rsi
> +       subq    %RRAW_PTR_REG, %rsi
> +# endif
>         /* Remove the leading bytes.  */
> -       sarl    %cl, %eax
> -       testl   %eax, %eax
> -       jz      L(aligned_more)
> -       tzcntl  %eax, %eax
> +       sarxl   %ERAW_PTR_REG, %eax, %eax
>  # ifndef USE_AS_RAWMEMCHR
>         /* Check the end of data.  */
> -       cmpq    %rax, %rdx
> -       jbe     L(zero)
> +       cmpq    %rsi, %rdx
> +       jbe     L(first_vec_x0)
>  # endif
> -       addq    %rdi, %rax
> -       addq    %rcx, %rax
> +       testl   %eax, %eax
> +       jz      L(cross_page_continue)
> +       tzcntl  %eax, %eax
> +       addq    %RRAW_PTR_REG, %rax
>  L(return_vzeroupper):
>         ZERO_UPPER_VEC_REGISTERS_RETURN
>
>         .p2align 4
> -L(aligned_more):
> -# ifndef USE_AS_RAWMEMCHR
> -        /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)"
> -          instead of "(rdx + rcx) - VEC_SIZE" to void possible addition
> -          overflow.  */
> -       negq    %rcx
> -       addq    $VEC_SIZE, %rcx
> +L(first_vec_x1):
> +       tzcntl  %eax, %eax
> +       incq    %rdi
> +       addq    %rdi, %rax
> +       VZEROUPPER_RETURN
>
> -       /* Check the end of data.  */
> -       subq    %rcx, %rdx
> -       jbe     L(zero)
> -# endif
> +       .p2align 4
> +L(first_vec_x2):
> +       tzcntl  %eax, %eax
> +       addq    $(VEC_SIZE + 1), %rdi
> +       addq    %rdi, %rax
> +       VZEROUPPER_RETURN
> +
> +       .p2align 4
> +L(first_vec_x3):
> +       tzcntl  %eax, %eax
> +       addq    $(VEC_SIZE * 2 + 1), %rdi
> +       addq    %rdi, %rax
> +       VZEROUPPER_RETURN
>
> -       addq    $VEC_SIZE, %rdi
>
> -# ifndef USE_AS_RAWMEMCHR
> -       subq    $(VEC_SIZE * 4), %rdx
> -       jbe     L(last_4x_vec_or_less)
> -# endif
> +       .p2align 4
> +L(first_vec_x4):
> +       tzcntl  %eax, %eax
> +       addq    $(VEC_SIZE * 3 + 1), %rdi
> +       addq    %rdi, %rax
> +       VZEROUPPER_RETURN
>
> -L(more_4x_vec):
> +       .p2align 4
> +L(aligned_more):
>         /* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
>            since data is only aligned to VEC_SIZE.  */
> -       VPCMPEQ (%rdi), %ymm0, %ymm1
> -       vpmovmskb %ymm1, %eax
> -       testl   %eax, %eax
> -       jnz     L(first_vec_x0)
>
> -       VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
> +# ifndef USE_AS_RAWMEMCHR
> +L(cross_page_continue):
> +       /* Align data to VEC_SIZE - 1.  */
> +       xorl    %ecx, %ecx
> +       subl    %edi, %ecx
> +       orq     $(VEC_SIZE - 1), %rdi
> +       /* esi is for adjusting length to see if near the end.  */
> +       leal    (VEC_SIZE * 4 + 1)(%rdi, %rcx), %esi
> +# else
> +       orq     $(VEC_SIZE - 1), %rdi
> +L(cross_page_continue):
> +# endif
> +       /* Load first VEC regardless.  */
> +       VPCMPEQ 1(%rdi), %ymm0, %ymm1
>         vpmovmskb %ymm1, %eax
> +# ifndef USE_AS_RAWMEMCHR
> +       /* Adjust length. If near end handle specially.  */
> +       subq    %rsi, %rdx
> +       jbe     L(last_4x_vec_or_less)
> +# endif
>         testl   %eax, %eax
>         jnz     L(first_vec_x1)
>
> -       VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
> +       VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
>         vpmovmskb %ymm1, %eax
>         testl   %eax, %eax
>         jnz     L(first_vec_x2)
>
> -       VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
> +       VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
>         vpmovmskb %ymm1, %eax
>         testl   %eax, %eax
>         jnz     L(first_vec_x3)
>
> -       addq    $(VEC_SIZE * 4), %rdi
> +       VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
> +       vpmovmskb %ymm1, %eax
> +       testl   %eax, %eax
> +       jnz     L(first_vec_x4)
>
>  # ifndef USE_AS_RAWMEMCHR
> +       /* Check if at last VEC_SIZE * 4 length.  */
>         subq    $(VEC_SIZE * 4), %rdx
> -       jbe     L(last_4x_vec_or_less)
> -# endif
> -
> -       /* Align data to 4 * VEC_SIZE.  */
> -       movq    %rdi, %rcx
> -       andl    $(4 * VEC_SIZE - 1), %ecx
> -       andq    $-(4 * VEC_SIZE), %rdi
> -
> -# ifndef USE_AS_RAWMEMCHR
> -       /* Adjust length.  */
> +       jbe     L(last_4x_vec_or_less_cmpeq)
> +       /* Align data to VEC_SIZE * 4 - 1 for the loop and readjust
> +          length.  */
> +       incq    %rdi
> +       movl    %edi, %ecx
> +       orq     $(VEC_SIZE * 4 - 1), %rdi
> +       andl    $(VEC_SIZE * 4 - 1), %ecx
>         addq    %rcx, %rdx
> +# else
> +       /* Align data to VEC_SIZE * 4 - 1 for loop.  */
> +       incq    %rdi
> +       orq     $(VEC_SIZE * 4 - 1), %rdi
>  # endif
>
> +       /* Compare 4 * VEC at a time forward.  */
>         .p2align 4
>  L(loop_4x_vec):
> -       /* Compare 4 * VEC at a time forward.  */
> -       VPCMPEQ (%rdi), %ymm0, %ymm1
> -       VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm2
> -       VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm3
> -       VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm4
> -
> +       VPCMPEQ 1(%rdi), %ymm0, %ymm1
> +       VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm2
> +       VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm3
> +       VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm4
>         vpor    %ymm1, %ymm2, %ymm5
>         vpor    %ymm3, %ymm4, %ymm6
>         vpor    %ymm5, %ymm6, %ymm5
>
> -       vpmovmskb %ymm5, %eax
> -       testl   %eax, %eax
> -       jnz     L(4x_vec_end)
> -
> -       addq    $(VEC_SIZE * 4), %rdi
> -
> +       vpmovmskb %ymm5, %ecx
>  # ifdef USE_AS_RAWMEMCHR
> -       jmp     L(loop_4x_vec)
> +       subq    $-(VEC_SIZE * 4), %rdi
> +       testl   %ecx, %ecx
> +       jz      L(loop_4x_vec)
>  # else
> -       subq    $(VEC_SIZE * 4), %rdx
> -       ja      L(loop_4x_vec)
> +       testl   %ecx, %ecx
> +       jnz     L(loop_4x_vec_end)
>
> -L(last_4x_vec_or_less):
> -       /* Less than 4 * VEC and aligned to VEC_SIZE.  */
> -       addl    $(VEC_SIZE * 2), %edx
> -       jle     L(last_2x_vec)
> +       subq    $-(VEC_SIZE * 4), %rdi
>
> -       VPCMPEQ (%rdi), %ymm0, %ymm1
> -       vpmovmskb %ymm1, %eax
> -       testl   %eax, %eax
> -       jnz     L(first_vec_x0)
> +       subq    $(VEC_SIZE * 4), %rdx
> +       ja      L(loop_4x_vec)
>
> -       VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
> +       /* Fall through into the case where fewer than 4 vectors of
> +          length remain.  */
> +       VPCMPEQ (VEC_SIZE * 0 + 1)(%rdi), %ymm0, %ymm1
>         vpmovmskb %ymm1, %eax
> +       .p2align 4
> +L(last_4x_vec_or_less):
> +       /* Check if first VEC contained match.  */
>         testl   %eax, %eax
> -       jnz     L(first_vec_x1)
> +       jnz     L(first_vec_x1_check)
>
> -       VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
> -       vpmovmskb %ymm1, %eax
> -       testl   %eax, %eax
> +       /* If remaining length > VEC_SIZE * 2.  */
> +       addl    $(VEC_SIZE * 2), %edx
> +       jg      L(last_4x_vec)
>
> -       jnz     L(first_vec_x2_check)
> -       subl    $VEC_SIZE, %edx
> -       jle     L(zero)
> +L(last_2x_vec):
> +       /* If remaining length < VEC_SIZE.  */
> +       addl    $VEC_SIZE, %edx
> +       jle     L(zero_end)
>
> -       VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
> +       /* Check VEC2 and compare any match with remaining length.  */
> +       VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
>         vpmovmskb %ymm1, %eax
> -       testl   %eax, %eax
> -
> -       jnz     L(first_vec_x3_check)
> -       xorl    %eax, %eax
> +       tzcntl  %eax, %eax
> +       cmpl    %eax, %edx
> +       jbe     L(set_zero_end)
> +       addq    $(VEC_SIZE + 1), %rdi
> +       addq    %rdi, %rax
> +L(zero_end):
>         VZEROUPPER_RETURN
>
>         .p2align 4
> -L(last_2x_vec):
> -       addl    $(VEC_SIZE * 2), %edx
> -       VPCMPEQ (%rdi), %ymm0, %ymm1
> +L(loop_4x_vec_end):
> +# endif
> +       /* rawmemchr will fall through into this if match was found in
> +          loop.  */
> +
>         vpmovmskb %ymm1, %eax
>         testl   %eax, %eax
> +       jnz     L(last_vec_x1_return)
>
> -       jnz     L(first_vec_x0_check)
> -       subl    $VEC_SIZE, %edx
> -       jle     L(zero)
> -
> -       VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
> -       vpmovmskb %ymm1, %eax
> +       vpmovmskb %ymm2, %eax
>         testl   %eax, %eax
> -       jnz     L(first_vec_x1_check)
> -       xorl    %eax, %eax
> -       VZEROUPPER_RETURN
> +       jnz     L(last_vec_x2_return)
>
> -       .p2align 4
> -L(first_vec_x0_check):
> -       tzcntl  %eax, %eax
> -       /* Check the end of data.  */
> -       cmpq    %rax, %rdx
> -       jbe     L(zero)
> +       vpmovmskb %ymm3, %eax
> +       /* Combine VEC3 matches (eax) with VEC4 matches (ecx).  */
> +       salq    $32, %rcx
> +       orq     %rcx, %rax
> +       tzcntq  %rax, %rax
> +# ifdef USE_AS_RAWMEMCHR
> +       subq    $(VEC_SIZE * 2 - 1), %rdi
> +# else
> +       subq    $-(VEC_SIZE * 2 + 1), %rdi
> +# endif
>         addq    %rdi, %rax
>         VZEROUPPER_RETURN
> +# ifndef USE_AS_RAWMEMCHR
>
>         .p2align 4
>  L(first_vec_x1_check):
>         tzcntl  %eax, %eax
> -       /* Check the end of data.  */
> -       cmpq    %rax, %rdx
> -       jbe     L(zero)
> -       addq    $VEC_SIZE, %rax
> +       /* Adjust length.  */
> +       subl    $-(VEC_SIZE * 4), %edx
> +       /* Check if match within remaining length.  */
> +       cmpl    %eax, %edx
> +       jbe     L(set_zero_end)
> +       incq    %rdi
>         addq    %rdi, %rax
>         VZEROUPPER_RETURN
> +       .p2align 4
> +L(set_zero_end):
> +       xorl    %eax, %eax
> +       VZEROUPPER_RETURN
> +# endif
>
>         .p2align 4
> -L(first_vec_x2_check):
> +L(last_vec_x1_return):
>         tzcntl  %eax, %eax
> -       /* Check the end of data.  */
> -       cmpq    %rax, %rdx
> -       jbe     L(zero)
> -       addq    $(VEC_SIZE * 2), %rax
> +# ifdef USE_AS_RAWMEMCHR
> +       subq    $(VEC_SIZE * 4 - 1), %rdi
> +# else
> +       incq    %rdi
> +# endif
>         addq    %rdi, %rax
>         VZEROUPPER_RETURN
>
>         .p2align 4
> -L(first_vec_x3_check):
> +L(last_vec_x2_return):
>         tzcntl  %eax, %eax
> -       /* Check the end of data.  */
> -       cmpq    %rax, %rdx
> -       jbe     L(zero)
> -       addq    $(VEC_SIZE * 3), %rax
> +# ifdef USE_AS_RAWMEMCHR
> +       subq    $(VEC_SIZE * 3 - 1), %rdi
> +# else
> +       subq    $-(VEC_SIZE + 1), %rdi
> +# endif
>         addq    %rdi, %rax
>         VZEROUPPER_RETURN
>
> +# ifndef USE_AS_RAWMEMCHR
>         .p2align 4
> -L(zero):
> -       xorl    %eax, %eax
> -       jmp     L(return_vzeroupper)
> +L(last_4x_vec_or_less_cmpeq):
> +       VPCMPEQ (VEC_SIZE * 4 + 1)(%rdi), %ymm0, %ymm1
> +       vpmovmskb %ymm1, %eax
> +       subq    $-(VEC_SIZE * 4), %rdi
> +       /* Check first VEC regardless.  */
> +       testl   %eax, %eax
> +       jnz     L(first_vec_x1_check)
>
> +       /* If remaining length <= CHAR_PER_VEC * 2.  */
> +       addl    $(VEC_SIZE * 2), %edx
> +       jle     L(last_2x_vec)
>         .p2align 4
> -L(null):
> -       xorl    %eax, %eax
> -       ret
> -# endif
> +L(last_4x_vec):
> +       VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> +       vpmovmskb %ymm1, %eax
> +       testl   %eax, %eax
> +       jnz     L(last_vec_x2_return)
>
> -       .p2align 4
> -L(first_vec_x0):
> -       tzcntl  %eax, %eax
> -       addq    %rdi, %rax
> -       VZEROUPPER_RETURN
> +       VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
> +       vpmovmskb %ymm1, %eax
>
> -       .p2align 4
> -L(first_vec_x1):
> -       tzcntl  %eax, %eax
> -       addq    $VEC_SIZE, %rax
> -       addq    %rdi, %rax
> -       VZEROUPPER_RETURN
> +       /* Create mask for possible matches within remaining length.  */
> +       movq    $-1, %rcx
> +       bzhiq   %rdx, %rcx, %rcx
>
> -       .p2align 4
> -L(first_vec_x2):
> +       /* Test matches in data against length match.  */
> +       andl    %ecx, %eax
> +       jnz     L(last_vec_x3)
> +
> +       /* If remaining length <= VEC_SIZE * 3 (note this is after
> +          remaining length was found to be > VEC_SIZE * 2).  */
> +       subl    $VEC_SIZE, %edx
> +       jbe     L(zero_end2)
> +
> +       VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
> +       vpmovmskb %ymm1, %eax
> +       /* Shift remaining length mask for last VEC.  */
> +       shrq    $32, %rcx
> +       andl    %ecx, %eax
> +       jz      L(zero_end2)
>         tzcntl  %eax, %eax
> -       addq    $(VEC_SIZE * 2), %rax
> +       addq    $(VEC_SIZE * 3 + 1), %rdi
>         addq    %rdi, %rax
> +L(zero_end2):
>         VZEROUPPER_RETURN
>
>         .p2align 4
> -L(4x_vec_end):
> -       vpmovmskb %ymm1, %eax
> -       testl   %eax, %eax
> -       jnz     L(first_vec_x0)
> -       vpmovmskb %ymm2, %eax
> -       testl   %eax, %eax
> -       jnz     L(first_vec_x1)
> -       vpmovmskb %ymm3, %eax
> -       testl   %eax, %eax
> -       jnz     L(first_vec_x2)
> -       vpmovmskb %ymm4, %eax
> -       testl   %eax, %eax
> -L(first_vec_x3):
> +L(last_vec_x3):
>         tzcntl  %eax, %eax
> -       addq    $(VEC_SIZE * 3), %rax
> +       subq    $-(VEC_SIZE * 2 + 1), %rdi
>         addq    %rdi, %rax
>         VZEROUPPER_RETURN
> +# endif
>
>  END (MEMCHR)
>  #endif
> --
> 2.29.2
>

This is the patch I will push unless anyone has objections.
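
For readers skimming the diff, here are two tiny C sketches of the ideas
called out in the commit message; the function and parameter names are made
up for illustration and are not part of the patch.

#include <stddef.h>
#include <stdint.h>

#define PAGE_SIZE 4096
#define VEC_SIZE  32

/* The relaxed page-cross test: an unaligned VEC_SIZE load at `p` is safe
   unless it would run past the end of the current 4 KiB page, so only
   starting offsets in the last VEC_SIZE - 1 bytes of a page take the slow
   path (the old `andl $(2 * VEC_SIZE - 1)` check sent roughly half of all
   starting offsets there).  */
static inline int
may_cross_page (const void *p)
{
  return ((uintptr_t) p & (PAGE_SIZE - 1)) > (PAGE_SIZE - VEC_SIZE);
}

/* The branch-to-cmov replacement in L(first_vec_x0): return NULL when the
   first match index is not below the length bound, which the compiler can
   lower to a cmov instead of a conditional branch.  */
static inline const char *
match_or_null (const char *base, unsigned int idx, unsigned int len)
{
  const char *hit = base + idx;
  return idx < len ? hit : NULL;
}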

^ permalink raw reply	[flat|nested] 20+ messages in thread

* Re: [PATCH v3 3/3] x86: Optimize memchr-evex.S
  2021-05-03 22:58     ` [PATCH v3 3/3] x86: Optimize memchr-evex.S Noah Goldstein
@ 2021-05-03 22:59       ` Noah Goldstein
  0 siblings, 0 replies; 20+ messages in thread
From: Noah Goldstein @ 2021-05-03 22:59 UTC (permalink / raw)
  To: GNU C Library

On Mon, May 3, 2021 at 6:58 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> No bug. This commit optimizes memchr-evex.S. The optimizations include
> replacing some branches with cmovcc, avoiding some branches entirely
> in the less_4x_vec case, making the page cross logic less strict,
> saving some ALU in the alignment process, and most importantly
> increasing ILP in the 4x loop. test-memchr, test-rawmemchr, and
> test-wmemchr are all passing.
>
> Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
> ---
>  sysdeps/x86_64/multiarch/memchr-evex.S | 547 +++++++++++++++----------
>  1 file changed, 322 insertions(+), 225 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S
> index 6dd5d67b90..81d5cd6486 100644
> --- a/sysdeps/x86_64/multiarch/memchr-evex.S
> +++ b/sysdeps/x86_64/multiarch/memchr-evex.S
> @@ -26,14 +26,28 @@
>
>  # ifdef USE_AS_WMEMCHR
>  #  define VPBROADCAST  vpbroadcastd
> -#  define VPCMP                vpcmpd
> -#  define SHIFT_REG    r8d
> +#  define VPMINU       vpminud
> +#  define VPCMP        vpcmpd
> +#  define VPCMPEQ      vpcmpeqd
> +#  define CHAR_SIZE    4
>  # else
>  #  define VPBROADCAST  vpbroadcastb
> -#  define VPCMP                vpcmpb
> -#  define SHIFT_REG    ecx
> +#  define VPMINU       vpminub
> +#  define VPCMP        vpcmpb
> +#  define VPCMPEQ      vpcmpeqb
> +#  define CHAR_SIZE    1
>  # endif
>
> +# ifdef USE_AS_RAWMEMCHR
> +#  define RAW_PTR_REG  rcx
> +#  define ALGN_PTR_REG rdi
> +# else
> +#  define RAW_PTR_REG  rdi
> +#  define ALGN_PTR_REG rcx
> +# endif
> +
> +# define XMMZERO       xmm23
> +# define YMMZERO       ymm23
>  # define XMMMATCH      xmm16
>  # define YMMMATCH      ymm16
>  # define YMM1          ymm17
> @@ -44,6 +58,8 @@
>  # define YMM6          ymm22
>
>  # define VEC_SIZE 32
> +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
> +# define PAGE_SIZE 4096
>
>         .section .text.evex,"ax",@progbits
>  ENTRY (MEMCHR)
> @@ -51,11 +67,7 @@ ENTRY (MEMCHR)
>         /* Check for zero length.  */
>         test    %RDX_LP, %RDX_LP
>         jz      L(zero)
> -# endif
> -       movl    %edi, %ecx
> -# ifdef USE_AS_WMEMCHR
> -       shl     $2, %RDX_LP
> -# else
> +
>  #  ifdef __ILP32__
>         /* Clear the upper 32 bits.  */
>         movl    %edx, %edx
> @@ -64,318 +76,403 @@ ENTRY (MEMCHR)
>         /* Broadcast CHAR to YMMMATCH.  */
>         VPBROADCAST %esi, %YMMMATCH
>         /* Check if we may cross page boundary with one vector load.  */
> -       andl    $(2 * VEC_SIZE - 1), %ecx
> -       cmpl    $VEC_SIZE, %ecx
> -       ja      L(cros_page_boundary)
> +       movl    %edi, %eax
> +       andl    $(PAGE_SIZE - 1), %eax
> +       cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
> +       ja      L(cross_page_boundary)
>
>         /* Check the first VEC_SIZE bytes.  */
> -       VPCMP   $0, (%rdi), %YMMMATCH, %k1
> -       kmovd   %k1, %eax
> -       testl   %eax, %eax
> -
> +       VPCMP   $0, (%rdi), %YMMMATCH, %k0
> +       kmovd   %k0, %eax
>  # ifndef USE_AS_RAWMEMCHR
> -       jnz     L(first_vec_x0_check)
> -       /* Adjust length and check the end of data.  */
> -       subq    $VEC_SIZE, %rdx
> -       jbe     L(zero)
> +       /* If length < CHAR_PER_VEC handle special.  */
> +       cmpq    $CHAR_PER_VEC, %rdx
> +       jbe     L(first_vec_x0)
> +# endif
> +       testl   %eax, %eax
> +       jz      L(aligned_more)
> +       tzcntl  %eax, %eax
> +# ifdef USE_AS_WMEMCHR
> +       /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
> +       leaq    (%rdi, %rax, CHAR_SIZE), %rax
>  # else
> -       jnz     L(first_vec_x0)
> +       addq    %rdi, %rax
>  # endif
> -
> -       /* Align data for aligned loads in the loop.  */
> -       addq    $VEC_SIZE, %rdi
> -       andl    $(VEC_SIZE - 1), %ecx
> -       andq    $-VEC_SIZE, %rdi
> +       ret
>
>  # ifndef USE_AS_RAWMEMCHR
> -       /* Adjust length.  */
> -       addq    %rcx, %rdx
> -
> -       subq    $(VEC_SIZE * 4), %rdx
> -       jbe     L(last_4x_vec_or_less)
> -# endif
> -       jmp     L(more_4x_vec)
> +L(zero):
> +       xorl    %eax, %eax
> +       ret
>
> +       .p2align 5
> +L(first_vec_x0):
> +       /* Check if first match was before length.  */
> +       tzcntl  %eax, %eax
> +       xorl    %ecx, %ecx
> +       cmpl    %eax, %edx
> +       leaq    (%rdi, %rax, CHAR_SIZE), %rax
> +       cmovle  %rcx, %rax
> +       ret
> +# else
> +       /* NB: first_vec_x0 is 17 bytes which will leave
> +          cross_page_boundary (which is relatively cold) close enough
> +          to ideal alignment. So only realign L(cross_page_boundary) if
> +          rawmemchr.  */
>         .p2align 4
> -L(cros_page_boundary):
> -       andl    $(VEC_SIZE - 1), %ecx
> +# endif
> +L(cross_page_boundary):
> +       /* Save pointer before aligning as its original value is
> +          necessary for computing return address if byte is found or
> +          adjusting length if it is not and this is memchr.  */
> +       movq    %rdi, %rcx
> +       /* Align data to VEC_SIZE. ALGN_PTR_REG is rcx for memchr and rdi
> +          for rawmemchr.  */
> +       andq    $-VEC_SIZE, %ALGN_PTR_REG
> +       VPCMP   $0, (%ALGN_PTR_REG), %YMMMATCH, %k0
> +       kmovd   %k0, %r8d
>  # ifdef USE_AS_WMEMCHR
> -       /* NB: Divide shift count by 4 since each bit in K1 represent 4
> +       /* NB: Divide shift count by 4 since each bit in K0 represent 4
>            bytes.  */
> -       movl    %ecx, %SHIFT_REG
> -       sarl    $2, %SHIFT_REG
> +       sarl    $2, %eax
> +# endif
> +# ifndef USE_AS_RAWMEMCHR
> +       movl    $(PAGE_SIZE / CHAR_SIZE), %esi
> +       subl    %eax, %esi
>  # endif
> -       andq    $-VEC_SIZE, %rdi
> -       VPCMP   $0, (%rdi), %YMMMATCH, %k1
> -       kmovd   %k1, %eax
> -       /* Remove the leading bytes.  */
> -       sarxl   %SHIFT_REG, %eax, %eax
> -       testl   %eax, %eax
> -       jz      L(aligned_more)
> -       tzcntl  %eax, %eax
>  # ifdef USE_AS_WMEMCHR
> -       /* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
> -       sall    $2, %eax
> +       andl    $(CHAR_PER_VEC - 1), %eax
>  # endif
> +       /* Remove the leading bytes.  */
> +       sarxl   %eax, %r8d, %eax
>  # ifndef USE_AS_RAWMEMCHR
>         /* Check the end of data.  */
> -       cmpq    %rax, %rdx
> -       jbe     L(zero)
> +       cmpq    %rsi, %rdx
> +       jbe     L(first_vec_x0)
> +# endif
> +       testl   %eax, %eax
> +       jz      L(cross_page_continue)
> +       tzcntl  %eax, %eax
> +# ifdef USE_AS_WMEMCHR
> +       /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
> +       leaq    (%RAW_PTR_REG, %rax, CHAR_SIZE), %rax
> +# else
> +       addq    %RAW_PTR_REG, %rax
>  # endif
> -       addq    %rdi, %rax
> -       addq    %rcx, %rax
>         ret
>
>         .p2align 4
> -L(aligned_more):
> -# ifndef USE_AS_RAWMEMCHR
> -        /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)"
> -          instead of "(rdx + rcx) - VEC_SIZE" to void possible addition
> -          overflow.  */
> -       negq    %rcx
> -       addq    $VEC_SIZE, %rcx
> +L(first_vec_x1):
> +       tzcntl  %eax, %eax
> +       leaq    VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
> +       ret
>
> -       /* Check the end of data.  */
> -       subq    %rcx, %rdx
> -       jbe     L(zero)
> -# endif
> +       .p2align 4
> +L(first_vec_x2):
> +       tzcntl  %eax, %eax
> +       leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> +       ret
>
> -       addq    $VEC_SIZE, %rdi
> +       .p2align 4
> +L(first_vec_x3):
> +       tzcntl  %eax, %eax
> +       leaq    (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
> +       ret
>
> -# ifndef USE_AS_RAWMEMCHR
> -       subq    $(VEC_SIZE * 4), %rdx
> -       jbe     L(last_4x_vec_or_less)
> -# endif
> +       .p2align 4
> +L(first_vec_x4):
> +       tzcntl  %eax, %eax
> +       leaq    (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
> +       ret
>
> -L(more_4x_vec):
> +       .p2align 5
> +L(aligned_more):
>         /* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
>            since data is only aligned to VEC_SIZE.  */
> -       VPCMP   $0, (%rdi), %YMMMATCH, %k1
> -       kmovd   %k1, %eax
> -       testl   %eax, %eax
> -       jnz     L(first_vec_x0)
>
> -       VPCMP   $0, VEC_SIZE(%rdi), %YMMMATCH, %k1
> -       kmovd   %k1, %eax
> +# ifndef USE_AS_RAWMEMCHR
> +       /* Align data to VEC_SIZE.  */
> +L(cross_page_continue):
> +       xorl    %ecx, %ecx
> +       subl    %edi, %ecx
> +       andq    $-VEC_SIZE, %rdi
> +       /* esi is for adjusting length to see if near the end.  */
> +       leal    (VEC_SIZE * 5)(%rdi, %rcx), %esi
> +#  ifdef USE_AS_WMEMCHR
> +       /* NB: Divide bytes by 4 to get the wchar_t count.  */
> +       sarl    $2, %esi
> +#  endif
> +# else
> +       andq    $-VEC_SIZE, %rdi
> +L(cross_page_continue):
> +# endif
> +       /* Load first VEC regardless.  */
> +       VPCMP   $0, (VEC_SIZE)(%rdi), %YMMMATCH, %k0
> +       kmovd   %k0, %eax
> +# ifndef USE_AS_RAWMEMCHR
> +       /* Adjust length. If near end handle specially.  */
> +       subq    %rsi, %rdx
> +       jbe     L(last_4x_vec_or_less)
> +# endif
>         testl   %eax, %eax
>         jnz     L(first_vec_x1)
>
> -       VPCMP   $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
> -       kmovd   %k1, %eax
> +       VPCMP   $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
> +       kmovd   %k0, %eax
>         testl   %eax, %eax
>         jnz     L(first_vec_x2)
>
> -       VPCMP   $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
> -       kmovd   %k1, %eax
> +       VPCMP   $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
> +       kmovd   %k0, %eax
>         testl   %eax, %eax
>         jnz     L(first_vec_x3)
>
> -       addq    $(VEC_SIZE * 4), %rdi
> +       VPCMP   $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
> +       kmovd   %k0, %eax
> +       testl   %eax, %eax
> +       jnz     L(first_vec_x4)
> +
>
>  # ifndef USE_AS_RAWMEMCHR
> -       subq    $(VEC_SIZE * 4), %rdx
> -       jbe     L(last_4x_vec_or_less)
> -# endif
> +       /* Check if at last CHAR_PER_VEC * 4 length.  */
> +       subq    $(CHAR_PER_VEC * 4), %rdx
> +       jbe     L(last_4x_vec_or_less_cmpeq)
> +       addq    $VEC_SIZE, %rdi
>
> -       /* Align data to 4 * VEC_SIZE.  */
> -       movq    %rdi, %rcx
> -       andl    $(4 * VEC_SIZE - 1), %ecx
> +       /* Align data to VEC_SIZE * 4 for the loop and readjust length.
> +        */
> +#  ifdef USE_AS_WMEMCHR
> +       movl    %edi, %ecx
>         andq    $-(4 * VEC_SIZE), %rdi
> -
> -# ifndef USE_AS_RAWMEMCHR
> -       /* Adjust length.  */
> +       andl    $(VEC_SIZE * 4 - 1), %ecx
> +       /* NB: Divide bytes by 4 to get the wchar_t count.  */
> +       sarl    $2, %ecx
>         addq    %rcx, %rdx
> +#  else
> +       addq    %rdi, %rdx
> +       andq    $-(4 * VEC_SIZE), %rdi
> +       subq    %rdi, %rdx
> +#  endif
> +# else
> +       addq    $VEC_SIZE, %rdi
> +       andq    $-(4 * VEC_SIZE), %rdi
>  # endif
>
> +       vpxorq  %XMMZERO, %XMMZERO, %XMMZERO
> +
> +       /* Compare 4 * VEC at a time forward.  */
>         .p2align 4
>  L(loop_4x_vec):
> -       /* Compare 4 * VEC at a time forward.  */
> -       VPCMP   $0, (%rdi), %YMMMATCH, %k1
> -       VPCMP   $0, VEC_SIZE(%rdi), %YMMMATCH, %k2
> -       kord    %k1, %k2, %k5
> -       VPCMP   $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3
> -       VPCMP   $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4
> -
> -       kord    %k3, %k4, %k6
> -       kortestd %k5, %k6
> -       jnz     L(4x_vec_end)
> -
> -       addq    $(VEC_SIZE * 4), %rdi
> -
> +       /* It would be possible to save some instructions using 4x VPCMP
> +          but the bottleneck on port 5 makes it not worth it.  */
> +       VPCMP   $4, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k1
> +       /* xor will set bytes that match esi to zero.  */
> +       vpxorq  (VEC_SIZE * 5)(%rdi), %YMMMATCH, %YMM2
> +       vpxorq  (VEC_SIZE * 6)(%rdi), %YMMMATCH, %YMM3
> +       VPCMP   $0, (VEC_SIZE * 7)(%rdi), %YMMMATCH, %k3
> +       /* Reduce VEC2 / VEC3 with min and VEC1 with zero mask.  */
> +       VPMINU  %YMM2, %YMM3, %YMM3 {%k1} {z}
> +       VPCMP   $0, %YMM3, %YMMZERO, %k2
>  # ifdef USE_AS_RAWMEMCHR
> -       jmp     L(loop_4x_vec)
> +       subq    $-(VEC_SIZE * 4), %rdi
> +       kortestd %k2, %k3
> +       jz      L(loop_4x_vec)
>  # else
> -       subq    $(VEC_SIZE * 4), %rdx
> +       kortestd %k2, %k3
> +       jnz     L(loop_4x_vec_end)
> +
> +       subq    $-(VEC_SIZE * 4), %rdi
> +
> +       subq    $(CHAR_PER_VEC * 4), %rdx
>         ja      L(loop_4x_vec)
>
> +       /* Fall through into the case where fewer than 4 vectors of
> +          length remain.  */
> +       VPCMP   $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
> +       kmovd   %k0, %eax
> +       addq    $(VEC_SIZE * 3), %rdi
> +       .p2align 4
>  L(last_4x_vec_or_less):
> -       /* Less than 4 * VEC and aligned to VEC_SIZE.  */
> -       addl    $(VEC_SIZE * 2), %edx
> -       jle     L(last_2x_vec)
> -
> -       VPCMP   $0, (%rdi), %YMMMATCH, %k1
> -       kmovd   %k1, %eax
> +       /* Check if first VEC contained match.  */
>         testl   %eax, %eax
> -       jnz     L(first_vec_x0)
> +       jnz     L(first_vec_x1_check)
>
> -       VPCMP   $0, VEC_SIZE(%rdi), %YMMMATCH, %k1
> -       kmovd   %k1, %eax
> -       testl   %eax, %eax
> -       jnz     L(first_vec_x1)
> +       /* If remaining length > CHAR_PER_VEC * 2.  */
> +       addl    $(CHAR_PER_VEC * 2), %edx
> +       jg      L(last_4x_vec)
>
> -       VPCMP   $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
> -       kmovd   %k1, %eax
> -       testl   %eax, %eax
> +L(last_2x_vec):
> +       /* If remaining length < CHAR_PER_VEC.  */
> +       addl    $CHAR_PER_VEC, %edx
> +       jle     L(zero_end)
>
> -       jnz     L(first_vec_x2_check)
> -       subl    $VEC_SIZE, %edx
> -       jle     L(zero)
> +       /* Check VEC2 and compare any match with remaining length.  */
> +       VPCMP   $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
> +       kmovd   %k0, %eax
> +       tzcntl  %eax, %eax
> +       cmpl    %eax, %edx
> +       jbe     L(set_zero_end)
> +       leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> +L(zero_end):
> +       ret
>
> -       VPCMP   $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
> -       kmovd   %k1, %eax
> -       testl   %eax, %eax
>
> -       jnz     L(first_vec_x3_check)
> +       .p2align 4
> +L(first_vec_x1_check):
> +       tzcntl  %eax, %eax
> +       /* Adjust length.  */
> +       subl    $-(CHAR_PER_VEC * 4), %edx
> +       /* Check if match within remaining length.  */
> +       cmpl    %eax, %edx
> +       jbe     L(set_zero_end)
> +       /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
> +       leaq    VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
> +       ret
> +L(set_zero_end):
>         xorl    %eax, %eax
>         ret
>
>         .p2align 4
> -L(last_2x_vec):
> -       addl    $(VEC_SIZE * 2), %edx
> -       VPCMP   $0, (%rdi), %YMMMATCH, %k1
> +L(loop_4x_vec_end):
> +# endif
> +       /* rawmemchr will fall through into this if match was found in
> +          loop.  */
> +
> +       /* k1 holds the NOT of the matches with VEC1.  */
>         kmovd   %k1, %eax
> -       testl   %eax, %eax
> +# ifdef USE_AS_WMEMCHR
> +       subl    $((1 << CHAR_PER_VEC) - 1), %eax
> +# else
> +       incl    %eax
> +# endif
> +       jnz     L(last_vec_x1_return)
>
> -       jnz     L(first_vec_x0_check)
> -       subl    $VEC_SIZE, %edx
> -       jle     L(zero)
> +       VPCMP   $0, %YMM2, %YMMZERO, %k0
> +       kmovd   %k0, %eax
> +       testl   %eax, %eax
> +       jnz     L(last_vec_x2_return)
>
> -       VPCMP   $0, VEC_SIZE(%rdi), %YMMMATCH, %k1
> -       kmovd   %k1, %eax
> +       kmovd   %k2, %eax
>         testl   %eax, %eax
> -       jnz     L(first_vec_x1_check)
> -       xorl    %eax, %eax
> -       ret
> +       jnz     L(last_vec_x3_return)
>
> -       .p2align 4
> -L(first_vec_x0_check):
> +       kmovd   %k3, %eax
>         tzcntl  %eax, %eax
> -# ifdef USE_AS_WMEMCHR
> -       /* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
> -       sall    $2, %eax
> +# ifdef USE_AS_RAWMEMCHR
> +       leaq    (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
> +# else
> +       leaq    (VEC_SIZE * 7)(%rdi, %rax, CHAR_SIZE), %rax
>  # endif
> -       /* Check the end of data.  */
> -       cmpq    %rax, %rdx
> -       jbe     L(zero)
> -       addq    %rdi, %rax
>         ret
>
>         .p2align 4
> -L(first_vec_x1_check):
> +L(last_vec_x1_return):
>         tzcntl  %eax, %eax
> -# ifdef USE_AS_WMEMCHR
> -       /* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
> -       sall    $2, %eax
> -# endif
> -       /* Check the end of data.  */
> -       cmpq    %rax, %rdx
> -       jbe     L(zero)
> -       addq    $VEC_SIZE, %rax
> +# ifdef USE_AS_RAWMEMCHR
> +#  ifdef USE_AS_WMEMCHR
> +       /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
> +       leaq    (%rdi, %rax, CHAR_SIZE), %rax
> +#  else
>         addq    %rdi, %rax
> -       ret
> -
> -       .p2align 4
> -L(first_vec_x2_check):
> -       tzcntl  %eax, %eax
> -# ifdef USE_AS_WMEMCHR
> -       /* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
> -       sall    $2, %eax
> +#  endif
> +# else
> +       /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
> +       leaq    (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
>  # endif
> -       /* Check the end of data.  */
> -       cmpq    %rax, %rdx
> -       jbe     L(zero)
> -       addq    $(VEC_SIZE * 2), %rax
> -       addq    %rdi, %rax
>         ret
>
>         .p2align 4
> -L(first_vec_x3_check):
> +L(last_vec_x2_return):
>         tzcntl  %eax, %eax
> -# ifdef USE_AS_WMEMCHR
> -       /* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
> -       sall    $2, %eax
> +# ifdef USE_AS_RAWMEMCHR
> +       /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
> +       leaq    VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
> +# else
> +       /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
> +       leaq    (VEC_SIZE * 5)(%rdi, %rax, CHAR_SIZE), %rax
>  # endif
> -       /* Check the end of data.  */
> -       cmpq    %rax, %rdx
> -       jbe     L(zero)
> -       addq    $(VEC_SIZE * 3), %rax
> -       addq    %rdi, %rax
>         ret
>
>         .p2align 4
> -L(zero):
> -       xorl    %eax, %eax
> -       ret
> -# endif
> -
> -       .p2align 4
> -L(first_vec_x0):
> +L(last_vec_x3_return):
>         tzcntl  %eax, %eax
> -# ifdef USE_AS_WMEMCHR
> -       /* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
> -       leaq    (%rdi, %rax, 4), %rax
> +# ifdef USE_AS_RAWMEMCHR
> +       /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
> +       leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
>  # else
> -       addq    %rdi, %rax
> +       /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
> +       leaq    (VEC_SIZE * 6)(%rdi, %rax, CHAR_SIZE), %rax
>  # endif
>         ret
>
> +
> +# ifndef USE_AS_RAWMEMCHR
> +L(last_4x_vec_or_less_cmpeq):
> +       VPCMP   $0, (VEC_SIZE * 5)(%rdi), %YMMMATCH, %k0
> +       kmovd   %k0, %eax
> +       subq    $-(VEC_SIZE * 4), %rdi
> +       /* Check first VEC regardless.  */
> +       testl   %eax, %eax
> +       jnz     L(first_vec_x1_check)
> +
> +       /* If remaining length <= CHAR_PER_VEC * 2.  */
> +       addl    $(CHAR_PER_VEC * 2), %edx
> +       jle     L(last_2x_vec)
> +
>         .p2align 4
> -L(first_vec_x1):
> +L(last_4x_vec):
> +       VPCMP   $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
> +       kmovd   %k0, %eax
> +       testl   %eax, %eax
> +       jnz     L(last_vec_x2)
> +
> +
> +       VPCMP   $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
> +       kmovd   %k0, %eax
> +       /* Create mask for possible matches within remaining length.  */
> +#  ifdef USE_AS_WMEMCHR
> +       movl    $((1 << (CHAR_PER_VEC * 2)) - 1), %ecx
> +       bzhil   %edx, %ecx, %ecx
> +#  else
> +       movq    $-1, %rcx
> +       bzhiq   %rdx, %rcx, %rcx
> +#  endif
> +       /* Test matches in data against length match.  */
> +       andl    %ecx, %eax
> +       jnz     L(last_vec_x3)
> +
> +       /* If remaining length <= CHAR_PER_VEC * 3 (note this is after
> +          remaining length was found to be > CHAR_PER_VEC * 2).  */
> +       subl    $CHAR_PER_VEC, %edx
> +       jbe     L(zero_end2)
> +
> +
> +       VPCMP   $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
> +       kmovd   %k0, %eax
> +       /* Shift remaining length mask for last VEC.  */
> +#  ifdef USE_AS_WMEMCHR
> +       shrl    $CHAR_PER_VEC, %ecx
> +#  else
> +       shrq    $CHAR_PER_VEC, %rcx
> +#  endif
> +       andl    %ecx, %eax
> +       jz      L(zero_end2)
>         tzcntl  %eax, %eax
> -# ifdef USE_AS_WMEMCHR
> -       /* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
> -       leaq    VEC_SIZE(%rdi, %rax, 4), %rax
> -# else
> -       addq    $VEC_SIZE, %rax
> -       addq    %rdi, %rax
> -# endif
> +       leaq    (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
> +L(zero_end2):
>         ret
>
> -       .p2align 4
> -L(first_vec_x2):
> +L(last_vec_x2):
>         tzcntl  %eax, %eax
> -# ifdef USE_AS_WMEMCHR
> -       /* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
> -       leaq    (VEC_SIZE * 2)(%rdi, %rax, 4), %rax
> -# else
> -       addq    $(VEC_SIZE * 2), %rax
> -       addq    %rdi, %rax
> -# endif
> +       leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
>         ret
>
>         .p2align 4
> -L(4x_vec_end):
> -       kmovd   %k1, %eax
> -       testl   %eax, %eax
> -       jnz     L(first_vec_x0)
> -       kmovd   %k2, %eax
> -       testl   %eax, %eax
> -       jnz     L(first_vec_x1)
> -       kmovd   %k3, %eax
> -       testl   %eax, %eax
> -       jnz     L(first_vec_x2)
> -       kmovd   %k4, %eax
> -       testl   %eax, %eax
> -L(first_vec_x3):
> +L(last_vec_x3):
>         tzcntl  %eax, %eax
> -# ifdef USE_AS_WMEMCHR
> -       /* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
> -       leaq    (VEC_SIZE * 3)(%rdi, %rax, 4), %rax
> -# else
> -       addq    $(VEC_SIZE * 3), %rax
> -       addq    %rdi, %rax
> -# endif
> +       leaq    (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
>         ret
> +# endif
>
>  END (MEMCHR)
>  #endif
> --
> 2.29.2
>

This is the patch I will push unless anyone has any objections.
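
For reference, a scalar C model of the combining step that gives the evex
loop its extra ILP: XOR against the broadcast character produces zero
exactly at matching elements, an unsigned MIN keeps a zero if either of two
vectors matched, and the zero-masked merge folds in the first vector's
not-equal compare, so one compare-against-zero covers three of the four
vectors.  The function below models a single byte lane under that
assumption; the names are illustrative, not from the patch.

#include <stdint.h>

static inline uint8_t
combine_lane (uint8_t v1, uint8_t v2, uint8_t v3, uint8_t c)
{
  uint8_t x2 = v2 ^ c;              /* 0 iff v2 == c */
  uint8_t x3 = v3 ^ c;              /* 0 iff v3 == c */
  uint8_t m  = x2 < x3 ? x2 : x3;   /* VPMINU: 0 iff v2 or v3 matched */
  return v1 == c ? 0 : m;           /* {%k1}{z} merge: also 0 if v1 matched */
}

A lane value of zero therefore means "match somewhere in VEC1-VEC3", so the
loop can detect any match with a single kortestd of k2 and k3 (k3 covering
the fourth vector) instead of testing each vector's mask separately.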

^ permalink raw reply	[flat|nested] 20+ messages in thread

end of thread, other threads:[~2021-05-03 22:59 UTC | newest]

Thread overview: 20+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-05-03  8:44 [PATCH v1 1/3] Bench: Expand bench-memchr.c Noah Goldstein
2021-05-03  8:44 ` [PATCH v1 2/3] x86: Optimize memchr-avx2.S Noah Goldstein
2021-05-03 18:50   ` H.J. Lu
2021-05-03 20:06     ` Noah Goldstein
2021-05-03 20:06   ` [PATCH v2 " Noah Goldstein
2021-05-03 20:06     ` [PATCH v2 3/3] x86: Optimize memchr-evex.S Noah Goldstein
2021-05-03 22:26       ` H.J. Lu
2021-05-03 22:58         ` Noah Goldstein
2021-05-03 22:25     ` [PATCH v2 2/3] x86: Optimize memchr-avx2.S H.J. Lu
2021-05-03 22:58       ` Noah Goldstein
2021-05-03 22:58   ` [PATCH v3 " Noah Goldstein
2021-05-03 22:58     ` [PATCH v3 3/3] x86: Optimize memchr-evex.S Noah Goldstein
2021-05-03 22:59       ` Noah Goldstein
2021-05-03 22:59     ` [PATCH v3 2/3] x86: Optimize memchr-avx2.S Noah Goldstein
2021-05-03  8:44 ` [PATCH v1 3/3] x86: Optimize memchr-evex.S Noah Goldstein
2021-05-03 18:58   ` H.J. Lu
2021-05-03 20:06     ` Noah Goldstein
2021-05-03 17:17 ` [PATCH v1 1/3] Bench: Expand bench-memchr.c H.J. Lu
2021-05-03 19:51   ` Noah Goldstein
2021-05-03 20:59     ` H.J. Lu
