public inbox for libc-alpha@sourceware.org
* [PATCH v5 1/2] x86: Optimize strlen-evex.S
@ 2021-04-19 23:36 Noah Goldstein
  2021-04-19 23:36 ` [PATCH v5 2/2] x86: Optimize strlen-avx2.S Noah Goldstein
  2021-04-20  1:01 ` [PATCH v5 1/2] x86: Optimize strlen-evex.S H.J. Lu
  0 siblings, 2 replies; 24+ messages in thread
From: Noah Goldstein @ 2021-04-19 23:36 UTC (permalink / raw)
  To: libc-alpha

No bug. This commit optimizes strlen-evex.S. The
optimizations are mostly small things but they add up to roughly
10-30% performance improvement for strlen. The results for strnlen are a
bit more ambiguous. test-strlen, test-strnlen, test-wcslen, and
test-wcsnlen are all passing.
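
For reviewers, a minimal C model of two of the tricks used below: the
single-vector page-cross check at entry and the bts+tzcnt combination in
L(first_vec_x0) that folds the maxlen bound into the match mask.  The
helper names and the scalar emulation are illustrative only, not part of
the patch:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE 4096
#define VEC_SIZE  32

/* Nonzero if a VEC_SIZE load from P would cross a page boundary,
   mirroring the andl/cmpl check at the entry of the function.  */
static int
may_cross_page (const void *p)
{
  return ((uintptr_t) p & (PAGE_SIZE - 1)) > (PAGE_SIZE - VEC_SIZE);
}

/* Scalar model of L(first_vec_x0) for strnlen: setting bit MAXLEN in
   the match mask makes a single tzcnt return the minimum of MAXLEN and
   the index of the first null.  */
static unsigned int
first_vec_x0 (uint64_t match_mask, unsigned int maxlen)
{
  match_mask |= 1ull << maxlen;   /* maxlen <= CHAR_PER_VEC here.  */
  return __builtin_ctzll (match_mask);
}

int
main (void)
{
  printf ("%d %d\n", may_cross_page ((void *) 0xff0),
          may_cross_page ((void *) 0x100));
  printf ("%u %u\n", first_vec_x0 (0x80, 16), first_vec_x0 (0, 16));
  return 0;
}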

Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
---
 sysdeps/x86_64/multiarch/strlen-evex.S | 581 ++++++++++++++-----------
 1 file changed, 317 insertions(+), 264 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/strlen-evex.S b/sysdeps/x86_64/multiarch/strlen-evex.S
index 0583819078..4bf6874b82 100644
--- a/sysdeps/x86_64/multiarch/strlen-evex.S
+++ b/sysdeps/x86_64/multiarch/strlen-evex.S
@@ -29,11 +29,13 @@
 # ifdef USE_AS_WCSLEN
 #  define VPCMP		vpcmpd
 #  define VPMINU	vpminud
-#  define SHIFT_REG	r9d
+#  define SHIFT_REG ecx
+#  define CHAR_SIZE	4
 # else
 #  define VPCMP		vpcmpb
 #  define VPMINU	vpminub
-#  define SHIFT_REG	ecx
+#  define SHIFT_REG edx
+#  define CHAR_SIZE	1
 # endif
 
 # define XMMZERO	xmm16
@@ -46,132 +48,165 @@
 # define YMM6		ymm22
 
 # define VEC_SIZE 32
+# define PAGE_SIZE 4096
+# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
 
 	.section .text.evex,"ax",@progbits
 ENTRY (STRLEN)
 # ifdef USE_AS_STRNLEN
-	/* Check for zero length.  */
+	/* Check zero length.  */
 	test	%RSI_LP, %RSI_LP
 	jz	L(zero)
-#  ifdef USE_AS_WCSLEN
-	shl	$2, %RSI_LP
-#  elif defined __ILP32__
+#  ifdef __ILP32__
 	/* Clear the upper 32 bits.  */
 	movl	%esi, %esi
 #  endif
 	mov	%RSI_LP, %R8_LP
 # endif
-	movl	%edi, %ecx
-	movq	%rdi, %rdx
+	movl	%edi, %eax
 	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
-
+	/* Clear high bits from edi. Only keeping bits relevant to page
+	   cross check.  */
+	andl	$(PAGE_SIZE - 1), %eax
 	/* Check if we may cross page boundary with one vector load.  */
-	andl	$(2 * VEC_SIZE - 1), %ecx
-	cmpl	$VEC_SIZE, %ecx
-	ja	L(cros_page_boundary)
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	ja	L(cross_page_boundary)
 
 	/* Check the first VEC_SIZE bytes.  Each bit in K0 represents a
 	   null byte.  */
 	VPCMP	$0, (%rdi), %YMMZERO, %k0
 	kmovd	%k0, %eax
-	testl	%eax, %eax
-
 # ifdef USE_AS_STRNLEN
-	jnz	L(first_vec_x0_check)
-	/* Adjust length and check the end of data.  */
-	subq	$VEC_SIZE, %rsi
-	jbe	L(max)
-# else
-	jnz	L(first_vec_x0)
+	/* If length < CHAR_PER_VEC handle special.  */
+	cmpq	$CHAR_PER_VEC, %rsi
+	jbe	L(first_vec_x0)
 # endif
-
-	/* Align data for aligned loads in the loop.  */
-	addq	$VEC_SIZE, %rdi
-	andl	$(VEC_SIZE - 1), %ecx
-	andq	$-VEC_SIZE, %rdi
-
+	testl	%eax, %eax
+	jz	L(aligned_more)
+	tzcntl	%eax, %eax
+	ret
 # ifdef USE_AS_STRNLEN
-	/* Adjust length.  */
-	addq	%rcx, %rsi
+L(zero):
+	xorl	%eax, %eax
+	ret
 
-	subq	$(VEC_SIZE * 4), %rsi
-	jbe	L(last_4x_vec_or_less)
+	.p2align 4
+L(first_vec_x0):
+	/* Set bit for max len so that tzcnt will return min of max len
+	   and position of first match.  */
+	btsq	%rsi, %rax
+	tzcntl	%eax, %eax
+	ret
 # endif
-	jmp	L(more_4x_vec)
 
 	.p2align 4
-L(cros_page_boundary):
-	andl	$(VEC_SIZE - 1), %ecx
-	andq	$-VEC_SIZE, %rdi
-
-# ifdef USE_AS_WCSLEN
-	/* NB: Divide shift count by 4 since each bit in K0 represent 4
-	   bytes.  */
-	movl	%ecx, %SHIFT_REG
-	sarl	$2, %SHIFT_REG
+L(first_vec_x1):
+	tzcntl	%eax, %eax
+	/* Safe to use 32 bit instructions as these are only called for
+	   size = [1, 159].  */
+# ifdef USE_AS_STRNLEN
+	/* Use ecx which was computed earlier to compute correct value.
+	 */
+	leal	-(CHAR_PER_VEC * 4 + 1)(%rcx, %rax), %eax
+# else
+	subl	%edx, %edi
+#  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+	sarl	$2, %edi
+#  endif
+	leal	CHAR_PER_VEC(%rdi, %rax), %eax
 # endif
-	VPCMP	$0, (%rdi), %YMMZERO, %k0
-	kmovd	%k0, %eax
+	ret
 
-	/* Remove the leading bytes.  */
-	sarxl	%SHIFT_REG, %eax, %eax
-	testl	%eax, %eax
-	jz	L(aligned_more)
+	.p2align 4
+L(first_vec_x2):
 	tzcntl	%eax, %eax
-# ifdef USE_AS_WCSLEN
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	sall	$2, %eax
-# endif
+	/* Safe to use 32 bit instructions as these are only called for
+	   size = [1, 159].  */
 # ifdef USE_AS_STRNLEN
-	/* Check the end of data.  */
-	cmpq	%rax, %rsi
-	jbe	L(max)
-# endif
-	addq	%rdi, %rax
-	addq	%rcx, %rax
-	subq	%rdx, %rax
-# ifdef USE_AS_WCSLEN
-	shrq	$2, %rax
+	/* Use ecx which was computed earlier to compute correct value.
+	 */
+	leal	-(CHAR_PER_VEC * 3 + 1)(%rcx, %rax), %eax
+# else
+	subl	%edx, %edi
+#  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+	sarl	$2, %edi
+#  endif
+	leal	(CHAR_PER_VEC * 2)(%rdi, %rax), %eax
 # endif
 	ret
 
 	.p2align 4
-L(aligned_more):
+L(first_vec_x3):
+	tzcntl	%eax, %eax
+	/* Safe to use 32 bit instructions as these are only called for
+	   size = [1, 159].  */
 # ifdef USE_AS_STRNLEN
-        /* "rcx" is less than VEC_SIZE.  Calculate "rdx + rcx - VEC_SIZE"
-	    with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE"
-	    to void possible addition overflow.  */
-	negq	%rcx
-	addq	$VEC_SIZE, %rcx
-
-	/* Check the end of data.  */
-	subq	%rcx, %rsi
-	jbe	L(max)
+	/* Use ecx which was computed earlier to compute correct value.
+	 */
+	leal	-(CHAR_PER_VEC * 2 + 1)(%rcx, %rax), %eax
+# else
+	subl	%edx, %edi
+#  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+	sarl	$2, %edi
+#  endif
+	leal	(CHAR_PER_VEC * 3)(%rdi, %rax), %eax
 # endif
+	ret
 
-	addq	$VEC_SIZE, %rdi
-
+	.p2align 4
+L(first_vec_x4):
+	tzcntl	%eax, %eax
+	/* Safe to use 32 bit instructions as these are only called for
+	   size = [1, 159].  */
 # ifdef USE_AS_STRNLEN
-	subq	$(VEC_SIZE * 4), %rsi
-	jbe	L(last_4x_vec_or_less)
+	/* Use ecx which was computed earlier to compute correct value.
+	 */
+	leal	-(CHAR_PER_VEC + 1)(%rcx, %rax), %eax
+# else
+	subl	%edx, %edi
+#  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+	sarl	$2, %edi
+#  endif
+	leal	(CHAR_PER_VEC * 4)(%rdi, %rax), %eax
 # endif
+	ret
 
-L(more_4x_vec):
+	.p2align 5
+L(aligned_more):
+	movq	%rdi, %rdx
+	/* Align data to VEC_SIZE.  */
+	andq	$-(VEC_SIZE), %rdi
+L(cross_page_continue):
 	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
 	   since data is only aligned to VEC_SIZE.  */
-	VPCMP	$0, (%rdi), %YMMZERO, %k0
-	kmovd	%k0, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x0)
-
+# ifdef USE_AS_STRNLEN
+	/* + CHAR_SIZE because it simplifies the logic in
+	   last_4x_vec_or_less.  */
+	leaq	(VEC_SIZE * 5 + CHAR_SIZE)(%rdi), %rcx
+	subq	%rdx, %rcx
+#  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+	sarl	$2, %ecx
+#  endif
+# endif
+	/* Load first VEC regardless.  */
 	VPCMP	$0, VEC_SIZE(%rdi), %YMMZERO, %k0
+# ifdef USE_AS_STRNLEN
+	/* Adjust length. If near end handle specially.  */
+	subq	%rcx, %rsi
+	jb	L(last_4x_vec_or_less)
+# endif
 	kmovd	%k0, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x1)
 
 	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
 	kmovd	%k0, %eax
-	testl	%eax, %eax
+	test	%eax, %eax
 	jnz	L(first_vec_x2)
 
 	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
@@ -179,258 +214,276 @@ L(more_4x_vec):
 	testl	%eax, %eax
 	jnz	L(first_vec_x3)
 
-	addq	$(VEC_SIZE * 4), %rdi
-
-# ifdef USE_AS_STRNLEN
-	subq	$(VEC_SIZE * 4), %rsi
-	jbe	L(last_4x_vec_or_less)
-# endif
-
-	/* Align data to 4 * VEC_SIZE.  */
-	movq	%rdi, %rcx
-	andl	$(4 * VEC_SIZE - 1), %ecx
-	andq	$-(4 * VEC_SIZE), %rdi
+	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x4)
 
+	addq	$VEC_SIZE, %rdi
 # ifdef USE_AS_STRNLEN
-	/* Adjust length.  */
+	/* Check if at last VEC_SIZE * 4 length.  */
+	cmpq	$(CHAR_PER_VEC * 4 - 1), %rsi
+	jbe	L(last_4x_vec_or_less_load)
+	movl	%edi, %ecx
+	andl	$(VEC_SIZE * 4 - 1), %ecx
+#  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+	sarl	$2, %ecx
+#  endif
+	/* Readjust length.  */
 	addq	%rcx, %rsi
 # endif
+	/* Align data to VEC_SIZE * 4.  */
+	andq	$-(VEC_SIZE * 4), %rdi
 
+	/* Compare 4 * VEC at a time forward.  */
 	.p2align 4
 L(loop_4x_vec):
-	/* Compare 4 * VEC at a time forward.  */
-	VMOVA	(%rdi), %YMM1
-	VMOVA	VEC_SIZE(%rdi), %YMM2
-	VMOVA	(VEC_SIZE * 2)(%rdi), %YMM3
-	VMOVA	(VEC_SIZE * 3)(%rdi), %YMM4
-
-	VPMINU	%YMM1, %YMM2, %YMM5
-	VPMINU	%YMM3, %YMM4, %YMM6
+	/* Load first VEC regardless.  */
+	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM1
+# ifdef USE_AS_STRNLEN
+	/* Break if at end of length.  */
+	subq	$(CHAR_PER_VEC * 4), %rsi
+	jb	L(last_4x_vec_or_less_cmpeq)
+# endif
+	/* Save some code size by microfusing VPMINU with the load. Since
+	   the matches in ymm2/ymm4 can only be returned if there where no
+	   matches in ymm1/ymm3 respectively there is no issue with overlap.
+	 */
+	VPMINU	(VEC_SIZE * 5)(%rdi), %YMM1, %YMM2
+	VMOVA	(VEC_SIZE * 6)(%rdi), %YMM3
+	VPMINU	(VEC_SIZE * 7)(%rdi), %YMM3, %YMM4
+
+	VPCMP	$0, %YMM2, %YMMZERO, %k0
+	VPCMP	$0, %YMM4, %YMMZERO, %k1
+	subq	$-(VEC_SIZE * 4), %rdi
+	kortestd	%k0, %k1
+	jz	L(loop_4x_vec)
+
+	/* Check if end was in first half.  */
+	kmovd	%k0, %eax
+	subq	%rdx, %rdi
+# ifdef USE_AS_WCSLEN
+	shrq	$2, %rdi
+# endif
+	testl	%eax, %eax
+	jz	L(second_vec_return)
 
-	VPMINU	%YMM5, %YMM6, %YMM5
-	VPCMP	$0, %YMM5, %YMMZERO, %k0
-	ktestd	%k0, %k0
-	jnz	L(4x_vec_end)
+	VPCMP	$0, %YMM1, %YMMZERO, %k2
+	kmovd	%k2, %edx
+	/* Combine VEC1 matches (edx) with VEC2 matches (eax).  */
+# ifdef USE_AS_WCSLEN
+	sall	$CHAR_PER_VEC, %eax
+	orl	%edx, %eax
+	tzcntl	%eax, %eax
+# else
+	salq	$CHAR_PER_VEC, %rax
+	orq	%rdx, %rax
+	tzcntq	%rax, %rax
+# endif
+	addq	%rdi, %rax
+	ret
 
-	addq	$(VEC_SIZE * 4), %rdi
 
-# ifndef USE_AS_STRNLEN
-	jmp	L(loop_4x_vec)
-# else
-	subq	$(VEC_SIZE * 4), %rsi
-	ja	L(loop_4x_vec)
+# ifdef USE_AS_STRNLEN
 
+L(last_4x_vec_or_less_load):
+	/* Depending on entry adjust rdi / prepare first VEC in YMM1.  */
+	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM1
+L(last_4x_vec_or_less_cmpeq):
+	VPCMP	$0, %YMM1, %YMMZERO, %k0
+	addq	$(VEC_SIZE * 3), %rdi
 L(last_4x_vec_or_less):
-	/* Less than 4 * VEC and aligned to VEC_SIZE.  */
-	addl	$(VEC_SIZE * 2), %esi
-	jle	L(last_2x_vec)
-
-	VPCMP	$0, (%rdi), %YMMZERO, %k0
 	kmovd	%k0, %eax
+	/* If remaining length > VEC_SIZE * 2. This works if esi is off by
+	   VEC_SIZE * 4.  */
+	testl	$(CHAR_PER_VEC * 2), %esi
+	jnz	L(last_4x_vec)
+
+	/* length may have been negative or positive by an offset of
+	   CHAR_PER_VEC * 4 depending on where this was called from. This
+	   fixes that.  */
+	andl	$(CHAR_PER_VEC * 4 - 1), %esi
 	testl	%eax, %eax
-	jnz	L(first_vec_x0)
+	jnz	L(last_vec_x1_check)
 
-	VPCMP	$0, VEC_SIZE(%rdi), %YMMZERO, %k0
-	kmovd	%k0, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x1)
+	/* Check the end of data.  */
+	subl	$CHAR_PER_VEC, %esi
+	jb	L(max)
 
 	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
 	kmovd	%k0, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x2_check)
-	subl	$VEC_SIZE, %esi
-	jle	L(max)
+	tzcntl	%eax, %eax
+	/* Check the end of data.  */
+	cmpl	%eax, %esi
+	jb	L(max)
 
-	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
-	kmovd	%k0, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x3_check)
+	subq	%rdx, %rdi
+#  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+	sarq	$2, %rdi
+#  endif
+	leaq	(CHAR_PER_VEC * 2)(%rdi, %rax), %rax
+	ret
+L(max):
 	movq	%r8, %rax
+	ret
+# endif
+
+	/* Placed here in strnlen so that the jcc L(last_4x_vec_or_less)
+	   in the 4x VEC loop can use 2 byte encoding.  */
+	.p2align 4
+L(second_vec_return):
+	VPCMP	$0, %YMM3, %YMMZERO, %k0
+	/* Combine YMM3 matches (k0) with YMM4 matches (k1).  */
+# ifdef USE_AS_WCSLEN
+	kunpckbw	%k0, %k1, %k0
+	kmovd	%k0, %eax
+	tzcntl	%eax, %eax
+# else
+	kunpckdq	%k0, %k1, %k0
+	kmovq	%k0, %rax
+	tzcntq	%rax, %rax
+# endif
+	leaq	(CHAR_PER_VEC * 2)(%rdi, %rax), %rax
+	ret
+
+
+# ifdef USE_AS_STRNLEN
+L(last_vec_x1_check):
+	tzcntl	%eax, %eax
+	/* Check the end of data.  */
+	cmpl	%eax, %esi
+	jb	L(max)
+	subq	%rdx, %rdi
 #  ifdef USE_AS_WCSLEN
-	shrq	$2, %rax
+	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+	sarq	$2, %rdi
 #  endif
+	leaq	(CHAR_PER_VEC)(%rdi, %rax), %rax
 	ret
 
 	.p2align 4
-L(last_2x_vec):
-	addl	$(VEC_SIZE * 2), %esi
+L(last_4x_vec):
+	/* Test first 2x VEC normally.  */
+	testl	%eax, %eax
+	jnz	L(last_vec_x1)
 
-	VPCMP	$0, (%rdi), %YMMZERO, %k0
+	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
 	kmovd	%k0, %eax
 	testl	%eax, %eax
-	jnz	L(first_vec_x0_check)
-	subl	$VEC_SIZE, %esi
-	jle	L(max)
+	jnz	L(last_vec_x2)
 
-	VPCMP	$0, VEC_SIZE(%rdi), %YMMZERO, %k0
+	/* Normalize length.  */
+	andl	$(CHAR_PER_VEC * 4 - 1), %esi
+	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
 	kmovd	%k0, %eax
 	testl	%eax, %eax
-	jnz	L(first_vec_x1_check)
-	movq	%r8, %rax
-#  ifdef USE_AS_WCSLEN
-	shrq	$2, %rax
-#  endif
-	ret
+	jnz	L(last_vec_x3)
 
-	.p2align 4
-L(first_vec_x0_check):
+	/* Check the end of data.  */
+	subl	$(CHAR_PER_VEC * 3), %esi
+	jb	L(max)
+
+	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0
+	kmovd	%k0, %eax
 	tzcntl	%eax, %eax
-#  ifdef USE_AS_WCSLEN
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	sall	$2, %eax
-#  endif
 	/* Check the end of data.  */
-	cmpq	%rax, %rsi
-	jbe	L(max)
-	addq	%rdi, %rax
-	subq	%rdx, %rax
+	cmpl	%eax, %esi
+	jb	L(max_end)
+
+	subq	%rdx, %rdi
 #  ifdef USE_AS_WCSLEN
-	shrq	$2, %rax
+	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+	sarq	$2, %rdi
 #  endif
+	leaq	(CHAR_PER_VEC * 4)(%rdi, %rax), %rax
 	ret
 
 	.p2align 4
-L(first_vec_x1_check):
+L(last_vec_x1):
 	tzcntl	%eax, %eax
+	subq	%rdx, %rdi
 #  ifdef USE_AS_WCSLEN
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	sall	$2, %eax
-#  endif
-	/* Check the end of data.  */
-	cmpq	%rax, %rsi
-	jbe	L(max)
-	addq	$VEC_SIZE, %rax
-	addq	%rdi, %rax
-	subq	%rdx, %rax
-#  ifdef USE_AS_WCSLEN
-	shrq	$2, %rax
+	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+	sarq	$2, %rdi
 #  endif
+	leaq	(CHAR_PER_VEC)(%rdi, %rax), %rax
 	ret
 
 	.p2align 4
-L(first_vec_x2_check):
+L(last_vec_x2):
 	tzcntl	%eax, %eax
+	subq	%rdx, %rdi
 #  ifdef USE_AS_WCSLEN
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	sall	$2, %eax
-#  endif
-	/* Check the end of data.  */
-	cmpq	%rax, %rsi
-	jbe	L(max)
-	addq	$(VEC_SIZE * 2), %rax
-	addq	%rdi, %rax
-	subq	%rdx, %rax
-#  ifdef USE_AS_WCSLEN
-	shrq	$2, %rax
+	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+	sarq	$2, %rdi
 #  endif
+	leaq	(CHAR_PER_VEC * 2)(%rdi, %rax), %rax
 	ret
 
 	.p2align 4
-L(first_vec_x3_check):
+L(last_vec_x3):
 	tzcntl	%eax, %eax
-#  ifdef USE_AS_WCSLEN
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	sall	$2, %eax
-#  endif
+	subl	$(CHAR_PER_VEC * 2), %esi
 	/* Check the end of data.  */
-	cmpq	%rax, %rsi
-	jbe	L(max)
-	addq	$(VEC_SIZE * 3), %rax
-	addq	%rdi, %rax
-	subq	%rdx, %rax
+	cmpl	%eax, %esi
+	jb	L(max_end)
+	subq	%rdx, %rdi
 #  ifdef USE_AS_WCSLEN
-	shrq	$2, %rax
+	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+	sarq	$2, %rdi
 #  endif
+	leaq	(CHAR_PER_VEC * 3)(%rdi, %rax), %rax
 	ret
-
-	.p2align 4
-L(max):
+L(max_end):
 	movq	%r8, %rax
-#  ifdef USE_AS_WCSLEN
-	shrq	$2, %rax
-#  endif
-	ret
-
-	.p2align 4
-L(zero):
-	xorl	%eax, %eax
 	ret
 # endif
 
+	/* Cold case for crossing page with first load.	 */
 	.p2align 4
-L(first_vec_x0):
-	tzcntl	%eax, %eax
-# ifdef USE_AS_WCSLEN
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	sall	$2, %eax
-# endif
-	addq	%rdi, %rax
-	subq	%rdx, %rax
+L(cross_page_boundary):
+	movq	%rdi, %rdx
+	/* Align data to VEC_SIZE.  */
+	andq	$-VEC_SIZE, %rdi
+	VPCMP	$0, (%rdi), %YMMZERO, %k0
+	kmovd	%k0, %eax
+	/* Remove the leading bytes.  */
 # ifdef USE_AS_WCSLEN
-	shrq	$2, %rax
+	/* NB: Divide shift count by 4 since each bit in K0 represents 4
+	   bytes.  */
+	movl	%edx, %ecx
+	shrl	$2, %ecx
+	andl	$(CHAR_PER_VEC - 1), %ecx
 # endif
-	ret
-
-	.p2align 4
-L(first_vec_x1):
+	/* SHIFT_REG is ecx for USE_AS_WCSLEN and edx otherwise.  */
+	sarxl	%SHIFT_REG, %eax, %eax
+	testl	%eax, %eax
+# ifndef USE_AS_STRNLEN
+	jz	L(cross_page_continue)
 	tzcntl	%eax, %eax
-# ifdef USE_AS_WCSLEN
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	sall	$2, %eax
-# endif
-	addq	$VEC_SIZE, %rax
-	addq	%rdi, %rax
-	subq	%rdx, %rax
-# ifdef USE_AS_WCSLEN
-	shrq	$2, %rax
-# endif
 	ret
-
-	.p2align 4
-L(first_vec_x2):
-	tzcntl	%eax, %eax
-# ifdef USE_AS_WCSLEN
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	sall	$2, %eax
-# endif
-	addq	$(VEC_SIZE * 2), %rax
-	addq	%rdi, %rax
-	subq	%rdx, %rax
-# ifdef USE_AS_WCSLEN
-	shrq	$2, %rax
-# endif
+# else
+	jnz	L(cross_page_less_vec)
+#  ifndef USE_AS_WCSLEN
+	movl	%edx, %ecx
+	andl	$(CHAR_PER_VEC - 1), %ecx
+#  endif
+	movl	$CHAR_PER_VEC, %eax
+	subl	%ecx, %eax
+	/* Check the end of data.  */
+	cmpq	%rax, %rsi
+	ja	L(cross_page_continue)
+	movl	%esi, %eax
 	ret
-
-	.p2align 4
-L(4x_vec_end):
-	VPCMP	$0, %YMM1, %YMMZERO, %k0
-	kmovd	%k0, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x0)
-	VPCMP	$0, %YMM2, %YMMZERO, %k1
-	kmovd	%k1, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x1)
-	VPCMP	$0, %YMM3, %YMMZERO, %k2
-	kmovd	%k2, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x2)
-	VPCMP	$0, %YMM4, %YMMZERO, %k3
-	kmovd	%k3, %eax
-L(first_vec_x3):
+L(cross_page_less_vec):
 	tzcntl	%eax, %eax
-# ifdef USE_AS_WCSLEN
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
-	sall	$2, %eax
-# endif
-	addq	$(VEC_SIZE * 3), %rax
-	addq	%rdi, %rax
-	subq	%rdx, %rax
-# ifdef USE_AS_WCSLEN
-	shrq	$2, %rax
-# endif
+	/* Select min of length and position of first null.  */
+	cmpq	%rax, %rsi
+	cmovb	%esi, %eax
 	ret
+# endif
 
 END (STRLEN)
 #endif
-- 
2.29.2



* [PATCH v5 2/2] x86: Optimize strlen-avx2.S
  2021-04-19 23:36 [PATCH v5 1/2] x86: Optimize strlen-evex.S Noah Goldstein
@ 2021-04-19 23:36 ` Noah Goldstein
  2021-04-20  1:01   ` H.J. Lu
  2022-09-25  8:19   ` Aurelien Jarno
  2021-04-20  1:01 ` [PATCH v5 1/2] x86: Optimize strlen-evex.S H.J. Lu
  1 sibling, 2 replies; 24+ messages in thread
From: Noah Goldstein @ 2021-04-19 23:36 UTC (permalink / raw)
  To: libc-alpha

No bug. This commit optimizes strlen-avx2.S. The optimizations are
mostly small things but they add up to roughly 10-30% performance
improvement for strlen. The results for strnlen are a bit more
ambiguous. test-strlen, test-strnlen, test-wcslen, and test-wcsnlen
are all passing.
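
For reviewers, a minimal C model (illustrative only, compile with
-mavx2) of the null-folding the 4x loop relies on: the unsigned byte
minimum of two vectors contains a zero byte iff at least one of the
inputs does, so a single VPCMPEQ/VPMOVMSKB can test two loads at once.
The patch additionally fuses VPMINU with a memory operand to save code
size; the helper name below is hypothetical:

#include <immintrin.h>
#include <stdio.h>
#include <string.h>

/* Nonzero if any of the 64 bytes starting at P is a null byte.  */
static int
has_null_in_2x32 (const char *p)
{
  __m256i v0 = _mm256_loadu_si256 ((const __m256i *) p);
  __m256i v1 = _mm256_loadu_si256 ((const __m256i *) (p + 32));
  /* vpminub: a zero in the minimum implies a zero in v0 or v1.  */
  __m256i min = _mm256_min_epu8 (v0, v1);
  __m256i eq = _mm256_cmpeq_epi8 (min, _mm256_setzero_si256 ());
  return _mm256_movemask_epi8 (eq) != 0;
}

int
main (void)
{
  char buf[129];
  memset (buf, 'a', 128);
  buf[128] = '\0';
  buf[70] = '\0';
  printf ("%d %d\n", has_null_in_2x32 (buf), has_null_in_2x32 (buf + 64));
  return 0;
}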

Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
---
 sysdeps/x86_64/multiarch/ifunc-impl-list.c |  16 +-
 sysdeps/x86_64/multiarch/strlen-avx2.S     | 532 +++++++++++++--------
 2 files changed, 334 insertions(+), 214 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index c377cab629..651b32908e 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -293,10 +293,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/x86_64/multiarch/strlen.c.  */
   IFUNC_IMPL (i, name, strlen,
 	      IFUNC_IMPL_ADD (array, i, strlen,
-			      CPU_FEATURE_USABLE (AVX2),
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)),
 			      __strlen_avx2)
 	      IFUNC_IMPL_ADD (array, i, strlen,
 			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)
 			       && CPU_FEATURE_USABLE (RTM)),
 			      __strlen_avx2_rtm)
 	      IFUNC_IMPL_ADD (array, i, strlen,
@@ -309,10 +311,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/x86_64/multiarch/strnlen.c.  */
   IFUNC_IMPL (i, name, strnlen,
 	      IFUNC_IMPL_ADD (array, i, strnlen,
-			      CPU_FEATURE_USABLE (AVX2),
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)),
 			      __strnlen_avx2)
 	      IFUNC_IMPL_ADD (array, i, strnlen,
 			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)
 			       && CPU_FEATURE_USABLE (RTM)),
 			      __strnlen_avx2_rtm)
 	      IFUNC_IMPL_ADD (array, i, strnlen,
@@ -654,10 +658,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/x86_64/multiarch/wcslen.c.  */
   IFUNC_IMPL (i, name, wcslen,
 	      IFUNC_IMPL_ADD (array, i, wcslen,
-			      CPU_FEATURE_USABLE (AVX2),
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)),
 			      __wcslen_avx2)
 	      IFUNC_IMPL_ADD (array, i, wcslen,
 			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)
 			       && CPU_FEATURE_USABLE (RTM)),
 			      __wcslen_avx2_rtm)
 	      IFUNC_IMPL_ADD (array, i, wcslen,
@@ -670,10 +676,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/x86_64/multiarch/wcsnlen.c.  */
   IFUNC_IMPL (i, name, wcsnlen,
 	      IFUNC_IMPL_ADD (array, i, wcsnlen,
-			      CPU_FEATURE_USABLE (AVX2),
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)),
 			      __wcsnlen_avx2)
 	      IFUNC_IMPL_ADD (array, i, wcsnlen,
 			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)
 			       && CPU_FEATURE_USABLE (RTM)),
 			      __wcsnlen_avx2_rtm)
 	      IFUNC_IMPL_ADD (array, i, wcsnlen,
diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S
index 1caae9e6bc..bd2e6ee44a 100644
--- a/sysdeps/x86_64/multiarch/strlen-avx2.S
+++ b/sysdeps/x86_64/multiarch/strlen-avx2.S
@@ -27,9 +27,11 @@
 # ifdef USE_AS_WCSLEN
 #  define VPCMPEQ	vpcmpeqd
 #  define VPMINU	vpminud
+#  define CHAR_SIZE	4
 # else
 #  define VPCMPEQ	vpcmpeqb
 #  define VPMINU	vpminub
+#  define CHAR_SIZE	1
 # endif
 
 # ifndef VZEROUPPER
@@ -41,349 +43,459 @@
 # endif
 
 # define VEC_SIZE 32
+# define PAGE_SIZE 4096
 
 	.section SECTION(.text),"ax",@progbits
 ENTRY (STRLEN)
 # ifdef USE_AS_STRNLEN
-	/* Check for zero length.  */
+	/* Check zero length.  */
 	test	%RSI_LP, %RSI_LP
 	jz	L(zero)
+	/* Store max len in R8_LP before adjusting if using WCSLEN.  */
+	mov	%RSI_LP, %R8_LP
 #  ifdef USE_AS_WCSLEN
 	shl	$2, %RSI_LP
 #  elif defined __ILP32__
 	/* Clear the upper 32 bits.  */
 	movl	%esi, %esi
 #  endif
-	mov	%RSI_LP, %R8_LP
 # endif
-	movl	%edi, %ecx
+	movl	%edi, %eax
 	movq	%rdi, %rdx
 	vpxor	%xmm0, %xmm0, %xmm0
-
+	/* Clear high bits from edi. Only keeping bits relevant to page
+	   cross check.  */
+	andl	$(PAGE_SIZE - 1), %eax
 	/* Check if we may cross page boundary with one vector load.  */
-	andl	$(2 * VEC_SIZE - 1), %ecx
-	cmpl	$VEC_SIZE, %ecx
-	ja	L(cros_page_boundary)
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	ja	L(cross_page_boundary)
 
 	/* Check the first VEC_SIZE bytes.  */
-	VPCMPEQ (%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-
+	VPCMPEQ	(%rdi), %ymm0, %ymm1
+	vpmovmskb	%ymm1, %eax
 # ifdef USE_AS_STRNLEN
-	jnz	L(first_vec_x0_check)
-	/* Adjust length and check the end of data.  */
-	subq	$VEC_SIZE, %rsi
-	jbe	L(max)
-# else
-	jnz	L(first_vec_x0)
+	/* If length < VEC_SIZE handle special.  */
+	cmpq	$VEC_SIZE, %rsi
+	jbe	L(first_vec_x0)
 # endif
-
-	/* Align data for aligned loads in the loop.  */
-	addq	$VEC_SIZE, %rdi
-	andl	$(VEC_SIZE - 1), %ecx
-	andq	$-VEC_SIZE, %rdi
+	/* If empty continue to aligned_more. Otherwise return bit
+	   position of first match.  */
+	testl	%eax, %eax
+	jz	L(aligned_more)
+	tzcntl	%eax, %eax
+# ifdef USE_AS_WCSLEN
+	shrl	$2, %eax
+# endif
+	VZEROUPPER_RETURN
 
 # ifdef USE_AS_STRNLEN
-	/* Adjust length.  */
-	addq	%rcx, %rsi
+L(zero):
+	xorl	%eax, %eax
+	ret
 
-	subq	$(VEC_SIZE * 4), %rsi
-	jbe	L(last_4x_vec_or_less)
+	.p2align 4
+L(first_vec_x0):
+	/* Set bit for max len so that tzcnt will return min of max len
+	   and position of first match.  */
+	btsq	%rsi, %rax
+	tzcntl	%eax, %eax
+#  ifdef USE_AS_WCSLEN
+	shrl	$2, %eax
+#  endif
+	VZEROUPPER_RETURN
 # endif
-	jmp	L(more_4x_vec)
 
 	.p2align 4
-L(cros_page_boundary):
-	andl	$(VEC_SIZE - 1), %ecx
-	andq	$-VEC_SIZE, %rdi
-	VPCMPEQ (%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
-	/* Remove the leading bytes.  */
-	sarl	%cl, %eax
-	testl	%eax, %eax
-	jz	L(aligned_more)
+L(first_vec_x1):
 	tzcntl	%eax, %eax
+	/* Safe to use 32 bit instructions as these are only called for
+	   size = [1, 159].  */
 # ifdef USE_AS_STRNLEN
-	/* Check the end of data.  */
-	cmpq	%rax, %rsi
-	jbe	L(max)
+	/* Use ecx which was computed earlier to compute correct value.
+	 */
+	subl	$(VEC_SIZE * 4 + 1), %ecx
+	addl	%ecx, %eax
+# else
+	subl	%edx, %edi
+	incl	%edi
+	addl	%edi, %eax
 # endif
-	addq	%rdi, %rax
-	addq	%rcx, %rax
-	subq	%rdx, %rax
 # ifdef USE_AS_WCSLEN
-	shrq	$2, %rax
+	shrl	$2, %eax
 # endif
-L(return_vzeroupper):
-	ZERO_UPPER_VEC_REGISTERS_RETURN
+	VZEROUPPER_RETURN
 
 	.p2align 4
-L(aligned_more):
+L(first_vec_x2):
+	tzcntl	%eax, %eax
+	/* Safe to use 32 bit instructions as these are only called for
+	   size = [1, 159].  */
 # ifdef USE_AS_STRNLEN
-        /* "rcx" is less than VEC_SIZE.  Calculate "rdx + rcx - VEC_SIZE"
-	    with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE"
-	    to void possible addition overflow.  */
-	negq	%rcx
-	addq	$VEC_SIZE, %rcx
-
-	/* Check the end of data.  */
-	subq	%rcx, %rsi
-	jbe	L(max)
+	/* Use ecx which was computed earlier to compute correct value.
+	 */
+	subl	$(VEC_SIZE * 3 + 1), %ecx
+	addl	%ecx, %eax
+# else
+	subl	%edx, %edi
+	addl	$(VEC_SIZE + 1), %edi
+	addl	%edi, %eax
 # endif
+# ifdef USE_AS_WCSLEN
+	shrl	$2, %eax
+# endif
+	VZEROUPPER_RETURN
 
-	addq	$VEC_SIZE, %rdi
+	.p2align 4
+L(first_vec_x3):
+	tzcntl	%eax, %eax
+	/* Safe to use 32 bit instructions as these are only called for
+	   size = [1, 159].  */
+# ifdef USE_AS_STRNLEN
+	/* Use ecx which was computed earlier to compute correct value.
+	 */
+	subl	$(VEC_SIZE * 2 + 1), %ecx
+	addl	%ecx, %eax
+# else
+	subl	%edx, %edi
+	addl	$(VEC_SIZE * 2 + 1), %edi
+	addl	%edi, %eax
+# endif
+# ifdef USE_AS_WCSLEN
+	shrl	$2, %eax
+# endif
+	VZEROUPPER_RETURN
 
+	.p2align 4
+L(first_vec_x4):
+	tzcntl	%eax, %eax
+	/* Safe to use 32 bit instructions as these are only called for
+	   size = [1, 159].  */
 # ifdef USE_AS_STRNLEN
-	subq	$(VEC_SIZE * 4), %rsi
-	jbe	L(last_4x_vec_or_less)
+	/* Use ecx which was computed earlier to compute correct value.
+	 */
+	subl	$(VEC_SIZE + 1), %ecx
+	addl	%ecx, %eax
+# else
+	subl	%edx, %edi
+	addl	$(VEC_SIZE * 3 + 1), %edi
+	addl	%edi, %eax
 # endif
+# ifdef USE_AS_WCSLEN
+	shrl	$2, %eax
+# endif
+	VZEROUPPER_RETURN
 
-L(more_4x_vec):
+	.p2align 5
+L(aligned_more):
+	/* Align data to VEC_SIZE - 1. This is the same number of
+	   instructions as using andq with -VEC_SIZE but saves 4 bytes of
+	   code on the x4 check.  */
+	orq	$(VEC_SIZE - 1), %rdi
+L(cross_page_continue):
 	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
 	   since data is only aligned to VEC_SIZE.  */
-	VPCMPEQ (%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x0)
-
-	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
+# ifdef USE_AS_STRNLEN
+	/* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE because
+	   it simplifies the logic in last_4x_vec_or_less.  */
+	leaq	(VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx
+	subq	%rdx, %rcx
+# endif
+	/* Load first VEC regardless.  */
+	VPCMPEQ	1(%rdi), %ymm0, %ymm1
+# ifdef USE_AS_STRNLEN
+	/* Adjust length. If near end handle specially.  */
+	subq	%rcx, %rsi
+	jb	L(last_4x_vec_or_less)
+# endif
+	vpmovmskb	%ymm1, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x1)
 
-	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
+	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
+	vpmovmskb	%ymm1, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x2)
 
-	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
+	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
+	vpmovmskb	%ymm1, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x3)
 
-	addq	$(VEC_SIZE * 4), %rdi
-
-# ifdef USE_AS_STRNLEN
-	subq	$(VEC_SIZE * 4), %rsi
-	jbe	L(last_4x_vec_or_less)
-# endif
-
-	/* Align data to 4 * VEC_SIZE.  */
-	movq	%rdi, %rcx
-	andl	$(4 * VEC_SIZE - 1), %ecx
-	andq	$-(4 * VEC_SIZE), %rdi
+	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
+	vpmovmskb	%ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x4)
 
+	/* Align data to VEC_SIZE * 4 - 1.  */
 # ifdef USE_AS_STRNLEN
-	/* Adjust length.  */
+	/* Before adjusting length check if at last VEC_SIZE * 4.  */
+	cmpq	$(VEC_SIZE * 4 - 1), %rsi
+	jbe	L(last_4x_vec_or_less_load)
+	incq	%rdi
+	movl	%edi, %ecx
+	orq	$(VEC_SIZE * 4 - 1), %rdi
+	andl	$(VEC_SIZE * 4 - 1), %ecx
+	/* Readjust length.  */
 	addq	%rcx, %rsi
+# else
+	incq	%rdi
+	orq	$(VEC_SIZE * 4 - 1), %rdi
 # endif
-
+	/* Compare 4 * VEC at a time forward.  */
 	.p2align 4
 L(loop_4x_vec):
-	/* Compare 4 * VEC at a time forward.  */
-	vmovdqa (%rdi), %ymm1
-	vmovdqa	VEC_SIZE(%rdi), %ymm2
-	vmovdqa	(VEC_SIZE * 2)(%rdi), %ymm3
-	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm4
-	VPMINU	%ymm1, %ymm2, %ymm5
-	VPMINU	%ymm3, %ymm4, %ymm6
-	VPMINU	%ymm5, %ymm6, %ymm5
-
-	VPCMPEQ	%ymm5, %ymm0, %ymm5
-	vpmovmskb %ymm5, %eax
-	testl	%eax, %eax
-	jnz	L(4x_vec_end)
-
-	addq	$(VEC_SIZE * 4), %rdi
-
-# ifndef USE_AS_STRNLEN
-	jmp	L(loop_4x_vec)
-# else
+# ifdef USE_AS_STRNLEN
+	/* Break if at end of length.  */
 	subq	$(VEC_SIZE * 4), %rsi
-	ja	L(loop_4x_vec)
-
-L(last_4x_vec_or_less):
-	/* Less than 4 * VEC and aligned to VEC_SIZE.  */
-	addl	$(VEC_SIZE * 2), %esi
-	jle	L(last_2x_vec)
+	jb	L(last_4x_vec_or_less_cmpeq)
+# endif
+	/* Save some code size by microfusing VPMINU with the load. Since
+	   the matches in ymm2/ymm4 can only be returned if there were no
+	   matches in ymm1/ymm3 respectively there is no issue with overlap.
+	 */
+	vmovdqa	1(%rdi), %ymm1
+	VPMINU	(VEC_SIZE + 1)(%rdi), %ymm1, %ymm2
+	vmovdqa	(VEC_SIZE * 2 + 1)(%rdi), %ymm3
+	VPMINU	(VEC_SIZE * 3 + 1)(%rdi), %ymm3, %ymm4
+
+	VPMINU	%ymm2, %ymm4, %ymm5
+	VPCMPEQ	%ymm5, %ymm0, %ymm5
+	vpmovmskb	%ymm5, %ecx
 
-	VPCMPEQ (%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x0)
+	subq	$-(VEC_SIZE * 4), %rdi
+	testl	%ecx, %ecx
+	jz	L(loop_4x_vec)
 
-	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x1)
 
-	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
+	VPCMPEQ	%ymm1, %ymm0, %ymm1
+	vpmovmskb	%ymm1, %eax
+	subq	%rdx, %rdi
 	testl	%eax, %eax
+	jnz	L(last_vec_return_x0)
 
-	jnz	L(first_vec_x2_check)
-	subl	$VEC_SIZE, %esi
-	jle	L(max)
-
-	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
+	VPCMPEQ	%ymm2, %ymm0, %ymm2
+	vpmovmskb	%ymm2, %eax
 	testl	%eax, %eax
-
-	jnz	L(first_vec_x3_check)
-	movq	%r8, %rax
-#  ifdef USE_AS_WCSLEN
+	jnz	L(last_vec_return_x1)
+
+	/* Combine last 2 VEC.  */
+	VPCMPEQ	%ymm3, %ymm0, %ymm3
+	vpmovmskb	%ymm3, %eax
+	/* rcx has combined result from all 4 VEC. It will only be used if
+	   the first 3 other VEC all did not contain a match.  */
+	salq	$32, %rcx
+	orq	%rcx, %rax
+	tzcntq	%rax, %rax
+	subq	$(VEC_SIZE * 2 - 1), %rdi
+	addq	%rdi, %rax
+# ifdef USE_AS_WCSLEN
 	shrq	$2, %rax
-#  endif
+# endif
 	VZEROUPPER_RETURN
 
+
+# ifdef USE_AS_STRNLEN
 	.p2align 4
-L(last_2x_vec):
-	addl	$(VEC_SIZE * 2), %esi
-	VPCMPEQ (%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
+L(last_4x_vec_or_less_load):
+	/* Depending on entry adjust rdi / prepare first VEC in ymm1.  */
+	subq	$-(VEC_SIZE * 4), %rdi
+L(last_4x_vec_or_less_cmpeq):
+	VPCMPEQ	1(%rdi), %ymm0, %ymm1
+L(last_4x_vec_or_less):
 
-	jnz	L(first_vec_x0_check)
-	subl	$VEC_SIZE, %esi
-	jle	L(max)
+	vpmovmskb	%ymm1, %eax
+	/* If remaining length > VEC_SIZE * 2. This works if esi is off by
+	   VEC_SIZE * 4.  */
+	testl	$(VEC_SIZE * 2), %esi
+	jnz	L(last_4x_vec)
 
-	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
+	/* length may have been negative or positive by an offset of
+	   VEC_SIZE * 4 depending on where this was called from. This fixes
+	   that.  */
+	andl	$(VEC_SIZE * 4 - 1), %esi
 	testl	%eax, %eax
-	jnz	L(first_vec_x1_check)
-	movq	%r8, %rax
-#  ifdef USE_AS_WCSLEN
-	shrq	$2, %rax
-#  endif
-	VZEROUPPER_RETURN
+	jnz	L(last_vec_x1_check)
 
-	.p2align 4
-L(first_vec_x0_check):
+	subl	$VEC_SIZE, %esi
+	jb	L(max)
+
+	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
+	vpmovmskb	%ymm1, %eax
 	tzcntl	%eax, %eax
 	/* Check the end of data.  */
-	cmpq	%rax, %rsi
-	jbe	L(max)
+	cmpl	%eax, %esi
+	jb	L(max)
+	subq	%rdx, %rdi
+	addl	$(VEC_SIZE + 1), %eax
 	addq	%rdi, %rax
-	subq	%rdx, %rax
 #  ifdef USE_AS_WCSLEN
 	shrq	$2, %rax
 #  endif
 	VZEROUPPER_RETURN
+# endif
 
 	.p2align 4
-L(first_vec_x1_check):
+L(last_vec_return_x0):
 	tzcntl	%eax, %eax
-	/* Check the end of data.  */
-	cmpq	%rax, %rsi
-	jbe	L(max)
-	addq	$VEC_SIZE, %rax
+	subq	$(VEC_SIZE * 4 - 1), %rdi
 	addq	%rdi, %rax
-	subq	%rdx, %rax
-#  ifdef USE_AS_WCSLEN
+# ifdef USE_AS_WCSLEN
 	shrq	$2, %rax
-#  endif
+# endif
 	VZEROUPPER_RETURN
 
 	.p2align 4
-L(first_vec_x2_check):
+L(last_vec_return_x1):
 	tzcntl	%eax, %eax
-	/* Check the end of data.  */
-	cmpq	%rax, %rsi
-	jbe	L(max)
-	addq	$(VEC_SIZE * 2), %rax
+	subq	$(VEC_SIZE * 3 - 1), %rdi
 	addq	%rdi, %rax
-	subq	%rdx, %rax
-#  ifdef USE_AS_WCSLEN
+# ifdef USE_AS_WCSLEN
 	shrq	$2, %rax
-#  endif
+# endif
 	VZEROUPPER_RETURN
 
+# ifdef USE_AS_STRNLEN
 	.p2align 4
-L(first_vec_x3_check):
+L(last_vec_x1_check):
+
 	tzcntl	%eax, %eax
 	/* Check the end of data.  */
-	cmpq	%rax, %rsi
-	jbe	L(max)
-	addq	$(VEC_SIZE * 3), %rax
+	cmpl	%eax, %esi
+	jb	L(max)
+	subq	%rdx, %rdi
+	incl	%eax
 	addq	%rdi, %rax
-	subq	%rdx, %rax
 #  ifdef USE_AS_WCSLEN
 	shrq	$2, %rax
 #  endif
 	VZEROUPPER_RETURN
 
-	.p2align 4
 L(max):
 	movq	%r8, %rax
+	VZEROUPPER_RETURN
+
+	.p2align 4
+L(last_4x_vec):
+	/* Test first 2x VEC normally.  */
+	testl	%eax, %eax
+	jnz	L(last_vec_x1)
+
+	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
+	vpmovmskb	%ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x2)
+
+	/* Normalize length.  */
+	andl	$(VEC_SIZE * 4 - 1), %esi
+	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
+	vpmovmskb	%ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x3)
+
+	subl	$(VEC_SIZE * 3), %esi
+	jb	L(max)
+
+	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
+	vpmovmskb	%ymm1, %eax
+	tzcntl	%eax, %eax
+	/* Check the end of data.  */
+	cmpl	%eax, %esi
+	jb	L(max)
+	subq	%rdx, %rdi
+	addl	$(VEC_SIZE * 3 + 1), %eax
+	addq	%rdi, %rax
 #  ifdef USE_AS_WCSLEN
 	shrq	$2, %rax
 #  endif
 	VZEROUPPER_RETURN
 
-	.p2align 4
-L(zero):
-	xorl	%eax, %eax
-	ret
-# endif
 
 	.p2align 4
-L(first_vec_x0):
+L(last_vec_x1):
+	/* Essentially a duplicate of first_vec_x1 but using 64 bit
+	   instructions.  */
 	tzcntl	%eax, %eax
+	subq	%rdx, %rdi
+	incl	%eax
 	addq	%rdi, %rax
-	subq	%rdx, %rax
-# ifdef USE_AS_WCSLEN
+#  ifdef USE_AS_WCSLEN
 	shrq	$2, %rax
-# endif
+#  endif
 	VZEROUPPER_RETURN
 
 	.p2align 4
-L(first_vec_x1):
+L(last_vec_x2):
+	/* Essentially a duplicate of first_vec_x1 but using 64 bit
+	   instructions.  */
 	tzcntl	%eax, %eax
-	addq	$VEC_SIZE, %rax
+	subq	%rdx, %rdi
+	addl	$(VEC_SIZE + 1), %eax
 	addq	%rdi, %rax
-	subq	%rdx, %rax
-# ifdef USE_AS_WCSLEN
+#  ifdef USE_AS_WCSLEN
 	shrq	$2, %rax
-# endif
+#  endif
 	VZEROUPPER_RETURN
 
 	.p2align 4
-L(first_vec_x2):
+L(last_vec_x3):
 	tzcntl	%eax, %eax
-	addq	$(VEC_SIZE * 2), %rax
+	subl	$(VEC_SIZE * 2), %esi
+	/* Check the end of data.  */
+	cmpl	%eax, %esi
+	jb	L(max_end)
+	subq	%rdx, %rdi
+	addl	$(VEC_SIZE * 2 + 1), %eax
 	addq	%rdi, %rax
-	subq	%rdx, %rax
-# ifdef USE_AS_WCSLEN
+#  ifdef USE_AS_WCSLEN
 	shrq	$2, %rax
-# endif
+#  endif
+	VZEROUPPER_RETURN
+L(max_end):
+	movq	%r8, %rax
 	VZEROUPPER_RETURN
+# endif
 
+	/* Cold case for crossing page with first load.	 */
 	.p2align 4
-L(4x_vec_end):
-	VPCMPEQ	%ymm1, %ymm0, %ymm1
-	vpmovmskb %ymm1, %eax
-	testl	%eax, %eax
-	jnz	L(first_vec_x0)
-	VPCMPEQ %ymm2, %ymm0, %ymm2
-	vpmovmskb %ymm2, %eax
+L(cross_page_boundary):
+	/* Align data to VEC_SIZE - 1.  */
+	orq	$(VEC_SIZE - 1), %rdi
+	VPCMPEQ	-(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1
+	vpmovmskb	%ymm1, %eax
+	/* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
+	   so no need to manually mod rdx.  */
+	sarxl	%edx, %eax, %eax
+# ifdef USE_AS_STRNLEN
 	testl	%eax, %eax
-	jnz	L(first_vec_x1)
-	VPCMPEQ %ymm3, %ymm0, %ymm3
-	vpmovmskb %ymm3, %eax
+	jnz	L(cross_page_less_vec)
+	leaq	1(%rdi), %rcx
+	subq	%rdx, %rcx
+	/* Check length.  */
+	cmpq	%rsi, %rcx
+	jb	L(cross_page_continue)
+	movq	%r8, %rax
+# else
 	testl	%eax, %eax
-	jnz	L(first_vec_x2)
-	VPCMPEQ %ymm4, %ymm0, %ymm4
-	vpmovmskb %ymm4, %eax
-L(first_vec_x3):
+	jz	L(cross_page_continue)
 	tzcntl	%eax, %eax
-	addq	$(VEC_SIZE * 3), %rax
-	addq	%rdi, %rax
-	subq	%rdx, %rax
-# ifdef USE_AS_WCSLEN
-	shrq	$2, %rax
+#  ifdef USE_AS_WCSLEN
+	shrl	$2, %eax
+#  endif
 # endif
+L(return_vzeroupper):
+	ZERO_UPPER_VEC_REGISTERS_RETURN
+
+# ifdef USE_AS_STRNLEN
+	.p2align 4
+L(cross_page_less_vec):
+	tzcntl	%eax, %eax
+	cmpq	%rax, %rsi
+	cmovb	%esi, %eax
+#  ifdef USE_AS_WCSLEN
+	shrl	$2, %eax
+#  endif
 	VZEROUPPER_RETURN
+# endif
 
 END (STRLEN)
 #endif
-- 
2.29.2



* Re: [PATCH v5 1/2] x86: Optimize strlen-evex.S
  2021-04-19 23:36 [PATCH v5 1/2] x86: Optimize strlen-evex.S Noah Goldstein
  2021-04-19 23:36 ` [PATCH v5 2/2] x86: Optimize strlen-avx2.S Noah Goldstein
@ 2021-04-20  1:01 ` H.J. Lu
  1 sibling, 0 replies; 24+ messages in thread
From: H.J. Lu @ 2021-04-20  1:01 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: libc-alpha, carlos

On Mon, Apr 19, 2021 at 4:36 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> [...]
> -       subq    %rdx, %rax
> -# ifdef USE_AS_WCSLEN
> -       shrq    $2, %rax
> -# endif
> +       /* Select min of length and position of first null.  */
> +       cmpq    %rax, %rsi
> +       cmovb   %esi, %eax
>         ret
> +# endif
>
>  END (STRLEN)
>  #endif
> --
> 2.29.2
>
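
As a reading aid, the core EVEX idiom the quoted patch is built around -
comparing a vector against zero into a mask register, then tzcnt on the
mask - can be sketched in C intrinsics roughly as below.  This is a
minimal illustration assuming AVX-512BW/VL and BMI support, not the
actual glibc implementation, and the function name is made up.

    #include <immintrin.h>
    #include <stddef.h>

    /* One scan step: VPCMPB against zero yields a 32-bit mask in a
       k-register; tzcnt of that mask is the index of the first null
       byte in the 32-byte block (32 if there is none).  Alignment,
       page-cross and strnlen length handling are omitted.  */
    static size_t
    first_null_in_vec (const char *p)
    {
      __m256i zero = _mm256_setzero_si256 ();
      __m256i v = _mm256_loadu_si256 ((const __m256i *) p);
      __mmask32 k = _mm256_cmpeq_epi8_mask (v, zero);
      return k ? (size_t) _tzcnt_u32 ((unsigned int) k) : 32;
    }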

LGTM.  I am checking it in for you.

Thanks.

-- 
H.J.

* Re: [PATCH v5 2/2] x86: Optimize strlen-avx2.S
  2021-04-19 23:36 ` [PATCH v5 2/2] x86: Optimize strlen-avx2.S Noah Goldstein
@ 2021-04-20  1:01   ` H.J. Lu
  2022-09-25  8:19   ` Aurelien Jarno
  1 sibling, 0 replies; 24+ messages in thread
From: H.J. Lu @ 2021-04-20  1:01 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: libc-alpha, carlos

On Mon, Apr 19, 2021 at 4:36 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> No bug. This commit optimizes strlen-avx2.S. The optimizations are
> mostly small things but they add up to roughly 10-30% performance
> improvement for strlen. The results for strnlen are a bit more
> ambiguous. test-strlen, test-strnlen, test-wcslen, and test-wcsnlen
> are all passing.
>
> Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
> ---
>  sysdeps/x86_64/multiarch/ifunc-impl-list.c |  16 +-
>  sysdeps/x86_64/multiarch/strlen-avx2.S     | 532 +++++++++++++--------
>  2 files changed, 334 insertions(+), 214 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index c377cab629..651b32908e 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -293,10 +293,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>    /* Support sysdeps/x86_64/multiarch/strlen.c.  */
>    IFUNC_IMPL (i, name, strlen,
>               IFUNC_IMPL_ADD (array, i, strlen,
> -                             CPU_FEATURE_USABLE (AVX2),
> +                             (CPU_FEATURE_USABLE (AVX2)
> +                              && CPU_FEATURE_USABLE (BMI2)),
>                               __strlen_avx2)
>               IFUNC_IMPL_ADD (array, i, strlen,
>                               (CPU_FEATURE_USABLE (AVX2)
> +                              && CPU_FEATURE_USABLE (BMI2)
>                                && CPU_FEATURE_USABLE (RTM)),
>                               __strlen_avx2_rtm)
>               IFUNC_IMPL_ADD (array, i, strlen,
> @@ -309,10 +311,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>    /* Support sysdeps/x86_64/multiarch/strnlen.c.  */
>    IFUNC_IMPL (i, name, strnlen,
>               IFUNC_IMPL_ADD (array, i, strnlen,
> -                             CPU_FEATURE_USABLE (AVX2),
> +                             (CPU_FEATURE_USABLE (AVX2)
> +                              && CPU_FEATURE_USABLE (BMI2)),
>                               __strnlen_avx2)
>               IFUNC_IMPL_ADD (array, i, strnlen,
>                               (CPU_FEATURE_USABLE (AVX2)
> +                              && CPU_FEATURE_USABLE (BMI2)
>                                && CPU_FEATURE_USABLE (RTM)),
>                               __strnlen_avx2_rtm)
>               IFUNC_IMPL_ADD (array, i, strnlen,
> @@ -654,10 +658,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>    /* Support sysdeps/x86_64/multiarch/wcslen.c.  */
>    IFUNC_IMPL (i, name, wcslen,
>               IFUNC_IMPL_ADD (array, i, wcslen,
> -                             CPU_FEATURE_USABLE (AVX2),
> +                             (CPU_FEATURE_USABLE (AVX2)
> +                              && CPU_FEATURE_USABLE (BMI2)),
>                               __wcslen_avx2)
>               IFUNC_IMPL_ADD (array, i, wcslen,
>                               (CPU_FEATURE_USABLE (AVX2)
> +                              && CPU_FEATURE_USABLE (BMI2)
>                                && CPU_FEATURE_USABLE (RTM)),
>                               __wcslen_avx2_rtm)
>               IFUNC_IMPL_ADD (array, i, wcslen,
> @@ -670,10 +676,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>    /* Support sysdeps/x86_64/multiarch/wcsnlen.c.  */
>    IFUNC_IMPL (i, name, wcsnlen,
>               IFUNC_IMPL_ADD (array, i, wcsnlen,
> -                             CPU_FEATURE_USABLE (AVX2),
> +                             (CPU_FEATURE_USABLE (AVX2)
> +                              && CPU_FEATURE_USABLE (BMI2)),
>                               __wcsnlen_avx2)
>               IFUNC_IMPL_ADD (array, i, wcsnlen,
>                               (CPU_FEATURE_USABLE (AVX2)
> +                              && CPU_FEATURE_USABLE (BMI2)
>                                && CPU_FEATURE_USABLE (RTM)),
>                               __wcsnlen_avx2_rtm)
>               IFUNC_IMPL_ADD (array, i, wcsnlen,
> diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S
> index 1caae9e6bc..bd2e6ee44a 100644
> --- a/sysdeps/x86_64/multiarch/strlen-avx2.S
> +++ b/sysdeps/x86_64/multiarch/strlen-avx2.S
> @@ -27,9 +27,11 @@
>  # ifdef USE_AS_WCSLEN
>  #  define VPCMPEQ      vpcmpeqd
>  #  define VPMINU       vpminud
> +#  define CHAR_SIZE    4
>  # else
>  #  define VPCMPEQ      vpcmpeqb
>  #  define VPMINU       vpminub
> +#  define CHAR_SIZE    1
>  # endif
>
>  # ifndef VZEROUPPER
> @@ -41,349 +43,459 @@
>  # endif
>
>  # define VEC_SIZE 32
> +# define PAGE_SIZE 4096
>
>         .section SECTION(.text),"ax",@progbits
>  ENTRY (STRLEN)
>  # ifdef USE_AS_STRNLEN
> -       /* Check for zero length.  */
> +       /* Check zero length.  */
>         test    %RSI_LP, %RSI_LP
>         jz      L(zero)
> +       /* Store max len in R8_LP before adjusting if using WCSLEN.  */
> +       mov     %RSI_LP, %R8_LP
>  #  ifdef USE_AS_WCSLEN
>         shl     $2, %RSI_LP
>  #  elif defined __ILP32__
>         /* Clear the upper 32 bits.  */
>         movl    %esi, %esi
>  #  endif
> -       mov     %RSI_LP, %R8_LP
>  # endif
> -       movl    %edi, %ecx
> +       movl    %edi, %eax
>         movq    %rdi, %rdx
>         vpxor   %xmm0, %xmm0, %xmm0
> -
> +       /* Clear high bits from edi. Only keeping bits relevant to page
> +          cross check.  */
> +       andl    $(PAGE_SIZE - 1), %eax
>         /* Check if we may cross page boundary with one vector load.  */
> -       andl    $(2 * VEC_SIZE - 1), %ecx
> -       cmpl    $VEC_SIZE, %ecx
> -       ja      L(cros_page_boundary)
> +       cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
> +       ja      L(cross_page_boundary)
>
>         /* Check the first VEC_SIZE bytes.  */
> -       VPCMPEQ (%rdi), %ymm0, %ymm1
> -       vpmovmskb %ymm1, %eax
> -       testl   %eax, %eax
> -
> +       VPCMPEQ (%rdi), %ymm0, %ymm1
> +       vpmovmskb       %ymm1, %eax
>  # ifdef USE_AS_STRNLEN
> -       jnz     L(first_vec_x0_check)
> -       /* Adjust length and check the end of data.  */
> -       subq    $VEC_SIZE, %rsi
> -       jbe     L(max)
> -# else
> -       jnz     L(first_vec_x0)
> +       /* If length < VEC_SIZE handle special.  */
> +       cmpq    $VEC_SIZE, %rsi
> +       jbe     L(first_vec_x0)
>  # endif
> -
> -       /* Align data for aligned loads in the loop.  */
> -       addq    $VEC_SIZE, %rdi
> -       andl    $(VEC_SIZE - 1), %ecx
> -       andq    $-VEC_SIZE, %rdi
> +       /* If empty continue to aligned_more. Otherwise return bit
> +          position of first match.  */
> +       testl   %eax, %eax
> +       jz      L(aligned_more)
> +       tzcntl  %eax, %eax
> +# ifdef USE_AS_WCSLEN
> +       shrl    $2, %eax
> +# endif
> +       VZEROUPPER_RETURN
>
>  # ifdef USE_AS_STRNLEN
> -       /* Adjust length.  */
> -       addq    %rcx, %rsi
> +L(zero):
> +       xorl    %eax, %eax
> +       ret
>
> -       subq    $(VEC_SIZE * 4), %rsi
> -       jbe     L(last_4x_vec_or_less)
> +       .p2align 4
> +L(first_vec_x0):
> +       /* Set bit for max len so that tzcnt will return min of max len
> +          and position of first match.  */
> +       btsq    %rsi, %rax
> +       tzcntl  %eax, %eax
> +#  ifdef USE_AS_WCSLEN
> +       shrl    $2, %eax
> +#  endif
> +       VZEROUPPER_RETURN
>  # endif
> -       jmp     L(more_4x_vec)
>
>         .p2align 4
> -L(cros_page_boundary):
> -       andl    $(VEC_SIZE - 1), %ecx
> -       andq    $-VEC_SIZE, %rdi
> -       VPCMPEQ (%rdi), %ymm0, %ymm1
> -       vpmovmskb %ymm1, %eax
> -       /* Remove the leading bytes.  */
> -       sarl    %cl, %eax
> -       testl   %eax, %eax
> -       jz      L(aligned_more)
> +L(first_vec_x1):
>         tzcntl  %eax, %eax
> +       /* Safe to use 32 bit instructions as these are only called for
> +          size = [1, 159].  */
>  # ifdef USE_AS_STRNLEN
> -       /* Check the end of data.  */
> -       cmpq    %rax, %rsi
> -       jbe     L(max)
> +       /* Use ecx which was computed earlier to compute correct value.
> +        */
> +       subl    $(VEC_SIZE * 4 + 1), %ecx
> +       addl    %ecx, %eax
> +# else
> +       subl    %edx, %edi
> +       incl    %edi
> +       addl    %edi, %eax
>  # endif
> -       addq    %rdi, %rax
> -       addq    %rcx, %rax
> -       subq    %rdx, %rax
>  # ifdef USE_AS_WCSLEN
> -       shrq    $2, %rax
> +       shrl    $2, %eax
>  # endif
> -L(return_vzeroupper):
> -       ZERO_UPPER_VEC_REGISTERS_RETURN
> +       VZEROUPPER_RETURN
>
>         .p2align 4
> -L(aligned_more):
> +L(first_vec_x2):
> +       tzcntl  %eax, %eax
> +       /* Safe to use 32 bit instructions as these are only called for
> +          size = [1, 159].  */
>  # ifdef USE_AS_STRNLEN
> -        /* "rcx" is less than VEC_SIZE.  Calculate "rdx + rcx - VEC_SIZE"
> -           with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE"
> -           to void possible addition overflow.  */
> -       negq    %rcx
> -       addq    $VEC_SIZE, %rcx
> -
> -       /* Check the end of data.  */
> -       subq    %rcx, %rsi
> -       jbe     L(max)
> +       /* Use ecx which was computed earlier to compute correct value.
> +        */
> +       subl    $(VEC_SIZE * 3 + 1), %ecx
> +       addl    %ecx, %eax
> +# else
> +       subl    %edx, %edi
> +       addl    $(VEC_SIZE + 1), %edi
> +       addl    %edi, %eax
>  # endif
> +# ifdef USE_AS_WCSLEN
> +       shrl    $2, %eax
> +# endif
> +       VZEROUPPER_RETURN
>
> -       addq    $VEC_SIZE, %rdi
> +       .p2align 4
> +L(first_vec_x3):
> +       tzcntl  %eax, %eax
> +       /* Safe to use 32 bit instructions as these are only called for
> +          size = [1, 159].  */
> +# ifdef USE_AS_STRNLEN
> +       /* Use ecx which was computed earlier to compute correct value.
> +        */
> +       subl    $(VEC_SIZE * 2 + 1), %ecx
> +       addl    %ecx, %eax
> +# else
> +       subl    %edx, %edi
> +       addl    $(VEC_SIZE * 2 + 1), %edi
> +       addl    %edi, %eax
> +# endif
> +# ifdef USE_AS_WCSLEN
> +       shrl    $2, %eax
> +# endif
> +       VZEROUPPER_RETURN
>
> +       .p2align 4
> +L(first_vec_x4):
> +       tzcntl  %eax, %eax
> +       /* Safe to use 32 bit instructions as these are only called for
> +          size = [1, 159].  */
>  # ifdef USE_AS_STRNLEN
> -       subq    $(VEC_SIZE * 4), %rsi
> -       jbe     L(last_4x_vec_or_less)
> +       /* Use ecx which was computed earlier to compute correct value.
> +        */
> +       subl    $(VEC_SIZE + 1), %ecx
> +       addl    %ecx, %eax
> +# else
> +       subl    %edx, %edi
> +       addl    $(VEC_SIZE * 3 + 1), %edi
> +       addl    %edi, %eax
>  # endif
> +# ifdef USE_AS_WCSLEN
> +       shrl    $2, %eax
> +# endif
> +       VZEROUPPER_RETURN
>
> -L(more_4x_vec):
> +       .p2align 5
> +L(aligned_more):
> +       /* Align data to VEC_SIZE - 1. This is the same number of
> +          instructions as using andq with -VEC_SIZE but saves 4 bytes of
> +          code on the x4 check.  */
> +       orq     $(VEC_SIZE - 1), %rdi
> +L(cross_page_continue):
>         /* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
>            since data is only aligned to VEC_SIZE.  */
> -       VPCMPEQ (%rdi), %ymm0, %ymm1
> -       vpmovmskb %ymm1, %eax
> -       testl   %eax, %eax
> -       jnz     L(first_vec_x0)
> -
> -       VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
> -       vpmovmskb %ymm1, %eax
> +# ifdef USE_AS_STRNLEN
> +       /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE because
> +          it simplifies the logic in last_4x_vec_or_less.  */
> +       leaq    (VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx
> +       subq    %rdx, %rcx
> +# endif
> +       /* Load first VEC regardless.  */
> +       VPCMPEQ 1(%rdi), %ymm0, %ymm1
> +# ifdef USE_AS_STRNLEN
> +       /* Adjust length. If near end handle specially.  */
> +       subq    %rcx, %rsi
> +       jb      L(last_4x_vec_or_less)
> +# endif
> +       vpmovmskb       %ymm1, %eax
>         testl   %eax, %eax
>         jnz     L(first_vec_x1)
>
> -       VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
> -       vpmovmskb %ymm1, %eax
> +       VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> +       vpmovmskb       %ymm1, %eax
>         testl   %eax, %eax
>         jnz     L(first_vec_x2)
>
> -       VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
> -       vpmovmskb %ymm1, %eax
> +       VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
> +       vpmovmskb       %ymm1, %eax
>         testl   %eax, %eax
>         jnz     L(first_vec_x3)
>
> -       addq    $(VEC_SIZE * 4), %rdi
> -
> -# ifdef USE_AS_STRNLEN
> -       subq    $(VEC_SIZE * 4), %rsi
> -       jbe     L(last_4x_vec_or_less)
> -# endif
> -
> -       /* Align data to 4 * VEC_SIZE.  */
> -       movq    %rdi, %rcx
> -       andl    $(4 * VEC_SIZE - 1), %ecx
> -       andq    $-(4 * VEC_SIZE), %rdi
> +       VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
> +       vpmovmskb       %ymm1, %eax
> +       testl   %eax, %eax
> +       jnz     L(first_vec_x4)
>
> +       /* Align data to VEC_SIZE * 4 - 1.  */
>  # ifdef USE_AS_STRNLEN
> -       /* Adjust length.  */
> +       /* Before adjusting length check if at last VEC_SIZE * 4.  */
> +       cmpq    $(VEC_SIZE * 4 - 1), %rsi
> +       jbe     L(last_4x_vec_or_less_load)
> +       incq    %rdi
> +       movl    %edi, %ecx
> +       orq     $(VEC_SIZE * 4 - 1), %rdi
> +       andl    $(VEC_SIZE * 4 - 1), %ecx
> +       /* Readjust length.  */
>         addq    %rcx, %rsi
> +# else
> +       incq    %rdi
> +       orq     $(VEC_SIZE * 4 - 1), %rdi
>  # endif
> -
> +       /* Compare 4 * VEC at a time forward.  */
>         .p2align 4
>  L(loop_4x_vec):
> -       /* Compare 4 * VEC at a time forward.  */
> -       vmovdqa (%rdi), %ymm1
> -       vmovdqa VEC_SIZE(%rdi), %ymm2
> -       vmovdqa (VEC_SIZE * 2)(%rdi), %ymm3
> -       vmovdqa (VEC_SIZE * 3)(%rdi), %ymm4
> -       VPMINU  %ymm1, %ymm2, %ymm5
> -       VPMINU  %ymm3, %ymm4, %ymm6
> -       VPMINU  %ymm5, %ymm6, %ymm5
> -
> -       VPCMPEQ %ymm5, %ymm0, %ymm5
> -       vpmovmskb %ymm5, %eax
> -       testl   %eax, %eax
> -       jnz     L(4x_vec_end)
> -
> -       addq    $(VEC_SIZE * 4), %rdi
> -
> -# ifndef USE_AS_STRNLEN
> -       jmp     L(loop_4x_vec)
> -# else
> +# ifdef USE_AS_STRNLEN
> +       /* Break if at end of length.  */
>         subq    $(VEC_SIZE * 4), %rsi
> -       ja      L(loop_4x_vec)
> -
> -L(last_4x_vec_or_less):
> -       /* Less than 4 * VEC and aligned to VEC_SIZE.  */
> -       addl    $(VEC_SIZE * 2), %esi
> -       jle     L(last_2x_vec)
> +       jb      L(last_4x_vec_or_less_cmpeq)
> +# endif
> +       /* Save some code size by microfusing VPMINU with the load. Since
> +          the matches in ymm2/ymm4 can only be returned if there were no
> +          matches in ymm1/ymm3 respectively there is no issue with overlap.
> +        */
> +       vmovdqa 1(%rdi), %ymm1
> +       VPMINU  (VEC_SIZE + 1)(%rdi), %ymm1, %ymm2
> +       vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm3
> +       VPMINU  (VEC_SIZE * 3 + 1)(%rdi), %ymm3, %ymm4
> +
> +       VPMINU  %ymm2, %ymm4, %ymm5
> +       VPCMPEQ %ymm5, %ymm0, %ymm5
> +       vpmovmskb       %ymm5, %ecx
>
> -       VPCMPEQ (%rdi), %ymm0, %ymm1
> -       vpmovmskb %ymm1, %eax
> -       testl   %eax, %eax
> -       jnz     L(first_vec_x0)
> +       subq    $-(VEC_SIZE * 4), %rdi
> +       testl   %ecx, %ecx
> +       jz      L(loop_4x_vec)
>
> -       VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
> -       vpmovmskb %ymm1, %eax
> -       testl   %eax, %eax
> -       jnz     L(first_vec_x1)
>
> -       VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
> -       vpmovmskb %ymm1, %eax
> +       VPCMPEQ %ymm1, %ymm0, %ymm1
> +       vpmovmskb       %ymm1, %eax
> +       subq    %rdx, %rdi
>         testl   %eax, %eax
> +       jnz     L(last_vec_return_x0)
>
> -       jnz     L(first_vec_x2_check)
> -       subl    $VEC_SIZE, %esi
> -       jle     L(max)
> -
> -       VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
> -       vpmovmskb %ymm1, %eax
> +       VPCMPEQ %ymm2, %ymm0, %ymm2
> +       vpmovmskb       %ymm2, %eax
>         testl   %eax, %eax
> -
> -       jnz     L(first_vec_x3_check)
> -       movq    %r8, %rax
> -#  ifdef USE_AS_WCSLEN
> +       jnz     L(last_vec_return_x1)
> +
> +       /* Combine last 2 VEC.  */
> +       VPCMPEQ %ymm3, %ymm0, %ymm3
> +       vpmovmskb       %ymm3, %eax
> +       /* rcx has combined result from all 4 VEC. It will only be used if
> +          the first 3 other VEC all did not contain a match.  */
> +       salq    $32, %rcx
> +       orq     %rcx, %rax
> +       tzcntq  %rax, %rax
> +       subq    $(VEC_SIZE * 2 - 1), %rdi
> +       addq    %rdi, %rax
> +# ifdef USE_AS_WCSLEN
>         shrq    $2, %rax
> -#  endif
> +# endif
>         VZEROUPPER_RETURN
>
> +
> +# ifdef USE_AS_STRNLEN
>         .p2align 4
> -L(last_2x_vec):
> -       addl    $(VEC_SIZE * 2), %esi
> -       VPCMPEQ (%rdi), %ymm0, %ymm1
> -       vpmovmskb %ymm1, %eax
> -       testl   %eax, %eax
> +L(last_4x_vec_or_less_load):
> +       /* Depending on entry adjust rdi / prepare first VEC in ymm1.  */
> +       subq    $-(VEC_SIZE * 4), %rdi
> +L(last_4x_vec_or_less_cmpeq):
> +       VPCMPEQ 1(%rdi), %ymm0, %ymm1
> +L(last_4x_vec_or_less):
>
> -       jnz     L(first_vec_x0_check)
> -       subl    $VEC_SIZE, %esi
> -       jle     L(max)
> +       vpmovmskb       %ymm1, %eax
> +       /* If remaining length > VEC_SIZE * 2. This works if esi is off by
> +          VEC_SIZE * 4.  */
> +       testl   $(VEC_SIZE * 2), %esi
> +       jnz     L(last_4x_vec)
>
> -       VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
> -       vpmovmskb %ymm1, %eax
> +       /* length may have been negative or positive by an offset of
> +          VEC_SIZE * 4 depending on where this was called from. This fixes
> +          that.  */
> +       andl    $(VEC_SIZE * 4 - 1), %esi
>         testl   %eax, %eax
> -       jnz     L(first_vec_x1_check)
> -       movq    %r8, %rax
> -#  ifdef USE_AS_WCSLEN
> -       shrq    $2, %rax
> -#  endif
> -       VZEROUPPER_RETURN
> +       jnz     L(last_vec_x1_check)
>
> -       .p2align 4
> -L(first_vec_x0_check):
> +       subl    $VEC_SIZE, %esi
> +       jb      L(max)
> +
> +       VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> +       vpmovmskb       %ymm1, %eax
>         tzcntl  %eax, %eax
>         /* Check the end of data.  */
> -       cmpq    %rax, %rsi
> -       jbe     L(max)
> +       cmpl    %eax, %esi
> +       jb      L(max)
> +       subq    %rdx, %rdi
> +       addl    $(VEC_SIZE + 1), %eax
>         addq    %rdi, %rax
> -       subq    %rdx, %rax
>  #  ifdef USE_AS_WCSLEN
>         shrq    $2, %rax
>  #  endif
>         VZEROUPPER_RETURN
> +# endif
>
>         .p2align 4
> -L(first_vec_x1_check):
> +L(last_vec_return_x0):
>         tzcntl  %eax, %eax
> -       /* Check the end of data.  */
> -       cmpq    %rax, %rsi
> -       jbe     L(max)
> -       addq    $VEC_SIZE, %rax
> +       subq    $(VEC_SIZE * 4 - 1), %rdi
>         addq    %rdi, %rax
> -       subq    %rdx, %rax
> -#  ifdef USE_AS_WCSLEN
> +# ifdef USE_AS_WCSLEN
>         shrq    $2, %rax
> -#  endif
> +# endif
>         VZEROUPPER_RETURN
>
>         .p2align 4
> -L(first_vec_x2_check):
> +L(last_vec_return_x1):
>         tzcntl  %eax, %eax
> -       /* Check the end of data.  */
> -       cmpq    %rax, %rsi
> -       jbe     L(max)
> -       addq    $(VEC_SIZE * 2), %rax
> +       subq    $(VEC_SIZE * 3 - 1), %rdi
>         addq    %rdi, %rax
> -       subq    %rdx, %rax
> -#  ifdef USE_AS_WCSLEN
> +# ifdef USE_AS_WCSLEN
>         shrq    $2, %rax
> -#  endif
> +# endif
>         VZEROUPPER_RETURN
>
> +# ifdef USE_AS_STRNLEN
>         .p2align 4
> -L(first_vec_x3_check):
> +L(last_vec_x1_check):
> +
>         tzcntl  %eax, %eax
>         /* Check the end of data.  */
> -       cmpq    %rax, %rsi
> -       jbe     L(max)
> -       addq    $(VEC_SIZE * 3), %rax
> +       cmpl    %eax, %esi
> +       jb      L(max)
> +       subq    %rdx, %rdi
> +       incl    %eax
>         addq    %rdi, %rax
> -       subq    %rdx, %rax
>  #  ifdef USE_AS_WCSLEN
>         shrq    $2, %rax
>  #  endif
>         VZEROUPPER_RETURN
>
> -       .p2align 4
>  L(max):
>         movq    %r8, %rax
> +       VZEROUPPER_RETURN
> +
> +       .p2align 4
> +L(last_4x_vec):
> +       /* Test first 2x VEC normally.  */
> +       testl   %eax, %eax
> +       jnz     L(last_vec_x1)
> +
> +       VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> +       vpmovmskb       %ymm1, %eax
> +       testl   %eax, %eax
> +       jnz     L(last_vec_x2)
> +
> +       /* Normalize length.  */
> +       andl    $(VEC_SIZE * 4 - 1), %esi
> +       VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
> +       vpmovmskb       %ymm1, %eax
> +       testl   %eax, %eax
> +       jnz     L(last_vec_x3)
> +
> +       subl    $(VEC_SIZE * 3), %esi
> +       jb      L(max)
> +
> +       VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
> +       vpmovmskb       %ymm1, %eax
> +       tzcntl  %eax, %eax
> +       /* Check the end of data.  */
> +       cmpl    %eax, %esi
> +       jb      L(max)
> +       subq    %rdx, %rdi
> +       addl    $(VEC_SIZE * 3 + 1), %eax
> +       addq    %rdi, %rax
>  #  ifdef USE_AS_WCSLEN
>         shrq    $2, %rax
>  #  endif
>         VZEROUPPER_RETURN
>
> -       .p2align 4
> -L(zero):
> -       xorl    %eax, %eax
> -       ret
> -# endif
>
>         .p2align 4
> -L(first_vec_x0):
> +L(last_vec_x1):
> +       /* essentially duplicates of first_vec_x1 but use 64 bit
> +          instructions.  */
>         tzcntl  %eax, %eax
> +       subq    %rdx, %rdi
> +       incl    %eax
>         addq    %rdi, %rax
> -       subq    %rdx, %rax
> -# ifdef USE_AS_WCSLEN
> +#  ifdef USE_AS_WCSLEN
>         shrq    $2, %rax
> -# endif
> +#  endif
>         VZEROUPPER_RETURN
>
>         .p2align 4
> -L(first_vec_x1):
> +L(last_vec_x2):
> +       /* essentially duplicates of first_vec_x1 but use 64 bit
> +          instructions.  */
>         tzcntl  %eax, %eax
> -       addq    $VEC_SIZE, %rax
> +       subq    %rdx, %rdi
> +       addl    $(VEC_SIZE + 1), %eax
>         addq    %rdi, %rax
> -       subq    %rdx, %rax
> -# ifdef USE_AS_WCSLEN
> +#  ifdef USE_AS_WCSLEN
>         shrq    $2, %rax
> -# endif
> +#  endif
>         VZEROUPPER_RETURN
>
>         .p2align 4
> -L(first_vec_x2):
> +L(last_vec_x3):
>         tzcntl  %eax, %eax
> -       addq    $(VEC_SIZE * 2), %rax
> +       subl    $(VEC_SIZE * 2), %esi
> +       /* Check the end of data.  */
> +       cmpl    %eax, %esi
> +       jb      L(max_end)
> +       subq    %rdx, %rdi
> +       addl    $(VEC_SIZE * 2 + 1), %eax
>         addq    %rdi, %rax
> -       subq    %rdx, %rax
> -# ifdef USE_AS_WCSLEN
> +#  ifdef USE_AS_WCSLEN
>         shrq    $2, %rax
> -# endif
> +#  endif
> +       VZEROUPPER_RETURN
> +L(max_end):
> +       movq    %r8, %rax
>         VZEROUPPER_RETURN
> +# endif
>
> +       /* Cold case for crossing page with first load.  */
>         .p2align 4
> -L(4x_vec_end):
> -       VPCMPEQ %ymm1, %ymm0, %ymm1
> -       vpmovmskb %ymm1, %eax
> -       testl   %eax, %eax
> -       jnz     L(first_vec_x0)
> -       VPCMPEQ %ymm2, %ymm0, %ymm2
> -       vpmovmskb %ymm2, %eax
> +L(cross_page_boundary):
> +       /* Align data to VEC_SIZE - 1.  */
> +       orq     $(VEC_SIZE - 1), %rdi
> +       VPCMPEQ -(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1
> +       vpmovmskb       %ymm1, %eax
> +       /* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
> +          so no need to manually mod rdx.  */
> +       sarxl   %edx, %eax, %eax
> +# ifdef USE_AS_STRNLEN
>         testl   %eax, %eax
> -       jnz     L(first_vec_x1)
> -       VPCMPEQ %ymm3, %ymm0, %ymm3
> -       vpmovmskb %ymm3, %eax
> +       jnz     L(cross_page_less_vec)
> +       leaq    1(%rdi), %rcx
> +       subq    %rdx, %rcx
> +       /* Check length.  */
> +       cmpq    %rsi, %rcx
> +       jb      L(cross_page_continue)
> +       movq    %r8, %rax
> +# else
>         testl   %eax, %eax
> -       jnz     L(first_vec_x2)
> -       VPCMPEQ %ymm4, %ymm0, %ymm4
> -       vpmovmskb %ymm4, %eax
> -L(first_vec_x3):
> +       jz      L(cross_page_continue)
>         tzcntl  %eax, %eax
> -       addq    $(VEC_SIZE * 3), %rax
> -       addq    %rdi, %rax
> -       subq    %rdx, %rax
> -# ifdef USE_AS_WCSLEN
> -       shrq    $2, %rax
> +#  ifdef USE_AS_WCSLEN
> +       shrl    $2, %eax
> +#  endif
>  # endif
> +L(return_vzeroupper):
> +       ZERO_UPPER_VEC_REGISTERS_RETURN
> +
> +# ifdef USE_AS_STRNLEN
> +       .p2align 4
> +L(cross_page_less_vec):
> +       tzcntl  %eax, %eax
> +       cmpq    %rax, %rsi
> +       cmovb   %esi, %eax
> +#  ifdef USE_AS_WCSLEN
> +       shrl    $2, %eax
> +#  endif
>         VZEROUPPER_RETURN
> +# endif
>
>  END (STRLEN)
>  #endif
> --
> 2.29.2
>
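
One trick in the quoted patch that is easy to miss is the L(first_vec_x0)
short-length path: setting the bit at the max-length position in the match
mask before the tzcnt makes a single tzcnt return the minimum of the length
limit and the position of the first match.  A rough C equivalent of just
that step (a sketch with made-up names, not the library code) is:

    #include <stdint.h>

    /* bts + tzcnt trick from L(first_vec_x0): force a "match" at bit
       MAXLEN so that counting trailing zeros yields
       min (index of first real match, MAXLEN) in one step.  Assumes
       maxlen <= 63, as in the short-length paths of the patch.  */
    static unsigned int
    min_match_or_len (uint64_t mask, unsigned int maxlen)
    {
      mask |= (uint64_t) 1 << maxlen;                /* btsq %rsi, %rax */
      return (unsigned int) __builtin_ctzll (mask);  /* tzcnt */
    }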

LGTM.  I am checking it in for you.

Thanks.

-- 
H.J.

* Re: [PATCH v5 2/2] x86: Optimize strlen-avx2.S
  2021-04-19 23:36 ` [PATCH v5 2/2] x86: Optimize strlen-avx2.S Noah Goldstein
  2021-04-20  1:01   ` H.J. Lu
@ 2022-09-25  8:19   ` Aurelien Jarno
  2022-09-25 14:00     ` Noah Goldstein
  1 sibling, 1 reply; 24+ messages in thread
From: Aurelien Jarno @ 2022-09-25  8:19 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: libc-alpha

On 2021-04-19 19:36, Noah Goldstein via Libc-alpha wrote:
> No bug. This commit optimizes strlen-avx2.S. The optimizations are
> mostly small things but they add up to roughly 10-30% performance
> improvement for strlen. The results for strnlen are a bit more
> ambiguous. test-strlen, test-strnlen, test-wcslen, and test-wcsnlen
> are all passing.
> 
> Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
> ---
>  sysdeps/x86_64/multiarch/ifunc-impl-list.c |  16 +-
>  sysdeps/x86_64/multiarch/strlen-avx2.S     | 532 +++++++++++++--------
>  2 files changed, 334 insertions(+), 214 deletions(-)
> 
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index c377cab629..651b32908e 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -293,10 +293,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>    /* Support sysdeps/x86_64/multiarch/strlen.c.  */
>    IFUNC_IMPL (i, name, strlen,
>  	      IFUNC_IMPL_ADD (array, i, strlen,
> -			      CPU_FEATURE_USABLE (AVX2),
> +			      (CPU_FEATURE_USABLE (AVX2)
> +			       && CPU_FEATURE_USABLE (BMI2)),
>  			      __strlen_avx2)
>  	      IFUNC_IMPL_ADD (array, i, strlen,
>  			      (CPU_FEATURE_USABLE (AVX2)
> +			       && CPU_FEATURE_USABLE (BMI2)
>  			       && CPU_FEATURE_USABLE (RTM)),
>  			      __strlen_avx2_rtm)
>  	      IFUNC_IMPL_ADD (array, i, strlen,
> @@ -309,10 +311,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>    /* Support sysdeps/x86_64/multiarch/strnlen.c.  */
>    IFUNC_IMPL (i, name, strnlen,
>  	      IFUNC_IMPL_ADD (array, i, strnlen,
> -			      CPU_FEATURE_USABLE (AVX2),
> +			      (CPU_FEATURE_USABLE (AVX2)
> +			       && CPU_FEATURE_USABLE (BMI2)),
>  			      __strnlen_avx2)
>  	      IFUNC_IMPL_ADD (array, i, strnlen,
>  			      (CPU_FEATURE_USABLE (AVX2)
> +			       && CPU_FEATURE_USABLE (BMI2)
>  			       && CPU_FEATURE_USABLE (RTM)),
>  			      __strnlen_avx2_rtm)
>  	      IFUNC_IMPL_ADD (array, i, strnlen,
> @@ -654,10 +658,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>    /* Support sysdeps/x86_64/multiarch/wcslen.c.  */
>    IFUNC_IMPL (i, name, wcslen,
>  	      IFUNC_IMPL_ADD (array, i, wcslen,
> -			      CPU_FEATURE_USABLE (AVX2),
> +			      (CPU_FEATURE_USABLE (AVX2)
> +			       && CPU_FEATURE_USABLE (BMI2)),
>  			      __wcslen_avx2)
>  	      IFUNC_IMPL_ADD (array, i, wcslen,
>  			      (CPU_FEATURE_USABLE (AVX2)
> +			       && CPU_FEATURE_USABLE (BMI2)
>  			       && CPU_FEATURE_USABLE (RTM)),
>  			      __wcslen_avx2_rtm)
>  	      IFUNC_IMPL_ADD (array, i, wcslen,
> @@ -670,10 +676,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>    /* Support sysdeps/x86_64/multiarch/wcsnlen.c.  */
>    IFUNC_IMPL (i, name, wcsnlen,
>  	      IFUNC_IMPL_ADD (array, i, wcsnlen,
> -			      CPU_FEATURE_USABLE (AVX2),
> +			      (CPU_FEATURE_USABLE (AVX2)
> +			       && CPU_FEATURE_USABLE (BMI2)),
>  			      __wcsnlen_avx2)
>  	      IFUNC_IMPL_ADD (array, i, wcsnlen,
>  			      (CPU_FEATURE_USABLE (AVX2)
> +			       && CPU_FEATURE_USABLE (BMI2)
>  			       && CPU_FEATURE_USABLE (RTM)),
>  			      __wcsnlen_avx2_rtm)
>  	      IFUNC_IMPL_ADD (array, i, wcsnlen,
> diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S
> index 1caae9e6bc..bd2e6ee44a 100644
> --- a/sysdeps/x86_64/multiarch/strlen-avx2.S
> +++ b/sysdeps/x86_64/multiarch/strlen-avx2.S
> @@ -27,9 +27,11 @@
>  # ifdef USE_AS_WCSLEN
>  #  define VPCMPEQ	vpcmpeqd
>  #  define VPMINU	vpminud
> +#  define CHAR_SIZE	4
>  # else
>  #  define VPCMPEQ	vpcmpeqb
>  #  define VPMINU	vpminub
> +#  define CHAR_SIZE	1
>  # endif
>  
>  # ifndef VZEROUPPER
> @@ -41,349 +43,459 @@
>  # endif
>  
>  # define VEC_SIZE 32
> +# define PAGE_SIZE 4096
>  
>  	.section SECTION(.text),"ax",@progbits
>  ENTRY (STRLEN)
>  # ifdef USE_AS_STRNLEN
> -	/* Check for zero length.  */
> +	/* Check zero length.  */
>  	test	%RSI_LP, %RSI_LP
>  	jz	L(zero)
> +	/* Store max len in R8_LP before adjusting if using WCSLEN.  */
> +	mov	%RSI_LP, %R8_LP
>  #  ifdef USE_AS_WCSLEN
>  	shl	$2, %RSI_LP
>  #  elif defined __ILP32__
>  	/* Clear the upper 32 bits.  */
>  	movl	%esi, %esi
>  #  endif
> -	mov	%RSI_LP, %R8_LP
>  # endif
> -	movl	%edi, %ecx
> +	movl	%edi, %eax
>  	movq	%rdi, %rdx
>  	vpxor	%xmm0, %xmm0, %xmm0
> -
> +	/* Clear high bits from edi. Only keeping bits relevant to page
> +	   cross check.  */
> +	andl	$(PAGE_SIZE - 1), %eax
>  	/* Check if we may cross page boundary with one vector load.  */
> -	andl	$(2 * VEC_SIZE - 1), %ecx
> -	cmpl	$VEC_SIZE, %ecx
> -	ja	L(cros_page_boundary)
> +	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
> +	ja	L(cross_page_boundary)
>  
>  	/* Check the first VEC_SIZE bytes.  */
> -	VPCMPEQ (%rdi), %ymm0, %ymm1
> -	vpmovmskb %ymm1, %eax
> -	testl	%eax, %eax
> -
> +	VPCMPEQ	(%rdi), %ymm0, %ymm1
> +	vpmovmskb	%ymm1, %eax
>  # ifdef USE_AS_STRNLEN
> -	jnz	L(first_vec_x0_check)
> -	/* Adjust length and check the end of data.  */
> -	subq	$VEC_SIZE, %rsi
> -	jbe	L(max)
> -# else
> -	jnz	L(first_vec_x0)
> +	/* If length < VEC_SIZE handle special.  */
> +	cmpq	$VEC_SIZE, %rsi
> +	jbe	L(first_vec_x0)
>  # endif
> -
> -	/* Align data for aligned loads in the loop.  */
> -	addq	$VEC_SIZE, %rdi
> -	andl	$(VEC_SIZE - 1), %ecx
> -	andq	$-VEC_SIZE, %rdi
> +	/* If empty continue to aligned_more. Otherwise return bit
> +	   position of first match.  */
> +	testl	%eax, %eax
> +	jz	L(aligned_more)
> +	tzcntl	%eax, %eax
> +# ifdef USE_AS_WCSLEN
> +	shrl	$2, %eax
> +# endif
> +	VZEROUPPER_RETURN
>  
>  # ifdef USE_AS_STRNLEN
> -	/* Adjust length.  */
> -	addq	%rcx, %rsi
> +L(zero):
> +	xorl	%eax, %eax
> +	ret
>  
> -	subq	$(VEC_SIZE * 4), %rsi
> -	jbe	L(last_4x_vec_or_less)
> +	.p2align 4
> +L(first_vec_x0):
> +	/* Set bit for max len so that tzcnt will return min of max len
> +	   and position of first match.  */
> +	btsq	%rsi, %rax
> +	tzcntl	%eax, %eax
> +#  ifdef USE_AS_WCSLEN
> +	shrl	$2, %eax
> +#  endif
> +	VZEROUPPER_RETURN
>  # endif
> -	jmp	L(more_4x_vec)
>  
>  	.p2align 4
> -L(cros_page_boundary):
> -	andl	$(VEC_SIZE - 1), %ecx
> -	andq	$-VEC_SIZE, %rdi
> -	VPCMPEQ (%rdi), %ymm0, %ymm1
> -	vpmovmskb %ymm1, %eax
> -	/* Remove the leading bytes.  */
> -	sarl	%cl, %eax
> -	testl	%eax, %eax
> -	jz	L(aligned_more)
> +L(first_vec_x1):
>  	tzcntl	%eax, %eax
> +	/* Safe to use 32 bit instructions as these are only called for
> +	   size = [1, 159].  */
>  # ifdef USE_AS_STRNLEN
> -	/* Check the end of data.  */
> -	cmpq	%rax, %rsi
> -	jbe	L(max)
> +	/* Use ecx which was computed earlier to compute correct value.
> +	 */
> +	subl	$(VEC_SIZE * 4 + 1), %ecx
> +	addl	%ecx, %eax
> +# else
> +	subl	%edx, %edi
> +	incl	%edi
> +	addl	%edi, %eax
>  # endif
> -	addq	%rdi, %rax
> -	addq	%rcx, %rax
> -	subq	%rdx, %rax
>  # ifdef USE_AS_WCSLEN
> -	shrq	$2, %rax
> +	shrl	$2, %eax
>  # endif
> -L(return_vzeroupper):
> -	ZERO_UPPER_VEC_REGISTERS_RETURN
> +	VZEROUPPER_RETURN
>  
>  	.p2align 4
> -L(aligned_more):
> +L(first_vec_x2):
> +	tzcntl	%eax, %eax
> +	/* Safe to use 32 bit instructions as these are only called for
> +	   size = [1, 159].  */
>  # ifdef USE_AS_STRNLEN
> -        /* "rcx" is less than VEC_SIZE.  Calculate "rdx + rcx - VEC_SIZE"
> -	    with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE"
> -	    to void possible addition overflow.  */
> -	negq	%rcx
> -	addq	$VEC_SIZE, %rcx
> -
> -	/* Check the end of data.  */
> -	subq	%rcx, %rsi
> -	jbe	L(max)
> +	/* Use ecx which was computed earlier to compute correct value.
> +	 */
> +	subl	$(VEC_SIZE * 3 + 1), %ecx
> +	addl	%ecx, %eax
> +# else
> +	subl	%edx, %edi
> +	addl	$(VEC_SIZE + 1), %edi
> +	addl	%edi, %eax
>  # endif
> +# ifdef USE_AS_WCSLEN
> +	shrl	$2, %eax
> +# endif
> +	VZEROUPPER_RETURN
>  
> -	addq	$VEC_SIZE, %rdi
> +	.p2align 4
> +L(first_vec_x3):
> +	tzcntl	%eax, %eax
> +	/* Safe to use 32 bit instructions as these are only called for
> +	   size = [1, 159].  */
> +# ifdef USE_AS_STRNLEN
> +	/* Use ecx which was computed earlier to compute correct value.
> +	 */
> +	subl	$(VEC_SIZE * 2 + 1), %ecx
> +	addl	%ecx, %eax
> +# else
> +	subl	%edx, %edi
> +	addl	$(VEC_SIZE * 2 + 1), %edi
> +	addl	%edi, %eax
> +# endif
> +# ifdef USE_AS_WCSLEN
> +	shrl	$2, %eax
> +# endif
> +	VZEROUPPER_RETURN
>  
> +	.p2align 4
> +L(first_vec_x4):
> +	tzcntl	%eax, %eax
> +	/* Safe to use 32 bit instructions as these are only called for
> +	   size = [1, 159].  */
>  # ifdef USE_AS_STRNLEN
> -	subq	$(VEC_SIZE * 4), %rsi
> -	jbe	L(last_4x_vec_or_less)
> +	/* Use ecx which was computed earlier to compute correct value.
> +	 */
> +	subl	$(VEC_SIZE + 1), %ecx
> +	addl	%ecx, %eax
> +# else
> +	subl	%edx, %edi
> +	addl	$(VEC_SIZE * 3 + 1), %edi
> +	addl	%edi, %eax
>  # endif
> +# ifdef USE_AS_WCSLEN
> +	shrl	$2, %eax
> +# endif
> +	VZEROUPPER_RETURN
>  
> -L(more_4x_vec):
> +	.p2align 5
> +L(aligned_more):
> +	/* Align data to VEC_SIZE - 1. This is the same number of
> +	   instructions as using andq with -VEC_SIZE but saves 4 bytes of
> +	   code on the x4 check.  */
> +	orq	$(VEC_SIZE - 1), %rdi
> +L(cross_page_continue):
>  	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
>  	   since data is only aligned to VEC_SIZE.  */
> -	VPCMPEQ (%rdi), %ymm0, %ymm1
> -	vpmovmskb %ymm1, %eax
> -	testl	%eax, %eax
> -	jnz	L(first_vec_x0)
> -
> -	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
> -	vpmovmskb %ymm1, %eax
> +# ifdef USE_AS_STRNLEN
> +	/* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE because
> +	   it simplifies the logic in last_4x_vec_or_less.  */
> +	leaq	(VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx
> +	subq	%rdx, %rcx
> +# endif
> +	/* Load first VEC regardless.  */
> +	VPCMPEQ	1(%rdi), %ymm0, %ymm1
> +# ifdef USE_AS_STRNLEN
> +	/* Adjust length. If near end handle specially.  */
> +	subq	%rcx, %rsi
> +	jb	L(last_4x_vec_or_less)
> +# endif
> +	vpmovmskb	%ymm1, %eax
>  	testl	%eax, %eax
>  	jnz	L(first_vec_x1)
>  
> -	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
> -	vpmovmskb %ymm1, %eax
> +	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> +	vpmovmskb	%ymm1, %eax
>  	testl	%eax, %eax
>  	jnz	L(first_vec_x2)
>  
> -	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
> -	vpmovmskb %ymm1, %eax
> +	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
> +	vpmovmskb	%ymm1, %eax
>  	testl	%eax, %eax
>  	jnz	L(first_vec_x3)
>  
> -	addq	$(VEC_SIZE * 4), %rdi
> -
> -# ifdef USE_AS_STRNLEN
> -	subq	$(VEC_SIZE * 4), %rsi
> -	jbe	L(last_4x_vec_or_less)
> -# endif
> -
> -	/* Align data to 4 * VEC_SIZE.  */
> -	movq	%rdi, %rcx
> -	andl	$(4 * VEC_SIZE - 1), %ecx
> -	andq	$-(4 * VEC_SIZE), %rdi
> +	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
> +	vpmovmskb	%ymm1, %eax
> +	testl	%eax, %eax
> +	jnz	L(first_vec_x4)
>  
> +	/* Align data to VEC_SIZE * 4 - 1.  */
>  # ifdef USE_AS_STRNLEN
> -	/* Adjust length.  */
> +	/* Before adjusting length check if at last VEC_SIZE * 4.  */
> +	cmpq	$(VEC_SIZE * 4 - 1), %rsi
> +	jbe	L(last_4x_vec_or_less_load)
> +	incq	%rdi
> +	movl	%edi, %ecx
> +	orq	$(VEC_SIZE * 4 - 1), %rdi
> +	andl	$(VEC_SIZE * 4 - 1), %ecx
> +	/* Readjust length.  */
>  	addq	%rcx, %rsi
> +# else
> +	incq	%rdi
> +	orq	$(VEC_SIZE * 4 - 1), %rdi
>  # endif
> -
> +	/* Compare 4 * VEC at a time forward.  */
>  	.p2align 4
>  L(loop_4x_vec):
> -	/* Compare 4 * VEC at a time forward.  */
> -	vmovdqa (%rdi), %ymm1
> -	vmovdqa	VEC_SIZE(%rdi), %ymm2
> -	vmovdqa	(VEC_SIZE * 2)(%rdi), %ymm3
> -	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm4
> -	VPMINU	%ymm1, %ymm2, %ymm5
> -	VPMINU	%ymm3, %ymm4, %ymm6
> -	VPMINU	%ymm5, %ymm6, %ymm5
> -
> -	VPCMPEQ	%ymm5, %ymm0, %ymm5
> -	vpmovmskb %ymm5, %eax
> -	testl	%eax, %eax
> -	jnz	L(4x_vec_end)
> -
> -	addq	$(VEC_SIZE * 4), %rdi
> -
> -# ifndef USE_AS_STRNLEN
> -	jmp	L(loop_4x_vec)
> -# else
> +# ifdef USE_AS_STRNLEN
> +	/* Break if at end of length.  */
>  	subq	$(VEC_SIZE * 4), %rsi
> -	ja	L(loop_4x_vec)
> -
> -L(last_4x_vec_or_less):
> -	/* Less than 4 * VEC and aligned to VEC_SIZE.  */
> -	addl	$(VEC_SIZE * 2), %esi
> -	jle	L(last_2x_vec)
> +	jb	L(last_4x_vec_or_less_cmpeq)
> +# endif
> +	/* Save some code size by microfusing VPMINU with the load. Since
> +	   the matches in ymm2/ymm4 can only be returned if there were no
> +	   matches in ymm1/ymm3 respectively there is no issue with overlap.
> +	 */
> +	vmovdqa	1(%rdi), %ymm1
> +	VPMINU	(VEC_SIZE + 1)(%rdi), %ymm1, %ymm2
> +	vmovdqa	(VEC_SIZE * 2 + 1)(%rdi), %ymm3
> +	VPMINU	(VEC_SIZE * 3 + 1)(%rdi), %ymm3, %ymm4
> +
> +	VPMINU	%ymm2, %ymm4, %ymm5
> +	VPCMPEQ	%ymm5, %ymm0, %ymm5
> +	vpmovmskb	%ymm5, %ecx
>  
> -	VPCMPEQ (%rdi), %ymm0, %ymm1
> -	vpmovmskb %ymm1, %eax
> -	testl	%eax, %eax
> -	jnz	L(first_vec_x0)
> +	subq	$-(VEC_SIZE * 4), %rdi
> +	testl	%ecx, %ecx
> +	jz	L(loop_4x_vec)
>  
> -	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
> -	vpmovmskb %ymm1, %eax
> -	testl	%eax, %eax
> -	jnz	L(first_vec_x1)
>  
> -	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
> -	vpmovmskb %ymm1, %eax
> +	VPCMPEQ	%ymm1, %ymm0, %ymm1
> +	vpmovmskb	%ymm1, %eax
> +	subq	%rdx, %rdi
>  	testl	%eax, %eax
> +	jnz	L(last_vec_return_x0)
>  
> -	jnz	L(first_vec_x2_check)
> -	subl	$VEC_SIZE, %esi
> -	jle	L(max)
> -
> -	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
> -	vpmovmskb %ymm1, %eax
> +	VPCMPEQ	%ymm2, %ymm0, %ymm2
> +	vpmovmskb	%ymm2, %eax
>  	testl	%eax, %eax
> -
> -	jnz	L(first_vec_x3_check)
> -	movq	%r8, %rax
> -#  ifdef USE_AS_WCSLEN
> +	jnz	L(last_vec_return_x1)
> +
> +	/* Combine last 2 VEC.  */
> +	VPCMPEQ	%ymm3, %ymm0, %ymm3
> +	vpmovmskb	%ymm3, %eax
> +	/* rcx has combined result from all 4 VEC. It will only be used if
> +	   the first 3 other VEC all did not contain a match.  */
> +	salq	$32, %rcx
> +	orq	%rcx, %rax
> +	tzcntq	%rax, %rax
> +	subq	$(VEC_SIZE * 2 - 1), %rdi
> +	addq	%rdi, %rax
> +# ifdef USE_AS_WCSLEN
>  	shrq	$2, %rax
> -#  endif
> +# endif
>  	VZEROUPPER_RETURN
>  
> +
> +# ifdef USE_AS_STRNLEN
>  	.p2align 4
> -L(last_2x_vec):
> -	addl	$(VEC_SIZE * 2), %esi
> -	VPCMPEQ (%rdi), %ymm0, %ymm1
> -	vpmovmskb %ymm1, %eax
> -	testl	%eax, %eax
> +L(last_4x_vec_or_less_load):
> +	/* Depending on entry adjust rdi / prepare first VEC in ymm1.  */
> +	subq	$-(VEC_SIZE * 4), %rdi
> +L(last_4x_vec_or_less_cmpeq):
> +	VPCMPEQ	1(%rdi), %ymm0, %ymm1
> +L(last_4x_vec_or_less):
>  
> -	jnz	L(first_vec_x0_check)
> -	subl	$VEC_SIZE, %esi
> -	jle	L(max)
> +	vpmovmskb	%ymm1, %eax
> +	/* If remaining length > VEC_SIZE * 2. This works if esi is off by
> +	   VEC_SIZE * 4.  */
> +	testl	$(VEC_SIZE * 2), %esi
> +	jnz	L(last_4x_vec)
>  
> -	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
> -	vpmovmskb %ymm1, %eax
> +	/* length may have been negative or positive by an offset of
> +	   VEC_SIZE * 4 depending on where this was called from. This fixes
> +	   that.  */
> +	andl	$(VEC_SIZE * 4 - 1), %esi
>  	testl	%eax, %eax
> -	jnz	L(first_vec_x1_check)
> -	movq	%r8, %rax
> -#  ifdef USE_AS_WCSLEN
> -	shrq	$2, %rax
> -#  endif
> -	VZEROUPPER_RETURN
> +	jnz	L(last_vec_x1_check)
>  
> -	.p2align 4
> -L(first_vec_x0_check):
> +	subl	$VEC_SIZE, %esi
> +	jb	L(max)
> +
> +	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> +	vpmovmskb	%ymm1, %eax
>  	tzcntl	%eax, %eax
>  	/* Check the end of data.  */
> -	cmpq	%rax, %rsi
> -	jbe	L(max)
> +	cmpl	%eax, %esi
> +	jb	L(max)
> +	subq	%rdx, %rdi
> +	addl	$(VEC_SIZE + 1), %eax
>  	addq	%rdi, %rax
> -	subq	%rdx, %rax
>  #  ifdef USE_AS_WCSLEN
>  	shrq	$2, %rax
>  #  endif
>  	VZEROUPPER_RETURN
> +# endif
>  
>  	.p2align 4
> -L(first_vec_x1_check):
> +L(last_vec_return_x0):
>  	tzcntl	%eax, %eax
> -	/* Check the end of data.  */
> -	cmpq	%rax, %rsi
> -	jbe	L(max)
> -	addq	$VEC_SIZE, %rax
> +	subq	$(VEC_SIZE * 4 - 1), %rdi
>  	addq	%rdi, %rax
> -	subq	%rdx, %rax
> -#  ifdef USE_AS_WCSLEN
> +# ifdef USE_AS_WCSLEN
>  	shrq	$2, %rax
> -#  endif
> +# endif
>  	VZEROUPPER_RETURN
>  
>  	.p2align 4
> -L(first_vec_x2_check):
> +L(last_vec_return_x1):
>  	tzcntl	%eax, %eax
> -	/* Check the end of data.  */
> -	cmpq	%rax, %rsi
> -	jbe	L(max)
> -	addq	$(VEC_SIZE * 2), %rax
> +	subq	$(VEC_SIZE * 3 - 1), %rdi
>  	addq	%rdi, %rax
> -	subq	%rdx, %rax
> -#  ifdef USE_AS_WCSLEN
> +# ifdef USE_AS_WCSLEN
>  	shrq	$2, %rax
> -#  endif
> +# endif
>  	VZEROUPPER_RETURN
>  
> +# ifdef USE_AS_STRNLEN
>  	.p2align 4
> -L(first_vec_x3_check):
> +L(last_vec_x1_check):
> +
>  	tzcntl	%eax, %eax
>  	/* Check the end of data.  */
> -	cmpq	%rax, %rsi
> -	jbe	L(max)
> -	addq	$(VEC_SIZE * 3), %rax
> +	cmpl	%eax, %esi
> +	jb	L(max)
> +	subq	%rdx, %rdi
> +	incl	%eax
>  	addq	%rdi, %rax
> -	subq	%rdx, %rax
>  #  ifdef USE_AS_WCSLEN
>  	shrq	$2, %rax
>  #  endif
>  	VZEROUPPER_RETURN
>  
> -	.p2align 4
>  L(max):
>  	movq	%r8, %rax
> +	VZEROUPPER_RETURN
> +
> +	.p2align 4
> +L(last_4x_vec):
> +	/* Test first 2x VEC normally.  */
> +	testl	%eax, %eax
> +	jnz	L(last_vec_x1)
> +
> +	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> +	vpmovmskb	%ymm1, %eax
> +	testl	%eax, %eax
> +	jnz	L(last_vec_x2)
> +
> +	/* Normalize length.  */
> +	andl	$(VEC_SIZE * 4 - 1), %esi
> +	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
> +	vpmovmskb	%ymm1, %eax
> +	testl	%eax, %eax
> +	jnz	L(last_vec_x3)
> +
> +	subl	$(VEC_SIZE * 3), %esi
> +	jb	L(max)
> +
> +	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
> +	vpmovmskb	%ymm1, %eax
> +	tzcntl	%eax, %eax
> +	/* Check the end of data.  */
> +	cmpl	%eax, %esi
> +	jb	L(max)
> +	subq	%rdx, %rdi
> +	addl	$(VEC_SIZE * 3 + 1), %eax
> +	addq	%rdi, %rax
>  #  ifdef USE_AS_WCSLEN
>  	shrq	$2, %rax
>  #  endif
>  	VZEROUPPER_RETURN
>  
> -	.p2align 4
> -L(zero):
> -	xorl	%eax, %eax
> -	ret
> -# endif
>  
>  	.p2align 4
> -L(first_vec_x0):
> +L(last_vec_x1):
> +	/* essentially duplicates of first_vec_x1 but use 64 bit
> +	   instructions.  */
>  	tzcntl	%eax, %eax
> +	subq	%rdx, %rdi
> +	incl	%eax
>  	addq	%rdi, %rax
> -	subq	%rdx, %rax
> -# ifdef USE_AS_WCSLEN
> +#  ifdef USE_AS_WCSLEN
>  	shrq	$2, %rax
> -# endif
> +#  endif
>  	VZEROUPPER_RETURN
>  
>  	.p2align 4
> -L(first_vec_x1):
> +L(last_vec_x2):
> +	/* essentially duplicates of first_vec_x1 but use 64 bit
> +	   instructions.  */
>  	tzcntl	%eax, %eax
> -	addq	$VEC_SIZE, %rax
> +	subq	%rdx, %rdi
> +	addl	$(VEC_SIZE + 1), %eax
>  	addq	%rdi, %rax
> -	subq	%rdx, %rax
> -# ifdef USE_AS_WCSLEN
> +#  ifdef USE_AS_WCSLEN
>  	shrq	$2, %rax
> -# endif
> +#  endif
>  	VZEROUPPER_RETURN
>  
>  	.p2align 4
> -L(first_vec_x2):
> +L(last_vec_x3):
>  	tzcntl	%eax, %eax
> -	addq	$(VEC_SIZE * 2), %rax
> +	subl	$(VEC_SIZE * 2), %esi
> +	/* Check the end of data.  */
> +	cmpl	%eax, %esi
> +	jb	L(max_end)
> +	subq	%rdx, %rdi
> +	addl	$(VEC_SIZE * 2 + 1), %eax
>  	addq	%rdi, %rax
> -	subq	%rdx, %rax
> -# ifdef USE_AS_WCSLEN
> +#  ifdef USE_AS_WCSLEN
>  	shrq	$2, %rax
> -# endif
> +#  endif
> +	VZEROUPPER_RETURN
> +L(max_end):
> +	movq	%r8, %rax
>  	VZEROUPPER_RETURN
> +# endif
>  
> +	/* Cold case for crossing page with first load.	 */
>  	.p2align 4
> -L(4x_vec_end):
> -	VPCMPEQ	%ymm1, %ymm0, %ymm1
> -	vpmovmskb %ymm1, %eax
> -	testl	%eax, %eax
> -	jnz	L(first_vec_x0)
> -	VPCMPEQ %ymm2, %ymm0, %ymm2
> -	vpmovmskb %ymm2, %eax
> +L(cross_page_boundary):
> +	/* Align data to VEC_SIZE - 1.  */
> +	orq	$(VEC_SIZE - 1), %rdi
> +	VPCMPEQ	-(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1
> +	vpmovmskb	%ymm1, %eax
> +	/* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
> +	   so no need to manually mod rdx.  */
> +	sarxl	%edx, %eax, %eax

This is a BMI2 instruction, which is not necessarily available when AVX2
is available. This causes SIGILL on some CPUs. I have reported this at
https://sourceware.org/bugzilla/show_bug.cgi?id=29611
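
For reference, the fix on the dispatch side amounts to selecting the AVX2
variant only when BMI2 is usable as well.  Outside of glibc's ifunc
machinery the same guard can be sketched with GCC's CPU feature builtin;
everything below except __builtin_cpu_supports and strlen is a made-up
illustration, not the glibc code.

    #include <string.h>

    /* Stand-in for the AVX2+BMI2 routine from the patch; a real build
       would provide the assembly implementation instead.  */
    static size_t
    my_strlen_avx2_bmi2 (const char *s)
    {
      return strlen (s);
    }

    static size_t
    my_strlen_dispatch (const char *s)
    {
      /* Only take the AVX2 path when BMI2 (sarx/shlx) is also present;
         otherwise fall back to the baseline strlen.  */
      if (__builtin_cpu_supports ("avx2") && __builtin_cpu_supports ("bmi2"))
        return my_strlen_avx2_bmi2 (s);
      return strlen (s);
    }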

Regards
Aurelien

-- 
Aurelien Jarno                          GPG: 4096R/1DDD8C9B
aurelien@aurel32.net                 http://www.aurel32.net

* Re: [PATCH v5 2/2] x86: Optimize strlen-avx2.S
  2022-09-25  8:19   ` Aurelien Jarno
@ 2022-09-25 14:00     ` Noah Goldstein
  2022-09-28 13:54       ` Sunil Pandey
  0 siblings, 1 reply; 24+ messages in thread
From: Noah Goldstein @ 2022-09-25 14:00 UTC (permalink / raw)
  To: Noah Goldstein, GNU C Library

On Sun, Sep 25, 2022 at 1:19 AM Aurelien Jarno <aurelien@aurel32.net> wrote:
>
> On 2021-04-19 19:36, Noah Goldstein via Libc-alpha wrote:
> > No bug. This commit optimizes strlen-avx2.S. The optimizations are
> > mostly small things but they add up to roughly 10-30% performance
> > improvement for strlen. The results for strnlen are a bit more
> > ambiguous. test-strlen, test-strnlen, test-wcslen, and test-wcsnlen
> > are all passing.
> >
> > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
> > ---
> >  sysdeps/x86_64/multiarch/ifunc-impl-list.c |  16 +-
> >  sysdeps/x86_64/multiarch/strlen-avx2.S     | 532 +++++++++++++--------
> >  2 files changed, 334 insertions(+), 214 deletions(-)
> >
> > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > index c377cab629..651b32908e 100644
> > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > @@ -293,10 +293,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> >    /* Support sysdeps/x86_64/multiarch/strlen.c.  */
> >    IFUNC_IMPL (i, name, strlen,
> >             IFUNC_IMPL_ADD (array, i, strlen,
> > -                           CPU_FEATURE_USABLE (AVX2),
> > +                           (CPU_FEATURE_USABLE (AVX2)
> > +                            && CPU_FEATURE_USABLE (BMI2)),
> >                             __strlen_avx2)
> >             IFUNC_IMPL_ADD (array, i, strlen,
> >                             (CPU_FEATURE_USABLE (AVX2)
> > +                            && CPU_FEATURE_USABLE (BMI2)
> >                              && CPU_FEATURE_USABLE (RTM)),
> >                             __strlen_avx2_rtm)
> >             IFUNC_IMPL_ADD (array, i, strlen,
> > @@ -309,10 +311,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> >    /* Support sysdeps/x86_64/multiarch/strnlen.c.  */
> >    IFUNC_IMPL (i, name, strnlen,
> >             IFUNC_IMPL_ADD (array, i, strnlen,
> > -                           CPU_FEATURE_USABLE (AVX2),
> > +                           (CPU_FEATURE_USABLE (AVX2)
> > +                            && CPU_FEATURE_USABLE (BMI2)),
> >                             __strnlen_avx2)
> >             IFUNC_IMPL_ADD (array, i, strnlen,
> >                             (CPU_FEATURE_USABLE (AVX2)
> > +                            && CPU_FEATURE_USABLE (BMI2)
> >                              && CPU_FEATURE_USABLE (RTM)),
> >                             __strnlen_avx2_rtm)
> >             IFUNC_IMPL_ADD (array, i, strnlen,
> > @@ -654,10 +658,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> >    /* Support sysdeps/x86_64/multiarch/wcslen.c.  */
> >    IFUNC_IMPL (i, name, wcslen,
> >             IFUNC_IMPL_ADD (array, i, wcslen,
> > -                           CPU_FEATURE_USABLE (AVX2),
> > +                           (CPU_FEATURE_USABLE (AVX2)
> > +                            && CPU_FEATURE_USABLE (BMI2)),
> >                             __wcslen_avx2)
> >             IFUNC_IMPL_ADD (array, i, wcslen,
> >                             (CPU_FEATURE_USABLE (AVX2)
> > +                            && CPU_FEATURE_USABLE (BMI2)
> >                              && CPU_FEATURE_USABLE (RTM)),
> >                             __wcslen_avx2_rtm)
> >             IFUNC_IMPL_ADD (array, i, wcslen,
> > @@ -670,10 +676,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> >    /* Support sysdeps/x86_64/multiarch/wcsnlen.c.  */
> >    IFUNC_IMPL (i, name, wcsnlen,
> >             IFUNC_IMPL_ADD (array, i, wcsnlen,
> > -                           CPU_FEATURE_USABLE (AVX2),
> > +                           (CPU_FEATURE_USABLE (AVX2)
> > +                            && CPU_FEATURE_USABLE (BMI2)),
> >                             __wcsnlen_avx2)
> >             IFUNC_IMPL_ADD (array, i, wcsnlen,
> >                             (CPU_FEATURE_USABLE (AVX2)
> > +                            && CPU_FEATURE_USABLE (BMI2)
> >                              && CPU_FEATURE_USABLE (RTM)),
> >                             __wcsnlen_avx2_rtm)
> >             IFUNC_IMPL_ADD (array, i, wcsnlen,
> > diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S
> > index 1caae9e6bc..bd2e6ee44a 100644
> > --- a/sysdeps/x86_64/multiarch/strlen-avx2.S
> > +++ b/sysdeps/x86_64/multiarch/strlen-avx2.S
> > @@ -27,9 +27,11 @@
> >  # ifdef USE_AS_WCSLEN
> >  #  define VPCMPEQ    vpcmpeqd
> >  #  define VPMINU     vpminud
> > +#  define CHAR_SIZE  4
> >  # else
> >  #  define VPCMPEQ    vpcmpeqb
> >  #  define VPMINU     vpminub
> > +#  define CHAR_SIZE  1
> >  # endif
> >
> >  # ifndef VZEROUPPER
> > @@ -41,349 +43,459 @@
> >  # endif
> >
> >  # define VEC_SIZE 32
> > +# define PAGE_SIZE 4096
> >
> >       .section SECTION(.text),"ax",@progbits
> >  ENTRY (STRLEN)
> >  # ifdef USE_AS_STRNLEN
> > -     /* Check for zero length.  */
> > +     /* Check zero length.  */
> >       test    %RSI_LP, %RSI_LP
> >       jz      L(zero)
> > +     /* Store max len in R8_LP before adjusting if using WCSLEN.  */
> > +     mov     %RSI_LP, %R8_LP
> >  #  ifdef USE_AS_WCSLEN
> >       shl     $2, %RSI_LP
> >  #  elif defined __ILP32__
> >       /* Clear the upper 32 bits.  */
> >       movl    %esi, %esi
> >  #  endif
> > -     mov     %RSI_LP, %R8_LP
> >  # endif
> > -     movl    %edi, %ecx
> > +     movl    %edi, %eax
> >       movq    %rdi, %rdx
> >       vpxor   %xmm0, %xmm0, %xmm0
> > -
> > +     /* Clear high bits from edi. Only keeping bits relevant to page
> > +        cross check.  */
> > +     andl    $(PAGE_SIZE - 1), %eax
> >       /* Check if we may cross page boundary with one vector load.  */
> > -     andl    $(2 * VEC_SIZE - 1), %ecx
> > -     cmpl    $VEC_SIZE, %ecx
> > -     ja      L(cros_page_boundary)
> > +     cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
> > +     ja      L(cross_page_boundary)
> >
> >       /* Check the first VEC_SIZE bytes.  */
> > -     VPCMPEQ (%rdi), %ymm0, %ymm1
> > -     vpmovmskb %ymm1, %eax
> > -     testl   %eax, %eax
> > -
> > +     VPCMPEQ (%rdi), %ymm0, %ymm1
> > +     vpmovmskb       %ymm1, %eax
> >  # ifdef USE_AS_STRNLEN
> > -     jnz     L(first_vec_x0_check)
> > -     /* Adjust length and check the end of data.  */
> > -     subq    $VEC_SIZE, %rsi
> > -     jbe     L(max)
> > -# else
> > -     jnz     L(first_vec_x0)
> > +     /* If length < VEC_SIZE handle special.  */
> > +     cmpq    $VEC_SIZE, %rsi
> > +     jbe     L(first_vec_x0)
> >  # endif
> > -
> > -     /* Align data for aligned loads in the loop.  */
> > -     addq    $VEC_SIZE, %rdi
> > -     andl    $(VEC_SIZE - 1), %ecx
> > -     andq    $-VEC_SIZE, %rdi
> > +     /* If empty continue to aligned_more. Otherwise return bit
> > +        position of first match.  */
> > +     testl   %eax, %eax
> > +     jz      L(aligned_more)
> > +     tzcntl  %eax, %eax
> > +# ifdef USE_AS_WCSLEN
> > +     shrl    $2, %eax
> > +# endif
> > +     VZEROUPPER_RETURN
> >
> >  # ifdef USE_AS_STRNLEN
> > -     /* Adjust length.  */
> > -     addq    %rcx, %rsi
> > +L(zero):
> > +     xorl    %eax, %eax
> > +     ret
> >
> > -     subq    $(VEC_SIZE * 4), %rsi
> > -     jbe     L(last_4x_vec_or_less)
> > +     .p2align 4
> > +L(first_vec_x0):
> > +     /* Set bit for max len so that tzcnt will return min of max len
> > +        and position of first match.  */
> > +     btsq    %rsi, %rax
> > +     tzcntl  %eax, %eax
> > +#  ifdef USE_AS_WCSLEN
> > +     shrl    $2, %eax
> > +#  endif
> > +     VZEROUPPER_RETURN
> >  # endif
> > -     jmp     L(more_4x_vec)
> >
> >       .p2align 4
> > -L(cros_page_boundary):
> > -     andl    $(VEC_SIZE - 1), %ecx
> > -     andq    $-VEC_SIZE, %rdi
> > -     VPCMPEQ (%rdi), %ymm0, %ymm1
> > -     vpmovmskb %ymm1, %eax
> > -     /* Remove the leading bytes.  */
> > -     sarl    %cl, %eax
> > -     testl   %eax, %eax
> > -     jz      L(aligned_more)
> > +L(first_vec_x1):
> >       tzcntl  %eax, %eax
> > +     /* Safe to use 32 bit instructions as these are only called for
> > +        size = [1, 159].  */
> >  # ifdef USE_AS_STRNLEN
> > -     /* Check the end of data.  */
> > -     cmpq    %rax, %rsi
> > -     jbe     L(max)
> > +     /* Use ecx which was computed earlier to compute correct value.
> > +      */
> > +     subl    $(VEC_SIZE * 4 + 1), %ecx
> > +     addl    %ecx, %eax
> > +# else
> > +     subl    %edx, %edi
> > +     incl    %edi
> > +     addl    %edi, %eax
> >  # endif
> > -     addq    %rdi, %rax
> > -     addq    %rcx, %rax
> > -     subq    %rdx, %rax
> >  # ifdef USE_AS_WCSLEN
> > -     shrq    $2, %rax
> > +     shrl    $2, %eax
> >  # endif
> > -L(return_vzeroupper):
> > -     ZERO_UPPER_VEC_REGISTERS_RETURN
> > +     VZEROUPPER_RETURN
> >
> >       .p2align 4
> > -L(aligned_more):
> > +L(first_vec_x2):
> > +     tzcntl  %eax, %eax
> > +     /* Safe to use 32 bit instructions as these are only called for
> > +        size = [1, 159].  */
> >  # ifdef USE_AS_STRNLEN
> > -        /* "rcx" is less than VEC_SIZE.  Calculate "rdx + rcx - VEC_SIZE"
> > -         with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE"
> > -         to void possible addition overflow.  */
> > -     negq    %rcx
> > -     addq    $VEC_SIZE, %rcx
> > -
> > -     /* Check the end of data.  */
> > -     subq    %rcx, %rsi
> > -     jbe     L(max)
> > +     /* Use ecx which was computed earlier to compute correct value.
> > +      */
> > +     subl    $(VEC_SIZE * 3 + 1), %ecx
> > +     addl    %ecx, %eax
> > +# else
> > +     subl    %edx, %edi
> > +     addl    $(VEC_SIZE + 1), %edi
> > +     addl    %edi, %eax
> >  # endif
> > +# ifdef USE_AS_WCSLEN
> > +     shrl    $2, %eax
> > +# endif
> > +     VZEROUPPER_RETURN
> >
> > -     addq    $VEC_SIZE, %rdi
> > +     .p2align 4
> > +L(first_vec_x3):
> > +     tzcntl  %eax, %eax
> > +     /* Safe to use 32 bit instructions as these are only called for
> > +        size = [1, 159].  */
> > +# ifdef USE_AS_STRNLEN
> > +     /* Use ecx which was computed earlier to compute correct value.
> > +      */
> > +     subl    $(VEC_SIZE * 2 + 1), %ecx
> > +     addl    %ecx, %eax
> > +# else
> > +     subl    %edx, %edi
> > +     addl    $(VEC_SIZE * 2 + 1), %edi
> > +     addl    %edi, %eax
> > +# endif
> > +# ifdef USE_AS_WCSLEN
> > +     shrl    $2, %eax
> > +# endif
> > +     VZEROUPPER_RETURN
> >
> > +     .p2align 4
> > +L(first_vec_x4):
> > +     tzcntl  %eax, %eax
> > +     /* Safe to use 32 bit instructions as these are only called for
> > +        size = [1, 159].  */
> >  # ifdef USE_AS_STRNLEN
> > -     subq    $(VEC_SIZE * 4), %rsi
> > -     jbe     L(last_4x_vec_or_less)
> > +     /* Use ecx which was computed earlier to compute correct value.
> > +      */
> > +     subl    $(VEC_SIZE + 1), %ecx
> > +     addl    %ecx, %eax
> > +# else
> > +     subl    %edx, %edi
> > +     addl    $(VEC_SIZE * 3 + 1), %edi
> > +     addl    %edi, %eax
> >  # endif
> > +# ifdef USE_AS_WCSLEN
> > +     shrl    $2, %eax
> > +# endif
> > +     VZEROUPPER_RETURN
> >
> > -L(more_4x_vec):
> > +     .p2align 5
> > +L(aligned_more):
> > +     /* Align data to VEC_SIZE - 1. This is the same number of
> > +        instructions as using andq with -VEC_SIZE but saves 4 bytes of
> > +        code on the x4 check.  */
> > +     orq     $(VEC_SIZE - 1), %rdi
> > +L(cross_page_continue):
> >       /* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
> >          since data is only aligned to VEC_SIZE.  */
> > -     VPCMPEQ (%rdi), %ymm0, %ymm1
> > -     vpmovmskb %ymm1, %eax
> > -     testl   %eax, %eax
> > -     jnz     L(first_vec_x0)
> > -
> > -     VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
> > -     vpmovmskb %ymm1, %eax
> > +# ifdef USE_AS_STRNLEN
> > +     /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE because
> > +        it simplies the logic in last_4x_vec_or_less.  */
> > +     leaq    (VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx
> > +     subq    %rdx, %rcx
> > +# endif
> > +     /* Load first VEC regardless.  */
> > +     VPCMPEQ 1(%rdi), %ymm0, %ymm1
> > +# ifdef USE_AS_STRNLEN
> > +     /* Adjust length. If near end handle specially.  */
> > +     subq    %rcx, %rsi
> > +     jb      L(last_4x_vec_or_less)
> > +# endif
> > +     vpmovmskb       %ymm1, %eax
> >       testl   %eax, %eax
> >       jnz     L(first_vec_x1)
> >
> > -     VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
> > -     vpmovmskb %ymm1, %eax
> > +     VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> > +     vpmovmskb       %ymm1, %eax
> >       testl   %eax, %eax
> >       jnz     L(first_vec_x2)
> >
> > -     VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
> > -     vpmovmskb %ymm1, %eax
> > +     VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
> > +     vpmovmskb       %ymm1, %eax
> >       testl   %eax, %eax
> >       jnz     L(first_vec_x3)
> >
> > -     addq    $(VEC_SIZE * 4), %rdi
> > -
> > -# ifdef USE_AS_STRNLEN
> > -     subq    $(VEC_SIZE * 4), %rsi
> > -     jbe     L(last_4x_vec_or_less)
> > -# endif
> > -
> > -     /* Align data to 4 * VEC_SIZE.  */
> > -     movq    %rdi, %rcx
> > -     andl    $(4 * VEC_SIZE - 1), %ecx
> > -     andq    $-(4 * VEC_SIZE), %rdi
> > +     VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
> > +     vpmovmskb       %ymm1, %eax
> > +     testl   %eax, %eax
> > +     jnz     L(first_vec_x4)
> >
> > +     /* Align data to VEC_SIZE * 4 - 1.  */
> >  # ifdef USE_AS_STRNLEN
> > -     /* Adjust length.  */
> > +     /* Before adjusting length check if at last VEC_SIZE * 4.  */
> > +     cmpq    $(VEC_SIZE * 4 - 1), %rsi
> > +     jbe     L(last_4x_vec_or_less_load)
> > +     incq    %rdi
> > +     movl    %edi, %ecx
> > +     orq     $(VEC_SIZE * 4 - 1), %rdi
> > +     andl    $(VEC_SIZE * 4 - 1), %ecx
> > +     /* Readjust length.  */
> >       addq    %rcx, %rsi
> > +# else
> > +     incq    %rdi
> > +     orq     $(VEC_SIZE * 4 - 1), %rdi
> >  # endif
> > -
> > +     /* Compare 4 * VEC at a time forward.  */
> >       .p2align 4
> >  L(loop_4x_vec):
> > -     /* Compare 4 * VEC at a time forward.  */
> > -     vmovdqa (%rdi), %ymm1
> > -     vmovdqa VEC_SIZE(%rdi), %ymm2
> > -     vmovdqa (VEC_SIZE * 2)(%rdi), %ymm3
> > -     vmovdqa (VEC_SIZE * 3)(%rdi), %ymm4
> > -     VPMINU  %ymm1, %ymm2, %ymm5
> > -     VPMINU  %ymm3, %ymm4, %ymm6
> > -     VPMINU  %ymm5, %ymm6, %ymm5
> > -
> > -     VPCMPEQ %ymm5, %ymm0, %ymm5
> > -     vpmovmskb %ymm5, %eax
> > -     testl   %eax, %eax
> > -     jnz     L(4x_vec_end)
> > -
> > -     addq    $(VEC_SIZE * 4), %rdi
> > -
> > -# ifndef USE_AS_STRNLEN
> > -     jmp     L(loop_4x_vec)
> > -# else
> > +# ifdef USE_AS_STRNLEN
> > +     /* Break if at end of length.  */
> >       subq    $(VEC_SIZE * 4), %rsi
> > -     ja      L(loop_4x_vec)
> > -
> > -L(last_4x_vec_or_less):
> > -     /* Less than 4 * VEC and aligned to VEC_SIZE.  */
> > -     addl    $(VEC_SIZE * 2), %esi
> > -     jle     L(last_2x_vec)
> > +     jb      L(last_4x_vec_or_less_cmpeq)
> > +# endif
> > +     /* Save some code size by microfusing VPMINU with the load. Since
> > +        the matches in ymm2/ymm4 can only be returned if there where no
> > +        matches in ymm1/ymm3 respectively there is no issue with overlap.
> > +      */
> > +     vmovdqa 1(%rdi), %ymm1
> > +     VPMINU  (VEC_SIZE + 1)(%rdi), %ymm1, %ymm2
> > +     vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm3
> > +     VPMINU  (VEC_SIZE * 3 + 1)(%rdi), %ymm3, %ymm4
> > +
> > +     VPMINU  %ymm2, %ymm4, %ymm5
> > +     VPCMPEQ %ymm5, %ymm0, %ymm5
> > +     vpmovmskb       %ymm5, %ecx
> >
> > -     VPCMPEQ (%rdi), %ymm0, %ymm1
> > -     vpmovmskb %ymm1, %eax
> > -     testl   %eax, %eax
> > -     jnz     L(first_vec_x0)
> > +     subq    $-(VEC_SIZE * 4), %rdi
> > +     testl   %ecx, %ecx
> > +     jz      L(loop_4x_vec)
> >
> > -     VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
> > -     vpmovmskb %ymm1, %eax
> > -     testl   %eax, %eax
> > -     jnz     L(first_vec_x1)
> >
> > -     VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
> > -     vpmovmskb %ymm1, %eax
> > +     VPCMPEQ %ymm1, %ymm0, %ymm1
> > +     vpmovmskb       %ymm1, %eax
> > +     subq    %rdx, %rdi
> >       testl   %eax, %eax
> > +     jnz     L(last_vec_return_x0)
> >
> > -     jnz     L(first_vec_x2_check)
> > -     subl    $VEC_SIZE, %esi
> > -     jle     L(max)
> > -
> > -     VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
> > -     vpmovmskb %ymm1, %eax
> > +     VPCMPEQ %ymm2, %ymm0, %ymm2
> > +     vpmovmskb       %ymm2, %eax
> >       testl   %eax, %eax
> > -
> > -     jnz     L(first_vec_x3_check)
> > -     movq    %r8, %rax
> > -#  ifdef USE_AS_WCSLEN
> > +     jnz     L(last_vec_return_x1)
> > +
> > +     /* Combine last 2 VEC.  */
> > +     VPCMPEQ %ymm3, %ymm0, %ymm3
> > +     vpmovmskb       %ymm3, %eax
> > +     /* rcx has combined result from all 4 VEC. It will only be used if
> > +        the first 3 other VEC all did not contain a match.  */
> > +     salq    $32, %rcx
> > +     orq     %rcx, %rax
> > +     tzcntq  %rax, %rax
> > +     subq    $(VEC_SIZE * 2 - 1), %rdi
> > +     addq    %rdi, %rax
> > +# ifdef USE_AS_WCSLEN
> >       shrq    $2, %rax
> > -#  endif
> > +# endif
> >       VZEROUPPER_RETURN
> >
> > +
> > +# ifdef USE_AS_STRNLEN
> >       .p2align 4
> > -L(last_2x_vec):
> > -     addl    $(VEC_SIZE * 2), %esi
> > -     VPCMPEQ (%rdi), %ymm0, %ymm1
> > -     vpmovmskb %ymm1, %eax
> > -     testl   %eax, %eax
> > +L(last_4x_vec_or_less_load):
> > +     /* Depending on entry adjust rdi / prepare first VEC in ymm1.  */
> > +     subq    $-(VEC_SIZE * 4), %rdi
> > +L(last_4x_vec_or_less_cmpeq):
> > +     VPCMPEQ 1(%rdi), %ymm0, %ymm1
> > +L(last_4x_vec_or_less):
> >
> > -     jnz     L(first_vec_x0_check)
> > -     subl    $VEC_SIZE, %esi
> > -     jle     L(max)
> > +     vpmovmskb       %ymm1, %eax
> > +     /* If remaining length > VEC_SIZE * 2. This works if esi is off by
> > +        VEC_SIZE * 4.  */
> > +     testl   $(VEC_SIZE * 2), %esi
> > +     jnz     L(last_4x_vec)
> >
> > -     VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
> > -     vpmovmskb %ymm1, %eax
> > +     /* length may have been negative or positive by an offset of
> > +        VEC_SIZE * 4 depending on where this was called from. This fixes
> > +        that.  */
> > +     andl    $(VEC_SIZE * 4 - 1), %esi
> >       testl   %eax, %eax
> > -     jnz     L(first_vec_x1_check)
> > -     movq    %r8, %rax
> > -#  ifdef USE_AS_WCSLEN
> > -     shrq    $2, %rax
> > -#  endif
> > -     VZEROUPPER_RETURN
> > +     jnz     L(last_vec_x1_check)
> >
> > -     .p2align 4
> > -L(first_vec_x0_check):
> > +     subl    $VEC_SIZE, %esi
> > +     jb      L(max)
> > +
> > +     VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> > +     vpmovmskb       %ymm1, %eax
> >       tzcntl  %eax, %eax
> >       /* Check the end of data.  */
> > -     cmpq    %rax, %rsi
> > -     jbe     L(max)
> > +     cmpl    %eax, %esi
> > +     jb      L(max)
> > +     subq    %rdx, %rdi
> > +     addl    $(VEC_SIZE + 1), %eax
> >       addq    %rdi, %rax
> > -     subq    %rdx, %rax
> >  #  ifdef USE_AS_WCSLEN
> >       shrq    $2, %rax
> >  #  endif
> >       VZEROUPPER_RETURN
> > +# endif
> >
> >       .p2align 4
> > -L(first_vec_x1_check):
> > +L(last_vec_return_x0):
> >       tzcntl  %eax, %eax
> > -     /* Check the end of data.  */
> > -     cmpq    %rax, %rsi
> > -     jbe     L(max)
> > -     addq    $VEC_SIZE, %rax
> > +     subq    $(VEC_SIZE * 4 - 1), %rdi
> >       addq    %rdi, %rax
> > -     subq    %rdx, %rax
> > -#  ifdef USE_AS_WCSLEN
> > +# ifdef USE_AS_WCSLEN
> >       shrq    $2, %rax
> > -#  endif
> > +# endif
> >       VZEROUPPER_RETURN
> >
> >       .p2align 4
> > -L(first_vec_x2_check):
> > +L(last_vec_return_x1):
> >       tzcntl  %eax, %eax
> > -     /* Check the end of data.  */
> > -     cmpq    %rax, %rsi
> > -     jbe     L(max)
> > -     addq    $(VEC_SIZE * 2), %rax
> > +     subq    $(VEC_SIZE * 3 - 1), %rdi
> >       addq    %rdi, %rax
> > -     subq    %rdx, %rax
> > -#  ifdef USE_AS_WCSLEN
> > +# ifdef USE_AS_WCSLEN
> >       shrq    $2, %rax
> > -#  endif
> > +# endif
> >       VZEROUPPER_RETURN
> >
> > +# ifdef USE_AS_STRNLEN
> >       .p2align 4
> > -L(first_vec_x3_check):
> > +L(last_vec_x1_check):
> > +
> >       tzcntl  %eax, %eax
> >       /* Check the end of data.  */
> > -     cmpq    %rax, %rsi
> > -     jbe     L(max)
> > -     addq    $(VEC_SIZE * 3), %rax
> > +     cmpl    %eax, %esi
> > +     jb      L(max)
> > +     subq    %rdx, %rdi
> > +     incl    %eax
> >       addq    %rdi, %rax
> > -     subq    %rdx, %rax
> >  #  ifdef USE_AS_WCSLEN
> >       shrq    $2, %rax
> >  #  endif
> >       VZEROUPPER_RETURN
> >
> > -     .p2align 4
> >  L(max):
> >       movq    %r8, %rax
> > +     VZEROUPPER_RETURN
> > +
> > +     .p2align 4
> > +L(last_4x_vec):
> > +     /* Test first 2x VEC normally.  */
> > +     testl   %eax, %eax
> > +     jnz     L(last_vec_x1)
> > +
> > +     VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> > +     vpmovmskb       %ymm1, %eax
> > +     testl   %eax, %eax
> > +     jnz     L(last_vec_x2)
> > +
> > +     /* Normalize length.  */
> > +     andl    $(VEC_SIZE * 4 - 1), %esi
> > +     VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
> > +     vpmovmskb       %ymm1, %eax
> > +     testl   %eax, %eax
> > +     jnz     L(last_vec_x3)
> > +
> > +     subl    $(VEC_SIZE * 3), %esi
> > +     jb      L(max)
> > +
> > +     VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
> > +     vpmovmskb       %ymm1, %eax
> > +     tzcntl  %eax, %eax
> > +     /* Check the end of data.  */
> > +     cmpl    %eax, %esi
> > +     jb      L(max)
> > +     subq    %rdx, %rdi
> > +     addl    $(VEC_SIZE * 3 + 1), %eax
> > +     addq    %rdi, %rax
> >  #  ifdef USE_AS_WCSLEN
> >       shrq    $2, %rax
> >  #  endif
> >       VZEROUPPER_RETURN
> >
> > -     .p2align 4
> > -L(zero):
> > -     xorl    %eax, %eax
> > -     ret
> > -# endif
> >
> >       .p2align 4
> > -L(first_vec_x0):
> > +L(last_vec_x1):
> > +     /* essentially duplicates of first_vec_x1 but use 64 bit
> > +        instructions.  */
> >       tzcntl  %eax, %eax
> > +     subq    %rdx, %rdi
> > +     incl    %eax
> >       addq    %rdi, %rax
> > -     subq    %rdx, %rax
> > -# ifdef USE_AS_WCSLEN
> > +#  ifdef USE_AS_WCSLEN
> >       shrq    $2, %rax
> > -# endif
> > +#  endif
> >       VZEROUPPER_RETURN
> >
> >       .p2align 4
> > -L(first_vec_x1):
> > +L(last_vec_x2):
> > +     /* essentially duplicates of first_vec_x1 but use 64 bit
> > +        instructions.  */
> >       tzcntl  %eax, %eax
> > -     addq    $VEC_SIZE, %rax
> > +     subq    %rdx, %rdi
> > +     addl    $(VEC_SIZE + 1), %eax
> >       addq    %rdi, %rax
> > -     subq    %rdx, %rax
> > -# ifdef USE_AS_WCSLEN
> > +#  ifdef USE_AS_WCSLEN
> >       shrq    $2, %rax
> > -# endif
> > +#  endif
> >       VZEROUPPER_RETURN
> >
> >       .p2align 4
> > -L(first_vec_x2):
> > +L(last_vec_x3):
> >       tzcntl  %eax, %eax
> > -     addq    $(VEC_SIZE * 2), %rax
> > +     subl    $(VEC_SIZE * 2), %esi
> > +     /* Check the end of data.  */
> > +     cmpl    %eax, %esi
> > +     jb      L(max_end)
> > +     subq    %rdx, %rdi
> > +     addl    $(VEC_SIZE * 2 + 1), %eax
> >       addq    %rdi, %rax
> > -     subq    %rdx, %rax
> > -# ifdef USE_AS_WCSLEN
> > +#  ifdef USE_AS_WCSLEN
> >       shrq    $2, %rax
> > -# endif
> > +#  endif
> > +     VZEROUPPER_RETURN
> > +L(max_end):
> > +     movq    %r8, %rax
> >       VZEROUPPER_RETURN
> > +# endif
> >
> > +     /* Cold case for crossing page with first load.  */
> >       .p2align 4
> > -L(4x_vec_end):
> > -     VPCMPEQ %ymm1, %ymm0, %ymm1
> > -     vpmovmskb %ymm1, %eax
> > -     testl   %eax, %eax
> > -     jnz     L(first_vec_x0)
> > -     VPCMPEQ %ymm2, %ymm0, %ymm2
> > -     vpmovmskb %ymm2, %eax
> > +L(cross_page_boundary):
> > +     /* Align data to VEC_SIZE - 1.  */
> > +     orq     $(VEC_SIZE - 1), %rdi
> > +     VPCMPEQ -(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1
> > +     vpmovmskb       %ymm1, %eax
> > +     /* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
> > +        so no need to manually mod rdx.  */
> > +     sarxl   %edx, %eax, %eax
>
> This is a BMI2 instruction, which is not necessarily available when AVX2
> is available. This causes SIGILL on some CPUs. I have reported this in
> https://sourceware.org/bugzilla/show_bug.cgi?id=29611

This is not a bug on master as:

commit 83c5b368226c34a2f0a5287df40fc290b2b34359
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Mon Apr 19 10:45:07 2021 -0700

    x86-64: Require BMI2 for strchr-avx2.S

is already in tree. The issue is that the AVX2 changes were backported
without H.J.'s changes.
>
> Regards
> Aurelien
>
> --
> Aurelien Jarno                          GPG: 4096R/1DDD8C9B
> aurelien@aurel32.net                 http://www.aurel32.net

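For readers following along outside the glibc tree, the rule at stake can be
sketched in plain C. This is only a minimal illustration of the requirement
being discussed (an AVX2 routine that executes sarx must also be gated on
BMI2), not glibc's actual ifunc machinery; the function names below are
placeholders.

#include <stddef.h>
#include <string.h>

typedef size_t (*strlen_fn) (const char *);

/* Placeholder for the baseline implementation.  */
static size_t
strlen_generic (const char *s)
{
  return strlen (s);
}

/* Placeholder for an AVX2 routine that executes sarx (a BMI2
   instruction); the real implementation would be assembly.  */
static size_t
strlen_avx2_bmi2 (const char *s)
{
  return strlen (s);
}

static strlen_fn
resolve_strlen (void)
{
  __builtin_cpu_init ();
  /* AVX2 alone is not enough; both feature bits must be present before
     the sarx-using variant may be selected.  */
  if (__builtin_cpu_supports ("avx2") && __builtin_cpu_supports ("bmi2"))
    return strlen_avx2_bmi2;
  return strlen_generic;
}

int
main (void)
{
  strlen_fn impl = resolve_strlen ();
  return impl ("glibc") == 5 ? 0 : 1;
}

The in-tree commit enforces this gating in the ifunc selectors, which is
exactly what the affected backports were missing.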

* Re: [PATCH v5 2/2] x86: Optimize strlen-avx2.S
  2022-09-25 14:00     ` Noah Goldstein
@ 2022-09-28 13:54       ` Sunil Pandey
  2022-09-28 14:02         ` Darren Tristano
                           ` (3 more replies)
  0 siblings, 4 replies; 24+ messages in thread
From: Sunil Pandey @ 2022-09-28 13:54 UTC (permalink / raw)
  To: Noah Goldstein, Libc-stable Mailing List, Hongjiu Lu; +Cc: GNU C Library

[-- Attachment #1: Type: text/plain, Size: 26445 bytes --]

The attached patch fixes BZ #29611.

I would like to backport it to 2.32, 2.31, 2.30, 2.29, and 2.28. Let me know
if there is any objection.
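
As a quick, purely illustrative check (not part of the attached patch): a
host is in the configuration that triggers BZ 29611 when it reports AVX2 but
not BMI2, which GCC's builtin feature checks can show:

#include <stdio.h>

int
main (void)
{
  __builtin_cpu_init ();
  int avx2 = __builtin_cpu_supports ("avx2") != 0;
  int bmi2 = __builtin_cpu_supports ("bmi2") != 0;
  printf ("avx2=%d bmi2=%d\n", avx2, bmi2);
  if (avx2 && !bmi2)
    /* On such a host an unpatched AVX2 strlen would execute sarx.  */
    puts ("affected configuration: AVX2 without BMI2");
  return 0;
}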


On Sun, Sep 25, 2022 at 7:00 AM Noah Goldstein via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> On Sun, Sep 25, 2022 at 1:19 AM Aurelien Jarno <aurelien@aurel32.net> wrote:
> >
> > On 2021-04-19 19:36, Noah Goldstein via Libc-alpha wrote:
> > > No bug. This commit optimizes strlen-avx2.S. The optimizations are
> > > mostly small things but they add up to roughly 10-30% performance
> > > improvement for strlen. The results for strnlen are bit more
> > > ambiguous. test-strlen, test-strnlen, test-wcslen, and test-wcsnlen
> > > are all passing.
> > >
> > > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
> > > ---
> > >  sysdeps/x86_64/multiarch/ifunc-impl-list.c |  16 +-
> > >  sysdeps/x86_64/multiarch/strlen-avx2.S     | 532 +++++++++++++--------
> > >  2 files changed, 334 insertions(+), 214 deletions(-)
> > >
> > > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > index c377cab629..651b32908e 100644
> > > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > @@ -293,10 +293,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > >    /* Support sysdeps/x86_64/multiarch/strlen.c.  */
> > >    IFUNC_IMPL (i, name, strlen,
> > >             IFUNC_IMPL_ADD (array, i, strlen,
> > > -                           CPU_FEATURE_USABLE (AVX2),
> > > +                           (CPU_FEATURE_USABLE (AVX2)
> > > +                            && CPU_FEATURE_USABLE (BMI2)),
> > >                             __strlen_avx2)
> > >             IFUNC_IMPL_ADD (array, i, strlen,
> > >                             (CPU_FEATURE_USABLE (AVX2)
> > > +                            && CPU_FEATURE_USABLE (BMI2)
> > >                              && CPU_FEATURE_USABLE (RTM)),
> > >                             __strlen_avx2_rtm)
> > >             IFUNC_IMPL_ADD (array, i, strlen,
> > > @@ -309,10 +311,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > >    /* Support sysdeps/x86_64/multiarch/strnlen.c.  */
> > >    IFUNC_IMPL (i, name, strnlen,
> > >             IFUNC_IMPL_ADD (array, i, strnlen,
> > > -                           CPU_FEATURE_USABLE (AVX2),
> > > +                           (CPU_FEATURE_USABLE (AVX2)
> > > +                            && CPU_FEATURE_USABLE (BMI2)),
> > >                             __strnlen_avx2)
> > >             IFUNC_IMPL_ADD (array, i, strnlen,
> > >                             (CPU_FEATURE_USABLE (AVX2)
> > > +                            && CPU_FEATURE_USABLE (BMI2)
> > >                              && CPU_FEATURE_USABLE (RTM)),
> > >                             __strnlen_avx2_rtm)
> > >             IFUNC_IMPL_ADD (array, i, strnlen,
> > > @@ -654,10 +658,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > >    /* Support sysdeps/x86_64/multiarch/wcslen.c.  */
> > >    IFUNC_IMPL (i, name, wcslen,
> > >             IFUNC_IMPL_ADD (array, i, wcslen,
> > > -                           CPU_FEATURE_USABLE (AVX2),
> > > +                           (CPU_FEATURE_USABLE (AVX2)
> > > +                            && CPU_FEATURE_USABLE (BMI2)),
> > >                             __wcslen_avx2)
> > >             IFUNC_IMPL_ADD (array, i, wcslen,
> > >                             (CPU_FEATURE_USABLE (AVX2)
> > > +                            && CPU_FEATURE_USABLE (BMI2)
> > >                              && CPU_FEATURE_USABLE (RTM)),
> > >                             __wcslen_avx2_rtm)
> > >             IFUNC_IMPL_ADD (array, i, wcslen,
> > > @@ -670,10 +676,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > >    /* Support sysdeps/x86_64/multiarch/wcsnlen.c.  */
> > >    IFUNC_IMPL (i, name, wcsnlen,
> > >             IFUNC_IMPL_ADD (array, i, wcsnlen,
> > > -                           CPU_FEATURE_USABLE (AVX2),
> > > +                           (CPU_FEATURE_USABLE (AVX2)
> > > +                            && CPU_FEATURE_USABLE (BMI2)),
> > >                             __wcsnlen_avx2)
> > >             IFUNC_IMPL_ADD (array, i, wcsnlen,
> > >                             (CPU_FEATURE_USABLE (AVX2)
> > > +                            && CPU_FEATURE_USABLE (BMI2)
> > >                              && CPU_FEATURE_USABLE (RTM)),
> > >                             __wcsnlen_avx2_rtm)
> > >             IFUNC_IMPL_ADD (array, i, wcsnlen,
> > > diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S
> > > index 1caae9e6bc..bd2e6ee44a 100644
> > > --- a/sysdeps/x86_64/multiarch/strlen-avx2.S
> > > +++ b/sysdeps/x86_64/multiarch/strlen-avx2.S
> > > @@ -27,9 +27,11 @@
> > >  # ifdef USE_AS_WCSLEN
> > >  #  define VPCMPEQ    vpcmpeqd
> > >  #  define VPMINU     vpminud
> > > +#  define CHAR_SIZE  4
> > >  # else
> > >  #  define VPCMPEQ    vpcmpeqb
> > >  #  define VPMINU     vpminub
> > > +#  define CHAR_SIZE  1
> > >  # endif
> > >
> > >  # ifndef VZEROUPPER
> > > @@ -41,349 +43,459 @@
> > >  # endif
> > >
> > >  # define VEC_SIZE 32
> > > +# define PAGE_SIZE 4096
> > >
> > >       .section SECTION(.text),"ax",@progbits
> > >  ENTRY (STRLEN)
> > >  # ifdef USE_AS_STRNLEN
> > > -     /* Check for zero length.  */
> > > +     /* Check zero length.  */
> > >       test    %RSI_LP, %RSI_LP
> > >       jz      L(zero)
> > > +     /* Store max len in R8_LP before adjusting if using WCSLEN.  */
> > > +     mov     %RSI_LP, %R8_LP
> > >  #  ifdef USE_AS_WCSLEN
> > >       shl     $2, %RSI_LP
> > >  #  elif defined __ILP32__
> > >       /* Clear the upper 32 bits.  */
> > >       movl    %esi, %esi
> > >  #  endif
> > > -     mov     %RSI_LP, %R8_LP
> > >  # endif
> > > -     movl    %edi, %ecx
> > > +     movl    %edi, %eax
> > >       movq    %rdi, %rdx
> > >       vpxor   %xmm0, %xmm0, %xmm0
> > > -
> > > +     /* Clear high bits from edi. Only keeping bits relevant to page
> > > +        cross check.  */
> > > +     andl    $(PAGE_SIZE - 1), %eax
> > >       /* Check if we may cross page boundary with one vector load.  */
> > > -     andl    $(2 * VEC_SIZE - 1), %ecx
> > > -     cmpl    $VEC_SIZE, %ecx
> > > -     ja      L(cros_page_boundary)
> > > +     cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
> > > +     ja      L(cross_page_boundary)
> > >
> > >       /* Check the first VEC_SIZE bytes.  */
> > > -     VPCMPEQ (%rdi), %ymm0, %ymm1
> > > -     vpmovmskb %ymm1, %eax
> > > -     testl   %eax, %eax
> > > -
> > > +     VPCMPEQ (%rdi), %ymm0, %ymm1
> > > +     vpmovmskb       %ymm1, %eax
> > >  # ifdef USE_AS_STRNLEN
> > > -     jnz     L(first_vec_x0_check)
> > > -     /* Adjust length and check the end of data.  */
> > > -     subq    $VEC_SIZE, %rsi
> > > -     jbe     L(max)
> > > -# else
> > > -     jnz     L(first_vec_x0)
> > > +     /* If length < VEC_SIZE handle special.  */
> > > +     cmpq    $VEC_SIZE, %rsi
> > > +     jbe     L(first_vec_x0)
> > >  # endif
> > > -
> > > -     /* Align data for aligned loads in the loop.  */
> > > -     addq    $VEC_SIZE, %rdi
> > > -     andl    $(VEC_SIZE - 1), %ecx
> > > -     andq    $-VEC_SIZE, %rdi
> > > +     /* If empty continue to aligned_more. Otherwise return bit
> > > +        position of first match.  */
> > > +     testl   %eax, %eax
> > > +     jz      L(aligned_more)
> > > +     tzcntl  %eax, %eax
> > > +# ifdef USE_AS_WCSLEN
> > > +     shrl    $2, %eax
> > > +# endif
> > > +     VZEROUPPER_RETURN
> > >
> > >  # ifdef USE_AS_STRNLEN
> > > -     /* Adjust length.  */
> > > -     addq    %rcx, %rsi
> > > +L(zero):
> > > +     xorl    %eax, %eax
> > > +     ret
> > >
> > > -     subq    $(VEC_SIZE * 4), %rsi
> > > -     jbe     L(last_4x_vec_or_less)
> > > +     .p2align 4
> > > +L(first_vec_x0):
> > > +     /* Set bit for max len so that tzcnt will return min of max len
> > > +        and position of first match.  */
> > > +     btsq    %rsi, %rax
> > > +     tzcntl  %eax, %eax
> > > +#  ifdef USE_AS_WCSLEN
> > > +     shrl    $2, %eax
> > > +#  endif
> > > +     VZEROUPPER_RETURN
> > >  # endif
> > > -     jmp     L(more_4x_vec)
> > >
> > >       .p2align 4
> > > -L(cros_page_boundary):
> > > -     andl    $(VEC_SIZE - 1), %ecx
> > > -     andq    $-VEC_SIZE, %rdi
> > > -     VPCMPEQ (%rdi), %ymm0, %ymm1
> > > -     vpmovmskb %ymm1, %eax
> > > -     /* Remove the leading bytes.  */
> > > -     sarl    %cl, %eax
> > > -     testl   %eax, %eax
> > > -     jz      L(aligned_more)
> > > +L(first_vec_x1):
> > >       tzcntl  %eax, %eax
> > > +     /* Safe to use 32 bit instructions as these are only called for
> > > +        size = [1, 159].  */
> > >  # ifdef USE_AS_STRNLEN
> > > -     /* Check the end of data.  */
> > > -     cmpq    %rax, %rsi
> > > -     jbe     L(max)
> > > +     /* Use ecx which was computed earlier to compute correct value.
> > > +      */
> > > +     subl    $(VEC_SIZE * 4 + 1), %ecx
> > > +     addl    %ecx, %eax
> > > +# else
> > > +     subl    %edx, %edi
> > > +     incl    %edi
> > > +     addl    %edi, %eax
> > >  # endif
> > > -     addq    %rdi, %rax
> > > -     addq    %rcx, %rax
> > > -     subq    %rdx, %rax
> > >  # ifdef USE_AS_WCSLEN
> > > -     shrq    $2, %rax
> > > +     shrl    $2, %eax
> > >  # endif
> > > -L(return_vzeroupper):
> > > -     ZERO_UPPER_VEC_REGISTERS_RETURN
> > > +     VZEROUPPER_RETURN
> > >
> > >       .p2align 4
> > > -L(aligned_more):
> > > +L(first_vec_x2):
> > > +     tzcntl  %eax, %eax
> > > +     /* Safe to use 32 bit instructions as these are only called for
> > > +        size = [1, 159].  */
> > >  # ifdef USE_AS_STRNLEN
> > > -        /* "rcx" is less than VEC_SIZE.  Calculate "rdx + rcx - VEC_SIZE"
> > > -         with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE"
> > > -         to void possible addition overflow.  */
> > > -     negq    %rcx
> > > -     addq    $VEC_SIZE, %rcx
> > > -
> > > -     /* Check the end of data.  */
> > > -     subq    %rcx, %rsi
> > > -     jbe     L(max)
> > > +     /* Use ecx which was computed earlier to compute correct value.
> > > +      */
> > > +     subl    $(VEC_SIZE * 3 + 1), %ecx
> > > +     addl    %ecx, %eax
> > > +# else
> > > +     subl    %edx, %edi
> > > +     addl    $(VEC_SIZE + 1), %edi
> > > +     addl    %edi, %eax
> > >  # endif
> > > +# ifdef USE_AS_WCSLEN
> > > +     shrl    $2, %eax
> > > +# endif
> > > +     VZEROUPPER_RETURN
> > >
> > > -     addq    $VEC_SIZE, %rdi
> > > +     .p2align 4
> > > +L(first_vec_x3):
> > > +     tzcntl  %eax, %eax
> > > +     /* Safe to use 32 bit instructions as these are only called for
> > > +        size = [1, 159].  */
> > > +# ifdef USE_AS_STRNLEN
> > > +     /* Use ecx which was computed earlier to compute correct value.
> > > +      */
> > > +     subl    $(VEC_SIZE * 2 + 1), %ecx
> > > +     addl    %ecx, %eax
> > > +# else
> > > +     subl    %edx, %edi
> > > +     addl    $(VEC_SIZE * 2 + 1), %edi
> > > +     addl    %edi, %eax
> > > +# endif
> > > +# ifdef USE_AS_WCSLEN
> > > +     shrl    $2, %eax
> > > +# endif
> > > +     VZEROUPPER_RETURN
> > >
> > > +     .p2align 4
> > > +L(first_vec_x4):
> > > +     tzcntl  %eax, %eax
> > > +     /* Safe to use 32 bit instructions as these are only called for
> > > +        size = [1, 159].  */
> > >  # ifdef USE_AS_STRNLEN
> > > -     subq    $(VEC_SIZE * 4), %rsi
> > > -     jbe     L(last_4x_vec_or_less)
> > > +     /* Use ecx which was computed earlier to compute correct value.
> > > +      */
> > > +     subl    $(VEC_SIZE + 1), %ecx
> > > +     addl    %ecx, %eax
> > > +# else
> > > +     subl    %edx, %edi
> > > +     addl    $(VEC_SIZE * 3 + 1), %edi
> > > +     addl    %edi, %eax
> > >  # endif
> > > +# ifdef USE_AS_WCSLEN
> > > +     shrl    $2, %eax
> > > +# endif
> > > +     VZEROUPPER_RETURN
> > >
> > > -L(more_4x_vec):
> > > +     .p2align 5
> > > +L(aligned_more):
> > > +     /* Align data to VEC_SIZE - 1. This is the same number of
> > > +        instructions as using andq with -VEC_SIZE but saves 4 bytes of
> > > +        code on the x4 check.  */
> > > +     orq     $(VEC_SIZE - 1), %rdi
> > > +L(cross_page_continue):
> > >       /* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
> > >          since data is only aligned to VEC_SIZE.  */
> > > -     VPCMPEQ (%rdi), %ymm0, %ymm1
> > > -     vpmovmskb %ymm1, %eax
> > > -     testl   %eax, %eax
> > > -     jnz     L(first_vec_x0)
> > > -
> > > -     VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
> > > -     vpmovmskb %ymm1, %eax
> > > +# ifdef USE_AS_STRNLEN
> > > +     /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE because
> > > +        it simplies the logic in last_4x_vec_or_less.  */
> > > +     leaq    (VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx
> > > +     subq    %rdx, %rcx
> > > +# endif
> > > +     /* Load first VEC regardless.  */
> > > +     VPCMPEQ 1(%rdi), %ymm0, %ymm1
> > > +# ifdef USE_AS_STRNLEN
> > > +     /* Adjust length. If near end handle specially.  */
> > > +     subq    %rcx, %rsi
> > > +     jb      L(last_4x_vec_or_less)
> > > +# endif
> > > +     vpmovmskb       %ymm1, %eax
> > >       testl   %eax, %eax
> > >       jnz     L(first_vec_x1)
> > >
> > > -     VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
> > > -     vpmovmskb %ymm1, %eax
> > > +     VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> > > +     vpmovmskb       %ymm1, %eax
> > >       testl   %eax, %eax
> > >       jnz     L(first_vec_x2)
> > >
> > > -     VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
> > > -     vpmovmskb %ymm1, %eax
> > > +     VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
> > > +     vpmovmskb       %ymm1, %eax
> > >       testl   %eax, %eax
> > >       jnz     L(first_vec_x3)
> > >
> > > -     addq    $(VEC_SIZE * 4), %rdi
> > > -
> > > -# ifdef USE_AS_STRNLEN
> > > -     subq    $(VEC_SIZE * 4), %rsi
> > > -     jbe     L(last_4x_vec_or_less)
> > > -# endif
> > > -
> > > -     /* Align data to 4 * VEC_SIZE.  */
> > > -     movq    %rdi, %rcx
> > > -     andl    $(4 * VEC_SIZE - 1), %ecx
> > > -     andq    $-(4 * VEC_SIZE), %rdi
> > > +     VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
> > > +     vpmovmskb       %ymm1, %eax
> > > +     testl   %eax, %eax
> > > +     jnz     L(first_vec_x4)
> > >
> > > +     /* Align data to VEC_SIZE * 4 - 1.  */
> > >  # ifdef USE_AS_STRNLEN
> > > -     /* Adjust length.  */
> > > +     /* Before adjusting length check if at last VEC_SIZE * 4.  */
> > > +     cmpq    $(VEC_SIZE * 4 - 1), %rsi
> > > +     jbe     L(last_4x_vec_or_less_load)
> > > +     incq    %rdi
> > > +     movl    %edi, %ecx
> > > +     orq     $(VEC_SIZE * 4 - 1), %rdi
> > > +     andl    $(VEC_SIZE * 4 - 1), %ecx
> > > +     /* Readjust length.  */
> > >       addq    %rcx, %rsi
> > > +# else
> > > +     incq    %rdi
> > > +     orq     $(VEC_SIZE * 4 - 1), %rdi
> > >  # endif
> > > -
> > > +     /* Compare 4 * VEC at a time forward.  */
> > >       .p2align 4
> > >  L(loop_4x_vec):
> > > -     /* Compare 4 * VEC at a time forward.  */
> > > -     vmovdqa (%rdi), %ymm1
> > > -     vmovdqa VEC_SIZE(%rdi), %ymm2
> > > -     vmovdqa (VEC_SIZE * 2)(%rdi), %ymm3
> > > -     vmovdqa (VEC_SIZE * 3)(%rdi), %ymm4
> > > -     VPMINU  %ymm1, %ymm2, %ymm5
> > > -     VPMINU  %ymm3, %ymm4, %ymm6
> > > -     VPMINU  %ymm5, %ymm6, %ymm5
> > > -
> > > -     VPCMPEQ %ymm5, %ymm0, %ymm5
> > > -     vpmovmskb %ymm5, %eax
> > > -     testl   %eax, %eax
> > > -     jnz     L(4x_vec_end)
> > > -
> > > -     addq    $(VEC_SIZE * 4), %rdi
> > > -
> > > -# ifndef USE_AS_STRNLEN
> > > -     jmp     L(loop_4x_vec)
> > > -# else
> > > +# ifdef USE_AS_STRNLEN
> > > +     /* Break if at end of length.  */
> > >       subq    $(VEC_SIZE * 4), %rsi
> > > -     ja      L(loop_4x_vec)
> > > -
> > > -L(last_4x_vec_or_less):
> > > -     /* Less than 4 * VEC and aligned to VEC_SIZE.  */
> > > -     addl    $(VEC_SIZE * 2), %esi
> > > -     jle     L(last_2x_vec)
> > > +     jb      L(last_4x_vec_or_less_cmpeq)
> > > +# endif
> > > +     /* Save some code size by microfusing VPMINU with the load. Since
> > > +        the matches in ymm2/ymm4 can only be returned if there where no
> > > +        matches in ymm1/ymm3 respectively there is no issue with overlap.
> > > +      */
> > > +     vmovdqa 1(%rdi), %ymm1
> > > +     VPMINU  (VEC_SIZE + 1)(%rdi), %ymm1, %ymm2
> > > +     vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm3
> > > +     VPMINU  (VEC_SIZE * 3 + 1)(%rdi), %ymm3, %ymm4
> > > +
> > > +     VPMINU  %ymm2, %ymm4, %ymm5
> > > +     VPCMPEQ %ymm5, %ymm0, %ymm5
> > > +     vpmovmskb       %ymm5, %ecx
> > >
> > > -     VPCMPEQ (%rdi), %ymm0, %ymm1
> > > -     vpmovmskb %ymm1, %eax
> > > -     testl   %eax, %eax
> > > -     jnz     L(first_vec_x0)
> > > +     subq    $-(VEC_SIZE * 4), %rdi
> > > +     testl   %ecx, %ecx
> > > +     jz      L(loop_4x_vec)
> > >
> > > -     VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
> > > -     vpmovmskb %ymm1, %eax
> > > -     testl   %eax, %eax
> > > -     jnz     L(first_vec_x1)
> > >
> > > -     VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
> > > -     vpmovmskb %ymm1, %eax
> > > +     VPCMPEQ %ymm1, %ymm0, %ymm1
> > > +     vpmovmskb       %ymm1, %eax
> > > +     subq    %rdx, %rdi
> > >       testl   %eax, %eax
> > > +     jnz     L(last_vec_return_x0)
> > >
> > > -     jnz     L(first_vec_x2_check)
> > > -     subl    $VEC_SIZE, %esi
> > > -     jle     L(max)
> > > -
> > > -     VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
> > > -     vpmovmskb %ymm1, %eax
> > > +     VPCMPEQ %ymm2, %ymm0, %ymm2
> > > +     vpmovmskb       %ymm2, %eax
> > >       testl   %eax, %eax
> > > -
> > > -     jnz     L(first_vec_x3_check)
> > > -     movq    %r8, %rax
> > > -#  ifdef USE_AS_WCSLEN
> > > +     jnz     L(last_vec_return_x1)
> > > +
> > > +     /* Combine last 2 VEC.  */
> > > +     VPCMPEQ %ymm3, %ymm0, %ymm3
> > > +     vpmovmskb       %ymm3, %eax
> > > +     /* rcx has combined result from all 4 VEC. It will only be used if
> > > +        the first 3 other VEC all did not contain a match.  */
> > > +     salq    $32, %rcx
> > > +     orq     %rcx, %rax
> > > +     tzcntq  %rax, %rax
> > > +     subq    $(VEC_SIZE * 2 - 1), %rdi
> > > +     addq    %rdi, %rax
> > > +# ifdef USE_AS_WCSLEN
> > >       shrq    $2, %rax
> > > -#  endif
> > > +# endif
> > >       VZEROUPPER_RETURN
> > >
> > > +
> > > +# ifdef USE_AS_STRNLEN
> > >       .p2align 4
> > > -L(last_2x_vec):
> > > -     addl    $(VEC_SIZE * 2), %esi
> > > -     VPCMPEQ (%rdi), %ymm0, %ymm1
> > > -     vpmovmskb %ymm1, %eax
> > > -     testl   %eax, %eax
> > > +L(last_4x_vec_or_less_load):
> > > +     /* Depending on entry adjust rdi / prepare first VEC in ymm1.  */
> > > +     subq    $-(VEC_SIZE * 4), %rdi
> > > +L(last_4x_vec_or_less_cmpeq):
> > > +     VPCMPEQ 1(%rdi), %ymm0, %ymm1
> > > +L(last_4x_vec_or_less):
> > >
> > > -     jnz     L(first_vec_x0_check)
> > > -     subl    $VEC_SIZE, %esi
> > > -     jle     L(max)
> > > +     vpmovmskb       %ymm1, %eax
> > > +     /* If remaining length > VEC_SIZE * 2. This works if esi is off by
> > > +        VEC_SIZE * 4.  */
> > > +     testl   $(VEC_SIZE * 2), %esi
> > > +     jnz     L(last_4x_vec)
> > >
> > > -     VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
> > > -     vpmovmskb %ymm1, %eax
> > > +     /* length may have been negative or positive by an offset of
> > > +        VEC_SIZE * 4 depending on where this was called from. This fixes
> > > +        that.  */
> > > +     andl    $(VEC_SIZE * 4 - 1), %esi
> > >       testl   %eax, %eax
> > > -     jnz     L(first_vec_x1_check)
> > > -     movq    %r8, %rax
> > > -#  ifdef USE_AS_WCSLEN
> > > -     shrq    $2, %rax
> > > -#  endif
> > > -     VZEROUPPER_RETURN
> > > +     jnz     L(last_vec_x1_check)
> > >
> > > -     .p2align 4
> > > -L(first_vec_x0_check):
> > > +     subl    $VEC_SIZE, %esi
> > > +     jb      L(max)
> > > +
> > > +     VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> > > +     vpmovmskb       %ymm1, %eax
> > >       tzcntl  %eax, %eax
> > >       /* Check the end of data.  */
> > > -     cmpq    %rax, %rsi
> > > -     jbe     L(max)
> > > +     cmpl    %eax, %esi
> > > +     jb      L(max)
> > > +     subq    %rdx, %rdi
> > > +     addl    $(VEC_SIZE + 1), %eax
> > >       addq    %rdi, %rax
> > > -     subq    %rdx, %rax
> > >  #  ifdef USE_AS_WCSLEN
> > >       shrq    $2, %rax
> > >  #  endif
> > >       VZEROUPPER_RETURN
> > > +# endif
> > >
> > >       .p2align 4
> > > -L(first_vec_x1_check):
> > > +L(last_vec_return_x0):
> > >       tzcntl  %eax, %eax
> > > -     /* Check the end of data.  */
> > > -     cmpq    %rax, %rsi
> > > -     jbe     L(max)
> > > -     addq    $VEC_SIZE, %rax
> > > +     subq    $(VEC_SIZE * 4 - 1), %rdi
> > >       addq    %rdi, %rax
> > > -     subq    %rdx, %rax
> > > -#  ifdef USE_AS_WCSLEN
> > > +# ifdef USE_AS_WCSLEN
> > >       shrq    $2, %rax
> > > -#  endif
> > > +# endif
> > >       VZEROUPPER_RETURN
> > >
> > >       .p2align 4
> > > -L(first_vec_x2_check):
> > > +L(last_vec_return_x1):
> > >       tzcntl  %eax, %eax
> > > -     /* Check the end of data.  */
> > > -     cmpq    %rax, %rsi
> > > -     jbe     L(max)
> > > -     addq    $(VEC_SIZE * 2), %rax
> > > +     subq    $(VEC_SIZE * 3 - 1), %rdi
> > >       addq    %rdi, %rax
> > > -     subq    %rdx, %rax
> > > -#  ifdef USE_AS_WCSLEN
> > > +# ifdef USE_AS_WCSLEN
> > >       shrq    $2, %rax
> > > -#  endif
> > > +# endif
> > >       VZEROUPPER_RETURN
> > >
> > > +# ifdef USE_AS_STRNLEN
> > >       .p2align 4
> > > -L(first_vec_x3_check):
> > > +L(last_vec_x1_check):
> > > +
> > >       tzcntl  %eax, %eax
> > >       /* Check the end of data.  */
> > > -     cmpq    %rax, %rsi
> > > -     jbe     L(max)
> > > -     addq    $(VEC_SIZE * 3), %rax
> > > +     cmpl    %eax, %esi
> > > +     jb      L(max)
> > > +     subq    %rdx, %rdi
> > > +     incl    %eax
> > >       addq    %rdi, %rax
> > > -     subq    %rdx, %rax
> > >  #  ifdef USE_AS_WCSLEN
> > >       shrq    $2, %rax
> > >  #  endif
> > >       VZEROUPPER_RETURN
> > >
> > > -     .p2align 4
> > >  L(max):
> > >       movq    %r8, %rax
> > > +     VZEROUPPER_RETURN
> > > +
> > > +     .p2align 4
> > > +L(last_4x_vec):
> > > +     /* Test first 2x VEC normally.  */
> > > +     testl   %eax, %eax
> > > +     jnz     L(last_vec_x1)
> > > +
> > > +     VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> > > +     vpmovmskb       %ymm1, %eax
> > > +     testl   %eax, %eax
> > > +     jnz     L(last_vec_x2)
> > > +
> > > +     /* Normalize length.  */
> > > +     andl    $(VEC_SIZE * 4 - 1), %esi
> > > +     VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
> > > +     vpmovmskb       %ymm1, %eax
> > > +     testl   %eax, %eax
> > > +     jnz     L(last_vec_x3)
> > > +
> > > +     subl    $(VEC_SIZE * 3), %esi
> > > +     jb      L(max)
> > > +
> > > +     VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
> > > +     vpmovmskb       %ymm1, %eax
> > > +     tzcntl  %eax, %eax
> > > +     /* Check the end of data.  */
> > > +     cmpl    %eax, %esi
> > > +     jb      L(max)
> > > +     subq    %rdx, %rdi
> > > +     addl    $(VEC_SIZE * 3 + 1), %eax
> > > +     addq    %rdi, %rax
> > >  #  ifdef USE_AS_WCSLEN
> > >       shrq    $2, %rax
> > >  #  endif
> > >       VZEROUPPER_RETURN
> > >
> > > -     .p2align 4
> > > -L(zero):
> > > -     xorl    %eax, %eax
> > > -     ret
> > > -# endif
> > >
> > >       .p2align 4
> > > -L(first_vec_x0):
> > > +L(last_vec_x1):
> > > +     /* essentially duplicates of first_vec_x1 but use 64 bit
> > > +        instructions.  */
> > >       tzcntl  %eax, %eax
> > > +     subq    %rdx, %rdi
> > > +     incl    %eax
> > >       addq    %rdi, %rax
> > > -     subq    %rdx, %rax
> > > -# ifdef USE_AS_WCSLEN
> > > +#  ifdef USE_AS_WCSLEN
> > >       shrq    $2, %rax
> > > -# endif
> > > +#  endif
> > >       VZEROUPPER_RETURN
> > >
> > >       .p2align 4
> > > -L(first_vec_x1):
> > > +L(last_vec_x2):
> > > +     /* essentially duplicates of first_vec_x1 but use 64 bit
> > > +        instructions.  */
> > >       tzcntl  %eax, %eax
> > > -     addq    $VEC_SIZE, %rax
> > > +     subq    %rdx, %rdi
> > > +     addl    $(VEC_SIZE + 1), %eax
> > >       addq    %rdi, %rax
> > > -     subq    %rdx, %rax
> > > -# ifdef USE_AS_WCSLEN
> > > +#  ifdef USE_AS_WCSLEN
> > >       shrq    $2, %rax
> > > -# endif
> > > +#  endif
> > >       VZEROUPPER_RETURN
> > >
> > >       .p2align 4
> > > -L(first_vec_x2):
> > > +L(last_vec_x3):
> > >       tzcntl  %eax, %eax
> > > -     addq    $(VEC_SIZE * 2), %rax
> > > +     subl    $(VEC_SIZE * 2), %esi
> > > +     /* Check the end of data.  */
> > > +     cmpl    %eax, %esi
> > > +     jb      L(max_end)
> > > +     subq    %rdx, %rdi
> > > +     addl    $(VEC_SIZE * 2 + 1), %eax
> > >       addq    %rdi, %rax
> > > -     subq    %rdx, %rax
> > > -# ifdef USE_AS_WCSLEN
> > > +#  ifdef USE_AS_WCSLEN
> > >       shrq    $2, %rax
> > > -# endif
> > > +#  endif
> > > +     VZEROUPPER_RETURN
> > > +L(max_end):
> > > +     movq    %r8, %rax
> > >       VZEROUPPER_RETURN
> > > +# endif
> > >
> > > +     /* Cold case for crossing page with first load.  */
> > >       .p2align 4
> > > -L(4x_vec_end):
> > > -     VPCMPEQ %ymm1, %ymm0, %ymm1
> > > -     vpmovmskb %ymm1, %eax
> > > -     testl   %eax, %eax
> > > -     jnz     L(first_vec_x0)
> > > -     VPCMPEQ %ymm2, %ymm0, %ymm2
> > > -     vpmovmskb %ymm2, %eax
> > > +L(cross_page_boundary):
> > > +     /* Align data to VEC_SIZE - 1.  */
> > > +     orq     $(VEC_SIZE - 1), %rdi
> > > +     VPCMPEQ -(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1
> > > +     vpmovmskb       %ymm1, %eax
> > > +     /* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
> > > +        so no need to manually mod rdx.  */
> > > +     sarxl   %edx, %eax, %eax
> >
> > This is a BMI2 instruction, which is not necessarily available when AVX2
> > is available. This causes SIGILL on some CPUs. I have reported this in
> > https://sourceware.org/bugzilla/show_bug.cgi?id=29611
>
> This is not a bug on master as:
>
> commit 83c5b368226c34a2f0a5287df40fc290b2b34359
> Author: H.J. Lu <hjl.tools@gmail.com>
> Date:   Mon Apr 19 10:45:07 2021 -0700
>
>     x86-64: Require BMI2 for strchr-avx2.S
>
> is already in tree. The issue is that the AVX2 changes were backported
> without H.J.'s changes.
> >
> > Regards
> > Aurelien
> >
> > --
> > Aurelien Jarno                          GPG: 4096R/1DDD8C9B
> > aurelien@aurel32.net                 http://www.aurel32.net

[-- Attachment #2: 2.31-2.30-2.29-2.28.patch --]
[-- Type: application/octet-stream, Size: 3697 bytes --]

From 86e1d88e1a3c126597ef39165275ada7564cfce9 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Mon, 19 Apr 2021 10:45:07 -0700
Subject: [PATCH] x86-64: Require BMI2 for strchr-avx2.S

Since strchr-avx2.S updated by

commit 1f745ecc2109890886b161d4791e1406fdfc29b8
Author: noah <goldstein.w.n@gmail.com>
Date:   Wed Feb 3 00:38:59 2021 -0500

    x86-64: Refactor and improve performance of strchr-avx2.S

uses sarx:

c4 e2 72 f7 c0       	sarx   %ecx,%eax,%eax

for strchr-avx2 family functions, require BMI2 in ifunc-impl-list.c and
ifunc-avx2.h.

(cherry picked from commit 83c5b368226c34a2f0a5287df40fc290b2b34359)
---
 sysdeps/x86_64/multiarch/ifunc-avx2.h      |  4 ++--
 sysdeps/x86_64/multiarch/ifunc-impl-list.c | 12 +++++++++---
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/ifunc-avx2.h b/sysdeps/x86_64/multiarch/ifunc-avx2.h
index 74189b6aa5..925e5b61eb 100644
--- a/sysdeps/x86_64/multiarch/ifunc-avx2.h
+++ b/sysdeps/x86_64/multiarch/ifunc-avx2.h
@@ -30,11 +30,11 @@ IFUNC_SELECTOR (void)
   const struct cpu_features* cpu_features = __get_cpu_features ();
 
   if (CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable)
+      && CPU_FEATURES_CPU_P (cpu_features, BMI2)
       && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
     {
       if (CPU_FEATURES_ARCH_P (cpu_features, AVX512VL_Usable)
-	  && CPU_FEATURES_ARCH_P (cpu_features, AVX512BW_Usable)
-	  && CPU_FEATURES_CPU_P (cpu_features, BMI2))
+	  && CPU_FEATURES_ARCH_P (cpu_features, AVX512BW_Usable))
 	return OPTIMIZE (evex);
 
       if (CPU_FEATURES_CPU_P (cpu_features, RTM))
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 56b05ee741..f76326e0b2 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -400,10 +400,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/x86_64/multiarch/strchr.c.  */
   IFUNC_IMPL (i, name, strchr,
 	      IFUNC_IMPL_ADD (array, i, strchr,
-			      HAS_ARCH_FEATURE (AVX2_Usable),
+			      (HAS_ARCH_FEATURE (AVX2_Usable)
+			       && HAS_CPU_FEATURE (BMI2)),
 			      __strchr_avx2)
 	      IFUNC_IMPL_ADD (array, i, strchr,
 			      (HAS_ARCH_FEATURE (AVX2_Usable)
+			       && HAS_CPU_FEATURE (BMI2)
 			       && HAS_CPU_FEATURE (RTM)),
 			      __strchr_avx2_rtm)
 	      IFUNC_IMPL_ADD (array, i, strchr,
@@ -417,10 +419,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/x86_64/multiarch/strchrnul.c.  */
   IFUNC_IMPL (i, name, strchrnul,
 	      IFUNC_IMPL_ADD (array, i, strchrnul,
-			      HAS_ARCH_FEATURE (AVX2_Usable),
+			      (HAS_ARCH_FEATURE (AVX2_Usable)
+			       && HAS_CPU_FEATURE (BMI2)),
 			      __strchrnul_avx2)
 	      IFUNC_IMPL_ADD (array, i, strchrnul,
 			      (HAS_ARCH_FEATURE (AVX2_Usable)
+			       && HAS_CPU_FEATURE (BMI2)
 			       && HAS_CPU_FEATURE (RTM)),
 			      __strchrnul_avx2_rtm)
 	      IFUNC_IMPL_ADD (array, i, strchrnul,
@@ -574,10 +578,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/x86_64/multiarch/wcschr.c.  */
   IFUNC_IMPL (i, name, wcschr,
 	      IFUNC_IMPL_ADD (array, i, wcschr,
-			      HAS_ARCH_FEATURE (AVX2_Usable),
+			      (HAS_ARCH_FEATURE (AVX2_Usable)
+			       && HAS_CPU_FEATURE (BMI2)),
 			      __wcschr_avx2)
 	      IFUNC_IMPL_ADD (array, i, wcschr,
 			      (HAS_ARCH_FEATURE (AVX2_Usable)
+			       && HAS_CPU_FEATURE (BMI2)
 			       && HAS_CPU_FEATURE (RTM)),
 			      __wcschr_avx2_rtm)
 	      IFUNC_IMPL_ADD (array, i, wcschr,
-- 
2.36.1


[-- Attachment #3: 2.32.patch --]
[-- Type: application/octet-stream, Size: 3661 bytes --]

From c06b2890275868d7b8b4eeb5d57cb28288170899 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Mon, 19 Apr 2021 10:45:07 -0700
Subject: [PATCH] x86-64: Require BMI2 for strchr-avx2.S

Since strchr-avx2.S updated by

commit 1f745ecc2109890886b161d4791e1406fdfc29b8
Author: noah <goldstein.w.n@gmail.com>
Date:   Wed Feb 3 00:38:59 2021 -0500

    x86-64: Refactor and improve performance of strchr-avx2.S

uses sarx:

c4 e2 72 f7 c0       	sarx   %ecx,%eax,%eax

for strchr-avx2 family functions, require BMI2 in ifunc-impl-list.c and
ifunc-avx2.h.

(cherry picked from commit 83c5b368226c34a2f0a5287df40fc290b2b34359)
---
 sysdeps/x86_64/multiarch/ifunc-avx2.h      |  4 ++--
 sysdeps/x86_64/multiarch/ifunc-impl-list.c | 12 +++++++++---
 2 files changed, 11 insertions(+), 5 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/ifunc-avx2.h b/sysdeps/x86_64/multiarch/ifunc-avx2.h
index f450c786f0..0d9d837488 100644
--- a/sysdeps/x86_64/multiarch/ifunc-avx2.h
+++ b/sysdeps/x86_64/multiarch/ifunc-avx2.h
@@ -30,11 +30,11 @@ IFUNC_SELECTOR (void)
   const struct cpu_features* cpu_features = __get_cpu_features ();
 
   if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+      && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
       && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
     {
       if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
-	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
-	  && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
+	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
 	return OPTIMIZE (evex);
 
       if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 920e64241e..d4bbf61030 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -400,10 +400,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/x86_64/multiarch/strchr.c.  */
   IFUNC_IMPL (i, name, strchr,
 	      IFUNC_IMPL_ADD (array, i, strchr,
-			      CPU_FEATURE_USABLE (AVX2),
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)),
 			      __strchr_avx2)
 	      IFUNC_IMPL_ADD (array, i, strchr,
 			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)
 			       && CPU_FEATURE_USABLE (RTM)),
 			      __strchr_avx2_rtm)
 	      IFUNC_IMPL_ADD (array, i, strchr,
@@ -417,10 +419,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/x86_64/multiarch/strchrnul.c.  */
   IFUNC_IMPL (i, name, strchrnul,
 	      IFUNC_IMPL_ADD (array, i, strchrnul,
-			      CPU_FEATURE_USABLE (AVX2),
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)),
 			      __strchrnul_avx2)
 	      IFUNC_IMPL_ADD (array, i, strchrnul,
 			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)
 			       && CPU_FEATURE_USABLE (RTM)),
 			      __strchrnul_avx2_rtm)
 	      IFUNC_IMPL_ADD (array, i, strchrnul,
@@ -574,10 +578,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/x86_64/multiarch/wcschr.c.  */
   IFUNC_IMPL (i, name, wcschr,
 	      IFUNC_IMPL_ADD (array, i, wcschr,
-			      CPU_FEATURE_USABLE (AVX2),
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)),
 			      __wcschr_avx2)
 	      IFUNC_IMPL_ADD (array, i, wcschr,
 			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)
 			       && CPU_FEATURE_USABLE (RTM)),
 			      __wcschr_avx2_rtm)
 	      IFUNC_IMPL_ADD (array, i, wcschr,
-- 
2.36.1



* Re: [PATCH v5 2/2] x86: Optimize strlen-avx2.S
  2022-09-28 13:54       ` Sunil Pandey
@ 2022-09-28 14:02         ` Darren Tristano
  2022-09-28 14:42         ` Noah Goldstein
                           ` (2 subsequent siblings)
  3 siblings, 0 replies; 24+ messages in thread
From: Darren Tristano @ 2022-09-28 14:02 UTC (permalink / raw)
  To: Noah Goldstein, Libc-stable Mailing List, Hongjiu Lu, Sunil Pandey
  Cc: GNU C Library

[-- Attachment #1: Type: text/plain, Size: 27733 bytes --]

Please remove me from this thread. I should not be on it.





________________________________
From: Libc-stable <libc-stable-bounces+darren=darrentristano.com@sourceware.org> on behalf of Sunil Pandey via Libc-stable <libc-stable@sourceware.org>
Sent: Wednesday, September 28, 2022 8:54 AM
To: Noah Goldstein <goldstein.w.n@gmail.com>; Libc-stable Mailing List <libc-stable@sourceware.org>; Hongjiu Lu <hjl.tools@gmail.com>
Cc: GNU C Library <libc-alpha@sourceware.org>
Subject: Re: [PATCH v5 2/2] x86: Optimize strlen-avx2.S

The attached patch fixes BZ #29611.

I would like to backport it to 2.32, 2.31, 2.30, 2.29, and 2.28. Let me know
if there is any objection.


On Sun, Sep 25, 2022 at 7:00 AM Noah Goldstein via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> On Sun, Sep 25, 2022 at 1:19 AM Aurelien Jarno <aurelien@aurel32.net> wrote:
> >
> > On 2021-04-19 19:36, Noah Goldstein via Libc-alpha wrote:
> > > No bug. This commit optimizes strlen-avx2.S. The optimizations are
> > > mostly small things but they add up to roughly 10-30% performance
> > > improvement for strlen. The results for strnlen are bit more
> > > ambiguous. test-strlen, test-strnlen, test-wcslen, and test-wcsnlen
> > > are all passing.
> > >
> > > Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
> > > ---
> > >  sysdeps/x86_64/multiarch/ifunc-impl-list.c |  16 +-
> > >  sysdeps/x86_64/multiarch/strlen-avx2.S     | 532 +++++++++++++--------
> > >  2 files changed, 334 insertions(+), 214 deletions(-)
> > >
> > > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > index c377cab629..651b32908e 100644
> > > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > @@ -293,10 +293,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > >    /* Support sysdeps/x86_64/multiarch/strlen.c.  */
> > >    IFUNC_IMPL (i, name, strlen,
> > >             IFUNC_IMPL_ADD (array, i, strlen,
> > > -                           CPU_FEATURE_USABLE (AVX2),
> > > +                           (CPU_FEATURE_USABLE (AVX2)
> > > +                            && CPU_FEATURE_USABLE (BMI2)),
> > >                             __strlen_avx2)
> > >             IFUNC_IMPL_ADD (array, i, strlen,
> > >                             (CPU_FEATURE_USABLE (AVX2)
> > > +                            && CPU_FEATURE_USABLE (BMI2)
> > >                              && CPU_FEATURE_USABLE (RTM)),
> > >                             __strlen_avx2_rtm)
> > >             IFUNC_IMPL_ADD (array, i, strlen,
> > > @@ -309,10 +311,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > >    /* Support sysdeps/x86_64/multiarch/strnlen.c.  */
> > >    IFUNC_IMPL (i, name, strnlen,
> > >             IFUNC_IMPL_ADD (array, i, strnlen,
> > > -                           CPU_FEATURE_USABLE (AVX2),
> > > +                           (CPU_FEATURE_USABLE (AVX2)
> > > +                            && CPU_FEATURE_USABLE (BMI2)),
> > >                             __strnlen_avx2)
> > >             IFUNC_IMPL_ADD (array, i, strnlen,
> > >                             (CPU_FEATURE_USABLE (AVX2)
> > > +                            && CPU_FEATURE_USABLE (BMI2)
> > >                              && CPU_FEATURE_USABLE (RTM)),
> > >                             __strnlen_avx2_rtm)
> > >             IFUNC_IMPL_ADD (array, i, strnlen,
> > > @@ -654,10 +658,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > >    /* Support sysdeps/x86_64/multiarch/wcslen.c.  */
> > >    IFUNC_IMPL (i, name, wcslen,
> > >             IFUNC_IMPL_ADD (array, i, wcslen,
> > > -                           CPU_FEATURE_USABLE (AVX2),
> > > +                           (CPU_FEATURE_USABLE (AVX2)
> > > +                            && CPU_FEATURE_USABLE (BMI2)),
> > >                             __wcslen_avx2)
> > >             IFUNC_IMPL_ADD (array, i, wcslen,
> > >                             (CPU_FEATURE_USABLE (AVX2)
> > > +                            && CPU_FEATURE_USABLE (BMI2)
> > >                              && CPU_FEATURE_USABLE (RTM)),
> > >                             __wcslen_avx2_rtm)
> > >             IFUNC_IMPL_ADD (array, i, wcslen,
> > > @@ -670,10 +676,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > >    /* Support sysdeps/x86_64/multiarch/wcsnlen.c.  */
> > >    IFUNC_IMPL (i, name, wcsnlen,
> > >             IFUNC_IMPL_ADD (array, i, wcsnlen,
> > > -                           CPU_FEATURE_USABLE (AVX2),
> > > +                           (CPU_FEATURE_USABLE (AVX2)
> > > +                            && CPU_FEATURE_USABLE (BMI2)),
> > >                             __wcsnlen_avx2)
> > >             IFUNC_IMPL_ADD (array, i, wcsnlen,
> > >                             (CPU_FEATURE_USABLE (AVX2)
> > > +                            && CPU_FEATURE_USABLE (BMI2)
> > >                              && CPU_FEATURE_USABLE (RTM)),
> > >                             __wcsnlen_avx2_rtm)
> > >             IFUNC_IMPL_ADD (array, i, wcsnlen,
> > > diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S
> > > index 1caae9e6bc..bd2e6ee44a 100644
> > > --- a/sysdeps/x86_64/multiarch/strlen-avx2.S
> > > +++ b/sysdeps/x86_64/multiarch/strlen-avx2.S
> > > @@ -27,9 +27,11 @@
> > >  # ifdef USE_AS_WCSLEN
> > >  #  define VPCMPEQ    vpcmpeqd
> > >  #  define VPMINU     vpminud
> > > +#  define CHAR_SIZE  4
> > >  # else
> > >  #  define VPCMPEQ    vpcmpeqb
> > >  #  define VPMINU     vpminub
> > > +#  define CHAR_SIZE  1
> > >  # endif
> > >
> > >  # ifndef VZEROUPPER
> > > @@ -41,349 +43,459 @@
> > >  # endif
> > >
> > >  # define VEC_SIZE 32
> > > +# define PAGE_SIZE 4096
> > >
> > >       .section SECTION(.text),"ax",@progbits
> > >  ENTRY (STRLEN)
> > >  # ifdef USE_AS_STRNLEN
> > > -     /* Check for zero length.  */
> > > +     /* Check zero length.  */
> > >       test    %RSI_LP, %RSI_LP
> > >       jz      L(zero)
> > > +     /* Store max len in R8_LP before adjusting if using WCSLEN.  */
> > > +     mov     %RSI_LP, %R8_LP
> > >  #  ifdef USE_AS_WCSLEN
> > >       shl     $2, %RSI_LP
> > >  #  elif defined __ILP32__
> > >       /* Clear the upper 32 bits.  */
> > >       movl    %esi, %esi
> > >  #  endif
> > > -     mov     %RSI_LP, %R8_LP
> > >  # endif
> > > -     movl    %edi, %ecx
> > > +     movl    %edi, %eax
> > >       movq    %rdi, %rdx
> > >       vpxor   %xmm0, %xmm0, %xmm0
> > > -
> > > +     /* Clear high bits from edi. Only keeping bits relevant to page
> > > +        cross check.  */
> > > +     andl    $(PAGE_SIZE - 1), %eax
> > >       /* Check if we may cross page boundary with one vector load.  */
> > > -     andl    $(2 * VEC_SIZE - 1), %ecx
> > > -     cmpl    $VEC_SIZE, %ecx
> > > -     ja      L(cros_page_boundary)
> > > +     cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
> > > +     ja      L(cross_page_boundary)
> > >
> > >       /* Check the first VEC_SIZE bytes.  */
> > > -     VPCMPEQ (%rdi), %ymm0, %ymm1
> > > -     vpmovmskb %ymm1, %eax
> > > -     testl   %eax, %eax
> > > -
> > > +     VPCMPEQ (%rdi), %ymm0, %ymm1
> > > +     vpmovmskb       %ymm1, %eax
> > >  # ifdef USE_AS_STRNLEN
> > > -     jnz     L(first_vec_x0_check)
> > > -     /* Adjust length and check the end of data.  */
> > > -     subq    $VEC_SIZE, %rsi
> > > -     jbe     L(max)
> > > -# else
> > > -     jnz     L(first_vec_x0)
> > > +     /* If length < VEC_SIZE handle special.  */
> > > +     cmpq    $VEC_SIZE, %rsi
> > > +     jbe     L(first_vec_x0)
> > >  # endif
> > > -
> > > -     /* Align data for aligned loads in the loop.  */
> > > -     addq    $VEC_SIZE, %rdi
> > > -     andl    $(VEC_SIZE - 1), %ecx
> > > -     andq    $-VEC_SIZE, %rdi
> > > +     /* If empty continue to aligned_more. Otherwise return bit
> > > +        position of first match.  */
> > > +     testl   %eax, %eax
> > > +     jz      L(aligned_more)
> > > +     tzcntl  %eax, %eax
> > > +# ifdef USE_AS_WCSLEN
> > > +     shrl    $2, %eax
> > > +# endif
> > > +     VZEROUPPER_RETURN
> > >
> > >  # ifdef USE_AS_STRNLEN
> > > -     /* Adjust length.  */
> > > -     addq    %rcx, %rsi
> > > +L(zero):
> > > +     xorl    %eax, %eax
> > > +     ret
> > >
> > > -     subq    $(VEC_SIZE * 4), %rsi
> > > -     jbe     L(last_4x_vec_or_less)
> > > +     .p2align 4
> > > +L(first_vec_x0):
> > > +     /* Set bit for max len so that tzcnt will return min of max len
> > > +        and position of first match.  */
> > > +     btsq    %rsi, %rax
> > > +     tzcntl  %eax, %eax
> > > +#  ifdef USE_AS_WCSLEN
> > > +     shrl    $2, %eax
> > > +#  endif
> > > +     VZEROUPPER_RETURN
> > >  # endif
> > > -     jmp     L(more_4x_vec)
> > >
> > >       .p2align 4
> > > -L(cros_page_boundary):
> > > -     andl    $(VEC_SIZE - 1), %ecx
> > > -     andq    $-VEC_SIZE, %rdi
> > > -     VPCMPEQ (%rdi), %ymm0, %ymm1
> > > -     vpmovmskb %ymm1, %eax
> > > -     /* Remove the leading bytes.  */
> > > -     sarl    %cl, %eax
> > > -     testl   %eax, %eax
> > > -     jz      L(aligned_more)
> > > +L(first_vec_x1):
> > >       tzcntl  %eax, %eax
> > > +     /* Safe to use 32 bit instructions as these are only called for
> > > +        size = [1, 159].  */
> > >  # ifdef USE_AS_STRNLEN
> > > -     /* Check the end of data.  */
> > > -     cmpq    %rax, %rsi
> > > -     jbe     L(max)
> > > +     /* Use ecx which was computed earlier to compute correct value.
> > > +      */
> > > +     subl    $(VEC_SIZE * 4 + 1), %ecx
> > > +     addl    %ecx, %eax
> > > +# else
> > > +     subl    %edx, %edi
> > > +     incl    %edi
> > > +     addl    %edi, %eax
> > >  # endif
> > > -     addq    %rdi, %rax
> > > -     addq    %rcx, %rax
> > > -     subq    %rdx, %rax
> > >  # ifdef USE_AS_WCSLEN
> > > -     shrq    $2, %rax
> > > +     shrl    $2, %eax
> > >  # endif
> > > -L(return_vzeroupper):
> > > -     ZERO_UPPER_VEC_REGISTERS_RETURN
> > > +     VZEROUPPER_RETURN
> > >
> > >       .p2align 4
> > > -L(aligned_more):
> > > +L(first_vec_x2):
> > > +     tzcntl  %eax, %eax
> > > +     /* Safe to use 32 bit instructions as these are only called for
> > > +        size = [1, 159].  */
> > >  # ifdef USE_AS_STRNLEN
> > > -        /* "rcx" is less than VEC_SIZE.  Calculate "rdx + rcx - VEC_SIZE"
> > > -         with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE"
> > > -         to void possible addition overflow.  */
> > > -     negq    %rcx
> > > -     addq    $VEC_SIZE, %rcx
> > > -
> > > -     /* Check the end of data.  */
> > > -     subq    %rcx, %rsi
> > > -     jbe     L(max)
> > > +     /* Use ecx which was computed earlier to compute correct value.
> > > +      */
> > > +     subl    $(VEC_SIZE * 3 + 1), %ecx
> > > +     addl    %ecx, %eax
> > > +# else
> > > +     subl    %edx, %edi
> > > +     addl    $(VEC_SIZE + 1), %edi
> > > +     addl    %edi, %eax
> > >  # endif
> > > +# ifdef USE_AS_WCSLEN
> > > +     shrl    $2, %eax
> > > +# endif
> > > +     VZEROUPPER_RETURN
> > >
> > > -     addq    $VEC_SIZE, %rdi
> > > +     .p2align 4
> > > +L(first_vec_x3):
> > > +     tzcntl  %eax, %eax
> > > +     /* Safe to use 32 bit instructions as these are only called for
> > > +        size = [1, 159].  */
> > > +# ifdef USE_AS_STRNLEN
> > > +     /* Use ecx which was computed earlier to compute correct value.
> > > +      */
> > > +     subl    $(VEC_SIZE * 2 + 1), %ecx
> > > +     addl    %ecx, %eax
> > > +# else
> > > +     subl    %edx, %edi
> > > +     addl    $(VEC_SIZE * 2 + 1), %edi
> > > +     addl    %edi, %eax
> > > +# endif
> > > +# ifdef USE_AS_WCSLEN
> > > +     shrl    $2, %eax
> > > +# endif
> > > +     VZEROUPPER_RETURN
> > >
> > > +     .p2align 4
> > > +L(first_vec_x4):
> > > +     tzcntl  %eax, %eax
> > > +     /* Safe to use 32 bit instructions as these are only called for
> > > +        size = [1, 159].  */
> > >  # ifdef USE_AS_STRNLEN
> > > -     subq    $(VEC_SIZE * 4), %rsi
> > > -     jbe     L(last_4x_vec_or_less)
> > > +     /* Use ecx which was computed earlier to compute correct value.
> > > +      */
> > > +     subl    $(VEC_SIZE + 1), %ecx
> > > +     addl    %ecx, %eax
> > > +# else
> > > +     subl    %edx, %edi
> > > +     addl    $(VEC_SIZE * 3 + 1), %edi
> > > +     addl    %edi, %eax
> > >  # endif
> > > +# ifdef USE_AS_WCSLEN
> > > +     shrl    $2, %eax
> > > +# endif
> > > +     VZEROUPPER_RETURN
> > >
> > > -L(more_4x_vec):
> > > +     .p2align 5
> > > +L(aligned_more):
> > > +     /* Align data to VEC_SIZE - 1. This is the same number of
> > > +        instructions as using andq with -VEC_SIZE but saves 4 bytes of
> > > +        code on the x4 check.  */
> > > +     orq     $(VEC_SIZE - 1), %rdi
> > > +L(cross_page_continue):
> > >       /* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
> > >          since data is only aligned to VEC_SIZE.  */
> > > -     VPCMPEQ (%rdi), %ymm0, %ymm1
> > > -     vpmovmskb %ymm1, %eax
> > > -     testl   %eax, %eax
> > > -     jnz     L(first_vec_x0)
> > > -
> > > -     VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
> > > -     vpmovmskb %ymm1, %eax
> > > +# ifdef USE_AS_STRNLEN
> > > +     /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE because
> > > +        it simplies the logic in last_4x_vec_or_less.  */
> > > +     leaq    (VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx
> > > +     subq    %rdx, %rcx
> > > +# endif
> > > +     /* Load first VEC regardless.  */
> > > +     VPCMPEQ 1(%rdi), %ymm0, %ymm1
> > > +# ifdef USE_AS_STRNLEN
> > > +     /* Adjust length. If near end handle specially.  */
> > > +     subq    %rcx, %rsi
> > > +     jb      L(last_4x_vec_or_less)
> > > +# endif
> > > +     vpmovmskb       %ymm1, %eax
> > >       testl   %eax, %eax
> > >       jnz     L(first_vec_x1)
> > >
> > > -     VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
> > > -     vpmovmskb %ymm1, %eax
> > > +     VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> > > +     vpmovmskb       %ymm1, %eax
> > >       testl   %eax, %eax
> > >       jnz     L(first_vec_x2)
> > >
> > > -     VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
> > > -     vpmovmskb %ymm1, %eax
> > > +     VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
> > > +     vpmovmskb       %ymm1, %eax
> > >       testl   %eax, %eax
> > >       jnz     L(first_vec_x3)
> > >
> > > -     addq    $(VEC_SIZE * 4), %rdi
> > > -
> > > -# ifdef USE_AS_STRNLEN
> > > -     subq    $(VEC_SIZE * 4), %rsi
> > > -     jbe     L(last_4x_vec_or_less)
> > > -# endif
> > > -
> > > -     /* Align data to 4 * VEC_SIZE.  */
> > > -     movq    %rdi, %rcx
> > > -     andl    $(4 * VEC_SIZE - 1), %ecx
> > > -     andq    $-(4 * VEC_SIZE), %rdi
> > > +     VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
> > > +     vpmovmskb       %ymm1, %eax
> > > +     testl   %eax, %eax
> > > +     jnz     L(first_vec_x4)
> > >
> > > +     /* Align data to VEC_SIZE * 4 - 1.  */
> > >  # ifdef USE_AS_STRNLEN
> > > -     /* Adjust length.  */
> > > +     /* Before adjusting length check if at last VEC_SIZE * 4.  */
> > > +     cmpq    $(VEC_SIZE * 4 - 1), %rsi
> > > +     jbe     L(last_4x_vec_or_less_load)
> > > +     incq    %rdi
> > > +     movl    %edi, %ecx
> > > +     orq     $(VEC_SIZE * 4 - 1), %rdi
> > > +     andl    $(VEC_SIZE * 4 - 1), %ecx
> > > +     /* Readjust length.  */
> > >       addq    %rcx, %rsi
> > > +# else
> > > +     incq    %rdi
> > > +     orq     $(VEC_SIZE * 4 - 1), %rdi
> > >  # endif
> > > -
> > > +     /* Compare 4 * VEC at a time forward.  */
> > >       .p2align 4
> > >  L(loop_4x_vec):
> > > -     /* Compare 4 * VEC at a time forward.  */
> > > -     vmovdqa (%rdi), %ymm1
> > > -     vmovdqa VEC_SIZE(%rdi), %ymm2
> > > -     vmovdqa (VEC_SIZE * 2)(%rdi), %ymm3
> > > -     vmovdqa (VEC_SIZE * 3)(%rdi), %ymm4
> > > -     VPMINU  %ymm1, %ymm2, %ymm5
> > > -     VPMINU  %ymm3, %ymm4, %ymm6
> > > -     VPMINU  %ymm5, %ymm6, %ymm5
> > > -
> > > -     VPCMPEQ %ymm5, %ymm0, %ymm5
> > > -     vpmovmskb %ymm5, %eax
> > > -     testl   %eax, %eax
> > > -     jnz     L(4x_vec_end)
> > > -
> > > -     addq    $(VEC_SIZE * 4), %rdi
> > > -
> > > -# ifndef USE_AS_STRNLEN
> > > -     jmp     L(loop_4x_vec)
> > > -# else
> > > +# ifdef USE_AS_STRNLEN
> > > +     /* Break if at end of length.  */
> > >       subq    $(VEC_SIZE * 4), %rsi
> > > -     ja      L(loop_4x_vec)
> > > -
> > > -L(last_4x_vec_or_less):
> > > -     /* Less than 4 * VEC and aligned to VEC_SIZE.  */
> > > -     addl    $(VEC_SIZE * 2), %esi
> > > -     jle     L(last_2x_vec)
> > > +     jb      L(last_4x_vec_or_less_cmpeq)
> > > +# endif
> > > +     /* Save some code size by microfusing VPMINU with the load. Since
> > > +        the matches in ymm2/ymm4 can only be returned if there where no
> > > +        matches in ymm1/ymm3 respectively there is no issue with overlap.
> > > +      */
> > > +     vmovdqa 1(%rdi), %ymm1
> > > +     VPMINU  (VEC_SIZE + 1)(%rdi), %ymm1, %ymm2
> > > +     vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm3
> > > +     VPMINU  (VEC_SIZE * 3 + 1)(%rdi), %ymm3, %ymm4
> > > +
> > > +     VPMINU  %ymm2, %ymm4, %ymm5
> > > +     VPCMPEQ %ymm5, %ymm0, %ymm5
> > > +     vpmovmskb       %ymm5, %ecx
> > >
> > > -     VPCMPEQ (%rdi), %ymm0, %ymm1
> > > -     vpmovmskb %ymm1, %eax
> > > -     testl   %eax, %eax
> > > -     jnz     L(first_vec_x0)
> > > +     subq    $-(VEC_SIZE * 4), %rdi
> > > +     testl   %ecx, %ecx
> > > +     jz      L(loop_4x_vec)
> > >
> > > -     VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
> > > -     vpmovmskb %ymm1, %eax
> > > -     testl   %eax, %eax
> > > -     jnz     L(first_vec_x1)
> > >
> > > -     VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
> > > -     vpmovmskb %ymm1, %eax
> > > +     VPCMPEQ %ymm1, %ymm0, %ymm1
> > > +     vpmovmskb       %ymm1, %eax
> > > +     subq    %rdx, %rdi
> > >       testl   %eax, %eax
> > > +     jnz     L(last_vec_return_x0)
> > >
> > > -     jnz     L(first_vec_x2_check)
> > > -     subl    $VEC_SIZE, %esi
> > > -     jle     L(max)
> > > -
> > > -     VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
> > > -     vpmovmskb %ymm1, %eax
> > > +     VPCMPEQ %ymm2, %ymm0, %ymm2
> > > +     vpmovmskb       %ymm2, %eax
> > >       testl   %eax, %eax
> > > -
> > > -     jnz     L(first_vec_x3_check)
> > > -     movq    %r8, %rax
> > > -#  ifdef USE_AS_WCSLEN
> > > +     jnz     L(last_vec_return_x1)
> > > +
> > > +     /* Combine last 2 VEC.  */
> > > +     VPCMPEQ %ymm3, %ymm0, %ymm3
> > > +     vpmovmskb       %ymm3, %eax
> > > +     /* rcx has combined result from all 4 VEC. It will only be used if
> > > +        the first 3 other VEC all did not contain a match.  */
> > > +     salq    $32, %rcx
> > > +     orq     %rcx, %rax
> > > +     tzcntq  %rax, %rax
> > > +     subq    $(VEC_SIZE * 2 - 1), %rdi
> > > +     addq    %rdi, %rax
> > > +# ifdef USE_AS_WCSLEN
> > >       shrq    $2, %rax
> > > -#  endif
> > > +# endif
> > >       VZEROUPPER_RETURN
> > >
> > > +
> > > +# ifdef USE_AS_STRNLEN
> > >       .p2align 4
> > > -L(last_2x_vec):
> > > -     addl    $(VEC_SIZE * 2), %esi
> > > -     VPCMPEQ (%rdi), %ymm0, %ymm1
> > > -     vpmovmskb %ymm1, %eax
> > > -     testl   %eax, %eax
> > > +L(last_4x_vec_or_less_load):
> > > +     /* Depending on entry adjust rdi / prepare first VEC in ymm1.  */
> > > +     subq    $-(VEC_SIZE * 4), %rdi
> > > +L(last_4x_vec_or_less_cmpeq):
> > > +     VPCMPEQ 1(%rdi), %ymm0, %ymm1
> > > +L(last_4x_vec_or_less):
> > >
> > > -     jnz     L(first_vec_x0_check)
> > > -     subl    $VEC_SIZE, %esi
> > > -     jle     L(max)
> > > +     vpmovmskb       %ymm1, %eax
> > > +     /* If remaining length > VEC_SIZE * 2. This works if esi is off by
> > > +        VEC_SIZE * 4.  */
> > > +     testl   $(VEC_SIZE * 2), %esi
> > > +     jnz     L(last_4x_vec)
> > >
> > > -     VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
> > > -     vpmovmskb %ymm1, %eax
> > > +     /* length may have been negative or positive by an offset of
> > > +        VEC_SIZE * 4 depending on where this was called from. This fixes
> > > +        that.  */
> > > +     andl    $(VEC_SIZE * 4 - 1), %esi
> > >       testl   %eax, %eax
> > > -     jnz     L(first_vec_x1_check)
> > > -     movq    %r8, %rax
> > > -#  ifdef USE_AS_WCSLEN
> > > -     shrq    $2, %rax
> > > -#  endif
> > > -     VZEROUPPER_RETURN
> > > +     jnz     L(last_vec_x1_check)
> > >
> > > -     .p2align 4
> > > -L(first_vec_x0_check):
> > > +     subl    $VEC_SIZE, %esi
> > > +     jb      L(max)
> > > +
> > > +     VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> > > +     vpmovmskb       %ymm1, %eax
> > >       tzcntl  %eax, %eax
> > >       /* Check the end of data.  */
> > > -     cmpq    %rax, %rsi
> > > -     jbe     L(max)
> > > +     cmpl    %eax, %esi
> > > +     jb      L(max)
> > > +     subq    %rdx, %rdi
> > > +     addl    $(VEC_SIZE + 1), %eax
> > >       addq    %rdi, %rax
> > > -     subq    %rdx, %rax
> > >  #  ifdef USE_AS_WCSLEN
> > >       shrq    $2, %rax
> > >  #  endif
> > >       VZEROUPPER_RETURN
> > > +# endif
> > >
> > >       .p2align 4
> > > -L(first_vec_x1_check):
> > > +L(last_vec_return_x0):
> > >       tzcntl  %eax, %eax
> > > -     /* Check the end of data.  */
> > > -     cmpq    %rax, %rsi
> > > -     jbe     L(max)
> > > -     addq    $VEC_SIZE, %rax
> > > +     subq    $(VEC_SIZE * 4 - 1), %rdi
> > >       addq    %rdi, %rax
> > > -     subq    %rdx, %rax
> > > -#  ifdef USE_AS_WCSLEN
> > > +# ifdef USE_AS_WCSLEN
> > >       shrq    $2, %rax
> > > -#  endif
> > > +# endif
> > >       VZEROUPPER_RETURN
> > >
> > >       .p2align 4
> > > -L(first_vec_x2_check):
> > > +L(last_vec_return_x1):
> > >       tzcntl  %eax, %eax
> > > -     /* Check the end of data.  */
> > > -     cmpq    %rax, %rsi
> > > -     jbe     L(max)
> > > -     addq    $(VEC_SIZE * 2), %rax
> > > +     subq    $(VEC_SIZE * 3 - 1), %rdi
> > >       addq    %rdi, %rax
> > > -     subq    %rdx, %rax
> > > -#  ifdef USE_AS_WCSLEN
> > > +# ifdef USE_AS_WCSLEN
> > >       shrq    $2, %rax
> > > -#  endif
> > > +# endif
> > >       VZEROUPPER_RETURN
> > >
> > > +# ifdef USE_AS_STRNLEN
> > >       .p2align 4
> > > -L(first_vec_x3_check):
> > > +L(last_vec_x1_check):
> > > +
> > >       tzcntl  %eax, %eax
> > >       /* Check the end of data.  */
> > > -     cmpq    %rax, %rsi
> > > -     jbe     L(max)
> > > -     addq    $(VEC_SIZE * 3), %rax
> > > +     cmpl    %eax, %esi
> > > +     jb      L(max)
> > > +     subq    %rdx, %rdi
> > > +     incl    %eax
> > >       addq    %rdi, %rax
> > > -     subq    %rdx, %rax
> > >  #  ifdef USE_AS_WCSLEN
> > >       shrq    $2, %rax
> > >  #  endif
> > >       VZEROUPPER_RETURN
> > >
> > > -     .p2align 4
> > >  L(max):
> > >       movq    %r8, %rax
> > > +     VZEROUPPER_RETURN
> > > +
> > > +     .p2align 4
> > > +L(last_4x_vec):
> > > +     /* Test first 2x VEC normally.  */
> > > +     testl   %eax, %eax
> > > +     jnz     L(last_vec_x1)
> > > +
> > > +     VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> > > +     vpmovmskb       %ymm1, %eax
> > > +     testl   %eax, %eax
> > > +     jnz     L(last_vec_x2)
> > > +
> > > +     /* Normalize length.  */
> > > +     andl    $(VEC_SIZE * 4 - 1), %esi
> > > +     VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
> > > +     vpmovmskb       %ymm1, %eax
> > > +     testl   %eax, %eax
> > > +     jnz     L(last_vec_x3)
> > > +
> > > +     subl    $(VEC_SIZE * 3), %esi
> > > +     jb      L(max)
> > > +
> > > +     VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
> > > +     vpmovmskb       %ymm1, %eax
> > > +     tzcntl  %eax, %eax
> > > +     /* Check the end of data.  */
> > > +     cmpl    %eax, %esi
> > > +     jb      L(max)
> > > +     subq    %rdx, %rdi
> > > +     addl    $(VEC_SIZE * 3 + 1), %eax
> > > +     addq    %rdi, %rax
> > >  #  ifdef USE_AS_WCSLEN
> > >       shrq    $2, %rax
> > >  #  endif
> > >       VZEROUPPER_RETURN
> > >
> > > -     .p2align 4
> > > -L(zero):
> > > -     xorl    %eax, %eax
> > > -     ret
> > > -# endif
> > >
> > >       .p2align 4
> > > -L(first_vec_x0):
> > > +L(last_vec_x1):
> > > +     /* essentially duplicates of first_vec_x1 but use 64 bit
> > > +        instructions.  */
> > >       tzcntl  %eax, %eax
> > > +     subq    %rdx, %rdi
> > > +     incl    %eax
> > >       addq    %rdi, %rax
> > > -     subq    %rdx, %rax
> > > -# ifdef USE_AS_WCSLEN
> > > +#  ifdef USE_AS_WCSLEN
> > >       shrq    $2, %rax
> > > -# endif
> > > +#  endif
> > >       VZEROUPPER_RETURN
> > >
> > >       .p2align 4
> > > -L(first_vec_x1):
> > > +L(last_vec_x2):
> > > +     /* essentially duplicates of first_vec_x1 but use 64 bit
> > > +        instructions.  */
> > >       tzcntl  %eax, %eax
> > > -     addq    $VEC_SIZE, %rax
> > > +     subq    %rdx, %rdi
> > > +     addl    $(VEC_SIZE + 1), %eax
> > >       addq    %rdi, %rax
> > > -     subq    %rdx, %rax
> > > -# ifdef USE_AS_WCSLEN
> > > +#  ifdef USE_AS_WCSLEN
> > >       shrq    $2, %rax
> > > -# endif
> > > +#  endif
> > >       VZEROUPPER_RETURN
> > >
> > >       .p2align 4
> > > -L(first_vec_x2):
> > > +L(last_vec_x3):
> > >       tzcntl  %eax, %eax
> > > -     addq    $(VEC_SIZE * 2), %rax
> > > +     subl    $(VEC_SIZE * 2), %esi
> > > +     /* Check the end of data.  */
> > > +     cmpl    %eax, %esi
> > > +     jb      L(max_end)
> > > +     subq    %rdx, %rdi
> > > +     addl    $(VEC_SIZE * 2 + 1), %eax
> > >       addq    %rdi, %rax
> > > -     subq    %rdx, %rax
> > > -# ifdef USE_AS_WCSLEN
> > > +#  ifdef USE_AS_WCSLEN
> > >       shrq    $2, %rax
> > > -# endif
> > > +#  endif
> > > +     VZEROUPPER_RETURN
> > > +L(max_end):
> > > +     movq    %r8, %rax
> > >       VZEROUPPER_RETURN
> > > +# endif
> > >
> > > +     /* Cold case for crossing page with first load.  */
> > >       .p2align 4
> > > -L(4x_vec_end):
> > > -     VPCMPEQ %ymm1, %ymm0, %ymm1
> > > -     vpmovmskb %ymm1, %eax
> > > -     testl   %eax, %eax
> > > -     jnz     L(first_vec_x0)
> > > -     VPCMPEQ %ymm2, %ymm0, %ymm2
> > > -     vpmovmskb %ymm2, %eax
> > > +L(cross_page_boundary):
> > > +     /* Align data to VEC_SIZE - 1.  */
> > > +     orq     $(VEC_SIZE - 1), %rdi
> > > +     VPCMPEQ -(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1
> > > +     vpmovmskb       %ymm1, %eax
> > > +     /* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
> > > +        so no need to manually mod rdx.  */
> > > +     sarxl   %edx, %eax, %eax
> >
> > This is a BMI2 instruction, which is not necessarily available when AVX2
> > is available. This causes SIGILL on some CPUs. I have reported that in
> > https://sourceware.org/bugzilla/show_bug.cgi?id=29611
>
> This is not a bug on master as:
>
> commit 83c5b368226c34a2f0a5287df40fc290b2b34359
> Author: H.J. Lu <hjl.tools@gmail.com>
> Date:   Mon Apr 19 10:45:07 2021 -0700
>
>     x86-64: Require BMI2 for strchr-avx2.S
>
> is already in tree. The issue is that the AVX2 changes were backported
> without H.J.'s changes.
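
To make that concrete: the invariant on master is simply that the AVX2
variants are only selected when BMI2 is also usable. A minimal standalone
sketch of that guard (illustrative only -- it uses the GCC cpu builtins
rather than glibc's internal CPU_FEATURE_USABLE machinery, and the function
names here are made up):

#include <stddef.h>

/* Portable reference implementation; always safe to run.  */
static size_t
strlen_generic (const char *s)
{
  size_t n = 0;
  while (s[n] != '\0')
    n++;
  return n;
}

/* Stand-in for an AVX2 implementation that also uses BMI2 (sarx etc.).  */
static size_t
strlen_avx2_bmi2 (const char *s)
{
  return strlen_generic (s);
}

/* IFUNC resolver: require *both* features, mirroring the
   CPU_FEATURE_USABLE (AVX2) && CPU_FEATURE_USABLE (BMI2) checks.  */
static size_t (*resolve_strlen (void)) (const char *)
{
  __builtin_cpu_init ();
  if (__builtin_cpu_supports ("avx2") && __builtin_cpu_supports ("bmi2"))
    return strlen_avx2_bmi2;
  return strlen_generic;
}

size_t my_strlen (const char *s) __attribute__ ((ifunc ("resolve_strlen")));

On a CPU that reports AVX2 but not BMI2 such a resolver simply falls back to
the generic version instead of faulting.
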
> >
> > Regards
> > Aurelien
> >
> > --
> > Aurelien Jarno                          GPG: 4096R/1DDD8C9B
> > aurelien@aurel32.net                 http://www.aurel32.net

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v5 2/2] x86: Optimize strlen-avx2.S
  2022-09-28 13:54       ` Sunil Pandey
  2022-09-28 14:02         ` Darren Tristano
@ 2022-09-28 14:42         ` Noah Goldstein
  2022-09-28 14:54           ` Sunil Pandey
  2022-09-28 18:23         ` [PATCH v5 2/2] x86: Optimize strlen-avx2.S H.J. Lu
  2022-10-04 21:19         ` [PATCH v5 2/2] x86: Optimize strlen-avx2.S Aurelien Jarno
  3 siblings, 1 reply; 24+ messages in thread
From: Noah Goldstein @ 2022-09-28 14:42 UTC (permalink / raw)
  To: Sunil Pandey; +Cc: Libc-stable Mailing List, Hongjiu Lu, GNU C Library

On Wed, Sep 28, 2022 at 6:55 AM Sunil Pandey <skpgkp2@gmail.com> wrote:
>
> Attached patch fixes BZ# 29611.
>
> I would like to backport it to 2.32, 2.31, 2.30, and 2.29. Let me know
> if there is any objection.
The ifunc-impl-list changes are missing the BMI2 check for memchr-avx2.S.
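
Roughly, the memchr entries need the same treatment as the strlen ones
(sketch only, untested; the exact context lines on the release branches may
differ):

   IFUNC_IMPL (i, name, memchr,
	       IFUNC_IMPL_ADD (array, i, memchr,
-			       CPU_FEATURE_USABLE (AVX2),
+			       (CPU_FEATURE_USABLE (AVX2)
+				&& CPU_FEATURE_USABLE (BMI2)),
			       __memchr_avx2)
	       IFUNC_IMPL_ADD (array, i, memchr,
			       (CPU_FEATURE_USABLE (AVX2)
+				&& CPU_FEATURE_USABLE (BMI2)
				&& CPU_FEATURE_USABLE (RTM)),
			       __memchr_avx2_rtm)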

Can you post these as separate emails with the patches embedded instead of
attached?

>
>
> [...]

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v5 2/2] x86: Optimize strlen-avx2.S
  2022-09-28 14:42         ` Noah Goldstein
@ 2022-09-28 14:54           ` Sunil Pandey
  2022-09-28 15:00             ` Noah Goldstein
  0 siblings, 1 reply; 24+ messages in thread
From: Sunil Pandey @ 2022-09-28 14:54 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: Libc-stable Mailing List, Hongjiu Lu, GNU C Library

On Wed, Sep 28, 2022 at 7:42 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Wed, Sep 28, 2022 at 6:55 AM Sunil Pandey <skpgkp2@gmail.com> wrote:
> >
> > Attached patch fixes BZ# 29611.
> >
> > I would like to backport it to 2.32, 2.31, 2.30, and 2.29. Let me know
> > if there is any objection.
> The ifunc-impl-list changes are missing the BMI2 check for memchr-avx2.S.
>
> Can you post these as separate emails with the patches embedded instead of
> attached?
>
> >

Patches are also posted on bug report 29611.

https://sourceware.org/bugzilla/show_bug.cgi?id=29611

> >
> > [...]
> > > > > +     tzcntl  %eax, %eax
> > > > > +# ifdef USE_AS_WCSLEN
> > > > > +     shrl    $2, %eax
> > > > > +# endif
> > > > > +     VZEROUPPER_RETURN
> > > > >
> > > > >  # ifdef USE_AS_STRNLEN
> > > > > -     /* Adjust length.  */
> > > > > -     addq    %rcx, %rsi
> > > > > +L(zero):
> > > > > +     xorl    %eax, %eax
> > > > > +     ret
> > > > >
> > > > > -     subq    $(VEC_SIZE * 4), %rsi
> > > > > -     jbe     L(last_4x_vec_or_less)
> > > > > +     .p2align 4
> > > > > +L(first_vec_x0):
> > > > > +     /* Set bit for max len so that tzcnt will return min of max len
> > > > > +        and position of first match.  */
> > > > > +     btsq    %rsi, %rax
> > > > > +     tzcntl  %eax, %eax
> > > > > +#  ifdef USE_AS_WCSLEN
> > > > > +     shrl    $2, %eax
> > > > > +#  endif
> > > > > +     VZEROUPPER_RETURN
> > > > >  # endif
> > > > > -     jmp     L(more_4x_vec)
> > > > >
> > > > >       .p2align 4
> > > > > -L(cros_page_boundary):
> > > > > -     andl    $(VEC_SIZE - 1), %ecx
> > > > > -     andq    $-VEC_SIZE, %rdi
> > > > > -     VPCMPEQ (%rdi), %ymm0, %ymm1
> > > > > -     vpmovmskb %ymm1, %eax
> > > > > -     /* Remove the leading bytes.  */
> > > > > -     sarl    %cl, %eax
> > > > > -     testl   %eax, %eax
> > > > > -     jz      L(aligned_more)
> > > > > +L(first_vec_x1):
> > > > >       tzcntl  %eax, %eax
> > > > > +     /* Safe to use 32 bit instructions as these are only called for
> > > > > +        size = [1, 159].  */
> > > > >  # ifdef USE_AS_STRNLEN
> > > > > -     /* Check the end of data.  */
> > > > > -     cmpq    %rax, %rsi
> > > > > -     jbe     L(max)
> > > > > +     /* Use ecx which was computed earlier to compute correct value.
> > > > > +      */
> > > > > +     subl    $(VEC_SIZE * 4 + 1), %ecx
> > > > > +     addl    %ecx, %eax
> > > > > +# else
> > > > > +     subl    %edx, %edi
> > > > > +     incl    %edi
> > > > > +     addl    %edi, %eax
> > > > >  # endif
> > > > > -     addq    %rdi, %rax
> > > > > -     addq    %rcx, %rax
> > > > > -     subq    %rdx, %rax
> > > > >  # ifdef USE_AS_WCSLEN
> > > > > -     shrq    $2, %rax
> > > > > +     shrl    $2, %eax
> > > > >  # endif
> > > > > -L(return_vzeroupper):
> > > > > -     ZERO_UPPER_VEC_REGISTERS_RETURN
> > > > > +     VZEROUPPER_RETURN
> > > > >
> > > > >       .p2align 4
> > > > > -L(aligned_more):
> > > > > +L(first_vec_x2):
> > > > > +     tzcntl  %eax, %eax
> > > > > +     /* Safe to use 32 bit instructions as these are only called for
> > > > > +        size = [1, 159].  */
> > > > >  # ifdef USE_AS_STRNLEN
> > > > > -        /* "rcx" is less than VEC_SIZE.  Calculate "rdx + rcx - VEC_SIZE"
> > > > > -         with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE"
> > > > > -         to void possible addition overflow.  */
> > > > > -     negq    %rcx
> > > > > -     addq    $VEC_SIZE, %rcx
> > > > > -
> > > > > -     /* Check the end of data.  */
> > > > > -     subq    %rcx, %rsi
> > > > > -     jbe     L(max)
> > > > > +     /* Use ecx which was computed earlier to compute correct value.
> > > > > +      */
> > > > > +     subl    $(VEC_SIZE * 3 + 1), %ecx
> > > > > +     addl    %ecx, %eax
> > > > > +# else
> > > > > +     subl    %edx, %edi
> > > > > +     addl    $(VEC_SIZE + 1), %edi
> > > > > +     addl    %edi, %eax
> > > > >  # endif
> > > > > +# ifdef USE_AS_WCSLEN
> > > > > +     shrl    $2, %eax
> > > > > +# endif
> > > > > +     VZEROUPPER_RETURN
> > > > >
> > > > > -     addq    $VEC_SIZE, %rdi
> > > > > +     .p2align 4
> > > > > +L(first_vec_x3):
> > > > > +     tzcntl  %eax, %eax
> > > > > +     /* Safe to use 32 bit instructions as these are only called for
> > > > > +        size = [1, 159].  */
> > > > > +# ifdef USE_AS_STRNLEN
> > > > > +     /* Use ecx which was computed earlier to compute correct value.
> > > > > +      */
> > > > > +     subl    $(VEC_SIZE * 2 + 1), %ecx
> > > > > +     addl    %ecx, %eax
> > > > > +# else
> > > > > +     subl    %edx, %edi
> > > > > +     addl    $(VEC_SIZE * 2 + 1), %edi
> > > > > +     addl    %edi, %eax
> > > > > +# endif
> > > > > +# ifdef USE_AS_WCSLEN
> > > > > +     shrl    $2, %eax
> > > > > +# endif
> > > > > +     VZEROUPPER_RETURN
> > > > >
> > > > > +     .p2align 4
> > > > > +L(first_vec_x4):
> > > > > +     tzcntl  %eax, %eax
> > > > > +     /* Safe to use 32 bit instructions as these are only called for
> > > > > +        size = [1, 159].  */
> > > > >  # ifdef USE_AS_STRNLEN
> > > > > -     subq    $(VEC_SIZE * 4), %rsi
> > > > > -     jbe     L(last_4x_vec_or_less)
> > > > > +     /* Use ecx which was computed earlier to compute correct value.
> > > > > +      */
> > > > > +     subl    $(VEC_SIZE + 1), %ecx
> > > > > +     addl    %ecx, %eax
> > > > > +# else
> > > > > +     subl    %edx, %edi
> > > > > +     addl    $(VEC_SIZE * 3 + 1), %edi
> > > > > +     addl    %edi, %eax
> > > > >  # endif
> > > > > +# ifdef USE_AS_WCSLEN
> > > > > +     shrl    $2, %eax
> > > > > +# endif
> > > > > +     VZEROUPPER_RETURN
> > > > >
> > > > > -L(more_4x_vec):
> > > > > +     .p2align 5
> > > > > +L(aligned_more):
> > > > > +     /* Align data to VEC_SIZE - 1. This is the same number of
> > > > > +        instructions as using andq with -VEC_SIZE but saves 4 bytes of
> > > > > +        code on the x4 check.  */
> > > > > +     orq     $(VEC_SIZE - 1), %rdi
> > > > > +L(cross_page_continue):
> > > > >       /* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
> > > > >          since data is only aligned to VEC_SIZE.  */
> > > > > -     VPCMPEQ (%rdi), %ymm0, %ymm1
> > > > > -     vpmovmskb %ymm1, %eax
> > > > > -     testl   %eax, %eax
> > > > > -     jnz     L(first_vec_x0)
> > > > > -
> > > > > -     VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
> > > > > -     vpmovmskb %ymm1, %eax
> > > > > +# ifdef USE_AS_STRNLEN
> > > > > +     /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE because
> > > > > +        it simplifies the logic in last_4x_vec_or_less.  */
> > > > > +     leaq    (VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx
> > > > > +     subq    %rdx, %rcx
> > > > > +# endif
> > > > > +     /* Load first VEC regardless.  */
> > > > > +     VPCMPEQ 1(%rdi), %ymm0, %ymm1
> > > > > +# ifdef USE_AS_STRNLEN
> > > > > +     /* Adjust length. If near end handle specially.  */
> > > > > +     subq    %rcx, %rsi
> > > > > +     jb      L(last_4x_vec_or_less)
> > > > > +# endif
> > > > > +     vpmovmskb       %ymm1, %eax
> > > > >       testl   %eax, %eax
> > > > >       jnz     L(first_vec_x1)
> > > > >
> > > > > -     VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
> > > > > -     vpmovmskb %ymm1, %eax
> > > > > +     VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> > > > > +     vpmovmskb       %ymm1, %eax
> > > > >       testl   %eax, %eax
> > > > >       jnz     L(first_vec_x2)
> > > > >
> > > > > -     VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
> > > > > -     vpmovmskb %ymm1, %eax
> > > > > +     VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
> > > > > +     vpmovmskb       %ymm1, %eax
> > > > >       testl   %eax, %eax
> > > > >       jnz     L(first_vec_x3)
> > > > >
> > > > > -     addq    $(VEC_SIZE * 4), %rdi
> > > > > -
> > > > > -# ifdef USE_AS_STRNLEN
> > > > > -     subq    $(VEC_SIZE * 4), %rsi
> > > > > -     jbe     L(last_4x_vec_or_less)
> > > > > -# endif
> > > > > -
> > > > > -     /* Align data to 4 * VEC_SIZE.  */
> > > > > -     movq    %rdi, %rcx
> > > > > -     andl    $(4 * VEC_SIZE - 1), %ecx
> > > > > -     andq    $-(4 * VEC_SIZE), %rdi
> > > > > +     VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
> > > > > +     vpmovmskb       %ymm1, %eax
> > > > > +     testl   %eax, %eax
> > > > > +     jnz     L(first_vec_x4)
> > > > >
> > > > > +     /* Align data to VEC_SIZE * 4 - 1.  */
> > > > >  # ifdef USE_AS_STRNLEN
> > > > > -     /* Adjust length.  */
> > > > > +     /* Before adjusting length check if at last VEC_SIZE * 4.  */
> > > > > +     cmpq    $(VEC_SIZE * 4 - 1), %rsi
> > > > > +     jbe     L(last_4x_vec_or_less_load)
> > > > > +     incq    %rdi
> > > > > +     movl    %edi, %ecx
> > > > > +     orq     $(VEC_SIZE * 4 - 1), %rdi
> > > > > +     andl    $(VEC_SIZE * 4 - 1), %ecx
> > > > > +     /* Readjust length.  */
> > > > >       addq    %rcx, %rsi
> > > > > +# else
> > > > > +     incq    %rdi
> > > > > +     orq     $(VEC_SIZE * 4 - 1), %rdi
> > > > >  # endif
> > > > > -
> > > > > +     /* Compare 4 * VEC at a time forward.  */
> > > > >       .p2align 4
> > > > >  L(loop_4x_vec):
> > > > > -     /* Compare 4 * VEC at a time forward.  */
> > > > > -     vmovdqa (%rdi), %ymm1
> > > > > -     vmovdqa VEC_SIZE(%rdi), %ymm2
> > > > > -     vmovdqa (VEC_SIZE * 2)(%rdi), %ymm3
> > > > > -     vmovdqa (VEC_SIZE * 3)(%rdi), %ymm4
> > > > > -     VPMINU  %ymm1, %ymm2, %ymm5
> > > > > -     VPMINU  %ymm3, %ymm4, %ymm6
> > > > > -     VPMINU  %ymm5, %ymm6, %ymm5
> > > > > -
> > > > > -     VPCMPEQ %ymm5, %ymm0, %ymm5
> > > > > -     vpmovmskb %ymm5, %eax
> > > > > -     testl   %eax, %eax
> > > > > -     jnz     L(4x_vec_end)
> > > > > -
> > > > > -     addq    $(VEC_SIZE * 4), %rdi
> > > > > -
> > > > > -# ifndef USE_AS_STRNLEN
> > > > > -     jmp     L(loop_4x_vec)
> > > > > -# else
> > > > > +# ifdef USE_AS_STRNLEN
> > > > > +     /* Break if at end of length.  */
> > > > >       subq    $(VEC_SIZE * 4), %rsi
> > > > > -     ja      L(loop_4x_vec)
> > > > > -
> > > > > -L(last_4x_vec_or_less):
> > > > > -     /* Less than 4 * VEC and aligned to VEC_SIZE.  */
> > > > > -     addl    $(VEC_SIZE * 2), %esi
> > > > > -     jle     L(last_2x_vec)
> > > > > +     jb      L(last_4x_vec_or_less_cmpeq)
> > > > > +# endif
> > > > > +     /* Save some code size by microfusing VPMINU with the load. Since
> > > > > +        the matches in ymm2/ymm4 can only be returned if there were no
> > > > > +        matches in ymm1/ymm3 respectively there is no issue with overlap.
> > > > > +      */
> > > > > +     vmovdqa 1(%rdi), %ymm1
> > > > > +     VPMINU  (VEC_SIZE + 1)(%rdi), %ymm1, %ymm2
> > > > > +     vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm3
> > > > > +     VPMINU  (VEC_SIZE * 3 + 1)(%rdi), %ymm3, %ymm4
> > > > > +
> > > > > +     VPMINU  %ymm2, %ymm4, %ymm5
> > > > > +     VPCMPEQ %ymm5, %ymm0, %ymm5
> > > > > +     vpmovmskb       %ymm5, %ecx
> > > > >
> > > > > -     VPCMPEQ (%rdi), %ymm0, %ymm1
> > > > > -     vpmovmskb %ymm1, %eax
> > > > > -     testl   %eax, %eax
> > > > > -     jnz     L(first_vec_x0)
> > > > > +     subq    $-(VEC_SIZE * 4), %rdi
> > > > > +     testl   %ecx, %ecx
> > > > > +     jz      L(loop_4x_vec)
> > > > >
> > > > > -     VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
> > > > > -     vpmovmskb %ymm1, %eax
> > > > > -     testl   %eax, %eax
> > > > > -     jnz     L(first_vec_x1)
> > > > >
> > > > > -     VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
> > > > > -     vpmovmskb %ymm1, %eax
> > > > > +     VPCMPEQ %ymm1, %ymm0, %ymm1
> > > > > +     vpmovmskb       %ymm1, %eax
> > > > > +     subq    %rdx, %rdi
> > > > >       testl   %eax, %eax
> > > > > +     jnz     L(last_vec_return_x0)
> > > > >
> > > > > -     jnz     L(first_vec_x2_check)
> > > > > -     subl    $VEC_SIZE, %esi
> > > > > -     jle     L(max)
> > > > > -
> > > > > -     VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
> > > > > -     vpmovmskb %ymm1, %eax
> > > > > +     VPCMPEQ %ymm2, %ymm0, %ymm2
> > > > > +     vpmovmskb       %ymm2, %eax
> > > > >       testl   %eax, %eax
> > > > > -
> > > > > -     jnz     L(first_vec_x3_check)
> > > > > -     movq    %r8, %rax
> > > > > -#  ifdef USE_AS_WCSLEN
> > > > > +     jnz     L(last_vec_return_x1)
> > > > > +
> > > > > +     /* Combine last 2 VEC.  */
> > > > > +     VPCMPEQ %ymm3, %ymm0, %ymm3
> > > > > +     vpmovmskb       %ymm3, %eax
> > > > > +     /* rcx has combined result from all 4 VEC. It will only be used if
> > > > > +        the first 3 other VEC all did not contain a match.  */
> > > > > +     salq    $32, %rcx
> > > > > +     orq     %rcx, %rax
> > > > > +     tzcntq  %rax, %rax
> > > > > +     subq    $(VEC_SIZE * 2 - 1), %rdi
> > > > > +     addq    %rdi, %rax
> > > > > +# ifdef USE_AS_WCSLEN
> > > > >       shrq    $2, %rax
> > > > > -#  endif
> > > > > +# endif
> > > > >       VZEROUPPER_RETURN
> > > > >
> > > > > +
> > > > > +# ifdef USE_AS_STRNLEN
> > > > >       .p2align 4
> > > > > -L(last_2x_vec):
> > > > > -     addl    $(VEC_SIZE * 2), %esi
> > > > > -     VPCMPEQ (%rdi), %ymm0, %ymm1
> > > > > -     vpmovmskb %ymm1, %eax
> > > > > -     testl   %eax, %eax
> > > > > +L(last_4x_vec_or_less_load):
> > > > > +     /* Depending on entry adjust rdi / prepare first VEC in ymm1.  */
> > > > > +     subq    $-(VEC_SIZE * 4), %rdi
> > > > > +L(last_4x_vec_or_less_cmpeq):
> > > > > +     VPCMPEQ 1(%rdi), %ymm0, %ymm1
> > > > > +L(last_4x_vec_or_less):
> > > > >
> > > > > -     jnz     L(first_vec_x0_check)
> > > > > -     subl    $VEC_SIZE, %esi
> > > > > -     jle     L(max)
> > > > > +     vpmovmskb       %ymm1, %eax
> > > > > +     /* If remaining length > VEC_SIZE * 2. This works if esi is off by
> > > > > +        VEC_SIZE * 4.  */
> > > > > +     testl   $(VEC_SIZE * 2), %esi
> > > > > +     jnz     L(last_4x_vec)
> > > > >
> > > > > -     VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
> > > > > -     vpmovmskb %ymm1, %eax
> > > > > +     /* length may have been negative or positive by an offset of
> > > > > +        VEC_SIZE * 4 depending on where this was called from. This fixes
> > > > > +        that.  */
> > > > > +     andl    $(VEC_SIZE * 4 - 1), %esi
> > > > >       testl   %eax, %eax
> > > > > -     jnz     L(first_vec_x1_check)
> > > > > -     movq    %r8, %rax
> > > > > -#  ifdef USE_AS_WCSLEN
> > > > > -     shrq    $2, %rax
> > > > > -#  endif
> > > > > -     VZEROUPPER_RETURN
> > > > > +     jnz     L(last_vec_x1_check)
> > > > >
> > > > > -     .p2align 4
> > > > > -L(first_vec_x0_check):
> > > > > +     subl    $VEC_SIZE, %esi
> > > > > +     jb      L(max)
> > > > > +
> > > > > +     VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> > > > > +     vpmovmskb       %ymm1, %eax
> > > > >       tzcntl  %eax, %eax
> > > > >       /* Check the end of data.  */
> > > > > -     cmpq    %rax, %rsi
> > > > > -     jbe     L(max)
> > > > > +     cmpl    %eax, %esi
> > > > > +     jb      L(max)
> > > > > +     subq    %rdx, %rdi
> > > > > +     addl    $(VEC_SIZE + 1), %eax
> > > > >       addq    %rdi, %rax
> > > > > -     subq    %rdx, %rax
> > > > >  #  ifdef USE_AS_WCSLEN
> > > > >       shrq    $2, %rax
> > > > >  #  endif
> > > > >       VZEROUPPER_RETURN
> > > > > +# endif
> > > > >
> > > > >       .p2align 4
> > > > > -L(first_vec_x1_check):
> > > > > +L(last_vec_return_x0):
> > > > >       tzcntl  %eax, %eax
> > > > > -     /* Check the end of data.  */
> > > > > -     cmpq    %rax, %rsi
> > > > > -     jbe     L(max)
> > > > > -     addq    $VEC_SIZE, %rax
> > > > > +     subq    $(VEC_SIZE * 4 - 1), %rdi
> > > > >       addq    %rdi, %rax
> > > > > -     subq    %rdx, %rax
> > > > > -#  ifdef USE_AS_WCSLEN
> > > > > +# ifdef USE_AS_WCSLEN
> > > > >       shrq    $2, %rax
> > > > > -#  endif
> > > > > +# endif
> > > > >       VZEROUPPER_RETURN
> > > > >
> > > > >       .p2align 4
> > > > > -L(first_vec_x2_check):
> > > > > +L(last_vec_return_x1):
> > > > >       tzcntl  %eax, %eax
> > > > > -     /* Check the end of data.  */
> > > > > -     cmpq    %rax, %rsi
> > > > > -     jbe     L(max)
> > > > > -     addq    $(VEC_SIZE * 2), %rax
> > > > > +     subq    $(VEC_SIZE * 3 - 1), %rdi
> > > > >       addq    %rdi, %rax
> > > > > -     subq    %rdx, %rax
> > > > > -#  ifdef USE_AS_WCSLEN
> > > > > +# ifdef USE_AS_WCSLEN
> > > > >       shrq    $2, %rax
> > > > > -#  endif
> > > > > +# endif
> > > > >       VZEROUPPER_RETURN
> > > > >
> > > > > +# ifdef USE_AS_STRNLEN
> > > > >       .p2align 4
> > > > > -L(first_vec_x3_check):
> > > > > +L(last_vec_x1_check):
> > > > > +
> > > > >       tzcntl  %eax, %eax
> > > > >       /* Check the end of data.  */
> > > > > -     cmpq    %rax, %rsi
> > > > > -     jbe     L(max)
> > > > > -     addq    $(VEC_SIZE * 3), %rax
> > > > > +     cmpl    %eax, %esi
> > > > > +     jb      L(max)
> > > > > +     subq    %rdx, %rdi
> > > > > +     incl    %eax
> > > > >       addq    %rdi, %rax
> > > > > -     subq    %rdx, %rax
> > > > >  #  ifdef USE_AS_WCSLEN
> > > > >       shrq    $2, %rax
> > > > >  #  endif
> > > > >       VZEROUPPER_RETURN
> > > > >
> > > > > -     .p2align 4
> > > > >  L(max):
> > > > >       movq    %r8, %rax
> > > > > +     VZEROUPPER_RETURN
> > > > > +
> > > > > +     .p2align 4
> > > > > +L(last_4x_vec):
> > > > > +     /* Test first 2x VEC normally.  */
> > > > > +     testl   %eax, %eax
> > > > > +     jnz     L(last_vec_x1)
> > > > > +
> > > > > +     VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
> > > > > +     vpmovmskb       %ymm1, %eax
> > > > > +     testl   %eax, %eax
> > > > > +     jnz     L(last_vec_x2)
> > > > > +
> > > > > +     /* Normalize length.  */
> > > > > +     andl    $(VEC_SIZE * 4 - 1), %esi
> > > > > +     VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
> > > > > +     vpmovmskb       %ymm1, %eax
> > > > > +     testl   %eax, %eax
> > > > > +     jnz     L(last_vec_x3)
> > > > > +
> > > > > +     subl    $(VEC_SIZE * 3), %esi
> > > > > +     jb      L(max)
> > > > > +
> > > > > +     VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
> > > > > +     vpmovmskb       %ymm1, %eax
> > > > > +     tzcntl  %eax, %eax
> > > > > +     /* Check the end of data.  */
> > > > > +     cmpl    %eax, %esi
> > > > > +     jb      L(max)
> > > > > +     subq    %rdx, %rdi
> > > > > +     addl    $(VEC_SIZE * 3 + 1), %eax
> > > > > +     addq    %rdi, %rax
> > > > >  #  ifdef USE_AS_WCSLEN
> > > > >       shrq    $2, %rax
> > > > >  #  endif
> > > > >       VZEROUPPER_RETURN
> > > > >
> > > > > -     .p2align 4
> > > > > -L(zero):
> > > > > -     xorl    %eax, %eax
> > > > > -     ret
> > > > > -# endif
> > > > >
> > > > >       .p2align 4
> > > > > -L(first_vec_x0):
> > > > > +L(last_vec_x1):
> > > > > +     /* essentially duplicates of first_vec_x1 but use 64 bit
> > > > > +        instructions.  */
> > > > >       tzcntl  %eax, %eax
> > > > > +     subq    %rdx, %rdi
> > > > > +     incl    %eax
> > > > >       addq    %rdi, %rax
> > > > > -     subq    %rdx, %rax
> > > > > -# ifdef USE_AS_WCSLEN
> > > > > +#  ifdef USE_AS_WCSLEN
> > > > >       shrq    $2, %rax
> > > > > -# endif
> > > > > +#  endif
> > > > >       VZEROUPPER_RETURN
> > > > >
> > > > >       .p2align 4
> > > > > -L(first_vec_x1):
> > > > > +L(last_vec_x2):
> > > > > +     /* essentially duplicates of first_vec_x1 but use 64 bit
> > > > > +        instructions.  */
> > > > >       tzcntl  %eax, %eax
> > > > > -     addq    $VEC_SIZE, %rax
> > > > > +     subq    %rdx, %rdi
> > > > > +     addl    $(VEC_SIZE + 1), %eax
> > > > >       addq    %rdi, %rax
> > > > > -     subq    %rdx, %rax
> > > > > -# ifdef USE_AS_WCSLEN
> > > > > +#  ifdef USE_AS_WCSLEN
> > > > >       shrq    $2, %rax
> > > > > -# endif
> > > > > +#  endif
> > > > >       VZEROUPPER_RETURN
> > > > >
> > > > >       .p2align 4
> > > > > -L(first_vec_x2):
> > > > > +L(last_vec_x3):
> > > > >       tzcntl  %eax, %eax
> > > > > -     addq    $(VEC_SIZE * 2), %rax
> > > > > +     subl    $(VEC_SIZE * 2), %esi
> > > > > +     /* Check the end of data.  */
> > > > > +     cmpl    %eax, %esi
> > > > > +     jb      L(max_end)
> > > > > +     subq    %rdx, %rdi
> > > > > +     addl    $(VEC_SIZE * 2 + 1), %eax
> > > > >       addq    %rdi, %rax
> > > > > -     subq    %rdx, %rax
> > > > > -# ifdef USE_AS_WCSLEN
> > > > > +#  ifdef USE_AS_WCSLEN
> > > > >       shrq    $2, %rax
> > > > > -# endif
> > > > > +#  endif
> > > > > +     VZEROUPPER_RETURN
> > > > > +L(max_end):
> > > > > +     movq    %r8, %rax
> > > > >       VZEROUPPER_RETURN
> > > > > +# endif
> > > > >
> > > > > +     /* Cold case for crossing page with first load.  */
> > > > >       .p2align 4
> > > > > -L(4x_vec_end):
> > > > > -     VPCMPEQ %ymm1, %ymm0, %ymm1
> > > > > -     vpmovmskb %ymm1, %eax
> > > > > -     testl   %eax, %eax
> > > > > -     jnz     L(first_vec_x0)
> > > > > -     VPCMPEQ %ymm2, %ymm0, %ymm2
> > > > > -     vpmovmskb %ymm2, %eax
> > > > > +L(cross_page_boundary):
> > > > > +     /* Align data to VEC_SIZE - 1.  */
> > > > > +     orq     $(VEC_SIZE - 1), %rdi
> > > > > +     VPCMPEQ -(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1
> > > > > +     vpmovmskb       %ymm1, %eax
> > > > > +     /* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
> > > > > +        so no need to manually mod rdx.  */
> > > > > +     sarxl   %edx, %eax, %eax
> > > >
> > > > This is a BMI2 instruction, which is not necessary available when AVX2
> > > > is available. This causes SIGILL on some CPU. I have reported that in
> > > > https://sourceware.org/bugzilla/show_bug.cgi?id=29611
> > >
> > > This is not a bug on master as:
> > >
> > > commit 83c5b368226c34a2f0a5287df40fc290b2b34359
> > > Author: H.J. Lu <hjl.tools@gmail.com>
> > > Date:   Mon Apr 19 10:45:07 2021 -0700
> > >
> > >     x86-64: Require BMI2 for strchr-avx2.S
> > >
> > > is already in tree. The issue is that the avx2 changes were backported
> > > without H.J.'s changes.
> > > >
> > > > Regards
> > > > Aurelien
> > > >
> > > > --
> > > > Aurelien Jarno                          GPG: 4096R/1DDD8C9B
> > > > aurelien@aurel32.net                 http://www.aurel32.net

^ permalink raw reply	[flat|nested] 24+ messages in thread
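
The crux of the report quoted above is that AVX2 and BMI2 are advertised by
independent CPUID feature bits, so a dispatch test that only checks AVX2 can
still hand out code containing sarxl and fault with SIGILL on CPUs that lack
BMI2.  A minimal standalone sketch of that distinction, using plain GCC
built-ins rather than anything from glibc (the printed messages are purely
illustrative):

/* Illustrative only: shows that AVX2 and BMI2 are separate CPU features
   and that a dispatcher must test both before selecting a routine that
   executes sarx/shrx.  Not glibc source.  */
#include <stdio.h>

int
main (void)
{
  __builtin_cpu_init ();
  int has_avx2 = __builtin_cpu_supports ("avx2");
  int has_bmi2 = __builtin_cpu_supports ("bmi2");

  printf ("AVX2: %d  BMI2: %d\n", has_avx2, has_bmi2);

  /* Mirrors the corrected IFUNC_IMPL_ADD entries: both features are
     required before an AVX2 routine that also uses BMI2 may be chosen.  */
  if (has_avx2 && has_bmi2)
    puts ("safe to select an AVX2 routine that also uses BMI2");
  else
    puts ("must fall back to an implementation without BMI2");
  return 0;
}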

* Re: [PATCH v5 2/2] x86: Optimize strlen-avx2.S
  2022-09-28 14:54           ` Sunil Pandey
@ 2022-09-28 15:00             ` Noah Goldstein
  2022-09-28 18:24               ` H.J. Lu
  0 siblings, 1 reply; 24+ messages in thread
From: Noah Goldstein @ 2022-09-28 15:00 UTC (permalink / raw)
  To: Sunil Pandey; +Cc: Libc-stable Mailing List, Hongjiu Lu, GNU C Library

On Wed, Sep 28, 2022 at 7:55 AM Sunil Pandey <skpgkp2@gmail.com> wrote:
>
> On Wed, Sep 28, 2022 at 7:42 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > On Wed, Sep 28, 2022 at 6:55 AM Sunil Pandey <skpgkp2@gmail.com> wrote:
> > >
> > > Attached patch fixes BZ# 29611.
> > >
> > > I would like to backport it to 2.32,2.31,2.30,2.29 and 2.29. Let me know
> > > if there is any objection.
> > The ifunc-impl-list changes are missing BMI2 for memchr-avx2.S
> >
> > Can you post these as separate emails with the patches embedded instead of
> > attached?
> >
> > >
>
> Patches are also posted on bug report 29611.
>
> https://sourceware.org/bugzilla/show_bug.cgi?id=29611

Is there a mailing list for backport patches like this?

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v5 2/2] x86: Optimize strlen-avx2.S
  2022-09-28 13:54       ` Sunil Pandey
  2022-09-28 14:02         ` Darren Tristano
  2022-09-28 14:42         ` Noah Goldstein
@ 2022-09-28 18:23         ` H.J. Lu
  2022-09-28 19:09           ` Sunil Pandey
  2022-10-04 21:19         ` [PATCH v5 2/2] x86: Optimize strlen-avx2.S Aurelien Jarno
  3 siblings, 2 replies; 24+ messages in thread
From: H.J. Lu @ 2022-09-28 18:23 UTC (permalink / raw)
  To: Sunil Pandey; +Cc: Noah Goldstein, Libc-stable Mailing List, GNU C Library

On Wed, Sep 28, 2022 at 6:55 AM Sunil Pandey <skpgkp2@gmail.com> wrote:
>
> Attached patch fixes BZ# 29611.
>
> I would like to backport it to 2.32,2.31,2.30,2.29 and 2.29. Let me know
> if there is any objection.

It doesn't fully fix BZ #29611.  Like Noah mentioned, we need to add a
BMI2 check to ifunc-impl-list.c for all functions that use "ifunc-avx2.h".
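
For reference, such an entry would look roughly like the sketch below.  This
is illustrative rather than copied from the tree; the macro spellings
(IFUNC_IMPL_ADD, CPU_FEATURE_USABLE) follow recent master and may differ on
the older release branches.

/* Illustrative fragment, not glibc source: a strlen entry in
   sysdeps/x86_64/multiarch/ifunc-impl-list.c gated on BMI2 as well as
   AVX2, so the AVX2 variant is never advertised on AVX2-only CPUs.  */
  IFUNC_IMPL (i, name, strlen,
              IFUNC_IMPL_ADD (array, i, strlen,
                              (CPU_FEATURE_USABLE (AVX2)
                               && CPU_FEATURE_USABLE (BMI2)),
                              __strlen_avx2)
              IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2))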


H.J.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v5 2/2] x86: Optimize strlen-avx2.S
  2022-09-28 15:00             ` Noah Goldstein
@ 2022-09-28 18:24               ` H.J. Lu
  0 siblings, 1 reply; 24+ messages in thread
From: H.J. Lu @ 2022-09-28 18:24 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: Sunil Pandey, Libc-stable Mailing List, GNU C Library

On Wed, Sep 28, 2022 at 8:00 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Wed, Sep 28, 2022 at 7:55 AM Sunil Pandey <skpgkp2@gmail.com> wrote:
> >
> > On Wed, Sep 28, 2022 at 7:42 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > >
> > > On Wed, Sep 28, 2022 at 6:55 AM Sunil Pandey <skpgkp2@gmail.com> wrote:
> > > >
> > > > Attached patch fixes BZ# 29611.
> > > >
> > > > I would like to backport it to 2.32,2.31,2.30,2.29 and 2.29. Let me know
> > > > if there is any objection.
> > > The ifunc-impl-list changes are missing BMI2 for memchr-avx2.S
> > >
> > > Can you post these as separate emails with the patches embedded instead of
> > > attached?
> > >
> > > >
> >
> > Patches are also posted on bug report 29611.
> >
> > https://sourceware.org/bugzilla/show_bug.cgi?id=29611
>
> is there a mailing list for backport patches like this?

It is libc-stable.

-- 
H.J.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v5 2/2] x86: Optimize strlen-avx2.S
  2022-09-28 18:23         ` [PATCH v5 2/2] x86: Optimize strlen-avx2.S H.J. Lu
@ 2022-09-28 19:09           ` Sunil Pandey
  2022-09-28 19:23             ` H.J. Lu
  1 sibling, 1 reply; 24+ messages in thread
From: Sunil Pandey @ 2022-09-28 19:09 UTC (permalink / raw)
  To: H.J. Lu; +Cc: Noah Goldstein, Libc-stable Mailing List, GNU C Library

On Wed, Sep 28, 2022 at 11:24 AM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Wed, Sep 28, 2022 at 6:55 AM Sunil Pandey <skpgkp2@gmail.com> wrote:
> >
> > Attached patch fixes BZ# 29611.
> >
> > I would like to backport it to 2.32,2.31,2.30,2.29 and 2.29. Let me know
> > if there is any objection.
>
> It doesn't fully fix BZ #29611.  Like Noah mentioned, we need to add
> BMI2 check to ifunc-impl-list.c for all functions which use "ifunc-avx2.h".
>
>
> H.J.

Pulling up the corresponding patches is extremely difficult as they are not
modular. I can modify the existing patches (as posted on the bug report) to
incorporate the ifunc-impl-list.c functionality. Is that OK?

For backporting, small incremental changes are preferred. A single monolithic
patch makes backporting extremely difficult, if not impossible.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v5 2/2] x86: Optimize strlen-avx2.S
  2022-09-28 19:09           ` Sunil Pandey
@ 2022-09-28 19:23             ` H.J. Lu
  0 siblings, 0 replies; 24+ messages in thread
From: H.J. Lu @ 2022-09-28 19:23 UTC (permalink / raw)
  To: Sunil Pandey; +Cc: Noah Goldstein, Libc-stable Mailing List, GNU C Library

On Wed, Sep 28, 2022 at 12:09 PM Sunil Pandey <skpgkp2@gmail.com> wrote:
>
> On Wed, Sep 28, 2022 at 11:24 AM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Wed, Sep 28, 2022 at 6:55 AM Sunil Pandey <skpgkp2@gmail.com> wrote:
> > >
> > > Attached patch fixes BZ# 29611.
> > >
> > > I would like to backport it to 2.32,2.31,2.30,2.29 and 2.29. Let me know
> > > if there is any objection.
> >
> > It doesn't fully fix BZ #29611.  Like Noah mentioned, we need to add
> > BMI2 check to ifunc-impl-list.c for all functions which use "ifunc-avx2.h".
> >
> >
> > H.J.
>
> Pulling up corresponding patches are extremely difficult as they are not
> modular. I can modify existing patches (as posted on bug report) to
> incorporate ifunc-impl-list.c functionality. If it is OK?

Please mention BZ #29611 in the commit log of the backport and submit
a separate patch to fully fix it.  We should use a patch set for
BZ #29611.

> For backporting small incremental changes are preferred. Single monolithic
> patch makes backporting extremely difficult, if not impossible.



-- 
H.J.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v5 2/2] x86: Optimize strlen-avx2.S
  2022-09-28 13:54       ` Sunil Pandey
                           ` (2 preceding siblings ...)
  2022-09-28 18:23         ` [PATCH v5 2/2] x86: Optimize strlen-avx2.S H.J. Lu
@ 2022-10-04 21:19         ` Aurelien Jarno
  2022-10-04 21:29           ` H.J. Lu
  2022-10-05  1:10           ` Sunil Pandey
  3 siblings, 2 replies; 24+ messages in thread
From: Aurelien Jarno @ 2022-10-04 21:19 UTC (permalink / raw)
  To: Sunil Pandey
  Cc: Noah Goldstein, Libc-stable Mailing List, Hongjiu Lu, GNU C Library

On 2022-09-28 06:54, Sunil Pandey via Libc-stable wrote:
> Attached patch fixes BZ# 29611.
> 
> I would like to backport it to 2.32,2.31,2.30,2.29 and 2.29. Let me know
> if there is any objection.

Sorry to be late on this. I have a few comments about that patch:

> From 86e1d88e1a3c126597ef39165275ada7564cfce9 Mon Sep 17 00:00:00 2001
> From: "H.J. Lu" <hjl.tools@gmail.com>
> Date: Mon, 19 Apr 2021 10:45:07 -0700
> Subject: [PATCH] x86-64: Require BMI2 for strchr-avx2.S
> 
> Since strchr-avx2.S updated by
> 
> commit 1f745ecc2109890886b161d4791e1406fdfc29b8
> Author: noah <goldstein.w.n@gmail.com>
> Date:   Wed Feb 3 00:38:59 2021 -0500
> 
>     x86-64: Refactor and improve performance of strchr-avx2.S
> 
> uses sarx:
> 
> c4 e2 72 f7 c0       	sarx   %ecx,%eax,%eax
> 
> for strchr-avx2 family functions, require BMI2 in ifunc-impl-list.c and
> ifunc-avx2.h.
> 
> (cherry picked from commit 83c5b368226c34a2f0a5287df40fc290b2b34359)
> ---
>  sysdeps/x86_64/multiarch/ifunc-avx2.h      |  4 ++--
>  sysdeps/x86_64/multiarch/ifunc-impl-list.c | 12 +++++++++---
>  2 files changed, 11 insertions(+), 5 deletions(-)

First of all, 1f745ecc2109890886b161d4791e1406fdfc29b8 never got
backported to 2.32 and older branches, and strchr-avx2.S in those
branches does not use BMI2 instructions. So it doesn't make sense to
backport it.

That said the change in ifunc-avx2.h fixes:

- memchr and rawmemchr, broken by the backport of acfd088a1963 ("x86:
  Optimize memchr-avx2.S")
- strlen and strnlen, broken by the backport of aaa23c350715 ("x86:
  Optimize strlen-avx2.S")

So the issues are fixed, but mostly by chance.
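
A quick way to check whether a given machine is in the affected class (AVX2
reported but BMI2 not) is the following self-contained program.  It is not
part of glibc or of the patches discussed here, just a diagnostic sketch
using the GCC cpu-support builtins.

/* Prints whether this CPU advertises AVX2 but not BMI2, i.e. whether an
   AVX2 string routine using sarx would fault on it (BZ #29611).
   Build with: gcc check-bmi2.c -o check-bmi2 (file name is arbitrary).  */
#include <stdio.h>

int
main (void)
{
  __builtin_cpu_init ();
  int avx2 = __builtin_cpu_supports ("avx2");
  int bmi2 = __builtin_cpu_supports ("bmi2");
  printf ("avx2=%d bmi2=%d -> %s\n", avx2, bmi2,
          (avx2 && !bmi2) ? "affected (AVX2 without BMI2)" : "not affected");
  return 0;
}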

NB: at this stage, I haven't verified the consistency of the ifunc
selectors with ifunc-impl-list.c.

-- 
Aurelien Jarno                          GPG: 4096R/1DDD8C9B
aurelien@aurel32.net                 http://www.aurel32.net

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v5 2/2] x86: Optimize strlen-avx2.S
  2022-10-04 21:19         ` [PATCH v5 2/2] x86: Optimize strlen-avx2.S Aurelien Jarno
@ 2022-10-04 21:29           ` H.J. Lu
  2022-10-05  1:10           ` Sunil Pandey
  1 sibling, 0 replies; 24+ messages in thread
From: H.J. Lu @ 2022-10-04 21:29 UTC (permalink / raw)
  To: Sunil Pandey, Noah Goldstein, Libc-stable Mailing List,
	Hongjiu Lu, GNU C Library

On Tue, Oct 4, 2022 at 2:20 PM Aurelien Jarno <aurelien@aurel32.net> wrote:
>
> On 2022-09-28 06:54, Sunil Pandey via Libc-stable wrote:
> > Attached patch fixes BZ# 29611.
> >
> > I would like to backport it to 2.32,2.31,2.30,2.29 and 2.29. Let me know
> > if there is any objection.
>
> Sorry to be late on this. I have a few comments about that patch:
>
> > From 86e1d88e1a3c126597ef39165275ada7564cfce9 Mon Sep 17 00:00:00 2001
> > From: "H.J. Lu" <hjl.tools@gmail.com>
> > Date: Mon, 19 Apr 2021 10:45:07 -0700
> > Subject: [PATCH] x86-64: Require BMI2 for strchr-avx2.S
> >
> > Since strchr-avx2.S updated by
> >
> > commit 1f745ecc2109890886b161d4791e1406fdfc29b8
> > Author: noah <goldstein.w.n@gmail.com>
> > Date:   Wed Feb 3 00:38:59 2021 -0500
> >
> >     x86-64: Refactor and improve performance of strchr-avx2.S
> >
> > uses sarx:
> >
> > c4 e2 72 f7 c0        sarx   %ecx,%eax,%eax
> >
> > for strchr-avx2 family functions, require BMI2 in ifunc-impl-list.c and
> > ifunc-avx2.h.
> >
> > (cherry picked from commit 83c5b368226c34a2f0a5287df40fc290b2b34359)
> > ---
> >  sysdeps/x86_64/multiarch/ifunc-avx2.h      |  4 ++--
> >  sysdeps/x86_64/multiarch/ifunc-impl-list.c | 12 +++++++++---
> >  2 files changed, 11 insertions(+), 5 deletions(-)
>
> First of all 1f745ecc2109890886b161d4791e1406fdfc29b8 never got
> backported to 2.32 and older branches, and strchr-avx2.S in those
> branches do not use BMI2 instructions. So it doesn't make sense to
> backport it.
>
> That said the change in ifunc-avx2.h fixes:
>
> - memchr and rawmemchr, broken by the backport of acfd088a1963 ("x86:
>   Optimize memchr-avx2.S")
> - strlen and strnlen, broken by the backport of aaa23c350715 ("x86:
>   Optimize strlen-avx2.S")
>
> So the issues are fixed, but mostly by chance.
>
> NB: at this stage, I haven't verified the consistency of the ifunc
> selectors with ifunc-impl-list.c.
>

Changes to ifunc-impl-list.c aren't strictly needed since strchr functions
don't use BMI2.  AVX2 strchr functions are still tested on machines with
AVX2 and BMI2.

-- 
H.J.

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v5 2/2] x86: Optimize strlen-avx2.S
  2022-10-04 21:19         ` [PATCH v5 2/2] x86: Optimize strlen-avx2.S Aurelien Jarno
  2022-10-04 21:29           ` H.J. Lu
@ 2022-10-05  1:10           ` Sunil Pandey
  2022-10-05 14:23             ` Noah Goldstein
  2022-10-05 17:11             ` Aurelien Jarno
  1 sibling, 2 replies; 24+ messages in thread
From: Sunil Pandey @ 2022-10-05  1:10 UTC (permalink / raw)
  To: Sunil Pandey, Noah Goldstein, Libc-stable Mailing List,
	Hongjiu Lu, GNU C Library

On Tue, Oct 4, 2022 at 2:20 PM Aurelien Jarno <aurelien@aurel32.net> wrote:
>
> On 2022-09-28 06:54, Sunil Pandey via Libc-stable wrote:
> > Attached patch fixes BZ# 29611.
> >
> > I would like to backport it to 2.32,2.31,2.30,2.29 and 2.29. Let me know
> > if there is any objection.
>
> Sorry to be late on this. I have a few comments about that patch:
>
> > From 86e1d88e1a3c126597ef39165275ada7564cfce9 Mon Sep 17 00:00:00 2001
> > From: "H.J. Lu" <hjl.tools@gmail.com>
> > Date: Mon, 19 Apr 2021 10:45:07 -0700
> > Subject: [PATCH] x86-64: Require BMI2 for strchr-avx2.S
> >
> > Since strchr-avx2.S updated by
> >
> > commit 1f745ecc2109890886b161d4791e1406fdfc29b8
> > Author: noah <goldstein.w.n@gmail.com>
> > Date:   Wed Feb 3 00:38:59 2021 -0500
> >
> >     x86-64: Refactor and improve performance of strchr-avx2.S
> >
> > uses sarx:
> >
> > c4 e2 72 f7 c0        sarx   %ecx,%eax,%eax
> >
> > for strchr-avx2 family functions, require BMI2 in ifunc-impl-list.c and
> > ifunc-avx2.h.
> >
> > (cherry picked from commit 83c5b368226c34a2f0a5287df40fc290b2b34359)
> > ---
> >  sysdeps/x86_64/multiarch/ifunc-avx2.h      |  4 ++--
> >  sysdeps/x86_64/multiarch/ifunc-impl-list.c | 12 +++++++++---
> >  2 files changed, 11 insertions(+), 5 deletions(-)
>
> First of all 1f745ecc2109890886b161d4791e1406fdfc29b8 never got
> backported to 2.32 and older branches, and strchr-avx2.S in those
> branches do not use BMI2 instructions. So it doesn't make sense to
> backport it.
>
> That said the change in ifunc-avx2.h fixes:
>
> - memchr and rawmemchr, broken by the backport of acfd088a1963 ("x86:
>   Optimize memchr-avx2.S")
> - strlen and strnlen, broken by the backport of aaa23c350715 ("x86:
>   Optimize strlen-avx2.S")
>
> So the issues are fixed, but mostly by chance.

How do you know it is a "by chance" fix? Do you have any evidence to back
your claim?

>
> NB: at this stage, I haven't verified the consistency of the ifunc
> selectors with ifunc-impl-list.c.
>
> --
> Aurelien Jarno                          GPG: 4096R/1DDD8C9B
> aurelien@aurel32.net                 http://www.aurel32.net

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v5 2/2] x86: Optimize strlen-avx2.S
  2022-10-05  1:10           ` Sunil Pandey
@ 2022-10-05 14:23             ` Noah Goldstein
  2022-10-05 16:35               ` Sunil Pandey
  2022-10-05 17:11             ` Aurelien Jarno
  1 sibling, 1 reply; 24+ messages in thread
From: Noah Goldstein @ 2022-10-05 14:23 UTC (permalink / raw)
  To: Sunil Pandey; +Cc: Libc-stable Mailing List, Hongjiu Lu, GNU C Library

On Tue, Oct 4, 2022 at 6:11 PM Sunil Pandey <skpgkp2@gmail.com> wrote:
>
> On Tue, Oct 4, 2022 at 2:20 PM Aurelien Jarno <aurelien@aurel32.net> wrote:
> >
> > On 2022-09-28 06:54, Sunil Pandey via Libc-stable wrote:
> > > Attached patch fixes BZ# 29611.
> > >
> > > I would like to backport it to 2.32,2.31,2.30,2.29 and 2.29. Let me know
> > > if there is any objection.
> >
> > Sorry to be late on this. I have a few comments about that patch:
> >
> > > From 86e1d88e1a3c126597ef39165275ada7564cfce9 Mon Sep 17 00:00:00 2001
> > > From: "H.J. Lu" <hjl.tools@gmail.com>
> > > Date: Mon, 19 Apr 2021 10:45:07 -0700
> > > Subject: [PATCH] x86-64: Require BMI2 for strchr-avx2.S
> > >
> > > Since strchr-avx2.S updated by
> > >
> > > commit 1f745ecc2109890886b161d4791e1406fdfc29b8
> > > Author: noah <goldstein.w.n@gmail.com>
> > > Date:   Wed Feb 3 00:38:59 2021 -0500
> > >
> > >     x86-64: Refactor and improve performance of strchr-avx2.S
> > >
> > > uses sarx:
> > >
> > > c4 e2 72 f7 c0        sarx   %ecx,%eax,%eax
> > >
> > > for strchr-avx2 family functions, require BMI2 in ifunc-impl-list.c and
> > > ifunc-avx2.h.
> > >
> > > (cherry picked from commit 83c5b368226c34a2f0a5287df40fc290b2b34359)
> > > ---
> > >  sysdeps/x86_64/multiarch/ifunc-avx2.h      |  4 ++--
> > >  sysdeps/x86_64/multiarch/ifunc-impl-list.c | 12 +++++++++---
> > >  2 files changed, 11 insertions(+), 5 deletions(-)
> >
> > First of all 1f745ecc2109890886b161d4791e1406fdfc29b8 never got
> > backported to 2.32 and older branches, and strchr-avx2.S in those
> > branches do not use BMI2 instructions. So it doesn't make sense to
> > backport it.
> >
> > That said the change in ifunc-avx2.h fixes:
> >
> > - memchr and rawmemchr, broken by the backport of acfd088a1963 ("x86:
> >   Optimize memchr-avx2.S")
> > - strlen and strnlen, broken by the backport of aaa23c350715 ("x86:
> >   Optimize strlen-avx2.S")
> >
> > So the issues are fixed, but mostly by chance.
>
> How do you know it is a "by chance" fix, do you have any evidence to back
> your claim?

There might not be evidence about the intention of the authors, but the
strchr commit message clearly does not say that it also fixes memchr/strlen.
>
> >
> > NB: at this stage, I haven't verified the consistency of the ifunc
> > selectors with ifunc-impl-list.c.
> >
> > --
> > Aurelien Jarno                          GPG: 4096R/1DDD8C9B
> > aurelien@aurel32.net                 http://www.aurel32.net

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v5 2/2] x86: Optimize strlen-avx2.S
  2022-10-05 14:23             ` Noah Goldstein
@ 2022-10-05 16:35               ` Sunil Pandey
  0 siblings, 0 replies; 24+ messages in thread
From: Sunil Pandey @ 2022-10-05 16:35 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: Libc-stable Mailing List, Hongjiu Lu, GNU C Library

On Wed, Oct 5, 2022 at 7:23 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Tue, Oct 4, 2022 at 6:11 PM Sunil Pandey <skpgkp2@gmail.com> wrote:
> >
> > On Tue, Oct 4, 2022 at 2:20 PM Aurelien Jarno <aurelien@aurel32.net> wrote:
> > >
> > > On 2022-09-28 06:54, Sunil Pandey via Libc-stable wrote:
> > > > Attached patch fixes BZ# 29611.
> > > >
> > > > I would like to backport it to 2.32,2.31,2.30,2.29 and 2.29. Let me know
> > > > if there is any objection.
> > >
> > > Sorry to be late on this. I have a few comments about that patch:
> > >
> > > > From 86e1d88e1a3c126597ef39165275ada7564cfce9 Mon Sep 17 00:00:00 2001
> > > > From: "H.J. Lu" <hjl.tools@gmail.com>
> > > > Date: Mon, 19 Apr 2021 10:45:07 -0700
> > > > Subject: [PATCH] x86-64: Require BMI2 for strchr-avx2.S
> > > >
> > > > Since strchr-avx2.S updated by
> > > >
> > > > commit 1f745ecc2109890886b161d4791e1406fdfc29b8
> > > > Author: noah <goldstein.w.n@gmail.com>
> > > > Date:   Wed Feb 3 00:38:59 2021 -0500
> > > >
> > > >     x86-64: Refactor and improve performance of strchr-avx2.S
> > > >
> > > > uses sarx:
> > > >
> > > > c4 e2 72 f7 c0        sarx   %ecx,%eax,%eax
> > > >
> > > > for strchr-avx2 family functions, require BMI2 in ifunc-impl-list.c and
> > > > ifunc-avx2.h.
> > > >
> > > > (cherry picked from commit 83c5b368226c34a2f0a5287df40fc290b2b34359)
> > > > ---
> > > >  sysdeps/x86_64/multiarch/ifunc-avx2.h      |  4 ++--
> > > >  sysdeps/x86_64/multiarch/ifunc-impl-list.c | 12 +++++++++---
> > > >  2 files changed, 11 insertions(+), 5 deletions(-)
> > >
> > > First of all 1f745ecc2109890886b161d4791e1406fdfc29b8 never got
> > > backported to 2.32 and older branches, and strchr-avx2.S in those
> > > branches do not use BMI2 instructions. So it doesn't make sense to
> > > backport it.
> > >
> > > That said the change in ifunc-avx2.h fixes:
> > >
> > > - memchr and rawmemchr, broken by the backport of acfd088a1963 ("x86:
> > >   Optimize memchr-avx2.S")
> > > - strlen and strnlen, broken by the backport of aaa23c350715 ("x86:
> > >   Optimize strlen-avx2.S")
> > >
> > > So the issues are fixed, but mostly by chance.
> >
> > How do you know it is a "by chance" fix, do you have any evidence to back
> > your claim?
>
> There might not be evidence about the intention of the authors but clearly
> the strchr commit message does not clarify that it also fixes memchr/strlen.

The ifunc-avx2.h header file is used by many functions, so a fix in
ifunc-avx2.h fixes all of those functions too. It's not "by chance"; I
scanned all the functions where ifunc-avx2.h is used before backporting it.

Since this is a backport commit and no extra changes are made, there is no
need to modify the original author's commit message.
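
As an illustration of that sharing (paraphrased, not copied from any
branch): each per-function wrapper defines SYMBOL_NAME and then includes
the common header, so a single condition in ifunc-avx2.h gates every
function built this way.

/* Paraphrased sketch of the sysdeps/x86_64/multiarch/strlen.c-style
   wrappers; memchr.c, rawmemchr.c, strnlen.c and friends follow the same
   pattern, which is why one BMI2 check in ifunc-avx2.h covers them all.  */
#define strlen __redirect_strlen
#include <string.h>
#undef strlen

#define SYMBOL_NAME strlen
#include "ifunc-avx2.h"

libc_ifunc_redirected (__redirect_strlen, strlen, IFUNC_SELECTOR ());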

> >
> > >
> > > NB: at this stage, I haven't verified the consistency of the ifunc
> > > selectors with ifunc-impl-list.c.
> > >
> > > --
> > > Aurelien Jarno                          GPG: 4096R/1DDD8C9B
> > > aurelien@aurel32.net                 http://www.aurel32.net

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v5 2/2] x86: Optimize strlen-avx2.S
  2022-10-05  1:10           ` Sunil Pandey
  2022-10-05 14:23             ` Noah Goldstein
@ 2022-10-05 17:11             ` Aurelien Jarno
  2022-10-05 18:34               ` Sunil Pandey
  1 sibling, 1 reply; 24+ messages in thread
From: Aurelien Jarno @ 2022-10-05 17:11 UTC (permalink / raw)
  To: Sunil Pandey
  Cc: Noah Goldstein, Libc-stable Mailing List, Hongjiu Lu, GNU C Library

On 2022-10-04 18:10, Sunil Pandey via Libc-alpha wrote:
> On Tue, Oct 4, 2022 at 2:20 PM Aurelien Jarno <aurelien@aurel32.net> wrote:
> >
> > On 2022-09-28 06:54, Sunil Pandey via Libc-stable wrote:
> > > Attached patch fixes BZ# 29611.
> > >
> > > I would like to backport it to 2.32,2.31,2.30,2.29 and 2.29. Let me know
> > > if there is any objection.
> >
> > Sorry to be late on this. I have a few comments about that patch:
> >
> > > From 86e1d88e1a3c126597ef39165275ada7564cfce9 Mon Sep 17 00:00:00 2001
> > > From: "H.J. Lu" <hjl.tools@gmail.com>
> > > Date: Mon, 19 Apr 2021 10:45:07 -0700
> > > Subject: [PATCH] x86-64: Require BMI2 for strchr-avx2.S
> > >
> > > Since strchr-avx2.S updated by
> > >
> > > commit 1f745ecc2109890886b161d4791e1406fdfc29b8
> > > Author: noah <goldstein.w.n@gmail.com>
> > > Date:   Wed Feb 3 00:38:59 2021 -0500
> > >
> > >     x86-64: Refactor and improve performance of strchr-avx2.S
> > >
> > > uses sarx:
> > >
> > > c4 e2 72 f7 c0        sarx   %ecx,%eax,%eax
> > >
> > > for strchr-avx2 family functions, require BMI2 in ifunc-impl-list.c and
> > > ifunc-avx2.h.
> > >
> > > (cherry picked from commit 83c5b368226c34a2f0a5287df40fc290b2b34359)
> > > ---
> > >  sysdeps/x86_64/multiarch/ifunc-avx2.h      |  4 ++--
> > >  sysdeps/x86_64/multiarch/ifunc-impl-list.c | 12 +++++++++---
> > >  2 files changed, 11 insertions(+), 5 deletions(-)
> >
> > First of all 1f745ecc2109890886b161d4791e1406fdfc29b8 never got
> > backported to 2.32 and older branches, and strchr-avx2.S in those
> > branches do not use BMI2 instructions. So it doesn't make sense to
> > backport it.
> >
> > That said the change in ifunc-avx2.h fixes:
> >
> > - memchr and rawmemchr, broken by the backport of acfd088a1963 ("x86:
> >   Optimize memchr-avx2.S")
> > - strlen and strnlen, broken by the backport of aaa23c350715 ("x86:
> >   Optimize strlen-avx2.S")
> >
> > So the issues are fixed, but mostly by chance.
> 
> How do you know it is a "by chance" fix, do you have any evidence to back
> your claim?

My point is that the commit that has been backported is fixing a bug
that doesn't exist in the 2.32 and older branches. strchr-avx2.S does not
use the sarx instruction as the commit claims, and does not use other BMI2
instructions either.

However, following the backport of commits acfd088a1963 and aaa23c350715
in these branches, memchr-avx2.S and strlen-avx2.S use BMI2
instructions, and as they use ifunc-avx2.h, this actually fixes the bug.
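
Concretely, after the fix the selector has roughly the following shape.
This is a simplified paraphrase of the master-branch ifunc-avx2.h, not a
copy from any release branch; the EVEX/RTM cases and the exact macro
spellings vary across versions.

/* Simplified sketch of IFUNC_SELECTOR in ifunc-avx2.h with the BMI2
   requirement added: the AVX2 variant is only chosen when BMI2 is also
   usable, so memchr-avx2.S and strlen-avx2.S (which use BMI2 instructions
   such as sarx after the backports) cannot be selected on
   AVX2-without-BMI2 CPUs.  */
static inline void *
IFUNC_SELECTOR (void)
{
  const struct cpu_features *cpu_features = __get_cpu_features ();

  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
      && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
      && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
    return OPTIMIZE (avx2);

  return OPTIMIZE (sse2);
}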

-- 
Aurelien Jarno                          GPG: 4096R/1DDD8C9B
aurelien@aurel32.net                 http://www.aurel32.net

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH v5 2/2] x86: Optimize strlen-avx2.S
  2022-10-05 17:11             ` Aurelien Jarno
@ 2022-10-05 18:34               ` Sunil Pandey
  0 siblings, 0 replies; 24+ messages in thread
From: Sunil Pandey @ 2022-10-05 18:34 UTC (permalink / raw)
  To: Sunil Pandey, Noah Goldstein, Libc-stable Mailing List,
	Hongjiu Lu, GNU C Library

On Wed, Oct 5, 2022 at 10:11 AM Aurelien Jarno <aurelien@aurel32.net> wrote:
>
> On 2022-10-04 18:10, Sunil Pandey via Libc-alpha wrote:
> > On Tue, Oct 4, 2022 at 2:20 PM Aurelien Jarno <aurelien@aurel32.net> wrote:
> > >
> > > On 2022-09-28 06:54, Sunil Pandey via Libc-stable wrote:
> > > > Attached patch fixes BZ# 29611.
> > > >
> > > > I would like to backport it to 2.32,2.31,2.30,2.29 and 2.29. Let me know
> > > > if there is any objection.
> > >
> > > Sorry to be late on this. I have a few comments about that patch:
> > >
> > > > From 86e1d88e1a3c126597ef39165275ada7564cfce9 Mon Sep 17 00:00:00 2001
> > > > From: "H.J. Lu" <hjl.tools@gmail.com>
> > > > Date: Mon, 19 Apr 2021 10:45:07 -0700
> > > > Subject: [PATCH] x86-64: Require BMI2 for strchr-avx2.S
> > > >
> > > > Since strchr-avx2.S updated by
> > > >
> > > > commit 1f745ecc2109890886b161d4791e1406fdfc29b8
> > > > Author: noah <goldstein.w.n@gmail.com>
> > > > Date:   Wed Feb 3 00:38:59 2021 -0500
> > > >
> > > >     x86-64: Refactor and improve performance of strchr-avx2.S
> > > >
> > > > uses sarx:
> > > >
> > > > c4 e2 72 f7 c0        sarx   %ecx,%eax,%eax
> > > >
> > > > for strchr-avx2 family functions, require BMI2 in ifunc-impl-list.c and
> > > > ifunc-avx2.h.
> > > >
> > > > (cherry picked from commit 83c5b368226c34a2f0a5287df40fc290b2b34359)
> > > > ---
> > > >  sysdeps/x86_64/multiarch/ifunc-avx2.h      |  4 ++--
> > > >  sysdeps/x86_64/multiarch/ifunc-impl-list.c | 12 +++++++++---
> > > >  2 files changed, 11 insertions(+), 5 deletions(-)
> > >
> > > First of all 1f745ecc2109890886b161d4791e1406fdfc29b8 never got
> > > backported to 2.32 and older branches, and strchr-avx2.S in those
> > > branches do not use BMI2 instructions. So it doesn't make sense to
> > > backport it.
> > >
> > > That said the change in ifunc-avx2.h fixes:
> > >
> > > - memchr and rawmemchr, broken by the backport of acfd088a1963 ("x86:
> > >   Optimize memchr-avx2.S")
> > > - strlen and strnlen, broken by the backport of aaa23c350715 ("x86:
> > >   Optimize strlen-avx2.S")
> > >
> > > So the issues are fixed, but mostly by chance.
> >
> > How do you know it is a "by chance" fix, do you have any evidence to back
> > your claim?
>
> My point is that the commit that has been backported is fixing a bug
> that doesn't exist in 2.32 branches. strchr-avx2.S does not the sarx
> instruction as the commit claims, and does not use other BMI2
> instructions either.
>
> However following the backport of commit acfd088a1963 and aaa23c350715
> in these branches, memchr-avx2.S and strlen-avx2.S use BMI2
> instructions, and as they use ifunc-avx2.h, this actually fixes the bug.
>

This patch got selected because it fixes the ifunc-avx2.h file. My preference
is to take an existing patch if possible, rather than creating a new one for
the branches.

You are right, the original patch should have been composed differently to
make it crystal clear.

For backporting, it's preferable to have small independent patches with
logical grouping.


> --
> Aurelien Jarno                          GPG: 4096R/1DDD8C9B
> aurelien@aurel32.net                 http://www.aurel32.net

^ permalink raw reply	[flat|nested] 24+ messages in thread

end of thread, other threads:[~2022-10-05 18:34 UTC | newest]

Thread overview: 24+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-04-19 23:36 [PATCH v5 1/2] x86: Optimize strlen-evex.S Noah Goldstein
2021-04-19 23:36 ` [PATCH v5 2/2] x86: Optimize strlen-avx2.S Noah Goldstein
2021-04-20  1:01   ` H.J. Lu
2022-09-25  8:19   ` Aurelien Jarno
2022-09-25 14:00     ` Noah Goldstein
2022-09-28 13:54       ` Sunil Pandey
2022-09-28 14:02         ` Darren Tristano
2022-09-28 14:42         ` Noah Goldstein
2022-09-28 14:54           ` Sunil Pandey
2022-09-28 15:00             ` Noah Goldstein
2022-09-28 18:24               ` H.J. Lu
2022-09-28 18:23         ` [PATCH v5 2/2] x86: Optimize strlen-avx2.S H.J. Lu
2022-09-28 19:09           ` Sunil Pandey
2022-09-28 19:23             ` H.J. Lu
2022-10-04 21:19         ` [PATCH v5 2/2] x86: Optimize strlen-avx2.S Aurelien Jarno
2022-10-04 21:29           ` H.J. Lu
2022-10-05  1:10           ` Sunil Pandey
2022-10-05 14:23             ` Noah Goldstein
2022-10-05 16:35               ` Sunil Pandey
2022-10-05 17:11             ` Aurelien Jarno
2022-10-05 18:34               ` Sunil Pandey
2021-04-20  1:01 ` [PATCH v5 1/2] x86: Optimize strlen-evex.S H.J. Lu

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).