public inbox for libc-alpha@sourceware.org
* [PATCH v1 1/7] x86: Optimize memchr-evex.S and implement with VMM headers
@ 2022-10-18  2:48 Noah Goldstein
  2022-10-18  2:48 ` [PATCH v1 2/7] x86: Shrink / minorly optimize strchr-evex " Noah Goldstein
                   ` (8 more replies)
  0 siblings, 9 replies; 41+ messages in thread
From: Noah Goldstein @ 2022-10-18  2:48 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

Optimizations are:

1. Use the fact that tzcnt(0) -> VEC_SIZE for memchr to save a branch
   in the short-string case (see the sketch after this list).
2. Restructure code so that small strings are given the hot path.
	- This is a net-zero on the benchmark suite but in general makes
      sense as smaller sizes are far more common.
3. Use more code-size efficient instructions.
	- tzcnt ...     -> bsf ...
	- vpcmpb $0 ... -> vpcmpeq ...
4. Align labels less aggressively, especially if doing so doesn't save
   fetch blocks or would cause the basic block to span extra cache
   lines.
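
As a condensed sketch of point 1 (lifted from the byte-variant
prologue in this patch): tzcnt of an all-zero mask returns the
operand width, so when CHAR_PER_VEC == VEC_SIZE a single unsigned
compare against the length covers both the no-match and the
match-out-of-bounds cases:

	VPCMPEQ	(%rdi), %VMATCH, %k0
	KMOV	%k0, %VRAX
	tzcnt	%VRAX, %VRSI	/* rsi = VEC_SIZE if no match.  */
	cmpq	%rsi, %rdx	/* len <= index of first match?  */
	jbe	L(zero_0)	/* One branch covers both cases.  */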

The optimizations (especially for point 2) make the memchr and
rawmemchr code essentially incompatible, so rawmemchr-evex is split
out into a new file.

Code Size Changes:
memchr-evex.S       : -107 bytes
rawmemchr-evex.S    :  -53 bytes

Net perf changes:

Reported as the geometric mean of all improvements / regressions from
N=10 runs of the benchtests. Values are New Time / Old Time, so < 1.0
is an improvement and > 1.0 is a regression.
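
Concretely, the reported statistic is computed as (notation ours,
with i ranging over the benchmark configurations):

	geomean = (prod_i New_i / Old_i) ^ (1 / num_configs)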

memchr-evex.S       : 0.928
rawmemchr-evex.S    : 0.986 (Fewer targets cross cache lines)

Full results attached in email.

Full check passes on x86-64.
---
 sysdeps/x86_64/multiarch/memchr-evex.S        | 939 ++++++++++--------
 sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S |   9 +-
 sysdeps/x86_64/multiarch/rawmemchr-evex.S     | 313 +++++-
 3 files changed, 851 insertions(+), 410 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S
index 0dd4f1dcce..23a1c0018e 100644
--- a/sysdeps/x86_64/multiarch/memchr-evex.S
+++ b/sysdeps/x86_64/multiarch/memchr-evex.S
@@ -21,17 +21,27 @@
 
 #if ISA_SHOULD_BUILD (4)
 
+# ifndef VEC_SIZE
+#  include "x86-evex256-vecs.h"
+# endif
+
 # ifndef MEMCHR
 #  define MEMCHR	__memchr_evex
 # endif
 
 # ifdef USE_AS_WMEMCHR
+#  define PC_SHIFT_GPR	rcx
+#  define VPTESTN	vptestnmd
 #  define VPBROADCAST	vpbroadcastd
 #  define VPMINU	vpminud
 #  define VPCMP	vpcmpd
 #  define VPCMPEQ	vpcmpeqd
 #  define CHAR_SIZE	4
+
+#  define USE_WIDE_CHAR
 # else
+#  define PC_SHIFT_GPR	rdi
+#  define VPTESTN	vptestnmb
 #  define VPBROADCAST	vpbroadcastb
 #  define VPMINU	vpminub
 #  define VPCMP	vpcmpb
@@ -39,534 +49,661 @@
 #  define CHAR_SIZE	1
 # endif
 
-	/* In the 4x loop the RTM and non-RTM versions have data pointer
-	   off by VEC_SIZE * 4 with RTM version being VEC_SIZE * 4 greater.
-	   This is represented by BASE_OFFSET. As well because the RTM
-	   version uses vpcmp which stores a bit per element compared where
-	   the non-RTM version uses vpcmpeq which stores a bit per byte
-	   compared RET_SCALE of CHAR_SIZE is only relevant for the RTM
-	   version.  */
-# ifdef USE_IN_RTM
+# include "reg-macros.h"
+
+
+/* If not in RTM and VEC_SIZE != 64 (VEC_SIZE == 64 has no VEX
+   encoding), use VEX encoding in the loop so we can use vpcmpeqb
+   + vptern, which is more efficient than the EVEX alternative.  */
+# if defined USE_IN_RTM || VEC_SIZE == 64
+#  undef COND_VZEROUPPER
+#  undef VZEROUPPER_RETURN
+#  undef VZEROUPPER
+
+#  define COND_VZEROUPPER
+#  define VZEROUPPER_RETURN	ret
 #  define VZEROUPPER
-#  define BASE_OFFSET	(VEC_SIZE * 4)
-#  define RET_SCALE	CHAR_SIZE
+
+#  define USE_TERN_IN_LOOP	0
 # else
+#  define USE_TERN_IN_LOOP	1
+#  undef VZEROUPPER
 #  define VZEROUPPER	vzeroupper
-#  define BASE_OFFSET	0
-#  define RET_SCALE	1
 # endif
 
-	/* In the return from 4x loop memchr and rawmemchr versions have
-	   data pointers off by VEC_SIZE * 4 with memchr version being
-	   VEC_SIZE * 4 greater.  */
-# ifdef USE_AS_RAWMEMCHR
-#  define RET_OFFSET	(BASE_OFFSET - (VEC_SIZE * 4))
-#  define RAW_PTR_REG	rcx
-#  define ALGN_PTR_REG	rdi
+# if USE_TERN_IN_LOOP
+	/* The resulting bitmask for vpmovmskb has 4 bits set for each
+	   wchar so we don't want to multiply the resulting index.  */
+#  define TERN_CHAR_MULT	1
+
+#  ifdef USE_AS_WMEMCHR
+#   define TEST_END()	inc %VRCX
+#  else
+#   define TEST_END()	add %rdx, %rcx
+#  endif
 # else
-#  define RET_OFFSET	BASE_OFFSET
-#  define RAW_PTR_REG	rdi
-#  define ALGN_PTR_REG	rcx
+#  define TERN_CHAR_MULT	CHAR_SIZE
+#  define TEST_END()	KORTEST %k2, %k3
 # endif
 
-# define XMMZERO	xmm23
-# define YMMZERO	ymm23
-# define XMMMATCH	xmm16
-# define YMMMATCH	ymm16
-# define YMM1		ymm17
-# define YMM2		ymm18
-# define YMM3		ymm19
-# define YMM4		ymm20
-# define YMM5		ymm21
-# define YMM6		ymm22
+# if defined USE_AS_WMEMCHR || !USE_TERN_IN_LOOP
+#  ifndef USE_AS_WMEMCHR
+#   define GPR_X0_IS_RET	1
+#  else
+#   define GPR_X0_IS_RET	0
+#  endif
+#  define GPR_X0	rax
+# else
+#  define GPR_X0_IS_RET	0
+#  define GPR_X0	rdx
+# endif
+
+# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
 
-# ifndef SECTION
-#  define SECTION(p)	p##.evex
+# if CHAR_PER_VEC == 64
+#  define LAST_VEC_OFFSET	(VEC_SIZE * 3)
+# else
+#  define LAST_VEC_OFFSET	(VEC_SIZE * 2)
+# endif
+# if CHAR_PER_VEC >= 32
+#  define MASK_GPR(...)	VGPR(__VA_ARGS__)
+# elif CHAR_PER_VEC == 16
+#  define MASK_GPR(reg)	VGPR_SZ(reg, 16)
+# else
+#  define MASK_GPR(reg)	VGPR_SZ(reg, 8)
 # endif
 
-# define VEC_SIZE 32
-# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
-# define PAGE_SIZE 4096
+# define VMATCH	VMM(0)
+# define VMATCH_LO	VMM_lo(0)
 
-	.section SECTION(.text),"ax",@progbits
+# define PAGE_SIZE	4096
+
+
+	.section SECTION(.text), "ax", @progbits
 ENTRY_P2ALIGN (MEMCHR, 6)
-# ifndef USE_AS_RAWMEMCHR
 	/* Check for zero length.  */
 	test	%RDX_LP, %RDX_LP
-	jz	L(zero)
+	jz	L(zero_0)
 
-#  ifdef __ILP32__
+# ifdef __ILP32__
 	/* Clear the upper 32 bits.  */
 	movl	%edx, %edx
-#  endif
 # endif
-	/* Broadcast CHAR to YMMMATCH.  */
-	VPBROADCAST %esi, %YMMMATCH
+	VPBROADCAST %esi, %VMATCH
 	/* Check if we may cross page boundary with one vector load.  */
 	movl	%edi, %eax
 	andl	$(PAGE_SIZE - 1), %eax
 	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
-	ja	L(cross_page_boundary)
+	ja	L(page_cross)
+
+	VPCMPEQ	(%rdi), %VMATCH, %k0
+	KMOV	%k0, %VRAX
+# ifndef USE_AS_WMEMCHR
+	/* If rax is zero then tzcnt -> CHAR_PER_VEC.  NB: tzcnt has a
+	   false dependency on its destination, but rsi already depends
+	   on esi here so there is no false-dep concern.  */
+	tzcnt	%VRAX, %VRSI
+	/* If rdx <= rsi then either 1) rax was non-zero (there was a
+	   match) but it was out of bounds or 2) rax was zero and rdx
+	   was <= VEC_SIZE so we are done scanning.  */
+	cmpq	%rsi, %rdx
+	/* NB: Use branch to return zero/non-zero.  Common usage will
+	   branch on result of function (if return is null/non-null).
+	   This branch can be used to predict the ensuing one so there
+	   is no reason to extend the data-dependency with cmovcc.  */
+	jbe	L(zero_0)
+
+	/* If rax is zero there was no match, so len must be >
+	   CHAR_PER_VEC and we keep scanning.  Otherwise, since we
+	   already tested len vs tzcnt(rax) (in rsi), we are good to
+	   return this match.  */
+	test	%VRAX, %VRAX
+	jz	L(more_1x_vec)
+	leaq	(%rdi, %rsi), %rax
+# else
 
-	/* Check the first VEC_SIZE bytes.  */
-	VPCMP	$0, (%rdi), %YMMMATCH, %k0
-	kmovd	%k0, %eax
-# ifndef USE_AS_RAWMEMCHR
-	/* If length < CHAR_PER_VEC handle special.  */
+	/* We can't use the `tzcnt` trick for wmemchr because CHAR_SIZE
+	   > 1, so tzcnt of a zero mask gives the register width, not
+	   CHAR_PER_VEC.  */
 	cmpq	$CHAR_PER_VEC, %rdx
-	jbe	L(first_vec_x0)
-# endif
-	testl	%eax, %eax
-	jz	L(aligned_more)
-	tzcntl	%eax, %eax
-# ifdef USE_AS_WMEMCHR
-	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
+	ja	L(more_1x_vec)
+	tzcnt	%VRAX, %VRAX
+	cmpl	%eax, %edx
+	jbe	L(zero_0)
+L(first_vec_x0_ret):
 	leaq	(%rdi, %rax, CHAR_SIZE), %rax
-# else
-	addq	%rdi, %rax
 # endif
 	ret
 
-# ifndef USE_AS_RAWMEMCHR
-L(zero):
-	xorl	%eax, %eax
-	ret
-
-	.p2align 4
-L(first_vec_x0):
-	/* Check if first match was before length. NB: tzcnt has false data-
-	   dependency on destination. eax already had a data-dependency on esi
-	   so this should have no affect here.  */
-	tzcntl	%eax, %esi
-#  ifdef USE_AS_WMEMCHR
-	leaq	(%rdi, %rsi, CHAR_SIZE), %rdi
-#  else
-	addq	%rsi, %rdi
-#  endif
+	/* Only fits in first cache line for VEC_SIZE == 32.  */
+# if VEC_SIZE == 32
+	.p2align 4,, 2
+L(zero_0):
 	xorl	%eax, %eax
-	cmpl	%esi, %edx
-	cmovg	%rdi, %rax
 	ret
 # endif
 
-	.p2align 4
-L(cross_page_boundary):
-	/* Save pointer before aligning as its original value is
-	   necessary for computer return address if byte is found or
-	   adjusting length if it is not and this is memchr.  */
-	movq	%rdi, %rcx
-	/* Align data to VEC_SIZE. ALGN_PTR_REG is rcx for memchr and rdi
-	   for rawmemchr.  */
-	andq	$-VEC_SIZE, %ALGN_PTR_REG
-	VPCMP	$0, (%ALGN_PTR_REG), %YMMMATCH, %k0
-	kmovd	%k0, %r8d
+	.p2align 4,, 9
+L(more_1x_vec):
 # ifdef USE_AS_WMEMCHR
-	/* NB: Divide shift count by 4 since each bit in K0 represent 4
-	   bytes.  */
-	sarl	$2, %eax
-# endif
-# ifndef USE_AS_RAWMEMCHR
-	movl	$(PAGE_SIZE / CHAR_SIZE), %esi
-	subl	%eax, %esi
+	/* For wmemchr we still need to test if there was a match in the
+	   first VEC.  Use bsf to test here so we can reuse
+	   L(first_vec_x0_ret).  */
+	bsf	%VRAX, %VRAX
+	jnz	L(first_vec_x0_ret)
 # endif
+
+L(page_cross_continue):
 # ifdef USE_AS_WMEMCHR
-	andl	$(CHAR_PER_VEC - 1), %eax
-# endif
-	/* Remove the leading bytes.  */
-	sarxl	%eax, %r8d, %eax
-# ifndef USE_AS_RAWMEMCHR
-	/* Check the end of data.  */
-	cmpq	%rsi, %rdx
-	jbe	L(first_vec_x0)
+	/* We can't use the end of the buffer to re-calculate length
+	   for wmemchr as len * CHAR_SIZE may overflow.  */
+	leaq	-(VEC_SIZE + CHAR_SIZE)(%rdi), %rax
+	andq	$(VEC_SIZE * -1), %rdi
+	subq	%rdi, %rax
+	sarq	$2, %rax
+	addq	%rdx, %rax
+# else
+	leaq	-(VEC_SIZE + 1)(%rdx, %rdi), %rax
+	andq	$(VEC_SIZE * -1), %rdi
+	subq	%rdi, %rax
 # endif
-	testl	%eax, %eax
-	jz	L(cross_page_continue)
-	tzcntl	%eax, %eax
+
+	/* rax contains the remaining length - 1.  The -1 lets us get
+	   imm8 encoding in a few additional places, saving code size.  */
+
+	/* Needed regardless of remaining length.  */
+	VPCMPEQ	VEC_SIZE(%rdi), %VMATCH, %k0
+	KMOV	%k0, %VRDX
+
+	/* We cannot fold the above `sub %rdi, %rax` with the `cmp
+	   $(CHAR_PER_VEC * 2), %rax` because it's possible for a very
+	   large length to overflow and cause the subtract to carry
+	   despite length being above CHAR_PER_VEC * 2.  */
+	cmpq	$(CHAR_PER_VEC * 2 - 1), %rax
+	ja	L(more_2x_vec)
+L(last_2x_vec):
+
+	test	%VRDX, %VRDX
+	jnz	L(first_vec_x1_check)
+
+	/* Check the end of data.  NB: use 8-bit operations to save code
+	   size.  We no longer need the full-width of eax and will
+	   perform a write-only operation over eax so there will be no
+	   partial-register stalls.  */
+	subb	$(CHAR_PER_VEC * 1 - 1), %al
+	jle	L(zero_0)
+
+	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %VMATCH, %k0
+	KMOV	%k0, %VRCX
 # ifdef USE_AS_WMEMCHR
-	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
-	leaq	(%RAW_PTR_REG, %rax, CHAR_SIZE), %rax
+	/* For wmemchr we can't take advantage of tzcnt(0) == VEC_SIZE
+	   as CHAR_PER_VEC != VEC_SIZE.  */
+	test	%VRCX, %VRCX
+	jz	L(zero_0)
+# endif
+	tzcnt	%VRCX, %VRCX
+	cmp	%cl, %al
+
+	/* Same CFG for VEC_SIZE == 64 and VEC_SIZE == 32.  We give
+	   fallthrough to L(zero_0) for VEC_SIZE == 64 here as there is
+	   not enough space before the next cache line to fit the `lea`
+	   for return.  */
+# if VEC_SIZE == 64
+	ja	L(first_vec_x2_ret)
+L(zero_0):
+	xorl	%eax, %eax
+	ret
 # else
-	addq	%RAW_PTR_REG, %rax
+	jbe	L(zero_0)
+	leaq	(VEC_SIZE * 2)(%rdi, %rcx, CHAR_SIZE), %rax
+	ret
 # endif
+
+	.p2align 4,, 5
+L(first_vec_x1_check):
+	bsf	%VRDX, %VRDX
+	cmpb	%dl, %al
+	jb	L(zero_4)
+	leaq	(VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %rax
 	ret
 
-	.p2align 4
-L(first_vec_x1):
-	tzcntl	%eax, %eax
-	leaq	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
+	/* Fits at the end of the cache line here for VEC_SIZE == 32.
+	 */
+# if VEC_SIZE == 32
+L(zero_4):
+	xorl	%eax, %eax
 	ret
+# endif
 
-	.p2align 4
+
+	.p2align 4,, 4
 L(first_vec_x2):
-	tzcntl	%eax, %eax
-	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+	bsf	%VRCX, %VRCX
+L(first_vec_x2_ret):
+	leaq	(VEC_SIZE * 2)(%rdi, %rcx, CHAR_SIZE), %rax
 	ret
 
-	.p2align 4
-L(first_vec_x3):
-	tzcntl	%eax, %eax
-	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+	/* Fits at the end of the cache line here for VEC_SIZE == 64.
+	 */
+# if VEC_SIZE == 64
+L(zero_4):
+	xorl	%eax, %eax
 	ret
+# endif
 
-	.p2align 4
-L(first_vec_x4):
-	tzcntl	%eax, %eax
-	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
+	.p2align 4,, 4
+L(first_vec_x1):
+	bsf	%VRDX, %VRDX
+	leaq	(VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %rax
 	ret
 
-	.p2align 5
-L(aligned_more):
-	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
-	   since data is only aligned to VEC_SIZE.  */
 
-# ifndef USE_AS_RAWMEMCHR
-	/* Align data to VEC_SIZE.  */
-L(cross_page_continue):
-	xorl	%ecx, %ecx
-	subl	%edi, %ecx
-	andq	$-VEC_SIZE, %rdi
-	/* esi is for adjusting length to see if near the end.  */
-	leal	(VEC_SIZE * 5)(%rdi, %rcx), %esi
-#  ifdef USE_AS_WMEMCHR
-	/* NB: Divide bytes by 4 to get the wchar_t count.  */
-	sarl	$2, %esi
-#  endif
-# else
-	andq	$-VEC_SIZE, %rdi
-L(cross_page_continue):
-# endif
-	/* Load first VEC regardless.  */
-	VPCMP	$0, (VEC_SIZE)(%rdi), %YMMMATCH, %k0
-	kmovd	%k0, %eax
-# ifndef USE_AS_RAWMEMCHR
-	/* Adjust length. If near end handle specially.  */
-	subq	%rsi, %rdx
-	jbe	L(last_4x_vec_or_less)
-# endif
-	testl	%eax, %eax
+	.p2align 4,, 5
+L(more_2x_vec):
+	/* Length > VEC_SIZE * 2 so check first 2x VEC before rechecking
+	   length.  */
+
+
+	/* Already computed matches for first VEC in rdx.  */
+	test	%VRDX, %VRDX
 	jnz	L(first_vec_x1)
 
-	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
-	kmovd	%k0, %eax
-	testl	%eax, %eax
+
+	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %VMATCH, %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
 	jnz	L(first_vec_x2)
 
-	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
-	kmovd	%k0, %eax
-	testl	%eax, %eax
+	/* Needed regardless of next length check.  */
+	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %VMATCH, %k0
+	KMOV	%k0, %VRCX
+
+	/* Check if we are near the end.  */
+	cmpq	$(CHAR_PER_VEC * 4 - 1), %rax
+	ja	L(more_4x_vec)
+
+	test	%VRCX, %VRCX
+	jnz	L(first_vec_x3_check)
+
+	/* Use 8-bit instructions to save code size.  We won't use full-
+	   width eax again and will perform a write-only operation to
+	   eax so no worries about partial-register stalls.  */
+	subb	$(CHAR_PER_VEC * 3), %al
+	jb	L(zero_2)
+L(last_vec_check):
+	VPCMPEQ	(VEC_SIZE * 4)(%rdi), %VMATCH, %k0
+	KMOV	%k0, %VRCX
+# ifdef USE_AS_WMEMCHR
+	/* For wmemchr we can't take advantage of tzcnt(0) == VEC_SIZE
+	   as CHAR_PER_VEC != VEC_SIZE.  */
+	test	%VRCX, %VRCX
+	jz	L(zero_2)
+# endif
+	tzcnt	%VRCX, %VRCX
+	cmp	%cl, %al
+	jae	L(first_vec_x4_ret)
+L(zero_2):
+	xorl	%eax, %eax
+	ret
+
+	/* Fits at the end of the cache line here for VEC_SIZE == 64.
+	   For VEC_SIZE == 32 we put the return label at the end of
+	   L(first_vec_x4).  */
+# if VEC_SIZE == 64
+L(first_vec_x4_ret):
+	leaq	(VEC_SIZE * 4)(%rdi, %rcx, CHAR_SIZE), %rax
+	ret
+# endif
+
+	.p2align 4,, 6
+L(first_vec_x4):
+	bsf	%VRCX, %VRCX
+# if VEC_SIZE == 32
+	/* Place L(first_vec_x4_ret) here as we can't fit it in the same
+	   cache line as where it is called from so we might as well
+	   save code size by reusing return of L(first_vec_x4).  */
+L(first_vec_x4_ret):
+# endif
+	leaq	(VEC_SIZE * 4)(%rdi, %rcx, CHAR_SIZE), %rax
+	ret
+
+	.p2align 4,, 6
+L(first_vec_x3_check):
+	/* Need to adjust remaining length before checking.  */
+	addb	$-(CHAR_PER_VEC * 2), %al
+	bsf	%VRCX, %VRCX
+	cmpb	%cl, %al
+	jb	L(zero_2)
+	leaq	(VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %rax
+	ret
+
+	.p2align 4,, 6
+L(first_vec_x3):
+	bsf	%VRCX, %VRCX
+	leaq	(VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %rax
+	ret
+
+	.p2align 4,, 3
+# if !USE_TERN_IN_LOOP
+	.p2align 4,, 10
+# endif
+L(more_4x_vec):
+	test	%VRCX, %VRCX
 	jnz	L(first_vec_x3)
 
-	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
-	kmovd	%k0, %eax
-	testl	%eax, %eax
+	VPCMPEQ	(VEC_SIZE * 4)(%rdi), %VMATCH, %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
 	jnz	L(first_vec_x4)
 
+	subq	$-(VEC_SIZE * 5), %rdi
+	subq	$(CHAR_PER_VEC * 8), %rax
+	jb	L(last_4x_vec)
 
-# ifndef USE_AS_RAWMEMCHR
-	/* Check if at last CHAR_PER_VEC * 4 length.  */
-	subq	$(CHAR_PER_VEC * 4), %rdx
-	jbe	L(last_4x_vec_or_less_cmpeq)
-	/* +VEC_SIZE if USE_IN_RTM otherwise +VEC_SIZE * 5.  */
-	addq	$(VEC_SIZE + (VEC_SIZE * 4 - BASE_OFFSET)), %rdi
-
-	/* Align data to VEC_SIZE * 4 for the loop and readjust length.
-	 */
-#  ifdef USE_AS_WMEMCHR
+# ifdef USE_AS_WMEMCHR
 	movl	%edi, %ecx
-	andq	$-(4 * VEC_SIZE), %rdi
+# else
+	addq	%rdi, %rax
+# endif
+
+
+# if VEC_SIZE == 64
+	/* Use xorb to do `andq $-(VEC_SIZE * 4), %rdi`.  No evex512
+	   processor has partial register stalls (all have merging
+	   uops).  If that changes, this can be removed.  */
+	xorb	%dil, %dil
+# else
+	andq	$-(VEC_SIZE * 4), %rdi
+# endif
+
+# ifdef USE_AS_WMEMCHR
 	subl	%edi, %ecx
-	/* NB: Divide bytes by 4 to get the wchar_t count.  */
 	sarl	$2, %ecx
-	addq	%rcx, %rdx
-#  else
-	addq	%rdi, %rdx
-	andq	$-(4 * VEC_SIZE), %rdi
-	subq	%rdi, %rdx
-#  endif
+	addq	%rcx, %rax
 # else
-	addq	$(VEC_SIZE + (VEC_SIZE * 4 - BASE_OFFSET)), %rdi
-	andq	$-(4 * VEC_SIZE), %rdi
+	subq	%rdi, %rax
 # endif
-# ifdef USE_IN_RTM
-	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
-# else
-	/* copy ymmmatch to ymm0 so we can use vpcmpeq which is not
-	   encodable with EVEX registers (ymm16-ymm31).  */
-	vmovdqa64 %YMMMATCH, %ymm0
+
+
+
+# if USE_TERN_IN_LOOP
+	/* copy VMATCH to low ymm so we can use vpcmpeq which is not
+	   encodable with EVEX registers.  NB: this is VEC_SIZE == 32
+	   only as there is no way to encode vpcmpeq with zmm0-15.  */
+	vmovdqa64 %VMATCH, %VMATCH_LO
 # endif
 
-	/* Compare 4 * VEC at a time forward.  */
-	.p2align 4
+	.p2align 4,, 11
 L(loop_4x_vec):
-	/* Two versions of the loop. One that does not require
-	   vzeroupper by not using ymm0-ymm15 and another does that require
-	   vzeroupper because it uses ymm0-ymm15. The reason why ymm0-ymm15
-	   is used at all is because there is no EVEX encoding vpcmpeq and
-	   with vpcmpeq this loop can be performed more efficiently. The
-	   non-vzeroupper version is safe for RTM while the vzeroupper
-	   version should be prefered if RTM are not supported.  */
-# ifdef USE_IN_RTM
-	/* It would be possible to save some instructions using 4x VPCMP
-	   but bottleneck on port 5 makes it not woth it.  */
-	VPCMP	$4, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k1
-	/* xor will set bytes match esi to zero.  */
-	vpxorq	(VEC_SIZE * 5)(%rdi), %YMMMATCH, %YMM2
-	vpxorq	(VEC_SIZE * 6)(%rdi), %YMMMATCH, %YMM3
-	VPCMP	$0, (VEC_SIZE * 7)(%rdi), %YMMMATCH, %k3
-	/* Reduce VEC2 / VEC3 with min and VEC1 with zero mask.  */
-	VPMINU	%YMM2, %YMM3, %YMM3{%k1}{z}
-	VPCMP	$0, %YMM3, %YMMZERO, %k2
-# else
+	/* Two versions of the loop.  One that does not require
+	   vzeroupper by not using ymm0-15 and another that does
+	   require vzeroupper because it uses ymm0-15.  The reason why
+	   ymm0-15 is used at all is because there is no EVEX encoding
+	   of vpcmpeq and with vpcmpeq this loop can be performed more
+	   efficiently.  The non-vzeroupper version is safe for RTM
+	   while the vzeroupper version should be preferred if RTM is
+	   not supported.  Which loop version we use is determined by
+	   USE_TERN_IN_LOOP.  */
+
+# if USE_TERN_IN_LOOP
 	/* Since vptern can only take 3x vectors fastest to do 1 vec
 	   seperately with EVEX vpcmp.  */
 #  ifdef USE_AS_WMEMCHR
 	/* vptern can only accept masks for epi32/epi64 so can only save
-	   instruction using not equals mask on vptern with wmemchr.  */
-	VPCMP	$4, (%rdi), %YMMMATCH, %k1
+	   an instruction using the not-equals mask on vptern with
+	   wmemchr.  */
+	VPCMP	$4, (VEC_SIZE * 0)(%rdi), %VMATCH, %k1
 #  else
-	VPCMP	$0, (%rdi), %YMMMATCH, %k1
+	VPCMPEQ	(VEC_SIZE * 0)(%rdi), %VMATCH, %k1
 #  endif
 	/* Compare 3x with vpcmpeq and or them all together with vptern.
 	 */
-	VPCMPEQ	VEC_SIZE(%rdi), %ymm0, %ymm2
-	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %ymm0, %ymm3
-	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %ymm0, %ymm4
+	VPCMPEQ	(VEC_SIZE * 1)(%rdi), %VMATCH_LO, %VMM_lo(2)
+	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %VMATCH_LO, %VMM_lo(3)
+	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %VMATCH_LO, %VMM_lo(4)
 #  ifdef USE_AS_WMEMCHR
-	/* This takes the not of or between ymm2, ymm3, ymm4 as well as
-	   combines result from VEC0 with zero mask.  */
-	vpternlogd $1, %ymm2, %ymm3, %ymm4{%k1}{z}
-	vpmovmskb %ymm4, %ecx
+	/* This takes the not of or between VEC_lo(2), VEC_lo(3),
+	   VEC_lo(4) as well as combines result from VEC(0) with zero
+	   mask.  */
+	vpternlogd $1, %VMM_lo(2), %VMM_lo(3), %VMM_lo(4){%k1}{z}
+	vpmovmskb %VMM_lo(4), %VRCX
 #  else
-	/* 254 is mask for oring ymm2, ymm3, ymm4 into ymm4.  */
-	vpternlogd $254, %ymm2, %ymm3, %ymm4
-	vpmovmskb %ymm4, %ecx
-	kmovd	%k1, %eax
+	/* 254 is mask for oring VEC_lo(2), VEC_lo(3), VEC_lo(4) into
+	   VEC_lo(4).  */
+	vpternlogd $254, %VMM_lo(2), %VMM_lo(3), %VMM_lo(4)
+	vpmovmskb %VMM_lo(4), %VRCX
+	KMOV	%k1, %edx
 #  endif
-# endif
 
-# ifdef USE_AS_RAWMEMCHR
-	subq	$-(VEC_SIZE * 4), %rdi
-# endif
-# ifdef USE_IN_RTM
-	kortestd %k2, %k3
 # else
-#  ifdef USE_AS_WMEMCHR
-	/* ecx contains not of matches. All 1s means no matches. incl will
-	   overflow and set zeroflag if that is the case.  */
-	incl	%ecx
-#  else
-	/* If either VEC1 (eax) or VEC2-VEC4 (ecx) are not zero. Adding
-	   to ecx is not an issue because if eax is non-zero it will be
-	   used for returning the match. If it is zero the add does
-	   nothing.  */
-	addq	%rax, %rcx
-#  endif
+	/* Loop version that uses EVEX encoding.  */
+	VPCMP	$4, (VEC_SIZE * 0)(%rdi), %VMATCH, %k1
+	vpxorq	(VEC_SIZE * 1)(%rdi), %VMATCH, %VMM(2)
+	vpxorq	(VEC_SIZE * 2)(%rdi), %VMATCH, %VMM(3)
+	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %VMATCH, %k3
+	VPMINU	%VMM(2), %VMM(3), %VMM(3){%k1}{z}
+	VPTESTN	%VMM(3), %VMM(3), %k2
 # endif
-# ifdef USE_AS_RAWMEMCHR
-	jz	L(loop_4x_vec)
-# else
-	jnz	L(loop_4x_vec_end)
+
+
+	TEST_END ()
+	jnz	L(loop_vec_ret)
 
 	subq	$-(VEC_SIZE * 4), %rdi
 
-	subq	$(CHAR_PER_VEC * 4), %rdx
-	ja	L(loop_4x_vec)
+	subq	$(CHAR_PER_VEC * 4), %rax
+	jae	L(loop_4x_vec)
 
-	/* Fall through into less than 4 remaining vectors of length case.
+	/* COND_VZEROUPPER is vzeroupper if we use the VEX encoded loop.
 	 */
-	VPCMP	$0, BASE_OFFSET(%rdi), %YMMMATCH, %k0
-	addq	$(BASE_OFFSET - VEC_SIZE), %rdi
-	kmovd	%k0, %eax
-	VZEROUPPER
-
-L(last_4x_vec_or_less):
-	/* Check if first VEC contained match.  */
-	testl	%eax, %eax
-	jnz	L(first_vec_x1_check)
+	COND_VZEROUPPER
 
-	/* If remaining length > CHAR_PER_VEC * 2.  */
-	addl	$(CHAR_PER_VEC * 2), %edx
-	jg	L(last_4x_vec)
-
-L(last_2x_vec):
-	/* If remaining length < CHAR_PER_VEC.  */
-	addl	$CHAR_PER_VEC, %edx
-	jle	L(zero_end)
-
-	/* Check VEC2 and compare any match with remaining length.  */
-	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
-	kmovd	%k0, %eax
-	tzcntl	%eax, %eax
-	cmpl	%eax, %edx
-	jbe	L(set_zero_end)
-	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
-L(zero_end):
-	ret
+	.p2align 4,, 10
+L(last_4x_vec):
+	/* For CHAR_PER_VEC == 64 we don't need to mask as we use 8-bit
+	   instructions on eax from here on out.  */
+# if CHAR_PER_VEC != 64
+	andl	$(CHAR_PER_VEC * 4 - 1), %eax
+# endif
+	VPCMPEQ	(VEC_SIZE * 0)(%rdi), %VMATCH, %k0
+	subq	$(VEC_SIZE * 1), %rdi
+	KMOV	%k0, %VRDX
+	cmpb	$(CHAR_PER_VEC * 2 - 1), %al
+	jbe	L(last_2x_vec)
+	test	%VRDX, %VRDX
+	jnz	L(last_vec_x1_novzero)
+
+	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %VMATCH, %k0
+	KMOV	%k0, %VRDX
+	test	%VRDX, %VRDX
+	jnz	L(last_vec_x2_novzero)
+
+	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %VMATCH, %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(first_vec_x3_check)
+
+	subb	$(CHAR_PER_VEC * 3), %al
+	jae	L(last_vec_check)
 
-L(set_zero_end):
 	xorl	%eax, %eax
 	ret
 
-	.p2align 4
-L(first_vec_x1_check):
-	/* eax must be non-zero. Use bsfl to save code size.  */
-	bsfl	%eax, %eax
-	/* Adjust length.  */
-	subl	$-(CHAR_PER_VEC * 4), %edx
-	/* Check if match within remaining length.  */
-	cmpl	%eax, %edx
-	jbe	L(set_zero_end)
-	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
-	leaq	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
+# if defined USE_AS_WMEMCHR && USE_TERN_IN_LOOP
+L(last_vec_x2_novzero):
+	addq	$VEC_SIZE, %rdi
+L(last_vec_x1_novzero):
+	bsf	%VRDX, %VRDX
+	leaq	(VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %rax
 	ret
+# endif
 
-	.p2align 4
-L(loop_4x_vec_end):
+# if CHAR_PER_VEC == 64
+	/* Since we can't combine the last 2x VEC when CHAR_PER_VEC ==
+	   64 it needs a separate return label.  */
+	.p2align 4,, 4
+L(last_vec_x2):
+L(last_vec_x2_novzero):
+	bsf	%VRDX, %VRDX
+	leaq	(VEC_SIZE * 2)(%rdi, %rdx, TERN_CHAR_MULT), %rax
+	ret
 # endif
-	/* rawmemchr will fall through into this if match was found in
-	   loop.  */
 
-# if defined USE_IN_RTM || defined USE_AS_WMEMCHR
-	/* k1 has not of matches with VEC1.  */
-	kmovd	%k1, %eax
-#  ifdef USE_AS_WMEMCHR
-	subl	$((1 << CHAR_PER_VEC) - 1), %eax
-#  else
-	incl	%eax
-#  endif
+	.p2align 4,, 4
+L(loop_vec_ret):
+# if defined USE_AS_WMEMCHR || !USE_TERN_IN_LOOP
+	KMOV	%k1, %VRAX
+	inc	%MASK_GPR(rax)
 # else
-	/* eax already has matches for VEC1.  */
-	testl	%eax, %eax
+	test	%VRDX, %VRDX
 # endif
-	jnz	L(last_vec_x1_return)
+	jnz	L(last_vec_x0)
 
-# ifdef USE_IN_RTM
-	VPCMP	$0, %YMM2, %YMMZERO, %k0
-	kmovd	%k0, %eax
+
+# if USE_TERN_IN_LOOP
+	vpmovmskb %VMM_lo(2), %VRDX
 # else
-	vpmovmskb %ymm2, %eax
+	VPTESTN	%VMM(2), %VMM(2), %k1
+	KMOV	%k1, %VRDX
 # endif
-	testl	%eax, %eax
-	jnz	L(last_vec_x2_return)
+	test	%VRDX, %VRDX
+	jnz	L(last_vec_x1)
 
-# ifdef USE_IN_RTM
-	kmovd	%k2, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x3_return)
 
-	kmovd	%k3, %eax
-	tzcntl	%eax, %eax
-	leaq	(VEC_SIZE * 3 + RET_OFFSET)(%rdi, %rax, CHAR_SIZE), %rax
+# if USE_TERN_IN_LOOP
+	vpmovmskb %VMM_lo(3), %VRDX
 # else
-	vpmovmskb %ymm3, %eax
-	/* Combine matches in VEC3 (eax) with matches in VEC4 (ecx).  */
-	salq	$VEC_SIZE, %rcx
-	orq	%rcx, %rax
-	tzcntq	%rax, %rax
-	leaq	(VEC_SIZE * 2 + RET_OFFSET)(%rdi, %rax), %rax
-	VZEROUPPER
+	KMOV	%k2, %VRDX
 # endif
-	ret
 
-	.p2align 4,, 10
-L(last_vec_x1_return):
-	tzcntl	%eax, %eax
-# if defined USE_AS_WMEMCHR || RET_OFFSET != 0
-	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
-	leaq	RET_OFFSET(%rdi, %rax, CHAR_SIZE), %rax
+	/* No longer need any of the lo vecs (ymm0-15) so vzeroupper
+	   (only if we used the VEX encoded loop).  */
+	COND_VZEROUPPER
+
+	/* Separate logic for CHAR_PER_VEC == 64 vs the rest.  For
+	   CHAR_PER_VEC == 64 we test the last 2x VEC separately, for
+	   CHAR_PER_VEC <= 32 we can combine the results from the 2x
+	   VEC in a single GPR.  */
+# if CHAR_PER_VEC == 64
+#  if USE_TERN_IN_LOOP
+#   error "Unsupported"
+#  endif
+
+
+	/* If CHAR_PER_VEC == 64 we can't combine the last two VEC.  */
+	test	%VRDX, %VRDX
+	jnz	L(last_vec_x2)
+	KMOV	%k3, %VRDX
 # else
-	addq	%rdi, %rax
+	/* CHAR_PER_VEC <= 32 so we can combine the results from the
+	   last 2x VEC.  */
+
+#  if !USE_TERN_IN_LOOP
+	KMOV	%k3, %VRCX
+#  endif
+	salq	$(VEC_SIZE / TERN_CHAR_MULT), %rcx
+	addq	%rcx, %rdx
+#  if !defined USE_AS_WMEMCHR || !USE_TERN_IN_LOOP
+L(last_vec_x2_novzero):
+#  endif
 # endif
-	VZEROUPPER
+	bsf	%rdx, %rdx
+	leaq	(LAST_VEC_OFFSET)(%rdi, %rdx, TERN_CHAR_MULT), %rax
 	ret
 
-	.p2align 4
-L(last_vec_x2_return):
-	tzcntl	%eax, %eax
-	/* NB: Multiply bytes by RET_SCALE to get the wchar_t count
-	   if relevant (RET_SCALE = CHAR_SIZE if USE_AS_WMEMCHAR and
-	   USE_IN_RTM are both defined. Otherwise RET_SCALE = 1.  */
-	leaq	(VEC_SIZE + RET_OFFSET)(%rdi, %rax, RET_SCALE), %rax
-	VZEROUPPER
+	.p2align 4,, 8
+L(last_vec_x1):
+	COND_VZEROUPPER
+# if !defined USE_AS_WMEMCHR || !USE_TERN_IN_LOOP
+L(last_vec_x1_novzero):
+# endif
+	bsf	%VRDX, %VRDX
+	leaq	(VEC_SIZE * 1)(%rdi, %rdx, TERN_CHAR_MULT), %rax
 	ret
 
-# ifdef USE_IN_RTM
-	.p2align 4
-L(last_vec_x3_return):
-	tzcntl	%eax, %eax
-	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
-	leaq	(VEC_SIZE * 2 + RET_OFFSET)(%rdi, %rax, CHAR_SIZE), %rax
+
+	.p2align 4,, 4
+L(last_vec_x0):
+	COND_VZEROUPPER
+	bsf	%VGPR(GPR_X0), %VGPR(GPR_X0)
+# if GPR_X0_IS_RET
+	addq	%rdi, %rax
+# else
+	leaq	(%rdi, %GPR_X0, CHAR_SIZE), %rax
+# endif
 	ret
+
+	.p2align 4,, 6
+L(page_cross):
+	/* Need to preserve eax to compute the number of in-bounds
+	   bytes we are checking.  */
+# ifdef USE_AS_WMEMCHR
+	movl	%eax, %ecx
+# else
+	xorl	%ecx, %ecx
+	subl	%eax, %ecx
 # endif
 
-# ifndef USE_AS_RAWMEMCHR
-	.p2align 4,, 5
-L(last_4x_vec_or_less_cmpeq):
-	VPCMP	$0, (VEC_SIZE * 5)(%rdi), %YMMMATCH, %k0
-	kmovd	%k0, %eax
-	subq	$-(VEC_SIZE * 4), %rdi
-	/* Check first VEC regardless.  */
-	testl	%eax, %eax
-	jnz	L(first_vec_x1_check)
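+	/* eax still has the low page-offset bits of rdi, so the xor
+	   clears them, leaving rax page aligned.  */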
+	xorq	%rdi, %rax
+	VPCMPEQ	(PAGE_SIZE - VEC_SIZE)(%rax), %VMATCH, %k0
+	KMOV	%k0, %VRAX
 
-	/* If remaining length <= CHAR_PER_VEC * 2.  */
-	addl	$(CHAR_PER_VEC * 2), %edx
-	jle	L(last_2x_vec)
+# ifdef USE_AS_WMEMCHR
+	/* NB: Divide by CHAR_SIZE to shift out out-of-bounds bytes.  */
+	shrl	$2, %ecx
+	andl	$(CHAR_PER_VEC - 1), %ecx
+# endif
 
-	.p2align 4
-L(last_4x_vec):
-	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
-	kmovd	%k0, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x2)
 
+	shrx	%VGPR(PC_SHIFT_GPR), %VRAX, %VRAX
 
-	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
-	kmovd	%k0, %eax
-	/* Create mask for possible matches within remaining length.  */
-#  ifdef USE_AS_WMEMCHR
-	movl	$((1 << (CHAR_PER_VEC * 2)) - 1), %ecx
-	bzhil	%edx, %ecx, %ecx
-#  else
-	movq	$-1, %rcx
-	bzhiq	%rdx, %rcx, %rcx
-#  endif
-	/* Test matches in data against length match.  */
-	andl	%ecx, %eax
-	jnz	L(last_vec_x3)
+# ifdef USE_AS_WMEMCHR
+	negl	%ecx
+# endif
 
-	/* if remaining length <= CHAR_PER_VEC * 3 (Note this is after
-	   remaining length was found to be > CHAR_PER_VEC * 2.  */
-	subl	$CHAR_PER_VEC, %edx
-	jbe	L(zero_end2)
+	/* Mask lower bits from ecx (negated eax) to get the number of
+	   chars until the next VEC.  */
+	andl	$(CHAR_PER_VEC - 1), %ecx
 
+	/* Check if VEC is entirely contained in the remainder of the
+	   page.  */
+	cmpq	%rcx, %rdx
+	jbe	L(page_cross_ret)
 
-	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
-	kmovd	%k0, %eax
-	/* Shift remaining length mask for last VEC.  */
-#  ifdef USE_AS_WMEMCHR
-	shrl	$CHAR_PER_VEC, %ecx
-#  else
-	shrq	$CHAR_PER_VEC, %rcx
-#  endif
-	andl	%ecx, %eax
-	jz	L(zero_end2)
-	bsfl	%eax, %eax
-	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
-L(zero_end2):
-	ret
+	/* Length crosses the page so if rax is zero (no matches)
+	   continue.  */
+	test	%VRAX, %VRAX
+	jz	L(page_cross_continue)
 
-L(last_vec_x2):
-	tzcntl	%eax, %eax
-	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+	/* If rdx > rcx then any match here must be in [buf:buf + len].
+	 */
+	tzcnt	%VRAX, %VRAX
+# ifdef USE_AS_WMEMCHR
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+# else
+	addq	%rdi, %rax
+# endif
 	ret
 
-	.p2align 4
-L(last_vec_x3):
-	tzcntl	%eax, %eax
-	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+	.p2align 4,, 2
+L(page_cross_zero):
+	xorl	%eax, %eax
 	ret
+
+	.p2align 4,, 4
+L(page_cross_ret):
+	/* Search is entirely contained in page cross case.  */
+# ifdef USE_AS_WMEMCHR
+	test	%VRAX, %VRAX
+	jz	L(page_cross_zero)
+# endif
+	tzcnt	%VRAX, %VRAX
+	cmpl	%eax, %edx
+	jbe	L(page_cross_zero)
+# ifdef USE_AS_WMEMCHR
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+# else
+	addq	%rdi, %rax
 # endif
-	/* 7 bytes from next cache line.  */
+	ret
 END (MEMCHR)
 #endif
diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S b/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S
index deda1ca395..2073eaa620 100644
--- a/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S
+++ b/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S
@@ -1,3 +1,6 @@
-#define MEMCHR __rawmemchr_evex_rtm
-#define USE_AS_RAWMEMCHR 1
-#include "memchr-evex-rtm.S"
+#define RAWMEMCHR	__rawmemchr_evex_rtm
+
+#define USE_IN_RTM	1
+#define SECTION(p)	p##.evex.rtm
+
+#include "rawmemchr-evex.S"
diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex.S b/sysdeps/x86_64/multiarch/rawmemchr-evex.S
index dc1c450699..dad54def2b 100644
--- a/sysdeps/x86_64/multiarch/rawmemchr-evex.S
+++ b/sysdeps/x86_64/multiarch/rawmemchr-evex.S
@@ -1,7 +1,308 @@
-#ifndef RAWMEMCHR
-# define RAWMEMCHR	__rawmemchr_evex
-#endif
-#define USE_AS_RAWMEMCHR	1
-#define MEMCHR	RAWMEMCHR
+/* rawmemchr optimized with 256-bit EVEX instructions.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <isa-level.h>
+#include <sysdep.h>
+
+#if ISA_SHOULD_BUILD (4)
+
+# ifndef VEC_SIZE
+#  include "x86-evex256-vecs.h"
+# endif
+
+# ifndef RAWMEMCHR
+#  define RAWMEMCHR	__rawmemchr_evex
+# endif
+
+
+# define PC_SHIFT_GPR	rdi
+# define REG_WIDTH	VEC_SIZE
+# define VPTESTN	vptestnmb
+# define VPBROADCAST	vpbroadcastb
+# define VPMINU	vpminub
+# define VPCMP	vpcmpb
+# define VPCMPEQ	vpcmpeqb
+# define CHAR_SIZE	1
+
+# include "reg-macros.h"
+
+/* If not in RTM and VEC_SIZE != 64 (VEC_SIZE == 64 has no VEX
+   encoding), use VEX encoding in the loop so we can use vpcmpeqb
+   + vptern, which is more efficient than the EVEX alternative.  */
+# if defined USE_IN_RTM || VEC_SIZE == 64
+#  undef COND_VZEROUPPER
+#  undef VZEROUPPER_RETURN
+#  undef VZEROUPPER
+
+
+#  define COND_VZEROUPPER
+#  define VZEROUPPER_RETURN	ret
+#  define VZEROUPPER
+
+#  define USE_TERN_IN_LOOP	0
+# else
+#  define USE_TERN_IN_LOOP	1
+#  undef VZEROUPPER
+#  define VZEROUPPER	vzeroupper
+# endif
+
+# define CHAR_PER_VEC	VEC_SIZE
+
+# if CHAR_PER_VEC == 64
+
+#  define TAIL_RETURN_LBL	first_vec_x2
+#  define TAIL_RETURN_OFFSET	(CHAR_PER_VEC * 2)
+
+#  define FALLTHROUGH_RETURN_LBL	first_vec_x3
+#  define FALLTHROUGH_RETURN_OFFSET	(CHAR_PER_VEC * 3)
+
+# else	/* !(CHAR_PER_VEC == 64) */
+
+#  define TAIL_RETURN_LBL	first_vec_x3
+#  define TAIL_RETURN_OFFSET	(CHAR_PER_VEC * 3)
+
+#  define FALLTHROUGH_RETURN_LBL	first_vec_x2
+#  define FALLTHROUGH_RETURN_OFFSET	(CHAR_PER_VEC * 2)
+# endif	/* !(CHAR_PER_VEC == 64) */
+
+
+# define VMATCH	VMM(0)
+# define VMATCH_LO	VMM_lo(0)
+
+# define PAGE_SIZE	4096
+
+	.section SECTION(.text), "ax", @progbits
+ENTRY_P2ALIGN (RAWMEMCHR, 6)
+	VPBROADCAST %esi, %VMATCH
+	/* Check if we may cross page boundary with one vector load.  */
+	movl	%edi, %eax
+	andl	$(PAGE_SIZE - 1), %eax
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	ja	L(page_cross)
+
+	VPCMPEQ	(%rdi), %VMATCH, %k0
+	KMOV	%k0, %VRAX
+
+	test	%VRAX, %VRAX
+	jz	L(aligned_more)
+L(first_vec_x0):
+	bsf	%VRAX, %VRAX
+	addq	%rdi, %rax
+	ret
+
+	.p2align 4,, 4
+L(first_vec_x4):
+	bsf	%VRAX, %VRAX
+	leaq	(VEC_SIZE * 4)(%rdi, %rax), %rax
+	ret
 
-#include "memchr-evex.S"
+	/* For VEC_SIZE == 32 we can fit this in the aligning bytes so
+	   might as well place it more locally.  For VEC_SIZE == 64 we
+	   reuse the return code at the end of the loop's return.  */
+# if VEC_SIZE == 32
+	.p2align 4,, 4
+L(FALLTHROUGH_RETURN_LBL):
+	bsf	%VRAX, %VRAX
+	leaq	(FALLTHROUGH_RETURN_OFFSET)(%rdi, %rax), %rax
+	ret
+# endif
+
+	.p2align 4,, 6
+L(page_cross):
+	/* eax has lower page-offset bits of rdi so xor will zero them
+	   out.  */
+	xorq	%rdi, %rax
+	VPCMPEQ	(PAGE_SIZE - VEC_SIZE)(%rax), %VMATCH, %k0
+	KMOV	%k0, %VRAX
+
+	/* Shift out out-of-bounds matches.  */
+	shrx	%VRDI, %VRAX, %VRAX
+	test	%VRAX, %VRAX
+	jnz	L(first_vec_x0)
+
+	.p2align 4,, 10
+L(aligned_more):
+L(page_cross_continue):
+	/* Align pointer.  */
+	andq	$(VEC_SIZE * -1), %rdi
+
+	VPCMPEQ	VEC_SIZE(%rdi), %VMATCH, %k0
+	KMOV	%k0, %VRAX
+	test	%VRAX, %VRAX
+	jnz	L(first_vec_x1)
+
+	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %VMATCH, %k0
+	KMOV	%k0, %VRAX
+	test	%VRAX, %VRAX
+	jnz	L(first_vec_x2)
+
+	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %VMATCH, %k0
+	KMOV	%k0, %VRAX
+	test	%VRAX, %VRAX
+	jnz	L(first_vec_x3)
+
+	VPCMPEQ	(VEC_SIZE * 4)(%rdi), %VMATCH, %k0
+	KMOV	%k0, %VRAX
+	test	%VRAX, %VRAX
+	jnz	L(first_vec_x4)
+
+	subq	$-(VEC_SIZE * 1), %rdi
+# if VEC_SIZE == 64
+	/* Saves code size.  No evex512 processor has partial register
+	   stalls.  If that changes, this can be replaced with `andq
+	   $-(VEC_SIZE * 4), %rdi`.  */
+	xorb	%dil, %dil
+# else
+	andq	$-(VEC_SIZE * 4), %rdi
+# endif
+
+# if USE_TERN_IN_LOOP
+	/* copy VMATCH to low ymm so we can use vpcmpeq which is not
+	   encodable with EVEX registers.  NB: this is VEC_SIZE == 32
+	   only as there is no way to encode vpcmpeq with zmm0-15.  */
+	vmovdqa64 %VMATCH, %VMATCH_LO
+# endif
+
+	.p2align 4
+L(loop_4x_vec):
+	/* Two versions of the loop.  One that does not require
+	   vzeroupper by not using ymm0-15 and another that does
+	   require vzeroupper because it uses ymm0-15.  The reason why
+	   ymm0-15 is used at all is because there is no EVEX encoding
+	   of vpcmpeq and with vpcmpeq this loop can be performed more
+	   efficiently.  The non-vzeroupper version is safe for RTM
+	   while the vzeroupper version should be preferred if RTM is
+	   not supported.  Which loop version we use is determined by
+	   USE_TERN_IN_LOOP.  */
+
+# if USE_TERN_IN_LOOP
+	/* Since vptern can only take 3x vectors it is fastest to do 1
+	   vec separately with EVEX vpcmp.  */
+	VPCMPEQ	(VEC_SIZE * 4)(%rdi), %VMATCH, %k1
+	/* Compare 3x with vpcmpeq and or them all together with vptern.
+	 */
+
+	VPCMPEQ	(VEC_SIZE * 5)(%rdi), %VMATCH_LO, %VMM_lo(2)
+	subq	$(VEC_SIZE * -4), %rdi
+	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %VMATCH_LO, %VMM_lo(3)
+	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %VMATCH_LO, %VMM_lo(4)
+
+	/* 254 is mask for oring VEC_lo(2), VEC_lo(3), VEC_lo(4) into
+	   VEC_lo(4).  */
+	vpternlogd $254, %VMM_lo(2), %VMM_lo(3), %VMM_lo(4)
+	vpmovmskb %VMM_lo(4), %VRCX
+
+	KMOV	%k1, %eax
+
+	/* NB: rax has matches from the first VEC and rcx has matches
+	   from VEC 2-4.  If rax is non-zero we will return that match.
+	   If rax is zero adding won't disturb the bits in rcx.  */
+	add	%rax, %rcx
+# else
+	/* Loop version that uses EVEX encoding.  */
+	VPCMP	$4, (VEC_SIZE * 4)(%rdi), %VMATCH, %k1
+	vpxorq	(VEC_SIZE * 5)(%rdi), %VMATCH, %VMM(2)
+	vpxorq	(VEC_SIZE * 6)(%rdi), %VMATCH, %VMM(3)
+	VPCMPEQ	(VEC_SIZE * 7)(%rdi), %VMATCH, %k3
+	VPMINU	%VMM(2), %VMM(3), %VMM(3){%k1}{z}
+	VPTESTN	%VMM(3), %VMM(3), %k2
+	subq	$(VEC_SIZE * -4), %rdi
+	KORTEST %k2, %k3
+# endif
+	jz	L(loop_4x_vec)
+
+# if USE_TERN_IN_LOOP
+	test	%VRAX, %VRAX
+# else
+	KMOV	%k1, %VRAX
+	inc	%VRAX
+# endif
+	jnz	L(last_vec_x0)
+
+
+# if USE_TERN_IN_LOOP
+	vpmovmskb %VMM_lo(2), %VRAX
+# else
+	VPTESTN	%VMM(2), %VMM(2), %k1
+	KMOV	%k1, %VRAX
+# endif
+	test	%VRAX, %VRAX
+	jnz	L(last_vec_x1)
+
+
+# if USE_TERN_IN_LOOP
+	vpmovmskb %VMM_lo(3), %VRAX
+# else
+	KMOV	%k2, %VRAX
+# endif
+
+	/* No longer need any of the lo vecs (ymm0-15) so vzeroupper
+	   (only if we used the VEX encoded loop).  */
+	COND_VZEROUPPER
+
+	/* Separate logic for VEC_SIZE == 64 and VEC_SIZE == 32 for
+	   returning the last 2x VEC.  For VEC_SIZE == 64 we test each
+	   VEC individually, for VEC_SIZE == 32 we combine them in a
+	   single 64-bit GPR.  */
+# if CHAR_PER_VEC == 64
+#  if USE_TERN_IN_LOOP
+#   error "Unsupported"
+#  endif
+
+
+	/* If CHAR_PER_VEC == 64 we can't combine the last two VEC.  */
+	test	%VRAX, %VRAX
+	jnz	L(first_vec_x2)
+	KMOV	%k3, %VRAX
+L(FALLTHROUGH_RETURN_LBL):
+# else
+	/* CHAR_PER_VEC <= 32 so we can combine the results from the
+	   last 2x VEC.  */
+#  if !USE_TERN_IN_LOOP
+	KMOV	%k3, %VRCX
+#  endif
+	salq	$CHAR_PER_VEC, %rcx
+	addq	%rcx, %rax
+# endif
+	bsf	%rax, %rax
+	leaq	(FALLTHROUGH_RETURN_OFFSET)(%rdi, %rax), %rax
+	ret
+
+	.p2align 4,, 8
+L(TAIL_RETURN_LBL):
+	bsf	%rax, %rax
+	leaq	(TAIL_RETURN_OFFSET)(%rdi, %rax), %rax
+	ret
+
+	.p2align 4,, 8
+L(last_vec_x1):
+	COND_VZEROUPPER
+L(first_vec_x1):
+	bsf	%VRAX, %VRAX
+	leaq	(VEC_SIZE * 1)(%rdi, %rax), %rax
+	ret
+
+	.p2align 4,, 8
+L(last_vec_x0):
+	COND_VZEROUPPER
+	bsf	%VRAX, %VRAX
+	addq	%rdi, %rax
+	ret
+END (RAWMEMCHR)
+#endif
-- 
2.34.1



* [PATCH v1 2/7] x86: Shrink / minorly optimize strchr-evex and implement with VMM headers
  2022-10-18  2:48 [PATCH v1 1/7] x86: Optimize memchr-evex.S and implement with VMM headers Noah Goldstein
@ 2022-10-18  2:48 ` Noah Goldstein
  2022-10-18  2:51   ` Noah Goldstein
  2022-10-18  2:48 ` [PATCH v1 3/7] x86: Optimize strnlen-evex.S " Noah Goldstein
                   ` (7 subsequent siblings)
  8 siblings, 1 reply; 41+ messages in thread
From: Noah Goldstein @ 2022-10-18  2:48 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

Size Optimizations:
1. Condense the hot path for better cache-locality.
    - This has the most impact for strchrnul, where the logic for
      strings with len <= VEC_SIZE or with a match in the first VEC
      now fits entirely in the first cache line.
2. Reuse common targets in first 4x VEC and after the loop.
3. Don't align targets so aggressively if it doesn't change the number
   of fetch blocks required, and take more care to avoid cases where
   targets unnecessarily split cache lines.
4. Align the loop better for DSB/LSD
5. Use more code-size efficient instructions (see the encoding-size
   sketch after this list).
	- tzcnt ...     -> bsf ...
	- vpcmpb $0 ... -> vpcmpeq ...
6. Align labels less aggressively, especially if doing so doesn't save
   fetch blocks or would cause the basic block to span extra cache
   lines.
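
As a rough encoding-size reference for point 5 (byte counts are for
the common reg-reg forms, not measured from this patch):

	tzcnt	%ecx, %eax	# F3 0F BC /r -> 4 bytes
	bsf	%ecx, %eax	#    0F BC /r -> 3 bytes (no F3 prefix)
	vpcmpb	$0, %ymm1, %ymm2, %k0	# EVEX + opcode + ModRM + imm8
	vpcmpeqb %ymm1, %ymm2, %k0	# same encoding minus the imm8 byte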

Code Size Changes:
strchr-evex.S	: -63 bytes
strchrnul-evex.S: -48 bytes

Net perf changes:
Reported as the geometric mean of all improvements / regressions from
N=10 runs of the benchtests. Values are New Time / Old Time, so < 1.0
is an improvement and > 1.0 is a regression.

strchr-evex.S (Fixed)   : 0.971
strchr-evex.S (Rand)    : 0.932
strchrnul-evex.S        : 0.965

Full results attached in email.

Full check passes on x86-64.
---
 sysdeps/x86_64/multiarch/strchr-evex.S | 558 +++++++++++++++----------
 1 file changed, 340 insertions(+), 218 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/strchr-evex.S b/sysdeps/x86_64/multiarch/strchr-evex.S
index a1c15c4419..c2a0d112f7 100644
--- a/sysdeps/x86_64/multiarch/strchr-evex.S
+++ b/sysdeps/x86_64/multiarch/strchr-evex.S
@@ -26,48 +26,75 @@
 #  define STRCHR	__strchr_evex
 # endif
 
-# define VMOVU		vmovdqu64
-# define VMOVA		vmovdqa64
+# ifndef VEC_SIZE
+#  include "x86-evex256-vecs.h"
+# endif
 
 # ifdef USE_AS_WCSCHR
 #  define VPBROADCAST	vpbroadcastd
-#  define VPCMP		vpcmpd
+#  define VPCMP	vpcmpd
+#  define VPCMPEQ	vpcmpeqd
 #  define VPTESTN	vptestnmd
+#  define VPTEST	vptestmd
 #  define VPMINU	vpminud
 #  define CHAR_REG	esi
-#  define SHIFT_REG	ecx
+#  define SHIFT_REG	rcx
 #  define CHAR_SIZE	4
+
+#  define USE_WIDE_CHAR
 # else
 #  define VPBROADCAST	vpbroadcastb
-#  define VPCMP		vpcmpb
+#  define VPCMP	vpcmpb
+#  define VPCMPEQ	vpcmpeqb
 #  define VPTESTN	vptestnmb
+#  define VPTEST	vptestmb
 #  define VPMINU	vpminub
 #  define CHAR_REG	sil
-#  define SHIFT_REG	edx
+#  define SHIFT_REG	rdi
 #  define CHAR_SIZE	1
 # endif
 
-# define XMMZERO	xmm16
-
-# define YMMZERO	ymm16
-# define YMM0		ymm17
-# define YMM1		ymm18
-# define YMM2		ymm19
-# define YMM3		ymm20
-# define YMM4		ymm21
-# define YMM5		ymm22
-# define YMM6		ymm23
-# define YMM7		ymm24
-# define YMM8		ymm25
-
-# define VEC_SIZE 32
-# define PAGE_SIZE 4096
-# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
-
-	.section .text.evex,"ax",@progbits
-ENTRY_P2ALIGN (STRCHR, 5)
-	/* Broadcast CHAR to YMM0.	*/
-	VPBROADCAST	%esi, %YMM0
+# include "reg-macros.h"
+
+# if VEC_SIZE == 64
+#  define MASK_GPR	rcx
+#  define LOOP_REG	rax
+
+#  define COND_MASK(k_reg)	{%k_reg}
+# else
+#  define MASK_GPR	rax
+#  define LOOP_REG	rdi
+
+#  define COND_MASK(k_reg)
+# endif
+
+# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
+
+
+# if CHAR_PER_VEC == 64
+#  define LAST_VEC_OFFSET	(VEC_SIZE * 3)
+#  define TESTZ(reg)	incq %VGPR_SZ(reg, 64)
+# else
+
+#  if CHAR_PER_VEC == 32
+#   define TESTZ(reg)	incl %VGPR_SZ(reg, 32)
+#  elif CHAR_PER_VEC == 16
+#   define TESTZ(reg)	incw %VGPR_SZ(reg, 16)
+#  else
+#   define TESTZ(reg)	incb %VGPR_SZ(reg, 8)
+#  endif
+
+#  define LAST_VEC_OFFSET	(VEC_SIZE * 2)
+# endif
+
+# define VMATCH	VMM(0)
+
+# define PAGE_SIZE	4096
+
+	.section SECTION(.text), "ax", @progbits
+ENTRY_P2ALIGN (STRCHR, 6)
+	/* Broadcast CHAR to VEC_0.  */
+	VPBROADCAST %esi, %VMATCH
 	movl	%edi, %eax
 	andl	$(PAGE_SIZE - 1), %eax
 	/* Check if we cross page boundary with one vector load.
@@ -75,19 +102,27 @@ ENTRY_P2ALIGN (STRCHR, 5)
 	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
 	ja	L(cross_page_boundary)
 
+
 	/* Check the first VEC_SIZE bytes. Search for both CHAR and the
 	   null bytes.  */
-	VMOVU	(%rdi), %YMM1
-
+	VMOVU	(%rdi), %VMM(1)
 	/* Leaves only CHARS matching esi as 0.  */
-	vpxorq	%YMM1, %YMM0, %YMM2
-	VPMINU	%YMM2, %YMM1, %YMM2
-	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
-	VPTESTN	%YMM2, %YMM2, %k0
-	kmovd	%k0, %eax
-	testl	%eax, %eax
+	vpxorq	%VMM(1), %VMATCH, %VMM(2)
+	VPMINU	%VMM(2), %VMM(1), %VMM(2)
+	/* Each bit in K0 represents a CHAR or a null byte in VEC_1.  */
+	VPTESTN	%VMM(2), %VMM(2), %k0
+	KMOV	%k0, %VRAX
+# if VEC_SIZE == 64 && defined USE_AS_STRCHRNUL
+	/* If VEC_SIZE == 64 && STRCHRNUL, use bsf to test the condition
+	   so that all logic for match/null in the first VEC fits in 1x
+	   cache line.  This has a slight cost for larger sizes.  */
+	bsf	%VRAX, %VRAX
+	jz	L(aligned_more)
+# else
+	test	%VRAX, %VRAX
 	jz	L(aligned_more)
-	tzcntl	%eax, %eax
+	bsf	%VRAX, %VRAX
+# endif
 # ifndef USE_AS_STRCHRNUL
 	/* Found CHAR or the null byte.  */
 	cmp	(%rdi, %rax, CHAR_SIZE), %CHAR_REG
@@ -109,287 +144,374 @@ ENTRY_P2ALIGN (STRCHR, 5)
 # endif
 	ret
 
-
-
-	.p2align 4,, 10
-L(first_vec_x4):
-# ifndef USE_AS_STRCHRNUL
-	/* Check to see if first match was CHAR (k0) or null (k1).  */
-	kmovd	%k0, %eax
-	tzcntl	%eax, %eax
-	kmovd	%k1, %ecx
-	/* bzhil will not be 0 if first match was null.  */
-	bzhil	%eax, %ecx, %ecx
-	jne	L(zero)
-# else
-	/* Combine CHAR and null matches.  */
-	kord	%k0, %k1, %k0
-	kmovd	%k0, %eax
-	tzcntl	%eax, %eax
-# endif
-	/* NB: Multiply sizeof char type (1 or 4) to get the number of
-	   bytes.  */
-	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
-	ret
-
 # ifndef USE_AS_STRCHRNUL
 L(zero):
 	xorl	%eax, %eax
 	ret
 # endif
 
-
-	.p2align 4
+	.p2align 4,, 2
+L(first_vec_x3):
+	subq	$-(VEC_SIZE * 2), %rdi
+# if VEC_SIZE == 32
+	/* Reuse L(first_vec_x3) for last VEC2 only for VEC_SIZE == 32.
+	   For VEC_SIZE == 64 the registers don't match.  */
+L(last_vec_x2):
+# endif
 L(first_vec_x1):
 	/* Use bsf here to save 1-byte keeping keeping the block in 1x
 	   fetch block. eax guranteed non-zero.  */
-	bsfl	%eax, %eax
+	bsf	%VRCX, %VRCX
 # ifndef USE_AS_STRCHRNUL
-	/* Found CHAR or the null byte.	 */
-	cmp	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
+	/* Found CHAR or the null byte.  */
+	cmp	(VEC_SIZE)(%rdi, %rcx, CHAR_SIZE), %CHAR_REG
 	jne	L(zero)
-
 # endif
 	/* NB: Multiply sizeof char type (1 or 4) to get the number of
 	   bytes.  */
-	leaq	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
+	leaq	(VEC_SIZE)(%rdi, %rcx, CHAR_SIZE), %rax
 	ret
 
-	.p2align 4,, 10
+	.p2align 4,, 2
+L(first_vec_x4):
+	subq	$-(VEC_SIZE * 2), %rdi
 L(first_vec_x2):
 # ifndef USE_AS_STRCHRNUL
 	/* Check to see if first match was CHAR (k0) or null (k1).  */
-	kmovd	%k0, %eax
-	tzcntl	%eax, %eax
-	kmovd	%k1, %ecx
+	KMOV	%k0, %VRAX
+	tzcnt	%VRAX, %VRAX
+	KMOV	%k1, %VRCX
 	/* bzhil will not be 0 if first match was null.  */
-	bzhil	%eax, %ecx, %ecx
+	bzhi	%VRAX, %VRCX, %VRCX
 	jne	L(zero)
 # else
 	/* Combine CHAR and null matches.  */
-	kord	%k0, %k1, %k0
-	kmovd	%k0, %eax
-	tzcntl	%eax, %eax
+	KOR	%k0, %k1, %k0
+	KMOV	%k0, %VRAX
+	bsf	%VRAX, %VRAX
 # endif
 	/* NB: Multiply sizeof char type (1 or 4) to get the number of
 	   bytes.  */
 	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
 	ret
 
-	.p2align 4,, 10
-L(first_vec_x3):
-	/* Use bsf here to save 1-byte keeping keeping the block in 1x
-	   fetch block. eax guranteed non-zero.  */
-	bsfl	%eax, %eax
-# ifndef USE_AS_STRCHRNUL
-	/* Found CHAR or the null byte.	 */
-	cmp	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
-	jne	L(zero)
+# ifdef USE_AS_STRCHRNUL
+	/* We use this as a hook to get imm8 encoding for the jmp to
+	   L(cross_page_boundary_real).  This allows the hot case of a
+	   match/null-term in the first VEC to fit entirely in 1 cache
+	   line.  */
+L(cross_page_boundary):
+	jmp	L(cross_page_boundary_real)
 # endif
-	/* NB: Multiply sizeof char type (1 or 4) to get the number of
-	   bytes.  */
-	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
-	ret
 
 	.p2align 4
 L(aligned_more):
+L(cross_page_continue):
 	/* Align data to VEC_SIZE.  */
 	andq	$-VEC_SIZE, %rdi
-L(cross_page_continue):
-	/* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time since
-	   data is only aligned to VEC_SIZE. Use two alternating methods
-	   for checking VEC to balance latency and port contention.  */
 
-	/* This method has higher latency but has better port
-	   distribution.  */
-	VMOVA	(VEC_SIZE)(%rdi), %YMM1
+	/* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time
+	   since data is only aligned to VEC_SIZE. Use two alternating
+	   methods for checking VEC to balance latency and port
+	   contention.  */
+
+    /* Method(1) with 8c latency:
+	   For VEC_SIZE == 32:
+	   p0 * 1.83, p1 * 0.83, p5 * 1.33
+	   For VEC_SIZE == 64:
+	   p0 * 2.50, p1 * 0.00, p5 * 1.50  */
+	VMOVA	(VEC_SIZE)(%rdi), %VMM(1)
 	/* Leaves only CHARS matching esi as 0.  */
-	vpxorq	%YMM1, %YMM0, %YMM2
-	VPMINU	%YMM2, %YMM1, %YMM2
-	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
-	VPTESTN	%YMM2, %YMM2, %k0
-	kmovd	%k0, %eax
-	testl	%eax, %eax
+	vpxorq	%VMM(1), %VMATCH, %VMM(2)
+	VPMINU	%VMM(2), %VMM(1), %VMM(2)
+	/* Each bit in K0 represents a CHAR or a null byte in VEC_1.  */
+	VPTESTN	%VMM(2), %VMM(2), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
 	jnz	L(first_vec_x1)
 
-	/* This method has higher latency but has better port
-	   distribution.  */
-	VMOVA	(VEC_SIZE * 2)(%rdi), %YMM1
-	/* Each bit in K0 represents a CHAR in YMM1.  */
-	VPCMP	$0, %YMM1, %YMM0, %k0
-	/* Each bit in K1 represents a CHAR in YMM1.  */
-	VPTESTN	%YMM1, %YMM1, %k1
-	kortestd	%k0, %k1
+    /* Method(2) with 6c latency:
+	   For VEC_SIZE == 32:
+	   p0 * 1.00, p1 * 0.00, p5 * 2.00
+	   For VEC_SIZE == 64:
+	   p0 * 1.00, p1 * 0.00, p5 * 2.00  */
+	VMOVA	(VEC_SIZE * 2)(%rdi), %VMM(1)
+	/* Each bit in K0 represents a CHAR in VEC_1.  */
+	VPCMPEQ	%VMM(1), %VMATCH, %k0
+	/* Each bit in K1 represents a CHAR in VEC_1.  */
+	VPTESTN	%VMM(1), %VMM(1), %k1
+	KORTEST %k0, %k1
 	jnz	L(first_vec_x2)
 
-	VMOVA	(VEC_SIZE * 3)(%rdi), %YMM1
+	/* By swapping between Method 1/2 we get fairer port
+	   distribution and better throughput.  */
+
+	VMOVA	(VEC_SIZE * 3)(%rdi), %VMM(1)
 	/* Leaves only CHARS matching esi as 0.  */
-	vpxorq	%YMM1, %YMM0, %YMM2
-	VPMINU	%YMM2, %YMM1, %YMM2
-	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
-	VPTESTN	%YMM2, %YMM2, %k0
-	kmovd	%k0, %eax
-	testl	%eax, %eax
+	vpxorq	%VMM(1), %VMATCH, %VMM(2)
+	VPMINU	%VMM(2), %VMM(1), %VMM(2)
+	/* Each bit in K0 represents a CHAR or a null byte in VEC_1.  */
+	VPTESTN	%VMM(2), %VMM(2), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
 	jnz	L(first_vec_x3)
 
-	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM1
-	/* Each bit in K0 represents a CHAR in YMM1.  */
-	VPCMP	$0, %YMM1, %YMM0, %k0
-	/* Each bit in K1 represents a CHAR in YMM1.  */
-	VPTESTN	%YMM1, %YMM1, %k1
-	kortestd	%k0, %k1
+	VMOVA	(VEC_SIZE * 4)(%rdi), %VMM(1)
+	/* Each bit in K0 represents a CHAR in VEC_1.  */
+	VPCMPEQ	%VMM(1), %VMATCH, %k0
+	/* Each bit in K1 represents a CHAR in VEC_1.  */
+	VPTESTN	%VMM(1), %VMM(1), %k1
+	KORTEST %k0, %k1
 	jnz	L(first_vec_x4)
 
 	/* Align data to VEC_SIZE * 4 for the loop.  */
+# if VEC_SIZE == 64
+	/* Use rax for the loop reg as it allows the loop to fit in
+	   exactly 2 cache lines (more efficient imm32 + gpr
+	   encoding).  */
+	leaq	(VEC_SIZE)(%rdi), %rax
+	/* No partial register stalls on evex512 processors.  */
+	xorb	%al, %al
+# else
+	/* For VEC_SIZE == 32 continue using rdi for loop reg so we can
+	   reuse more code and save space.  */
 	addq	$VEC_SIZE, %rdi
 	andq	$-(VEC_SIZE * 4), %rdi
-
+# endif
 	.p2align 4
 L(loop_4x_vec):
-	/* Check 4x VEC at a time. No penalty to imm32 offset with evex
-	   encoding.  */
-	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM1
-	VMOVA	(VEC_SIZE * 5)(%rdi), %YMM2
-	VMOVA	(VEC_SIZE * 6)(%rdi), %YMM3
-	VMOVA	(VEC_SIZE * 7)(%rdi), %YMM4
-
-	/* For YMM1 and YMM3 use xor to set the CHARs matching esi to
+	/* Check 4x VEC at a time. No penalty for imm32 offset with evex
+	   encoding (if offset % VEC_SIZE == 0).  */
+	VMOVA	(VEC_SIZE * 4)(%LOOP_REG), %VMM(1)
+	VMOVA	(VEC_SIZE * 5)(%LOOP_REG), %VMM(2)
+	VMOVA	(VEC_SIZE * 6)(%LOOP_REG), %VMM(3)
+	VMOVA	(VEC_SIZE * 7)(%LOOP_REG), %VMM(4)
+
+	/* Collect bits where VEC_1 does NOT match esi.  This is later
+	   used to mask off results (collecting non-matches allows us to
+	   save an instruction on combining).  */
+	VPCMP	$4, %VMATCH, %VMM(1), %k1
+
+	/* Two methods for the loop depending on VEC_SIZE.  This is
+	   because with zmm registers VPMINU can only run on p0 (as
+	   opposed to p0/p1 for ymm) so it is less preferred.  */
+# if VEC_SIZE == 32
+	/* For VEC_2 and VEC_3 use xor to set the CHARs matching esi to
 	   zero.  */
-	vpxorq	%YMM1, %YMM0, %YMM5
-	/* For YMM2 and YMM4 cmp not equals to CHAR and store result in
-	   k register. Its possible to save either 1 or 2 instructions
-	   using cmp no equals method for either YMM1 or YMM1 and YMM3
-	   respectively but bottleneck on p5 makes it not worth it.  */
-	VPCMP	$4, %YMM0, %YMM2, %k2
-	vpxorq	%YMM3, %YMM0, %YMM7
-	VPCMP	$4, %YMM0, %YMM4, %k4
-
-	/* Use min to select all zeros from either xor or end of string).
-	 */
-	VPMINU	%YMM1, %YMM5, %YMM1
-	VPMINU	%YMM3, %YMM7, %YMM3
+	vpxorq	%VMM(2), %VMATCH, %VMM(6)
+	vpxorq	%VMM(3), %VMATCH, %VMM(7)
 
-	/* Use min + zeromask to select for zeros. Since k2 and k4 will
-	   have 0 as positions that matched with CHAR which will set
-	   zero in the corresponding destination bytes in YMM2 / YMM4.
-	 */
-	VPMINU	%YMM1, %YMM2, %YMM2{%k2}{z}
-	VPMINU	%YMM3, %YMM4, %YMM4
-	VPMINU	%YMM2, %YMM4, %YMM4{%k4}{z}
-
-	VPTESTN	%YMM4, %YMM4, %k1
-	kmovd	%k1, %ecx
-	subq	$-(VEC_SIZE * 4), %rdi
-	testl	%ecx, %ecx
+	/* Find non-matches in VEC_4 while combining with non-matches
+	   from VEC_1.  NB: Try and use masked predicate execution on
+	   instructions that have mask result as it has no latency
+	   penalty.  */
+	VPCMP	$4, %VMATCH, %VMM(4), %k4{%k1}
+
+	/* Combined zeros from VEC_1 / VEC_2 (search for null term).  */
+	VPMINU	%VMM(1), %VMM(2), %VMM(2)
+
+	/* Use min to select all zeros from either xor or end of
+	   string).  */
+	VPMINU	%VMM(3), %VMM(7), %VMM(3)
+	VPMINU	%VMM(2), %VMM(6), %VMM(2)
+
+	/* Combined zeros from VEC_2 / VEC_3 (search for null term).  */
+	VPMINU	%VMM(3), %VMM(4), %VMM(4)
+
+	/* Combined zeros from VEC_2 / VEC_4 (this has all null term and
+	   esi matches for VEC_2 / VEC_3).  */
+	VPMINU	%VMM(2), %VMM(4), %VMM(4)
+# else
+	/* Collect non-matches for VEC_2.  */
+	VPCMP	$4, %VMM(2), %VMATCH, %k2
+
+	/* Combined zeros from VEC_1 / VEC_2 (search for null term).  */
+	VPMINU	%VMM(1), %VMM(2), %VMM(2)
+
+	/* Find non-matches in VEC_3/VEC_4 while combining with non-
+	   matches from VEC_1/VEC_2 respectively.  */
+	VPCMP	$4, %VMM(3), %VMATCH, %k3{%k1}
+	VPCMP	$4, %VMM(4), %VMATCH, %k4{%k2}
+
+	/* Finish combining zeros in all VECs.  */
+	VPMINU	%VMM(3), %VMM(4), %VMM(4)
+
+	/* Combine in esi matches for VEC_3 (if there was a match with
+	   esi, the corresponding bit in %k3 is zero so the
+	   VPMINU_MASKZ will have a zero in the result).  NB: This make
+	   the VPMINU 3c latency.  The only way to avoid it is to
+	   createa a 12c dependency chain on all the `VPCMP $4, ...`
+	   which has higher total latency.  */
+	VPMINU	%VMM(2), %VMM(4), %VMM(4){%k3}{z}
+# endif
+	VPTEST	%VMM(4), %VMM(4), %k0{%k4}
+	KMOV	%k0, %VRDX
+	subq	$-(VEC_SIZE * 4), %LOOP_REG
+
+	/* TESTZ is an inc using the proper register width depending on
+	   CHAR_PER_VEC.  An esi match or null-term match leaves a zero
+	   bit in rdx, so the inc won't overflow and the result won't be
+	   zero.  */
+	TESTZ	(rdx)
 	jz	L(loop_4x_vec)
 
-	VPTESTN	%YMM1, %YMM1, %k0
-	kmovd	%k0, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x1)
+	VPTEST	%VMM(1), %VMM(1), %k0{%k1}
+	KMOV	%k0, %VGPR(MASK_GPR)
+	TESTZ	(MASK_GPR)
+# if VEC_SIZE == 32
+	/* We can reuse the return code in page_cross logic for VEC_SIZE
+	   == 32.  */
+	jnz	L(last_vec_x1_vec_size32)
+# else
+	jnz	L(last_vec_x1_vec_size64)
+# endif
+
 
-	VPTESTN	%YMM2, %YMM2, %k0
-	kmovd	%k0, %eax
-	testl	%eax, %eax
+	/* COND_MASK integrates the esi matches for VEC_SIZE == 64.  For
+	   VEC_SIZE == 32 they are already integrated.  */
+	VPTEST	%VMM(2), %VMM(2), %k0 COND_MASK(k2)
+	KMOV	%k0, %VRCX
+	TESTZ	(rcx)
 	jnz	L(last_vec_x2)
 
-	VPTESTN	%YMM3, %YMM3, %k0
-	kmovd	%k0, %eax
-	/* Combine YMM3 matches (eax) with YMM4 matches (ecx).  */
-# ifdef USE_AS_WCSCHR
-	sall	$8, %ecx
-	orl	%ecx, %eax
-	bsfl	%eax, %eax
+	VPTEST	%VMM(3), %VMM(3), %k0 COND_MASK(k3)
+	KMOV	%k0, %VRCX
+# if CHAR_PER_VEC == 64
+	TESTZ	(rcx)
+	jnz	L(last_vec_x3)
 # else
-	salq	$32, %rcx
-	orq	%rcx, %rax
-	bsfq	%rax, %rax
+	salq	$CHAR_PER_VEC, %rdx
+	TESTZ	(rcx)
+	orq	%rcx, %rdx
 # endif
+
+	bsfq	%rdx, %rdx
+
 # ifndef USE_AS_STRCHRNUL
 	/* Check if match was CHAR or null.  */
-	cmp	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
+	cmp	(LAST_VEC_OFFSET)(%LOOP_REG, %rdx, CHAR_SIZE), %CHAR_REG
 	jne	L(zero_end)
 # endif
 	/* NB: Multiply sizeof char type (1 or 4) to get the number of
 	   bytes.  */
-	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+	leaq	(LAST_VEC_OFFSET)(%LOOP_REG, %rdx, CHAR_SIZE), %rax
 	ret
 
-	.p2align 4,, 8
-L(last_vec_x1):
-	bsfl	%eax, %eax
-# ifdef USE_AS_WCSCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.
-	   */
-	leaq	(%rdi, %rax, CHAR_SIZE), %rax
-# else
-	addq	%rdi, %rax
+# ifndef USE_AS_STRCHRNUL
+L(zero_end):
+	xorl	%eax, %eax
+	ret
 # endif
 
-# ifndef USE_AS_STRCHRNUL
+
+	/* Separate return label for last VEC1 because for VEC_SIZE ==
+	   32 we can reuse the return code in L(page_cross) but VEC_SIZE
+	   == 64 has mismatched registers.  */
+# if VEC_SIZE == 64
+	.p2align 4,, 8
+L(last_vec_x1_vec_size64):
+	bsf	%VRCX, %VRCX
+#  ifndef USE_AS_STRCHRNUL
 	/* Check if match was null.  */
-	cmp	(%rax), %CHAR_REG
+	cmp	(%rax, %rcx, CHAR_SIZE), %CHAR_REG
 	jne	L(zero_end)
-# endif
-
+#  endif
+#  ifdef USE_AS_WCSCHR
+	/* NB: Multiply wchar_t count by 4 to get the number of bytes.
+	 */
+	leaq	(%rax, %rcx, CHAR_SIZE), %rax
+#  else
+	addq	%rcx, %rax
+#  endif
 	ret
 
+	/* Since we can't combine the last 2x matches for CHAR_PER_VEC
+	   == 64 we need a return label for last VEC3.  */
+#  if CHAR_PER_VEC == 64
 	.p2align 4,, 8
+L(last_vec_x3):
+	addq	$VEC_SIZE, %LOOP_REG
+#  endif
+
+	/* Duplicate L(last_vec_x2) for VEC_SIZE == 64 because we can't
+	   reuse L(first_vec_x3) due to register mismatch.  */
 L(last_vec_x2):
-	bsfl	%eax, %eax
-# ifndef USE_AS_STRCHRNUL
+	bsf	%VGPR(MASK_GPR), %VGPR(MASK_GPR)
+#  ifndef USE_AS_STRCHRNUL
 	/* Check if match was null.  */
-	cmp	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
+	cmp	(VEC_SIZE * 1)(%LOOP_REG, %MASK_GPR, CHAR_SIZE), %CHAR_REG
 	jne	L(zero_end)
-# endif
+#  endif
 	/* NB: Multiply sizeof char type (1 or 4) to get the number of
 	   bytes.  */
-	leaq	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
+	leaq	(VEC_SIZE * 1)(%LOOP_REG, %MASK_GPR, CHAR_SIZE), %rax
 	ret
+# endif
 
-	/* Cold case for crossing page with first load.	 */
-	.p2align 4,, 8
+	/* Cold case for crossing page with first load.  */
+	.p2align 4,, 10
+# ifndef USE_AS_STRCHRNUL
 L(cross_page_boundary):
-	movq	%rdi, %rdx
+# endif
+L(cross_page_boundary_real):
 	/* Align rdi.  */
-	andq	$-VEC_SIZE, %rdi
-	VMOVA	(%rdi), %YMM1
-	/* Leaves only CHARS matching esi as 0.  */
-	vpxorq	%YMM1, %YMM0, %YMM2
-	VPMINU	%YMM2, %YMM1, %YMM2
-	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
-	VPTESTN	%YMM2, %YMM2, %k0
-	kmovd	%k0, %eax
+	xorq	%rdi, %rax
+	VMOVA	(PAGE_SIZE - VEC_SIZE)(%rax), %VMM(1)
+	/* Use the high-latency method of getting matches to save code
+	   size.  */
+
+	/* K1 has 1s where VEC(1) does NOT match esi.  */
+	VPCMP	$4, %VMM(1), %VMATCH, %k1
+	/* K0 has ones where K1 is 1 (non-match with esi) and the CHAR
+	   is non-zero (non-null).  */
+	VPTEST	%VMM(1), %VMM(1), %k0{%k1}
+	KMOV	%k0, %VRAX
 	/* Remove the leading bits.  */
 # ifdef USE_AS_WCSCHR
-	movl	%edx, %SHIFT_REG
+	movl	%edi, %VGPR_SZ(SHIFT_REG, 32)
 	/* NB: Divide shift count by 4 since each bit in K1 represent 4
 	   bytes.  */
-	sarl	$2, %SHIFT_REG
-	andl	$(CHAR_PER_VEC - 1), %SHIFT_REG
+	sarl	$2, %VGPR_SZ(SHIFT_REG, 32)
+	andl	$(CHAR_PER_VEC - 1), %VGPR_SZ(SHIFT_REG, 32)
+
+	/* If wcschr we need to invert the matches as we can't rely on
+	   a signed shift to bring in ones.  There is no sarx for
+	   gpr8/16.  Also note we can't use inc here as the lower bits
+	   represent matches out of range so we can't rely on overflow.
+	 */
+	xorl	$((1 << CHAR_PER_VEC) - 1), %eax
+# endif
+	/* Use arithmetic shift so that leading 1s are filled in.  */
+	sarx	%VGPR(SHIFT_REG), %VRAX, %VRAX
+	/* If eax is all ones then no matches for esi or NULL.  */
+
+# ifdef USE_AS_WCSCHR
+	test	%VRAX, %VRAX
+# else
+	inc	%VRAX
 # endif
-	sarxl	%SHIFT_REG, %eax, %eax
-	/* If eax is zero continue.  */
-	testl	%eax, %eax
 	jz	L(cross_page_continue)
-	bsfl	%eax, %eax
 
+	.p2align 4,, 10
+L(last_vec_x1_vec_size32):
+	bsf	%VRAX, %VRAX
 # ifdef USE_AS_WCSCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of
-	   bytes.  */
-	leaq	(%rdx, %rax, CHAR_SIZE), %rax
+	/* NB: Multiply wchar_t count by 4 to get the number of bytes.
+	 */
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
 # else
-	addq	%rdx, %rax
+	addq	%rdi, %rax
 # endif
 # ifndef USE_AS_STRCHRNUL
 	/* Check to see if match was CHAR or null.  */
 	cmp	(%rax), %CHAR_REG
-	je	L(cross_page_ret)
-L(zero_end):
-	xorl	%eax, %eax
-L(cross_page_ret):
+	jne	L(zero_end_0)
 # endif
 	ret
+# ifndef USE_AS_STRCHRNUL
+L(zero_end_0):
+	xorl	%eax, %eax
+	ret
+# endif
 
 END (STRCHR)
 #endif
-- 
2.34.1


^ permalink raw reply	[flat|nested] 41+ messages in thread

* [PATCH v1 3/7] x86: Optimize strnlen-evex.S and implement with VMM headers
  2022-10-18  2:48 [PATCH v1 1/7] x86: Optimize memchr-evex.S and implement with VMM headers Noah Goldstein
  2022-10-18  2:48 ` [PATCH v1 2/7] x86: Shrink / minorly optimize strchr-evex " Noah Goldstein
@ 2022-10-18  2:48 ` Noah Goldstein
  2022-10-18  2:51   ` Noah Goldstein
  2022-10-18  2:48 ` [PATCH v1 4/7] x86: Optimize memrchr-evex.S Noah Goldstein
                   ` (6 subsequent siblings)
  8 siblings, 1 reply; 41+ messages in thread
From: Noah Goldstein @ 2022-10-18  2:48 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

Optimizations are:
1. Use the fact that bsf(0) leaves the destination unchanged to save a
   branch in short string case (see the sketch after this list).
2. Restructure code so that small strings are given the hot path.
	- This is a net-zero on the benchmark suite but in general makes
	  sense as smaller sizes are far more common.
3. Use more code-size efficient instructions.
	- tzcnt ...     -> bsf ...
	- vpcmpb $0 ... -> vpcmpeq ...
4. Align labels less aggressively, especially if it doesn't save fetch
   blocks or causes the basic-block to span extra cache-lines.
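
As a sketch of the bsf(0) trick in point 1 (illustrative only; the
register choices mirror the entry path of the new strnlen but this is
not a verbatim quote of the patch): preloading the fallback length lets
bsf double as a select.  The patch relies on bsf leaving its destination
unmodified when the source is zero (architecturally undefined, but true
on the targeted CPUs):

	movq	%rsi, %rax	/* rax = maxlen (fallback result).  */
	bsfq	%rcx, %rax	/* If the mask in rcx is non-zero, rax
				   becomes the index of the first null;
				   if rcx == 0, rax keeps maxlen, so no
				   branch is needed.  */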

The optimizations (especially for point 2) make the strnlen and
strlen code essentially incompatible so split strnlen-evex
to a new file.

Code Size Changes:
strlen-evex.S       :  -23 bytes
strnlen-evex.S      : -167 bytes

Net perf changes:

Reported as geometric mean of all improvements / regressions from N=10
runs of the benchtests. Value is New Time / Old Time, so < 1.0 is an
improvement and > 1.0 is a regression.
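
(Worked example of the metric: per-benchmark ratios of 0.90, 0.95 and
1.02 combine to (0.90 * 0.95 * 1.02)^(1/3) ~= 0.955, i.e. a ~4.5% net
improvement.)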

strlen-evex.S       : 0.992 (No real change)
strnlen-evex.S      : 0.947

Full results attached in email.

Full check passes on x86-64.
---
 sysdeps/x86_64/multiarch/strlen-evex.S  | 544 +++++++-----------------
 sysdeps/x86_64/multiarch/strnlen-evex.S | 427 ++++++++++++++++++-
 sysdeps/x86_64/multiarch/wcsnlen-evex.S |   5 +-
 3 files changed, 572 insertions(+), 404 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/strlen-evex.S b/sysdeps/x86_64/multiarch/strlen-evex.S
index 2109ec2f7a..487846f098 100644
--- a/sysdeps/x86_64/multiarch/strlen-evex.S
+++ b/sysdeps/x86_64/multiarch/strlen-evex.S
@@ -26,466 +26,220 @@
 #  define STRLEN	__strlen_evex
 # endif
 
-# define VMOVA		vmovdqa64
+# ifndef VEC_SIZE
+#  include "x86-evex256-vecs.h"
+# endif
 
 # ifdef USE_AS_WCSLEN
-#  define VPCMP		vpcmpd
+#  define VPCMPEQ	vpcmpeqd
+#  define VPCMPNEQ	vpcmpneqd
+#  define VPTESTN	vptestnmd
+#  define VPTEST	vptestmd
 #  define VPMINU	vpminud
-#  define SHIFT_REG ecx
 #  define CHAR_SIZE	4
+#  define CHAR_SIZE_SHIFT_REG(reg)	sar $2, %reg
 # else
-#  define VPCMP		vpcmpb
+#  define VPCMPEQ	vpcmpeqb
+#  define VPCMPNEQ	vpcmpneqb
+#  define VPTESTN	vptestnmb
+#  define VPTEST	vptestmb
 #  define VPMINU	vpminub
-#  define SHIFT_REG edx
 #  define CHAR_SIZE	1
+#  define CHAR_SIZE_SHIFT_REG(reg)
+
+#  define REG_WIDTH	VEC_SIZE
 # endif
 
-# define XMMZERO	xmm16
-# define YMMZERO	ymm16
-# define YMM1		ymm17
-# define YMM2		ymm18
-# define YMM3		ymm19
-# define YMM4		ymm20
-# define YMM5		ymm21
-# define YMM6		ymm22
-
-# define VEC_SIZE 32
-# define PAGE_SIZE 4096
-# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
-
-	.section .text.evex,"ax",@progbits
-ENTRY (STRLEN)
-# ifdef USE_AS_STRNLEN
-	/* Check zero length.  */
-	test	%RSI_LP, %RSI_LP
-	jz	L(zero)
-#  ifdef __ILP32__
-	/* Clear the upper 32 bits.  */
-	movl	%esi, %esi
-#  endif
-	mov	%RSI_LP, %R8_LP
+# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
+
+# include "reg-macros.h"
+
+# if CHAR_PER_VEC == 64
+
+#  define TAIL_RETURN_LBL	first_vec_x2
+#  define TAIL_RETURN_OFFSET	(CHAR_PER_VEC * 2)
+
+#  define FALLTHROUGH_RETURN_LBL	first_vec_x3
+#  define FALLTHROUGH_RETURN_OFFSET	(CHAR_PER_VEC * 3)
+
+# else
+
+#  define TAIL_RETURN_LBL	first_vec_x3
+#  define TAIL_RETURN_OFFSET	(CHAR_PER_VEC * 3)
+
+#  define FALLTHROUGH_RETURN_LBL	first_vec_x2
+#  define FALLTHROUGH_RETURN_OFFSET	(CHAR_PER_VEC * 2)
 # endif
+
+# define XZERO	VMM_128(0)
+# define VZERO	VMM(0)
+# define PAGE_SIZE	4096
+
+	.section SECTION(.text), "ax", @progbits
+ENTRY_P2ALIGN (STRLEN, 6)
 	movl	%edi, %eax
-	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
-	/* Clear high bits from edi. Only keeping bits relevant to page
-	   cross check.  */
+	vpxorq	%XZERO, %XZERO, %XZERO
 	andl	$(PAGE_SIZE - 1), %eax
-	/* Check if we may cross page boundary with one vector load.  */
 	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
 	ja	L(cross_page_boundary)
 
 	/* Check the first VEC_SIZE bytes.  Each bit in K0 represents a
 	   null byte.  */
-	VPCMP	$0, (%rdi), %YMMZERO, %k0
-	kmovd	%k0, %eax
-# ifdef USE_AS_STRNLEN
-	/* If length < CHAR_PER_VEC handle special.  */
-	cmpq	$CHAR_PER_VEC, %rsi
-	jbe	L(first_vec_x0)
-# endif
-	testl	%eax, %eax
+	VPCMPEQ	(%rdi), %VZERO, %k0
+	KMOV	%k0, %VRAX
+	test	%VRAX, %VRAX
 	jz	L(aligned_more)
-	tzcntl	%eax, %eax
-	ret
-# ifdef USE_AS_STRNLEN
-L(zero):
-	xorl	%eax, %eax
-	ret
-
-	.p2align 4
-L(first_vec_x0):
-	/* Set bit for max len so that tzcnt will return min of max len
-	   and position of first match.  */
-	btsq	%rsi, %rax
-	tzcntl	%eax, %eax
-	ret
-# endif
-
-	.p2align 4
-L(first_vec_x1):
-	tzcntl	%eax, %eax
-	/* Safe to use 32 bit instructions as these are only called for
-	   size = [1, 159].  */
-# ifdef USE_AS_STRNLEN
-	/* Use ecx which was computed earlier to compute correct value.
-	 */
-	leal	-(CHAR_PER_VEC * 4 + 1)(%rcx, %rax), %eax
-# else
-	subl	%edx, %edi
-#  ifdef USE_AS_WCSLEN
-	/* NB: Divide bytes by 4 to get the wchar_t count.  */
-	sarl	$2, %edi
-#  endif
-	leal	CHAR_PER_VEC(%rdi, %rax), %eax
-# endif
-	ret
-
-	.p2align 4
-L(first_vec_x2):
-	tzcntl	%eax, %eax
-	/* Safe to use 32 bit instructions as these are only called for
-	   size = [1, 159].  */
-# ifdef USE_AS_STRNLEN
-	/* Use ecx which was computed earlier to compute correct value.
-	 */
-	leal	-(CHAR_PER_VEC * 3 + 1)(%rcx, %rax), %eax
-# else
-	subl	%edx, %edi
-#  ifdef USE_AS_WCSLEN
-	/* NB: Divide bytes by 4 to get the wchar_t count.  */
-	sarl	$2, %edi
-#  endif
-	leal	(CHAR_PER_VEC * 2)(%rdi, %rax), %eax
-# endif
+	bsf	%VRAX, %VRAX
 	ret
 
-	.p2align 4
-L(first_vec_x3):
-	tzcntl	%eax, %eax
-	/* Safe to use 32 bit instructions as these are only called for
-	   size = [1, 159].  */
-# ifdef USE_AS_STRNLEN
-	/* Use ecx which was computed earlier to compute correct value.
-	 */
-	leal	-(CHAR_PER_VEC * 2 + 1)(%rcx, %rax), %eax
-# else
-	subl	%edx, %edi
-#  ifdef USE_AS_WCSLEN
-	/* NB: Divide bytes by 4 to get the wchar_t count.  */
-	sarl	$2, %edi
-#  endif
-	leal	(CHAR_PER_VEC * 3)(%rdi, %rax), %eax
-# endif
-	ret
-
-	.p2align 4
+	.p2align 4,, 8
 L(first_vec_x4):
-	tzcntl	%eax, %eax
-	/* Safe to use 32 bit instructions as these are only called for
-	   size = [1, 159].  */
-# ifdef USE_AS_STRNLEN
-	/* Use ecx which was computed earlier to compute correct value.
-	 */
-	leal	-(CHAR_PER_VEC + 1)(%rcx, %rax), %eax
-# else
-	subl	%edx, %edi
-#  ifdef USE_AS_WCSLEN
-	/* NB: Divide bytes by 4 to get the wchar_t count.  */
-	sarl	$2, %edi
-#  endif
+	bsf	%VRAX, %VRAX
+	subl	%ecx, %edi
+	CHAR_SIZE_SHIFT_REG (edi)
 	leal	(CHAR_PER_VEC * 4)(%rdi, %rax), %eax
-# endif
 	ret
 
-	.p2align 5
+
+
+	/* Aligned more: strnlen compares remaining length vs 2 *
+	   CHAR_PER_VEC, 4 * CHAR_PER_VEC, and 8 * CHAR_PER_VEC before
+	   going to the loop.  */
+	.p2align 4,, 10
 L(aligned_more):
-	movq	%rdi, %rdx
-	/* Align data to VEC_SIZE.  */
-	andq	$-(VEC_SIZE), %rdi
+	movq	%rdi, %rcx
+	andq	$(VEC_SIZE * -1), %rdi
 L(cross_page_continue):
-	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
-	   since data is only aligned to VEC_SIZE.  */
-# ifdef USE_AS_STRNLEN
-	/* + CHAR_SIZE because it simplies the logic in
-	   last_4x_vec_or_less.  */
-	leaq	(VEC_SIZE * 5 + CHAR_SIZE)(%rdi), %rcx
-	subq	%rdx, %rcx
-#  ifdef USE_AS_WCSLEN
-	/* NB: Divide bytes by 4 to get the wchar_t count.  */
-	sarl	$2, %ecx
-#  endif
-# endif
-	/* Load first VEC regardless.  */
-	VPCMP	$0, VEC_SIZE(%rdi), %YMMZERO, %k0
-# ifdef USE_AS_STRNLEN
-	/* Adjust length. If near end handle specially.  */
-	subq	%rcx, %rsi
-	jb	L(last_4x_vec_or_less)
-# endif
-	kmovd	%k0, %eax
-	testl	%eax, %eax
+	/* Remaining length >= 2 * CHAR_PER_VEC so do VEC0/VEC1 without
+	   rechecking bounds.  */
+	VPCMPEQ	(VEC_SIZE * 1)(%rdi), %VZERO, %k0
+	KMOV	%k0, %VRAX
+	test	%VRAX, %VRAX
 	jnz	L(first_vec_x1)
 
-	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
-	kmovd	%k0, %eax
-	test	%eax, %eax
+	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %VZERO, %k0
+	KMOV	%k0, %VRAX
+	test	%VRAX, %VRAX
 	jnz	L(first_vec_x2)
 
-	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
-	kmovd	%k0, %eax
-	testl	%eax, %eax
+	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %VZERO, %k0
+	KMOV	%k0, %VRAX
+	test	%VRAX, %VRAX
 	jnz	L(first_vec_x3)
 
-	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0
-	kmovd	%k0, %eax
-	testl	%eax, %eax
+	VPCMPEQ	(VEC_SIZE * 4)(%rdi), %VZERO, %k0
+	KMOV	%k0, %VRAX
+	test	%VRAX, %VRAX
 	jnz	L(first_vec_x4)
 
-	addq	$VEC_SIZE, %rdi
-# ifdef USE_AS_STRNLEN
-	/* Check if at last VEC_SIZE * 4 length.  */
-	cmpq	$(CHAR_PER_VEC * 4 - 1), %rsi
-	jbe	L(last_4x_vec_or_less_load)
-	movl	%edi, %ecx
-	andl	$(VEC_SIZE * 4 - 1), %ecx
-#  ifdef USE_AS_WCSLEN
-	/* NB: Divide bytes by 4 to get the wchar_t count.  */
-	sarl	$2, %ecx
-#  endif
-	/* Readjust length.  */
-	addq	%rcx, %rsi
-# endif
-	/* Align data to VEC_SIZE * 4.  */
+	subq	$(VEC_SIZE * -1), %rdi
+
+# if CHAR_PER_VEC == 64
+	/* No partial-register stalls on the processors we use evex512
+	   on, and this saves code size.  */
+	xorb	%dil, %dil
+# else
 	andq	$-(VEC_SIZE * 4), %rdi
+# endif
+
+
 
 	/* Compare 4 * VEC at a time forward.  */
 	.p2align 4
 L(loop_4x_vec):
-	/* Load first VEC regardless.  */
-	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM1
-# ifdef USE_AS_STRNLEN
-	/* Break if at end of length.  */
-	subq	$(CHAR_PER_VEC * 4), %rsi
-	jb	L(last_4x_vec_or_less_cmpeq)
-# endif
-	/* Save some code size by microfusing VPMINU with the load. Since
-	   the matches in ymm2/ymm4 can only be returned if there where no
-	   matches in ymm1/ymm3 respectively there is no issue with overlap.
-	 */
-	VPMINU	(VEC_SIZE * 5)(%rdi), %YMM1, %YMM2
-	VMOVA	(VEC_SIZE * 6)(%rdi), %YMM3
-	VPMINU	(VEC_SIZE * 7)(%rdi), %YMM3, %YMM4
+	VMOVA	(VEC_SIZE * 4)(%rdi), %VMM(1)
+	VPMINU	(VEC_SIZE * 5)(%rdi), %VMM(1), %VMM(2)
+	VMOVA	(VEC_SIZE * 6)(%rdi), %VMM(3)
+	VPMINU	(VEC_SIZE * 7)(%rdi), %VMM(3), %VMM(4)
+	VPTESTN	%VMM(2), %VMM(2), %k0
+	VPTESTN	%VMM(4), %VMM(4), %k2
 
-	VPCMP	$0, %YMM2, %YMMZERO, %k0
-	VPCMP	$0, %YMM4, %YMMZERO, %k1
 	subq	$-(VEC_SIZE * 4), %rdi
-	kortestd	%k0, %k1
+	KORTEST %k0, %k2
 	jz	L(loop_4x_vec)
 
-	/* Check if end was in first half.  */
-	kmovd	%k0, %eax
-	subq	%rdx, %rdi
-# ifdef USE_AS_WCSLEN
-	shrq	$2, %rdi
-# endif
-	testl	%eax, %eax
-	jz	L(second_vec_return)
+	VPTESTN	%VMM(1), %VMM(1), %k1
+	KMOV	%k1, %VRAX
+	test	%VRAX, %VRAX
+	jnz	L(first_vec_x0)
 
-	VPCMP	$0, %YMM1, %YMMZERO, %k2
-	kmovd	%k2, %edx
-	/* Combine VEC1 matches (edx) with VEC2 matches (eax).  */
-# ifdef USE_AS_WCSLEN
-	sall	$CHAR_PER_VEC, %eax
-	orl	%edx, %eax
-	tzcntl	%eax, %eax
-# else
-	salq	$CHAR_PER_VEC, %rax
-	orq	%rdx, %rax
-	tzcntq	%rax, %rax
-# endif
-	addq	%rdi, %rax
-	ret
-
-
-# ifdef USE_AS_STRNLEN
-
-L(last_4x_vec_or_less_load):
-	/* Depending on entry adjust rdi / prepare first VEC in YMM1.  */
-	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM1
-L(last_4x_vec_or_less_cmpeq):
-	VPCMP	$0, %YMM1, %YMMZERO, %k0
-	addq	$(VEC_SIZE * 3), %rdi
-L(last_4x_vec_or_less):
-	kmovd	%k0, %eax
-	/* If remaining length > VEC_SIZE * 2. This works if esi is off by
-	   VEC_SIZE * 4.  */
-	testl	$(CHAR_PER_VEC * 2), %esi
-	jnz	L(last_4x_vec)
-
-	/* length may have been negative or positive by an offset of
-	   CHAR_PER_VEC * 4 depending on where this was called from. This
-	   fixes that.  */
-	andl	$(CHAR_PER_VEC * 4 - 1), %esi
-	testl	%eax, %eax
-	jnz	L(last_vec_x1_check)
+	KMOV	%k0, %VRAX
+	test	%VRAX, %VRAX
+	jnz	L(first_vec_x1)
 
-	/* Check the end of data.  */
-	subl	$CHAR_PER_VEC, %esi
-	jb	L(max)
+	VPTESTN	%VMM(3), %VMM(3), %k0
 
-	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
-	kmovd	%k0, %eax
-	tzcntl	%eax, %eax
-	/* Check the end of data.  */
-	cmpl	%eax, %esi
-	jb	L(max)
-
-	subq	%rdx, %rdi
-#  ifdef USE_AS_WCSLEN
-	/* NB: Divide bytes by 4 to get the wchar_t count.  */
-	sarq	$2, %rdi
-#  endif
-	leaq	(CHAR_PER_VEC * 2)(%rdi, %rax), %rax
-	ret
-L(max):
-	movq	%r8, %rax
-	ret
-# endif
-
-	/* Placed here in strnlen so that the jcc L(last_4x_vec_or_less)
-	   in the 4x VEC loop can use 2 byte encoding.  */
-	.p2align 4
-L(second_vec_return):
-	VPCMP	$0, %YMM3, %YMMZERO, %k0
-	/* Combine YMM3 matches (k0) with YMM4 matches (k1).  */
-# ifdef USE_AS_WCSLEN
-	kunpckbw	%k0, %k1, %k0
-	kmovd	%k0, %eax
-	tzcntl	%eax, %eax
+# if CHAR_PER_VEC == 64
+	KMOV	%k0, %VRAX
+	test	%VRAX, %VRAX
+	jnz	L(first_vec_x2)
+	KMOV	%k2, %VRAX
 # else
-	kunpckdq	%k0, %k1, %k0
-	kmovq	%k0, %rax
-	tzcntq	%rax, %rax
+	/* We can only combine last 2x VEC masks if CHAR_PER_VEC <= 32.
+	 */
+	kmovd	%k2, %edx
+	kmovd	%k0, %eax
+	salq	$CHAR_PER_VEC, %rdx
+	orq	%rdx, %rax
 # endif
-	leaq	(CHAR_PER_VEC * 2)(%rdi, %rax), %rax
-	ret
 
-
-# ifdef USE_AS_STRNLEN
-L(last_vec_x1_check):
-	tzcntl	%eax, %eax
-	/* Check the end of data.  */
-	cmpl	%eax, %esi
-	jb	L(max)
-	subq	%rdx, %rdi
-#  ifdef USE_AS_WCSLEN
-	/* NB: Divide bytes by 4 to get the wchar_t count.  */
-	sarq	$2, %rdi
-#  endif
-	leaq	(CHAR_PER_VEC)(%rdi, %rax), %rax
+	/* first_vec_x3 for strlen-ZMM and first_vec_x2 for strlen-YMM.
+	 */
+	.p2align 4,, 2
+L(FALLTHROUGH_RETURN_LBL):
+	bsfq	%rax, %rax
+	subq	%rcx, %rdi
+	CHAR_SIZE_SHIFT_REG (rdi)
+	leaq	(FALLTHROUGH_RETURN_OFFSET)(%rdi, %rax), %rax
 	ret
 
-	.p2align 4
-L(last_4x_vec):
-	/* Test first 2x VEC normally.  */
-	testl	%eax, %eax
-	jnz	L(last_vec_x1)
-
-	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
-	kmovd	%k0, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x2)
-
-	/* Normalize length.  */
-	andl	$(CHAR_PER_VEC * 4 - 1), %esi
-	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
-	kmovd	%k0, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x3)
-
-	/* Check the end of data.  */
-	subl	$(CHAR_PER_VEC * 3), %esi
-	jb	L(max)
-
-	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0
-	kmovd	%k0, %eax
-	tzcntl	%eax, %eax
-	/* Check the end of data.  */
-	cmpl	%eax, %esi
-	jb	L(max_end)
-
-	subq	%rdx, %rdi
-#  ifdef USE_AS_WCSLEN
-	/* NB: Divide bytes by 4 to get the wchar_t count.  */
-	sarq	$2, %rdi
-#  endif
-	leaq	(CHAR_PER_VEC * 4)(%rdi, %rax), %rax
+	.p2align 4,, 8
+L(first_vec_x0):
+	bsf	%VRAX, %VRAX
+	sub	%rcx, %rdi
+	CHAR_SIZE_SHIFT_REG (rdi)
+	addq	%rdi, %rax
 	ret
 
-	.p2align 4
-L(last_vec_x1):
-	tzcntl	%eax, %eax
-	subq	%rdx, %rdi
-#  ifdef USE_AS_WCSLEN
-	/* NB: Divide bytes by 4 to get the wchar_t count.  */
-	sarq	$2, %rdi
-#  endif
+	.p2align 4,, 10
+L(first_vec_x1):
+	bsf	%VRAX, %VRAX
+	sub	%rcx, %rdi
+	CHAR_SIZE_SHIFT_REG (rdi)
 	leaq	(CHAR_PER_VEC)(%rdi, %rax), %rax
 	ret
 
-	.p2align 4
-L(last_vec_x2):
-	tzcntl	%eax, %eax
-	subq	%rdx, %rdi
-#  ifdef USE_AS_WCSLEN
-	/* NB: Divide bytes by 4 to get the wchar_t count.  */
-	sarq	$2, %rdi
-#  endif
-	leaq	(CHAR_PER_VEC * 2)(%rdi, %rax), %rax
-	ret
-
-	.p2align 4
-L(last_vec_x3):
-	tzcntl	%eax, %eax
-	subl	$(CHAR_PER_VEC * 2), %esi
-	/* Check the end of data.  */
-	cmpl	%eax, %esi
-	jb	L(max_end)
-	subq	%rdx, %rdi
-#  ifdef USE_AS_WCSLEN
-	/* NB: Divide bytes by 4 to get the wchar_t count.  */
-	sarq	$2, %rdi
-#  endif
-	leaq	(CHAR_PER_VEC * 3)(%rdi, %rax), %rax
-	ret
-L(max_end):
-	movq	%r8, %rax
+	.p2align 4,, 10
+	/* first_vec_x2 for strlen-ZMM and first_vec_x3 for strlen-YMM.
+	 */
+L(TAIL_RETURN_LBL):
+	bsf	%VRAX, %VRAX
+	sub	%VRCX, %VRDI
+	CHAR_SIZE_SHIFT_REG (VRDI)
+	lea	(TAIL_RETURN_OFFSET)(%rdi, %rax), %VRAX
 	ret
-# endif
 
-	/* Cold case for crossing page with first load.	 */
-	.p2align 4
+	.p2align 4,, 8
 L(cross_page_boundary):
-	movq	%rdi, %rdx
+	movq	%rdi, %rcx
 	/* Align data to VEC_SIZE.  */
 	andq	$-VEC_SIZE, %rdi
-	VPCMP	$0, (%rdi), %YMMZERO, %k0
-	kmovd	%k0, %eax
-	/* Remove the leading bytes.  */
+
+	VPCMPEQ	(%rdi), %VZERO, %k0
+
+	KMOV	%k0, %VRAX
 # ifdef USE_AS_WCSLEN
-	/* NB: Divide shift count by 4 since each bit in K0 represent 4
-	   bytes.  */
-	movl	%edx, %ecx
-	shrl	$2, %ecx
-	andl	$(CHAR_PER_VEC - 1), %ecx
-# endif
-	/* SHIFT_REG is ecx for USE_AS_WCSLEN and edx otherwise.  */
-	sarxl	%SHIFT_REG, %eax, %eax
+	movl	%ecx, %edx
+	shrl	$2, %edx
+	andl	$(CHAR_PER_VEC - 1), %edx
+	shrx	%edx, %eax, %eax
 	testl	%eax, %eax
-# ifndef USE_AS_STRNLEN
-	jz	L(cross_page_continue)
-	tzcntl	%eax, %eax
-	ret
 # else
-	jnz	L(cross_page_less_vec)
-#  ifndef USE_AS_WCSLEN
-	movl	%edx, %ecx
-	andl	$(CHAR_PER_VEC - 1), %ecx
-#  endif
-	movl	$CHAR_PER_VEC, %eax
-	subl	%ecx, %eax
-	/* Check the end of data.  */
-	cmpq	%rax, %rsi
-	ja	L(cross_page_continue)
-	movl	%esi, %eax
-	ret
-L(cross_page_less_vec):
-	tzcntl	%eax, %eax
-	/* Select min of length and position of first null.  */
-	cmpq	%rax, %rsi
-	cmovb	%esi, %eax
-	ret
+	shr	%cl, %VRAX
 # endif
+	jz	L(cross_page_continue)
+	bsf	%VRAX, %VRAX
+	ret
 
 END (STRLEN)
 #endif
diff --git a/sysdeps/x86_64/multiarch/strnlen-evex.S b/sysdeps/x86_64/multiarch/strnlen-evex.S
index 64a9fc2606..443a32a749 100644
--- a/sysdeps/x86_64/multiarch/strnlen-evex.S
+++ b/sysdeps/x86_64/multiarch/strnlen-evex.S
@@ -1,8 +1,423 @@
-#ifndef STRNLEN
-# define STRNLEN __strnlen_evex
-#endif
+/* strnlen/wcsnlen optimized with 256-bit EVEX instructions.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <isa-level.h>
+#include <sysdep.h>
+
+#if ISA_SHOULD_BUILD (4)
+
+# ifndef VEC_SIZE
+#  include "x86-evex256-vecs.h"
+# endif
+
+
+# ifndef STRNLEN
+#  define STRNLEN	__strnlen_evex
+# endif
+
+# ifdef USE_AS_WCSLEN
+#  define VPCMPEQ	vpcmpeqd
+#  define VPCMPNEQ	vpcmpneqd
+#  define VPTESTN	vptestnmd
+#  define VPTEST	vptestmd
+#  define VPMINU	vpminud
+#  define CHAR_SIZE	4
+
+# else
+#  define VPCMPEQ	vpcmpeqb
+#  define VPCMPNEQ	vpcmpneqb
+#  define VPTESTN	vptestnmb
+#  define VPTEST	vptestmb
+#  define VPMINU	vpminub
+#  define CHAR_SIZE	1
+
+#  define REG_WIDTH	VEC_SIZE
+# endif
+
+# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
+
+# include "reg-macros.h"
+
+# if CHAR_PER_VEC == 32
+#  define SUB_SHORT(imm, reg)	subb $(imm), %VGPR_SZ(reg, 8)
+# else
+#  define SUB_SHORT(imm, reg)	subl $(imm), %VGPR_SZ(reg, 32)
+# endif
+
+
+
+# if CHAR_PER_VEC == 64
+#  define FALLTHROUGH_RETURN_OFFSET	(CHAR_PER_VEC * 3)
+# else
+#  define FALLTHROUGH_RETURN_OFFSET	(CHAR_PER_VEC * 2)
+# endif
+
+
+# define XZERO	VMM_128(0)
+# define VZERO	VMM(0)
+# define PAGE_SIZE	4096
+
+	.section SECTION(.text), "ax", @progbits
+ENTRY_P2ALIGN (STRNLEN, 6)
+	/* Check zero length.  */
+	test	%RSI_LP, %RSI_LP
+	jz	L(zero)
+# ifdef __ILP32__
+	/* Clear the upper 32 bits.  */
+	movl	%esi, %esi
+# endif
+
+	movl	%edi, %eax
+	vpxorq	%XZERO, %XZERO, %XZERO
+	andl	$(PAGE_SIZE - 1), %eax
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	ja	L(cross_page_boundary)
+
+	/* Check the first VEC_SIZE bytes.  Each bit in K0 represents a
+	   null byte.  */
+	VPCMPEQ	(%rdi), %VZERO, %k0
+
+	KMOV	%k0, %VRCX
+	movq	%rsi, %rax
+
+	/* If src (rcx) is zero, bsf does not change the result.  NB:
+	   Must use 64-bit bsf here so that upper bits of len are not
+	   cleared.  */
+	bsfq	%rcx, %rax
+	/* If rax > CHAR_PER_VEC then rcx must have been zero (no null
+	   CHAR) and rsi must be > CHAR_PER_VEC.  */
+	cmpq	$CHAR_PER_VEC, %rax
+	ja	L(more_1x_vec)
+	/* Check if first match in bounds.  */
+	cmpq	%rax, %rsi
+	cmovb	%esi, %eax
+	ret
+
+
+# if CHAR_PER_VEC != 32
+	.p2align 4,, 2
+L(zero):
+L(max_0):
+	movl	%esi, %eax
+	ret
+# endif
+
+	/* Aligned more: strnlen compares remaining length vs 2 *
+	   CHAR_PER_VEC, 4 * CHAR_PER_VEC, and 8 * CHAR_PER_VEC before
+	   going to the loop.  */
+	.p2align 4,, 10
+L(more_1x_vec):
+L(cross_page_continue):
+	/* Compute number of words checked after aligning.  */
+# ifdef USE_AS_WCSLEN
+	/* Need to compute directly for wcslen as CHAR_SIZE * rsi can
+	   overflow.  */
+	movq	%rdi, %rax
+	andq	$(VEC_SIZE * -1), %rdi
+	subq	%rdi, %rax
+	sarq	$2, %rax
+	leaq	-(CHAR_PER_VEC * 1)(%rax, %rsi), %rax
+# else
+	leaq	(VEC_SIZE * -1)(%rsi, %rdi), %rax
+	andq	$(VEC_SIZE * -1), %rdi
+	subq	%rdi, %rax
+# endif
+
+
+	VPCMPEQ	VEC_SIZE(%rdi), %VZERO, %k0
+
+	cmpq	$(CHAR_PER_VEC * 2), %rax
+	ja	L(more_2x_vec)
+
+L(last_2x_vec_or_less):
+	KMOV	%k0, %VRDX
+	test	%VRDX, %VRDX
+	jnz	L(last_vec_check)
+
+	/* Check the end of data.  */
+	SUB_SHORT (CHAR_PER_VEC, rax)
+	jbe	L(max_0)
+	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %VZERO, %k0
+	KMOV	%k0, %VRDX
+	test	%VRDX, %VRDX
+	jz	L(max_0)
+	/* Best place for LAST_VEC_CHECK if ZMM.  */
+	.p2align 4,, 8
+L(last_vec_check):
+	bsf	%VRDX, %VRDX
+	sub	%eax, %edx
+	lea	(%rsi, %rdx), %eax
+	cmovae	%esi, %eax
+	ret
+
+# if CHAR_PER_VEC == 32
+	.p2align 4,, 2
+L(zero):
+L(max_0):
+	movl	%esi, %eax
+	ret
+# endif
+
+	.p2align 4,, 8
+L(last_4x_vec_or_less):
+	addl	$(CHAR_PER_VEC * -4), %eax
+	VPCMPEQ	(VEC_SIZE * 5)(%rdi), %VZERO, %k0
+	subq	$(VEC_SIZE * -4), %rdi
+	cmpl	$(CHAR_PER_VEC * 2), %eax
+	jbe	L(last_2x_vec_or_less)
+
+	.p2align 4,, 6
+L(more_2x_vec):
+	/* Remaining length >= 2 * CHAR_PER_VEC so do VEC0/VEC1 without
+	   rechecking bounds.  */
 
-#define USE_AS_STRNLEN 1
-#define STRLEN	STRNLEN
+	KMOV	%k0, %VRDX
 
-#include "strlen-evex.S"
+	test	%VRDX, %VRDX
+	jnz	L(first_vec_x1)
+
+	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %VZERO, %k0
+	KMOV	%k0, %VRDX
+	test	%VRDX, %VRDX
+	jnz	L(first_vec_x2)
+
+	cmpq	$(CHAR_PER_VEC * 4), %rax
+	ja	L(more_4x_vec)
+
+
+	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %VZERO, %k0
+	KMOV	%k0, %VRDX
+	addl	$(CHAR_PER_VEC * -2), %eax
+	test	%VRDX, %VRDX
+	jnz	L(last_vec_check)
+
+	subl	$(CHAR_PER_VEC), %eax
+	jbe	L(max_1)
+
+	VPCMPEQ	(VEC_SIZE * 4)(%rdi), %VZERO, %k0
+	KMOV	%k0, %VRDX
+
+	test	%VRDX, %VRDX
+	jnz	L(last_vec_check)
+L(max_1):
+	movl	%esi, %eax
+	ret
+
+	.p2align 4,, 3
+L(first_vec_x2):
+# if VEC_SIZE == 64
+	/* If VEC_SIZE == 64 we can fit logic for full return label in
+	   spare bytes before next cache line.  */
+	bsf	%VRDX, %VRDX
+	sub	%eax, %esi
+	leal	(CHAR_PER_VEC * 1)(%rsi, %rdx), %eax
+	ret
+	.p2align 4,, 6
+# else
+	addl	$CHAR_PER_VEC, %esi
+# endif
+L(first_vec_x1):
+	bsf	%VRDX, %VRDX
+	sub	%eax, %esi
+	leal	(CHAR_PER_VEC * 0)(%rsi, %rdx), %eax
+	ret
+
+
+	.p2align 4,, 6
+L(first_vec_x4):
+# if VEC_SIZE == 64
+	/* If VEC_SIZE == 64 we can fit logic for full return label in
+	   spare bytes before next cache line.  */
+	bsf	%VRDX, %VRDX
+	sub	%eax, %esi
+	leal	(CHAR_PER_VEC * 3)(%rsi, %rdx), %eax
+	ret
+	.p2align 4,, 6
+# else
+	addl	$CHAR_PER_VEC, %esi
+# endif
+L(first_vec_x3):
+	bsf	%VRDX, %VRDX
+	sub	%eax, %esi
+	leal	(CHAR_PER_VEC * 2)(%rsi, %rdx), %eax
+	ret
+
+	.p2align 4,, 5
+L(more_4x_vec):
+	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %VZERO, %k0
+	KMOV	%k0, %VRDX
+	test	%VRDX, %VRDX
+	jnz	L(first_vec_x3)
+
+	VPCMPEQ	(VEC_SIZE * 4)(%rdi), %VZERO, %k0
+	KMOV	%k0, %VRDX
+	test	%VRDX, %VRDX
+	jnz	L(first_vec_x4)
+
+	/* Check if at last VEC_SIZE * 4 length before aligning for the
+	   loop.  */
+	cmpq	$(CHAR_PER_VEC * 8), %rax
+	jbe	L(last_4x_vec_or_less)
+
+
+	/* Compute number of words checked after aligning.  */
+# ifdef USE_AS_WCSLEN
+	/* Need to compute directly for wcslen as CHAR_SIZE * rsi can
+	   overflow.  */
+	leaq	(VEC_SIZE * -3)(%rdi), %rdx
+# else
+	leaq	(VEC_SIZE * -3)(%rdi, %rax), %rax
+# endif
+
+	subq	$(VEC_SIZE * -1), %rdi
+
+	/* Align data to VEC_SIZE * 4.  */
+# if VEC_SIZE == 64
+	/* Saves code size.  No evex512 processor has partial-register
+	   stalls.  If that changes, this can be replaced with `andq
+	   $-(VEC_SIZE * 4), %rdi`.  */
+	xorb	%dil, %dil
+# else
+	andq	$-(VEC_SIZE * 4), %rdi
+# endif
+
+# ifdef USE_AS_WCSLEN
+	subq	%rdi, %rdx
+	sarq	$2, %rdx
+	addq	%rdx, %rax
+# else
+	subq	%rdi, %rax
+# endif
+	/* Compare 4 * VEC at a time forward.  */
+	.p2align 4,, 11
+L(loop_4x_vec):
+	VMOVA	(VEC_SIZE * 4)(%rdi), %VMM(1)
+	VPMINU	(VEC_SIZE * 5)(%rdi), %VMM(1), %VMM(2)
+	VMOVA	(VEC_SIZE * 6)(%rdi), %VMM(3)
+	VPMINU	(VEC_SIZE * 7)(%rdi), %VMM(3), %VMM(4)
+	VPTESTN	%VMM(2), %VMM(2), %k0
+	VPTESTN	%VMM(4), %VMM(4), %k2
+	subq	$-(VEC_SIZE * 4), %rdi
+	/* Break if at end of length.  */
+	subq	$(CHAR_PER_VEC * 4), %rax
+	jbe	L(loop_len_end)
+
+
+	KORTEST %k0, %k2
+	jz	L(loop_4x_vec)
+
+
+L(loop_last_4x_vec):
+	movq	%rsi, %rcx
+	subq	%rax, %rsi
+	VPTESTN	%VMM(1), %VMM(1), %k1
+	KMOV	%k1, %VRDX
+	test	%VRDX, %VRDX
+	jnz	L(last_vec_x0)
+
+	KMOV	%k0, %VRDX
+	test	%VRDX, %VRDX
+	jnz	L(last_vec_x1)
+
+	VPTESTN	%VMM(3), %VMM(3), %k0
+
+	/* Separate logic for VEC_SIZE == 64 and VEC_SIZE == 32 for
+	   returning last 2x VEC. For VEC_SIZE == 64 we test each VEC
+	   individually, for VEC_SIZE == 32 we combine them in a single
+	   64-bit GPR.  */
+# if CHAR_PER_VEC == 64
+	KMOV	%k0, %VRDX
+	test	%VRDX, %VRDX
+	jnz	L(last_vec_x2)
+	KMOV	%k2, %VRDX
+# else
+	/* We can only combine last 2x VEC masks if CHAR_PER_VEC <= 32.
+	 */
+	kmovd	%k2, %edx
+	kmovd	%k0, %eax
+	salq	$CHAR_PER_VEC, %rdx
+	orq	%rax, %rdx
+# endif
+
+	/* first_vec_x3 for strlen-ZMM and first_vec_x2 for strlen-YMM.
+	 */
+	bsfq	%rdx, %rdx
+	leaq	(FALLTHROUGH_RETURN_OFFSET - CHAR_PER_VEC * 4)(%rsi, %rdx), %rax
+	cmpq	%rax, %rcx
+	cmovb	%rcx, %rax
+	ret
+
+	/* Handle last 4x VEC after loop. All VECs have been loaded.  */
+	.p2align 4,, 4
+L(loop_len_end):
+	KORTEST %k0, %k2
+	jnz	L(loop_last_4x_vec)
+	movq	%rsi, %rax
+	ret
+
+
+# if CHAR_PER_VEC == 64
+	/* Since we can't combine the last 2x VEC for VEC_SIZE == 64 we
+	   need a return label for it.  */
+	.p2align 4,, 8
+L(last_vec_x2):
+	bsf	%VRDX, %VRDX
+	leaq	(CHAR_PER_VEC * -2)(%rsi, %rdx), %rax
+	cmpq	%rax, %rcx
+	cmovb	%rcx, %rax
+	ret
+# endif
+
+
+	.p2align 4,, 10
+L(last_vec_x1):
+	addq	$CHAR_PER_VEC, %rsi
+L(last_vec_x0):
+	bsf	%VRDX, %VRDX
+	leaq	(CHAR_PER_VEC * -4)(%rsi, %rdx), %rax
+	cmpq	%rax, %rcx
+	cmovb	%rcx, %rax
+	ret
+
+
+	.p2align 4,, 8
+L(cross_page_boundary):
+	/* Align data to VEC_SIZE.  */
+	movq	%rdi, %rcx
+	andq	$-VEC_SIZE, %rcx
+	VPCMPEQ	(%rcx), %VZERO, %k0
+
+	KMOV	%k0, %VRCX
+# ifdef USE_AS_WCSLEN
+	shrl	$2, %eax
+	andl	$(CHAR_PER_VEC - 1), %eax
+# endif
+	shrx	%VRAX, %VRCX, %VRCX
+
+	negl	%eax
+	andl	$(CHAR_PER_VEC - 1), %eax
+	movq	%rsi, %rdx
+	bsf	%VRCX, %VRDX
+	cmpq	%rax, %rdx
+	ja	L(cross_page_continue)
+	movl	%edx, %eax
+	cmpq	%rdx, %rsi
+	cmovb	%esi, %eax
+	ret
+END (STRNLEN)
+#endif
diff --git a/sysdeps/x86_64/multiarch/wcsnlen-evex.S b/sysdeps/x86_64/multiarch/wcsnlen-evex.S
index e2aad94c1e..57a7e93fbf 100644
--- a/sysdeps/x86_64/multiarch/wcsnlen-evex.S
+++ b/sysdeps/x86_64/multiarch/wcsnlen-evex.S
@@ -2,8 +2,7 @@
 # define WCSNLEN	__wcsnlen_evex
 #endif
 
-#define STRLEN	WCSNLEN
+#define STRNLEN	WCSNLEN
 #define USE_AS_WCSLEN 1
-#define USE_AS_STRNLEN 1
 
-#include "strlen-evex.S"
+#include "strnlen-evex.S"
-- 
2.34.1


^ permalink raw reply	[flat|nested] 41+ messages in thread

* [PATCH v1 4/7] x86: Optimize memrchr-evex.S
  2022-10-18  2:48 [PATCH v1 1/7] x86: Optimize memchr-evex.S and implement with VMM headers Noah Goldstein
  2022-10-18  2:48 ` [PATCH v1 2/7] x86: Shrink / minorly optimize strchr-evex " Noah Goldstein
  2022-10-18  2:48 ` [PATCH v1 3/7] x86: Optimize strnlen-evex.S " Noah Goldstein
@ 2022-10-18  2:48 ` Noah Goldstein
  2022-10-18  2:51   ` Noah Goldstein
  2022-10-18  2:48 ` [PATCH v1 5/7] x86: Optimize strrchr-evex.S and implement with VMM headers Noah Goldstein
                   ` (5 subsequent siblings)
  8 siblings, 1 reply; 41+ messages in thread
From: Noah Goldstein @ 2022-10-18  2:48 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

Optimizations are:
1. Use the fact that lzcnt(0) -> VEC_SIZE for memrchr to save a branch
   in short string case (see the sketch after this list).
2. Save several instructions in len = [VEC_SIZE, 4 * VEC_SIZE] case.
3. Use more code-size efficient instructions.
	- tzcnt ...     -> bsf ...
	- vpcmpb $0 ... -> vpcmpeq ...
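
A minimal sketch of point 1 for the VEC_SIZE == 32 case (register names
are illustrative; the new code uses the VR* macros): lzcnt of a zero
32-bit mask returns 32 == VEC_SIZE, so a single compare against the
remaining length covers both "no match" and "match out of bounds"
without a separate test of the mask:

	lzcnt	%ecx, %esi	/* esi = VEC_SIZE when the mask is 0.  */
	cmpq	%rsi, %rdx	/* len <= lzcnt => no in-bounds match,  */
	jbe	L(zero)		/* so return NULL.  */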

Code Size Changes:
memrchr-evex.S      :  -29 bytes

Net perf changes:

Reported as geometric mean of all improvements / regressions from N=10
runs of the benchtests. Value is New Time / Old Time, so < 1.0 is an
improvement and > 1.0 is a regression.

memrchr-evex.S      : 0.949 (Mostly from improvements in small strings)

Full results attached in email.

Full check passes on x86-64.
---
 sysdeps/x86_64/multiarch/memrchr-evex.S | 538 ++++++++++++++----------
 1 file changed, 324 insertions(+), 214 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/memrchr-evex.S b/sysdeps/x86_64/multiarch/memrchr-evex.S
index 550b328c5a..dbcf52808f 100644
--- a/sysdeps/x86_64/multiarch/memrchr-evex.S
+++ b/sysdeps/x86_64/multiarch/memrchr-evex.S
@@ -21,17 +21,19 @@
 #if ISA_SHOULD_BUILD (4)
 
 # include <sysdep.h>
-# include "x86-evex256-vecs.h"
-# if VEC_SIZE != 32
-#  error "VEC_SIZE != 32 unimplemented"
+
+# ifndef VEC_SIZE
+#  include "x86-evex256-vecs.h"
 # endif
 
+# include "reg-macros.h"
+
 # ifndef MEMRCHR
-#  define MEMRCHR				__memrchr_evex
+#  define MEMRCHR	__memrchr_evex
 # endif
 
-# define PAGE_SIZE			4096
-# define VMMMATCH			VMM(0)
+# define PAGE_SIZE	4096
+# define VMATCH	VMM(0)
 
 	.section SECTION(.text), "ax", @progbits
 ENTRY_P2ALIGN(MEMRCHR, 6)
@@ -43,294 +45,402 @@ ENTRY_P2ALIGN(MEMRCHR, 6)
 # endif
 	jz	L(zero_0)
 
-	/* Get end pointer. Minus one for two reasons. 1) It is necessary for a
-	   correct page cross check and 2) it correctly sets up end ptr to be
-	   subtract by lzcnt aligned.  */
+	/* Get end pointer.  Minus one for three reasons: 1) it is
+	   necessary for a correct page cross check, 2) it correctly
+	   sets up the end ptr for the lzcnt subtraction, and 3) it is
+	   a necessary step in aligning the ptr.  */
 	leaq	-1(%rdi, %rdx), %rax
-	vpbroadcastb %esi, %VMMMATCH
+	vpbroadcastb %esi, %VMATCH
 
 	/* Check if we can load 1x VEC without cross a page.  */
 	testl	$(PAGE_SIZE - VEC_SIZE), %eax
 	jz	L(page_cross)
 
-	/* Don't use rax for pointer here because EVEX has better encoding with
-	   offset % VEC_SIZE == 0.  */
-	vpcmpb	$0, -(VEC_SIZE)(%rdi, %rdx), %VMMMATCH, %k0
-	kmovd	%k0, %ecx
-
-	/* Fall through for rdx (len) <= VEC_SIZE (expect small sizes).  */
-	cmpq	$VEC_SIZE, %rdx
-	ja	L(more_1x_vec)
-L(ret_vec_x0_test):
-
-	/* If ecx is zero (no matches) lzcnt will set it 32 (VEC_SIZE) which
-	   will guarantee edx (len) is less than it.  */
-	lzcntl	%ecx, %ecx
-	cmpl	%ecx, %edx
-	jle	L(zero_0)
-	subq	%rcx, %rax
+	/* Don't use rax for pointer here because EVEX has better
+	   encoding with offset % VEC_SIZE == 0.  */
+	vpcmpeqb (VEC_SIZE * -1)(%rdi, %rdx), %VMATCH, %k0
+	KMOV	%k0, %VRCX
+
+	/* If rcx is zero then lzcnt -> VEC_SIZE.  NB: there is already
+	   a dependency between rcx and rsi so no worries about
+	   false-dep here.  */
+	lzcnt	%VRCX, %VRSI
+	/* If rdx <= rsi then either 1) rcx was non-zero (there was a
+	   match) but it was out of bounds or 2) rcx was zero and rdx
+	   was <= VEC_SIZE so we are done scanning.  */
+	cmpq	%rsi, %rdx
+	/* NB: Use branch to return zero/non-zero.  Common usage will
+	   branch on result of function (if return is null/non-null).
+	   This branch can be used to predict the ensuing one so there
+	   is no reason to extend the data-dependency with cmovcc.  */
+	jbe	L(zero_0)
+
+	/* If rcx is zero then len must be > VEC_SIZE, otherwise since
+	   we already tested len vs lzcnt(rcx) (in rsi) we are good to
+	   return this match.  */
+	test	%VRCX, %VRCX
+	jz	L(more_1x_vec)
+	subq	%rsi, %rax
 	ret
 
-	/* Fits in aligning bytes of first cache line.  */
+	/* Fits in aligning bytes of first cache line for VEC_SIZE ==
+	   32.  */
+# if VEC_SIZE == 32
+	.p2align 4,, 2
 L(zero_0):
 	xorl	%eax, %eax
 	ret
-
-	.p2align 4,, 9
-L(ret_vec_x0_dec):
-	decq	%rax
-L(ret_vec_x0):
-	lzcntl	%ecx, %ecx
-	subq	%rcx, %rax
-	ret
+# endif
 
 	.p2align 4,, 10
 L(more_1x_vec):
-	testl	%ecx, %ecx
-	jnz	L(ret_vec_x0)
-
 	/* Align rax (pointer to string).  */
 	andq	$-VEC_SIZE, %rax
-
+L(page_cross_continue):
 	/* Recompute length after aligning.  */
-	movq	%rax, %rdx
+	subq	%rdi, %rax
 
-	/* Need no matter what.  */
-	vpcmpb	$0, -(VEC_SIZE)(%rax), %VMMMATCH, %k0
-	kmovd	%k0, %ecx
-
-	subq	%rdi, %rdx
-
-	cmpq	$(VEC_SIZE * 2), %rdx
+	cmpq	$(VEC_SIZE * 2), %rax
 	ja	L(more_2x_vec)
+
 L(last_2x_vec):
+	vpcmpeqb (VEC_SIZE * -1)(%rdi, %rax), %VMATCH, %k0
+	KMOV	%k0, %VRCX
 
-	/* Must dec rax because L(ret_vec_x0_test) expects it.  */
-	decq	%rax
-	cmpl	$VEC_SIZE, %edx
-	jbe	L(ret_vec_x0_test)
+	test	%VRCX, %VRCX
+	jnz	L(ret_vec_x0_test)
 
-	testl	%ecx, %ecx
-	jnz	L(ret_vec_x0)
+	/* If VEC_SIZE == 64 we need to subtract because lzcntq won't
+	   implicitly add VEC_SIZE to the match position.  */
+# if VEC_SIZE == 64
+	subl	$VEC_SIZE, %eax
+# else
+	cmpb	$VEC_SIZE, %al
+# endif
+	jle	L(zero_2)
 
-	/* Don't use rax for pointer here because EVEX has better encoding with
-	   offset % VEC_SIZE == 0.  */
-	vpcmpb	$0, -(VEC_SIZE * 2)(%rdi, %rdx), %VMMMATCH, %k0
-	kmovd	%k0, %ecx
-	/* NB: 64-bit lzcnt. This will naturally add 32 to position.  */
+	/* We adjusted rax (length) for VEC_SIZE == 64 so we need
+	   separate offsets.  */
+# if VEC_SIZE == 64
+	vpcmpeqb (VEC_SIZE * -1)(%rdi, %rax), %VMATCH, %k0
+# else
+	vpcmpeqb (VEC_SIZE * -2)(%rdi, %rax), %VMATCH, %k0
+# endif
+	KMOV	%k0, %VRCX
+	/* NB: 64-bit lzcnt. This will naturally add 32 to position for
+	   VEC_SIZE == 32.  */
 	lzcntq	%rcx, %rcx
-	cmpl	%ecx, %edx
-	jle	L(zero_0)
-	subq	%rcx, %rax
-	ret
-
-	/* Inexpensive place to put this regarding code size / target alignments
-	   / ICache NLP. Necessary for 2-byte encoding of jump to page cross
-	   case which in turn is necessary for hot path (len <= VEC_SIZE) to fit
-	   in first cache line.  */
-L(page_cross):
-	movq	%rax, %rsi
-	andq	$-VEC_SIZE, %rsi
-	vpcmpb	$0, (%rsi), %VMMMATCH, %k0
-	kmovd	%k0, %r8d
-	/* Shift out negative alignment (because we are starting from endptr and
-	   working backwards).  */
-	movl	%eax, %ecx
-	/* notl because eax already has endptr - 1.  (-x = ~(x - 1)).  */
-	notl	%ecx
-	shlxl	%ecx, %r8d, %ecx
-	cmpq	%rdi, %rsi
-	ja	L(more_1x_vec)
-	lzcntl	%ecx, %ecx
-	cmpl	%ecx, %edx
-	jle	L(zero_1)
-	subq	%rcx, %rax
+	subl	%ecx, %eax
+	ja	L(first_vec_x1_ret)
+	/* If VEC_SIZE == 64 put L(zero_0) here as it can't fit in the
+	   first cache line (this is the second cache line).  */
+# if VEC_SIZE == 64
+L(zero_0):
+# endif
+L(zero_2):
+	xorl	%eax, %eax
 	ret
 
-	/* Continue creating zero labels that fit in aligning bytes and get
-	   2-byte encoding / are in the same cache line as condition.  */
-L(zero_1):
-	xorl	%eax, %eax
+	/* NB: Fits in aligning bytes before next cache line for
+	   VEC_SIZE == 32.  For VEC_SIZE == 64 this is attached to
+	   L(ret_vec_x0_test).  */
+# if VEC_SIZE == 32
+L(first_vec_x1_ret):
+	leaq	-1(%rdi, %rax), %rax
 	ret
+# endif
 
-	.p2align 4,, 8
-L(ret_vec_x1):
-	/* This will naturally add 32 to position.  */
-	bsrl	%ecx, %ecx
-	leaq	-(VEC_SIZE * 2)(%rcx, %rax), %rax
+	.p2align 4,, 6
+L(ret_vec_x0_test):
+	lzcnt	%VRCX, %VRCX
+	subl	%ecx, %eax
+	jle	L(zero_2)
+# if VEC_SIZE == 64
+	/* Reuse code at the end of L(ret_vec_x0_test) as we can't fit
+	   L(first_vec_x1_ret) in the same cache line as its jmp base
+	   so we might as well save code size.  */
+L(first_vec_x1_ret):
+# endif
+	leaq	-1(%rdi, %rax), %rax
 	ret
 
-	.p2align 4,, 8
+	.p2align 4,, 6
+L(loop_last_4x_vec):
+	/* Compute remaining length.  */
+	subl	%edi, %eax
+L(last_4x_vec):
+	cmpl	$(VEC_SIZE * 2), %eax
+	jle	L(last_2x_vec)
+# if VEC_SIZE == 32
+	/* Only align for VEC_SIZE == 32.  For VEC_SIZE == 64 we need
+	   the spare bytes to align the loop properly.  */
+	.p2align 4,, 10
+# endif
 L(more_2x_vec):
-	testl	%ecx, %ecx
-	jnz	L(ret_vec_x0_dec)
 
-	vpcmpb	$0, -(VEC_SIZE * 2)(%rax), %VMMMATCH, %k0
-	kmovd	%k0, %ecx
-	testl	%ecx, %ecx
-	jnz	L(ret_vec_x1)
+	/* Length > VEC_SIZE * 2 so check the first 2x VEC for match and
+	   return if either hit.  */
+	vpcmpeqb (VEC_SIZE * -1)(%rdi, %rax), %VMATCH, %k0
+	KMOV	%k0, %VRCX
+
+	test	%VRCX, %VRCX
+	jnz	L(first_vec_x0)
+
+	vpcmpeqb (VEC_SIZE * -2)(%rdi, %rax), %VMATCH, %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(first_vec_x1)
 
 	/* Need no matter what.  */
-	vpcmpb	$0, -(VEC_SIZE * 3)(%rax), %VMMMATCH, %k0
-	kmovd	%k0, %ecx
+	vpcmpeqb (VEC_SIZE * -3)(%rdi, %rax), %VMATCH, %k0
+	KMOV	%k0, %VRCX
 
-	subq	$(VEC_SIZE * 4), %rdx
+	/* Check if we are near the end.  */
+	subq	$(VEC_SIZE * 4), %rax
 	ja	L(more_4x_vec)
 
-	cmpl	$(VEC_SIZE * -1), %edx
-	jle	L(ret_vec_x2_test)
-L(last_vec):
-	testl	%ecx, %ecx
-	jnz	L(ret_vec_x2)
+	test	%VRCX, %VRCX
+	jnz	L(first_vec_x2_test)
 
+	/* Adjust length for final check and check if we are at the end.
+	 */
+	addl	$(VEC_SIZE * 1), %eax
+	jle	L(zero_1)
 
-	/* Need no matter what.  */
-	vpcmpb	$0, -(VEC_SIZE * 4)(%rax), %VMMMATCH, %k0
-	kmovd	%k0, %ecx
-	lzcntl	%ecx, %ecx
-	subq	$(VEC_SIZE * 3 + 1), %rax
-	subq	%rcx, %rax
-	cmpq	%rax, %rdi
-	ja	L(zero_1)
+	vpcmpeqb (VEC_SIZE * -1)(%rdi, %rax), %VMATCH, %k0
+	KMOV	%k0, %VRCX
+
+	lzcnt	%VRCX, %VRCX
+	subl	%ecx, %eax
+	ja	L(first_vec_x3_ret)
+L(zero_1):
+	xorl	%eax, %eax
+	ret
+L(first_vec_x3_ret):
+	leaq	-1(%rdi, %rax), %rax
 	ret
 
-	.p2align 4,, 8
-L(ret_vec_x2_test):
-	lzcntl	%ecx, %ecx
-	subq	$(VEC_SIZE * 2 + 1), %rax
-	subq	%rcx, %rax
-	cmpq	%rax, %rdi
-	ja	L(zero_1)
+	.p2align 4,, 6
+L(first_vec_x2_test):
+	/* Must adjust length before check.  */
+	subl	$-(VEC_SIZE * 2 - 1), %eax
+	lzcnt	%VRCX, %VRCX
+	subl	%ecx, %eax
+	jl	L(zero_4)
+	addq	%rdi, %rax
 	ret
 
-	.p2align 4,, 8
-L(ret_vec_x2):
-	bsrl	%ecx, %ecx
-	leaq	-(VEC_SIZE * 3)(%rcx, %rax), %rax
+
+	.p2align 4,, 10
+L(first_vec_x0):
+	bsr	%VRCX, %VRCX
+	leaq	(VEC_SIZE * -1)(%rdi, %rax), %rax
+	addq	%rcx, %rax
 	ret
 
-	.p2align 4,, 8
-L(ret_vec_x3):
-	bsrl	%ecx, %ecx
-	leaq	-(VEC_SIZE * 4)(%rcx, %rax), %rax
+	/* Fits unobtrusively here.  */
+L(zero_4):
+	xorl	%eax, %eax
+	ret
+
+	.p2align 4,, 10
+L(first_vec_x1):
+	bsr	%VRCX, %VRCX
+	leaq	(VEC_SIZE * -2)(%rdi, %rax), %rax
+	addq	%rcx, %rax
 	ret
 
 	.p2align 4,, 8
+L(first_vec_x3):
+	bsr	%VRCX, %VRCX
+	addq	%rdi, %rax
+	addq	%rcx, %rax
+	ret
+
+	.p2align 4,, 6
+L(first_vec_x2):
+	bsr	%VRCX, %VRCX
+	leaq	(VEC_SIZE * 1)(%rdi, %rax), %rax
+	addq	%rcx, %rax
+	ret
+
+	.p2align 4,, 2
 L(more_4x_vec):
-	testl	%ecx, %ecx
-	jnz	L(ret_vec_x2)
+	test	%VRCX, %VRCX
+	jnz	L(first_vec_x2)
 
-	vpcmpb	$0, -(VEC_SIZE * 4)(%rax), %VMMMATCH, %k0
-	kmovd	%k0, %ecx
+	vpcmpeqb (%rdi, %rax), %VMATCH, %k0
+	KMOV	%k0, %VRCX
 
-	testl	%ecx, %ecx
-	jnz	L(ret_vec_x3)
+	test	%VRCX, %VRCX
+	jnz	L(first_vec_x3)
 
 	/* Check if near end before re-aligning (otherwise might do an
 	   unnecessary loop iteration).  */
-	addq	$-(VEC_SIZE * 4), %rax
-	cmpq	$(VEC_SIZE * 4), %rdx
+	cmpq	$(VEC_SIZE * 4), %rax
 	jbe	L(last_4x_vec)
 
-	decq	%rax
-	andq	$-(VEC_SIZE * 4), %rax
-	movq	%rdi, %rdx
-	/* Get endptr for loop in rdx. NB: Can't just do while rax > rdi because
-	   lengths that overflow can be valid and break the comparison.  */
-	andq	$-(VEC_SIZE * 4), %rdx
+
+	/* NB: We set up the loop to NOT use index-address-mode for the
+	   buffer.  This costs some instructions & code size but avoids
+	   stalls from unlaminated micro-fused instructions (as used in
+	   the loop) being forced to issue in the same group
+	   (essentially narrowing the backend width).  */
+
+	/* Get endptr for loop in rdx. NB: Can't just do while rax > rdi
+	   because lengths that overflow can be valid and break the
+	   comparison.  */
+# if VEC_SIZE == 64
+	/* Use rdx as an intermediate to compute rax; this gets us imm8
+	   encoding, which just allows the L(more_4x_vec) block to fit
+	   in 1 cache-line.  */
+	leaq	(VEC_SIZE * 4)(%rdi), %rdx
+	leaq	(VEC_SIZE * -1)(%rdx, %rax), %rax
+
+	/* No evex machine has partial register stalls. This can be
+	   replaced with: `andq $(VEC_SIZE * -4), %rax/%rdx` if that
+	   changes.  */
+	xorb	%al, %al
+	xorb	%dl, %dl
+# else
+	leaq	(VEC_SIZE * 3)(%rdi, %rax), %rax
+	andq	$(VEC_SIZE * -4), %rax
+	leaq	(VEC_SIZE * 4)(%rdi), %rdx
+	andq	$(VEC_SIZE * -4), %rdx
+# endif
+
 
 	.p2align 4
 L(loop_4x_vec):
-	/* Store 1 were not-equals and 0 where equals in k1 (used to mask later
-	   on).  */
-	vpcmpb	$4, (VEC_SIZE * 3)(%rax), %VMMMATCH, %k1
+	/* NB: We could do the same optimization here as we do for
+	   memchr/rawmemchr by using VEX encoding in the loop for access
+	   to VEX vpcmpeqb + vpternlogd.  Since memrchr is not as hot as
+	   memchr it may not be worth the extra code size, but if the
+	   need arises it is an easy ~15% perf improvement to the loop.
+	 */
+
+	cmpq	%rdx, %rax
+	je	L(loop_last_4x_vec)
+	/* Store 1 where not-equals and 0 where equals in k1 (used to
+	   mask later on).  */
+	vpcmpb	$4, (VEC_SIZE * -1)(%rax), %VMATCH, %k1
 
 	/* VEC(2/3) will have zero-byte where we found a CHAR.  */
-	vpxorq	(VEC_SIZE * 2)(%rax), %VMMMATCH, %VMM(2)
-	vpxorq	(VEC_SIZE * 1)(%rax), %VMMMATCH, %VMM(3)
-	vpcmpb	$0, (VEC_SIZE * 0)(%rax), %VMMMATCH, %k4
+	vpxorq	(VEC_SIZE * -2)(%rax), %VMATCH, %VMM(2)
+	vpxorq	(VEC_SIZE * -3)(%rax), %VMATCH, %VMM(3)
+	vpcmpeqb (VEC_SIZE * -4)(%rax), %VMATCH, %k4
 
-	/* Combine VEC(2/3) with min and maskz with k1 (k1 has zero bit where
-	   CHAR is found and VEC(2/3) have zero-byte where CHAR is found.  */
+	/* Combine VEC(2/3) with min and maskz with k1 (k1 has a zero
+	   bit where CHAR is found and VEC(2/3) have a zero-byte where
+	   CHAR is found).  */
 	vpminub	%VMM(2), %VMM(3), %VMM(3){%k1}{z}
 	vptestnmb %VMM(3), %VMM(3), %k2
 
-	/* Any 1s and we found CHAR.  */
-	kortestd %k2, %k4
-	jnz	L(loop_end)
-
 	addq	$-(VEC_SIZE * 4), %rax
-	cmpq	%rdx, %rax
-	jne	L(loop_4x_vec)
 
-	/* Need to re-adjust rdx / rax for L(last_4x_vec).  */
-	subq	$-(VEC_SIZE * 4), %rdx
-	movq	%rdx, %rax
-	subl	%edi, %edx
-L(last_4x_vec):
+	/* Any 1s and we found CHAR.  */
+	KORTEST %k2, %k4
+	jz	L(loop_4x_vec)
+
 
-	/* Used no matter what.  */
-	vpcmpb	$0, (VEC_SIZE * -1)(%rax), %VMMMATCH, %k0
-	kmovd	%k0, %ecx
+	/* K1 has non-matches for first VEC.  inc; jz will overflow rcx
+	   iff all bytes were non-matches.  */
+	KMOV	%k1, %VRCX
+	inc	%VRCX
+	jnz	L(first_vec_x0_end)
 
-	cmpl	$(VEC_SIZE * 2), %edx
-	jbe	L(last_2x_vec)
+	vptestnmb %VMM(2), %VMM(2), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(first_vec_x1_end)
+	KMOV	%k2, %VRCX
+
+	/* Separate logic for VEC_SIZE == 64 and VEC_SIZE == 32 for
+	   returning last 2x VEC. For VEC_SIZE == 64 we test each VEC
+	   individually, for VEC_SIZE == 32 we combine them in a single
+	   64-bit GPR.  */
+# if VEC_SIZE == 64
+	test	%VRCX, %VRCX
+	jnz	L(first_vec_x2_end)
+	KMOV	%k4, %VRCX
+# else
+	/* Combine last 2 VEC matches for VEC_SIZE == 32.  If rcx (from
+	   VEC(3)) is zero (no CHAR in VEC(3)) then it won't affect the
+	   result in rsi (from VEC(4)).  If rcx is non-zero then there
+	   is a CHAR in VEC(3) and bsrq will use that position.  */
+	KMOV	%k4, %VRSI
+	salq	$32, %rcx
+	orq	%rsi, %rcx
+# endif
+	bsrq	%rcx, %rcx
+	addq	%rcx, %rax
+	ret
 
-	testl	%ecx, %ecx
-	jnz	L(ret_vec_x0_dec)
+	.p2align 4,, 4
+L(first_vec_x0_end):
+	/* rcx has 1s at non-matches so we need to `not` it.  We used
+	   `inc` to test for zero so use `neg` to complete the `not`, so
+	   the last 1 bit represents a match.  NB: (-(x + 1) == ~x).  */
+	neg	%VRCX
+	bsr	%VRCX, %VRCX
+	leaq	(VEC_SIZE * 3)(%rcx, %rax), %rax
+	ret
 
+	.p2align 4,, 10
+L(first_vec_x1_end):
+	bsr	%VRCX, %VRCX
+	leaq	(VEC_SIZE * 2)(%rcx, %rax), %rax
+	ret
 
-	vpcmpb	$0, (VEC_SIZE * -2)(%rax), %VMMMATCH, %k0
-	kmovd	%k0, %ecx
+# if VEC_SIZE == 64
+	/* Since we can't combine the last 2x VEC for VEC_SIZE == 64 we
+	   need a return label for it.  */
+	.p2align 4,, 4
+L(first_vec_x2_end):
+	bsr	%VRCX, %VRCX
+	leaq	(VEC_SIZE * 1)(%rcx, %rax), %rax
+	ret
+# endif
 
-	testl	%ecx, %ecx
-	jnz	L(ret_vec_x1)
 
-	/* Used no matter what.  */
-	vpcmpb	$0, (VEC_SIZE * -3)(%rax), %VMMMATCH, %k0
-	kmovd	%k0, %ecx
+	.p2align 4,, 4
+L(page_cross):
+	/* Only the lower bits eax[log2(VEC_SIZE):0] are set, so we can
+	   use movzbl to get the number of bytes we are checking here.
+	 */
+	movzbl	%al, %ecx
+	andq	$-VEC_SIZE, %rax
+	vpcmpeqb (%rax), %VMATCH, %k0
+	KMOV	%k0, %VRSI
 
-	cmpl	$(VEC_SIZE * 3), %edx
-	ja	L(last_vec)
+	/* eax was computed as %rdi + %rdx - 1 so we need to add back 1
+	   here.  */
+	leal	1(%rcx), %r8d
 
-	lzcntl	%ecx, %ecx
-	subq	$(VEC_SIZE * 2 + 1), %rax
-	subq	%rcx, %rax
-	cmpq	%rax, %rdi
-	jbe	L(ret_1)
+	/* Invert ecx to get the shift count for shifting out byte
+	   matches that are out of range.  */
+	notl	%ecx
+	shlx	%VRCX, %VRSI, %VRSI
+
+	/* If rdx <= r8 then the entire [buf, buf + len] range is
+	   handled in this page cross case.  NB: we can't use the trick
+	   here that we use in the non page-cross case because we
+	   aren't checking a full VEC_SIZE.  */
+	cmpq	%r8, %rdx
+	ja	L(page_cross_check)
+	lzcnt	%VRSI, %VRSI
+	subl	%esi, %edx
+	ja	L(page_cross_ret)
 	xorl	%eax, %eax
-L(ret_1):
 	ret
 
-	.p2align 4,, 6
-L(loop_end):
-	kmovd	%k1, %ecx
-	notl	%ecx
-	testl	%ecx, %ecx
-	jnz	L(ret_vec_x0_end)
+L(page_cross_check):
+	test	%VRSI, %VRSI
+	jz	L(page_cross_continue)
 
-	vptestnmb %VMM(2), %VMM(2), %k0
-	kmovd	%k0, %ecx
-	testl	%ecx, %ecx
-	jnz	L(ret_vec_x1_end)
-
-	kmovd	%k2, %ecx
-	kmovd	%k4, %esi
-	/* Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3)
-	   then it won't affect the result in esi (VEC4). If ecx is non-zero
-	   then CHAR in VEC3 and bsrq will use that position.  */
-	salq	$32, %rcx
-	orq	%rsi, %rcx
-	bsrq	%rcx, %rcx
-	addq	%rcx, %rax
-	ret
-	.p2align 4,, 4
-L(ret_vec_x0_end):
-	addq	$(VEC_SIZE), %rax
-L(ret_vec_x1_end):
-	bsrl	%ecx, %ecx
-	leaq	(VEC_SIZE * 2)(%rax, %rcx), %rax
+	lzcnt	%VRSI, %VRSI
+	subl	%esi, %edx
+L(page_cross_ret):
+	leaq	-1(%rdi, %rdx), %rax
 	ret
-
 END(MEMRCHR)
 #endif
-- 
2.34.1


^ permalink raw reply	[flat|nested] 41+ messages in thread

* [PATCH v1 5/7] x86: Optimize strrchr-evex.S and implement with VMM headers
  2022-10-18  2:48 [PATCH v1 1/7] x86: Optimize memchr-evex.S and implement with VMM headers Noah Goldstein
                   ` (2 preceding siblings ...)
  2022-10-18  2:48 ` [PATCH v1 4/7] x86: Optimize memrchr-evex.S Noah Goldstein
@ 2022-10-18  2:48 ` Noah Goldstein
  2022-10-18  2:52   ` Noah Goldstein
  2022-10-18  2:49 ` [PATCH v1 6/7] x86: Add support for VEC_SIZE == 64 in strcmp-evex.S impl Noah Goldstein
                   ` (4 subsequent siblings)
  8 siblings, 1 reply; 41+ messages in thread
From: Noah Goldstein @ 2022-10-18  2:48 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

Optimization is:
1. Cache the latest result in the "fast path" loop with `vmovdqu` instead
   of `kunpckdq`.  This helps if there is more than one match (sketched
   below).
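
A rough sketch of the idea (not the patch's exact loop; the register
numbers here are illustrative): instead of merging each iteration's
match masks with kunpck/kmov as matches are found, the loop saves the
most recent VEC known to contain a match with a single vmovdqu and does
the mask extraction once after the loop:

	/* In the fast-path loop, when the current VEC has a match:  */
	vmovdqu	%ymm2, %ymm7	/* Cache the latest VEC with a match.  */
	movq	%rdi, %r8	/* And remember where it came from.  */
	/* ... after the loop, one compare + kmov + bsr over the cached
	   VEC recovers the position of the last match.  */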

Code Size Changes:
strrchr-evex.S       :  +30 bytes (Same number of cache lines)

Net perf changes:

Reported as geometric mean of all improvements / regressions from N=10
runs of the benchtests. Value is New Time / Old Time, so < 1.0 is an
improvement and > 1.0 is a regression.

strrchr-evex.S       : 0.932 (From cases with higher match frequency)

Full results attached in email.

Full check passes on x86-64.
---
 sysdeps/x86_64/multiarch/strrchr-evex.S | 371 +++++++++++++-----------
 1 file changed, 200 insertions(+), 171 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/strrchr-evex.S b/sysdeps/x86_64/multiarch/strrchr-evex.S
index 992b45fb47..45487dc87a 100644
--- a/sysdeps/x86_64/multiarch/strrchr-evex.S
+++ b/sysdeps/x86_64/multiarch/strrchr-evex.S
@@ -26,25 +26,30 @@
 #  define STRRCHR	__strrchr_evex
 # endif
 
-# define VMOVU	vmovdqu64
-# define VMOVA	vmovdqa64
+# include "x86-evex256-vecs.h"
 
 # ifdef USE_AS_WCSRCHR
-#  define SHIFT_REG	esi
-
-#  define kunpck	kunpckbw
+#  define RCX_M	cl
+#  define SHIFT_REG	rcx
+#  define VPCOMPRESS	vpcompressd
+#  define kunpck_2x	kunpckbw
 #  define kmov_2x	kmovd
 #  define maskz_2x	ecx
 #  define maskm_2x	eax
 #  define CHAR_SIZE	4
 #  define VPMIN	vpminud
 #  define VPTESTN	vptestnmd
+#  define VPTEST	vptestmd
 #  define VPBROADCAST	vpbroadcastd
+#  define VPCMPEQ	vpcmpeqd
 #  define VPCMP	vpcmpd
-# else
-#  define SHIFT_REG	edi
 
-#  define kunpck	kunpckdq
+#  define USE_WIDE_CHAR
+# else
+#  define RCX_M	ecx
+#  define SHIFT_REG	rdi
+#  define VPCOMPRESS	vpcompressb
+#  define kunpck_2x	kunpckdq
 #  define kmov_2x	kmovq
 #  define maskz_2x	rcx
 #  define maskm_2x	rax
@@ -52,58 +57,48 @@
 #  define CHAR_SIZE	1
 #  define VPMIN	vpminub
 #  define VPTESTN	vptestnmb
+#  define VPTEST	vptestmb
 #  define VPBROADCAST	vpbroadcastb
+#  define VPCMPEQ	vpcmpeqb
 #  define VPCMP	vpcmpb
 # endif
 
-# define XMMZERO	xmm16
-# define YMMZERO	ymm16
-# define YMMMATCH	ymm17
-# define YMMSAVE	ymm18
+# include "reg-macros.h"
 
-# define YMM1	ymm19
-# define YMM2	ymm20
-# define YMM3	ymm21
-# define YMM4	ymm22
-# define YMM5	ymm23
-# define YMM6	ymm24
-# define YMM7	ymm25
-# define YMM8	ymm26
-
-
-# define VEC_SIZE	32
+# define VMATCH	VMM(0)
+# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
 # define PAGE_SIZE	4096
-	.section .text.evex, "ax", @progbits
-ENTRY(STRRCHR)
+
+	.section SECTION(.text), "ax", @progbits
+ENTRY_P2ALIGN(STRRCHR, 6)
 	movl	%edi, %eax
-	/* Broadcast CHAR to YMMMATCH.  */
-	VPBROADCAST %esi, %YMMMATCH
+	/* Broadcast CHAR to VMATCH.  */
+	VPBROADCAST %esi, %VMATCH
 
 	andl	$(PAGE_SIZE - 1), %eax
 	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
 	jg	L(cross_page_boundary)
 
-L(page_cross_continue):
-	VMOVU	(%rdi), %YMM1
-	/* k0 has a 1 for each zero CHAR in YMM1.  */
-	VPTESTN	%YMM1, %YMM1, %k0
-	kmovd	%k0, %ecx
-	testl	%ecx, %ecx
+	VMOVU	(%rdi), %VMM(1)
+	/* k0 has a 1 for each zero CHAR in VEC(1).  */
+	VPTESTN	%VMM(1), %VMM(1), %k0
+	KMOV	%k0, %VRSI
+	test	%VRSI, %VRSI
 	jz	L(aligned_more)
 	/* fallthrough: zero CHAR in first VEC.  */
-
-	/* K1 has a 1 for each search CHAR match in YMM1.  */
-	VPCMP	$0, %YMMMATCH, %YMM1, %k1
-	kmovd	%k1, %eax
+L(page_cross_return):
+	/* K1 has a 1 for each search CHAR match in VEC(1).  */
+	VPCMPEQ	%VMATCH, %VMM(1), %k1
+	KMOV	%k1, %VRAX
 	/* Build mask up until first zero CHAR (used to mask of
 	   potential search CHAR matches past the end of the string).
 	 */
-	blsmskl	%ecx, %ecx
-	andl	%ecx, %eax
+	blsmsk	%VRSI, %VRSI
+	and	%VRSI, %VRAX
 	jz	L(ret0)
-	/* Get last match (the `andl` removed any out of bounds
-	   matches).  */
-	bsrl	%eax, %eax
+	/* Get last match (the `and` removed any out of bounds matches).
+	 */
+	bsr	%VRAX, %VRAX
 # ifdef USE_AS_WCSRCHR
 	leaq	(%rdi, %rax, CHAR_SIZE), %rax
 # else
@@ -116,22 +111,22 @@ L(ret0):
 	   search path for earlier matches.  */
 	.p2align 4,, 6
 L(first_vec_x1):
-	VPCMP	$0, %YMMMATCH, %YMM2, %k1
-	kmovd	%k1, %eax
-	blsmskl	%ecx, %ecx
+	VPCMPEQ	%VMATCH, %VMM(2), %k1
+	KMOV	%k1, %VRAX
+	blsmsk	%VRCX, %VRCX
 	/* eax non-zero if search CHAR in range.  */
-	andl	%ecx, %eax
+	and	%VRCX, %VRAX
 	jnz	L(first_vec_x1_return)
 
-	/* fallthrough: no match in YMM2 then need to check for earlier
-	   matches (in YMM1).  */
+	/* fallthrough: no match in VEC(2) then need to check for
+	   earlier matches (in VEC(1)).  */
 	.p2align 4,, 4
 L(first_vec_x0_test):
-	VPCMP	$0, %YMMMATCH, %YMM1, %k1
-	kmovd	%k1, %eax
-	testl	%eax, %eax
+	VPCMPEQ	%VMATCH, %VMM(1), %k1
+	KMOV	%k1, %VRAX
+	test	%VRAX, %VRAX
 	jz	L(ret1)
-	bsrl	%eax, %eax
+	bsr	%VRAX, %VRAX
 # ifdef USE_AS_WCSRCHR
 	leaq	(%rsi, %rax, CHAR_SIZE), %rax
 # else
@@ -142,129 +137,144 @@ L(ret1):
 
 	.p2align 4,, 10
 L(first_vec_x1_or_x2):
-	VPCMP	$0, %YMM3, %YMMMATCH, %k3
-	VPCMP	$0, %YMM2, %YMMMATCH, %k2
+	VPCMPEQ	%VMM(3), %VMATCH, %k3
+	VPCMPEQ	%VMM(2), %VMATCH, %k2
 	/* K2 and K3 have 1 for any search CHAR match. Test if any
-	   matches between either of them. Otherwise check YMM1.  */
-	kortestd %k2, %k3
+	   matches between either of them. Otherwise check VEC(1).  */
+	KORTEST %k2, %k3
 	jz	L(first_vec_x0_test)
 
-	/* Guranteed that YMM2 and YMM3 are within range so merge the
-	   two bitmasks then get last result.  */
-	kunpck	%k2, %k3, %k3
-	kmovq	%k3, %rax
-	bsrq	%rax, %rax
-	leaq	(VEC_SIZE)(%r8, %rax, CHAR_SIZE), %rax
+	/* Guaranteed that VEC(2) and VEC(3) are within range so merge
+	   the two bitmasks then get last result.  */
+	kunpck_2x %k2, %k3, %k3
+	kmov_2x	%k3, %maskm_2x
+	bsr	%maskm_2x, %maskm_2x
+	leaq	(VEC_SIZE * 1)(%r8, %rax, CHAR_SIZE), %rax
 	ret
 
-	.p2align 4,, 6
+	.p2align 4,, 7
 L(first_vec_x3):
-	VPCMP	$0, %YMMMATCH, %YMM4, %k1
-	kmovd	%k1, %eax
-	blsmskl	%ecx, %ecx
-	/* If no search CHAR match in range check YMM1/YMM2/YMM3.  */
-	andl	%ecx, %eax
+	VPCMPEQ	%VMATCH, %VMM(4), %k1
+	KMOV	%k1, %VRAX
+	blsmsk	%VRCX, %VRCX
+	/* If no search CHAR match in range check VEC(1)/VEC(2)/VEC(3).
+	 */
+	and	%VRCX, %VRAX
 	jz	L(first_vec_x1_or_x2)
-	bsrl	%eax, %eax
+	bsr	%VRAX, %VRAX
 	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
 	ret
 
+
 	.p2align 4,, 6
 L(first_vec_x0_x1_test):
-	VPCMP	$0, %YMMMATCH, %YMM2, %k1
-	kmovd	%k1, %eax
-	/* Check YMM2 for last match first. If no match try YMM1.  */
-	testl	%eax, %eax
+	VPCMPEQ	%VMATCH, %VMM(2), %k1
+	KMOV	%k1, %VRAX
+	/* Check VEC(2) for last match first. If no match try VEC(1).
+	 */
+	test	%VRAX, %VRAX
 	jz	L(first_vec_x0_test)
 	.p2align 4,, 4
 L(first_vec_x1_return):
-	bsrl	%eax, %eax
+	bsr	%VRAX, %VRAX
 	leaq	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
 	ret
 
+
 	.p2align 4,, 10
 L(first_vec_x2):
-	VPCMP	$0, %YMMMATCH, %YMM3, %k1
-	kmovd	%k1, %eax
-	blsmskl	%ecx, %ecx
-	/* Check YMM3 for last match first. If no match try YMM2/YMM1.
-	 */
-	andl	%ecx, %eax
+	VPCMPEQ	%VMATCH, %VMM(3), %k1
+	KMOV	%k1, %VRAX
+	blsmsk	%VRCX, %VRCX
+	/* Check VEC(3) for last match first. If no match try
+	   VEC(2)/VEC(1).  */
+	and	%VRCX, %VRAX
 	jz	L(first_vec_x0_x1_test)
-	bsrl	%eax, %eax
+	bsr	%VRAX, %VRAX
 	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
 	ret
 
 
-	.p2align 4
+	.p2align 4,, 12
 L(aligned_more):
-	/* Need to keep original pointer incase YMM1 has last match.  */
+L(page_cross_continue):
+	/* Need to keep original pointer in case VEC(1) has last match.
+	 */
 	movq	%rdi, %rsi
 	andq	$-VEC_SIZE, %rdi
-	VMOVU	VEC_SIZE(%rdi), %YMM2
-	VPTESTN	%YMM2, %YMM2, %k0
-	kmovd	%k0, %ecx
-	testl	%ecx, %ecx
+
+	VMOVU	VEC_SIZE(%rdi), %VMM(2)
+	VPTESTN	%VMM(2), %VMM(2), %k0
+	KMOV	%k0, %VRCX
+
+	test	%VRCX, %VRCX
 	jnz	L(first_vec_x1)
 
-	VMOVU	(VEC_SIZE * 2)(%rdi), %YMM3
-	VPTESTN	%YMM3, %YMM3, %k0
-	kmovd	%k0, %ecx
-	testl	%ecx, %ecx
+	VMOVU	(VEC_SIZE * 2)(%rdi), %VMM(3)
+	VPTESTN	%VMM(3), %VMM(3), %k0
+	KMOV	%k0, %VRCX
+
+	test	%VRCX, %VRCX
 	jnz	L(first_vec_x2)
 
-	VMOVU	(VEC_SIZE * 3)(%rdi), %YMM4
-	VPTESTN	%YMM4, %YMM4, %k0
-	kmovd	%k0, %ecx
+	VMOVU	(VEC_SIZE * 3)(%rdi), %VMM(4)
+	VPTESTN	%VMM(4), %VMM(4), %k0
+	KMOV	%k0, %VRCX
 	movq	%rdi, %r8
-	testl	%ecx, %ecx
+	test	%VRCX, %VRCX
 	jnz	L(first_vec_x3)
 
 	andq	$-(VEC_SIZE * 2), %rdi
-	.p2align 4
+	.p2align 4,, 10
 L(first_aligned_loop):
-	/* Preserve YMM1, YMM2, YMM3, and YMM4 until we can gurantee
-	   they don't store a match.  */
-	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM5
-	VMOVA	(VEC_SIZE * 5)(%rdi), %YMM6
+	/* Preserve VEC(1), VEC(2), VEC(3), and VEC(4) until we can
+	   guarantee they don't store a match.  */
+	VMOVA	(VEC_SIZE * 4)(%rdi), %VMM(5)
+	VMOVA	(VEC_SIZE * 5)(%rdi), %VMM(6)
 
-	VPCMP	$0, %YMM5, %YMMMATCH, %k2
-	vpxord	%YMM6, %YMMMATCH, %YMM7
+	VPCMPEQ	%VMM(5), %VMATCH, %k2
+	vpxord	%VMM(6), %VMATCH, %VMM(7)
 
-	VPMIN	%YMM5, %YMM6, %YMM8
-	VPMIN	%YMM8, %YMM7, %YMM7
+	VPMIN	%VMM(5), %VMM(6), %VMM(8)
+	VPMIN	%VMM(8), %VMM(7), %VMM(7)
 
-	VPTESTN	%YMM7, %YMM7, %k1
+	VPTESTN	%VMM(7), %VMM(7), %k1
 	subq	$(VEC_SIZE * -2), %rdi
-	kortestd %k1, %k2
+	KORTEST %k1, %k2
 	jz	L(first_aligned_loop)
 
-	VPCMP	$0, %YMM6, %YMMMATCH, %k3
-	VPTESTN	%YMM8, %YMM8, %k1
-	ktestd	%k1, %k1
+	VPCMPEQ	%VMM(6), %VMATCH, %k3
+	VPTESTN	%VMM(8), %VMM(8), %k1
+
+	/* If k1 is zero, then we found a CHAR match but no null-term.
+	   We can now safely throw out VEC1-4.  */
+	KTEST	%k1, %k1
 	jz	L(second_aligned_loop_prep)
 
-	kortestd %k2, %k3
+	KORTEST %k2, %k3
 	jnz	L(return_first_aligned_loop)
 
+
 	.p2align 4,, 6
 L(first_vec_x1_or_x2_or_x3):
-	VPCMP	$0, %YMM4, %YMMMATCH, %k4
-	kmovd	%k4, %eax
-	testl	%eax, %eax
+	VPCMPEQ	%VMM(4), %VMATCH, %k4
+	KMOV	%k4, %VRAX
+	bsr	%VRAX, %VRAX
 	jz	L(first_vec_x1_or_x2)
-	bsrl	%eax, %eax
 	leaq	(VEC_SIZE * 3)(%r8, %rax, CHAR_SIZE), %rax
 	ret
 
+
 	.p2align 4,, 8
 L(return_first_aligned_loop):
-	VPTESTN	%YMM5, %YMM5, %k0
-	kunpck	%k0, %k1, %k0
+	VPTESTN	%VMM(5), %VMM(5), %k0
+
+	/* Combined results from VEC5/6.  */
+	kunpck_2x %k0, %k1, %k0
 	kmov_2x	%k0, %maskz_2x
 
 	blsmsk	%maskz_2x, %maskz_2x
-	kunpck	%k2, %k3, %k3
+	kunpck_2x %k2, %k3, %k3
 	kmov_2x	%k3, %maskm_2x
 	and	%maskz_2x, %maskm_2x
 	jz	L(first_vec_x1_or_x2_or_x3)
@@ -280,47 +290,62 @@ L(return_first_aligned_loop):
 L(second_aligned_loop_prep):
 L(second_aligned_loop_set_furthest_match):
 	movq	%rdi, %rsi
-	kunpck	%k2, %k3, %k4
-
+	/* Ideally we would save k2/k3, but `kmov/kunpck` take uops on
+	   port0 and have noticeable overhead in the loop.  */
+	VMOVA	%VMM(5), %VMM(7)
+	VMOVA	%VMM(6), %VMM(8)
 	.p2align 4
 L(second_aligned_loop):
-	VMOVU	(VEC_SIZE * 4)(%rdi), %YMM1
-	VMOVU	(VEC_SIZE * 5)(%rdi), %YMM2
-
-	VPCMP	$0, %YMM1, %YMMMATCH, %k2
-	vpxord	%YMM2, %YMMMATCH, %YMM3
+	VMOVU	(VEC_SIZE * 4)(%rdi), %VMM(5)
+	VMOVU	(VEC_SIZE * 5)(%rdi), %VMM(6)
+	VPCMPEQ	%VMM(5), %VMATCH, %k2
+	vpxord	%VMM(6), %VMATCH, %VMM(3)
 
-	VPMIN	%YMM1, %YMM2, %YMM4
-	VPMIN	%YMM3, %YMM4, %YMM3
+	VPMIN	%VMM(5), %VMM(6), %VMM(4)
+	VPMIN	%VMM(3), %VMM(4), %VMM(3)
 
-	VPTESTN	%YMM3, %YMM3, %k1
+	VPTESTN	%VMM(3), %VMM(3), %k1
 	subq	$(VEC_SIZE * -2), %rdi
-	kortestd %k1, %k2
+	KORTEST %k1, %k2
 	jz	L(second_aligned_loop)
-
-	VPCMP	$0, %YMM2, %YMMMATCH, %k3
-	VPTESTN	%YMM4, %YMM4, %k1
-	ktestd	%k1, %k1
+	VPCMPEQ	%VMM(6), %VMATCH, %k3
+	VPTESTN	%VMM(4), %VMM(4), %k1
+	KTEST	%k1, %k1
 	jz	L(second_aligned_loop_set_furthest_match)
 
-	kortestd %k2, %k3
-	/* branch here because there is a significant advantage interms
-	   of output dependency chance in using edx.  */
+	/* branch here because we know we have a match in VEC7/8 but
+	   might not in VEC5/6 so the latter is expected to be less
+	   likely.  */
+	KORTEST %k2, %k3
 	jnz	L(return_new_match)
+
 L(return_old_match):
-	kmovq	%k4, %rax
-	bsrq	%rax, %rax
-	leaq	(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %rax
+	VPCMPEQ	%VMM(8), %VMATCH, %k0
+	KMOV	%k0, %VRCX
+	bsr	%VRCX, %VRCX
+	jnz	L(return_old_match_ret)
+
+	VPCMPEQ	%VMM(7), %VMATCH, %k0
+	KMOV	%k0, %VRCX
+	bsr	%VRCX, %VRCX
+	subq	$VEC_SIZE, %rsi
+L(return_old_match_ret):
+	leaq	(VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %rax
 	ret
 
+	.p2align 4,, 10
 L(return_new_match):
-	VPTESTN	%YMM1, %YMM1, %k0
-	kunpck	%k0, %k1, %k0
+	VPTESTN	%VMM(5), %VMM(5), %k0
+
+	/* Combined results from VEC5/6.  */
+	kunpck_2x %k0, %k1, %k0
 	kmov_2x	%k0, %maskz_2x
 
 	blsmsk	%maskz_2x, %maskz_2x
-	kunpck	%k2, %k3, %k3
+	kunpck_2x %k2, %k3, %k3
 	kmov_2x	%k3, %maskm_2x
+
+	/* Match at end was out-of-bounds so use last known match.  */
 	and	%maskz_2x, %maskm_2x
 	jz	L(return_old_match)
 
@@ -328,49 +353,53 @@ L(return_new_match):
 	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
 	ret
 
+	.p2align 4,, 4
 L(cross_page_boundary):
-	/* eax contains all the page offset bits of src (rdi). `xor rdi,
-	   rax` sets pointer will all page offset bits cleared so
-	   offset of (PAGE_SIZE - VEC_SIZE) will get last aligned VEC
-	   before page cross (guranteed to be safe to read). Doing this
-	   as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax` saves
-	   a bit of code size.  */
 	xorq	%rdi, %rax
-	VMOVU	(PAGE_SIZE - VEC_SIZE)(%rax), %YMM1
-	VPTESTN	%YMM1, %YMM1, %k0
-	kmovd	%k0, %ecx
+	mov	$-1, %VRDX
+	VMOVU	(PAGE_SIZE - VEC_SIZE)(%rax), %VMM(6)
+	VPTESTN	%VMM(6), %VMM(6), %k0
+	KMOV	%k0, %VRSI
+
+# ifdef USE_AS_WCSRCHR
+	movl	%edi, %ecx
+	and	$(VEC_SIZE - 1), %ecx
+	shrl	$2, %ecx
+# endif
+	shlx	%VGPR(SHIFT_REG), %VRDX, %VRDX
 
-	/* Shift out zero CHAR matches that are before the begining of
-	   src (rdi).  */
 # ifdef USE_AS_WCSRCHR
-	movl	%edi, %esi
-	andl	$(VEC_SIZE - 1), %esi
-	shrl	$2, %esi
+	kmovb	%edx, %k1
+# else
+	KMOV	%VRDX, %k1
 # endif
-	shrxl	%SHIFT_REG, %ecx, %ecx
 
-	testl	%ecx, %ecx
+	/* Need to adjust result to VEC(1) so it can be re-used by
+	   L(return_vec_x0_test).  The alternative is to collect VEC(1)
+	   with a page cross load, which is far more expensive.  */
+	VPCOMPRESS %VMM(6), %VMM(1){%k1}{z}
+
+	/* We could technically just jmp back after the vpcompress but
+	   it doesn't save any 16-byte blocks.  */
+	shrx	%VGPR(SHIFT_REG), %VRSI, %VRSI
+	test	%VRSI, %VRSI
 	jz	L(page_cross_continue)
 
-	/* Found zero CHAR so need to test for search CHAR.  */
-	VPCMP	$0, %YMMMATCH, %YMM1, %k1
-	kmovd	%k1, %eax
-	/* Shift out search CHAR matches that are before the begining of
-	   src (rdi).  */
-	shrxl	%SHIFT_REG, %eax, %eax
-
-	/* Check if any search CHAR match in range.  */
-	blsmskl	%ecx, %ecx
-	andl	%ecx, %eax
-	jz	L(ret3)
-	bsrl	%eax, %eax
+	/* Duplicate of return logic from ENTRY. Doesn't cause spill to
+	   next cache line so might as well copy it here.  */
+	VPCMPEQ	%VMATCH, %VMM(1), %k1
+	KMOV	%k1, %VRAX
+	blsmsk	%VRSI, %VRSI
+	and	%VRSI, %VRAX
+	jz	L(ret_page_cross)
+	bsr	%VRAX, %VRAX
 # ifdef USE_AS_WCSRCHR
 	leaq	(%rdi, %rax, CHAR_SIZE), %rax
 # else
 	addq	%rdi, %rax
 # endif
-L(ret3):
+L(ret_page_cross):
 	ret
-
+	/* 1 byte till next cache line.  */
 END(STRRCHR)
 #endif
-- 
2.34.1
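
As an aside on the strrchr kernel above: every return path uses the
same blsmsk/bsr idiom to pick the last in-bounds match.  A rough C
rendering, for illustration only (last_match_before_null is a
hypothetical helper; a 32-bit mask and GCC builtins are assumed, and
the real code sizes the mask to CHAR_PER_VEC):

	#include <stdint.h>

	/* zeros:   bit i set iff CHAR i is the null terminator.
	   matches: bit i set iff CHAR i equals the search CHAR.
	   Return the offset of the last in-bounds match, or -1.  */
	static int
	last_match_before_null (uint32_t zeros, uint32_t matches)
	{
	  /* blsmsk: all bits up to and including the lowest set bit;
	     all-ones when zeros == 0 (no terminator in this VEC).  */
	  uint32_t in_bounds = zeros ^ (zeros - 1);
	  matches &= in_bounds;
	  if (matches == 0)
	    return -1;
	  return 31 - __builtin_clz (matches);	/* bsr  */
	}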


^ permalink raw reply	[flat|nested] 41+ messages in thread

* [PATCH v1 6/7] x86: Add support for VEC_SIZE == 64 in strcmp-evex.S impl
  2022-10-18  2:48 [PATCH v1 1/7] x86: Optimize memchr-evex.S and implement with VMM headers Noah Goldstein
                   ` (3 preceding siblings ...)
  2022-10-18  2:48 ` [PATCH v1 5/7] x86: Optimize strrchr-evex.S and implement with VMM headers Noah Goldstein
@ 2022-10-18  2:49 ` Noah Goldstein
  2022-10-20  2:15   ` [PATCH v4] " Noah Goldstein
  2022-10-18  2:49 ` [PATCH v1 7/7] Bench: Improve benchtests for memchr, strchr, strnlen, strrchr Noah Goldstein
                   ` (3 subsequent siblings)
  8 siblings, 1 reply; 41+ messages in thread
From: Noah Goldstein @ 2022-10-18  2:49 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

Unused at the moment, but evex512 strcmp, strncmp, strcasecmp{l}, and
strncasecmp{l} functions can be added by including strcmp-evex.S with
"x86-evex512-vecs.h" defined.

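For concreteness, such a file could be as small as the sketch below.
This only illustrates the mechanism described above; the file name,
the exported symbol, and the interaction with strcmp-naming.h are
assumptions, not part of this patch:

	/* Hypothetical sysdeps/x86_64/multiarch/strcmp-evex512.S.  */
	#define STRCMP	__strcmp_evex512	/* assumed symbol name */
	#include "x86-evex512-vecs.h"	/* defines VEC_SIZE == 64 */
	#include "strcmp-evex.S"
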
In addition, this saves a bit of code size in a few places.

1. tzcnt ...         -> bsf ...
2. vpcmp{b|d} $0 ... -> vpcmpeq{b|d}

This saves a touch of code size but has minimal net effect.
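
For reference, a sketch of where the saved bytes come from (byte
counts are for the reg-reg forms; treat the exact hex as illustrative):

	tzcnt	%ecx, %ecx	/* f3 0f bc c9: 4 bytes */
	bsf	%ecx, %ecx	/*    0f bc c9: 3 bytes, no f3 prefix */

	vpcmpb	$0, %ymm1, %ymm2, %k1	/* 7 bytes: EVEX + opcode + modrm + imm8 */
	vpcmpeqb %ymm1, %ymm2, %k1	/* 6 bytes: EVEX + opcode + modrm */

Note that bsf and tzcnt differ when the source is zero (bsf leaves the
destination architecturally undefined), so bsf is only used where the
mask is known to be non-zero.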

Full check passes on x86-64.
---
 sysdeps/x86_64/multiarch/strcmp-evex.S | 676 ++++++++++++++++---------
 1 file changed, 430 insertions(+), 246 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
index e482d0167f..756a3bb8d6 100644
--- a/sysdeps/x86_64/multiarch/strcmp-evex.S
+++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
@@ -20,6 +20,10 @@
 
 #if ISA_SHOULD_BUILD (4)
 
+# ifndef VEC_SIZE
+#  include "x86-evex256-vecs.h"
+# endif
+
 # define STRCMP_ISA	_evex
 # include "strcmp-naming.h"
 
@@ -35,41 +39,57 @@
 # define PAGE_SIZE	4096
 
 	/* VEC_SIZE = Number of bytes in a ymm register.  */
-# define VEC_SIZE	32
 # define CHAR_PER_VEC	(VEC_SIZE	/	SIZE_OF_CHAR)
 
-# define VMOVU	vmovdqu64
-# define VMOVA	vmovdqa64
-
 # ifdef USE_AS_WCSCMP
-#  define TESTEQ	subl $0xff,
 	/* Compare packed dwords.  */
 #  define VPCMP	vpcmpd
+#  define VPCMPEQ	vpcmpeqd
 #  define VPMINU	vpminud
 #  define VPTESTM	vptestmd
 #  define VPTESTNM	vptestnmd
 	/* 1 dword char == 4 bytes.  */
 #  define SIZE_OF_CHAR	4
+
+#  define TESTEQ	sub $((1 << CHAR_PER_VEC) - 1),
+
+#  define USE_WIDE_CHAR
 # else
-#  define TESTEQ	incl
 	/* Compare packed bytes.  */
 #  define VPCMP	vpcmpb
+#  define VPCMPEQ	vpcmpeqb
 #  define VPMINU	vpminub
 #  define VPTESTM	vptestmb
 #  define VPTESTNM	vptestnmb
 	/* 1 byte char == 1 byte.  */
 #  define SIZE_OF_CHAR	1
+
+#  define TESTEQ	inc
+# endif
+
+# include "reg-macros.h"
+
+# if VEC_SIZE == 64
+#  define RODATA_SECTION	rodata.cst64
+# else
+#  define RODATA_SECTION	rodata.cst32
+# endif
+
+# if CHAR_PER_VEC == 64
+#  define FALLTHROUGH_RETURN_OFFSET	(VEC_SIZE * 3)
+# else
+#  define FALLTHROUGH_RETURN_OFFSET	(VEC_SIZE * 2)
 # endif
 
 # ifdef USE_AS_STRNCMP
-#  define LOOP_REG	r9d
+#  define LOOP_REG	VR9
 #  define LOOP_REG64	r9
 
 #  define OFFSET_REG8	r9b
 #  define OFFSET_REG	r9d
 #  define OFFSET_REG64	r9
 # else
-#  define LOOP_REG	edx
+#  define LOOP_REG	VRDX
 #  define LOOP_REG64	rdx
 
 #  define OFFSET_REG8	dl
@@ -83,32 +103,6 @@
 #  define VEC_OFFSET	(-VEC_SIZE)
 # endif
 
-# define XMM0	xmm17
-# define XMM1	xmm18
-
-# define XMM10	xmm27
-# define XMM11	xmm28
-# define XMM12	xmm29
-# define XMM13	xmm30
-# define XMM14	xmm31
-
-
-# define YMM0	ymm17
-# define YMM1	ymm18
-# define YMM2	ymm19
-# define YMM3	ymm20
-# define YMM4	ymm21
-# define YMM5	ymm22
-# define YMM6	ymm23
-# define YMM7	ymm24
-# define YMM8	ymm25
-# define YMM9	ymm26
-# define YMM10	ymm27
-# define YMM11	ymm28
-# define YMM12	ymm29
-# define YMM13	ymm30
-# define YMM14	ymm31
-
 # ifdef USE_AS_STRCASECMP_L
 #  define BYTE_LOOP_REG	OFFSET_REG
 # else
@@ -125,61 +119,72 @@
 #  endif
 # endif
 
-# define LCASE_MIN_YMM	%YMM12
-# define LCASE_MAX_YMM	%YMM13
-# define CASE_ADD_YMM	%YMM14
+# define LCASE_MIN_V	VMM(12)
+# define LCASE_MAX_V	VMM(13)
+# define CASE_ADD_V	VMM(14)
 
-# define LCASE_MIN_XMM	%XMM12
-# define LCASE_MAX_XMM	%XMM13
-# define CASE_ADD_XMM	%XMM14
+# if VEC_SIZE == 64
+#  define LCASE_MIN_YMM	VMM_256(12)
+#  define LCASE_MAX_YMM	VMM_256(13)
+#  define CASE_ADD_YMM	VMM_256(14)
+# endif
+
+# define LCASE_MIN_XMM	VMM_128(12)
+# define LCASE_MAX_XMM	VMM_128(13)
+# define CASE_ADD_XMM	VMM_128(14)
 
 	/* NB: wcsncmp uses r11 but strcasecmp is never used in
 	   conjunction with wcscmp.  */
 # define TOLOWER_BASE	%r11
 
 # ifdef USE_AS_STRCASECMP_L
-#  define _REG(x, y) x ## y
-#  define REG(x, y) _REG(x, y)
-#  define TOLOWER(reg1, reg2, ext)										\
-	vpsubb	REG(LCASE_MIN_, ext), reg1, REG(%ext, 10);					\
-	vpsubb	REG(LCASE_MIN_, ext), reg2, REG(%ext, 11);					\
-	vpcmpub	$1, REG(LCASE_MAX_, ext), REG(%ext, 10), %k5;				\
-	vpcmpub	$1, REG(LCASE_MAX_, ext), REG(%ext, 11), %k6;				\
-	vpaddb	reg1, REG(CASE_ADD_, ext), reg1{%k5};						\
-	vpaddb	reg2, REG(CASE_ADD_, ext), reg2{%k6}
-
-#  define TOLOWER_gpr(src, dst) movl (TOLOWER_BASE, src, 4), dst
-#  define TOLOWER_YMM(...)	TOLOWER(__VA_ARGS__, YMM)
-#  define TOLOWER_XMM(...)	TOLOWER(__VA_ARGS__, XMM)
-
-#  define CMP_R1_R2(s1_reg, s2_reg, reg_out, ext)						\
-	TOLOWER	(s1_reg, s2_reg, ext);										\
-	VPCMP	$0, s1_reg, s2_reg, reg_out
-
-#  define CMP_R1_S2(s1_reg, s2_mem, s2_reg, reg_out, ext)				\
-	VMOVU	s2_mem, s2_reg;												\
-	CMP_R1_R2(s1_reg, s2_reg, reg_out, ext)
-
-#  define CMP_R1_R2_YMM(...) CMP_R1_R2(__VA_ARGS__, YMM)
-#  define CMP_R1_R2_XMM(...) CMP_R1_R2(__VA_ARGS__, XMM)
-
-#  define CMP_R1_S2_YMM(...) CMP_R1_S2(__VA_ARGS__, YMM)
-#  define CMP_R1_S2_XMM(...) CMP_R1_S2(__VA_ARGS__, XMM)
+#  define _REG(x, y)	x ## y
+#  define REG(x, y)	_REG(x, y)
+#  define TOLOWER(reg1, reg2, ext, vec_macro)	\
+	vpsubb	%REG(LCASE_MIN_, ext), reg1, %vec_macro(10);	\
+	vpsubb	%REG(LCASE_MIN_, ext), reg2, %vec_macro(11);	\
+	vpcmpub	$1, %REG(LCASE_MAX_, ext), %vec_macro(10), %k5;	\
+	vpcmpub	$1, %REG(LCASE_MAX_, ext), %vec_macro(11), %k6;	\
+	vpaddb	reg1, %REG(CASE_ADD_, ext), reg1{%k5};	\
+	vpaddb	reg2, %REG(CASE_ADD_, ext), reg2{%k6}
+
+#  define TOLOWER_gpr(src, dst)	movl (TOLOWER_BASE, src, 4), dst
+#  define TOLOWER_VMM(...)	TOLOWER(__VA_ARGS__, V, VMM)
+#  define TOLOWER_YMM(...)	TOLOWER(__VA_ARGS__, YMM, VMM_256)
+#  define TOLOWER_XMM(...)	TOLOWER(__VA_ARGS__, XMM, VMM_128)
+
+#  define CMP_R1_R2(s1_reg, s2_reg, reg_out, ext, vec_macro)	\
+	TOLOWER	(s1_reg, s2_reg, ext, vec_macro);	\
+	VPCMPEQ	s1_reg, s2_reg, reg_out
+
+#  define CMP_R1_S2(s1_reg, s2_mem, s2_reg, reg_out, ext, vec_macro)	\
+	VMOVU	s2_mem, s2_reg;	\
+	CMP_R1_R2 (s1_reg, s2_reg, reg_out, ext, vec_macro)
+
+#  define CMP_R1_R2_VMM(...)	CMP_R1_R2(__VA_ARGS__, V, VMM)
+#  define CMP_R1_R2_YMM(...)	CMP_R1_R2(__VA_ARGS__, YMM, VMM_256)
+#  define CMP_R1_R2_XMM(...)	CMP_R1_R2(__VA_ARGS__, XMM, VMM_128)
+
+#  define CMP_R1_S2_VMM(...)	CMP_R1_S2(__VA_ARGS__, V, VMM)
+#  define CMP_R1_S2_YMM(...)	CMP_R1_S2(__VA_ARGS__, YMM, VMM_256)
+#  define CMP_R1_S2_XMM(...)	CMP_R1_S2(__VA_ARGS__, XMM, VMM_128)
 
 # else
 #  define TOLOWER_gpr(...)
+#  define TOLOWER_VMM(...)
 #  define TOLOWER_YMM(...)
 #  define TOLOWER_XMM(...)
 
-#  define CMP_R1_R2_YMM(s1_reg, s2_reg, reg_out)						\
-	VPCMP	$0, s2_reg, s1_reg, reg_out
+#  define CMP_R1_R2_VMM(s1_reg, s2_reg, reg_out)	\
+	VPCMPEQ	s2_reg, s1_reg, reg_out
 
-#  define CMP_R1_R2_XMM(...) CMP_R1_R2_YMM(__VA_ARGS__)
+#  define CMP_R1_R2_YMM(...)	CMP_R1_R2_VMM(__VA_ARGS__)
+#  define CMP_R1_R2_XMM(...)	CMP_R1_R2_VMM(__VA_ARGS__)
 
-#  define CMP_R1_S2_YMM(s1_reg, s2_mem, unused, reg_out)				\
-	VPCMP	$0, s2_mem, s1_reg, reg_out
-
-#  define CMP_R1_S2_XMM(...) CMP_R1_S2_YMM(__VA_ARGS__)
+#  define CMP_R1_S2_VMM(s1_reg, s2_mem, unused, reg_out)	\
+	VPCMPEQ	s2_mem, s1_reg, reg_out
+#  define CMP_R1_S2_YMM(...)	CMP_R1_S2_VMM(__VA_ARGS__)
+#  define CMP_R1_S2_XMM(...)	CMP_R1_S2_VMM(__VA_ARGS__)
 # endif
 
 /* Warning!
@@ -203,7 +208,7 @@
    the maximum offset is reached before a difference is found, zero is
    returned.  */
 
-	.section .text.evex, "ax", @progbits
+	.section SECTION(.text), "ax", @progbits
 	.align	16
 	.type	STRCMP, @function
 	.globl	STRCMP
@@ -232,7 +237,7 @@ STRCMP:
 #  else
 	mov	(%LOCALE_REG), %RAX_LP
 #  endif
-	testl	$1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax)
+	testb	$1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax)
 	jne	STRCASECMP_L_NONASCII
 	leaq	_nl_C_LC_CTYPE_tolower + 128 * 4(%rip), TOLOWER_BASE
 # endif
@@ -254,28 +259,46 @@ STRCMP:
 # endif
 
 # if defined USE_AS_STRCASECMP_L
-	.section .rodata.cst32, "aM", @progbits, 32
-	.align	32
+	.section RODATA_SECTION, "aM", @progbits, VEC_SIZE
+	.align	VEC_SIZE
 L(lcase_min):
 	.quad	0x4141414141414141
 	.quad	0x4141414141414141
 	.quad	0x4141414141414141
 	.quad	0x4141414141414141
+#  if VEC_SIZE == 64
+	.quad	0x4141414141414141
+	.quad	0x4141414141414141
+	.quad	0x4141414141414141
+	.quad	0x4141414141414141
+#  endif
 L(lcase_max):
 	.quad	0x1a1a1a1a1a1a1a1a
 	.quad	0x1a1a1a1a1a1a1a1a
 	.quad	0x1a1a1a1a1a1a1a1a
 	.quad	0x1a1a1a1a1a1a1a1a
+#  if VEC_SIZE == 64
+	.quad	0x1a1a1a1a1a1a1a1a
+	.quad	0x1a1a1a1a1a1a1a1a
+	.quad	0x1a1a1a1a1a1a1a1a
+	.quad	0x1a1a1a1a1a1a1a1a
+#  endif
 L(case_add):
 	.quad	0x2020202020202020
 	.quad	0x2020202020202020
 	.quad	0x2020202020202020
 	.quad	0x2020202020202020
+#  if VEC_SIZE == 64
+	.quad	0x2020202020202020
+	.quad	0x2020202020202020
+	.quad	0x2020202020202020
+	.quad	0x2020202020202020
+#  endif
 	.previous
 
-	vmovdqa64 L(lcase_min)(%rip), LCASE_MIN_YMM
-	vmovdqa64 L(lcase_max)(%rip), LCASE_MAX_YMM
-	vmovdqa64 L(case_add)(%rip), CASE_ADD_YMM
+	VMOVA	L(lcase_min)(%rip), %LCASE_MIN_V
+	VMOVA	L(lcase_max)(%rip), %LCASE_MAX_V
+	VMOVA	L(case_add)(%rip), %CASE_ADD_V
 # endif
 
 	movl	%edi, %eax
@@ -288,12 +311,12 @@ L(case_add):
 
 L(no_page_cross):
 	/* Safe to compare 4x vectors.  */
-	VMOVU	(%rdi), %YMM0
-	VPTESTM	%YMM0, %YMM0, %k2
+	VMOVU	(%rdi), %VMM(0)
+	VPTESTM	%VMM(0), %VMM(0), %k2
 	/* Each bit cleared in K1 represents a mismatch or a null CHAR
 	   in YMM0 and 32 bytes at (%rsi).  */
-	CMP_R1_S2_YMM (%YMM0, (%rsi), %YMM1, %k1){%k2}
-	kmovd	%k1, %ecx
+	CMP_R1_S2_VMM (%VMM(0), (%rsi), %VMM(1), %k1){%k2}
+	KMOV	%k1, %VRCX
 # ifdef USE_AS_STRNCMP
 	cmpq	$CHAR_PER_VEC, %rdx
 	jbe	L(vec_0_test_len)
@@ -303,14 +326,14 @@ L(no_page_cross):
 	   wcscmp/wcsncmp.  */
 
 	/* All 1s represents all equals. TESTEQ will overflow to zero in
-	   all equals case. Otherwise 1s will carry until position of first
-	   mismatch.  */
-	TESTEQ	%ecx
+	   all equals case. Otherwise 1s will carry until position of
+	   first mismatch.  */
+	TESTEQ	%VRCX
 	jz	L(more_3x_vec)
 
 	.p2align 4,, 4
 L(return_vec_0):
-	tzcntl	%ecx, %ecx
+	bsf	%VRCX, %VRCX
 # ifdef USE_AS_WCSCMP
 	movl	(%rdi, %rcx, SIZE_OF_CHAR), %edx
 	xorl	%eax, %eax
@@ -321,7 +344,16 @@ L(return_vec_0):
 	orl	$1, %eax
 # else
 	movzbl	(%rdi, %rcx), %eax
+	/* For VEC_SIZE == 64 use movb instead of movzbl to save a byte
+	   and keep logic for len <= VEC_SIZE (common) in just the
+	   first cache line.  NB: No evex512 processor has partial-
+	   register stalls. If that changes this ifdef can be disabled
+	   without affecting correctness.  */
+#  if !defined USE_AS_STRNCMP && !defined USE_AS_STRCASECMP_L && VEC_SIZE == 64
+	movb	(%rsi, %rcx), %cl
+#  else
 	movzbl	(%rsi, %rcx), %ecx
+#  endif
 	TOLOWER_gpr (%rax, %eax)
 	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
@@ -332,8 +364,8 @@ L(ret0):
 # ifdef USE_AS_STRNCMP
 	.p2align 4,, 4
 L(vec_0_test_len):
-	notl	%ecx
-	bzhil	%edx, %ecx, %eax
+	not	%VRCX
+	bzhi	%VRDX, %VRCX, %VRAX
 	jnz	L(return_vec_0)
 	/* Align if will cross fetch block.  */
 	.p2align 4,, 2
@@ -372,7 +404,7 @@ L(ret1):
 
 	.p2align 4,, 10
 L(return_vec_1):
-	tzcntl	%ecx, %ecx
+	bsf	%VRCX, %VRCX
 # ifdef USE_AS_STRNCMP
 	/* rdx must be > CHAR_PER_VEC so it's safe to subtract without
 	   worrying about underflow.  */
@@ -401,24 +433,41 @@ L(ret2):
 	.p2align 4,, 10
 # ifdef USE_AS_STRNCMP
 L(return_vec_3):
-#  if CHAR_PER_VEC <= 16
+#  if CHAR_PER_VEC <= 32
+	/* If CHAR_PER_VEC <= 32 reuse code from L(return_vec_2) without
+	   additional branches by adjusting the bit positions from
+	   VEC3.  We can't do this for CHAR_PER_VEC == 64.  */
+#   if CHAR_PER_VEC <= 16
 	sall	$CHAR_PER_VEC, %ecx
-#  else
+#   else
 	salq	$CHAR_PER_VEC, %rcx
+#   endif
+#  else
+	/* If CHAR_PER_VEC == 64 we can't shift the return GPR so just
+	   check it.  */
+	bsf	%VRCX, %VRCX
+	addl	$(CHAR_PER_VEC), %ecx
+	cmpq	%rcx, %rdx
+	ja	L(ret_vec_3_finish)
+	xorl	%eax, %eax
+	ret
 #  endif
 # endif
+
+	/* If CHAR_PER_VEC == 64 we can't combine matches from the last
+	   2x VEC so need a separate return label.  */
 L(return_vec_2):
 # if (CHAR_PER_VEC <= 16) || !(defined USE_AS_STRNCMP)
-	tzcntl	%ecx, %ecx
+	bsf	%VRCX, %VRCX
 # else
-	tzcntq	%rcx, %rcx
+	bsfq	%rcx, %rcx
 # endif
-
 # ifdef USE_AS_STRNCMP
 	cmpq	%rcx, %rdx
 	jbe	L(ret_zero)
 # endif
 
+L(ret_vec_3_finish):
 # ifdef USE_AS_WCSCMP
 	movl	(VEC_SIZE * 2)(%rdi, %rcx, SIZE_OF_CHAR), %edx
 	xorl	%eax, %eax
@@ -440,7 +489,7 @@ L(ret3):
 # ifndef USE_AS_STRNCMP
 	.p2align 4,, 10
 L(return_vec_3):
-	tzcntl	%ecx, %ecx
+	bsf	%VRCX, %VRCX
 #  ifdef USE_AS_WCSCMP
 	movl	(VEC_SIZE * 3)(%rdi, %rcx, SIZE_OF_CHAR), %edx
 	xorl	%eax, %eax
@@ -465,11 +514,11 @@ L(ret4):
 	.p2align 5
 L(more_3x_vec):
 	/* Safe to compare 4x vectors.  */
-	VMOVU	(VEC_SIZE)(%rdi), %YMM0
-	VPTESTM	%YMM0, %YMM0, %k2
-	CMP_R1_S2_YMM (%YMM0, VEC_SIZE(%rsi), %YMM1, %k1){%k2}
-	kmovd	%k1, %ecx
-	TESTEQ	%ecx
+	VMOVU	(VEC_SIZE)(%rdi), %VMM(0)
+	VPTESTM	%VMM(0), %VMM(0), %k2
+	CMP_R1_S2_VMM (%VMM(0), VEC_SIZE(%rsi), %VMM(1), %k1){%k2}
+	KMOV	%k1, %VRCX
+	TESTEQ	%VRCX
 	jnz	L(return_vec_1)
 
 # ifdef USE_AS_STRNCMP
@@ -477,18 +526,18 @@ L(more_3x_vec):
 	jbe	L(ret_zero)
 # endif
 
-	VMOVU	(VEC_SIZE * 2)(%rdi), %YMM0
-	VPTESTM	%YMM0, %YMM0, %k2
-	CMP_R1_S2_YMM (%YMM0, (VEC_SIZE * 2)(%rsi), %YMM1, %k1){%k2}
-	kmovd	%k1, %ecx
-	TESTEQ	%ecx
+	VMOVU	(VEC_SIZE * 2)(%rdi), %VMM(0)
+	VPTESTM	%VMM(0), %VMM(0), %k2
+	CMP_R1_S2_VMM (%VMM(0), (VEC_SIZE * 2)(%rsi), %VMM(1), %k1){%k2}
+	KMOV	%k1, %VRCX
+	TESTEQ	%VRCX
 	jnz	L(return_vec_2)
 
-	VMOVU	(VEC_SIZE * 3)(%rdi), %YMM0
-	VPTESTM	%YMM0, %YMM0, %k2
-	CMP_R1_S2_YMM (%YMM0, (VEC_SIZE * 3)(%rsi), %YMM1, %k1){%k2}
-	kmovd	%k1, %ecx
-	TESTEQ	%ecx
+	VMOVU	(VEC_SIZE * 3)(%rdi), %VMM(0)
+	VPTESTM	%VMM(0), %VMM(0), %k2
+	CMP_R1_S2_VMM (%VMM(0), (VEC_SIZE * 3)(%rsi), %VMM(1), %k1){%k2}
+	KMOV	%k1, %VRCX
+	TESTEQ	%VRCX
 	jnz	L(return_vec_3)
 
 # ifdef USE_AS_STRNCMP
@@ -565,110 +614,123 @@ L(loop):
 
 	/* Loop entry after handling page cross during loop.  */
 L(loop_skip_page_cross_check):
-	VMOVA	(VEC_SIZE * 0)(%rdi), %YMM0
-	VMOVA	(VEC_SIZE * 1)(%rdi), %YMM2
-	VMOVA	(VEC_SIZE * 2)(%rdi), %YMM4
-	VMOVA	(VEC_SIZE * 3)(%rdi), %YMM6
+	VMOVA	(VEC_SIZE * 0)(%rdi), %VMM(0)
+	VMOVA	(VEC_SIZE * 1)(%rdi), %VMM(2)
+	VMOVA	(VEC_SIZE * 2)(%rdi), %VMM(4)
+	VMOVA	(VEC_SIZE * 3)(%rdi), %VMM(6)
 
-	VPMINU	%YMM0, %YMM2, %YMM8
-	VPMINU	%YMM4, %YMM6, %YMM9
+	VPMINU	%VMM(0), %VMM(2), %VMM(8)
+	VPMINU	%VMM(4), %VMM(6), %VMM(9)
 
 	/* A zero CHAR in YMM9 means that there is a null CHAR.  */
-	VPMINU	%YMM8, %YMM9, %YMM9
+	VPMINU	%VMM(8), %VMM(9), %VMM(9)
 
 	/* Each bit set in K1 represents a non-null CHAR in YMM9.  */
-	VPTESTM	%YMM9, %YMM9, %k1
+	VPTESTM	%VMM(9), %VMM(9), %k1
 # ifndef USE_AS_STRCASECMP_L
-	vpxorq	(VEC_SIZE * 0)(%rsi), %YMM0, %YMM1
-	vpxorq	(VEC_SIZE * 1)(%rsi), %YMM2, %YMM3
-	vpxorq	(VEC_SIZE * 2)(%rsi), %YMM4, %YMM5
+	vpxorq	(VEC_SIZE * 0)(%rsi), %VMM(0), %VMM(1)
+	vpxorq	(VEC_SIZE * 1)(%rsi), %VMM(2), %VMM(3)
+	vpxorq	(VEC_SIZE * 2)(%rsi), %VMM(4), %VMM(5)
 	/* Ternary logic to xor (VEC_SIZE * 3)(%rsi) with YMM6 while
 	   oring with YMM1. Result is stored in YMM6.  */
-	vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM1, %YMM6
+	vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %VMM(1), %VMM(6)
 # else
-	VMOVU	(VEC_SIZE * 0)(%rsi), %YMM1
-	TOLOWER_YMM (%YMM0, %YMM1)
-	VMOVU	(VEC_SIZE * 1)(%rsi), %YMM3
-	TOLOWER_YMM (%YMM2, %YMM3)
-	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM5
-	TOLOWER_YMM (%YMM4, %YMM5)
-	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM7
-	TOLOWER_YMM (%YMM6, %YMM7)
-	vpxorq	%YMM0, %YMM1, %YMM1
-	vpxorq	%YMM2, %YMM3, %YMM3
-	vpxorq	%YMM4, %YMM5, %YMM5
-	vpternlogd $0xde, %YMM7, %YMM1, %YMM6
+	VMOVU	(VEC_SIZE * 0)(%rsi), %VMM(1)
+	TOLOWER_VMM (%VMM(0), %VMM(1))
+	VMOVU	(VEC_SIZE * 1)(%rsi), %VMM(3)
+	TOLOWER_VMM (%VMM(2), %VMM(3))
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VMM(5)
+	TOLOWER_VMM (%VMM(4), %VMM(5))
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VMM(7)
+	TOLOWER_VMM (%VMM(6), %VMM(7))
+	vpxorq	%VMM(0), %VMM(1), %VMM(1)
+	vpxorq	%VMM(2), %VMM(3), %VMM(3)
+	vpxorq	%VMM(4), %VMM(5), %VMM(5)
+	vpternlogd $0xde, %VMM(7), %VMM(1), %VMM(6)
 # endif
 	/* Or together YMM3, YMM5, and YMM6.  */
-	vpternlogd $0xfe, %YMM3, %YMM5, %YMM6
+	vpternlogd $0xfe, %VMM(3), %VMM(5), %VMM(6)
 
 
 	/* A non-zero CHAR in YMM6 represents a mismatch.  */
-	VPTESTNM %YMM6, %YMM6, %k0{%k1}
-	kmovd	%k0, %LOOP_REG
+	VPTESTNM %VMM(6), %VMM(6), %k0{%k1}
+	KMOV	%k0, %LOOP_REG
 
 	TESTEQ	%LOOP_REG
 	jz	L(loop)
 
 
 	/* Find which VEC has the mismatch of end of string.  */
-	VPTESTM	%YMM0, %YMM0, %k1
-	VPTESTNM %YMM1, %YMM1, %k0{%k1}
-	kmovd	%k0, %ecx
-	TESTEQ	%ecx
+	VPTESTM	%VMM(0), %VMM(0), %k1
+	VPTESTNM %VMM(1), %VMM(1), %k0{%k1}
+	KMOV	%k0, %VRCX
+	TESTEQ	%VRCX
 	jnz	L(return_vec_0_end)
 
-	VPTESTM	%YMM2, %YMM2, %k1
-	VPTESTNM %YMM3, %YMM3, %k0{%k1}
-	kmovd	%k0, %ecx
-	TESTEQ	%ecx
+	VPTESTM	%VMM(2), %VMM(2), %k1
+	VPTESTNM %VMM(3), %VMM(3), %k0{%k1}
+	KMOV	%k0, %VRCX
+	TESTEQ	%VRCX
 	jnz	L(return_vec_1_end)
 
 
-	/* Handle VEC 2 and 3 without branches.  */
+	/* Handle VEC 2 and 3 without branches if CHAR_PER_VEC <= 32.
+	 */
 L(return_vec_2_3_end):
 # ifdef USE_AS_STRNCMP
 	subq	$(CHAR_PER_VEC * 2), %rdx
 	jbe	L(ret_zero_end)
 # endif
 
-	VPTESTM	%YMM4, %YMM4, %k1
-	VPTESTNM %YMM5, %YMM5, %k0{%k1}
-	kmovd	%k0, %ecx
-	TESTEQ	%ecx
+	VPTESTM	%VMM(4), %VMM(4), %k1
+	VPTESTNM %VMM(5), %VMM(5), %k0{%k1}
+	KMOV	%k0, %VRCX
+	TESTEQ	%VRCX
 # if CHAR_PER_VEC <= 16
 	sall	$CHAR_PER_VEC, %LOOP_REG
 	orl	%ecx, %LOOP_REG
-# else
+# elif CHAR_PER_VEC <= 32
 	salq	$CHAR_PER_VEC, %LOOP_REG64
 	orq	%rcx, %LOOP_REG64
+# else
+	/* We aren't combining the last 2x VEC so branch on the second
+	   to last.  */
+	jnz	L(return_vec_2_end)
 # endif
-L(return_vec_3_end):
+
 	/* LOOP_REG contains matches for null/mismatch from the loop. If
-	   VEC 0,1,and 2 all have no null and no mismatches then mismatch
-	   must entirely be from VEC 3 which is fully represented by
-	   LOOP_REG.  */
+	   VEC 0, 1, and 2 all have no null and no mismatches then
+	   mismatch must entirely be from VEC 3 which is fully
+	   represented by LOOP_REG.  */
 # if CHAR_PER_VEC <= 16
-	tzcntl	%LOOP_REG, %LOOP_REG
+	bsf	%LOOP_REG, %LOOP_REG
 # else
-	tzcntq	%LOOP_REG64, %LOOP_REG64
+	bsfq	%LOOP_REG64, %LOOP_REG64
 # endif
 # ifdef USE_AS_STRNCMP
+
+	/* If CHAR_PER_VEC == 64 we can't combine last 2x VEC so need to
+	   adjust the length before the last comparison.  */
+#  if CHAR_PER_VEC == 64
+	subq	$CHAR_PER_VEC, %rdx
+	jbe	L(ret_zero_end)
+#  endif
+
 	cmpq	%LOOP_REG64, %rdx
 	jbe	L(ret_zero_end)
 # endif
 
 # ifdef USE_AS_WCSCMP
-	movl	(VEC_SIZE * 2)(%rdi, %LOOP_REG64, SIZE_OF_CHAR), %ecx
+	movl	(FALLTHROUGH_RETURN_OFFSET)(%rdi, %LOOP_REG64, SIZE_OF_CHAR), %ecx
 	xorl	%eax, %eax
-	cmpl	(VEC_SIZE * 2)(%rsi, %LOOP_REG64, SIZE_OF_CHAR), %ecx
+	cmpl	(FALLTHROUGH_RETURN_OFFSET)(%rsi, %LOOP_REG64, SIZE_OF_CHAR), %ecx
 	je	L(ret5)
 	setl	%al
 	negl	%eax
 	xorl	%r8d, %eax
 # else
-	movzbl	(VEC_SIZE * 2)(%rdi, %LOOP_REG64), %eax
-	movzbl	(VEC_SIZE * 2)(%rsi, %LOOP_REG64), %ecx
+	movzbl	(FALLTHROUGH_RETURN_OFFSET)(%rdi, %LOOP_REG64), %eax
+	movzbl	(FALLTHROUGH_RETURN_OFFSET)(%rsi, %LOOP_REG64), %ecx
 	TOLOWER_gpr (%rax, %eax)
 	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
@@ -686,23 +748,39 @@ L(ret_zero_end):
 # endif
 
 
+
 	/* The L(return_vec_N_end) differ from L(return_vec_N) in that
-	   they use the value of `r8` to negate the return value. This is
-	   because the page cross logic can swap `rdi` and `rsi`.  */
+	   they use the value of `r8` to negate the return value. This
+	   is because the page cross logic can swap `rdi` and `rsi`.
+	 */
 	.p2align 4,, 10
 # ifdef USE_AS_STRNCMP
 L(return_vec_1_end):
-#  if CHAR_PER_VEC <= 16
+#  if CHAR_PER_VEC <= 32
+	/* If CHAR_PER_VEC <= 32 reuse code from L(return_vec_0_end)
+	   without additional branches by adjusting the bit positions
+	   from VEC1.  We can't do this for CHAR_PER_VEC == 64.  */
+#   if CHAR_PER_VEC <= 16
 	sall	$CHAR_PER_VEC, %ecx
-#  else
+#   else
 	salq	$CHAR_PER_VEC, %rcx
+#   endif
+#  else
+	/* If CHAR_PER_VEC == 64 we can't shift the return GPR so just
+	   check it.  */
+	bsf	%VRCX, %VRCX
+	addl	$(CHAR_PER_VEC), %ecx
+	cmpq	%rcx, %rdx
+	ja	L(ret_vec_0_end_finish)
+	xorl	%eax, %eax
+	ret
 #  endif
 # endif
 L(return_vec_0_end):
 # if (CHAR_PER_VEC <= 16) || !(defined USE_AS_STRNCMP)
-	tzcntl	%ecx, %ecx
+	bsf	%VRCX, %VRCX
 # else
-	tzcntq	%rcx, %rcx
+	bsfq	%rcx, %rcx
 # endif
 
 # ifdef USE_AS_STRNCMP
@@ -710,6 +788,7 @@ L(return_vec_0_end):
 	jbe	L(ret_zero_end)
 # endif
 
+L(ret_vec_0_end_finish):
 # ifdef USE_AS_WCSCMP
 	movl	(%rdi, %rcx, SIZE_OF_CHAR), %edx
 	xorl	%eax, %eax
@@ -737,7 +816,7 @@ L(ret6):
 # ifndef USE_AS_STRNCMP
 	.p2align 4,, 10
 L(return_vec_1_end):
-	tzcntl	%ecx, %ecx
+	bsf	%VRCX, %VRCX
 #  ifdef USE_AS_WCSCMP
 	movl	VEC_SIZE(%rdi, %rcx, SIZE_OF_CHAR), %edx
 	xorl	%eax, %eax
@@ -760,6 +839,41 @@ L(ret7):
 # endif
 
 
+	/* If CHAR_PER_VEC == 64 we can't combine matches from the last
+	   2x VEC so need a separate return label.  */
+# if CHAR_PER_VEC == 64
+L(return_vec_2_end):
+	bsf	%VRCX, %VRCX
+#  ifdef USE_AS_STRNCMP
+	cmpq	%rcx, %rdx
+	jbe	L(ret_zero_end)
+#  endif
+#  ifdef USE_AS_WCSCMP
+	movl	(VEC_SIZE * 2)(%rdi, %rcx, SIZE_OF_CHAR), %edx
+	xorl	%eax, %eax
+	cmpl	(VEC_SIZE * 2)(%rsi, %rcx, SIZE_OF_CHAR), %edx
+	je	L(ret31)
+	setl	%al
+	negl	%eax
+	/* This is the non-zero case for `eax` so just xorl with `r8d`
+	   to flip if `rdi` and `rsi` were swapped.  */
+	xorl	%r8d, %eax
+#  else
+	movzbl	(VEC_SIZE * 2)(%rdi, %rcx), %eax
+	movzbl	(VEC_SIZE * 2)(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
+	subl	%ecx, %eax
+	/* Flip `eax` if `rdi` and `rsi` were swapped in page cross
+	   logic. Subtract `r8d` after xor for zero case.  */
+	xorl	%r8d, %eax
+	subl	%r8d, %eax
+#  endif
+L(ret13):
+	ret
+# endif
+
+
 	/* Page cross in rsi in next 4x VEC.  */
 
 	/* TODO: Improve logic here.  */
@@ -778,11 +892,11 @@ L(page_cross_during_loop):
 	cmpl	$-(VEC_SIZE * 3), %eax
 	jle	L(less_1x_vec_till_page_cross)
 
-	VMOVA	(%rdi), %YMM0
-	VPTESTM	%YMM0, %YMM0, %k2
-	CMP_R1_S2_YMM (%YMM0, (%rsi), %YMM1, %k1){%k2}
-	kmovd	%k1, %ecx
-	TESTEQ	%ecx
+	VMOVA	(%rdi), %VMM(0)
+	VPTESTM	%VMM(0), %VMM(0), %k2
+	CMP_R1_S2_VMM (%VMM(0), (%rsi), %VMM(1), %k1){%k2}
+	KMOV	%k1, %VRCX
+	TESTEQ	%VRCX
 	jnz	L(return_vec_0_end)
 
 	/* if distance >= 2x VEC then eax > -(VEC_SIZE * 2).  */
@@ -799,9 +913,9 @@ L(less_1x_vec_till_page_cross):
 	   to read back -VEC_SIZE. If rdi is truly at the start of a page
 	   here, it means the previous page (rdi - VEC_SIZE) has already
 	   been loaded earlier so must be valid.  */
-	VMOVU	-VEC_SIZE(%rdi, %rax), %YMM0
-	VPTESTM	%YMM0, %YMM0, %k2
-	CMP_R1_S2_YMM (%YMM0, -VEC_SIZE(%rsi, %rax), %YMM1, %k1){%k2}
+	VMOVU	-VEC_SIZE(%rdi, %rax), %VMM(0)
+	VPTESTM	%VMM(0), %VMM(0), %k2
+	CMP_R1_S2_VMM (%VMM(0), -VEC_SIZE(%rsi, %rax), %VMM(1), %k1){%k2}
 	/* Mask of potentially valid bits. The lower bits can be out of
 	   range comparisons (but safe regarding page crosses).  */
 
@@ -813,12 +927,12 @@ L(less_1x_vec_till_page_cross):
 	shlxl	%ecx, %r10d, %ecx
 	movzbl	%cl, %r10d
 # else
-	movl	$-1, %ecx
-	shlxl	%esi, %ecx, %r10d
+	mov	$-1, %VRCX
+	shlx	%VRSI, %VRCX, %VR10
 # endif
 
-	kmovd	%k1, %ecx
-	notl	%ecx
+	KMOV	%k1, %VRCX
+	not	%VRCX
 
 
 # ifdef USE_AS_STRNCMP
@@ -838,12 +952,10 @@ L(less_1x_vec_till_page_cross):
 	/* Readjust eax before potentially returning to the loop.  */
 	addl	$(PAGE_SIZE - VEC_SIZE * 4), %eax
 
-	andl	%r10d, %ecx
+	and	%VR10, %VRCX
 	jz	L(loop_skip_page_cross_check)
 
-	.p2align 4,, 3
-L(return_page_cross_end):
-	tzcntl	%ecx, %ecx
+	bsf	%VRCX, %VRCX
 
 # if (defined USE_AS_STRNCMP) || (defined USE_AS_WCSCMP)
 	leal	-VEC_SIZE(%OFFSET_REG64, %rcx, SIZE_OF_CHAR), %ecx
@@ -874,8 +986,12 @@ L(ret8):
 # ifdef USE_AS_STRNCMP
 	.p2align 4,, 10
 L(return_page_cross_end_check):
-	andl	%r10d, %ecx
-	tzcntl	%ecx, %ecx
+	and	%VR10, %VRCX
+	/* Need to use tzcnt here as VRCX may be zero.  If VRCX is zero
+	   tzcnt(VRCX) will be CHAR_PER_VEC and the remaining length
+	   (edx) is guaranteed to be <= CHAR_PER_VEC so we will only use
+	   the return idx if VRCX was non-zero.  */
+	tzcnt	%VRCX, %VRCX
 	leal	-VEC_SIZE(%rax, %rcx, SIZE_OF_CHAR), %ecx
 #  ifdef USE_AS_WCSCMP
 	sall	$2, %edx
@@ -892,11 +1008,11 @@ L(more_2x_vec_till_page_cross):
 	/* If more 2x vec till cross we will complete a full loop
 	   iteration here.  */
 
-	VMOVA	VEC_SIZE(%rdi), %YMM0
-	VPTESTM	%YMM0, %YMM0, %k2
-	CMP_R1_S2_YMM (%YMM0, VEC_SIZE(%rsi), %YMM1, %k1){%k2}
-	kmovd	%k1, %ecx
-	TESTEQ	%ecx
+	VMOVA	VEC_SIZE(%rdi), %VMM(0)
+	VPTESTM	%VMM(0), %VMM(0), %k2
+	CMP_R1_S2_VMM (%VMM(0), VEC_SIZE(%rsi), %VMM(1), %k1){%k2}
+	KMOV	%k1, %VRCX
+	TESTEQ	%VRCX
 	jnz	L(return_vec_1_end)
 
 # ifdef USE_AS_STRNCMP
@@ -907,18 +1023,18 @@ L(more_2x_vec_till_page_cross):
 	subl	$-(VEC_SIZE * 4), %eax
 
 	/* Safe to include comparisons from lower bytes.  */
-	VMOVU	-(VEC_SIZE * 2)(%rdi, %rax), %YMM0
-	VPTESTM	%YMM0, %YMM0, %k2
-	CMP_R1_S2_YMM (%YMM0, -(VEC_SIZE * 2)(%rsi, %rax), %YMM1, %k1){%k2}
-	kmovd	%k1, %ecx
-	TESTEQ	%ecx
+	VMOVU	-(VEC_SIZE * 2)(%rdi, %rax), %VMM(0)
+	VPTESTM	%VMM(0), %VMM(0), %k2
+	CMP_R1_S2_VMM (%VMM(0), -(VEC_SIZE * 2)(%rsi, %rax), %VMM(1), %k1){%k2}
+	KMOV	%k1, %VRCX
+	TESTEQ	%VRCX
 	jnz	L(return_vec_page_cross_0)
 
-	VMOVU	-(VEC_SIZE * 1)(%rdi, %rax), %YMM0
-	VPTESTM	%YMM0, %YMM0, %k2
-	CMP_R1_S2_YMM (%YMM0, -(VEC_SIZE * 1)(%rsi, %rax), %YMM1, %k1){%k2}
-	kmovd	%k1, %ecx
-	TESTEQ	%ecx
+	VMOVU	-(VEC_SIZE * 1)(%rdi, %rax), %VMM(0)
+	VPTESTM	%VMM(0), %VMM(0), %k2
+	CMP_R1_S2_VMM (%VMM(0), -(VEC_SIZE * 1)(%rsi, %rax), %VMM(1), %k1){%k2}
+	KMOV	%k1, %VRCX
+	TESTEQ	%VRCX
 	jnz	L(return_vec_page_cross_1)
 
 # ifdef USE_AS_STRNCMP
@@ -937,30 +1053,30 @@ L(more_2x_vec_till_page_cross):
 # endif
 
 	/* Finish the loop.  */
-	VMOVA	(VEC_SIZE * 2)(%rdi), %YMM4
-	VMOVA	(VEC_SIZE * 3)(%rdi), %YMM6
-	VPMINU	%YMM4, %YMM6, %YMM9
-	VPTESTM	%YMM9, %YMM9, %k1
+	VMOVA	(VEC_SIZE * 2)(%rdi), %VMM(4)
+	VMOVA	(VEC_SIZE * 3)(%rdi), %VMM(6)
+	VPMINU	%VMM(4), %VMM(6), %VMM(9)
+	VPTESTM	%VMM(9), %VMM(9), %k1
 # ifndef USE_AS_STRCASECMP_L
-	vpxorq	(VEC_SIZE * 2)(%rsi), %YMM4, %YMM5
+	vpxorq	(VEC_SIZE * 2)(%rsi), %VMM(4), %VMM(5)
 	/* YMM6 = YMM5 | ((VEC_SIZE * 3)(%rsi) ^ YMM6).  */
-	vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM5, %YMM6
+	vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %VMM(5), %VMM(6)
 # else
-	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM5
-	TOLOWER_YMM (%YMM4, %YMM5)
-	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM7
-	TOLOWER_YMM (%YMM6, %YMM7)
-	vpxorq	%YMM4, %YMM5, %YMM5
-	vpternlogd $0xde, %YMM7, %YMM5, %YMM6
-# endif
-	VPTESTNM %YMM6, %YMM6, %k0{%k1}
-	kmovd	%k0, %LOOP_REG
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VMM(5)
+	TOLOWER_VMM (%VMM(4), %VMM(5))
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VMM(7)
+	TOLOWER_VMM (%VMM(6), %VMM(7))
+	vpxorq	%VMM(4), %VMM(5), %VMM(5)
+	vpternlogd $0xde, %VMM(7), %VMM(5), %VMM(6)
+# endif
+	VPTESTNM %VMM(6), %VMM(6), %k0{%k1}
+	KMOV	%k0, %LOOP_REG
 	TESTEQ	%LOOP_REG
 	jnz	L(return_vec_2_3_end)
 
 	/* Best for code size to include ucond-jmp here. Would be faster
-	   if this case is hot to duplicate the L(return_vec_2_3_end) code
-	   as fall-through and have jump back to loop on mismatch
+	   if this case is hot to duplicate the L(return_vec_2_3_end)
+	   code as fall-through and have jump back to loop on mismatch
 	   comparison.  */
 	subq	$-(VEC_SIZE * 4), %rdi
 	subq	$-(VEC_SIZE * 4), %rsi
@@ -980,7 +1096,7 @@ L(ret_zero_in_loop_page_cross):
 L(return_vec_page_cross_0):
 	addl	$-VEC_SIZE, %eax
 L(return_vec_page_cross_1):
-	tzcntl	%ecx, %ecx
+	bsf	%VRCX, %VRCX
 # if defined USE_AS_STRNCMP || defined USE_AS_WCSCMP
 	leal	-VEC_SIZE(%rax, %rcx, SIZE_OF_CHAR), %ecx
 #  ifdef USE_AS_STRNCMP
@@ -1023,8 +1139,8 @@ L(ret9):
 L(page_cross):
 # ifndef USE_AS_STRNCMP
 	/* If both are VEC aligned we don't need any special logic here.
-	   Only valid for strcmp where stop condition is guranteed to be
-	   reachable by just reading memory.  */
+	   Only valid for strcmp where stop condition is guaranteed to
+	   be reachable by just reading memory.  */
 	testl	$((VEC_SIZE - 1) << 20), %eax
 	jz	L(no_page_cross)
 # endif
@@ -1065,11 +1181,11 @@ L(page_cross):
 	   loadable memory until within 1x VEC of page cross.  */
 	.p2align 4,, 8
 L(page_cross_loop):
-	VMOVU	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0
-	VPTESTM	%YMM0, %YMM0, %k2
-	CMP_R1_S2_YMM (%YMM0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM1, %k1){%k2}
-	kmovd	%k1, %ecx
-	TESTEQ	%ecx
+	VMOVU	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %VMM(0)
+	VPTESTM	%VMM(0), %VMM(0), %k2
+	CMP_R1_S2_VMM (%VMM(0), (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %VMM(1), %k1){%k2}
+	KMOV	%k1, %VRCX
+	TESTEQ	%VRCX
 	jnz	L(check_ret_vec_page_cross)
 	addl	$CHAR_PER_VEC, %OFFSET_REG
 # ifdef USE_AS_STRNCMP
@@ -1087,13 +1203,13 @@ L(page_cross_loop):
 	subl	%eax, %OFFSET_REG
 	/* OFFSET_REG has distance to page cross - VEC_SIZE. Guranteed
 	   to not cross page so is safe to load. Since we have already
-	   loaded at least 1 VEC from rsi it is also guranteed to be safe.
-	 */
-	VMOVU	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0
-	VPTESTM	%YMM0, %YMM0, %k2
-	CMP_R1_S2_YMM (%YMM0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM1, %k1){%k2}
+	   loaded at least 1 VEC from rsi it is also guaranteed to be
+	   safe.  */
+	VMOVU	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %VMM(0)
+	VPTESTM	%VMM(0), %VMM(0), %k2
+	CMP_R1_S2_VMM (%VMM(0), (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %VMM(1), %k1){%k2}
 
-	kmovd	%k1, %ecx
+	KMOV	%k1, %VRCX
 # ifdef USE_AS_STRNCMP
 	leal	CHAR_PER_VEC(%OFFSET_REG64), %eax
 	cmpq	%rax, %rdx
@@ -1104,7 +1220,7 @@ L(page_cross_loop):
 	addq	%rdi, %rdx
 #  endif
 # endif
-	TESTEQ	%ecx
+	TESTEQ	%VRCX
 	jz	L(prepare_loop_no_len)
 
 	.p2align 4,, 4
@@ -1112,7 +1228,7 @@ L(ret_vec_page_cross):
 # ifndef USE_AS_STRNCMP
 L(check_ret_vec_page_cross):
 # endif
-	tzcntl	%ecx, %ecx
+	tzcnt	%VRCX, %VRCX
 	addl	%OFFSET_REG, %ecx
 L(ret_vec_page_cross_cont):
 # ifdef USE_AS_WCSCMP
@@ -1139,9 +1255,9 @@ L(ret12):
 # ifdef USE_AS_STRNCMP
 	.p2align 4,, 10
 L(check_ret_vec_page_cross2):
-	TESTEQ	%ecx
+	TESTEQ	%VRCX
 L(check_ret_vec_page_cross):
-	tzcntl	%ecx, %ecx
+	tzcnt	%VRCX, %VRCX
 	addl	%OFFSET_REG, %ecx
 	cmpq	%rcx, %rdx
 	ja	L(ret_vec_page_cross_cont)
@@ -1180,8 +1296,71 @@ L(less_1x_vec_till_page):
 # ifdef USE_AS_WCSCMP
 	shrl	$2, %eax
 # endif
+
+	/* Find the largest load size we can use.  For VEC_SIZE == 64,
+	   first check whether we can do a full ymm (32 byte) load.  */
+# if VEC_SIZE == 64
+
+	cmpl	$((VEC_SIZE - 32) / SIZE_OF_CHAR), %eax
+	ja	L(less_32_till_page)
+
+
+	/* Use 32 byte comparison.  */
+	VMOVU	(%rdi), %VMM_256(0)
+	VPTESTM	%VMM_256(0), %VMM_256(0), %k2
+	CMP_R1_S2_YMM (%VMM_256(0), (%rsi), %VMM_256(1), %k1){%k2}
+	kmovd	%k1, %ecx
+#  ifdef USE_AS_WCSCMP
+	subl	$0xff, %ecx
+#  else
+	incl	%ecx
+#  endif
+	jnz	L(check_ret_vec_page_cross)
+	movl	$((VEC_SIZE - 32) / SIZE_OF_CHAR), %OFFSET_REG
+#  ifdef USE_AS_STRNCMP
+	cmpq	%OFFSET_REG64, %rdx
+	jbe	L(ret_zero_page_cross_slow_case64)
+	subl	%eax, %OFFSET_REG
+#  else
+	/* Explicit check for 32 byte alignment.  */
+	subl	%eax, %OFFSET_REG
+	jz	L(prepare_loop)
+#  endif
+	VMOVU	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %VMM_256(0)
+	VPTESTM	%VMM_256(0), %VMM_256(0), %k2
+	CMP_R1_S2_YMM (%VMM_256(0), (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %VMM_256(1), %k1){%k2}
+	kmovd	%k1, %ecx
+#  ifdef USE_AS_WCSCMP
+	subl	$0xff, %ecx
+#  else
+	incl	%ecx
+#  endif
+	jnz	L(check_ret_vec_page_cross)
+#  ifdef USE_AS_STRNCMP
+	addl	$(32 / SIZE_OF_CHAR), %OFFSET_REG
+	subq	%OFFSET_REG64, %rdx
+	jbe	L(ret_zero_page_cross_slow_case64)
+	subq	$-(CHAR_PER_VEC * 4), %rdx
+
+	leaq	-(VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
+	leaq	-(VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
+#  else
+	leaq	(32 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
+	leaq	(32 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
+#  endif
+	jmp	L(prepare_loop_aligned)
+
+#  ifdef USE_AS_STRNCMP
+	.p2align 4,, 2
+L(ret_zero_page_cross_slow_case64):
+	xorl	%eax, %eax
+	ret
+#  endif
+L(less_32_till_page):
+# endif
+
 	/* Find largest load size we can use.  */
-	cmpl	$(16 / SIZE_OF_CHAR), %eax
+	cmpl	$((VEC_SIZE - 16) / SIZE_OF_CHAR), %eax
 	ja	L(less_16_till_page)
 
 	/* Use 16 byte comparison.  */
@@ -1195,9 +1374,14 @@ L(less_1x_vec_till_page):
 	incw	%cx
 # endif
 	jnz	L(check_ret_vec_page_cross)
-	movl	$(16 / SIZE_OF_CHAR), %OFFSET_REG
+
+	movl	$((VEC_SIZE - 16) / SIZE_OF_CHAR), %OFFSET_REG
 # ifdef USE_AS_STRNCMP
+#  if VEC_SIZE == 32
 	cmpq	%OFFSET_REG64, %rdx
+#  else
+	cmpq	$(16 / SIZE_OF_CHAR), %rdx
+#  endif
 	jbe	L(ret_zero_page_cross_slow_case0)
 	subl	%eax, %OFFSET_REG
 # else
@@ -1239,7 +1423,7 @@ L(ret_zero_page_cross_slow_case0):
 
 	.p2align 4,, 10
 L(less_16_till_page):
-	cmpl	$(24 / SIZE_OF_CHAR), %eax
+	cmpl	$((VEC_SIZE - 8) / SIZE_OF_CHAR), %eax
 	ja	L(less_8_till_page)
 
 	/* Use 8 byte comparison.  */
@@ -1260,7 +1444,7 @@ L(less_16_till_page):
 	cmpq	$(8 / SIZE_OF_CHAR), %rdx
 	jbe	L(ret_zero_page_cross_slow_case0)
 # endif
-	movl	$(24 / SIZE_OF_CHAR), %OFFSET_REG
+	movl	$((VEC_SIZE - 8) / SIZE_OF_CHAR), %OFFSET_REG
 	subl	%eax, %OFFSET_REG
 
 	vmovq	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
@@ -1320,7 +1504,7 @@ L(ret_less_8_wcs):
 	ret
 
 # else
-	cmpl	$28, %eax
+	cmpl	$(VEC_SIZE - 4), %eax
 	ja	L(less_4_till_page)
 
 	vmovd	(%rdi), %xmm0
@@ -1335,7 +1519,7 @@ L(ret_less_8_wcs):
 	cmpq	$4, %rdx
 	jbe	L(ret_zero_page_cross_slow_case1)
 #  endif
-	movl	$(28 / SIZE_OF_CHAR), %OFFSET_REG
+	movl	$((VEC_SIZE - 4) / SIZE_OF_CHAR), %OFFSET_REG
 	subl	%eax, %OFFSET_REG
 
 	vmovd	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
@@ -1386,7 +1570,7 @@ L(less_4_loop):
 #  endif
 	incq	%rdi
 	/* end condition is reach page boundary (rdi is aligned).  */
-	testl	$31, %edi
+	testb	$(VEC_SIZE - 1), %dil
 	jnz	L(less_4_loop)
 	leaq	-(VEC_SIZE * 4)(%rdi, %rsi), %rsi
 	addq	$-(VEC_SIZE * 4), %rdi
-- 
2.34.1
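
A quick sanity check on the two vpternlogd constants used in the loop
above, for anyone verifying them: the imm8 is a 3-input truth table
indexed per bit position by (dest, src1, src2), with the destination
operand supplying the high bit of the index.  For A = B | (A ^ C),
the "xor while oring" case:

	index (A,B,C): 111 110 101 100 011 010 001 000
	B | (A ^ C)  :  1   1   0   1   1   1   1   0   -> 0b11011110 = 0xde

For the plain 3-way OR, only the (0,0,0) row is zero, giving
0b11111110 = 0xfe.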


^ permalink raw reply	[flat|nested] 41+ messages in thread

* [PATCH v1 7/7] Bench: Improve benchtests for memchr, strchr, strnlen, strrchr
  2022-10-18  2:48 [PATCH v1 1/7] x86: Optimize memchr-evex.S and implement with VMM headers Noah Goldstein
                   ` (4 preceding siblings ...)
  2022-10-18  2:49 ` [PATCH v1 6/7] x86: Add support for VEC_SIZE == 64 in strcmp-evex.S impl Noah Goldstein
@ 2022-10-18  2:49 ` Noah Goldstein
  2022-10-18 21:00   ` H.J. Lu
  2022-10-18  2:50 ` [PATCH v1 1/7] x86: Optimize memchr-evex.S and implement with VMM headers Noah Goldstein
                   ` (2 subsequent siblings)
  8 siblings, 1 reply; 41+ messages in thread
From: Noah Goldstein @ 2022-10-18  2:49 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

1. Add more complete coverage in the medium size range.
2. In strnlen remove the `1 << i`, which was UB (`i` could go beyond
   32/64); a minimal illustration follows after this list.
3. Add timer for total benchmark runtime (useful for deciding about
   tradeoff between coverage and runtime).
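
A minimal illustration of the UB in point 2 (not the benchmark code
itself): in C, shifting by a count greater than or equal to the width
of the promoted left operand is undefined, so the old expression was
invalid once `i` reached the bit width:

	size_t i = 40;
	int x = 1 << i;	/* undefined behavior: 40 >= 32, the width of int */

The fixed tests pass `i` (and `i` +/- `j`) directly, which is the
length/position the loops actually wanted.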
---
 benchtests/bench-memchr.c    | 83 +++++++++++++++++++++++++-----------
 benchtests/bench-rawmemchr.c | 36 ++++++++++++++--
 benchtests/bench-strchr.c    | 42 +++++++++++++-----
 benchtests/bench-strnlen.c   | 19 ++++++---
 benchtests/bench-strrchr.c   | 33 +++++++++++++-
 5 files changed, 166 insertions(+), 47 deletions(-)

diff --git a/benchtests/bench-memchr.c b/benchtests/bench-memchr.c
index 0facda2fa0..c4d758ae61 100644
--- a/benchtests/bench-memchr.c
+++ b/benchtests/bench-memchr.c
@@ -126,9 +126,10 @@ do_test (json_ctx_t *json_ctx, size_t align, size_t pos, size_t len,
 int
 test_main (void)
 {
-  size_t i;
+  size_t i, j, al, al_max;
   int repeats;
   json_ctx_t json_ctx;
+  timing_t bench_start, bench_stop, bench_total_time;
   test_init ();
 
   json_init (&json_ctx, 0, stdout);
@@ -147,35 +148,47 @@ test_main (void)
 
   json_array_begin (&json_ctx, "results");
 
+  TIMING_NOW (bench_start);
+  al_max = 0;
+#ifdef USE_AS_MEMRCHR
+  al_max = getpagesize () / 2;
+#endif
+
   for (repeats = 0; repeats < 2; ++repeats)
     {
-      for (i = 1; i < 8; ++i)
+      for (al = 0; al <= al_max; al += getpagesize () / 2)
 	{
-	  do_test (&json_ctx, 0, 16 << i, 2048, 23, repeats);
-	  do_test (&json_ctx, i, 64, 256, 23, repeats);
-	  do_test (&json_ctx, 0, 16 << i, 2048, 0, repeats);
-	  do_test (&json_ctx, i, 64, 256, 0, repeats);
-
-	  do_test (&json_ctx, getpagesize () - 15, 64, 256, 0, repeats);
+	  for (i = 1; i < 8; ++i)
+	    {
+	      do_test (&json_ctx, al, 16 << i, 2048, 23, repeats);
+	      do_test (&json_ctx, al + i, 64, 256, 23, repeats);
+	      do_test (&json_ctx, al, 16 << i, 2048, 0, repeats);
+	      do_test (&json_ctx, al + i, 64, 256, 0, repeats);
+
+	      do_test (&json_ctx, al + getpagesize () - 15, 64, 256, 0,
+		       repeats);
 #ifdef USE_AS_MEMRCHR
-	  /* Also test the position close to the beginning for memrchr.  */
-	  do_test (&json_ctx, 0, i, 256, 23, repeats);
-	  do_test (&json_ctx, 0, i, 256, 0, repeats);
-	  do_test (&json_ctx, i, i, 256, 23, repeats);
-	  do_test (&json_ctx, i, i, 256, 0, repeats);
+	      /* Also test the position close to the beginning for memrchr.  */
+	      do_test (&json_ctx, al, i, 256, 23, repeats);
+	      do_test (&json_ctx, al, i, 256, 0, repeats);
+	      do_test (&json_ctx, al + i, i, 256, 23, repeats);
+	      do_test (&json_ctx, al + i, i, 256, 0, repeats);
 #endif
+	    }
+	  for (i = 1; i < 8; ++i)
+	    {
+	      do_test (&json_ctx, al + i, i << 5, 192, 23, repeats);
+	      do_test (&json_ctx, al + i, i << 5, 192, 0, repeats);
+	      do_test (&json_ctx, al + i, i << 5, 256, 23, repeats);
+	      do_test (&json_ctx, al + i, i << 5, 256, 0, repeats);
+	      do_test (&json_ctx, al + i, i << 5, 512, 23, repeats);
+	      do_test (&json_ctx, al + i, i << 5, 512, 0, repeats);
+
+	      do_test (&json_ctx, al + getpagesize () - 15, i << 5, 256, 23,
+		       repeats);
+	    }
 	}
-      for (i = 1; i < 8; ++i)
-	{
-	  do_test (&json_ctx, i, i << 5, 192, 23, repeats);
-	  do_test (&json_ctx, i, i << 5, 192, 0, repeats);
-	  do_test (&json_ctx, i, i << 5, 256, 23, repeats);
-	  do_test (&json_ctx, i, i << 5, 256, 0, repeats);
-	  do_test (&json_ctx, i, i << 5, 512, 23, repeats);
-	  do_test (&json_ctx, i, i << 5, 512, 0, repeats);
-
-	  do_test (&json_ctx, getpagesize () - 15, i << 5, 256, 23, repeats);
-	}
+
       for (i = 1; i < 32; ++i)
 	{
 	  do_test (&json_ctx, 0, i, i + 1, 23, repeats);
@@ -207,11 +220,33 @@ test_main (void)
 	  do_test (&json_ctx, 0, 2, i + 1, 0, repeats);
 #endif
 	}
+      for (al = 0; al <= al_max; al += getpagesize () / 2)
+	{
+	  for (i = (16 / sizeof (CHAR)); i <= (8192 / sizeof (CHAR)); i += i)
+	    {
+	      for (j = 0; j <= (384 / sizeof (CHAR));
+		   j += (32 / sizeof (CHAR)))
+		{
+		  do_test (&json_ctx, al, i + j, i, 23, repeats);
+		  do_test (&json_ctx, al, i, i + j, 23, repeats);
+		  if (j < i)
+		    {
+		      do_test (&json_ctx, al, i - j, i, 23, repeats);
+		      do_test (&json_ctx, al, i, i - j, 23, repeats);
+		    }
+		}
+	    }
+	}
+
 #ifndef USE_AS_MEMRCHR
       break;
 #endif
     }
 
+  TIMING_NOW (bench_stop);
+  TIMING_DIFF (bench_total_time, bench_start, bench_stop);
+  json_attr_double (&json_ctx, "benchtime", bench_total_time);
+
   json_array_end (&json_ctx);
   json_attr_object_end (&json_ctx);
   json_attr_object_end (&json_ctx);
diff --git a/benchtests/bench-rawmemchr.c b/benchtests/bench-rawmemchr.c
index b1803afc14..667ecd48f9 100644
--- a/benchtests/bench-rawmemchr.c
+++ b/benchtests/bench-rawmemchr.c
@@ -70,7 +70,7 @@ do_test (json_ctx_t *json_ctx, size_t align, size_t pos, size_t len, int seek_ch
   size_t i;
   char *result;
 
-  align &= 7;
+  align &= getpagesize () - 1;
   if (align + len >= page_size)
     return;
 
@@ -106,7 +106,7 @@ test_main (void)
 {
   json_ctx_t json_ctx;
   size_t i;
-
+  timing_t bench_start, bench_stop, bench_total_time;
   test_init ();
 
   json_init (&json_ctx, 0, stdout);
@@ -120,11 +120,12 @@ test_main (void)
 
   json_array_begin (&json_ctx, "ifuncs");
   FOR_EACH_IMPL (impl, 0)
-      json_element_string (&json_ctx, impl->name);
+    json_element_string (&json_ctx, impl->name);
   json_array_end (&json_ctx);
 
   json_array_begin (&json_ctx, "results");
 
+  TIMING_NOW (bench_start);
   for (i = 1; i < 7; ++i)
     {
       do_test (&json_ctx, 0, 16 << i, 2048, 23);
@@ -137,6 +138,35 @@ test_main (void)
       do_test (&json_ctx, 0, i, i + 1, 23);
       do_test (&json_ctx, 0, i, i + 1, 0);
     }
+  for (; i < 256; i += 32)
+    {
+      do_test (&json_ctx, 0, i, i + 1, 23);
+      do_test (&json_ctx, 0, i - 1, i, 23);
+    }
+  for (; i < 512; i += 64)
+    {
+      do_test (&json_ctx, 0, i, i + 1, 23);
+      do_test (&json_ctx, 0, i - 1, i, 23);
+    }
+  for (; i < 1024; i += 128)
+    {
+      do_test (&json_ctx, 0, i, i + 1, 23);
+      do_test (&json_ctx, 0, i - 1, i, 23);
+    }
+  for (; i < 2048; i += 256)
+    {
+      do_test (&json_ctx, 0, i, i + 1, 23);
+      do_test (&json_ctx, 0, i - 1, i, 23);
+    }
+  for (; i < 4096; i += 512)
+    {
+      do_test (&json_ctx, 0, i, i + 1, 23);
+      do_test (&json_ctx, 0, i - 1, i, 23);
+    }
+
+  TIMING_NOW (bench_stop);
+  TIMING_DIFF (bench_total_time, bench_start, bench_stop);
+  json_attr_double (&json_ctx, "benchtime", bench_total_time);
 
   json_array_end (&json_ctx);
   json_attr_object_end (&json_ctx);
diff --git a/benchtests/bench-strchr.c b/benchtests/bench-strchr.c
index 54640bde7e..af325806ce 100644
--- a/benchtests/bench-strchr.c
+++ b/benchtests/bench-strchr.c
@@ -287,8 +287,8 @@ int
 test_main (void)
 {
   json_ctx_t json_ctx;
-  size_t i;
-
+  size_t i, j;
+  timing_t bench_start, bench_stop, bench_total_time;
   test_init ();
 
   json_init (&json_ctx, 0, stdout);
@@ -307,6 +307,7 @@ test_main (void)
 
   json_array_begin (&json_ctx, "results");
 
+  TIMING_NOW (bench_start);
   for (i = 1; i < 8; ++i)
     {
       do_test (&json_ctx, 0, 16 << i, 2048, SMALL_CHAR, MIDDLE_CHAR);
@@ -367,15 +368,34 @@ test_main (void)
       do_test (&json_ctx, 0, i, i + 1, 0, BIG_CHAR);
     }
 
-  DO_RAND_TEST(&json_ctx, 0, 15, 16, 0.0);
-  DO_RAND_TEST(&json_ctx, 0, 15, 16, 0.1);
-  DO_RAND_TEST(&json_ctx, 0, 15, 16, 0.25);
-  DO_RAND_TEST(&json_ctx, 0, 15, 16, 0.33);
-  DO_RAND_TEST(&json_ctx, 0, 15, 16, 0.5);
-  DO_RAND_TEST(&json_ctx, 0, 15, 16, 0.66);
-  DO_RAND_TEST(&json_ctx, 0, 15, 16, 0.75);
-  DO_RAND_TEST(&json_ctx, 0, 15, 16, 0.9);
-  DO_RAND_TEST(&json_ctx, 0, 15, 16, 1.0);
+  for (i = 16 / sizeof (CHAR); i <= 8192 / sizeof (CHAR); i += i)
+    {
+      for (j = 32 / sizeof (CHAR); j <= 320 / sizeof (CHAR);
+	   j += 32 / sizeof (CHAR))
+	{
+	  do_test (&json_ctx, 0, i, i + j, 0, MIDDLE_CHAR);
+	  do_test (&json_ctx, 0, i + j, i, 0, MIDDLE_CHAR);
+	  if (i > j)
+	    {
+	      do_test (&json_ctx, 0, i, i - j, 0, MIDDLE_CHAR);
+	      do_test (&json_ctx, 0, i - j, i, 0, MIDDLE_CHAR);
+	    }
+	}
+    }
+
+  DO_RAND_TEST (&json_ctx, 0, 15, 16, 0.0);
+  DO_RAND_TEST (&json_ctx, 0, 15, 16, 0.1);
+  DO_RAND_TEST (&json_ctx, 0, 15, 16, 0.25);
+  DO_RAND_TEST (&json_ctx, 0, 15, 16, 0.33);
+  DO_RAND_TEST (&json_ctx, 0, 15, 16, 0.5);
+  DO_RAND_TEST (&json_ctx, 0, 15, 16, 0.66);
+  DO_RAND_TEST (&json_ctx, 0, 15, 16, 0.75);
+  DO_RAND_TEST (&json_ctx, 0, 15, 16, 0.9);
+  DO_RAND_TEST (&json_ctx, 0, 15, 16, 1.0);
+
+  TIMING_NOW (bench_stop);
+  TIMING_DIFF (bench_total_time, bench_start, bench_stop);
+  json_attr_double (&json_ctx, "benchtime", bench_total_time);
 
   json_array_end (&json_ctx);
   json_attr_object_end (&json_ctx);
diff --git a/benchtests/bench-strnlen.c b/benchtests/bench-strnlen.c
index 13b46b3f57..c6281b6373 100644
--- a/benchtests/bench-strnlen.c
+++ b/benchtests/bench-strnlen.c
@@ -117,7 +117,7 @@ test_main (void)
 {
   size_t i, j;
   json_ctx_t json_ctx;
-
+  timing_t bench_start, bench_stop, bench_total_time;
   test_init ();
 
   json_init (&json_ctx, 0, stdout);
@@ -136,6 +136,7 @@ test_main (void)
 
   json_array_begin (&json_ctx, "results");
 
+  TIMING_NOW (bench_start);
   for (i = 0; i <= 1; ++i)
     {
       do_test (&json_ctx, i, 1, 128, MIDDLE_CHAR);
@@ -195,23 +196,27 @@ test_main (void)
     {
       for (j = 0; j <= (704 / sizeof (CHAR)); j += (32 / sizeof (CHAR)))
 	{
-	  do_test (&json_ctx, 0, 1 << i, (i + j), BIG_CHAR);
 	  do_test (&json_ctx, 0, i + j, i, BIG_CHAR);
-
-	  do_test (&json_ctx, 64, 1 << i, (i + j), BIG_CHAR);
 	  do_test (&json_ctx, 64, i + j, i, BIG_CHAR);
 
+	  do_test (&json_ctx, 0, i, i + j, BIG_CHAR);
+	  do_test (&json_ctx, 64, i, i + j, BIG_CHAR);
+
 	  if (j < i)
 	    {
-	      do_test (&json_ctx, 0, 1 << i, i - j, BIG_CHAR);
 	      do_test (&json_ctx, 0, i - j, i, BIG_CHAR);
-
-	      do_test (&json_ctx, 64, 1 << i, i - j, BIG_CHAR);
 	      do_test (&json_ctx, 64, i - j, i, BIG_CHAR);
+
+	      do_test (&json_ctx, 0, i, i - j, BIG_CHAR);
+	      do_test (&json_ctx, 64, i, i - j, BIG_CHAR);
 	    }
 	}
     }
 
+  TIMING_NOW (bench_stop);
+  TIMING_DIFF (bench_total_time, bench_start, bench_stop);
+  json_attr_double (&json_ctx, "benchtime", bench_total_time);
+
   json_array_end (&json_ctx);
   json_attr_object_end (&json_ctx);
   json_attr_object_end (&json_ctx);
diff --git a/benchtests/bench-strrchr.c b/benchtests/bench-strrchr.c
index 7cd2a15484..e6d8163047 100644
--- a/benchtests/bench-strrchr.c
+++ b/benchtests/bench-strrchr.c
@@ -151,8 +151,9 @@ int
 test_main (void)
 {
   json_ctx_t json_ctx;
-  size_t i, j;
+  size_t i, j, k;
   int seek;
+  timing_t bench_start, bench_stop, bench_total_time;
 
   test_init ();
   json_init (&json_ctx, 0, stdout);
@@ -171,9 +172,10 @@ test_main (void)
 
   json_array_begin (&json_ctx, "results");
 
+  TIMING_NOW (bench_start);
   for (seek = 0; seek <= 23; seek += 23)
     {
-      for (j = 1; j < 32; j += j)
+      for (j = 1; j <= 256; j = (j * 4))
 	{
 	  for (i = 1; i < 9; ++i)
 	    {
@@ -197,12 +199,39 @@ test_main (void)
 	      do_test (&json_ctx, getpagesize () - i / 2 - 1, i, i + 1, seek,
 		       SMALL_CHAR, j);
 	    }
+
+	  for (i = (16 / sizeof (CHAR)); i <= (288 / sizeof (CHAR)); i += 32)
+	    {
+	      do_test (&json_ctx, 0, i - 16, i, seek, SMALL_CHAR, j);
+	      do_test (&json_ctx, 0, i, i + 16, seek, SMALL_CHAR, j);
+	    }
+
+	  for (i = (16 / sizeof (CHAR)); i <= (2048 / sizeof (CHAR)); i += i)
+	    {
+	      for (k = 0; k <= (288 / sizeof (CHAR));
+		   k += (48 / sizeof (CHAR)))
+		{
+		  do_test (&json_ctx, 0, k, i, seek, SMALL_CHAR, j);
+		  do_test (&json_ctx, 0, i, i + k, seek, SMALL_CHAR, j);
+
+		  if (k < i)
+		    {
+		      do_test (&json_ctx, 0, i - k, i, seek, SMALL_CHAR, j);
+		      do_test (&json_ctx, 0, k, i - k, seek, SMALL_CHAR, j);
+		      do_test (&json_ctx, 0, i, i - k, seek, SMALL_CHAR, j);
+		    }
+		}
+	    }
+
 	  if (seek == 0)
 	    {
 	      break;
 	    }
 	}
     }
+  TIMING_NOW (bench_stop);
+  TIMING_DIFF (bench_total_time, bench_start, bench_stop);
+  json_attr_double (&json_ctx, "benchtime", bench_total_time);
 
   json_array_end (&json_ctx);
   json_attr_object_end (&json_ctx);
-- 
2.34.1


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH v1 1/7] x86: Optimize memchr-evex.S and implement with VMM headers
  2022-10-18  2:48 [PATCH v1 1/7] x86: Optimize memchr-evex.S and implement with VMM headers Noah Goldstein
                   ` (5 preceding siblings ...)
  2022-10-18  2:49 ` [PATCH v1 7/7] Bench: Improve benchtests for memchr, strchr, strnlen, strrchr Noah Goldstein
@ 2022-10-18  2:50 ` Noah Goldstein
  2022-10-18 23:19 ` [PATCH v2 " Noah Goldstein
  2022-10-19  0:44 ` [PATCH v3 1/7] x86: Optimize memchr-evex.S and implement with VMM headers Noah Goldstein
  8 siblings, 0 replies; 41+ messages in thread
From: Noah Goldstein @ 2022-10-18  2:50 UTC (permalink / raw)
  To: libc-alpha; +Cc: hjl.tools, carlos

[-- Attachment #1: Type: text/plain, Size: 49429 bytes --]

On Mon, Oct 17, 2022 at 7:49 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Optimizations are:
>
> 1. Use the fact that tzcnt(0) -> VEC_SIZE for memchr to save a branch
>    in short string case.
> 2. Restructure code so that small strings are given the hot path.
>         - This is a net-zero on the benchmark suite but in general makes
>       sense as smaller sizes are far more common.
> 3. Use more code-size efficient instructions.
>         - tzcnt ...     -> bsf ...
>         - vpcmpb $0 ... -> vpcmpeq ...
> 4. Align labels less aggressively, especially if it doesn't save fetch
>    blocks / causes the basic-block to span extra cache-lines.
>
> The optimizations (especially for point 2) make the memchr and
> rawmemchr code essentially incompatible so split rawmemchr-evex
> to a new file.
>
> Code Size Changes:
> memchr-evex.S       : -107 bytes
> rawmemchr-evex.S    :  -53 bytes
>
> Net perf changes:
>
> Reported as geometric mean of all improvements / regressions from N=10
> runs of the benchtests. Value is New Time / Old Time, so < 1.0 is an
> improvement and > 1.0 is a regression.
>
> memchr-evex.S       : 0.928
> rawmemchr-evex.S    : 0.986 (Fewer targets cross cache lines)
>
> Full results attached in email.
>
> Full check passes on x86-64.
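
To make point 1 concrete: below is a minimal C sketch (hypothetical
names, CHAR_PER_VEC fixed at 32; the real logic is the assembly in the
diff) of how tzcnt(0) == CHAR_PER_VEC lets a single length compare
cover both the no-match case and the out-of-bounds-match case:

#include <stddef.h>
#include <stdint.h>

#define CHAR_PER_VEC 32

/* tzcnt semantics: the result is 32 for a zero input, unlike bsf.  */
static unsigned int
tzcnt32 (uint32_t x)
{
  return x ? (unsigned int) __builtin_ctz (x) : 32;
}

/* mask holds one bit per byte of the first vector's compare result.  */
static const char *
check_first_vec (const char *s, uint32_t mask, size_t len)
{
  unsigned int i = tzcnt32 (mask);
  /* One branch covers both cases: mask == 0 gives i == 32, so any
     len <= 32 ends the scan; a match only counts if len > i.  */
  if (len <= i)
    return NULL;
  if (mask != 0)
    return s + i;	/* In-bounds match in the first vector.  */
  return NULL;		/* Placeholder: the real code falls through to
			   L(more_1x_vec) and keeps scanning.  */
}

This is also why the tzcnt -> bsf swap in point 3 is only safe where
the mask is known to be non-zero: bsf's destination is architecturally
undefined for a zero input.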
> ---
>  sysdeps/x86_64/multiarch/memchr-evex.S        | 939 ++++++++++--------
>  sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S |   9 +-
>  sysdeps/x86_64/multiarch/rawmemchr-evex.S     | 313 +++++-
>  3 files changed, 851 insertions(+), 410 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S
> index 0dd4f1dcce..23a1c0018e 100644
> --- a/sysdeps/x86_64/multiarch/memchr-evex.S
> +++ b/sysdeps/x86_64/multiarch/memchr-evex.S
> @@ -21,17 +21,27 @@
>
>  #if ISA_SHOULD_BUILD (4)
>
> +# ifndef VEC_SIZE
> +#  include "x86-evex256-vecs.h"
> +# endif
> +
>  # ifndef MEMCHR
>  #  define MEMCHR       __memchr_evex
>  # endif
>
>  # ifdef USE_AS_WMEMCHR
> +#  define PC_SHIFT_GPR rcx
> +#  define VPTESTN      vptestnmd
>  #  define VPBROADCAST  vpbroadcastd
>  #  define VPMINU       vpminud
>  #  define VPCMP        vpcmpd
>  #  define VPCMPEQ      vpcmpeqd
>  #  define CHAR_SIZE    4
> +
> +#  define USE_WIDE_CHAR
>  # else
> +#  define PC_SHIFT_GPR rdi
> +#  define VPTESTN      vptestnmb
>  #  define VPBROADCAST  vpbroadcastb
>  #  define VPMINU       vpminub
>  #  define VPCMP        vpcmpb
> @@ -39,534 +49,661 @@
>  #  define CHAR_SIZE    1
>  # endif
>
> -       /* In the 4x loop the RTM and non-RTM versions have data pointer
> -          off by VEC_SIZE * 4 with RTM version being VEC_SIZE * 4 greater.
> -          This is represented by BASE_OFFSET. As well because the RTM
> -          version uses vpcmp which stores a bit per element compared where
> -          the non-RTM version uses vpcmpeq which stores a bit per byte
> -          compared RET_SCALE of CHAR_SIZE is only relevant for the RTM
> -          version.  */
> -# ifdef USE_IN_RTM
> +# include "reg-macros.h"
> +
> +
> +/* If not in RTM and VEC_SIZE != 64 (the VEC_SIZE == 64 build
> +   doesn't have a VEX encoding), use VEX encoding in the loop so we
> +   can use vpcmpeqb + vptern, which is more efficient than the
> +   EVEX alternative.  */
> +# if defined USE_IN_RTM || VEC_SIZE == 64
> +#  undef COND_VZEROUPPER
> +#  undef VZEROUPPER_RETURN
> +#  undef VZEROUPPER
> +
> +#  define COND_VZEROUPPER
> +#  define VZEROUPPER_RETURN    ret
>  #  define VZEROUPPER
> -#  define BASE_OFFSET  (VEC_SIZE * 4)
> -#  define RET_SCALE    CHAR_SIZE
> +
> +#  define USE_TERN_IN_LOOP     0
>  # else
> +#  define USE_TERN_IN_LOOP     1
> +#  undef VZEROUPPER
>  #  define VZEROUPPER   vzeroupper
> -#  define BASE_OFFSET  0
> -#  define RET_SCALE    1
>  # endif
>
> -       /* In the return from 4x loop memchr and rawmemchr versions have
> -          data pointers off by VEC_SIZE * 4 with memchr version being
> -          VEC_SIZE * 4 greater.  */
> -# ifdef USE_AS_RAWMEMCHR
> -#  define RET_OFFSET   (BASE_OFFSET - (VEC_SIZE * 4))
> -#  define RAW_PTR_REG  rcx
> -#  define ALGN_PTR_REG rdi
> +# if USE_TERN_IN_LOOP
> +       /* The resulting bitmask for vpmovmskb has 4 bits set for each
> +          wchar so we don't want to multiply the resulting index.  */
> +#  define TERN_CHAR_MULT       1
> +
> +#  ifdef USE_AS_WMEMCHR
> +#   define TEST_END()  inc %VRCX
> +#  else
> +#   define TEST_END()  add %rdx, %rcx
> +#  endif
>  # else
> -#  define RET_OFFSET   BASE_OFFSET
> -#  define RAW_PTR_REG  rdi
> -#  define ALGN_PTR_REG rcx
> +#  define TERN_CHAR_MULT       CHAR_SIZE
> +#  define TEST_END()   KORTEST %k2, %k3
>  # endif
>
> -# define XMMZERO       xmm23
> -# define YMMZERO       ymm23
> -# define XMMMATCH      xmm16
> -# define YMMMATCH      ymm16
> -# define YMM1          ymm17
> -# define YMM2          ymm18
> -# define YMM3          ymm19
> -# define YMM4          ymm20
> -# define YMM5          ymm21
> -# define YMM6          ymm22
> +# if defined USE_AS_WMEMCHR || !USE_TERN_IN_LOOP
> +#  ifndef USE_AS_WMEMCHR
> +#   define GPR_X0_IS_RET       1
> +#  else
> +#   define GPR_X0_IS_RET       0
> +#  endif
> +#  define GPR_X0       rax
> +# else
> +#  define GPR_X0_IS_RET        0
> +#  define GPR_X0       rdx
> +# endif
> +
> +# define CHAR_PER_VEC  (VEC_SIZE / CHAR_SIZE)
>
> -# ifndef SECTION
> -#  define SECTION(p)   p##.evex
> +# if CHAR_PER_VEC == 64
> +#  define LAST_VEC_OFFSET      (VEC_SIZE * 3)
> +# else
> +#  define LAST_VEC_OFFSET      (VEC_SIZE * 2)
> +# endif
> +# if CHAR_PER_VEC >= 32
> +#  define MASK_GPR(...)        VGPR(__VA_ARGS__)
> +# elif CHAR_PER_VEC == 16
> +#  define MASK_GPR(reg)        VGPR_SZ(reg, 16)
> +# else
> +#  define MASK_GPR(reg)        VGPR_SZ(reg, 8)
>  # endif
>
> -# define VEC_SIZE 32
> -# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
> -# define PAGE_SIZE 4096
> +# define VMATCH        VMM(0)
> +# define VMATCH_LO     VMM_lo(0)
>
> -       .section SECTION(.text),"ax",@progbits
> +# define PAGE_SIZE     4096
> +
> +
> +       .section SECTION(.text), "ax", @progbits
>  ENTRY_P2ALIGN (MEMCHR, 6)
> -# ifndef USE_AS_RAWMEMCHR
>         /* Check for zero length.  */
>         test    %RDX_LP, %RDX_LP
> -       jz      L(zero)
> +       jz      L(zero_0)
>
> -#  ifdef __ILP32__
> +# ifdef __ILP32__
>         /* Clear the upper 32 bits.  */
>         movl    %edx, %edx
> -#  endif
>  # endif
> -       /* Broadcast CHAR to YMMMATCH.  */
> -       VPBROADCAST %esi, %YMMMATCH
> +       VPBROADCAST %esi, %VMATCH
>         /* Check if we may cross page boundary with one vector load.  */
>         movl    %edi, %eax
>         andl    $(PAGE_SIZE - 1), %eax
>         cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
> -       ja      L(cross_page_boundary)
> +       ja      L(page_cross)
> +
> +       VPCMPEQ (%rdi), %VMATCH, %k0
> +       KMOV    %k0, %VRAX
> +# ifndef USE_AS_WMEMCHR
> +       /* If rax is zero then tzcnt -> CHAR_PER_VEC.  NB: there is
> +          already a dependency between rax and rsi so no worries about
> +          a false dependency here.  */
> +       tzcnt   %VRAX, %VRSI
> +       /* If rdx <= rsi then either 1) rax was non-zero (there was a
> +          match) but it was out of bounds or 2) rax was zero and rdx
> +          was <= VEC_SIZE so we are done scanning.  */
> +       cmpq    %rsi, %rdx
> +       /* NB: Use branch to return zero/non-zero.  Common usage will
> +          branch on result of function (if return is null/non-null).
> +          This branch can be used to predict the ensuing one so there
> +          is no reason to extend the data-dependency with cmovcc.  */
> +       jbe     L(zero_0)
> +
> +       /* If rax is zero then len must be > CHAR_PER_VEC, otherwise
> +          since we already tested len vs. tzcnt(rax) (in rsi) we are
> +          good to return this match.  */
> +       test    %VRAX, %VRAX
> +       jz      L(more_1x_vec)
> +       leaq    (%rdi, %rsi), %rax
> +# else
>
> -       /* Check the first VEC_SIZE bytes.  */
> -       VPCMP   $0, (%rdi), %YMMMATCH, %k0
> -       kmovd   %k0, %eax
> -# ifndef USE_AS_RAWMEMCHR
> -       /* If length < CHAR_PER_VEC handle special.  */
> +       /* We can't use the `tzcnt` trick for wmemchr because CHAR_SIZE
> +          > 1, so tzcnt(0) != CHAR_PER_VEC.  */
>         cmpq    $CHAR_PER_VEC, %rdx
> -       jbe     L(first_vec_x0)
> -# endif
> -       testl   %eax, %eax
> -       jz      L(aligned_more)
> -       tzcntl  %eax, %eax
> -# ifdef USE_AS_WMEMCHR
> -       /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
> +       ja      L(more_1x_vec)
> +       tzcnt   %VRAX, %VRAX
> +       cmpl    %eax, %edx
> +       jbe     L(zero_0)
> +L(first_vec_x0_ret):
>         leaq    (%rdi, %rax, CHAR_SIZE), %rax
> -# else
> -       addq    %rdi, %rax
>  # endif
>         ret
>
> -# ifndef USE_AS_RAWMEMCHR
> -L(zero):
> -       xorl    %eax, %eax
> -       ret
> -
> -       .p2align 4
> -L(first_vec_x0):
> -       /* Check if first match was before length. NB: tzcnt has false data-
> -          dependency on destination. eax already had a data-dependency on esi
> -          so this should have no affect here.  */
> -       tzcntl  %eax, %esi
> -#  ifdef USE_AS_WMEMCHR
> -       leaq    (%rdi, %rsi, CHAR_SIZE), %rdi
> -#  else
> -       addq    %rsi, %rdi
> -#  endif
> +       /* Only fits in first cache line for VEC_SIZE == 32.  */
> +# if VEC_SIZE == 32
> +       .p2align 4,, 2
> +L(zero_0):
>         xorl    %eax, %eax
> -       cmpl    %esi, %edx
> -       cmovg   %rdi, %rax
>         ret
>  # endif
>
> -       .p2align 4
> -L(cross_page_boundary):
> -       /* Save pointer before aligning as its original value is
> -          necessary for computer return address if byte is found or
> -          adjusting length if it is not and this is memchr.  */
> -       movq    %rdi, %rcx
> -       /* Align data to VEC_SIZE. ALGN_PTR_REG is rcx for memchr and rdi
> -          for rawmemchr.  */
> -       andq    $-VEC_SIZE, %ALGN_PTR_REG
> -       VPCMP   $0, (%ALGN_PTR_REG), %YMMMATCH, %k0
> -       kmovd   %k0, %r8d
> +       .p2align 4,, 9
> +L(more_1x_vec):
>  # ifdef USE_AS_WMEMCHR
> -       /* NB: Divide shift count by 4 since each bit in K0 represent 4
> -          bytes.  */
> -       sarl    $2, %eax
> -# endif
> -# ifndef USE_AS_RAWMEMCHR
> -       movl    $(PAGE_SIZE / CHAR_SIZE), %esi
> -       subl    %eax, %esi
> +       /* For wmemchr we still need to test if there was a match in the
> +          first VEC.  Use bsf to test here so we can reuse
> +          L(first_vec_x0_ret).  */
> +       bsf     %VRAX, %VRAX
> +       jnz     L(first_vec_x0_ret)
>  # endif
> +
> +L(page_cross_continue):
>  # ifdef USE_AS_WMEMCHR
> -       andl    $(CHAR_PER_VEC - 1), %eax
> -# endif
> -       /* Remove the leading bytes.  */
> -       sarxl   %eax, %r8d, %eax
> -# ifndef USE_AS_RAWMEMCHR
> -       /* Check the end of data.  */
> -       cmpq    %rsi, %rdx
> -       jbe     L(first_vec_x0)
> +       /* We can't use the end of the buffer to re-calculate the length
> +          for wmemchr as len * CHAR_SIZE may overflow.  */
> +       leaq    -(VEC_SIZE + CHAR_SIZE)(%rdi), %rax
> +       andq    $(VEC_SIZE * -1), %rdi
> +       subq    %rdi, %rax
> +       sarq    $2, %rax
> +       addq    %rdx, %rax
> +# else
> +       leaq    -(VEC_SIZE + 1)(%rdx, %rdi), %rax
> +       andq    $(VEC_SIZE * -1), %rdi
> +       subq    %rdi, %rax
>  # endif
> -       testl   %eax, %eax
> -       jz      L(cross_page_continue)
> -       tzcntl  %eax, %eax
> +
> +       /* rax contains remaining length - 1.  -1 so we can get imm8
> +          encoding in a few additional places saving code size.  */
> +
> +       /* Needed regardless of remaining length.  */
> +       VPCMPEQ VEC_SIZE(%rdi), %VMATCH, %k0
> +       KMOV    %k0, %VRDX
> +
> +       /* We cannot fold the above `sub %rdi, %rax` with the `cmp
> +          $(CHAR_PER_VEC * 2), %rax` because it's possible for a very
> +          large length to overflow and cause the subtract to carry
> +          despite length being above CHAR_PER_VEC * 2.  */
> +       cmpq    $(CHAR_PER_VEC * 2 - 1), %rax
> +       ja      L(more_2x_vec)
> +L(last_2x_vec):
> +
> +       test    %VRDX, %VRDX
> +       jnz     L(first_vec_x1_check)
> +
> +       /* Check the end of data.  NB: use 8-bit operations to save code
> +          size.  We no longer need the full-width of eax and will
> +          perform a write-only operation over eax so there will be no
> +          partial-register stalls.  */
> +       subb    $(CHAR_PER_VEC * 1 - 1), %al
> +       jle     L(zero_0)
> +
> +       VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMATCH, %k0
> +       KMOV    %k0, %VRCX
>  # ifdef USE_AS_WMEMCHR
> -       /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
> -       leaq    (%RAW_PTR_REG, %rax, CHAR_SIZE), %rax
> +       /* For wmemchr we can't take advantage of tzcnt(0) == VEC_SIZE
> +          as CHAR_PER_VEC != VEC_SIZE.  */
> +       test    %VRCX, %VRCX
> +       jz      L(zero_0)
> +# endif
> +       tzcnt   %VRCX, %VRCX
> +       cmp     %cl, %al
> +
> +       /* Same CFG for VEC_SIZE == 64 and VEC_SIZE == 32.  We give
> +          fallthrough to L(zero_0) for VEC_SIZE == 64 here as there is
> +          not enough space before the next cache line to fit the `lea`
> +          for return.  */
> +# if VEC_SIZE == 64
> +       ja      L(first_vec_x2_ret)
> +L(zero_0):
> +       xorl    %eax, %eax
> +       ret
>  # else
> -       addq    %RAW_PTR_REG, %rax
> +       jbe     L(zero_0)
> +       leaq    (VEC_SIZE * 2)(%rdi, %rcx, CHAR_SIZE), %rax
> +       ret
>  # endif
> +
> +       .p2align 4,, 5
> +L(first_vec_x1_check):
> +       bsf     %VRDX, %VRDX
> +       cmpb    %dl, %al
> +       jb      L(zero_4)
> +       leaq    (VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %rax
>         ret
>
> -       .p2align 4
> -L(first_vec_x1):
> -       tzcntl  %eax, %eax
> -       leaq    VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
> +       /* Fits at the end of the cache line here for VEC_SIZE == 32.
> +        */
> +# if VEC_SIZE == 32
> +L(zero_4):
> +       xorl    %eax, %eax
>         ret
> +# endif
>
> -       .p2align 4
> +
> +       .p2align 4,, 4
>  L(first_vec_x2):
> -       tzcntl  %eax, %eax
> -       leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> +       bsf     %VRCX, %VRCX
> +L(first_vec_x2_ret):
> +       leaq    (VEC_SIZE * 2)(%rdi, %rcx, CHAR_SIZE), %rax
>         ret
>
> -       .p2align 4
> -L(first_vec_x3):
> -       tzcntl  %eax, %eax
> -       leaq    (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
> +       /* Fits at the end of the cache line here for VEC_SIZE == 64.
> +        */
> +# if VEC_SIZE == 64
> +L(zero_4):
> +       xorl    %eax, %eax
>         ret
> +# endif
>
> -       .p2align 4
> -L(first_vec_x4):
> -       tzcntl  %eax, %eax
> -       leaq    (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
> +       .p2align 4,, 4
> +L(first_vec_x1):
> +       bsf     %VRDX, %VRDX
> +       leaq    (VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %rax
>         ret
>
> -       .p2align 5
> -L(aligned_more):
> -       /* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
> -          since data is only aligned to VEC_SIZE.  */
>
> -# ifndef USE_AS_RAWMEMCHR
> -       /* Align data to VEC_SIZE.  */
> -L(cross_page_continue):
> -       xorl    %ecx, %ecx
> -       subl    %edi, %ecx
> -       andq    $-VEC_SIZE, %rdi
> -       /* esi is for adjusting length to see if near the end.  */
> -       leal    (VEC_SIZE * 5)(%rdi, %rcx), %esi
> -#  ifdef USE_AS_WMEMCHR
> -       /* NB: Divide bytes by 4 to get the wchar_t count.  */
> -       sarl    $2, %esi
> -#  endif
> -# else
> -       andq    $-VEC_SIZE, %rdi
> -L(cross_page_continue):
> -# endif
> -       /* Load first VEC regardless.  */
> -       VPCMP   $0, (VEC_SIZE)(%rdi), %YMMMATCH, %k0
> -       kmovd   %k0, %eax
> -# ifndef USE_AS_RAWMEMCHR
> -       /* Adjust length. If near end handle specially.  */
> -       subq    %rsi, %rdx
> -       jbe     L(last_4x_vec_or_less)
> -# endif
> -       testl   %eax, %eax
> +       .p2align 4,, 5
> +L(more_2x_vec):
> +       /* Length > VEC_SIZE * 2 so check first 2x VEC before rechecking
> +          length.  */
> +
> +
> +       /* Already computed matches for first VEC in rdx.  */
> +       test    %VRDX, %VRDX
>         jnz     L(first_vec_x1)
>
> -       VPCMP   $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
> -       kmovd   %k0, %eax
> -       testl   %eax, %eax
> +
> +       VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMATCH, %k0
> +       KMOV    %k0, %VRCX
> +       test    %VRCX, %VRCX
>         jnz     L(first_vec_x2)
>
> -       VPCMP   $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
> -       kmovd   %k0, %eax
> -       testl   %eax, %eax
> +       /* Needed regardless of next length check.  */
> +       VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMATCH, %k0
> +       KMOV    %k0, %VRCX
> +
> +       /* Check if we are near the end.  */
> +       cmpq    $(CHAR_PER_VEC * 4 - 1), %rax
> +       ja      L(more_4x_vec)
> +
> +       test    %VRCX, %VRCX
> +       jnz     L(first_vec_x3_check)
> +
> +       /* Use 8-bit instructions to save code size.  We won't use full-
> +          width eax again and will perform a write-only operation to
> +          eax so no worries about partial-register stalls.  */
> +       subb    $(CHAR_PER_VEC * 3), %al
> +       jb      L(zero_2)
> +L(last_vec_check):
> +       VPCMPEQ (VEC_SIZE * 4)(%rdi), %VMATCH, %k0
> +       KMOV    %k0, %VRCX
> +# ifdef USE_AS_WMEMCHR
> +       /* For wmemchr we can't take advantage of tzcnt(0) == VEC_SIZE
> +          as CHAR_PER_VEC != VEC_SIZE.  */
> +       test    %VRCX, %VRCX
> +       jz      L(zero_2)
> +# endif
> +       tzcnt   %VRCX, %VRCX
> +       cmp     %cl, %al
> +       jae     L(first_vec_x4_ret)
> +L(zero_2):
> +       xorl    %eax, %eax
> +       ret
> +
> +       /* Fits at the end of the cache line here for VEC_SIZE == 64.
> +          For VEC_SIZE == 32 we put the return label at the end of
> +          L(first_vec_x4).  */
> +# if VEC_SIZE == 64
> +L(first_vec_x4_ret):
> +       leaq    (VEC_SIZE * 4)(%rdi, %rcx, CHAR_SIZE), %rax
> +       ret
> +# endif
> +
> +       .p2align 4,, 6
> +L(first_vec_x4):
> +       bsf     %VRCX, %VRCX
> +# if VEC_SIZE == 32
> +       /* Place L(first_vec_x4_ret) here as we can't fit it in the same
> +          cache line as where it is called from so we might as well
> +          save code size by reusing return of L(first_vec_x4).  */
> +L(first_vec_x4_ret):
> +# endif
> +       leaq    (VEC_SIZE * 4)(%rdi, %rcx, CHAR_SIZE), %rax
> +       ret
> +
> +       .p2align 4,, 6
> +L(first_vec_x3_check):
> +       /* Need to adjust remaining length before checking.  */
> +       addb    $-(CHAR_PER_VEC * 2), %al
> +       bsf     %VRCX, %VRCX
> +       cmpb    %cl, %al
> +       jb      L(zero_2)
> +       leaq    (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %rax
> +       ret
> +
> +       .p2align 4,, 6
> +L(first_vec_x3):
> +       bsf     %VRCX, %VRCX
> +       leaq    (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %rax
> +       ret
> +
> +       .p2align 4,, 3
> +# if !USE_TERN_IN_LOOP
> +       .p2align 4,, 10
> +# endif
> +L(more_4x_vec):
> +       test    %VRCX, %VRCX
>         jnz     L(first_vec_x3)
>
> -       VPCMP   $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
> -       kmovd   %k0, %eax
> -       testl   %eax, %eax
> +       VPCMPEQ (VEC_SIZE * 4)(%rdi), %VMATCH, %k0
> +       KMOV    %k0, %VRCX
> +       test    %VRCX, %VRCX
>         jnz     L(first_vec_x4)
>
> +       subq    $-(VEC_SIZE * 5), %rdi
> +       subq    $(CHAR_PER_VEC * 8), %rax
> +       jb      L(last_4x_vec)
>
> -# ifndef USE_AS_RAWMEMCHR
> -       /* Check if at last CHAR_PER_VEC * 4 length.  */
> -       subq    $(CHAR_PER_VEC * 4), %rdx
> -       jbe     L(last_4x_vec_or_less_cmpeq)
> -       /* +VEC_SIZE if USE_IN_RTM otherwise +VEC_SIZE * 5.  */
> -       addq    $(VEC_SIZE + (VEC_SIZE * 4 - BASE_OFFSET)), %rdi
> -
> -       /* Align data to VEC_SIZE * 4 for the loop and readjust length.
> -        */
> -#  ifdef USE_AS_WMEMCHR
> +# ifdef USE_AS_WMEMCHR
>         movl    %edi, %ecx
> -       andq    $-(4 * VEC_SIZE), %rdi
> +# else
> +       addq    %rdi, %rax
> +# endif
> +
> +
> +# if VEC_SIZE == 64
> +       /* Use xorb to do `andq $-(VEC_SIZE * 4), %rdi`.  No EVEX
> +          processor has partial-register stalls (all have a merging
> +          uop).  If that changes, this can be removed.  */
> +       xorb    %dil, %dil
> +# else
> +       andq    $-(VEC_SIZE * 4), %rdi
> +# endif
> +
> +# ifdef USE_AS_WMEMCHR
>         subl    %edi, %ecx
> -       /* NB: Divide bytes by 4 to get the wchar_t count.  */
>         sarl    $2, %ecx
> -       addq    %rcx, %rdx
> -#  else
> -       addq    %rdi, %rdx
> -       andq    $-(4 * VEC_SIZE), %rdi
> -       subq    %rdi, %rdx
> -#  endif
> +       addq    %rcx, %rax
>  # else
> -       addq    $(VEC_SIZE + (VEC_SIZE * 4 - BASE_OFFSET)), %rdi
> -       andq    $-(4 * VEC_SIZE), %rdi
> +       subq    %rdi, %rax
>  # endif
> -# ifdef USE_IN_RTM
> -       vpxorq  %XMMZERO, %XMMZERO, %XMMZERO
> -# else
> -       /* copy ymmmatch to ymm0 so we can use vpcmpeq which is not
> -          encodable with EVEX registers (ymm16-ymm31).  */
> -       vmovdqa64 %YMMMATCH, %ymm0
> +
> +
> +
> +# if USE_TERN_IN_LOOP
> +       /* Copy VMATCH to a low ymm so we can use vpcmpeq, which is not
> +          encodable with EVEX registers.  NB: this is VEC_SIZE == 32
> +          only as there is no way to encode vpcmpeq with zmm0-15.  */
> +       vmovdqa64 %VMATCH, %VMATCH_LO
>  # endif
>
> -       /* Compare 4 * VEC at a time forward.  */
> -       .p2align 4
> +       .p2align 4,, 11
>  L(loop_4x_vec):
> -       /* Two versions of the loop. One that does not require
> -          vzeroupper by not using ymm0-ymm15 and another does that require
> -          vzeroupper because it uses ymm0-ymm15. The reason why ymm0-ymm15
> -          is used at all is because there is no EVEX encoding vpcmpeq and
> -          with vpcmpeq this loop can be performed more efficiently. The
> -          non-vzeroupper version is safe for RTM while the vzeroupper
> -          version should be prefered if RTM are not supported.  */
> -# ifdef USE_IN_RTM
> -       /* It would be possible to save some instructions using 4x VPCMP
> -          but bottleneck on port 5 makes it not woth it.  */
> -       VPCMP   $4, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k1
> -       /* xor will set bytes match esi to zero.  */
> -       vpxorq  (VEC_SIZE * 5)(%rdi), %YMMMATCH, %YMM2
> -       vpxorq  (VEC_SIZE * 6)(%rdi), %YMMMATCH, %YMM3
> -       VPCMP   $0, (VEC_SIZE * 7)(%rdi), %YMMMATCH, %k3
> -       /* Reduce VEC2 / VEC3 with min and VEC1 with zero mask.  */
> -       VPMINU  %YMM2, %YMM3, %YMM3{%k1}{z}
> -       VPCMP   $0, %YMM3, %YMMZERO, %k2
> -# else
> +       /* Two versions of the loop.  One that does not require
> +          vzeroupper by not using ymm0-15 and another that does require
> +          vzeroupper because it uses ymm0-15.  The reason ymm0-15 is
> +          used at all is that there is no EVEX encoding of vpcmpeq, and
> +          with vpcmpeq this loop can be performed more efficiently.  The
> +          non-vzeroupper version is safe for RTM while the vzeroupper
> +          version should be preferred if RTM is not supported.  Which
> +          loop version we use is determined by USE_TERN_IN_LOOP.  */
> +
> +# if USE_TERN_IN_LOOP
> +       /* Since vptern can only take 3x vectors it is fastest to do 1
> +          vec separately with EVEX vpcmp.  */
>  #  ifdef USE_AS_WMEMCHR
>         /* vptern can only accept masks for epi32/epi64 so can only save
> -          instruction using not equals mask on vptern with wmemchr.  */
> -       VPCMP   $4, (%rdi), %YMMMATCH, %k1
> +          instruction using not equals mask on vptern with wmemchr.
> +        */
> +       VPCMP   $4, (VEC_SIZE * 0)(%rdi), %VMATCH, %k1
>  #  else
> -       VPCMP   $0, (%rdi), %YMMMATCH, %k1
> +       VPCMPEQ (VEC_SIZE * 0)(%rdi), %VMATCH, %k1
>  #  endif
>         /* Compare 3x with vpcmpeq and or them all together with vptern.
>          */
> -       VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm2
> -       VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm3
> -       VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm4
> +       VPCMPEQ (VEC_SIZE * 1)(%rdi), %VMATCH_LO, %VMM_lo(2)
> +       VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMATCH_LO, %VMM_lo(3)
> +       VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMATCH_LO, %VMM_lo(4)
>  #  ifdef USE_AS_WMEMCHR
> -       /* This takes the not of or between ymm2, ymm3, ymm4 as well as
> -          combines result from VEC0 with zero mask.  */
> -       vpternlogd $1, %ymm2, %ymm3, %ymm4{%k1}{z}
> -       vpmovmskb %ymm4, %ecx
> +       /* This takes the NOT of the OR of VEC_lo(2), VEC_lo(3), and
> +          VEC_lo(4), as well as combining the result from VEC(0) with a
> +          zero mask.  */
> +       vpternlogd $1, %VMM_lo(2), %VMM_lo(3), %VMM_lo(4){%k1}{z}
> +       vpmovmskb %VMM_lo(4), %VRCX
>  #  else
> -       /* 254 is mask for oring ymm2, ymm3, ymm4 into ymm4.  */
> -       vpternlogd $254, %ymm2, %ymm3, %ymm4
> -       vpmovmskb %ymm4, %ecx
> -       kmovd   %k1, %eax
> +       /* 254 is the truth-table mask for ORing VEC_lo(2), VEC_lo(3),
> +          VEC_lo(4) into VEC_lo(4).  */
> +       vpternlogd $254, %VMM_lo(2), %VMM_lo(3), %VMM_lo(4)
> +       vpmovmskb %VMM_lo(4), %VRCX
> +       KMOV    %k1, %edx
>  #  endif
> -# endif
>
> -# ifdef USE_AS_RAWMEMCHR
> -       subq    $-(VEC_SIZE * 4), %rdi
> -# endif
> -# ifdef USE_IN_RTM
> -       kortestd %k2, %k3
>  # else
> -#  ifdef USE_AS_WMEMCHR
> -       /* ecx contains not of matches. All 1s means no matches. incl will
> -          overflow and set zeroflag if that is the case.  */
> -       incl    %ecx
> -#  else
> -       /* If either VEC1 (eax) or VEC2-VEC4 (ecx) are not zero. Adding
> -          to ecx is not an issue because if eax is non-zero it will be
> -          used for returning the match. If it is zero the add does
> -          nothing.  */
> -       addq    %rax, %rcx
> -#  endif
> +       /* Loop version that uses EVEX encoding.  */
> +       VPCMP   $4, (VEC_SIZE * 0)(%rdi), %VMATCH, %k1
> +       vpxorq  (VEC_SIZE * 1)(%rdi), %VMATCH, %VMM(2)
> +       vpxorq  (VEC_SIZE * 2)(%rdi), %VMATCH, %VMM(3)
> +       VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMATCH, %k3
> +       VPMINU  %VMM(2), %VMM(3), %VMM(3){%k1}{z}
> +       VPTESTN %VMM(3), %VMM(3), %k2
>  # endif
> -# ifdef USE_AS_RAWMEMCHR
> -       jz      L(loop_4x_vec)
> -# else
> -       jnz     L(loop_4x_vec_end)
> +
> +
> +       TEST_END ()
> +       jnz     L(loop_vec_ret)
>
>         subq    $-(VEC_SIZE * 4), %rdi
>
> -       subq    $(CHAR_PER_VEC * 4), %rdx
> -       ja      L(loop_4x_vec)
> +       subq    $(CHAR_PER_VEC * 4), %rax
> +       jae     L(loop_4x_vec)
>
> -       /* Fall through into less than 4 remaining vectors of length case.
> +       /* COND_VZEROUPPER is vzeroupper if we use the VEX encoded loop.
>          */
> -       VPCMP   $0, BASE_OFFSET(%rdi), %YMMMATCH, %k0
> -       addq    $(BASE_OFFSET - VEC_SIZE), %rdi
> -       kmovd   %k0, %eax
> -       VZEROUPPER
> -
> -L(last_4x_vec_or_less):
> -       /* Check if first VEC contained match.  */
> -       testl   %eax, %eax
> -       jnz     L(first_vec_x1_check)
> +       COND_VZEROUPPER
>
> -       /* If remaining length > CHAR_PER_VEC * 2.  */
> -       addl    $(CHAR_PER_VEC * 2), %edx
> -       jg      L(last_4x_vec)
> -
> -L(last_2x_vec):
> -       /* If remaining length < CHAR_PER_VEC.  */
> -       addl    $CHAR_PER_VEC, %edx
> -       jle     L(zero_end)
> -
> -       /* Check VEC2 and compare any match with remaining length.  */
> -       VPCMP   $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
> -       kmovd   %k0, %eax
> -       tzcntl  %eax, %eax
> -       cmpl    %eax, %edx
> -       jbe     L(set_zero_end)
> -       leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> -L(zero_end):
> -       ret
> +       .p2align 4,, 10
> +L(last_4x_vec):
> +       /* For CHAR_PER_VEC == 64 we don't need to mask as we use 8-bit
> +          instructions on eax from here on out.  */
> +# if CHAR_PER_VEC != 64
> +       andl    $(CHAR_PER_VEC * 4 - 1), %eax
> +# endif
> +       VPCMPEQ (VEC_SIZE * 0)(%rdi), %VMATCH, %k0
> +       subq    $(VEC_SIZE * 1), %rdi
> +       KMOV    %k0, %VRDX
> +       cmpb    $(CHAR_PER_VEC * 2 - 1), %al
> +       jbe     L(last_2x_vec)
> +       test    %VRDX, %VRDX
> +       jnz     L(last_vec_x1_novzero)
> +
> +       VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMATCH, %k0
> +       KMOV    %k0, %VRDX
> +       test    %VRDX, %VRDX
> +       jnz     L(last_vec_x2_novzero)
> +
> +       VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMATCH, %k0
> +       KMOV    %k0, %VRCX
> +       test    %VRCX, %VRCX
> +       jnz     L(first_vec_x3_check)
> +
> +       subb    $(CHAR_PER_VEC * 3), %al
> +       jae     L(last_vec_check)
>
> -L(set_zero_end):
>         xorl    %eax, %eax
>         ret
>
> -       .p2align 4
> -L(first_vec_x1_check):
> -       /* eax must be non-zero. Use bsfl to save code size.  */
> -       bsfl    %eax, %eax
> -       /* Adjust length.  */
> -       subl    $-(CHAR_PER_VEC * 4), %edx
> -       /* Check if match within remaining length.  */
> -       cmpl    %eax, %edx
> -       jbe     L(set_zero_end)
> -       /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
> -       leaq    VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
> +# if defined USE_AS_WMEMCHR && USE_TERN_IN_LOOP
> +L(last_vec_x2_novzero):
> +       addq    $VEC_SIZE, %rdi
> +L(last_vec_x1_novzero):
> +       bsf     %VRDX, %VRDX
> +       leaq    (VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %rax
>         ret
> +# endif
>
> -       .p2align 4
> -L(loop_4x_vec_end):
> +# if CHAR_PER_VEC == 64
> +       /* Since we can't combine the last 2x VEC when CHAR_PER_VEC ==
> +          64 it needs a separate return label.  */
> +       .p2align 4,, 4
> +L(last_vec_x2):
> +L(last_vec_x2_novzero):
> +       bsf     %VRDX, %VRDX
> +       leaq    (VEC_SIZE * 2)(%rdi, %rdx, TERN_CHAR_MULT), %rax
> +       ret
>  # endif
> -       /* rawmemchr will fall through into this if match was found in
> -          loop.  */
>
> -# if defined USE_IN_RTM || defined USE_AS_WMEMCHR
> -       /* k1 has not of matches with VEC1.  */
> -       kmovd   %k1, %eax
> -#  ifdef USE_AS_WMEMCHR
> -       subl    $((1 << CHAR_PER_VEC) - 1), %eax
> -#  else
> -       incl    %eax
> -#  endif
> +       .p2align 4,, 4
> +L(loop_vec_ret):
> +# if defined USE_AS_WMEMCHR || !USE_TERN_IN_LOOP
> +       KMOV    %k1, %VRAX
> +       inc     %MASK_GPR(rax)
>  # else
> -       /* eax already has matches for VEC1.  */
> -       testl   %eax, %eax
> +       test    %VRDX, %VRDX
>  # endif
> -       jnz     L(last_vec_x1_return)
> +       jnz     L(last_vec_x0)
>
> -# ifdef USE_IN_RTM
> -       VPCMP   $0, %YMM2, %YMMZERO, %k0
> -       kmovd   %k0, %eax
> +
> +# if USE_TERN_IN_LOOP
> +       vpmovmskb %VMM_lo(2), %VRDX
>  # else
> -       vpmovmskb %ymm2, %eax
> +       VPTESTN %VMM(2), %VMM(2), %k1
> +       KMOV    %k1, %VRDX
>  # endif
> -       testl   %eax, %eax
> -       jnz     L(last_vec_x2_return)
> +       test    %VRDX, %VRDX
> +       jnz     L(last_vec_x1)
>
> -# ifdef USE_IN_RTM
> -       kmovd   %k2, %eax
> -       testl   %eax, %eax
> -       jnz     L(last_vec_x3_return)
>
> -       kmovd   %k3, %eax
> -       tzcntl  %eax, %eax
> -       leaq    (VEC_SIZE * 3 + RET_OFFSET)(%rdi, %rax, CHAR_SIZE), %rax
> +# if USE_TERN_IN_LOOP
> +       vpmovmskb %VMM_lo(3), %VRDX
>  # else
> -       vpmovmskb %ymm3, %eax
> -       /* Combine matches in VEC3 (eax) with matches in VEC4 (ecx).  */
> -       salq    $VEC_SIZE, %rcx
> -       orq     %rcx, %rax
> -       tzcntq  %rax, %rax
> -       leaq    (VEC_SIZE * 2 + RET_OFFSET)(%rdi, %rax), %rax
> -       VZEROUPPER
> +       KMOV    %k2, %VRDX
>  # endif
> -       ret
>
> -       .p2align 4,, 10
> -L(last_vec_x1_return):
> -       tzcntl  %eax, %eax
> -# if defined USE_AS_WMEMCHR || RET_OFFSET != 0
> -       /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
> -       leaq    RET_OFFSET(%rdi, %rax, CHAR_SIZE), %rax
> +       /* No longer need any of the lo vecs (ymm0-15) so vzeroupper
> +          (only if we used the VEX-encoded loop).  */
> +       COND_VZEROUPPER
> +
> +       /* Separate logic for CHAR_PER_VEC == 64 vs the rest.  For
> +          CHAR_PER_VEC == 64 we test the last 2x VEC separately, for
> +          CHAR_PER_VEC <= 32 we can combine the results from the 2x
> +          VEC in a single GPR.  */
> +# if CHAR_PER_VEC == 64
> +#  if USE_TERN_IN_LOOP
> +#   error "Unsupported"
> +#  endif
> +
> +
> +       /* If CHAR_PER_VEC == 64 we can't combine the last two VEC.  */
> +       test    %VRDX, %VRDX
> +       jnz     L(last_vec_x2)
> +       KMOV    %k3, %VRDX
>  # else
> -       addq    %rdi, %rax
> +       /* CHAR_PER_VEC <= 32 so we can combine the results from the
> +          last 2x VEC.  */
> +
> +#  if !USE_TERN_IN_LOOP
> +       KMOV    %k3, %VRCX
> +#  endif
> +       salq    $(VEC_SIZE / TERN_CHAR_MULT), %rcx
> +       addq    %rcx, %rdx
> +#  if !defined USE_AS_WMEMCHR || !USE_TERN_IN_LOOP
> +L(last_vec_x2_novzero):
> +#  endif
>  # endif
> -       VZEROUPPER
> +       bsf     %rdx, %rdx
> +       leaq    (LAST_VEC_OFFSET)(%rdi, %rdx, TERN_CHAR_MULT), %rax
>         ret
>
> -       .p2align 4
> -L(last_vec_x2_return):
> -       tzcntl  %eax, %eax
> -       /* NB: Multiply bytes by RET_SCALE to get the wchar_t count
> -          if relevant (RET_SCALE = CHAR_SIZE if USE_AS_WMEMCHAR and
> -          USE_IN_RTM are both defined. Otherwise RET_SCALE = 1.  */
> -       leaq    (VEC_SIZE + RET_OFFSET)(%rdi, %rax, RET_SCALE), %rax
> -       VZEROUPPER
> +       .p2align 4,, 8
> +L(last_vec_x1):
> +       COND_VZEROUPPER
> +# if !defined USE_AS_WMEMCHR || !USE_TERN_IN_LOOP
> +L(last_vec_x1_novzero):
> +# endif
> +       bsf     %VRDX, %VRDX
> +       leaq    (VEC_SIZE * 1)(%rdi, %rdx, TERN_CHAR_MULT), %rax
>         ret
>
> -# ifdef USE_IN_RTM
> -       .p2align 4
> -L(last_vec_x3_return):
> -       tzcntl  %eax, %eax
> -       /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
> -       leaq    (VEC_SIZE * 2 + RET_OFFSET)(%rdi, %rax, CHAR_SIZE), %rax
> +
> +       .p2align 4,, 4
> +L(last_vec_x0):
> +       COND_VZEROUPPER
> +       bsf     %VGPR(GPR_X0), %VGPR(GPR_X0)
> +# if GPR_X0_IS_RET
> +       addq    %rdi, %rax
> +# else
> +       leaq    (%rdi, %GPR_X0, CHAR_SIZE), %rax
> +# endif
>         ret
> +
> +       .p2align 4,, 6
> +L(page_cross):
> +       /* Need to preserve eax to compute the in-bounds bytes we are
> +          checking.  */
> +# ifdef USE_AS_WMEMCHR
> +       movl    %eax, %ecx
> +# else
> +       xorl    %ecx, %ecx
> +       subl    %eax, %ecx
>  # endif
>
> -# ifndef USE_AS_RAWMEMCHR
> -       .p2align 4,, 5
> -L(last_4x_vec_or_less_cmpeq):
> -       VPCMP   $0, (VEC_SIZE * 5)(%rdi), %YMMMATCH, %k0
> -       kmovd   %k0, %eax
> -       subq    $-(VEC_SIZE * 4), %rdi
> -       /* Check first VEC regardless.  */
> -       testl   %eax, %eax
> -       jnz     L(first_vec_x1_check)
> +       xorq    %rdi, %rax
> +       VPCMPEQ (PAGE_SIZE - VEC_SIZE)(%rax), %VMATCH, %k0
> +       KMOV    %k0, %VRAX
>
> -       /* If remaining length <= CHAR_PER_VEC * 2.  */
> -       addl    $(CHAR_PER_VEC * 2), %edx
> -       jle     L(last_2x_vec)
> +# ifdef USE_AS_WMEMCHR
> +       /* NB: Divide by CHAR_SIZE to shift out out-of-bounds bytes.  */
> +       shrl    $2, %ecx
> +       andl    $(CHAR_PER_VEC - 1), %ecx
> +# endif
>
> -       .p2align 4
> -L(last_4x_vec):
> -       VPCMP   $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
> -       kmovd   %k0, %eax
> -       testl   %eax, %eax
> -       jnz     L(last_vec_x2)
>
> +       shrx    %VGPR(PC_SHIFT_GPR), %VRAX, %VRAX
>
> -       VPCMP   $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
> -       kmovd   %k0, %eax
> -       /* Create mask for possible matches within remaining length.  */
> -#  ifdef USE_AS_WMEMCHR
> -       movl    $((1 << (CHAR_PER_VEC * 2)) - 1), %ecx
> -       bzhil   %edx, %ecx, %ecx
> -#  else
> -       movq    $-1, %rcx
> -       bzhiq   %rdx, %rcx, %rcx
> -#  endif
> -       /* Test matches in data against length match.  */
> -       andl    %ecx, %eax
> -       jnz     L(last_vec_x3)
> +# ifdef USE_AS_WMEMCHR
> +       negl    %ecx
> +# endif
>
> -       /* if remaining length <= CHAR_PER_VEC * 3 (Note this is after
> -          remaining length was found to be > CHAR_PER_VEC * 2.  */
> -       subl    $CHAR_PER_VEC, %edx
> -       jbe     L(zero_end2)
> +       /* Mask the lower bits of ecx (negative eax) to get the bytes
> +          until the next VEC.  */
> +       andl    $(CHAR_PER_VEC - 1), %ecx
>
> +       /* Check if VEC is entirely contained in the remainder of the
> +          page.  */
> +       cmpq    %rcx, %rdx
> +       jbe     L(page_cross_ret)
>
> -       VPCMP   $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
> -       kmovd   %k0, %eax
> -       /* Shift remaining length mask for last VEC.  */
> -#  ifdef USE_AS_WMEMCHR
> -       shrl    $CHAR_PER_VEC, %ecx
> -#  else
> -       shrq    $CHAR_PER_VEC, %rcx
> -#  endif
> -       andl    %ecx, %eax
> -       jz      L(zero_end2)
> -       bsfl    %eax, %eax
> -       leaq    (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
> -L(zero_end2):
> -       ret
> +       /* Length crosses the page so if rax is zero (no matches)
> +          continue.  */
> +       test    %VRAX, %VRAX
> +       jz      L(page_cross_continue)
>
> -L(last_vec_x2):
> -       tzcntl  %eax, %eax
> -       leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> +       /* If rdx > rcx then any match here must be in [buf, buf + len).
> +        */
> +       tzcnt   %VRAX, %VRAX
> +# ifdef USE_AS_WMEMCHR
> +       leaq    (%rdi, %rax, CHAR_SIZE), %rax
> +# else
> +       addq    %rdi, %rax
> +# endif
>         ret
>
> -       .p2align 4
> -L(last_vec_x3):
> -       tzcntl  %eax, %eax
> -       leaq    (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
> +       .p2align 4,, 2
> +L(page_cross_zero):
> +       xorl    %eax, %eax
>         ret
> +
> +       .p2align 4,, 4
> +L(page_cross_ret):
> +       /* Search is entirely contained in page cross case.  */
> +# ifdef USE_AS_WMEMCHR
> +       test    %VRAX, %VRAX
> +       jz      L(page_cross_zero)
> +# endif
> +       tzcnt   %VRAX, %VRAX
> +       cmpl    %eax, %edx
> +       jbe     L(page_cross_zero)
> +# ifdef USE_AS_WMEMCHR
> +       leaq    (%rdi, %rax, CHAR_SIZE), %rax
> +# else
> +       addq    %rdi, %rax
>  # endif
> -       /* 7 bytes from next cache line.  */
> +       ret
>  END (MEMCHR)
>  #endif
> diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S b/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S
> index deda1ca395..2073eaa620 100644
> --- a/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S
> +++ b/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S
> @@ -1,3 +1,6 @@
> -#define MEMCHR __rawmemchr_evex_rtm
> -#define USE_AS_RAWMEMCHR 1
> -#include "memchr-evex-rtm.S"
> +#define RAWMEMCHR      __rawmemchr_evex_rtm
> +
> +#define USE_IN_RTM     1
> +#define SECTION(p)     p##.evex.rtm
> +
> +#include "rawmemchr-evex.S"
> diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex.S b/sysdeps/x86_64/multiarch/rawmemchr-evex.S
> index dc1c450699..dad54def2b 100644
> --- a/sysdeps/x86_64/multiarch/rawmemchr-evex.S
> +++ b/sysdeps/x86_64/multiarch/rawmemchr-evex.S
> @@ -1,7 +1,308 @@
> -#ifndef RAWMEMCHR
> -# define RAWMEMCHR     __rawmemchr_evex
> -#endif
> -#define USE_AS_RAWMEMCHR       1
> -#define MEMCHR RAWMEMCHR
> +/* rawmemchr optimized with 256-bit EVEX instructions.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <isa-level.h>
> +#include <sysdep.h>
> +
> +#if ISA_SHOULD_BUILD (4)
> +
> +# ifndef VEC_SIZE
> +#  include "x86-evex256-vecs.h"
> +# endif
> +
> +# ifndef RAWMEMCHR
> +#  define RAWMEMCHR    __rawmemchr_evex
> +# endif
> +
> +
> +# define PC_SHIFT_GPR  rdi
> +# define REG_WIDTH     VEC_SIZE
> +# define VPTESTN       vptestnmb
> +# define VPBROADCAST   vpbroadcastb
> +# define VPMINU        vpminub
> +# define VPCMP vpcmpb
> +# define VPCMPEQ       vpcmpeqb
> +# define CHAR_SIZE     1
> +
> +# include "reg-macros.h"
> +
> +/* If not in RTM and VEC_SIZE != 64 (the VEC_SIZE == 64 build
> +   doesn't have a VEX encoding), use VEX encoding in the loop so we
> +   can use vpcmpeqb + vptern, which is more efficient than the
> +   EVEX alternative.  */
> +# if defined USE_IN_RTM || VEC_SIZE == 64
> +#  undef COND_VZEROUPPER
> +#  undef VZEROUPPER_RETURN
> +#  undef VZEROUPPER
> +
> +
> +#  define COND_VZEROUPPER
> +#  define VZEROUPPER_RETURN    ret
> +#  define VZEROUPPER
> +
> +#  define USE_TERN_IN_LOOP     0
> +# else
> +#  define USE_TERN_IN_LOOP     1
> +#  undef VZEROUPPER
> +#  define VZEROUPPER   vzeroupper
> +# endif
> +
> +# define CHAR_PER_VEC  VEC_SIZE
> +
> +# if CHAR_PER_VEC == 64
> +
> +#  define TAIL_RETURN_LBL      first_vec_x2
> +#  define TAIL_RETURN_OFFSET   (CHAR_PER_VEC * 2)
> +
> +#  define FALLTHROUGH_RETURN_LBL       first_vec_x3
> +#  define FALLTHROUGH_RETURN_OFFSET    (CHAR_PER_VEC * 3)
> +
> +# else /* !(CHAR_PER_VEC == 64) */
> +
> +#  define TAIL_RETURN_LBL      first_vec_x3
> +#  define TAIL_RETURN_OFFSET   (CHAR_PER_VEC * 3)
> +
> +#  define FALLTHROUGH_RETURN_LBL       first_vec_x2
> +#  define FALLTHROUGH_RETURN_OFFSET    (CHAR_PER_VEC * 2)
> +# endif        /* !(CHAR_PER_VEC == 64) */
> +
> +
> +# define VMATCH        VMM(0)
> +# define VMATCH_LO     VMM_lo(0)
> +
> +# define PAGE_SIZE     4096
> +
> +       .section SECTION(.text), "ax", @progbits
> +ENTRY_P2ALIGN (RAWMEMCHR, 6)
> +       VPBROADCAST %esi, %VMATCH
> +       /* Check if we may cross page boundary with one vector load.  */
> +       movl    %edi, %eax
> +       andl    $(PAGE_SIZE - 1), %eax
> +       cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
> +       ja      L(page_cross)
> +
> +       VPCMPEQ (%rdi), %VMATCH, %k0
> +       KMOV    %k0, %VRAX
> +
> +       test    %VRAX, %VRAX
> +       jz      L(aligned_more)
> +L(first_vec_x0):
> +       bsf     %VRAX, %VRAX
> +       addq    %rdi, %rax
> +       ret
> +
> +       .p2align 4,, 4
> +L(first_vec_x4):
> +       bsf     %VRAX, %VRAX
> +       leaq    (VEC_SIZE * 4)(%rdi, %rax), %rax
> +       ret
>
> -#include "memchr-evex.S"
> +       /* For VEC_SIZE == 32 we can fit this in the aligning bytes so we
> +          might as well place it more locally.  For VEC_SIZE == 64 we
> +          reuse the return code at the end of the loop's return.  */
> +# if VEC_SIZE == 32
> +       .p2align 4,, 4
> +L(FALLTHROUGH_RETURN_LBL):
> +       bsf     %VRAX, %VRAX
> +       leaq    (FALLTHROUGH_RETURN_OFFSET)(%rdi, %rax), %rax
> +       ret
> +# endif
> +
> +       .p2align 4,, 6
> +L(page_cross):
> +       /* eax has lower page-offset bits of rdi so xor will zero them
> +          out.  */
> +       xorq    %rdi, %rax
> +       VPCMPEQ (PAGE_SIZE - VEC_SIZE)(%rax), %VMATCH, %k0
> +       KMOV    %k0, %VRAX
> +
> +       /* Shift out out-of-bounds matches.  */
> +       shrx    %VRDI, %VRAX, %VRAX
> +       test    %VRAX, %VRAX
> +       jnz     L(first_vec_x0)
> +
> +       .p2align 4,, 10
> +L(aligned_more):
> +L(page_cross_continue):
> +       /* Align pointer.  */
> +       andq    $(VEC_SIZE * -1), %rdi
> +
> +       VPCMPEQ VEC_SIZE(%rdi), %VMATCH, %k0
> +       KMOV    %k0, %VRAX
> +       test    %VRAX, %VRAX
> +       jnz     L(first_vec_x1)
> +
> +       VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMATCH, %k0
> +       KMOV    %k0, %VRAX
> +       test    %VRAX, %VRAX
> +       jnz     L(first_vec_x2)
> +
> +       VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMATCH, %k0
> +       KMOV    %k0, %VRAX
> +       test    %VRAX, %VRAX
> +       jnz     L(first_vec_x3)
> +
> +       VPCMPEQ (VEC_SIZE * 4)(%rdi), %VMATCH, %k0
> +       KMOV    %k0, %VRAX
> +       test    %VRAX, %VRAX
> +       jnz     L(first_vec_x4)
> +
> +       subq    $-(VEC_SIZE * 1), %rdi
> +# if VEC_SIZE == 64
> +       /* Saves code size.  No evex512 processor has partial-register
> +          stalls.  If that changes, this can be replaced with `andq
> +          $-(VEC_SIZE * 4), %rdi`.  */
> +       xorb    %dil, %dil
> +# else
> +       andq    $-(VEC_SIZE * 4), %rdi
> +# endif
> +
> +# if USE_TERN_IN_LOOP
> +       /* Copy VMATCH to a low ymm so we can use vpcmpeq, which is not
> +          encodable with EVEX registers.  NB: this is VEC_SIZE == 32
> +          only as there is no way to encode vpcmpeq with zmm0-15.  */
> +       vmovdqa64 %VMATCH, %VMATCH_LO
> +# endif
> +
> +       .p2align 4
> +L(loop_4x_vec):
> +       /* Two versions of the loop.  One that does not require
> +          vzeroupper by not using ymm0-15 and another that does require
> +          vzeroupper because it uses ymm0-15.  The reason ymm0-15 is
> +          used at all is that there is no EVEX encoding of vpcmpeq, and
> +          with vpcmpeq this loop can be performed more efficiently.  The
> +          non-vzeroupper version is safe for RTM while the vzeroupper
> +          version should be preferred if RTM is not supported.  Which
> +          loop version we use is determined by USE_TERN_IN_LOOP.  */
> +
> +# if USE_TERN_IN_LOOP
> +       /* Since vptern can only take 3x vectors it is fastest to do 1
> +          vec separately with EVEX vpcmp.  */
> +       VPCMPEQ (VEC_SIZE * 4)(%rdi), %VMATCH, %k1
> +       /* Compare 3x with vpcmpeq and or them all together with vptern.
> +        */
> +
> +       VPCMPEQ (VEC_SIZE * 5)(%rdi), %VMATCH_LO, %VMM_lo(2)
> +       subq    $(VEC_SIZE * -4), %rdi
> +       VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMATCH_LO, %VMM_lo(3)
> +       VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMATCH_LO, %VMM_lo(4)
> +
> +       /* 254 is the truth-table mask for ORing VEC_lo(2), VEC_lo(3),
> +          VEC_lo(4) into VEC_lo(4).  */
> +       vpternlogd $254, %VMM_lo(2), %VMM_lo(3), %VMM_lo(4)
> +       vpmovmskb %VMM_lo(4), %VRCX
> +
> +       KMOV    %k1, %eax
> +
> +       /* NB:  rax has match from first VEC and rcx has matches from
> +          VEC 2-4.  If rax is non-zero we will return that match.  If
> +          rax is zero adding won't disturb the bits in rcx.  */
> +       add     %rax, %rcx
> +# else
> +       /* Loop version that uses EVEX encoding.  */
> +       VPCMP   $4, (VEC_SIZE * 4)(%rdi), %VMATCH, %k1
> +       vpxorq  (VEC_SIZE * 5)(%rdi), %VMATCH, %VMM(2)
> +       vpxorq  (VEC_SIZE * 6)(%rdi), %VMATCH, %VMM(3)
> +       VPCMPEQ (VEC_SIZE * 7)(%rdi), %VMATCH, %k3
> +       VPMINU  %VMM(2), %VMM(3), %VMM(3){%k1}{z}
> +       VPTESTN %VMM(3), %VMM(3), %k2
> +       subq    $(VEC_SIZE * -4), %rdi
> +       KORTEST %k2, %k3
> +# endif
> +       jz      L(loop_4x_vec)
> +
> +# if USE_TERN_IN_LOOP
> +       test    %VRAX, %VRAX
> +# else
> +       KMOV    %k1, %VRAX
> +       inc     %VRAX
> +# endif
> +       jnz     L(last_vec_x0)
> +
> +
> +# if USE_TERN_IN_LOOP
> +       vpmovmskb %VMM_lo(2), %VRAX
> +# else
> +       VPTESTN %VMM(2), %VMM(2), %k1
> +       KMOV    %k1, %VRAX
> +# endif
> +       test    %VRAX, %VRAX
> +       jnz     L(last_vec_x1)
> +
> +
> +# if USE_TERN_IN_LOOP
> +       vpmovmskb %VMM_lo(3), %VRAX
> +# else
> +       KMOV    %k2, %VRAX
> +# endif
> +
> +       /* No longer need any of the lo vecs (ymm0-15) so vzeroupper
> +          (only if we used the VEX-encoded loop).  */
> +       COND_VZEROUPPER
> +
> +       /* Separate logic for VEC_SIZE == 64 and VEC_SIZE == 32 for
> +          returning the last 2x VEC.  For VEC_SIZE == 64 we test each
> +          VEC individually, for VEC_SIZE == 32 we combine them in a
> +          single 64-bit GPR.  */
> +# if CHAR_PER_VEC == 64
> +#  if USE_TERN_IN_LOOP
> +#   error "Unsupported"
> +#  endif
> +
> +
> +       /* If CHAR_PER_VEC == 64 we can't combine the last two VEC.  */
> +       test    %VRAX, %VRAX
> +       jnz     L(first_vec_x2)
> +       KMOV    %k3, %VRAX
> +L(FALLTHROUGH_RETURN_LBL):
> +# else
> +       /* CHAR_PER_VEC <= 32 so we can combine the results from the
> +          last 2x VEC.  */
> +#  if !USE_TERN_IN_LOOP
> +       KMOV    %k3, %VRCX
> +#  endif
> +       salq    $CHAR_PER_VEC, %rcx
> +       addq    %rcx, %rax
> +# endif
> +       bsf     %rax, %rax
> +       leaq    (FALLTHROUGH_RETURN_OFFSET)(%rdi, %rax), %rax
> +       ret
> +
> +       .p2align 4,, 8
> +L(TAIL_RETURN_LBL):
> +       bsf     %rax, %rax
> +       leaq    (TAIL_RETURN_OFFSET)(%rdi, %rax), %rax
> +       ret
> +
> +       .p2align 4,, 8
> +L(last_vec_x1):
> +       COND_VZEROUPPER
> +L(first_vec_x1):
> +       bsf     %VRAX, %VRAX
> +       leaq    (VEC_SIZE * 1)(%rdi, %rax), %rax
> +       ret
> +
> +       .p2align 4,, 8
> +L(last_vec_x0):
> +       COND_VZEROUPPER
> +       bsf     %VRAX, %VRAX
> +       addq    %rdi, %rax
> +       ret
> +END (RAWMEMCHR)
> +#endif
> --
> 2.34.1
>
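As a side note on the USE_TERN_IN_LOOP flavor above: a hedged
intrinsics sketch of one loop iteration (tern_loop_step() is a made-up
name; assumes AVX512BW/VL and leaves the VEX vs. EVEX encoding choice
to the compiler), showing the vpternlogd 0xfe (A | B | C) reduction
and the add-instead-of-or combine:

#include <immintrin.h>
#include <stdint.h>

/* Returns non-zero iff any of the 4x 32-byte vectors at p match the
   broadcast character in vmatch.  *k1 gets the mask for vector 0.  */
static uint64_t
tern_loop_step (const char *p, __m256i vmatch, __mmask32 *k1)
{
  const __m256i *v = (const __m256i *) p;
  /* 1x compare into a mask register (EVEX vpcmpeqb)...  */
  *k1 = _mm256_cmpeq_epi8_mask (_mm256_loadu_si256 (v), vmatch);
  /* ...and 3x compares into vector registers (VEX vpcmpeqb).  */
  __m256i m1 = _mm256_cmpeq_epi8 (_mm256_loadu_si256 (v + 1), vmatch);
  __m256i m2 = _mm256_cmpeq_epi8 (_mm256_loadu_si256 (v + 2), vmatch);
  __m256i m3 = _mm256_cmpeq_epi8 (_mm256_loadu_si256 (v + 3), vmatch);
  /* vpternlogd with truth table 0xfe (254) computes A | B | C in one
     instruction.  */
  __m256i or3 = _mm256_ternarylogic_epi32 (m1, m2, m3, 0xfe);
  /* add instead of or: if *k1 is zero the add cannot disturb the
     movemask bits; if it is non-zero we return via the k1 path.  */
  return (uint32_t) _mm256_movemask_epi8 (or3) + (uint64_t) *k1;
}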

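And the page-cross entry that both files share, as a C sketch
(vec_cmpeq_mask() is a hypothetical stand-in for VPCMPEQ + KMOV;
constants assume VEC_SIZE == 32 and 4096-byte pages):

#include <stdint.h>

#define PAGE_SIZE 4096
#define VEC_SIZE  32

/* Hypothetical: one match bit per byte of the 32 bytes at p, where p
   is known not to cross a page.  */
extern uint32_t vec_cmpeq_mask (const char *p);

static uint32_t
first_vec_mask (const char *s)
{
  uintptr_t addr = (uintptr_t) s;
  if ((addr & (PAGE_SIZE - 1)) <= PAGE_SIZE - VEC_SIZE)
    return vec_cmpeq_mask (s);	/* A full vector load cannot fault.  */
  /* Otherwise load the last aligned vector of the page (mapped if s
     is) and shift out the match bits that precede s.  The shift count
     is addr % VEC_SIZE because VEC_SIZE divides PAGE_SIZE.  */
  const char *last_vec
    = (const char *) ((addr & ~(uintptr_t) (PAGE_SIZE - 1))
		      + (PAGE_SIZE - VEC_SIZE));
  return vec_cmpeq_mask (last_vec) >> (addr % VEC_SIZE);
}
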
[-- Attachment #2: rawmemchr.txt --]
[-- Type: text/plain, Size: 9107 bytes --]

Results For: rawmemchr
alignment,char ,length ,__rawmemchr_evex ,__rawmemchr_evex_orig ,new/old
0        ,0    ,1      ,2.878            ,2.891                 ,0.996 
0        ,0    ,10     ,2.876            ,2.886                 ,0.997 
0        ,0    ,1024   ,22.832           ,23.58                 ,0.968 
0        ,0    ,11     ,2.886            ,2.887                 ,0.999 
0        ,0    ,12     ,2.864            ,2.871                 ,0.998 
0        ,0    ,128    ,5.816            ,6.014                 ,0.967 
0        ,0    ,13     ,2.854            ,2.863                 ,0.997 
0        ,0    ,14     ,2.886            ,2.865                 ,1.007 
0        ,0    ,15     ,2.863            ,2.886                 ,0.992 
0        ,0    ,16     ,2.859            ,2.857                 ,1.001 
0        ,0    ,17     ,2.848            ,2.881                 ,0.988 
0        ,0    ,18     ,2.854            ,2.865                 ,0.996 
0        ,0    ,19     ,2.878            ,2.872                 ,1.002 
0        ,0    ,2      ,2.887            ,2.9                   ,0.995 
0        ,0    ,20     ,2.857            ,2.862                 ,0.998 
0        ,0    ,21     ,2.861            ,2.86                  ,1.0   
0        ,0    ,22     ,2.854            ,2.873                 ,0.993 
0        ,0    ,23     ,2.872            ,2.861                 ,1.004 
0        ,0    ,24     ,2.853            ,2.855                 ,0.999 
0        ,0    ,25     ,2.85             ,2.853                 ,0.999 
0        ,0    ,256    ,10.355           ,10.703                ,0.968 
0        ,0    ,26     ,2.86             ,2.853                 ,1.002 
0        ,0    ,27     ,2.846            ,2.861                 ,0.995 
0        ,0    ,28     ,2.849            ,2.861                 ,0.996 
0        ,0    ,29     ,2.867            ,2.868                 ,1.0   
0        ,0    ,3      ,2.863            ,2.892                 ,0.99  
0        ,0    ,30     ,2.855            ,2.869                 ,0.995 
0        ,0    ,31     ,2.842            ,2.867                 ,0.991 
0        ,0    ,32     ,4.245            ,4.28                  ,0.992 
0        ,0    ,4      ,2.875            ,2.894                 ,0.994 
0        ,0    ,5      ,2.887            ,2.893                 ,0.998 
0        ,0    ,512    ,14.736           ,15.229                ,0.968 
0        ,0    ,6      ,2.876            ,2.868                 ,1.003 
0        ,0    ,64     ,4.957            ,4.968                 ,0.998 
0        ,0    ,7      ,2.893            ,2.88                  ,1.004 
0        ,0    ,8      ,2.856            ,2.867                 ,0.996 
0        ,0    ,9      ,2.872            ,2.885                 ,0.996 
0        ,23   ,1      ,2.826            ,2.859                 ,0.988 
0        ,23   ,10     ,2.861            ,2.876                 ,0.995 
0        ,23   ,1023   ,21.322           ,22.016                ,0.968 
0        ,23   ,1024   ,22.76            ,23.532                ,0.967 
0        ,23   ,11     ,2.872            ,2.875                 ,0.999 
0        ,23   ,12     ,2.872            ,2.881                 ,0.997 
0        ,23   ,127    ,5.293            ,5.38                  ,0.984 
0        ,23   ,1279   ,24.974           ,25.923                ,0.963 
0        ,23   ,128    ,5.904            ,5.683                 ,1.039 
0        ,23   ,1280   ,26.229           ,27.041                ,0.97  
0        ,23   ,13     ,2.878            ,2.87                  ,1.003 
0        ,23   ,14     ,2.843            ,2.87                  ,0.991 
0        ,23   ,15     ,2.864            ,2.873                 ,0.997 
0        ,23   ,1535   ,28.787           ,29.899                ,0.963 
0        ,23   ,1536   ,30.286           ,31.62                 ,0.958 
0        ,23   ,159    ,6.12             ,6.081                 ,1.006 
0        ,23   ,16     ,2.879            ,2.868                 ,1.004 
0        ,23   ,160    ,8.965            ,9.035                 ,0.992 
0        ,23   ,17     ,2.861            ,2.884                 ,0.992 
0        ,23   ,1791   ,32.274           ,33.92                 ,0.951 
0        ,23   ,1792   ,33.701           ,35.386                ,0.952 
0        ,23   ,18     ,2.861            ,2.873                 ,0.996 
0        ,23   ,19     ,2.848            ,2.865                 ,0.994 
0        ,23   ,191    ,8.858            ,9.03                  ,0.981 
0        ,23   ,192    ,9.255            ,9.801                 ,0.944 
0        ,23   ,2      ,2.889            ,2.897                 ,0.997 
0        ,23   ,20     ,2.843            ,2.846                 ,0.999 
0        ,23   ,2047   ,36.33            ,37.384                ,0.972 
0        ,23   ,2048   ,37.147           ,38.863                ,0.956 
0        ,23   ,21     ,2.855            ,2.86                  ,0.998 
0        ,23   ,22     ,2.843            ,2.846                 ,0.999 
0        ,23   ,223    ,8.993            ,9.551                 ,0.942 
0        ,23   ,224    ,9.1              ,9.656                 ,0.942 
0        ,23   ,23     ,2.847            ,2.852                 ,0.998 
0        ,23   ,24     ,2.854            ,2.854                 ,1.0   
0        ,23   ,25     ,2.863            ,2.873                 ,0.996 
0        ,23   ,255    ,9.087            ,9.693                 ,0.938 
0        ,23   ,2559   ,50.009           ,57.564                ,0.869 
0        ,23   ,256    ,10.385           ,10.78                 ,0.963 
0        ,23   ,2560   ,44.992           ,49.487                ,0.909 
0        ,23   ,26     ,2.859            ,2.86                  ,0.999 
0        ,23   ,27     ,2.856            ,2.861                 ,0.998 
0        ,23   ,28     ,2.862            ,2.853                 ,1.003 
0        ,23   ,29     ,2.853            ,2.851                 ,1.001 
0        ,23   ,3      ,2.89             ,2.917                 ,0.991 
0        ,23   ,30     ,2.871            ,2.888                 ,0.994 
0        ,23   ,3071   ,70.078           ,66.366                ,1.056 
0        ,23   ,3072   ,51.136           ,54.752                ,0.934 
0        ,23   ,31     ,2.848            ,2.857                 ,0.997 
0        ,23   ,319    ,10.808           ,11.072                ,0.976 
0        ,23   ,32     ,4.202            ,4.195                 ,1.002 
0        ,23   ,320    ,11.071           ,11.839                ,0.935 
0        ,23   ,3583   ,82.389           ,81.245                ,1.014 
0        ,23   ,3584   ,58.072           ,62.416                ,0.93  
0        ,23   ,383    ,11.152           ,11.866                ,0.94  
0        ,23   ,384    ,12.533           ,12.761                ,0.982 
0        ,23   ,4      ,2.868            ,2.892                 ,0.992 
0        ,23   ,447    ,12.916           ,13.313                ,0.97  
0        ,23   ,448    ,13.303           ,13.954                ,0.953 
0        ,23   ,5      ,2.885            ,2.875                 ,1.004 
0        ,23   ,511    ,13.28            ,13.871                ,0.957 
0        ,23   ,512    ,14.792           ,15.284                ,0.968 
0        ,23   ,6      ,2.857            ,2.87                  ,0.995 
0        ,23   ,63     ,4.277            ,4.283                 ,0.999 
0        ,23   ,639    ,15.31            ,16.14                 ,0.949 
0        ,23   ,64     ,4.961            ,4.961                 ,1.0   
0        ,23   ,640    ,16.757           ,17.581                ,0.953 
0        ,23   ,7      ,2.881            ,2.875                 ,1.002 
0        ,23   ,767    ,17.31            ,18.654                ,0.928 
0        ,23   ,768    ,19.421           ,19.879                ,0.977 
0        ,23   ,8      ,2.871            ,2.878                 ,0.998 
0        ,23   ,895    ,19.345           ,20.32                 ,0.952 
0        ,23   ,896    ,21.683           ,21.331                ,1.017 
0        ,23   ,9      ,2.904            ,2.868                 ,1.013 
0        ,23   ,95     ,4.989            ,4.945                 ,1.009 
0        ,23   ,96     ,5.382            ,5.098                 ,1.056 
1        ,0    ,64     ,4.945            ,4.953                 ,0.998 
1        ,23   ,64     ,4.998            ,4.95                  ,1.01  
2        ,0    ,64     ,4.92             ,4.939                 ,0.996 
2        ,23   ,64     ,4.95             ,4.957                 ,0.999 
3        ,0    ,64     ,4.964            ,4.954                 ,1.002 
3        ,23   ,64     ,4.943            ,4.978                 ,0.993 
4        ,0    ,64     ,4.981            ,4.968                 ,1.003 
4        ,23   ,64     ,4.949            ,4.969                 ,0.996 
5        ,0    ,64     ,4.923            ,4.932                 ,0.998 
5        ,23   ,64     ,4.931            ,4.931                 ,1.0   
6        ,0    ,64     ,4.794            ,4.799                 ,0.999 
6        ,23   ,64     ,4.803            ,4.8                   ,1.001 
Geometric mean (new/orig): 0.9859952989629946
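
The figure above is the geometric mean over the per-test new/orig ratio
column of the rawmemchr results. As a minimal sketch, it could be
recomputed from a results file laid out like these tables with something
along the lines of the helper below (an assumed standalone script, not
part of the benchtests harness; the file name is hypothetical):

#!/usr/bin/env python3
# Sketch: geometric mean of the trailing New Time / Old Time ratio
# field from comma-separated result rows like the tables above.
import math
import sys

def geomean_of_ratios(path):
    log_sum = 0.0
    n = 0
    with open(path) as f:
        for line in f:
            fields = [c.strip() for c in line.split(',')]
            if len(fields) < 3:
                continue            # skip blank/summary lines
            try:
                ratio = float(fields[-1])
            except ValueError:
                continue            # skip header lines
            log_sum += math.log(ratio)
            n += 1
    return math.exp(log_sum / n)

if __name__ == '__main__':
    # e.g.: ./geomean.py rawmemchr.txt
    print(geomean_of_ratios(sys.argv[1]))
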

[-- Attachment #3: memchr.txt --]
[-- Type: text/plain, Size: 85937 bytes --]

Results For: memchr
align,invert_pos ,len  ,pos  ,seek_char ,__memchr_evex ,__memchr_evex_orig ,new/orig
0    ,0          ,0    ,1    ,0         ,3.473         ,4.166              ,0.834 
0    ,0          ,0    ,1    ,23        ,3.505         ,4.181              ,0.838 
0    ,0          ,1    ,2    ,0         ,3.488         ,3.485              ,1.001 
0    ,0          ,1    ,2    ,23        ,3.472         ,3.469              ,1.001 
0    ,0          ,10   ,11   ,0         ,3.665         ,4.443              ,0.825 
0    ,0          ,10   ,11   ,23        ,3.485         ,3.856              ,0.904 
0    ,0          ,10   ,9    ,0         ,3.646         ,3.872              ,0.942 
0    ,0          ,10   ,9    ,23        ,3.661         ,3.771              ,0.971 
0    ,0          ,1024 ,1024 ,23        ,21.347        ,20.117             ,1.061 
0    ,0          ,1024 ,1056 ,23        ,21.66         ,20.361             ,1.064 
0    ,0          ,1024 ,1088 ,23        ,22.226        ,20.41              ,1.089 
0    ,0          ,1024 ,1120 ,23        ,21.754        ,20.29              ,1.072 
0    ,0          ,1024 ,1152 ,23        ,21.777        ,20.303             ,1.073 
0    ,0          ,1024 ,1184 ,23        ,21.532        ,20.325             ,1.059 
0    ,0          ,1024 ,1216 ,23        ,21.862        ,20.278             ,1.078 
0    ,0          ,1024 ,1248 ,23        ,21.539        ,20.218             ,1.065 
0    ,0          ,1024 ,1280 ,23        ,21.725        ,20.265             ,1.072 
0    ,0          ,1024 ,1312 ,23        ,21.756        ,20.352             ,1.069 
0    ,0          ,1024 ,1344 ,23        ,21.772        ,20.247             ,1.075 
0    ,0          ,1024 ,1376 ,23        ,21.542        ,20.363             ,1.058 
0    ,0          ,1024 ,1408 ,23        ,21.573        ,20.319             ,1.062 
0    ,0          ,1024 ,640  ,23        ,16.42         ,16.53              ,0.993 
0    ,0          ,1024 ,672  ,23        ,16.664        ,16.655             ,1.001 
0    ,0          ,1024 ,704  ,23        ,17.763        ,17.228             ,1.031 
0    ,0          ,1024 ,736  ,23        ,18.094        ,17.306             ,1.046 
0    ,0          ,1024 ,768  ,23        ,18.683        ,18.971             ,0.985 
0    ,0          ,1024 ,800  ,23        ,18.738        ,18.792             ,0.997 
0    ,0          ,1024 ,832  ,23        ,19.831        ,19.277             ,1.029 
0    ,0          ,1024 ,864  ,23        ,19.749        ,19.052             ,1.037 
0    ,0          ,1024 ,896  ,23        ,20.025        ,19.218             ,1.042 
0    ,0          ,1024 ,928  ,23        ,21.18         ,19.66              ,1.077 
0    ,0          ,1024 ,960  ,23        ,20.96         ,21.487             ,0.975 
0    ,0          ,1024 ,992  ,23        ,22.066        ,20.802             ,1.061 
0    ,0          ,1056 ,1024 ,23        ,21.801        ,20.757             ,1.05  
0    ,0          ,1088 ,1024 ,23        ,21.457        ,20.95              ,1.024 
0    ,0          ,11   ,10   ,0         ,3.617         ,3.812              ,0.949 
0    ,0          ,11   ,10   ,23        ,3.701         ,3.848              ,0.962 
0    ,0          ,11   ,12   ,0         ,3.482         ,3.759              ,0.926 
0    ,0          ,11   ,12   ,23        ,3.513         ,3.78               ,0.929 
0    ,0          ,112  ,16   ,23        ,3.56          ,3.807              ,0.935 
0    ,0          ,1120 ,1024 ,23        ,21.753        ,20.777             ,1.047 
0    ,0          ,1152 ,1024 ,23        ,21.724        ,20.948             ,1.037 
0    ,0          ,1184 ,1024 ,23        ,22.588        ,22.291             ,1.013 
0    ,0          ,12   ,11   ,0         ,3.588         ,3.76               ,0.954 
0    ,0          ,12   ,11   ,23        ,3.737         ,3.853              ,0.97  
0    ,0          ,12   ,13   ,0         ,3.504         ,3.843              ,0.912 
0    ,0          ,12   ,13   ,23        ,3.498         ,3.807              ,0.919 
0    ,0          ,1216 ,1024 ,23        ,22.525        ,22.172             ,1.016 
0    ,0          ,1248 ,1024 ,23        ,22.882        ,22.391             ,1.022 
0    ,0          ,128  ,128  ,23        ,5.46          ,6.528              ,0.836 
0    ,0          ,128  ,160  ,23        ,5.622         ,6.848              ,0.821 
0    ,0          ,128  ,192  ,23        ,5.653         ,6.872              ,0.823 
0    ,0          ,128  ,224  ,23        ,6.018         ,7.722              ,0.779 
0    ,0          ,128  ,256  ,23        ,5.693         ,6.915              ,0.823 
0    ,0          ,128  ,288  ,23        ,5.669         ,7.024              ,0.807 
0    ,0          ,128  ,32   ,23        ,4.641         ,5.73               ,0.81  
0    ,0          ,128  ,320  ,23        ,5.588         ,6.872              ,0.813 
0    ,0          ,128  ,352  ,23        ,5.571         ,6.87               ,0.811 
0    ,0          ,128  ,384  ,23        ,5.61          ,6.913              ,0.811 
0    ,0          ,128  ,416  ,23        ,5.545         ,6.835              ,0.811 
0    ,0          ,128  ,448  ,23        ,5.586         ,6.908              ,0.809 
0    ,0          ,128  ,480  ,23        ,5.59          ,6.674              ,0.837 
0    ,0          ,128  ,512  ,23        ,5.58          ,6.76               ,0.825 
0    ,0          ,128  ,64   ,23        ,5.036         ,6.123              ,0.823 
0    ,0          ,128  ,96   ,23        ,6.141         ,6.397              ,0.96  
0    ,0          ,1280 ,1024 ,23        ,22.328        ,22.221             ,1.005 
0    ,0          ,13   ,12   ,0         ,3.551         ,3.81               ,0.932 
0    ,0          ,13   ,12   ,23        ,3.644         ,3.956              ,0.921 
0    ,0          ,13   ,14   ,0         ,3.498         ,3.775              ,0.926 
0    ,0          ,13   ,14   ,23        ,3.489         ,3.785              ,0.922 
0    ,0          ,1312 ,1024 ,23        ,22.724        ,22.229             ,1.022 
0    ,0          ,1344 ,1024 ,23        ,22.405        ,22.205             ,1.009 
0    ,0          ,1376 ,1024 ,23        ,22.286        ,22.346             ,0.997 
0    ,0          ,14   ,13   ,0         ,3.548         ,3.805              ,0.932 
0    ,0          ,14   ,13   ,23        ,3.612         ,3.9                ,0.926 
0    ,0          ,14   ,15   ,0         ,3.491         ,3.771              ,0.926 
0    ,0          ,14   ,15   ,23        ,3.507         ,3.819              ,0.918 
0    ,0          ,1408 ,1024 ,23        ,22.468        ,22.266             ,1.009 
0    ,0          ,144  ,16   ,23        ,3.633         ,3.828              ,0.949 
0    ,0          ,15   ,14   ,0         ,3.642         ,3.863              ,0.943 
0    ,0          ,15   ,14   ,23        ,3.69          ,3.832              ,0.963 
0    ,0          ,15   ,16   ,0         ,3.501         ,3.894              ,0.899 
0    ,0          ,15   ,16   ,23        ,3.611         ,3.839              ,0.941 
0    ,0          ,16   ,112  ,23        ,3.497         ,3.909              ,0.895 
0    ,0          ,16   ,144  ,23        ,3.501         ,3.925              ,0.892 
0    ,0          ,16   ,15   ,0         ,3.658         ,3.857              ,0.948 
0    ,0          ,16   ,15   ,23        ,3.87          ,3.787              ,1.022 
0    ,0          ,16   ,16   ,23        ,3.425         ,3.711              ,0.923 
0    ,0          ,16   ,17   ,0         ,3.5           ,3.848              ,0.909 
0    ,0          ,16   ,17   ,23        ,3.494         ,3.82               ,0.914 
0    ,0          ,16   ,176  ,23        ,3.476         ,3.88               ,0.896 
0    ,0          ,16   ,208  ,23        ,3.464         ,3.799              ,0.912 
0    ,0          ,16   ,240  ,23        ,3.468         ,3.85               ,0.901 
0    ,0          ,16   ,272  ,23        ,3.516         ,3.848              ,0.914 
0    ,0          ,16   ,304  ,23        ,3.497         ,3.869              ,0.904 
0    ,0          ,16   ,336  ,23        ,3.491         ,3.822              ,0.913 
0    ,0          ,16   ,368  ,23        ,3.484         ,3.798              ,0.917 
0    ,0          ,16   ,400  ,23        ,3.493         ,3.877              ,0.901 
0    ,0          ,16   ,48   ,23        ,3.48          ,3.823              ,0.91  
0    ,0          ,16   ,80   ,23        ,3.497         ,3.868              ,0.904 
0    ,0          ,160  ,128  ,23        ,6.651         ,7.158              ,0.929 
0    ,0          ,160  ,256  ,23        ,6.136         ,7.605              ,0.807 
0    ,0          ,160  ,32   ,23        ,4.882         ,5.71               ,0.855 
0    ,0          ,160  ,512  ,23        ,6.102         ,6.676              ,0.914 
0    ,0          ,160  ,64   ,23        ,5.311         ,6.122              ,0.867 
0    ,0          ,1664 ,2048 ,23        ,31.73         ,29.774             ,1.066 
0    ,0          ,1696 ,2048 ,23        ,31.282        ,29.567             ,1.058 
0    ,0          ,17   ,16   ,0         ,3.66          ,3.868              ,0.946 
0    ,0          ,17   ,16   ,23        ,3.803         ,3.855              ,0.986 
0    ,0          ,17   ,18   ,0         ,3.477         ,3.893              ,0.893 
0    ,0          ,17   ,18   ,23        ,3.475         ,3.809              ,0.912 
0    ,0          ,1728 ,2048 ,23        ,32.093        ,30.336             ,1.058 
0    ,0          ,176  ,16   ,23        ,3.665         ,3.884              ,0.944 
0    ,0          ,1760 ,2048 ,23        ,32.968        ,30.894             ,1.067 
0    ,0          ,1792 ,2048 ,23        ,33.445        ,31.817             ,1.051 
0    ,0          ,18   ,17   ,0         ,3.701         ,3.785              ,0.978 
0    ,0          ,18   ,17   ,23        ,3.743         ,3.833              ,0.977 
0    ,0          ,18   ,19   ,0         ,3.478         ,3.837              ,0.907 
0    ,0          ,18   ,19   ,23        ,3.463         ,3.868              ,0.895 
0    ,0          ,1824 ,2048 ,23        ,33.291        ,31.768             ,1.048 
0    ,0          ,1856 ,2048 ,23        ,33.922        ,32.431             ,1.046 
0    ,0          ,1888 ,2048 ,23        ,35.392        ,33.135             ,1.068 
0    ,0          ,19   ,18   ,0         ,3.616         ,3.791              ,0.954 
0    ,0          ,19   ,18   ,23        ,3.813         ,3.807              ,1.002 
0    ,0          ,19   ,20   ,0         ,3.465         ,3.795              ,0.913 
0    ,0          ,19   ,20   ,23        ,3.458         ,3.811              ,0.907 
0    ,0          ,192  ,128  ,23        ,6.158         ,6.144              ,1.002 
0    ,0          ,192  ,256  ,23        ,7.663         ,7.608              ,1.007 
0    ,0          ,192  ,32   ,23        ,4.818         ,5.133              ,0.939 
0    ,0          ,192  ,512  ,23        ,7.465         ,7.249              ,1.03  
0    ,0          ,192  ,64   ,23        ,5.125         ,5.188              ,0.988 
0    ,0          ,1920 ,2048 ,23        ,35.59         ,33.388             ,1.066 
0    ,0          ,1952 ,2048 ,23        ,35.15         ,33.167             ,1.06  
0    ,0          ,1984 ,2048 ,23        ,35.715        ,33.95              ,1.052 
0    ,0          ,2    ,1    ,0         ,3.496         ,3.642              ,0.96  
0    ,0          ,2    ,1    ,23        ,3.466         ,3.444              ,1.007 
0    ,0          ,2    ,3    ,0         ,3.501         ,3.677              ,0.952 
0    ,0          ,2    ,3    ,23        ,3.553         ,3.604              ,0.986 
0    ,0          ,20   ,19   ,0         ,3.573         ,3.804              ,0.939 
0    ,0          ,20   ,19   ,23        ,3.815         ,3.834              ,0.995 
0    ,0          ,20   ,21   ,0         ,3.481         ,3.778              ,0.921 
0    ,0          ,20   ,21   ,23        ,3.481         ,3.833              ,0.908 
0    ,0          ,2016 ,2048 ,23        ,36.429        ,34.281             ,1.063 
0    ,0          ,2048 ,1024 ,0         ,23.047        ,22.507             ,1.024 
0    ,0          ,2048 ,1024 ,23        ,22.719        ,22.414             ,1.014 
0    ,0          ,2048 ,128  ,0         ,6.151         ,6.026              ,1.021 
0    ,0          ,2048 ,128  ,23        ,6.186         ,6.083              ,1.017 
0    ,0          ,2048 ,1664 ,23        ,32.613        ,31.399             ,1.039 
0    ,0          ,2048 ,1696 ,23        ,32.519        ,31.396             ,1.036 
0    ,0          ,2048 ,1728 ,23        ,34.272        ,32.097             ,1.068 
0    ,0          ,2048 ,1760 ,23        ,33.56         ,32.092             ,1.046 
0    ,0          ,2048 ,1792 ,23        ,34.325        ,35.3               ,0.972 
0    ,0          ,2048 ,1824 ,23        ,34.551        ,33.401             ,1.034 
0    ,0          ,2048 ,1856 ,23        ,35.717        ,34.195             ,1.044 
0    ,0          ,2048 ,1888 ,23        ,35.653        ,34.074             ,1.046 
0    ,0          ,2048 ,1920 ,23        ,35.127        ,33.787             ,1.04  
0    ,0          ,2048 ,1952 ,23        ,37.31         ,33.955             ,1.099 
0    ,0          ,2048 ,1984 ,23        ,36.119        ,36.15              ,0.999 
0    ,0          ,2048 ,2016 ,23        ,37.774        ,35.764             ,1.056 
0    ,0          ,2048 ,2048 ,0         ,37.794        ,35.197             ,1.074 
0    ,0          ,2048 ,2048 ,23        ,37.135        ,34.502             ,1.076 
0    ,0          ,2048 ,2080 ,23        ,37.593        ,34.836             ,1.079 
0    ,0          ,2048 ,2112 ,23        ,37.494        ,34.934             ,1.073 
0    ,0          ,2048 ,2144 ,23        ,37.47         ,35.042             ,1.069 
0    ,0          ,2048 ,2176 ,23        ,37.51         ,34.77              ,1.079 
0    ,0          ,2048 ,2208 ,23        ,37.512        ,34.873             ,1.076 
0    ,0          ,2048 ,2240 ,23        ,37.81         ,35.223             ,1.073 
0    ,0          ,2048 ,2272 ,23        ,37.648        ,34.795             ,1.082 
0    ,0          ,2048 ,2304 ,23        ,37.628        ,34.938             ,1.077 
0    ,0          ,2048 ,2336 ,23        ,37.607        ,34.815             ,1.08  
0    ,0          ,2048 ,2368 ,23        ,37.661        ,34.828             ,1.081 
0    ,0          ,2048 ,2400 ,23        ,37.711        ,34.934             ,1.08  
0    ,0          ,2048 ,2432 ,23        ,37.428        ,34.937             ,1.071 
0    ,0          ,2048 ,256  ,0         ,10.418        ,10.646             ,0.979 
0    ,0          ,2048 ,256  ,23        ,10.448        ,10.688             ,0.978 
0    ,0          ,2048 ,32   ,0         ,4.639         ,5.259              ,0.882 
0    ,0          ,2048 ,32   ,23        ,4.822         ,5.232              ,0.922 
0    ,0          ,2048 ,512  ,0         ,14.497        ,14.909             ,0.972 
0    ,0          ,2048 ,512  ,23        ,14.652        ,14.994             ,0.977 
0    ,0          ,2048 ,64   ,0         ,5.159         ,5.176              ,0.997 
0    ,0          ,2048 ,64   ,23        ,5.135         ,5.157              ,0.996 
0    ,0          ,208  ,16   ,23        ,3.6           ,3.935              ,0.915 
0    ,0          ,2080 ,2048 ,23        ,37.366        ,35.59              ,1.05  
0    ,0          ,21   ,20   ,0         ,3.618         ,3.93               ,0.921 
0    ,0          ,21   ,20   ,23        ,3.826         ,3.756              ,1.019 
0    ,0          ,21   ,22   ,0         ,3.456         ,3.754              ,0.92  
0    ,0          ,21   ,22   ,23        ,3.421         ,3.825              ,0.895 
0    ,0          ,2112 ,2048 ,23        ,37.713        ,35.722             ,1.056 
0    ,0          ,2144 ,2048 ,23        ,37.058        ,35.878             ,1.033 
0    ,0          ,2176 ,2048 ,23        ,37.001        ,35.798             ,1.034 
0    ,0          ,22   ,21   ,0         ,3.53          ,3.708              ,0.952 
0    ,0          ,22   ,21   ,23        ,3.705         ,3.821              ,0.97  
0    ,0          ,22   ,23   ,0         ,3.385         ,3.744              ,0.904 
0    ,0          ,22   ,23   ,23        ,3.6           ,4.397              ,0.819 
0    ,0          ,2208 ,2048 ,23        ,37.641        ,37.406             ,1.006 
0    ,0          ,224  ,128  ,23        ,6.174         ,6.209              ,0.994 
0    ,0          ,224  ,256  ,23        ,8.043         ,8.168              ,0.985 
0    ,0          ,224  ,32   ,23        ,5.2           ,5.013              ,1.037 
0    ,0          ,224  ,512  ,23        ,7.923         ,7.845              ,1.01  
0    ,0          ,224  ,64   ,23        ,5.059         ,5.266              ,0.961 
0    ,0          ,2240 ,2048 ,23        ,38.457        ,37.305             ,1.031 
0    ,0          ,2272 ,2048 ,23        ,38.433        ,37.216             ,1.033 
0    ,0          ,23   ,22   ,0         ,3.593         ,3.725              ,0.964 
0    ,0          ,23   ,22   ,23        ,3.689         ,3.827              ,0.964 
0    ,0          ,23   ,24   ,0         ,3.422         ,3.765              ,0.909 
0    ,0          ,23   ,24   ,23        ,3.445         ,3.745              ,0.92  
0    ,0          ,2304 ,2048 ,23        ,37.974        ,37.383             ,1.016 
0    ,0          ,2336 ,2048 ,23        ,38.69         ,37.569             ,1.03  
0    ,0          ,2368 ,2048 ,23        ,38.716        ,37.644             ,1.028 
0    ,0          ,24   ,23   ,0         ,3.549         ,3.806              ,0.932 
0    ,0          ,24   ,23   ,23        ,3.738         ,3.762              ,0.994 
0    ,0          ,24   ,25   ,0         ,3.342         ,3.681              ,0.908 
0    ,0          ,24   ,25   ,23        ,3.341         ,3.823              ,0.874 
0    ,0          ,240  ,16   ,23        ,3.642         ,3.859              ,0.944 
0    ,0          ,2400 ,2048 ,23        ,38.162        ,37.283             ,1.024 
0    ,0          ,2432 ,2048 ,23        ,38.212        ,37.582             ,1.017 
0    ,0          ,25   ,24   ,0         ,3.61          ,3.795              ,0.951 
0    ,0          ,25   ,24   ,23        ,3.695         ,3.769              ,0.98  
0    ,0          ,25   ,26   ,0         ,3.351         ,3.7                ,0.906 
0    ,0          ,25   ,26   ,23        ,3.322         ,3.734              ,0.89  
0    ,0          ,256  ,128  ,23        ,6.204         ,6.079              ,1.02  
0    ,0          ,256  ,160  ,23        ,7.927         ,7.624              ,1.04  
0    ,0          ,256  ,192  ,23        ,7.865         ,7.782              ,1.011 
0    ,0          ,256  ,224  ,23        ,8.83          ,8.766              ,1.007 
0    ,0          ,256  ,256  ,23        ,8.367         ,8.437              ,0.992 
0    ,0          ,256  ,288  ,23        ,8.523         ,8.537              ,0.998 
0    ,0          ,256  ,32   ,23        ,5.07          ,5.007              ,1.013 
0    ,0          ,256  ,320  ,23        ,8.523         ,8.604              ,0.991 
0    ,0          ,256  ,352  ,23        ,8.611         ,8.629              ,0.998 
0    ,0          ,256  ,384  ,23        ,8.541         ,8.495              ,1.005 
0    ,0          ,256  ,416  ,23        ,8.723         ,8.63               ,1.011 
0    ,0          ,256  ,448  ,23        ,8.598         ,8.623              ,0.997 
0    ,0          ,256  ,480  ,23        ,8.498         ,8.622              ,0.986 
0    ,0          ,256  ,512  ,23        ,8.532         ,8.632              ,0.988 
0    ,0          ,256  ,544  ,23        ,9.267         ,8.599              ,1.078 
0    ,0          ,256  ,576  ,23        ,9.163         ,8.699              ,1.053 
0    ,0          ,256  ,608  ,23        ,9.201         ,8.691              ,1.059 
0    ,0          ,256  ,64   ,23        ,5.013         ,5.26               ,0.953 
0    ,0          ,256  ,640  ,23        ,8.489         ,8.643              ,0.982 
0    ,0          ,256  ,96   ,23        ,6.429         ,5.756              ,1.117 
0    ,0          ,26   ,25   ,0         ,3.485         ,3.71               ,0.939 
0    ,0          ,26   ,25   ,23        ,3.535         ,3.742              ,0.945 
0    ,0          ,26   ,27   ,0         ,3.351         ,3.728              ,0.899 
0    ,0          ,26   ,27   ,23        ,3.344         ,3.826              ,0.874 
0    ,0          ,27   ,26   ,0         ,3.462         ,3.683              ,0.94  
0    ,0          ,27   ,26   ,23        ,3.602         ,3.81               ,0.945 
0    ,0          ,27   ,28   ,0         ,3.326         ,3.716              ,0.895 
0    ,0          ,27   ,28   ,23        ,3.313         ,3.698              ,0.896 
0    ,0          ,272  ,16   ,23        ,3.603         ,3.867              ,0.932 
0    ,0          ,28   ,27   ,0         ,3.445         ,3.714              ,0.927 
0    ,0          ,28   ,27   ,23        ,3.553         ,3.789              ,0.938 
0    ,0          ,28   ,29   ,0         ,3.287         ,3.739              ,0.879 
0    ,0          ,28   ,29   ,23        ,3.286         ,3.753              ,0.875 
0    ,0          ,288  ,128  ,23        ,6.189         ,6.001              ,1.031 
0    ,0          ,288  ,256  ,23        ,9.392         ,9.63               ,0.975 
0    ,0          ,288  ,32   ,23        ,5.028         ,5.029              ,1.0   
0    ,0          ,288  ,512  ,23        ,9.082         ,9.382              ,0.968 
0    ,0          ,288  ,64   ,23        ,5.107         ,5.276              ,0.968 
0    ,0          ,29   ,28   ,0         ,3.467         ,3.703              ,0.936 
0    ,0          ,29   ,28   ,23        ,3.643         ,3.785              ,0.962 
0    ,0          ,29   ,30   ,0         ,3.279         ,3.69               ,0.889 
0    ,0          ,29   ,30   ,23        ,3.263         ,3.705              ,0.881 
0    ,0          ,3    ,2    ,0         ,3.483         ,3.75               ,0.929 
0    ,0          ,3    ,2    ,23        ,3.549         ,3.791              ,0.936 
0    ,0          ,3    ,4    ,0         ,3.499         ,3.615              ,0.968 
0    ,0          ,3    ,4    ,23        ,3.492         ,3.616              ,0.966 
0    ,0          ,30   ,29   ,0         ,3.455         ,3.746              ,0.922 
0    ,0          ,30   ,29   ,23        ,3.643         ,3.797              ,0.959 
0    ,0          ,30   ,31   ,0         ,3.309         ,3.704              ,0.893 
0    ,0          ,30   ,31   ,23        ,3.302         ,3.801              ,0.869 
0    ,0          ,304  ,16   ,23        ,3.571         ,3.965              ,0.901 
0    ,0          ,31   ,30   ,0         ,3.428         ,3.748              ,0.915 
0    ,0          ,31   ,30   ,23        ,3.511         ,3.755              ,0.935 
0    ,0          ,32   ,128  ,23        ,3.28          ,3.702              ,0.886 
0    ,0          ,32   ,160  ,23        ,3.308         ,3.702              ,0.894 
0    ,0          ,32   ,192  ,23        ,3.296         ,3.756              ,0.878 
0    ,0          ,32   ,224  ,23        ,3.31          ,3.707              ,0.893 
0    ,0          ,32   ,256  ,23        ,3.314         ,3.715              ,0.892 
0    ,0          ,32   ,288  ,23        ,3.324         ,3.737              ,0.889 
0    ,0          ,32   ,31   ,0         ,3.458         ,3.752              ,0.922 
0    ,0          ,32   ,31   ,23        ,3.456         ,3.7                ,0.934 
0    ,0          ,32   ,32   ,23        ,3.23          ,3.643              ,0.887 
0    ,0          ,32   ,320  ,23        ,3.334         ,3.673              ,0.908 
0    ,0          ,32   ,352  ,23        ,3.324         ,3.728              ,0.892 
0    ,0          ,32   ,384  ,23        ,3.311         ,3.713              ,0.892 
0    ,0          ,32   ,416  ,23        ,3.34          ,3.676              ,0.908 
0    ,0          ,32   ,64   ,23        ,3.285         ,3.673              ,0.895 
0    ,0          ,32   ,96   ,23        ,3.3           ,3.67               ,0.899 
0    ,0          ,320  ,128  ,23        ,6.128         ,5.986              ,1.024 
0    ,0          ,320  ,256  ,23        ,10.255        ,9.859              ,1.04  
0    ,0          ,320  ,32   ,23        ,5.226         ,5.063              ,1.032 
0    ,0          ,320  ,512  ,23        ,10.38         ,10.25              ,1.013 
0    ,0          ,320  ,64   ,23        ,5.062         ,5.193              ,0.975 
0    ,0          ,336  ,16   ,23        ,3.592         ,3.963              ,0.906 
0    ,0          ,352  ,128  ,23        ,6.197         ,6.048              ,1.025 
0    ,0          ,352  ,256  ,23        ,10.583        ,10.571             ,1.001 
0    ,0          ,352  ,32   ,23        ,5.248         ,5.028              ,1.044 
0    ,0          ,352  ,512  ,23        ,10.823        ,10.873             ,0.995 
0    ,0          ,352  ,64   ,23        ,5.071         ,5.202              ,0.975 
0    ,0          ,368  ,16   ,23        ,3.556         ,3.857              ,0.922 
0    ,0          ,3712 ,4096 ,23        ,63.78         ,69.22              ,0.921 
0    ,0          ,3744 ,4096 ,23        ,63.149        ,70.832             ,0.892 
0    ,0          ,3776 ,4096 ,23        ,63.619        ,70.826             ,0.898 
0    ,0          ,3808 ,4096 ,23        ,64.318        ,71.604             ,0.898 
0    ,0          ,384  ,128  ,23        ,6.161         ,6.105              ,1.009 
0    ,0          ,384  ,256  ,23        ,9.792         ,9.752              ,1.004 
0    ,0          ,384  ,32   ,23        ,5.498         ,5.014              ,1.097 
0    ,0          ,384  ,512  ,23        ,11.584        ,11.573             ,1.001 
0    ,0          ,384  ,64   ,23        ,4.951         ,5.261              ,0.941 
0    ,0          ,3840 ,4096 ,23        ,65.775        ,70.85              ,0.928 
0    ,0          ,3872 ,4096 ,23        ,66.258        ,72.207             ,0.918 
0    ,0          ,3904 ,4096 ,23        ,66.891        ,72.083             ,0.928 
0    ,0          ,3936 ,4096 ,23        ,66.326        ,73.547             ,0.902 
0    ,0          ,3968 ,4096 ,23        ,67.857        ,73.444             ,0.924 
0    ,0          ,4    ,3    ,0         ,3.591         ,3.785              ,0.949 
0    ,0          ,4    ,3    ,23        ,3.589         ,3.813              ,0.941 
0    ,0          ,4    ,5    ,0         ,3.486         ,3.514              ,0.992 
0    ,0          ,4    ,5    ,23        ,3.483         ,3.58               ,0.973 
0    ,0          ,400  ,16   ,23        ,3.575         ,3.88               ,0.921 
0    ,0          ,4000 ,4096 ,23        ,67.682        ,74.733             ,0.906 
0    ,0          ,4032 ,4096 ,23        ,67.609        ,76.891             ,0.879 
0    ,0          ,4064 ,4096 ,23        ,68.659        ,76.556             ,0.897 
0    ,0          ,4096 ,3712 ,23        ,64.615        ,88.387             ,0.731 
0    ,0          ,4096 ,3744 ,23        ,64.921        ,87.941             ,0.738 
0    ,0          ,4096 ,3776 ,23        ,65.276        ,87.668             ,0.745 
0    ,0          ,4096 ,3808 ,23        ,66.016        ,88.603             ,0.745 
0    ,0          ,4096 ,3840 ,23        ,70.403        ,91.997             ,0.765 
0    ,0          ,4096 ,3872 ,23        ,67.055        ,87.431             ,0.767 
0    ,0          ,4096 ,3904 ,23        ,68.023        ,89.039             ,0.764 
0    ,0          ,4096 ,3936 ,23        ,67.631        ,89.265             ,0.758 
0    ,0          ,4096 ,3968 ,23        ,68.641        ,74.007             ,0.927 
0    ,0          ,4096 ,4000 ,23        ,72.133        ,78.95              ,0.914 
0    ,0          ,4096 ,4032 ,23        ,69.08         ,77.393             ,0.893 
0    ,0          ,4096 ,4064 ,23        ,70.372        ,77.075             ,0.913 
0    ,0          ,4096 ,4096 ,23        ,69.437        ,75.123             ,0.924 
0    ,0          ,4096 ,4128 ,23        ,70.462        ,75.608             ,0.932 
0    ,0          ,4096 ,4160 ,23        ,69.956        ,75.867             ,0.922 
0    ,0          ,4096 ,4192 ,23        ,69.843        ,75.901             ,0.92  
0    ,0          ,4096 ,4224 ,23        ,70.844        ,76.334             ,0.928 
0    ,0          ,4096 ,4256 ,23        ,69.573        ,75.887             ,0.917 
0    ,0          ,4096 ,4288 ,23        ,70.359        ,76.0               ,0.926 
0    ,0          ,4096 ,4320 ,23        ,71.167        ,75.91              ,0.938 
0    ,0          ,4096 ,4352 ,23        ,69.839        ,75.444             ,0.926 
0    ,0          ,4096 ,4384 ,23        ,69.719        ,75.942             ,0.918 
0    ,0          ,4096 ,4416 ,23        ,69.554        ,75.796             ,0.918 
0    ,0          ,4096 ,4448 ,23        ,69.115        ,75.496             ,0.915 
0    ,0          ,4096 ,4480 ,23        ,70.861        ,75.695             ,0.936 
0    ,0          ,4128 ,4096 ,23        ,69.667        ,77.45              ,0.9   
0    ,0          ,416  ,128  ,23        ,6.163         ,6.065              ,1.016 
0    ,0          ,416  ,256  ,23        ,11.565        ,10.811             ,1.07  
0    ,0          ,416  ,32   ,23        ,5.391         ,5.133              ,1.05  
0    ,0          ,416  ,512  ,23        ,11.685        ,10.918             ,1.07  
0    ,0          ,416  ,64   ,23        ,4.987         ,5.125              ,0.973 
0    ,0          ,4160 ,4096 ,23        ,69.348        ,76.459             ,0.907 
0    ,0          ,4192 ,4096 ,23        ,70.619        ,76.057             ,0.929 
0    ,0          ,4224 ,4096 ,23        ,68.959        ,76.303             ,0.904 
0    ,0          ,4256 ,4096 ,23        ,75.085        ,96.41              ,0.779 
0    ,0          ,4288 ,4096 ,23        ,69.921        ,92.693             ,0.754 
0    ,0          ,4320 ,4096 ,23        ,72.347        ,96.461             ,0.75  
0    ,0          ,4352 ,4096 ,23        ,72.83         ,98.647             ,0.738 
0    ,0          ,4384 ,4096 ,23        ,70.59         ,95.961             ,0.736 
0    ,0          ,4416 ,4096 ,23        ,71.088        ,95.826             ,0.742 
0    ,0          ,4448 ,4096 ,23        ,71.876        ,96.575             ,0.744 
0    ,0          ,448  ,128  ,23        ,6.128         ,6.058              ,1.012 
0    ,0          ,448  ,256  ,23        ,10.492        ,10.524             ,0.997 
0    ,0          ,448  ,512  ,23        ,12.444        ,11.774             ,1.057 
0    ,0          ,448  ,64   ,23        ,4.977         ,5.204              ,0.956 
0    ,0          ,4480 ,4096 ,23        ,70.467        ,95.694             ,0.736 
0    ,0          ,48   ,16   ,23        ,3.472         ,3.889              ,0.893 
0    ,0          ,480  ,128  ,23        ,6.185         ,6.002              ,1.031 
0    ,0          ,480  ,256  ,23        ,10.382        ,10.477             ,0.991 
0    ,0          ,480  ,512  ,23        ,12.402        ,12.486             ,0.993 
0    ,0          ,5    ,4    ,0         ,3.578         ,3.777              ,0.947 
0    ,0          ,5    ,4    ,23        ,3.521         ,3.788              ,0.929 
0    ,0          ,5    ,6    ,0         ,3.489         ,3.712              ,0.94  
0    ,0          ,5    ,6    ,23        ,3.476         ,3.727              ,0.933 
0    ,0          ,512  ,128  ,23        ,6.127         ,6.091              ,1.006 
0    ,0          ,512  ,160  ,23        ,9.055         ,9.019              ,1.004 
0    ,0          ,512  ,192  ,23        ,9.408         ,9.58               ,0.982 
0    ,0          ,512  ,224  ,23        ,9.337         ,9.378              ,0.996 
0    ,0          ,512  ,256  ,23        ,10.419        ,10.511             ,0.991 
0    ,0          ,512  ,288  ,23        ,10.862        ,10.885             ,0.998 
0    ,0          ,512  ,320  ,23        ,11.236        ,11.349             ,0.99  
0    ,0          ,512  ,352  ,23        ,12.097        ,11.381             ,1.063 
0    ,0          ,512  ,384  ,23        ,11.787        ,11.561             ,1.02  
0    ,0          ,512  ,416  ,23        ,12.889        ,12.124             ,1.063 
0    ,0          ,512  ,448  ,23        ,13.497        ,13.479             ,1.001 
0    ,0          ,512  ,480  ,23        ,13.987        ,13.836             ,1.011 
0    ,0          ,512  ,512  ,23        ,13.425        ,13.128             ,1.023 
0    ,0          ,512  ,544  ,23        ,13.628        ,13.322             ,1.023 
0    ,0          ,512  ,576  ,23        ,13.629        ,13.332             ,1.022 
0    ,0          ,512  ,608  ,23        ,13.592        ,13.286             ,1.023 
0    ,0          ,512  ,640  ,23        ,13.504        ,13.303             ,1.015 
0    ,0          ,512  ,672  ,23        ,13.641        ,13.31              ,1.025 
0    ,0          ,512  ,704  ,23        ,13.602        ,14.037             ,0.969 
0    ,0          ,512  ,736  ,23        ,13.599        ,13.259             ,1.026 
0    ,0          ,512  ,768  ,23        ,13.556        ,13.218             ,1.026 
0    ,0          ,512  ,800  ,23        ,13.479        ,13.274             ,1.016 
0    ,0          ,512  ,832  ,23        ,13.588        ,13.265             ,1.024 
0    ,0          ,512  ,864  ,23        ,13.552        ,13.265             ,1.022 
0    ,0          ,512  ,896  ,23        ,13.688        ,13.369             ,1.024 
0    ,0          ,544  ,256  ,23        ,10.269        ,10.421             ,0.985 
0    ,0          ,544  ,512  ,23        ,14.301        ,13.686             ,1.045 
0    ,0          ,576  ,256  ,23        ,10.335        ,10.421             ,0.992 
0    ,0          ,576  ,512  ,23        ,14.129        ,13.776             ,1.026 
0    ,0          ,6    ,5    ,0         ,3.6           ,3.781              ,0.952 
0    ,0          ,6    ,5    ,23        ,3.522         ,3.783              ,0.931 
0    ,0          ,6    ,7    ,0         ,3.506         ,3.787              ,0.926 
0    ,0          ,6    ,7    ,23        ,3.505         ,3.811              ,0.92  
0    ,0          ,608  ,256  ,23        ,10.422        ,10.401             ,1.002 
0    ,0          ,608  ,512  ,23        ,14.041        ,13.801             ,1.017 
0    ,0          ,64   ,128  ,23        ,4.606         ,5.534              ,0.832 
0    ,0          ,64   ,160  ,23        ,4.482         ,5.649              ,0.793 
0    ,0          ,64   ,192  ,23        ,4.629         ,5.528              ,0.837 
0    ,0          ,64   ,224  ,23        ,4.516         ,5.489              ,0.823 
0    ,0          ,64   ,256  ,23        ,4.448         ,5.588              ,0.796 
0    ,0          ,64   ,288  ,23        ,4.581         ,5.517              ,0.83  
0    ,0          ,64   ,32   ,23        ,4.755         ,5.667              ,0.839 
0    ,0          ,64   ,320  ,23        ,4.421         ,5.481              ,0.807 
0    ,0          ,64   ,352  ,23        ,4.562         ,5.522              ,0.826 
0    ,0          ,64   ,384  ,23        ,4.467         ,5.49               ,0.814 
0    ,0          ,64   ,416  ,23        ,4.384         ,5.449              ,0.804 
0    ,0          ,64   ,448  ,23        ,4.492         ,5.542              ,0.811 
0    ,0          ,64   ,64   ,23        ,4.373         ,5.382              ,0.812 
0    ,0          ,64   ,96   ,23        ,4.473         ,5.568              ,0.803 
0    ,0          ,640  ,1024 ,23        ,15.477        ,15.286             ,1.012 
0    ,0          ,640  ,256  ,23        ,10.386        ,10.54              ,0.985 
0    ,0          ,640  ,512  ,23        ,13.804        ,13.711             ,1.007 
0    ,0          ,672  ,1024 ,23        ,15.551        ,15.098             ,1.03  
0    ,0          ,672  ,512  ,23        ,14.409        ,14.727             ,0.978 
0    ,0          ,7    ,6    ,0         ,3.658         ,3.773              ,0.969 
0    ,0          ,7    ,6    ,23        ,3.684         ,3.864              ,0.953 
0    ,0          ,7    ,8    ,0         ,3.506         ,3.831              ,0.915 
0    ,0          ,7    ,8    ,23        ,3.498         ,3.796              ,0.921 
0    ,0          ,704  ,1024 ,23        ,16.131        ,15.806             ,1.021 
0    ,0          ,704  ,512  ,23        ,14.531        ,14.761             ,0.984 
0    ,0          ,736  ,1024 ,23        ,16.909        ,16.371             ,1.033 
0    ,0          ,736  ,512  ,23        ,14.332        ,14.728             ,0.973 
0    ,0          ,768  ,1024 ,23        ,17.52         ,17.314             ,1.012 
0    ,0          ,768  ,512  ,23        ,14.487        ,14.744             ,0.983 
0    ,0          ,7808 ,8192 ,23        ,142.838       ,140.594            ,1.016 
0    ,0          ,7840 ,8192 ,23        ,146.234       ,141.352            ,1.035 
0    ,0          ,7872 ,8192 ,23        ,145.796       ,142.548            ,1.023 
0    ,0          ,7904 ,8192 ,23        ,144.219       ,143.683            ,1.004 
0    ,0          ,7936 ,8192 ,23        ,147.803       ,143.665            ,1.029 
0    ,0          ,7968 ,8192 ,23        ,147.458       ,144.457            ,1.021 
0    ,0          ,8    ,7    ,0         ,3.556         ,3.801              ,0.935 
0    ,0          ,8    ,7    ,23        ,3.613         ,3.782              ,0.955 
0    ,0          ,8    ,9    ,0         ,3.5           ,3.811              ,0.918 
0    ,0          ,8    ,9    ,23        ,3.506         ,3.825              ,0.917 
0    ,0          ,80   ,16   ,23        ,3.541         ,3.965              ,0.893 
0    ,0          ,800  ,1024 ,23        ,17.385        ,17.114             ,1.016 
0    ,0          ,800  ,512  ,23        ,14.447        ,14.829             ,0.974 
0    ,0          ,8000 ,8192 ,23        ,147.199       ,144.857            ,1.016 
0    ,0          ,8032 ,8192 ,23        ,148.789       ,145.683            ,1.021 
0    ,0          ,8064 ,8192 ,23        ,149.846       ,145.922            ,1.027 
0    ,0          ,8096 ,8192 ,23        ,150.151       ,145.632            ,1.031 
0    ,0          ,8128 ,8192 ,23        ,149.362       ,146.551            ,1.019 
0    ,0          ,8160 ,8192 ,23        ,149.914       ,149.245            ,1.004 
0    ,0          ,832  ,1024 ,23        ,17.734        ,17.688             ,1.003 
0    ,0          ,832  ,512  ,23        ,14.485        ,14.736             ,0.983 
0    ,0          ,864  ,1024 ,23        ,18.89         ,17.95              ,1.052 
0    ,0          ,864  ,512  ,23        ,15.036        ,15.126             ,0.994 
0    ,0          ,896  ,1024 ,23        ,19.813        ,18.7               ,1.06  
0    ,0          ,896  ,512  ,23        ,14.523        ,14.808             ,0.981 
0    ,0          ,9    ,10   ,0         ,3.498         ,3.818              ,0.916 
0    ,0          ,9    ,10   ,23        ,3.519         ,3.792              ,0.928 
0    ,0          ,9    ,8    ,0         ,3.637         ,3.787              ,0.96  
0    ,0          ,9    ,8    ,23        ,3.571         ,3.784              ,0.944 
0    ,0          ,928  ,1024 ,23        ,19.587        ,18.73              ,1.046 
0    ,0          ,96   ,128  ,23        ,5.024         ,6.657              ,0.755 
0    ,0          ,96   ,256  ,23        ,5.063         ,6.472              ,0.782 
0    ,0          ,96   ,32   ,23        ,4.998         ,5.735              ,0.871 
0    ,0          ,96   ,64   ,23        ,5.6           ,5.634              ,0.994 
0    ,0          ,960  ,1024 ,23        ,19.758        ,19.474             ,1.015 
0    ,0          ,992  ,1024 ,23        ,21.526        ,19.571             ,1.1   
1    ,0          ,0    ,1    ,0         ,3.321         ,3.989              ,0.832 
1    ,0          ,0    ,1    ,23        ,3.381         ,4.061              ,0.833 
1    ,0          ,192  ,32   ,0         ,4.672         ,5.119              ,0.913 
1    ,0          ,192  ,32   ,23        ,4.516         ,4.979              ,0.907 
1    ,0          ,2    ,1    ,0         ,3.525         ,3.521              ,1.001 
1    ,0          ,2    ,1    ,23        ,3.608         ,3.668              ,0.984 
1    ,0          ,256  ,32   ,0         ,4.58          ,5.029              ,0.911 
1    ,0          ,256  ,32   ,23        ,4.569         ,5.008              ,0.912 
1    ,0          ,256  ,64   ,0         ,5.933         ,5.39               ,1.101 
1    ,0          ,256  ,64   ,23        ,5.057         ,5.365              ,0.943 
1    ,0          ,512  ,32   ,0         ,4.63          ,4.965              ,0.933 
1    ,0          ,512  ,32   ,23        ,4.581         ,5.087              ,0.901 
10   ,0          ,11   ,10   ,0         ,3.57          ,3.81               ,0.937 
10   ,0          ,11   ,10   ,23        ,3.59          ,3.816              ,0.941 
10   ,0          ,9    ,10   ,0         ,3.51          ,3.84               ,0.914 
10   ,0          ,9    ,10   ,23        ,3.506         ,3.818              ,0.918 
11   ,0          ,10   ,11   ,0         ,3.508         ,3.829              ,0.916 
11   ,0          ,10   ,11   ,23        ,3.5           ,3.952              ,0.886 
11   ,0          ,12   ,11   ,0         ,3.62          ,3.813              ,0.949 
11   ,0          ,12   ,11   ,23        ,3.595         ,3.816              ,0.942 
12   ,0          ,11   ,12   ,0         ,3.508         ,3.828              ,0.916 
12   ,0          ,11   ,12   ,23        ,3.509         ,3.823              ,0.918 
12   ,0          ,13   ,12   ,0         ,3.622         ,3.798              ,0.954 
12   ,0          ,13   ,12   ,23        ,3.567         ,3.835              ,0.93  
13   ,0          ,12   ,13   ,0         ,3.51          ,3.797              ,0.924 
13   ,0          ,12   ,13   ,23        ,3.485         ,3.778              ,0.922 
13   ,0          ,14   ,13   ,0         ,3.625         ,3.84               ,0.944 
13   ,0          ,14   ,13   ,23        ,3.594         ,3.842              ,0.935 
14   ,0          ,13   ,14   ,0         ,3.473         ,3.829              ,0.907 
14   ,0          ,13   ,14   ,23        ,3.5           ,3.846              ,0.91  
14   ,0          ,15   ,14   ,0         ,3.691         ,3.795              ,0.973 
14   ,0          ,15   ,14   ,23        ,3.537         ,3.828              ,0.924 
15   ,0          ,14   ,15   ,0         ,3.489         ,3.83               ,0.911 
15   ,0          ,14   ,15   ,23        ,3.495         ,3.793              ,0.921 
15   ,0          ,16   ,15   ,0         ,3.607         ,3.775              ,0.956 
15   ,0          ,16   ,15   ,23        ,3.619         ,3.883              ,0.932 
16   ,0          ,15   ,16   ,0         ,3.518         ,3.852              ,0.913 
16   ,0          ,15   ,16   ,23        ,3.492         ,3.772              ,0.926 
16   ,0          ,17   ,16   ,0         ,3.624         ,3.859              ,0.939 
16   ,0          ,17   ,16   ,23        ,3.634         ,3.817              ,0.952 
17   ,0          ,16   ,17   ,0         ,3.485         ,3.89               ,0.896 
17   ,0          ,16   ,17   ,23        ,3.498         ,3.836              ,0.912 
17   ,0          ,18   ,17   ,0         ,3.583         ,3.816              ,0.939 
17   ,0          ,18   ,17   ,23        ,3.595         ,3.818              ,0.942 
18   ,0          ,17   ,18   ,0         ,3.468         ,3.839              ,0.903 
18   ,0          ,17   ,18   ,23        ,3.493         ,3.805              ,0.918 
18   ,0          ,19   ,18   ,0         ,3.593         ,3.805              ,0.944 
18   ,0          ,19   ,18   ,23        ,3.585         ,3.776              ,0.949 
19   ,0          ,18   ,19   ,0         ,3.474         ,3.818              ,0.91  
19   ,0          ,18   ,19   ,23        ,3.474         ,3.832              ,0.907 
19   ,0          ,20   ,19   ,0         ,3.576         ,3.849              ,0.929 
19   ,0          ,20   ,19   ,23        ,3.502         ,3.873              ,0.904 
2    ,0          ,1    ,2    ,0         ,3.515         ,3.515              ,1.0   
2    ,0          ,1    ,2    ,23        ,3.506         ,3.504              ,1.0   
2    ,0          ,192  ,64   ,0         ,5.019         ,5.348              ,0.938 
2    ,0          ,192  ,64   ,23        ,5.265         ,5.433              ,0.969 
2    ,0          ,256  ,64   ,0         ,5.028         ,5.155              ,0.975 
2    ,0          ,256  ,64   ,23        ,4.967         ,5.161              ,0.962 
2    ,0          ,3    ,2    ,0         ,3.603         ,3.78               ,0.953 
2    ,0          ,3    ,2    ,23        ,3.568         ,3.829              ,0.932 
2    ,0          ,512  ,64   ,0         ,4.982         ,5.124              ,0.972 
2    ,0          ,512  ,64   ,23        ,4.963         ,5.239              ,0.947 
20   ,0          ,19   ,20   ,0         ,3.446         ,3.791              ,0.909 
20   ,0          ,19   ,20   ,23        ,3.475         ,3.819              ,0.91  
20   ,0          ,21   ,20   ,0         ,3.601         ,3.776              ,0.954 
20   ,0          ,21   ,20   ,23        ,3.599         ,3.798              ,0.948 
2048 ,0          ,0    ,1    ,0         ,3.429         ,4.112              ,0.834 
2048 ,0          ,0    ,1    ,23        ,3.455         ,4.144              ,0.834 
2048 ,0          ,1    ,2    ,0         ,3.525         ,3.505              ,1.006 
2048 ,0          ,1    ,2    ,23        ,3.498         ,3.496              ,1.001 
2048 ,0          ,10   ,11   ,0         ,3.5           ,3.931              ,0.89  
2048 ,0          ,10   ,11   ,23        ,3.542         ,3.848              ,0.92  
2048 ,0          ,10   ,9    ,0         ,3.588         ,3.819              ,0.94  
2048 ,0          ,10   ,9    ,23        ,3.595         ,3.836              ,0.937 
2048 ,0          ,11   ,10   ,0         ,3.626         ,3.785              ,0.958 
2048 ,0          ,11   ,10   ,23        ,3.622         ,3.816              ,0.949 
2048 ,0          ,11   ,12   ,0         ,3.491         ,3.826              ,0.912 
2048 ,0          ,11   ,12   ,23        ,3.49          ,3.804              ,0.917 
2048 ,0          ,12   ,11   ,0         ,3.556         ,3.774              ,0.942 
2048 ,0          ,12   ,11   ,23        ,3.678         ,3.986              ,0.923 
2048 ,0          ,12   ,13   ,0         ,3.494         ,3.835              ,0.911 
2048 ,0          ,12   ,13   ,23        ,3.481         ,3.829              ,0.909 
2048 ,0          ,13   ,12   ,0         ,3.632         ,3.888              ,0.934 
2048 ,0          ,13   ,12   ,23        ,3.614         ,3.824              ,0.945 
2048 ,0          ,13   ,14   ,0         ,3.497         ,3.888              ,0.9   
2048 ,0          ,13   ,14   ,23        ,3.506         ,3.833              ,0.915 
2048 ,0          ,14   ,13   ,0         ,3.568         ,3.792              ,0.941 
2048 ,0          ,14   ,13   ,23        ,3.563         ,3.829              ,0.931 
2048 ,0          ,14   ,15   ,0         ,3.482         ,3.809              ,0.914 
2048 ,0          ,14   ,15   ,23        ,3.471         ,3.792              ,0.915 
2048 ,0          ,15   ,14   ,0         ,3.598         ,3.813              ,0.944 
2048 ,0          ,15   ,14   ,23        ,3.576         ,3.868              ,0.925 
2048 ,0          ,15   ,16   ,0         ,3.506         ,3.915              ,0.896 
2048 ,0          ,15   ,16   ,23        ,3.494         ,3.827              ,0.913 
2048 ,0          ,16   ,15   ,0         ,3.564         ,3.857              ,0.924 
2048 ,0          ,16   ,15   ,23        ,3.578         ,3.789              ,0.944 
2048 ,0          ,16   ,17   ,0         ,3.487         ,3.826              ,0.911 
2048 ,0          ,16   ,17   ,23        ,3.472         ,3.789              ,0.916 
2048 ,0          ,17   ,16   ,0         ,3.572         ,3.859              ,0.925 
2048 ,0          ,17   ,16   ,23        ,3.64          ,3.797              ,0.959 
2048 ,0          ,17   ,18   ,0         ,3.485         ,3.808              ,0.915 
2048 ,0          ,17   ,18   ,23        ,3.471         ,3.896              ,0.891 
2048 ,0          ,18   ,17   ,0         ,3.585         ,3.802              ,0.943 
2048 ,0          ,18   ,17   ,23        ,3.578         ,3.834              ,0.933 
2048 ,0          ,18   ,19   ,0         ,3.5           ,3.797              ,0.922 
2048 ,0          ,18   ,19   ,23        ,3.468         ,3.798              ,0.913 
2048 ,0          ,19   ,18   ,0         ,3.595         ,3.893              ,0.923 
2048 ,0          ,19   ,18   ,23        ,3.588         ,3.862              ,0.929 
2048 ,0          ,19   ,20   ,0         ,3.455         ,3.908              ,0.884 
2048 ,0          ,19   ,20   ,23        ,3.465         ,3.801              ,0.911 
2048 ,0          ,2    ,1    ,0         ,3.461         ,3.542              ,0.977 
2048 ,0          ,2    ,1    ,23        ,3.27          ,3.298              ,0.992 
2048 ,0          ,2    ,3    ,0         ,3.686         ,3.71               ,0.994 
2048 ,0          ,2    ,3    ,23        ,3.681         ,3.836              ,0.959 
2048 ,0          ,20   ,19   ,0         ,3.601         ,3.756              ,0.959 
2048 ,0          ,20   ,19   ,23        ,3.586         ,3.85               ,0.932 
2048 ,0          ,20   ,21   ,0         ,3.448         ,3.753              ,0.919 
2048 ,0          ,20   ,21   ,23        ,3.496         ,3.85               ,0.908 
2048 ,0          ,21   ,20   ,0         ,3.632         ,3.848              ,0.944 
2048 ,0          ,21   ,20   ,23        ,3.599         ,3.813              ,0.944 
2048 ,0          ,21   ,22   ,0         ,3.45          ,3.763              ,0.917 
2048 ,0          ,21   ,22   ,23        ,3.436         ,3.82               ,0.899 
2048 ,0          ,22   ,21   ,0         ,3.575         ,3.914              ,0.914 
2048 ,0          ,22   ,21   ,23        ,3.574         ,3.793              ,0.942 
2048 ,0          ,22   ,23   ,0         ,3.442         ,3.759              ,0.916 
2048 ,0          ,22   ,23   ,23        ,3.437         ,3.802              ,0.904 
2048 ,0          ,23   ,22   ,0         ,3.553         ,3.789              ,0.938 
2048 ,0          ,23   ,22   ,23        ,3.571         ,3.739              ,0.955 
2048 ,0          ,23   ,24   ,0         ,3.429         ,3.78               ,0.907 
2048 ,0          ,23   ,24   ,23        ,3.467         ,3.739              ,0.927 
2048 ,0          ,24   ,23   ,0         ,3.566         ,3.821              ,0.933 
2048 ,0          ,24   ,23   ,23        ,3.536         ,3.759              ,0.941 
2048 ,0          ,24   ,25   ,0         ,3.429         ,3.718              ,0.922 
2048 ,0          ,24   ,25   ,23        ,3.431         ,3.794              ,0.904 
2048 ,0          ,25   ,24   ,0         ,3.521         ,3.735              ,0.943 
2048 ,0          ,25   ,24   ,23        ,3.557         ,3.713              ,0.958 
2048 ,0          ,25   ,26   ,0         ,3.389         ,3.764              ,0.901 
2048 ,0          ,25   ,26   ,23        ,3.369         ,3.712              ,0.908 
2048 ,0          ,26   ,25   ,0         ,3.511         ,3.82               ,0.919 
2048 ,0          ,26   ,25   ,23        ,3.524         ,3.81               ,0.925 
2048 ,0          ,26   ,27   ,0         ,3.399         ,3.767              ,0.902 
2048 ,0          ,26   ,27   ,23        ,3.411         ,3.733              ,0.914 
2048 ,0          ,27   ,26   ,0         ,3.511         ,3.742              ,0.938 
2048 ,0          ,27   ,26   ,23        ,3.526         ,3.733              ,0.945 
2048 ,0          ,27   ,28   ,0         ,3.358         ,3.709              ,0.905 
2048 ,0          ,27   ,28   ,23        ,3.408         ,3.735              ,0.912 
2048 ,0          ,28   ,27   ,0         ,3.508         ,3.733              ,0.94  
2048 ,0          ,28   ,27   ,23        ,3.467         ,3.686              ,0.941 
2048 ,0          ,28   ,29   ,0         ,3.335         ,3.699              ,0.902 
2048 ,0          ,28   ,29   ,23        ,3.363         ,3.675              ,0.915 
2048 ,0          ,29   ,28   ,0         ,3.561         ,3.72               ,0.957 
2048 ,0          ,29   ,28   ,23        ,3.501         ,3.707              ,0.944 
2048 ,0          ,29   ,30   ,0         ,3.348         ,3.734              ,0.897 
2048 ,0          ,29   ,30   ,23        ,3.336         ,3.767              ,0.886 
2048 ,0          ,3    ,2    ,0         ,3.627         ,3.8                ,0.954 
2048 ,0          ,3    ,2    ,23        ,3.632         ,3.831              ,0.948 
2048 ,0          ,3    ,4    ,0         ,3.501         ,3.491              ,1.003 
2048 ,0          ,3    ,4    ,23        ,3.498         ,3.652              ,0.958 
2048 ,0          ,30   ,29   ,0         ,3.528         ,3.794              ,0.93  
2048 ,0          ,30   ,29   ,23        ,3.47          ,3.666              ,0.947 
2048 ,0          ,30   ,31   ,0         ,3.355         ,3.752              ,0.894 
2048 ,0          ,30   ,31   ,23        ,3.316         ,3.671              ,0.903 
2048 ,0          ,31   ,30   ,0         ,3.429         ,3.679              ,0.932 
2048 ,0          ,31   ,30   ,23        ,3.441         ,3.724              ,0.924 
2048 ,0          ,32   ,31   ,0         ,3.367         ,3.671              ,0.917 
2048 ,0          ,32   ,31   ,23        ,3.416         ,3.708              ,0.921 
2048 ,0          ,4    ,3    ,0         ,3.699         ,3.977              ,0.93  
2048 ,0          ,4    ,3    ,23        ,3.832         ,3.977              ,0.964 
2048 ,0          ,4    ,5    ,0         ,3.527         ,3.549              ,0.994 
2048 ,0          ,4    ,5    ,23        ,3.489         ,3.567              ,0.978 
2048 ,0          ,5    ,4    ,0         ,3.657         ,3.842              ,0.952 
2048 ,0          ,5    ,4    ,23        ,3.655         ,3.789              ,0.965 
2048 ,0          ,5    ,6    ,0         ,3.51          ,3.778              ,0.929 
2048 ,0          ,5    ,6    ,23        ,3.498         ,3.794              ,0.922 
2048 ,0          ,6    ,5    ,0         ,3.601         ,3.798              ,0.948 
2048 ,0          ,6    ,5    ,23        ,3.637         ,3.846              ,0.946 
2048 ,0          ,6    ,7    ,0         ,3.48          ,3.741              ,0.93  
2048 ,0          ,6    ,7    ,23        ,3.489         ,3.804              ,0.917 
2048 ,0          ,7    ,6    ,0         ,3.613         ,3.817              ,0.947 
2048 ,0          ,7    ,6    ,23        ,3.6           ,3.783              ,0.952 
2048 ,0          ,7    ,8    ,0         ,3.48          ,3.816              ,0.912 
2048 ,0          ,7    ,8    ,23        ,3.498         ,3.743              ,0.934 
2048 ,0          ,8    ,7    ,0         ,3.599         ,3.791              ,0.95  
2048 ,0          ,8    ,7    ,23        ,3.616         ,3.859              ,0.937 
2048 ,0          ,8    ,9    ,0         ,3.509         ,3.791              ,0.925 
2048 ,0          ,8    ,9    ,23        ,3.501         ,3.801              ,0.921 
2048 ,0          ,9    ,10   ,0         ,3.509         ,3.841              ,0.913 
2048 ,0          ,9    ,10   ,23        ,3.507         ,3.804              ,0.922 
2048 ,0          ,9    ,8    ,0         ,3.583         ,3.771              ,0.95  
2048 ,0          ,9    ,8    ,23        ,3.551         ,3.844              ,0.924 
2049 ,0          ,0    ,1    ,0         ,3.316         ,3.994              ,0.83  
2049 ,0          ,0    ,1    ,23        ,3.378         ,4.055              ,0.833 
2049 ,0          ,2    ,1    ,0         ,3.498         ,3.602              ,0.971 
2049 ,0          ,2    ,1    ,23        ,3.502         ,3.565              ,0.982 
2050 ,0          ,1    ,2    ,0         ,3.533         ,3.531              ,1.001 
2050 ,0          ,1    ,2    ,23        ,3.513         ,3.504              ,1.002 
2050 ,0          ,3    ,2    ,0         ,3.628         ,3.894              ,0.932 
2050 ,0          ,3    ,2    ,23        ,3.579         ,3.836              ,0.933 
2051 ,0          ,2    ,3    ,0         ,3.697         ,3.771              ,0.98  
2051 ,0          ,2    ,3    ,23        ,3.696         ,3.738              ,0.989 
2051 ,0          ,4    ,3    ,0         ,3.751         ,3.969              ,0.945 
2051 ,0          ,4    ,3    ,23        ,3.713         ,3.979              ,0.933 
2052 ,0          ,3    ,4    ,0         ,3.498         ,3.544              ,0.987 
2052 ,0          ,3    ,4    ,23        ,3.521         ,3.513              ,1.002 
2052 ,0          ,5    ,4    ,0         ,3.575         ,3.824              ,0.935 
2052 ,0          ,5    ,4    ,23        ,3.598         ,3.877              ,0.928 
2053 ,0          ,4    ,5    ,0         ,3.506         ,3.592              ,0.976 
2053 ,0          ,4    ,5    ,23        ,3.509         ,3.525              ,0.996 
2053 ,0          ,6    ,5    ,0         ,3.558         ,3.881              ,0.917 
2053 ,0          ,6    ,5    ,23        ,3.597         ,3.853              ,0.933 
2054 ,0          ,5    ,6    ,0         ,3.503         ,3.807              ,0.92  
2054 ,0          ,5    ,6    ,23        ,3.515         ,3.827              ,0.919 
2054 ,0          ,7    ,6    ,0         ,3.535         ,3.793              ,0.932 
2054 ,0          ,7    ,6    ,23        ,3.572         ,3.796              ,0.941 
2055 ,0          ,6    ,7    ,0         ,3.492         ,3.691              ,0.946 
2055 ,0          ,6    ,7    ,23        ,3.489         ,3.717              ,0.939 
2055 ,0          ,8    ,7    ,0         ,3.604         ,3.792              ,0.95  
2055 ,0          ,8    ,7    ,23        ,3.542         ,3.784              ,0.936 
2056 ,0          ,7    ,8    ,0         ,3.507         ,3.861              ,0.908 
2056 ,0          ,7    ,8    ,23        ,3.501         ,3.825              ,0.915 
2056 ,0          ,9    ,8    ,0         ,3.599         ,3.792              ,0.949 
2056 ,0          ,9    ,8    ,23        ,3.585         ,3.818              ,0.939 
2057 ,0          ,10   ,9    ,0         ,3.607         ,3.816              ,0.945 
2057 ,0          ,10   ,9    ,23        ,3.652         ,3.814              ,0.958 
2057 ,0          ,8    ,9    ,0         ,3.515         ,3.827              ,0.918 
2057 ,0          ,8    ,9    ,23        ,3.506         ,3.808              ,0.921 
2058 ,0          ,11   ,10   ,0         ,3.593         ,3.806              ,0.944 
2058 ,0          ,11   ,10   ,23        ,3.623         ,3.845              ,0.942 
2058 ,0          ,9    ,10   ,0         ,3.506         ,3.844              ,0.912 
2058 ,0          ,9    ,10   ,23        ,3.498         ,3.819              ,0.916 
2059 ,0          ,10   ,11   ,0         ,3.506         ,3.862              ,0.908 
2059 ,0          ,10   ,11   ,23        ,3.509         ,3.794              ,0.925 
2059 ,0          ,12   ,11   ,0         ,3.567         ,3.855              ,0.925 
2059 ,0          ,12   ,11   ,23        ,3.595         ,3.8                ,0.946 
2060 ,0          ,11   ,12   ,0         ,3.509         ,3.87               ,0.907 
2060 ,0          ,11   ,12   ,23        ,3.494         ,3.773              ,0.926 
2060 ,0          ,13   ,12   ,0         ,3.537         ,3.78               ,0.936 
2060 ,0          ,13   ,12   ,23        ,3.631         ,3.839              ,0.946 
2061 ,0          ,12   ,13   ,0         ,3.509         ,3.854              ,0.91  
2061 ,0          ,12   ,13   ,23        ,3.491         ,3.815              ,0.915 
2061 ,0          ,14   ,13   ,0         ,3.572         ,3.838              ,0.931 
2061 ,0          ,14   ,13   ,23        ,3.588         ,3.796              ,0.945 
2062 ,0          ,13   ,14   ,0         ,3.497         ,3.839              ,0.911 
2062 ,0          ,13   ,14   ,23        ,3.481         ,3.809              ,0.914 
2062 ,0          ,15   ,14   ,0         ,3.621         ,3.802              ,0.952 
2062 ,0          ,15   ,14   ,23        ,3.549         ,3.869              ,0.917 
2063 ,0          ,14   ,15   ,0         ,3.489         ,3.825              ,0.912 
2063 ,0          ,14   ,15   ,23        ,3.478         ,3.78               ,0.92  
2063 ,0          ,16   ,15   ,0         ,3.571         ,3.823              ,0.934 
2063 ,0          ,16   ,15   ,23        ,3.58          ,3.827              ,0.935 
2064 ,0          ,15   ,16   ,0         ,3.489         ,3.846              ,0.907 
2064 ,0          ,15   ,16   ,23        ,3.486         ,3.827              ,0.911 
2064 ,0          ,17   ,16   ,0         ,3.567         ,3.811              ,0.936 
2064 ,0          ,17   ,16   ,23        ,3.638         ,3.83               ,0.95  
2065 ,0          ,16   ,17   ,0         ,3.482         ,3.772              ,0.923 
2065 ,0          ,16   ,17   ,23        ,3.498         ,3.841              ,0.911 
2065 ,0          ,18   ,17   ,0         ,3.559         ,3.807              ,0.935 
2065 ,0          ,18   ,17   ,23        ,3.62          ,3.731              ,0.97  
2066 ,0          ,17   ,18   ,0         ,3.476         ,3.809              ,0.913 
2066 ,0          ,17   ,18   ,23        ,3.467         ,3.843              ,0.902 
2066 ,0          ,19   ,18   ,0         ,3.58          ,3.806              ,0.941 
2066 ,0          ,19   ,18   ,23        ,3.577         ,3.915              ,0.914 
2067 ,0          ,18   ,19   ,0         ,3.485         ,3.828              ,0.91  
2067 ,0          ,18   ,19   ,23        ,3.471         ,3.831              ,0.906 
2067 ,0          ,20   ,19   ,0         ,3.611         ,3.848              ,0.938 
2067 ,0          ,20   ,19   ,23        ,3.582         ,3.855              ,0.929 
2068 ,0          ,19   ,20   ,0         ,3.449         ,3.739              ,0.922 
2068 ,0          ,19   ,20   ,23        ,3.463         ,3.827              ,0.905 
2068 ,0          ,21   ,20   ,0         ,3.669         ,3.824              ,0.959 
2068 ,0          ,21   ,20   ,23        ,3.6           ,3.845              ,0.936 
2069 ,0          ,20   ,21   ,0         ,3.441         ,3.802              ,0.905 
2069 ,0          ,20   ,21   ,23        ,3.463         ,3.735              ,0.927 
2069 ,0          ,22   ,21   ,0         ,3.609         ,3.768              ,0.958 
2069 ,0          ,22   ,21   ,23        ,3.605         ,3.769              ,0.956 
2070 ,0          ,21   ,22   ,0         ,3.431         ,3.815              ,0.899 
2070 ,0          ,21   ,22   ,23        ,3.452         ,3.81               ,0.906 
2070 ,0          ,23   ,22   ,0         ,3.563         ,3.811              ,0.935 
2070 ,0          ,23   ,22   ,23        ,3.53          ,3.85               ,0.917 
2071 ,0          ,22   ,23   ,0         ,3.439         ,3.837              ,0.896 
2071 ,0          ,22   ,23   ,23        ,3.421         ,3.778              ,0.905 
2071 ,0          ,24   ,23   ,0         ,3.552         ,3.746              ,0.948 
2071 ,0          ,24   ,23   ,23        ,3.545         ,3.805              ,0.932 
2072 ,0          ,23   ,24   ,0         ,3.431         ,3.788              ,0.906 
2072 ,0          ,23   ,24   ,23        ,3.444         ,3.789              ,0.909 
2072 ,0          ,25   ,24   ,0         ,3.553         ,3.781              ,0.94  
2072 ,0          ,25   ,24   ,23        ,3.563         ,3.74               ,0.953 
2073 ,0          ,24   ,25   ,0         ,3.421         ,3.688              ,0.928 
2073 ,0          ,24   ,25   ,23        ,3.425         ,3.833              ,0.893 
2073 ,0          ,26   ,25   ,0         ,3.56          ,3.765              ,0.945 
2073 ,0          ,26   ,25   ,23        ,3.549         ,3.758              ,0.945 
2074 ,0          ,25   ,26   ,0         ,3.4           ,3.743              ,0.908 
2074 ,0          ,25   ,26   ,23        ,3.39          ,3.725              ,0.91  
2074 ,0          ,27   ,26   ,0         ,3.509         ,3.807              ,0.922 
2074 ,0          ,27   ,26   ,23        ,3.514         ,3.791              ,0.927 
2075 ,0          ,26   ,27   ,0         ,3.395         ,3.765              ,0.902 
2075 ,0          ,26   ,27   ,23        ,3.391         ,3.75               ,0.904 
2075 ,0          ,28   ,27   ,0         ,3.538         ,3.772              ,0.938 
2075 ,0          ,28   ,27   ,23        ,3.504         ,3.705              ,0.946 
2076 ,0          ,27   ,28   ,0         ,3.368         ,3.689              ,0.913 
2076 ,0          ,27   ,28   ,23        ,3.358         ,3.732              ,0.9   
2076 ,0          ,29   ,28   ,0         ,3.523         ,3.723              ,0.946 
2076 ,0          ,29   ,28   ,23        ,3.443         ,3.752              ,0.917 
2077 ,0          ,28   ,29   ,0         ,3.356         ,3.711              ,0.904 
2077 ,0          ,28   ,29   ,23        ,3.348         ,3.684              ,0.909 
2077 ,0          ,30   ,29   ,0         ,3.5           ,3.68               ,0.951 
2077 ,0          ,30   ,29   ,23        ,3.4           ,3.711              ,0.916 
2078 ,0          ,29   ,30   ,0         ,3.368         ,3.697              ,0.911 
2078 ,0          ,29   ,30   ,23        ,3.348         ,3.652              ,0.917 
2078 ,0          ,31   ,30   ,0         ,3.455         ,3.781              ,0.914 
2078 ,0          ,31   ,30   ,23        ,3.461         ,3.735              ,0.927 
2079 ,0          ,30   ,31   ,0         ,3.372         ,3.816              ,0.884 
2079 ,0          ,30   ,31   ,23        ,3.357         ,3.692              ,0.909 
2079 ,0          ,32   ,31   ,0         ,3.358         ,3.741              ,0.898 
2079 ,0          ,32   ,31   ,23        ,3.386         ,3.702              ,0.915 
21   ,0          ,20   ,21   ,0         ,3.485         ,3.842              ,0.907 
21   ,0          ,20   ,21   ,23        ,3.469         ,3.829              ,0.906 
21   ,0          ,22   ,21   ,0         ,3.541         ,3.756              ,0.943 
21   ,0          ,22   ,21   ,23        ,3.586         ,3.787              ,0.947 
22   ,0          ,21   ,22   ,0         ,3.438         ,3.813              ,0.902 
22   ,0          ,21   ,22   ,23        ,3.44          ,3.788              ,0.908 
22   ,0          ,23   ,22   ,0         ,3.602         ,3.905              ,0.922 
22   ,0          ,23   ,22   ,23        ,3.604         ,3.83               ,0.941 
23   ,0          ,22   ,23   ,0         ,3.396         ,3.736              ,0.909 
23   ,0          ,22   ,23   ,23        ,3.386         ,3.856              ,0.878 
23   ,0          ,24   ,23   ,0         ,3.589         ,3.853              ,0.932 
23   ,0          ,24   ,23   ,23        ,3.528         ,3.816              ,0.925 
24   ,0          ,23   ,24   ,0         ,3.414         ,3.688              ,0.926 
24   ,0          ,23   ,24   ,23        ,3.402         ,3.768              ,0.903 
24   ,0          ,25   ,24   ,0         ,3.524         ,3.701              ,0.952 
24   ,0          ,25   ,24   ,23        ,3.486         ,3.738              ,0.933 
25   ,0          ,24   ,25   ,0         ,3.383         ,3.755              ,0.901 
25   ,0          ,24   ,25   ,23        ,3.382         ,3.766              ,0.898 
25   ,0          ,26   ,25   ,0         ,3.51          ,3.789              ,0.926 
25   ,0          ,26   ,25   ,23        ,3.475         ,3.735              ,0.93  
26   ,0          ,25   ,26   ,0         ,3.367         ,3.8                ,0.886 
26   ,0          ,25   ,26   ,23        ,3.364         ,3.732              ,0.901 
26   ,0          ,27   ,26   ,0         ,3.544         ,3.664              ,0.967 
26   ,0          ,27   ,26   ,23        ,3.487         ,3.706              ,0.941 
27   ,0          ,26   ,27   ,0         ,3.358         ,3.683              ,0.912 
27   ,0          ,26   ,27   ,23        ,3.33          ,3.736              ,0.891 
27   ,0          ,28   ,27   ,0         ,3.488         ,3.666              ,0.951 
27   ,0          ,28   ,27   ,23        ,3.479         ,3.707              ,0.938 
28   ,0          ,27   ,28   ,0         ,3.367         ,3.826              ,0.88  
28   ,0          ,27   ,28   ,23        ,3.323         ,3.709              ,0.896 
28   ,0          ,29   ,28   ,0         ,3.468         ,3.704              ,0.936 
28   ,0          ,29   ,28   ,23        ,3.537         ,3.804              ,0.93  
29   ,0          ,28   ,29   ,0         ,3.322         ,3.699              ,0.898 
29   ,0          ,28   ,29   ,23        ,3.291         ,3.701              ,0.889 
29   ,0          ,30   ,29   ,0         ,3.451         ,3.715              ,0.929 
29   ,0          ,30   ,29   ,23        ,3.412         ,3.674              ,0.929 
3    ,0          ,192  ,96   ,0         ,5.844         ,5.713              ,1.023 
3    ,0          ,192  ,96   ,23        ,5.792         ,5.688              ,1.018 
3    ,0          ,2    ,3    ,0         ,3.699         ,3.756              ,0.985 
3    ,0          ,2    ,3    ,23        ,3.686         ,3.753              ,0.982 
3    ,0          ,256  ,64   ,0         ,4.998         ,5.242              ,0.953 
3    ,0          ,256  ,64   ,23        ,4.987         ,5.224              ,0.955 
3    ,0          ,256  ,96   ,0         ,5.846         ,5.735              ,1.019 
3    ,0          ,256  ,96   ,23        ,5.809         ,5.795              ,1.003 
3    ,0          ,4    ,3    ,0         ,3.619         ,3.823              ,0.947 
3    ,0          ,4    ,3    ,23        ,3.644         ,3.798              ,0.96  
3    ,0          ,512  ,96   ,0         ,5.684         ,5.685              ,1.0   
3    ,0          ,512  ,96   ,23        ,5.781         ,5.718              ,1.011 
30   ,0          ,29   ,30   ,0         ,3.332         ,3.682              ,0.905 
30   ,0          ,29   ,30   ,23        ,3.327         ,3.688              ,0.902 
30   ,0          ,31   ,30   ,0         ,3.403         ,3.732              ,0.912 
30   ,0          ,31   ,30   ,23        ,3.406         ,3.778              ,0.902 
31   ,0          ,30   ,31   ,0         ,3.358         ,3.665              ,0.916 
31   ,0          ,30   ,31   ,23        ,3.334         ,3.663              ,0.91  
31   ,0          ,32   ,31   ,0         ,3.381         ,3.712              ,0.911 
31   ,0          ,32   ,31   ,23        ,3.506         ,3.837              ,0.914 
4    ,0          ,192  ,128  ,0         ,6.737         ,6.179              ,1.09  
4    ,0          ,192  ,128  ,23        ,6.341         ,6.195              ,1.024 
4    ,0          ,256  ,128  ,0         ,6.751         ,6.094              ,1.108 
4    ,0          ,256  ,128  ,23        ,6.153         ,6.145              ,1.001 
4    ,0          ,256  ,64   ,0         ,5.052         ,5.33               ,0.948 
4    ,0          ,256  ,64   ,23        ,5.043         ,5.31               ,0.95  
4    ,0          ,3    ,4    ,0         ,3.515         ,3.542              ,0.992 
4    ,0          ,3    ,4    ,23        ,3.508         ,3.531              ,0.993 
4    ,0          ,5    ,4    ,0         ,3.548         ,3.767              ,0.942 
4    ,0          ,5    ,4    ,23        ,3.543         ,3.752              ,0.944 
4    ,0          ,512  ,128  ,0         ,6.143         ,6.093              ,1.008 
4    ,0          ,512  ,128  ,23        ,6.715         ,6.042              ,1.111 
4081 ,0          ,0    ,1    ,0         ,3.262         ,3.912              ,0.834 
4081 ,0          ,0    ,1    ,23        ,3.27          ,3.921              ,0.834 
4081 ,0          ,1    ,2    ,0         ,5.01          ,5.101              ,0.982 
4081 ,0          ,1    ,2    ,23        ,5.01          ,5.061              ,0.99  
4081 ,0          ,10   ,11   ,0         ,4.959         ,5.291              ,0.937 
4081 ,0          ,10   ,11   ,23        ,4.966         ,5.312              ,0.935 
4081 ,0          ,10   ,9    ,0         ,4.317         ,5.319              ,0.812 
4081 ,0          ,10   ,9    ,23        ,4.32          ,5.257              ,0.822 
4081 ,0          ,11   ,10   ,0         ,4.314         ,5.287              ,0.816 
4081 ,0          ,11   ,10   ,23        ,4.325         ,5.268              ,0.821 
4081 ,0          ,11   ,12   ,0         ,4.94          ,5.302              ,0.932 
4081 ,0          ,11   ,12   ,23        ,4.96          ,5.291              ,0.937 
4081 ,0          ,12   ,11   ,0         ,4.379         ,5.237              ,0.836 
4081 ,0          ,12   ,11   ,23        ,4.304         ,5.285              ,0.814 
4081 ,0          ,12   ,13   ,0         ,4.971         ,5.321              ,0.934 
4081 ,0          ,12   ,13   ,23        ,4.944         ,5.26               ,0.94  
4081 ,0          ,13   ,12   ,0         ,4.302         ,5.298              ,0.812 
4081 ,0          ,13   ,12   ,23        ,4.296         ,5.238              ,0.82  
4081 ,0          ,13   ,14   ,0         ,4.933         ,5.278              ,0.935 
4081 ,0          ,13   ,14   ,23        ,4.963         ,5.356              ,0.926 
4081 ,0          ,14   ,13   ,0         ,4.292         ,5.262              ,0.816 
4081 ,0          ,14   ,13   ,23        ,4.337         ,5.342              ,0.812 
4081 ,0          ,14   ,15   ,0         ,4.899         ,5.254              ,0.932 
4081 ,0          ,14   ,15   ,23        ,4.955         ,5.272              ,0.94  
4081 ,0          ,15   ,14   ,0         ,4.327         ,5.284              ,0.819 
4081 ,0          ,15   ,14   ,23        ,4.327         ,5.382              ,0.804 
4081 ,0          ,15   ,16   ,0         ,4.939         ,5.28               ,0.935 
4081 ,0          ,15   ,16   ,23        ,4.986         ,5.275              ,0.945 
4081 ,0          ,16   ,15   ,0         ,5.696         ,7.264              ,0.784 
4081 ,0          ,16   ,15   ,23        ,5.642         ,7.302              ,0.773 
4081 ,0          ,16   ,17   ,0         ,5.603         ,7.975              ,0.703 
4081 ,0          ,16   ,17   ,23        ,5.635         ,7.971              ,0.707 
4081 ,0          ,17   ,16   ,0         ,5.659         ,7.294              ,0.776 
4081 ,0          ,17   ,16   ,23        ,5.716         ,7.371              ,0.775 
4081 ,0          ,17   ,18   ,0         ,5.602         ,7.928              ,0.707 
4081 ,0          ,17   ,18   ,23        ,5.65          ,7.964              ,0.709 
4081 ,0          ,18   ,17   ,0         ,5.697         ,7.34               ,0.776 
4081 ,0          ,18   ,17   ,23        ,5.647         ,7.265              ,0.777 
4081 ,0          ,18   ,19   ,0         ,5.587         ,7.918              ,0.706 
4081 ,0          ,18   ,19   ,23        ,5.625         ,8.091              ,0.695 
4081 ,0          ,19   ,18   ,0         ,5.645         ,7.312              ,0.772 
4081 ,0          ,19   ,18   ,23        ,5.711         ,7.357              ,0.776 
4081 ,0          ,19   ,20   ,0         ,5.572         ,7.979              ,0.698 
4081 ,0          ,19   ,20   ,23        ,5.649         ,7.944              ,0.711 
4081 ,0          ,2    ,1    ,0         ,4.2           ,5.012              ,0.838 
4081 ,0          ,2    ,1    ,23        ,3.979         ,4.597              ,0.865 
4081 ,0          ,2    ,3    ,0         ,5.245         ,5.274              ,0.994 
4081 ,0          ,2    ,3    ,23        ,5.27          ,5.303              ,0.994 
4081 ,0          ,20   ,19   ,0         ,5.646         ,7.264              ,0.777 
4081 ,0          ,20   ,19   ,23        ,5.649         ,7.373              ,0.766 
4081 ,0          ,20   ,21   ,0         ,5.583         ,7.914              ,0.705 
4081 ,0          ,20   ,21   ,23        ,5.614         ,7.952              ,0.706 
4081 ,0          ,21   ,20   ,0         ,5.64          ,7.308              ,0.772 
4081 ,0          ,21   ,20   ,23        ,5.657         ,7.283              ,0.777 
4081 ,0          ,21   ,22   ,0         ,5.592         ,7.854              ,0.712 
4081 ,0          ,21   ,22   ,23        ,5.592         ,7.881              ,0.71  
4081 ,0          ,22   ,21   ,0         ,5.653         ,7.219              ,0.783 
4081 ,0          ,22   ,21   ,23        ,5.628         ,7.21               ,0.781 
4081 ,0          ,22   ,23   ,0         ,5.633         ,7.904              ,0.713 
4081 ,0          ,22   ,23   ,23        ,5.634         ,7.902              ,0.713 
4081 ,0          ,23   ,22   ,0         ,5.658         ,7.27               ,0.778 
4081 ,0          ,23   ,22   ,23        ,5.653         ,7.243              ,0.78  
4081 ,0          ,23   ,24   ,0         ,5.546         ,7.838              ,0.708 
4081 ,0          ,23   ,24   ,23        ,5.574         ,7.876              ,0.708 
4081 ,0          ,24   ,23   ,0         ,5.641         ,7.303              ,0.772 
4081 ,0          ,24   ,23   ,23        ,5.645         ,7.225              ,0.781 
4081 ,0          ,24   ,25   ,0         ,5.566         ,7.864              ,0.708 
4081 ,0          ,24   ,25   ,23        ,5.555         ,7.879              ,0.705 
4081 ,0          ,25   ,24   ,0         ,5.603         ,7.182              ,0.78  
4081 ,0          ,25   ,24   ,23        ,5.604         ,7.186              ,0.78  
4081 ,0          ,25   ,26   ,0         ,5.498         ,7.79               ,0.706 
4081 ,0          ,25   ,26   ,23        ,5.503         ,7.781              ,0.707 
4081 ,0          ,256  ,128  ,23        ,6.564         ,7.033              ,0.933 
4081 ,0          ,256  ,160  ,23        ,8.062         ,8.228              ,0.98  
4081 ,0          ,256  ,192  ,23        ,8.183         ,8.162              ,1.003 
4081 ,0          ,256  ,224  ,23        ,9.406         ,9.034              ,1.041 
4081 ,0          ,256  ,32   ,23        ,5.45          ,6.315              ,0.863 
4081 ,0          ,256  ,64   ,0         ,5.398         ,5.967              ,0.905 
4081 ,0          ,256  ,64   ,23        ,5.557         ,6.259              ,0.888 
4081 ,0          ,256  ,96   ,23        ,6.277         ,6.661              ,0.942 
4081 ,0          ,26   ,25   ,0         ,5.616         ,7.212              ,0.779 
4081 ,0          ,26   ,25   ,23        ,5.586         ,7.134              ,0.783 
4081 ,0          ,26   ,27   ,0         ,5.467         ,7.724              ,0.708 
4081 ,0          ,26   ,27   ,23        ,5.453         ,7.743              ,0.704 
4081 ,0          ,27   ,26   ,0         ,5.56          ,7.131              ,0.78  
4081 ,0          ,27   ,26   ,23        ,5.559         ,7.112              ,0.782 
4081 ,0          ,27   ,28   ,0         ,5.459         ,7.804              ,0.699 
4081 ,0          ,27   ,28   ,23        ,5.454         ,7.837              ,0.696 
4081 ,0          ,28   ,27   ,0         ,5.599         ,7.209              ,0.777 
4081 ,0          ,28   ,27   ,23        ,5.531         ,7.126              ,0.776 
4081 ,0          ,28   ,29   ,0         ,5.458         ,7.795              ,0.7   
4081 ,0          ,28   ,29   ,23        ,5.467         ,7.69               ,0.711 
4081 ,0          ,29   ,28   ,0         ,5.563         ,7.19               ,0.774 
4081 ,0          ,29   ,28   ,23        ,5.536         ,7.119              ,0.778 
4081 ,0          ,29   ,30   ,0         ,5.464         ,7.727              ,0.707 
4081 ,0          ,29   ,30   ,23        ,5.507         ,7.707              ,0.715 
4081 ,0          ,3    ,2    ,0         ,4.347         ,5.331              ,0.815 
4081 ,0          ,3    ,2    ,23        ,4.366         ,5.319              ,0.821 
4081 ,0          ,3    ,4    ,0         ,4.968         ,5.147              ,0.965 
4081 ,0          ,3    ,4    ,23        ,4.972         ,5.04               ,0.987 
4081 ,0          ,30   ,29   ,0         ,5.589         ,7.146              ,0.782 
4081 ,0          ,30   ,29   ,23        ,5.561         ,7.145              ,0.778 
4081 ,0          ,30   ,31   ,0         ,5.453         ,7.709              ,0.707 
4081 ,0          ,30   ,31   ,23        ,5.441         ,7.687              ,0.708 
4081 ,0          ,31   ,30   ,0         ,5.498         ,7.059              ,0.779 
4081 ,0          ,31   ,30   ,23        ,5.52          ,7.076              ,0.78  
4081 ,0          ,32   ,31   ,0         ,5.496         ,7.072              ,0.777 
4081 ,0          ,32   ,31   ,23        ,5.506         ,7.113              ,0.774 
4081 ,0          ,4    ,3    ,0         ,4.341         ,5.298              ,0.819 
4081 ,0          ,4    ,3    ,23        ,4.333         ,5.34               ,0.811 
4081 ,0          ,4    ,5    ,0         ,4.968         ,5.179              ,0.959 
4081 ,0          ,4    ,5    ,23        ,4.984         ,5.108              ,0.976 
4081 ,0          ,5    ,4    ,0         ,4.327         ,5.31               ,0.815 
4081 ,0          ,5    ,4    ,23        ,4.345         ,5.274              ,0.824 
4081 ,0          ,5    ,6    ,0         ,4.907         ,5.312              ,0.924 
4081 ,0          ,5    ,6    ,23        ,4.935         ,5.239              ,0.942 
4081 ,0          ,6    ,5    ,0         ,4.335         ,5.322              ,0.815 
4081 ,0          ,6    ,5    ,23        ,4.337         ,5.272              ,0.823 
4081 ,0          ,6    ,7    ,0         ,4.929         ,5.278              ,0.934 
4081 ,0          ,6    ,7    ,23        ,4.956         ,5.192              ,0.954 
4081 ,0          ,7    ,6    ,0         ,4.307         ,5.273              ,0.817 
4081 ,0          ,7    ,6    ,23        ,4.263         ,5.198              ,0.82  
4081 ,0          ,7    ,8    ,0         ,4.941         ,5.263              ,0.939 
4081 ,0          ,7    ,8    ,23        ,4.975         ,5.301              ,0.939 
4081 ,0          ,8    ,7    ,0         ,4.315         ,5.236              ,0.824 
4081 ,0          ,8    ,7    ,23        ,4.312         ,5.331              ,0.809 
4081 ,0          ,8    ,9    ,0         ,4.97          ,5.327              ,0.933 
4081 ,0          ,8    ,9    ,23        ,4.953         ,5.266              ,0.941 
4081 ,0          ,9    ,10   ,0         ,4.941         ,5.297              ,0.933 
4081 ,0          ,9    ,10   ,23        ,4.959         ,5.303              ,0.935 
4081 ,0          ,9    ,8    ,0         ,4.314         ,5.283              ,0.817 
4081 ,0          ,9    ,8    ,23        ,4.331         ,5.283              ,0.82  
5    ,0          ,192  ,160  ,0         ,7.739         ,7.265              ,1.065 
5    ,0          ,192  ,160  ,23        ,7.878         ,7.41               ,1.063 
5    ,0          ,256  ,160  ,0         ,7.5           ,7.28               ,1.03  
5    ,0          ,256  ,160  ,23        ,7.693         ,7.228              ,1.064 
5    ,0          ,256  ,64   ,0         ,5.195         ,5.353              ,0.97  
5    ,0          ,256  ,64   ,23        ,5.142         ,5.359              ,0.96  
5    ,0          ,4    ,5    ,0         ,3.508         ,3.534              ,0.993 
5    ,0          ,4    ,5    ,23        ,3.506         ,3.532              ,0.993 
5    ,0          ,512  ,160  ,0         ,9.026         ,9.23               ,0.978 
5    ,0          ,512  ,160  ,23        ,9.133         ,9.441              ,0.967 
5    ,0          ,6    ,5    ,0         ,3.575         ,3.729              ,0.959 
5    ,0          ,6    ,5    ,23        ,3.556         ,3.791              ,0.938 
6    ,0          ,192  ,192  ,0         ,7.969         ,7.958              ,1.001 
6    ,0          ,192  ,192  ,23        ,8.081         ,7.991              ,1.011 
6    ,0          ,256  ,192  ,0         ,7.801         ,7.655              ,1.019 
6    ,0          ,256  ,192  ,23        ,7.927         ,7.813              ,1.015 
6    ,0          ,256  ,64   ,0         ,5.218         ,5.435              ,0.96  
6    ,0          ,256  ,64   ,23        ,5.112         ,5.372              ,0.952 
6    ,0          ,5    ,6    ,0         ,3.491         ,3.684              ,0.948 
6    ,0          ,5    ,6    ,23        ,3.483         ,3.718              ,0.937 
6    ,0          ,512  ,192  ,0         ,9.568         ,9.86               ,0.97  
6    ,0          ,512  ,192  ,23        ,9.556         ,9.693              ,0.986 
6    ,0          ,7    ,6    ,0         ,3.631         ,3.739              ,0.971 
6    ,0          ,7    ,6    ,23        ,3.614         ,3.865              ,0.935 
7    ,0          ,192  ,224  ,0         ,7.997         ,7.814              ,1.023 
7    ,0          ,192  ,224  ,23        ,7.919         ,7.82               ,1.013 
7    ,0          ,256  ,224  ,0         ,8.76          ,8.428              ,1.039 
7    ,0          ,256  ,224  ,23        ,8.73          ,8.474              ,1.03  
7    ,0          ,256  ,64   ,0         ,5.074         ,5.389              ,0.942 
7    ,0          ,256  ,64   ,23        ,5.123         ,5.229              ,0.98  
7    ,0          ,512  ,224  ,0         ,9.416         ,9.45               ,0.996 
7    ,0          ,512  ,224  ,23        ,9.405         ,9.482              ,0.992 
7    ,0          ,6    ,7    ,0         ,3.498         ,3.75               ,0.933 
7    ,0          ,6    ,7    ,23        ,3.49          ,3.738              ,0.934 
7    ,0          ,8    ,7    ,0         ,3.631         ,3.773              ,0.962 
7    ,0          ,8    ,7    ,23        ,3.622         ,3.79               ,0.956 
8    ,0          ,7    ,8    ,0         ,3.498         ,3.761              ,0.93  
8    ,0          ,7    ,8    ,23        ,3.489         ,3.785              ,0.922 
8    ,0          ,9    ,8    ,0         ,3.606         ,3.782              ,0.953 
8    ,0          ,9    ,8    ,23        ,3.604         ,3.85               ,0.936 
9    ,0          ,10   ,9    ,0         ,3.589         ,3.84               ,0.935 
9    ,0          ,10   ,9    ,23        ,3.624         ,3.814              ,0.95  
9    ,0          ,8    ,9    ,0         ,3.508         ,3.822              ,0.918 
9    ,0          ,8    ,9    ,23        ,3.5           ,3.793              ,0.923 
Geometric mean (New Time / Old Time): 0.9281712548418259
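
A note on the figure above: it is the geometric mean of the last
column, assuming that column is the New Time / Old Time ratio for each
row.  A minimal C sketch of the computation (the function name and
array input are illustrative, not part of the benchtests):

    /* Geometric mean of n ratios, computed as exp(mean(log(r_i))) so
       that a long product of values doesn't overflow or underflow.
       Link with -lm.  */
    #include <math.h>
    #include <stddef.h>

    static double
    geomean (const double *ratios, size_t n)
    {
      double sum = 0.0;
      for (size_t i = 0; i < n; i++)
        sum += log (ratios[i]);
      return exp (sum / n);
    }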


* Re: [PATCH v1 2/7] x86: Shrink / minorly optimize strchr-evex and implement with VMM headers
  2022-10-18  2:48 ` [PATCH v1 2/7] x86: Shrink / minorly optimize strchr-evex " Noah Goldstein
@ 2022-10-18  2:51   ` Noah Goldstein
  0 siblings, 0 replies; 41+ messages in thread
From: Noah Goldstein @ 2022-10-18  2:51 UTC (permalink / raw)
  To: libc-alpha; +Cc: hjl.tools, carlos

[-- Attachment #1: Type: text/plain, Size: 26558 bytes --]

On Mon, Oct 17, 2022 at 7:49 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Size Optimizations:
> 1. Condense the hot path for better cache-locality.
>     - This is most impactful for strchrnul, where the logic for strings
>       with len <= VEC_SIZE or with a match in the first VEC now fits
>       entirely in the first cache line.
> 2. Reuse common targets in first 4x VEC and after the loop.
> 3. Don't align targets so aggressively if it doesn't change the number
>    of fetch blocks they require, and take more care to avoid cases
>    where targets unnecessarily split cache lines.
> 4. Align the loop better for the DSB/LSD.
> 5. Use more code-size efficient instructions.
>         - tzcnt ...     -> bsf ...
>         - vpcmpb $0 ... -> vpcmpeq ...
> 6. Align labels less aggressively, especially if it doesn't save fetch
>    blocks / causes the basic-block to span extra cache-lines.
>
> Code Size Changes:
> strchr-evex.S   : -63 bytes
> strchrnul-evex.S: -48 bytes
>
> Net perf changes:
> Reported as the geometric mean of all improvements / regressions from
> N=10 runs of the benchtests. Values are New Time / Old Time, so < 1.0
> is an improvement and > 1.0 is a regression.
>
> strchr-evex.S (Fixed)   : 0.971
> strchr-evex.S (Rand)    : 0.932
> strchrnul-evex.S        : 0.965
>
> Full results attached in email.
>
> Full check passes on x86-64.
> ---
>  sysdeps/x86_64/multiarch/strchr-evex.S | 558 +++++++++++++++----------
>  1 file changed, 340 insertions(+), 218 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/strchr-evex.S b/sysdeps/x86_64/multiarch/strchr-evex.S
> index a1c15c4419..c2a0d112f7 100644
> --- a/sysdeps/x86_64/multiarch/strchr-evex.S
> +++ b/sysdeps/x86_64/multiarch/strchr-evex.S
> @@ -26,48 +26,75 @@
>  #  define STRCHR       __strchr_evex
>  # endif
>
> -# define VMOVU         vmovdqu64
> -# define VMOVA         vmovdqa64
> +# ifndef VEC_SIZE
> +#  include "x86-evex256-vecs.h"
> +# endif
>
>  # ifdef USE_AS_WCSCHR
>  #  define VPBROADCAST  vpbroadcastd
> -#  define VPCMP                vpcmpd
> +#  define VPCMP        vpcmpd
> +#  define VPCMPEQ      vpcmpeqd
>  #  define VPTESTN      vptestnmd
> +#  define VPTEST       vptestmd
>  #  define VPMINU       vpminud
>  #  define CHAR_REG     esi
> -#  define SHIFT_REG    ecx
> +#  define SHIFT_REG    rcx
>  #  define CHAR_SIZE    4
> +
> +#  define USE_WIDE_CHAR
>  # else
>  #  define VPBROADCAST  vpbroadcastb
> -#  define VPCMP                vpcmpb
> +#  define VPCMP        vpcmpb
> +#  define VPCMPEQ      vpcmpeqb
>  #  define VPTESTN      vptestnmb
> +#  define VPTEST       vptestmb
>  #  define VPMINU       vpminub
>  #  define CHAR_REG     sil
> -#  define SHIFT_REG    edx
> +#  define SHIFT_REG    rdi
>  #  define CHAR_SIZE    1
>  # endif
>
> -# define XMMZERO       xmm16
> -
> -# define YMMZERO       ymm16
> -# define YMM0          ymm17
> -# define YMM1          ymm18
> -# define YMM2          ymm19
> -# define YMM3          ymm20
> -# define YMM4          ymm21
> -# define YMM5          ymm22
> -# define YMM6          ymm23
> -# define YMM7          ymm24
> -# define YMM8          ymm25
> -
> -# define VEC_SIZE 32
> -# define PAGE_SIZE 4096
> -# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
> -
> -       .section .text.evex,"ax",@progbits
> -ENTRY_P2ALIGN (STRCHR, 5)
> -       /* Broadcast CHAR to YMM0.      */
> -       VPBROADCAST     %esi, %YMM0
> +# include "reg-macros.h"
> +
> +# if VEC_SIZE == 64
> +#  define MASK_GPR     rcx
> +#  define LOOP_REG     rax
> +
> +#  define COND_MASK(k_reg)     {%k_reg}
> +# else
> +#  define MASK_GPR     rax
> +#  define LOOP_REG     rdi
> +
> +#  define COND_MASK(k_reg)
> +# endif
> +
> +# define CHAR_PER_VEC  (VEC_SIZE / CHAR_SIZE)
> +
> +
> +# if CHAR_PER_VEC == 64
> +#  define LAST_VEC_OFFSET      (VEC_SIZE * 3)
> +#  define TESTZ(reg)   incq %VGPR_SZ(reg, 64)
> +# else
> +
> +#  if CHAR_PER_VEC == 32
> +#   define TESTZ(reg)  incl %VGPR_SZ(reg, 32)
> +#  elif CHAR_PER_VEC == 16
> +#   define TESTZ(reg)  incw %VGPR_SZ(reg, 16)
> +#  else
> +#   define TESTZ(reg)  incb %VGPR_SZ(reg, 8)
> +#  endif
> +
> +#  define LAST_VEC_OFFSET      (VEC_SIZE * 2)
> +# endif
> +
> +# define VMATCH        VMM(0)
> +
> +# define PAGE_SIZE     4096
> +
> +       .section SECTION(.text), "ax", @progbits
> +ENTRY_P2ALIGN (STRCHR, 6)
> +       /* Broadcast CHAR to VEC_0.  */
> +       VPBROADCAST %esi, %VMATCH
>         movl    %edi, %eax
>         andl    $(PAGE_SIZE - 1), %eax
>         /* Check if we cross page boundary with one vector load.
> @@ -75,19 +102,27 @@ ENTRY_P2ALIGN (STRCHR, 5)
>         cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
>         ja      L(cross_page_boundary)
>
> +
>         /* Check the first VEC_SIZE bytes. Search for both CHAR and the
>            null bytes.  */
> -       VMOVU   (%rdi), %YMM1
> -
> +       VMOVU   (%rdi), %VMM(1)
>         /* Leaves only CHARS matching esi as 0.  */
> -       vpxorq  %YMM1, %YMM0, %YMM2
> -       VPMINU  %YMM2, %YMM1, %YMM2
> -       /* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
> -       VPTESTN %YMM2, %YMM2, %k0
> -       kmovd   %k0, %eax
> -       testl   %eax, %eax
> +       vpxorq  %VMM(1), %VMATCH, %VMM(2)
> +       VPMINU  %VMM(2), %VMM(1), %VMM(2)
> +       /* Each bit in K0 represents a CHAR or a null byte in VEC_1.  */
> +       VPTESTN %VMM(2), %VMM(2), %k0
> +       KMOV    %k0, %VRAX
> +# if VEC_SIZE == 64 && defined USE_AS_STRCHRNUL
> +       /* If VEC_SIZE == 64 && STRCHRNUL use bsf to test the condition
> +          so that all logic for match/null in the first VEC fits in 1x
> +          cache lines.  This has a slight cost for larger sizes.  */
> +       bsf     %VRAX, %VRAX
> +       jz      L(aligned_more)
> +# else
> +       test    %VRAX, %VRAX
>         jz      L(aligned_more)
> -       tzcntl  %eax, %eax
> +       bsf     %VRAX, %VRAX
> +# endif
>  # ifndef USE_AS_STRCHRNUL
>         /* Found CHAR or the null byte.  */
>         cmp     (%rdi, %rax, CHAR_SIZE), %CHAR_REG
> @@ -109,287 +144,374 @@ ENTRY_P2ALIGN (STRCHR, 5)
>  # endif
>         ret
>
> -
> -
> -       .p2align 4,, 10
> -L(first_vec_x4):
> -# ifndef USE_AS_STRCHRNUL
> -       /* Check to see if first match was CHAR (k0) or null (k1).  */
> -       kmovd   %k0, %eax
> -       tzcntl  %eax, %eax
> -       kmovd   %k1, %ecx
> -       /* bzhil will not be 0 if first match was null.  */
> -       bzhil   %eax, %ecx, %ecx
> -       jne     L(zero)
> -# else
> -       /* Combine CHAR and null matches.  */
> -       kord    %k0, %k1, %k0
> -       kmovd   %k0, %eax
> -       tzcntl  %eax, %eax
> -# endif
> -       /* NB: Multiply sizeof char type (1 or 4) to get the number of
> -          bytes.  */
> -       leaq    (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
> -       ret
> -
>  # ifndef USE_AS_STRCHRNUL
>  L(zero):
>         xorl    %eax, %eax
>         ret
>  # endif
>
> -
> -       .p2align 4
> +       .p2align 4,, 2
> +L(first_vec_x3):
> +       subq    $-(VEC_SIZE * 2), %rdi
> +# if VEC_SIZE == 32
> +       /* Reuse L(first_vec_x3) for last VEC2 only for VEC_SIZE == 32.
> +          For VEC_SIZE == 64 the registers don't match.  */
> +L(last_vec_x2):
> +# endif
>  L(first_vec_x1):
>         /* Use bsf here to save 1-byte keeping keeping the block in 1x
>            fetch block. eax guranteed non-zero.  */
> -       bsfl    %eax, %eax
> +       bsf     %VRCX, %VRCX
>  # ifndef USE_AS_STRCHRNUL
> -       /* Found CHAR or the null byte.  */
> -       cmp     (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
> +       /* Found CHAR or the null byte.  */
> +       cmp     (VEC_SIZE)(%rdi, %rcx, CHAR_SIZE), %CHAR_REG
>         jne     L(zero)
> -
>  # endif
>         /* NB: Multiply sizeof char type (1 or 4) to get the number of
>            bytes.  */
> -       leaq    (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
> +       leaq    (VEC_SIZE)(%rdi, %rcx, CHAR_SIZE), %rax
>         ret
>
> -       .p2align 4,, 10
> +       .p2align 4,, 2
> +L(first_vec_x4):
> +       subq    $-(VEC_SIZE * 2), %rdi
>  L(first_vec_x2):
>  # ifndef USE_AS_STRCHRNUL
>         /* Check to see if first match was CHAR (k0) or null (k1).  */
> -       kmovd   %k0, %eax
> -       tzcntl  %eax, %eax
> -       kmovd   %k1, %ecx
> +       KMOV    %k0, %VRAX
> +       tzcnt   %VRAX, %VRAX
> +       KMOV    %k1, %VRCX
>         /* bzhil will not be 0 if first match was null.  */
> -       bzhil   %eax, %ecx, %ecx
> +       bzhi    %VRAX, %VRCX, %VRCX
>         jne     L(zero)
>  # else
>         /* Combine CHAR and null matches.  */
> -       kord    %k0, %k1, %k0
> -       kmovd   %k0, %eax
> -       tzcntl  %eax, %eax
> +       KOR     %k0, %k1, %k0
> +       KMOV    %k0, %VRAX
> +       bsf     %VRAX, %VRAX
>  # endif
>         /* NB: Multiply sizeof char type (1 or 4) to get the number of
>            bytes.  */
>         leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
>         ret
>
> -       .p2align 4,, 10
> -L(first_vec_x3):
> -       /* Use bsf here to save 1-byte keeping keeping the block in 1x
> -          fetch block. eax guranteed non-zero.  */
> -       bsfl    %eax, %eax
> -# ifndef USE_AS_STRCHRNUL
> -       /* Found CHAR or the null byte.  */
> -       cmp     (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
> -       jne     L(zero)
> +# ifdef USE_AS_STRCHRNUL
> +       /* We use this as a hook to get an imm8 encoding for the jmp to
> +          L(cross_page_boundary_real).  This allows the hot case of a
> +          match/null-term in the first VEC to fit entirely in 1 cache
> +          line.  */
> +L(cross_page_boundary):
> +       jmp     L(cross_page_boundary_real)
>  # endif
> -       /* NB: Multiply sizeof char type (1 or 4) to get the number of
> -          bytes.  */
> -       leaq    (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
> -       ret
>
>         .p2align 4
>  L(aligned_more):
> +L(cross_page_continue):
>         /* Align data to VEC_SIZE.  */
>         andq    $-VEC_SIZE, %rdi
> -L(cross_page_continue):
> -       /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time since
> -          data is only aligned to VEC_SIZE. Use two alternating methods
> -          for checking VEC to balance latency and port contention.  */
>
> -       /* This method has higher latency but has better port
> -          distribution.  */
> -       VMOVA   (VEC_SIZE)(%rdi), %YMM1
> +       /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time
> +          since data is only aligned to VEC_SIZE. Use two alternating
> +          methods for checking VEC to balance latency and port
> +          contention.  */
> +
> +       /* Method(1) with 8c latency:
> +          For VEC_SIZE == 32:
> +          p0 * 1.83, p1 * 0.83, p5 * 1.33
> +          For VEC_SIZE == 64:
> +          p0 * 2.50, p1 * 0.00, p5 * 1.50  */
> +       VMOVA   (VEC_SIZE)(%rdi), %VMM(1)
>         /* Leaves only CHARS matching esi as 0.  */
> -       vpxorq  %YMM1, %YMM0, %YMM2
> -       VPMINU  %YMM2, %YMM1, %YMM2
> -       /* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
> -       VPTESTN %YMM2, %YMM2, %k0
> -       kmovd   %k0, %eax
> -       testl   %eax, %eax
> +       vpxorq  %VMM(1), %VMATCH, %VMM(2)
> +       VPMINU  %VMM(2), %VMM(1), %VMM(2)
> +       /* Each bit in K0 represents a CHAR or a null byte in VEC_1.  */
> +       VPTESTN %VMM(2), %VMM(2), %k0
> +       KMOV    %k0, %VRCX
> +       test    %VRCX, %VRCX
>         jnz     L(first_vec_x1)
>
> -       /* This method has higher latency but has better port
> -          distribution.  */
> -       VMOVA   (VEC_SIZE * 2)(%rdi), %YMM1
> -       /* Each bit in K0 represents a CHAR in YMM1.  */
> -       VPCMP   $0, %YMM1, %YMM0, %k0
> -       /* Each bit in K1 represents a CHAR in YMM1.  */
> -       VPTESTN %YMM1, %YMM1, %k1
> -       kortestd        %k0, %k1
> +       /* Method(2) with 6c latency:
> +          For VEC_SIZE == 32:
> +          p0 * 1.00, p1 * 0.00, p5 * 2.00
> +          For VEC_SIZE == 64:
> +          p0 * 1.00, p1 * 0.00, p5 * 2.00  */
> +       VMOVA   (VEC_SIZE * 2)(%rdi), %VMM(1)
> +       /* Each bit in K0 represents a CHAR in VEC_1.  */
> +       VPCMPEQ %VMM(1), %VMATCH, %k0
> +       /* Each bit in K1 represents a CHAR in VEC_1.  */
> +       VPTESTN %VMM(1), %VMM(1), %k1
> +       KORTEST %k0, %k1
>         jnz     L(first_vec_x2)
>
> -       VMOVA   (VEC_SIZE * 3)(%rdi), %YMM1
> +       /* By swapping between Method 1/2 we get a fairer port
> +          distribution and better throughput.  */
> +
> +       VMOVA   (VEC_SIZE * 3)(%rdi), %VMM(1)
>         /* Leaves only CHARS matching esi as 0.  */
> -       vpxorq  %YMM1, %YMM0, %YMM2
> -       VPMINU  %YMM2, %YMM1, %YMM2
> -       /* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
> -       VPTESTN %YMM2, %YMM2, %k0
> -       kmovd   %k0, %eax
> -       testl   %eax, %eax
> +       vpxorq  %VMM(1), %VMATCH, %VMM(2)
> +       VPMINU  %VMM(2), %VMM(1), %VMM(2)
> +       /* Each bit in K0 represents a CHAR or a null byte in VEC_1.  */
> +       VPTESTN %VMM(2), %VMM(2), %k0
> +       KMOV    %k0, %VRCX
> +       test    %VRCX, %VRCX
>         jnz     L(first_vec_x3)
>
> -       VMOVA   (VEC_SIZE * 4)(%rdi), %YMM1
> -       /* Each bit in K0 represents a CHAR in YMM1.  */
> -       VPCMP   $0, %YMM1, %YMM0, %k0
> -       /* Each bit in K1 represents a CHAR in YMM1.  */
> -       VPTESTN %YMM1, %YMM1, %k1
> -       kortestd        %k0, %k1
> +       VMOVA   (VEC_SIZE * 4)(%rdi), %VMM(1)
> +       /* Each bit in K0 represents a CHAR in VEC_1.  */
> +       VPCMPEQ %VMM(1), %VMATCH, %k0
> +       /* Each bit in K1 represents a CHAR in VEC_1.  */
> +       VPTESTN %VMM(1), %VMM(1), %k1
> +       KORTEST %k0, %k1
>         jnz     L(first_vec_x4)
>
>         /* Align data to VEC_SIZE * 4 for the loop.  */
> +# if VEC_SIZE == 64
> +       /* Use rax for the loop reg as it allows the loop to fit in
> +          exactly 2 cache lines (more efficient imm32 + gpr
> +          encoding).  */
> +       leaq    (VEC_SIZE)(%rdi), %rax
> +       /* No partial register stalls on evex512 processors.  */
> +       xorb    %al, %al
> +# else
> +       /* For VEC_SIZE == 32 continue using rdi for loop reg so we can
> +          reuse more code and save space.  */
>         addq    $VEC_SIZE, %rdi
>         andq    $-(VEC_SIZE * 4), %rdi
> -
> +# endif
>         .p2align 4
>  L(loop_4x_vec):
> -       /* Check 4x VEC at a time. No penalty to imm32 offset with evex
> -          encoding.  */
> -       VMOVA   (VEC_SIZE * 4)(%rdi), %YMM1
> -       VMOVA   (VEC_SIZE * 5)(%rdi), %YMM2
> -       VMOVA   (VEC_SIZE * 6)(%rdi), %YMM3
> -       VMOVA   (VEC_SIZE * 7)(%rdi), %YMM4
> -
> -       /* For YMM1 and YMM3 use xor to set the CHARs matching esi to
> +       /* Check 4x VEC at a time. No penalty for imm32 offset with evex
> +          encoding (if offset % VEC_SIZE == 0).  */
> +       VMOVA   (VEC_SIZE * 4)(%LOOP_REG), %VMM(1)
> +       VMOVA   (VEC_SIZE * 5)(%LOOP_REG), %VMM(2)
> +       VMOVA   (VEC_SIZE * 6)(%LOOP_REG), %VMM(3)
> +       VMOVA   (VEC_SIZE * 7)(%LOOP_REG), %VMM(4)
> +
> +       /* Collect bits where VEC_1 does NOT match esi.  This is later
> +          used to mask off results (collecting non-matches allows us
> +          to save an instruction when combining).  */
> +       VPCMP   $4, %VMATCH, %VMM(1), %k1
> +
> +       /* Two methods for the loop depending on VEC_SIZE.  This is
> +          because with zmm registers VPMINU can only run on p0 (as
> +          opposed to p0/p1 for ymm) so it is less preferred.  */
> +# if VEC_SIZE == 32
> +       /* For VEC_2 and VEC_3 use xor to set the CHARs matching esi to
>            zero.  */
> -       vpxorq  %YMM1, %YMM0, %YMM5
> -       /* For YMM2 and YMM4 cmp not equals to CHAR and store result in
> -          k register. Its possible to save either 1 or 2 instructions
> -          using cmp no equals method for either YMM1 or YMM1 and YMM3
> -          respectively but bottleneck on p5 makes it not worth it.  */
> -       VPCMP   $4, %YMM0, %YMM2, %k2
> -       vpxorq  %YMM3, %YMM0, %YMM7
> -       VPCMP   $4, %YMM0, %YMM4, %k4
> -
> -       /* Use min to select all zeros from either xor or end of string).
> -        */
> -       VPMINU  %YMM1, %YMM5, %YMM1
> -       VPMINU  %YMM3, %YMM7, %YMM3
> +       vpxorq  %VMM(2), %VMATCH, %VMM(6)
> +       vpxorq  %VMM(3), %VMATCH, %VMM(7)
>
> -       /* Use min + zeromask to select for zeros. Since k2 and k4 will
> -          have 0 as positions that matched with CHAR which will set
> -          zero in the corresponding destination bytes in YMM2 / YMM4.
> -        */
> -       VPMINU  %YMM1, %YMM2, %YMM2{%k2}{z}
> -       VPMINU  %YMM3, %YMM4, %YMM4
> -       VPMINU  %YMM2, %YMM4, %YMM4{%k4}{z}
> -
> -       VPTESTN %YMM4, %YMM4, %k1
> -       kmovd   %k1, %ecx
> -       subq    $-(VEC_SIZE * 4), %rdi
> -       testl   %ecx, %ecx
> +       /* Find non-matches in VEC_4 while combining with non-matches
> +          from VEC_1.  NB: Try to use masked predicate execution on
> +          instructions that have a mask result, as it has no latency
> +          penalty.  */
> +       VPCMP   $4, %VMATCH, %VMM(4), %k4{%k1}
> +
> +       /* Combined zeros from VEC_1 / VEC_2 (search for null term).  */
> +       VPMINU  %VMM(1), %VMM(2), %VMM(2)
> +
> +       /* Use min to select all zeros from either the xor or the end
> +          of string.  */
> +       VPMINU  %VMM(3), %VMM(7), %VMM(3)
> +       VPMINU  %VMM(2), %VMM(6), %VMM(2)
> +
> +       /* Combined zeros from VEC_2 / VEC_3 (search for null term).  */
> +       VPMINU  %VMM(3), %VMM(4), %VMM(4)
> +
> +       /* Combined zeros from VEC_2 / VEC_4 (this has all null term and
> +          esi matches for VEC_2 / VEC_3).  */
> +       VPMINU  %VMM(2), %VMM(4), %VMM(4)
> +# else
> +       /* Collect non-matches for VEC_2.  */
> +       VPCMP   $4, %VMM(2), %VMATCH, %k2
> +
> +       /* Combined zeros from VEC_1 / VEC_2 (search for null term).  */
> +       VPMINU  %VMM(1), %VMM(2), %VMM(2)
> +
> +       /* Find non-matches in VEC_3/VEC_4 while combining with non-
> +          matches from VEC_1/VEC_2 respectively.  */
> +       VPCMP   $4, %VMM(3), %VMATCH, %k3{%k1}
> +       VPCMP   $4, %VMM(4), %VMATCH, %k4{%k2}
> +
> +       /* Finish combining zeros in all VECs.  */
> +       VPMINU  %VMM(3), %VMM(4), %VMM(4)
> +
> +       /* Combine in esi matches for VEC_3 (if there was a match with
> +          esi, the corresponding bit in %k3 is zero so the
> +          VPMINU_MASKZ will have a zero in the result).  NB: This makes
> +          the VPMINU 3c latency.  The only way to avoid it is to
> +          create a 12c dependency chain on all the `VPCMP $4, ...`,
> +          which has higher total latency.  */
> +       VPMINU  %VMM(2), %VMM(4), %VMM(4){%k3}{z}
> +# endif
> +       VPTEST  %VMM(4), %VMM(4), %k0{%k4}
> +       KMOV    %k0, %VRDX
> +       subq    $-(VEC_SIZE * 4), %LOOP_REG
> +
> +       /* TESTZ is an inc using the proper register width depending on
> +          CHAR_PER_VEC.  An esi match or null-term match leaves a zero-
> +          bit in rdx, so the inc won't wrap around to zero.  */
> +       TESTZ   (rdx)
>         jz      L(loop_4x_vec)
>
> -       VPTESTN %YMM1, %YMM1, %k0
> -       kmovd   %k0, %eax
> -       testl   %eax, %eax
> -       jnz     L(last_vec_x1)
> +       VPTEST  %VMM(1), %VMM(1), %k0{%k1}
> +       KMOV    %k0, %VGPR(MASK_GPR)
> +       TESTZ   (MASK_GPR)
> +# if VEC_SIZE == 32
> +       /* We can reuse the return code in page_cross logic for VEC_SIZE
> +          == 32.  */
> +       jnz     L(last_vec_x1_vec_size32)
> +# else
> +       jnz     L(last_vec_x1_vec_size64)
> +# endif
> +
>
> -       VPTESTN %YMM2, %YMM2, %k0
> -       kmovd   %k0, %eax
> -       testl   %eax, %eax
> +       /* COND_MASK integrates the esi matches for VEC_SIZE == 64.  For
> +          VEC_SIZE == 32 they are already integrated.  */
> +       VPTEST  %VMM(2), %VMM(2), %k0 COND_MASK(k2)
> +       KMOV    %k0, %VRCX
> +       TESTZ   (rcx)
>         jnz     L(last_vec_x2)
>
> -       VPTESTN %YMM3, %YMM3, %k0
> -       kmovd   %k0, %eax
> -       /* Combine YMM3 matches (eax) with YMM4 matches (ecx).  */
> -# ifdef USE_AS_WCSCHR
> -       sall    $8, %ecx
> -       orl     %ecx, %eax
> -       bsfl    %eax, %eax
> +       VPTEST  %VMM(3), %VMM(3), %k0 COND_MASK(k3)
> +       KMOV    %k0, %VRCX
> +# if CHAR_PER_VEC == 64
> +       TESTZ   (rcx)
> +       jnz     L(last_vec_x3)
>  # else
> -       salq    $32, %rcx
> -       orq     %rcx, %rax
> -       bsfq    %rax, %rax
> +       salq    $CHAR_PER_VEC, %rdx
> +       TESTZ   (rcx)
> +       orq     %rcx, %rdx
>  # endif
> +
> +       bsfq    %rdx, %rdx
> +
>  # ifndef USE_AS_STRCHRNUL
>         /* Check if match was CHAR or null.  */
> -       cmp     (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
> +       cmp     (LAST_VEC_OFFSET)(%LOOP_REG, %rdx, CHAR_SIZE), %CHAR_REG
>         jne     L(zero_end)
>  # endif
>         /* NB: Multiply sizeof char type (1 or 4) to get the number of
>            bytes.  */
> -       leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> +       leaq    (LAST_VEC_OFFSET)(%LOOP_REG, %rdx, CHAR_SIZE), %rax
>         ret
>
> -       .p2align 4,, 8
> -L(last_vec_x1):
> -       bsfl    %eax, %eax
> -# ifdef USE_AS_WCSCHR
> -       /* NB: Multiply wchar_t count by 4 to get the number of bytes.
> -          */
> -       leaq    (%rdi, %rax, CHAR_SIZE), %rax
> -# else
> -       addq    %rdi, %rax
> +# ifndef USE_AS_STRCHRNUL
> +L(zero_end):
> +       xorl    %eax, %eax
> +       ret
>  # endif
>
> -# ifndef USE_AS_STRCHRNUL
> +
> +       /* Separate return label for last VEC1 because for VEC_SIZE ==
> +          32 we can reuse the return code in L(page_cross) but
> +          VEC_SIZE == 64 has mismatched registers.  */
> +# if VEC_SIZE == 64
> +       .p2align 4,, 8
> +L(last_vec_x1_vec_size64):
> +       bsf     %VRCX, %VRCX
> +#  ifndef USE_AS_STRCHRNUL
>         /* Check if match was null.  */
> -       cmp     (%rax), %CHAR_REG
> +       cmp     (%rax, %rcx, CHAR_SIZE), %CHAR_REG
>         jne     L(zero_end)
> -# endif
> -
> +#  endif
> +#  ifdef USE_AS_WCSCHR
> +       /* NB: Multiply wchar_t count by 4 to get the number of bytes.
> +        */
> +       leaq    (%rax, %rcx, CHAR_SIZE), %rax
> +#  else
> +       addq    %rcx, %rax
> +#  endif
>         ret
>
> +       /* Since we can't combine the last 2x matches for CHAR_PER_VEC
> +          == 64 we need a return label for last VEC3.  */
> +#  if CHAR_PER_VEC == 64
>         .p2align 4,, 8
> +L(last_vec_x3):
> +       addq    $VEC_SIZE, %LOOP_REG
> +#  endif
> +
> +       /* Duplicate L(last_vec_x2) for VEC_SIZE == 64 because we can't
> +          reuse L(first_vec_x3) due to register mismatch.  */
>  L(last_vec_x2):
> -       bsfl    %eax, %eax
> -# ifndef USE_AS_STRCHRNUL
> +       bsf     %VGPR(MASK_GPR), %VGPR(MASK_GPR)
> +#  ifndef USE_AS_STRCHRNUL
>         /* Check if match was null.  */
> -       cmp     (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
> +       cmp     (VEC_SIZE * 1)(%LOOP_REG, %MASK_GPR, CHAR_SIZE), %CHAR_REG
>         jne     L(zero_end)
> -# endif
> +#  endif
>         /* NB: Multiply sizeof char type (1 or 4) to get the number of
>            bytes.  */
> -       leaq    (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
> +       leaq    (VEC_SIZE * 1)(%LOOP_REG, %MASK_GPR, CHAR_SIZE), %rax
>         ret
> +# endif
>
> -       /* Cold case for crossing page with first load.  */
> -       .p2align 4,, 8
> +       /* Cold case for crossing page with first load.  */
> +       .p2align 4,, 10
> +# ifndef USE_AS_STRCHRNUL
>  L(cross_page_boundary):
> -       movq    %rdi, %rdx
> +# endif
> +L(cross_page_boundary_real):
>         /* Align rdi.  */
> -       andq    $-VEC_SIZE, %rdi
> -       VMOVA   (%rdi), %YMM1
> -       /* Leaves only CHARS matching esi as 0.  */
> -       vpxorq  %YMM1, %YMM0, %YMM2
> -       VPMINU  %YMM2, %YMM1, %YMM2
> -       /* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
> -       VPTESTN %YMM2, %YMM2, %k0
> -       kmovd   %k0, %eax
> +       xorq    %rdi, %rax
> +       VMOVA   (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(1)
> +       /* Use the higher-latency method of getting matches to save
> +          code size.  */
> +
> +       /* K1 has 1s where VEC(1) does NOT match esi.  */
> +       VPCMP   $4, %VMM(1), %VMATCH, %k1
> +       /* K0 has ones where K1 is 1 (non-match with esi) and the
> +          char is non-zero (not null).  */
> +       VPTEST  %VMM(1), %VMM(1), %k0{%k1}
> +       KMOV    %k0, %VRAX
>         /* Remove the leading bits.  */
>  # ifdef USE_AS_WCSCHR
> -       movl    %edx, %SHIFT_REG
> +       movl    %edi, %VGPR_SZ(SHIFT_REG, 32)
>         /* NB: Divide shift count by 4 since each bit in K1 represent 4
>            bytes.  */
> -       sarl    $2, %SHIFT_REG
> -       andl    $(CHAR_PER_VEC - 1), %SHIFT_REG
> +       sarl    $2, %VGPR_SZ(SHIFT_REG, 32)
> +       andl    $(CHAR_PER_VEC - 1), %VGPR_SZ(SHIFT_REG, 32)
> +
> +       /* If wcschr we need to reverse the matches as we can't rely
> +          on a signed shift to bring in ones.  There is no sarx for
> +          gpr8/16.  Also note we can't use inc here as the lower bits
> +          represent matches out of range so we can't rely on
> +          overflow.  */
> +       xorl    $((1 << CHAR_PER_VEC)- 1), %eax
> +# endif
> +       /* Use arithmetic shift so that leading 1s are filled in.  */
> +       sarx    %VGPR(SHIFT_REG), %VRAX, %VRAX
> +       /* If eax is all ones then there are no matches for esi or
> +          null.  */
> +
> +# ifdef USE_AS_WCSCHR
> +       test    %VRAX, %VRAX
> +# else
> +       inc     %VRAX
>  # endif
> -       sarxl   %SHIFT_REG, %eax, %eax
> -       /* If eax is zero continue.  */
> -       testl   %eax, %eax
>         jz      L(cross_page_continue)
> -       bsfl    %eax, %eax
>
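A note on the page-cross sequence above, since it is dense: the load is
aligned down so it cannot fault, VPCMP $4 plus the masked VPTEST build
a mask whose bit i is 1 when char i is neither the seek char nor null,
and the arithmetic shift drops the chars sitting before the string
start while sign-filling the top.  The inc then does double duty: an
all-ones mask (no hit in range) wraps to zero for the jz, and otherwise
the carry turns the lowest 0 bit into the lowest 1 so the following bsf
lands on the first hit.  A minimal C sketch of the byte-size variant,
under hypothetical names (not_hit stands for the mask produced as
above):

    #include <stdint.h>

    /* `not_hit': bit i set when byte i of the aligned vector is
       neither the seek char nor NUL.  `shift': misalignment of the
       string start within the vector.  Returns the offset of the
       first hit from the string start, or -1 for no hit in range.  */
    static inline long
    cross_page_first_hit (uint64_t not_hit, unsigned int shift)
    {
      /* Arithmetic shift: out-of-range leading bytes fall off the
         bottom; sign-fill bits land above any real hit, so they never
         affect the ctz below.  */
      uint64_t m = (uint64_t) ((int64_t) not_hit >> shift);
      m += 1;                      /* all-ones (no hit) wraps to 0 */
      if (m == 0)
        return -1;                 /* continue into the aligned loop */
      return __builtin_ctzll (m);  /* first seek char or null (bsf) */
    }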
> +       .p2align 4,, 10
> +L(last_vec_x1_vec_size32):
> +       bsf     %VRAX, %VRAX
>  # ifdef USE_AS_WCSCHR
> -       /* NB: Multiply wchar_t count by 4 to get the number of
> -          bytes.  */
> -       leaq    (%rdx, %rax, CHAR_SIZE), %rax
> +       /* NB: Multiply wchar_t count by 4 to get the number of bytes.
> +        */
> +       leaq    (%rdi, %rax, CHAR_SIZE), %rax
>  # else
> -       addq    %rdx, %rax
> +       addq    %rdi, %rax
>  # endif
>  # ifndef USE_AS_STRCHRNUL
>         /* Check to see if match was CHAR or null.  */
>         cmp     (%rax), %CHAR_REG
> -       je      L(cross_page_ret)
> -L(zero_end):
> -       xorl    %eax, %eax
> -L(cross_page_ret):
> +       jne     L(zero_end_0)
>  # endif
>         ret
> +# ifndef USE_AS_STRCHRNUL
> +L(zero_end_0):
> +       xorl    %eax, %eax
> +       ret
> +# endif
>
>  END (STRCHR)
>  #endif
> --
> 2.34.1
>
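One last note, on the wide-char branch of that page-cross path: with
CHAR_PER_VEC at 8 or 16 the mask does not fill a GPR, so as the comment
in the patch says there is no sign bit to usefully shift in and no sarx
for 8/16-bit registers.  The code therefore inverts the mask first (a 1
bit now means seek char or null), shifts, and tests for zero instead of
using the inc trick.  A minimal C sketch under the same hypothetical
naming as before:

    #include <stdint.h>

    /* `shift' is the misalignment in characters (the byte offset
       divided by 4); `chars_per_vec' is 8 or 16.  */
    static inline long
    cross_page_first_hit_wide (uint32_t not_hit, unsigned int shift,
                               unsigned int chars_per_vec)
    {
      /* Invert within the valid bits: 1 now means seek char or null.
         Bits above chars_per_vec stay 0 ("no hit"), so a plain shift
         is safe.  */
      uint32_t hit = not_hit ^ ((1u << chars_per_vec) - 1);
      hit >>= shift;
      if (hit == 0)
        return -1;                 /* continue into the aligned loop */
      return __builtin_ctz (hit);  /* first seek char or null */
    }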

[-- Attachment #2: strchr.txt --]
[-- Type: text/plain, Size: 43850 bytes --]

Results For: strchr
alignment,length ,max_char ,pos       ,rand ,seek_char ,__strchr_evex ,__strchr_evex_orig ,new/old 
0        ,1      ,127      ,0         ,0    ,0         ,3.484         ,3.482              ,1.001 
0        ,1      ,127      ,0         ,0    ,23        ,3.549         ,3.577              ,0.992 
0        ,10     ,127      ,9         ,0    ,0         ,3.764         ,3.774              ,0.997 
0        ,10     ,127      ,9         ,0    ,23        ,3.667         ,3.725              ,0.985 
0        ,1024   ,127      ,1056      ,0    ,0         ,29.009        ,29.496             ,0.983 
0        ,1024   ,127      ,1088      ,0    ,0         ,30.558        ,29.533             ,1.035 
0        ,1024   ,127      ,1120      ,0    ,0         ,28.984        ,29.538             ,0.981 
0        ,1024   ,127      ,1152      ,0    ,0         ,29.12         ,29.453             ,0.989 
0        ,1024   ,127      ,1184      ,0    ,0         ,28.992        ,29.719             ,0.976 
0        ,1024   ,127      ,1216      ,0    ,0         ,29.231        ,29.728             ,0.983 
0        ,1024   ,127      ,1248      ,0    ,0         ,28.974        ,29.482             ,0.983 
0        ,1024   ,127      ,1280      ,0    ,0         ,30.446        ,31.0               ,0.982 
0        ,1024   ,127      ,1312      ,0    ,0         ,28.923        ,29.424             ,0.983 
0        ,1024   ,127      ,1344      ,0    ,0         ,29.066        ,29.51              ,0.985 
0        ,1024   ,127      ,704       ,0    ,0         ,23.787        ,24.111             ,0.987 
0        ,1024   ,127      ,736       ,0    ,0         ,24.089        ,23.965             ,1.005 
0        ,1024   ,127      ,768       ,0    ,0         ,23.96         ,24.187             ,0.991 
0        ,1024   ,127      ,800       ,0    ,0         ,24.756        ,25.882             ,0.957 
0        ,1024   ,127      ,832       ,0    ,0         ,27.218        ,27.062             ,1.006 
0        ,1024   ,127      ,864       ,0    ,0         ,26.651        ,27.02              ,0.986 
0        ,1024   ,127      ,896       ,0    ,0         ,26.368        ,26.469             ,0.996 
0        ,1024   ,127      ,928       ,0    ,0         ,27.253        ,28.029             ,0.972 
0        ,1024   ,127      ,960       ,0    ,0         ,28.766        ,29.732             ,0.968 
0        ,1024   ,127      ,992       ,0    ,0         ,29.113        ,29.589             ,0.984 
0        ,1056   ,127      ,1024      ,0    ,0         ,29.376        ,29.74              ,0.988 
0        ,1088   ,127      ,1024      ,0    ,0         ,28.924        ,29.572             ,0.978 
0        ,11     ,127      ,10        ,0    ,0         ,3.801         ,3.762              ,1.01  
0        ,11     ,127      ,10        ,0    ,23        ,3.867         ,3.724              ,1.038 
0        ,112    ,127      ,16        ,0    ,0         ,3.581         ,3.788              ,0.945 
0        ,1120   ,127      ,1024      ,0    ,0         ,28.917        ,29.617             ,0.976 
0        ,1152   ,127      ,1024      ,0    ,0         ,29.024        ,29.636             ,0.979 
0        ,1184   ,127      ,1024      ,0    ,0         ,29.117        ,29.367             ,0.991 
0        ,12     ,127      ,11        ,0    ,0         ,3.749         ,3.813              ,0.983 
0        ,12     ,127      ,11        ,0    ,23        ,3.85          ,3.75               ,1.027 
0        ,1216   ,127      ,1024      ,0    ,0         ,32.235        ,32.195             ,1.001 
0        ,1248   ,127      ,1024      ,0    ,0         ,29.111        ,29.558             ,0.985 
0        ,128    ,127      ,160       ,0    ,0         ,7.904         ,8.004              ,0.988 
0        ,128    ,127      ,192       ,0    ,0         ,7.678         ,8.022              ,0.957 
0        ,128    ,127      ,224       ,0    ,0         ,7.665         ,7.954              ,0.964 
0        ,128    ,127      ,256       ,0    ,0         ,7.697         ,7.944              ,0.969 
0        ,128    ,127      ,288       ,0    ,0         ,7.658         ,7.986              ,0.959 
0        ,128    ,127      ,32        ,0    ,0         ,4.469         ,5.122              ,0.873 
0        ,128    ,127      ,320       ,0    ,0         ,7.617         ,7.951              ,0.958 
0        ,128    ,127      ,352       ,0    ,0         ,7.67          ,7.933              ,0.967 
0        ,128    ,127      ,384       ,0    ,0         ,7.67          ,7.962              ,0.963 
0        ,128    ,127      ,416       ,0    ,0         ,7.642         ,7.925              ,0.964 
0        ,128    ,127      ,448       ,0    ,0         ,7.694         ,8.028              ,0.958 
0        ,128    ,127      ,64        ,0    ,0         ,5.725         ,6.131              ,0.934 
0        ,128    ,127      ,96        ,0    ,0         ,6.267         ,6.434              ,0.974 
0        ,1280   ,127      ,1024      ,0    ,0         ,28.901        ,29.648             ,0.975 
0        ,13     ,127      ,12        ,0    ,0         ,3.878         ,3.87               ,1.002 
0        ,13     ,127      ,12        ,0    ,23        ,3.908         ,3.798              ,1.029 
0        ,1312   ,127      ,1024      ,0    ,0         ,29.025        ,29.584             ,0.981 
0        ,1344   ,127      ,1024      ,0    ,0         ,29.021        ,29.673             ,0.978 
0        ,14     ,127      ,13        ,0    ,0         ,3.717         ,3.81               ,0.976 
0        ,14     ,127      ,13        ,0    ,23        ,3.882         ,3.824              ,1.015 
0        ,144    ,127      ,16        ,0    ,0         ,3.672         ,3.791              ,0.968 
0        ,15     ,127      ,14        ,0    ,0         ,3.635         ,3.822              ,0.951 
0        ,15     ,127      ,14        ,0    ,23        ,3.923         ,3.944              ,0.995 
0        ,16     ,127      ,112       ,0    ,0         ,3.77          ,3.777              ,0.998 
0        ,16     ,127      ,144       ,0    ,0         ,3.639         ,3.777              ,0.963 
0        ,16     ,127      ,15        ,0    ,0         ,3.757         ,3.882              ,0.968 
0        ,16     ,127      ,15        ,0    ,23        ,3.785         ,3.842              ,0.985 
0        ,16     ,127      ,176       ,0    ,0         ,3.624         ,3.797              ,0.954 
0        ,16     ,127      ,208       ,0    ,0         ,3.773         ,3.785              ,0.997 
0        ,16     ,127      ,240       ,0    ,0         ,3.705         ,3.8                ,0.975 
0        ,16     ,127      ,272       ,0    ,0         ,3.679         ,3.693              ,0.996 
0        ,16     ,127      ,304       ,0    ,0         ,3.651         ,3.87               ,0.943 
0        ,16     ,127      ,336       ,0    ,0         ,3.882         ,3.79               ,1.024 
0        ,16     ,127      ,48        ,0    ,0         ,3.59          ,3.675              ,0.977 
0        ,16     ,127      ,80        ,0    ,0         ,3.705         ,3.756              ,0.986 
0        ,160    ,127      ,128       ,0    ,0         ,7.638         ,7.97               ,0.958 
0        ,160    ,127      ,256       ,0    ,0         ,11.478        ,11.872             ,0.967 
0        ,160    ,127      ,32        ,0    ,0         ,4.484         ,5.186              ,0.865 
0        ,160    ,127      ,64        ,0    ,0         ,5.722         ,6.116              ,0.936 
0        ,17     ,127      ,16        ,0    ,0         ,3.78          ,3.857              ,0.98  
0        ,17     ,127      ,16        ,0    ,23        ,3.808         ,3.803              ,1.001 
0        ,1728   ,127      ,2048      ,0    ,0         ,44.094        ,44.594             ,0.989 
0        ,176    ,127      ,16        ,0    ,0         ,3.691         ,3.774              ,0.978 
0        ,1760   ,127      ,2048      ,0    ,0         ,44.531        ,44.645             ,0.997 
0        ,1792   ,127      ,2048      ,0    ,0         ,44.158        ,44.741             ,0.987 
0        ,18     ,127      ,17        ,0    ,0         ,3.749         ,3.829              ,0.979 
0        ,18     ,127      ,17        ,0    ,23        ,3.763         ,3.821              ,0.985 
0        ,1824   ,127      ,2048      ,0    ,0         ,45.962        ,47.852             ,0.961 
0        ,1856   ,127      ,2048      ,0    ,0         ,46.911        ,46.994             ,0.998 
0        ,1888   ,127      ,2048      ,0    ,0         ,46.859        ,47.08              ,0.995 
0        ,19     ,127      ,18        ,0    ,0         ,3.794         ,3.774              ,1.005 
0        ,19     ,127      ,18        ,0    ,23        ,3.716         ,3.831              ,0.97  
0        ,192    ,127      ,128       ,0    ,0         ,7.664         ,8.001              ,0.958 
0        ,192    ,127      ,256       ,0    ,0         ,13.449        ,13.331             ,1.009 
0        ,192    ,127      ,32        ,0    ,0         ,4.61          ,5.183              ,0.889 
0        ,192    ,127      ,512       ,0    ,0         ,12.659        ,13.106             ,0.966 
0        ,192    ,127      ,64        ,0    ,0         ,5.733         ,6.114              ,0.938 
0        ,1920   ,127      ,2048      ,0    ,0         ,46.512        ,46.564             ,0.999 
0        ,1952   ,127      ,2048      ,0    ,0         ,47.817        ,48.691             ,0.982 
0        ,1984   ,127      ,2048      ,0    ,0         ,49.355        ,50.161             ,0.984 
0        ,2      ,127      ,1         ,0    ,0         ,3.699         ,3.743              ,0.988 
0        ,2      ,127      ,1         ,0    ,23        ,3.697         ,3.704              ,0.998 
0        ,20     ,127      ,19        ,0    ,0         ,3.717         ,3.758              ,0.989 
0        ,20     ,127      ,19        ,0    ,23        ,3.662         ,3.829              ,0.956 
0        ,2016   ,127      ,2048      ,0    ,0         ,49.752        ,49.795             ,0.999 
0        ,2048   ,127      ,1024      ,0    ,0         ,31.515        ,30.241             ,1.042 
0        ,2048   ,127      ,1024      ,0    ,23        ,29.306        ,30.083             ,0.974 
0        ,2048   ,127      ,128       ,0    ,0         ,7.675         ,8.03               ,0.956 
0        ,2048   ,127      ,128       ,0    ,23        ,7.827         ,7.93               ,0.987 
0        ,2048   ,127      ,1728      ,0    ,0         ,44.263        ,44.614             ,0.992 
0        ,2048   ,127      ,1760      ,0    ,0         ,44.122        ,44.538             ,0.991 
0        ,2048   ,127      ,1792      ,0    ,0         ,44.0          ,44.677             ,0.985 
0        ,2048   ,127      ,1824      ,0    ,0         ,45.275        ,46.338             ,0.977 
0        ,2048   ,127      ,1856      ,0    ,0         ,46.763        ,47.028             ,0.994 
0        ,2048   ,127      ,1888      ,0    ,0         ,46.854        ,47.025             ,0.996 
0        ,2048   ,127      ,1920      ,0    ,0         ,46.518        ,46.679             ,0.997 
0        ,2048   ,127      ,1952      ,0    ,0         ,47.9          ,48.726             ,0.983 
0        ,2048   ,127      ,1984      ,0    ,0         ,49.596        ,49.835             ,0.995 
0        ,2048   ,127      ,2016      ,0    ,0         ,49.767        ,49.671             ,1.002 
0        ,2048   ,127      ,2048      ,0    ,0         ,49.438        ,49.743             ,0.994 
0        ,2048   ,127      ,2048      ,0    ,23        ,49.619        ,51.643             ,0.961 
0        ,2048   ,127      ,2080      ,0    ,0         ,49.35         ,49.306             ,1.001 
0        ,2048   ,127      ,2112      ,0    ,0         ,49.517        ,49.302             ,1.004 
0        ,2048   ,127      ,2144      ,0    ,0         ,49.677        ,49.31              ,1.007 
0        ,2048   ,127      ,2176      ,0    ,0         ,51.055        ,49.334             ,1.035 
0        ,2048   ,127      ,2208      ,0    ,0         ,48.811        ,49.293             ,0.99  
0        ,2048   ,127      ,2240      ,0    ,0         ,49.336        ,49.366             ,0.999 
0        ,2048   ,127      ,2272      ,0    ,0         ,49.354        ,49.432             ,0.998 
0        ,2048   ,127      ,2304      ,0    ,0         ,49.361        ,49.314             ,1.001 
0        ,2048   ,127      ,2336      ,0    ,0         ,50.948        ,49.404             ,1.031 
0        ,2048   ,127      ,2368      ,0    ,0         ,49.49         ,49.145             ,1.007 
0        ,2048   ,127      ,256       ,0    ,0         ,13.201        ,13.373             ,0.987 
0        ,2048   ,127      ,256       ,0    ,23        ,13.305        ,13.412             ,0.992 
0        ,2048   ,127      ,32        ,0    ,0         ,4.417         ,5.272              ,0.838 
0        ,2048   ,127      ,32        ,0    ,23        ,4.279         ,4.939              ,0.866 
0        ,2048   ,127      ,512       ,0    ,0         ,19.755        ,20.248             ,0.976 
0        ,2048   ,127      ,512       ,0    ,23        ,19.304        ,19.867             ,0.972 
0        ,2048   ,127      ,64        ,0    ,0         ,5.777         ,6.178              ,0.935 
0        ,2048   ,127      ,64        ,0    ,23        ,5.782         ,6.039              ,0.957 
0        ,208    ,127      ,16        ,0    ,0         ,3.842         ,3.815              ,1.007 
0        ,2080   ,127      ,2048      ,0    ,0         ,50.755        ,49.293             ,1.03  
0        ,21     ,127      ,20        ,0    ,0         ,3.639         ,3.785              ,0.961 
0        ,21     ,127      ,20        ,0    ,23        ,3.782         ,3.783              ,1.0   
0        ,2112   ,127      ,2048      ,0    ,0         ,49.595        ,49.264             ,1.007 
0        ,2144   ,127      ,2048      ,0    ,0         ,48.922        ,49.41              ,0.99  
0        ,2176   ,127      ,2048      ,0    ,0         ,49.269        ,49.334             ,0.999 
0        ,22     ,127      ,21        ,0    ,0         ,3.809         ,3.81               ,1.0   
0        ,22     ,127      ,21        ,0    ,23        ,3.766         ,3.815              ,0.987 
0        ,2208   ,127      ,2048      ,0    ,0         ,49.252        ,49.21              ,1.001 
0        ,224    ,127      ,128       ,0    ,0         ,7.663         ,8.045              ,0.952 
0        ,224    ,127      ,256       ,0    ,0         ,12.533        ,13.126             ,0.955 
0        ,224    ,127      ,32        ,0    ,0         ,4.526         ,5.154              ,0.878 
0        ,224    ,127      ,512       ,0    ,0         ,12.546        ,13.085             ,0.959 
0        ,224    ,127      ,64        ,0    ,0         ,5.732         ,6.097              ,0.94  
0        ,2240   ,127      ,2048      ,0    ,0         ,49.802        ,49.194             ,1.012 
0        ,2272   ,127      ,2048      ,0    ,0         ,49.469        ,49.332             ,1.003 
0        ,23     ,127      ,22        ,0    ,0         ,3.958         ,3.873              ,1.022 
0        ,23     ,127      ,22        ,0    ,23        ,3.796         ,3.838              ,0.989 
0        ,2304   ,127      ,2048      ,0    ,0         ,49.215        ,49.287             ,0.999 
0        ,2336   ,127      ,2048      ,0    ,0         ,49.271        ,49.267             ,1.0   
0        ,2368   ,127      ,2048      ,0    ,0         ,49.236        ,49.279             ,0.999 
0        ,24     ,127      ,23        ,0    ,0         ,3.646         ,3.808              ,0.958 
0        ,24     ,127      ,23        ,0    ,23        ,3.839         ,3.779              ,1.016 
0        ,240    ,127      ,16        ,0    ,0         ,3.768         ,3.827              ,0.984 
0        ,25     ,127      ,24        ,0    ,0         ,3.785         ,3.813              ,0.993 
0        ,25     ,127      ,24        ,0    ,23        ,3.853         ,3.838              ,1.004 
0        ,256    ,127      ,128       ,0    ,0         ,7.66          ,7.949              ,0.964 
0        ,256    ,127      ,160       ,0    ,0         ,12.312        ,12.208             ,1.009 
0        ,256    ,127      ,192       ,0    ,0         ,12.436        ,13.071             ,0.951 
0        ,256    ,127      ,224       ,0    ,0         ,12.381        ,13.039             ,0.949 
0        ,256    ,127      ,288       ,0    ,0         ,13.236        ,13.383             ,0.989 
0        ,256    ,127      ,32        ,0    ,0         ,4.482         ,5.181              ,0.865 
0        ,256    ,127      ,320       ,0    ,0         ,13.176        ,13.428             ,0.981 
0        ,256    ,127      ,352       ,0    ,0         ,13.174        ,13.41              ,0.982 
0        ,256    ,127      ,384       ,0    ,0         ,13.2          ,13.363             ,0.988 
0        ,256    ,127      ,416       ,0    ,0         ,13.196        ,13.39              ,0.985 
0        ,256    ,127      ,448       ,0    ,0         ,13.205        ,13.356             ,0.989 
0        ,256    ,127      ,480       ,0    ,0         ,13.28         ,13.438             ,0.988 
0        ,256    ,127      ,512       ,0    ,0         ,13.222        ,13.408             ,0.986 
0        ,256    ,127      ,544       ,0    ,0         ,13.202        ,13.366             ,0.988 
0        ,256    ,127      ,576       ,0    ,0         ,13.238        ,13.423             ,0.986 
0        ,256    ,127      ,64        ,0    ,0         ,5.76          ,6.152              ,0.936 
0        ,256    ,127      ,96        ,0    ,0         ,6.315         ,6.431              ,0.982 
0        ,26     ,127      ,25        ,0    ,0         ,3.771         ,3.751              ,1.005 
0        ,26     ,127      ,25        ,0    ,23        ,3.723         ,3.83               ,0.972 
0        ,27     ,127      ,26        ,0    ,0         ,3.72          ,3.799              ,0.979 
0        ,27     ,127      ,26        ,0    ,23        ,3.613         ,3.727              ,0.969 
0        ,272    ,127      ,16        ,0    ,0         ,3.8           ,3.681              ,1.032 
0        ,28     ,127      ,27        ,0    ,0         ,3.77          ,3.881              ,0.972 
0        ,28     ,127      ,27        ,0    ,23        ,3.767         ,3.77               ,0.999 
0        ,288    ,127      ,128       ,0    ,0         ,7.678         ,7.993              ,0.96  
0        ,288    ,127      ,256       ,0    ,0         ,13.302        ,13.406             ,0.992 
0        ,288    ,127      ,32        ,0    ,0         ,4.601         ,5.171              ,0.89  
0        ,288    ,127      ,512       ,0    ,0         ,14.188        ,14.563             ,0.974 
0        ,288    ,127      ,64        ,0    ,0         ,5.692         ,6.088              ,0.935 
0        ,29     ,127      ,28        ,0    ,0         ,3.716         ,3.834              ,0.969 
0        ,29     ,127      ,28        ,0    ,23        ,3.636         ,3.694              ,0.984 
0        ,3      ,127      ,2         ,0    ,0         ,3.639         ,3.763              ,0.967 
0        ,3      ,127      ,2         ,0    ,23        ,3.932         ,3.78               ,1.04  
0        ,30     ,127      ,29        ,0    ,0         ,4.022         ,3.916              ,1.027 
0        ,30     ,127      ,29        ,0    ,23        ,3.74          ,3.705              ,1.01  
0        ,304    ,127      ,16        ,0    ,0         ,3.624         ,3.732              ,0.971 
0        ,31     ,127      ,30        ,0    ,0         ,3.389         ,3.714              ,0.912 
0        ,31     ,127      ,30        ,0    ,23        ,3.687         ,3.798              ,0.971 
0        ,32     ,127      ,128       ,0    ,0         ,4.491         ,5.165              ,0.869 
0        ,32     ,127      ,160       ,0    ,0         ,4.574         ,5.127              ,0.892 
0        ,32     ,127      ,192       ,0    ,0         ,4.556         ,5.205              ,0.875 
0        ,32     ,127      ,224       ,0    ,0         ,4.461         ,5.18               ,0.861 
0        ,32     ,127      ,256       ,0    ,0         ,4.531         ,5.221              ,0.868 
0        ,32     ,127      ,288       ,0    ,0         ,4.478         ,5.097              ,0.879 
0        ,32     ,127      ,31        ,0    ,0         ,3.625         ,3.727              ,0.973 
0        ,32     ,127      ,31        ,0    ,23        ,3.565         ,3.746              ,0.952 
0        ,32     ,127      ,320       ,0    ,0         ,4.466         ,5.132              ,0.87  
0        ,32     ,127      ,352       ,0    ,0         ,4.445         ,5.156              ,0.862 
0        ,32     ,127      ,64        ,0    ,0         ,4.436         ,5.158              ,0.86  
0        ,32     ,127      ,96        ,0    ,0         ,4.524         ,5.177              ,0.874 
0        ,320    ,127      ,128       ,0    ,0         ,7.679         ,8.009              ,0.959 
0        ,320    ,127      ,256       ,0    ,0         ,13.156        ,13.269             ,0.991 
0        ,320    ,127      ,32        ,0    ,0         ,4.406         ,5.076              ,0.868 
0        ,320    ,127      ,512       ,0    ,0         ,15.267        ,15.689             ,0.973 
0        ,320    ,127      ,64        ,0    ,0         ,5.728         ,6.071              ,0.943 
0        ,336    ,127      ,16        ,0    ,0         ,3.546         ,3.785              ,0.937 
0        ,352    ,127      ,128       ,0    ,0         ,7.629         ,7.947              ,0.96  
0        ,352    ,127      ,256       ,0    ,0         ,13.186        ,13.265             ,0.994 
0        ,352    ,127      ,32        ,0    ,0         ,4.472         ,5.164              ,0.866 
0        ,352    ,127      ,512       ,0    ,0         ,15.227        ,15.664             ,0.972 
0        ,352    ,127      ,64        ,0    ,0         ,5.718         ,6.146              ,0.93  
0        ,3776   ,127      ,4096      ,0    ,0         ,105.51        ,107.765            ,0.979 
0        ,3808   ,127      ,4096      ,0    ,0         ,106.367       ,108.324            ,0.982 
0        ,384    ,127      ,128       ,0    ,0         ,7.676         ,7.958              ,0.965 
0        ,384    ,127      ,256       ,0    ,0         ,13.166        ,13.286             ,0.991 
0        ,384    ,127      ,512       ,0    ,0         ,15.978        ,16.496             ,0.969 
0        ,384    ,127      ,64        ,0    ,0         ,5.725         ,6.119              ,0.936 
0        ,3840   ,127      ,4096      ,0    ,0         ,109.166       ,109.746            ,0.995 
0        ,3872   ,127      ,4096      ,0    ,0         ,110.249       ,111.264            ,0.991 
0        ,3904   ,127      ,4096      ,0    ,0         ,109.902       ,110.697            ,0.993 
0        ,3936   ,127      ,4096      ,0    ,0         ,111.099       ,110.706            ,1.004 
0        ,3968   ,127      ,4096      ,0    ,0         ,111.392       ,111.842            ,0.996 
0        ,4      ,127      ,3         ,0    ,0         ,3.839         ,3.808              ,1.008 
0        ,4      ,127      ,3         ,0    ,23        ,3.856         ,3.77               ,1.023 
0        ,4000   ,127      ,4096      ,0    ,0         ,116.589       ,114.016            ,1.023 
0        ,4032   ,127      ,4096      ,0    ,0         ,110.905       ,112.745            ,0.984 
0        ,4064   ,127      ,4096      ,0    ,0         ,111.287       ,112.624            ,0.988 
0        ,4096   ,127      ,1024      ,0    ,0         ,29.657        ,30.043             ,0.987 
0        ,4096   ,127      ,1024      ,0    ,23        ,29.48         ,30.512             ,0.966 
0        ,4096   ,127      ,128       ,0    ,0         ,7.7           ,7.971              ,0.966 
0        ,4096   ,127      ,128       ,0    ,23        ,7.755         ,8.1                ,0.957 
0        ,4096   ,127      ,2048      ,0    ,0         ,49.751        ,49.548             ,1.004 
0        ,4096   ,127      ,2048      ,0    ,23        ,49.523        ,50.29              ,0.985 
0        ,4096   ,127      ,256       ,0    ,0         ,13.245        ,13.453             ,0.985 
0        ,4096   ,127      ,256       ,0    ,23        ,13.209        ,13.294             ,0.994 
0        ,4096   ,127      ,32        ,0    ,0         ,4.265         ,5.024              ,0.849 
0        ,4096   ,127      ,32        ,0    ,23        ,4.281         ,5.055              ,0.847 
0        ,4096   ,127      ,3776      ,0    ,0         ,105.786       ,107.432            ,0.985 
0        ,4096   ,127      ,3808      ,0    ,0         ,106.443       ,107.572            ,0.99  
0        ,4096   ,127      ,3840      ,0    ,0         ,108.991       ,108.912            ,1.001 
0        ,4096   ,127      ,3872      ,0    ,0         ,111.415       ,110.611            ,1.007 
0        ,4096   ,127      ,3904      ,0    ,0         ,110.989       ,111.712            ,0.994 
0        ,4096   ,127      ,3936      ,0    ,0         ,109.447       ,110.444            ,0.991 
0        ,4096   ,127      ,3968      ,0    ,0         ,111.311       ,111.836            ,0.995 
0        ,4096   ,127      ,4000      ,0    ,0         ,113.892       ,113.212            ,1.006 
0        ,4096   ,127      ,4032      ,0    ,0         ,111.372       ,112.833            ,0.987 
0        ,4096   ,127      ,4064      ,0    ,0         ,111.099       ,112.903            ,0.984 
0        ,4096   ,127      ,4128      ,0    ,0         ,114.014       ,114.658            ,0.994 
0        ,4096   ,127      ,4160      ,0    ,0         ,114.292       ,114.87             ,0.995 
0        ,4096   ,127      ,4192      ,0    ,0         ,113.46        ,115.051            ,0.986 
0        ,4096   ,127      ,4224      ,0    ,0         ,117.617       ,114.589            ,1.026 
0        ,4096   ,127      ,4256      ,0    ,0         ,113.151       ,114.284            ,0.99  
0        ,4096   ,127      ,4288      ,0    ,0         ,114.383       ,114.095            ,1.003 
0        ,4096   ,127      ,4320      ,0    ,0         ,114.065       ,114.231            ,0.999 
0        ,4096   ,127      ,4352      ,0    ,0         ,113.966       ,114.57             ,0.995 
0        ,4096   ,127      ,4384      ,0    ,0         ,115.202       ,114.359            ,1.007 
0        ,4096   ,127      ,4416      ,0    ,0         ,112.809       ,115.726            ,0.975 
0        ,4096   ,127      ,512       ,0    ,0         ,18.721        ,19.765             ,0.947 
0        ,4096   ,127      ,512       ,0    ,23        ,19.174        ,20.009             ,0.958 
0        ,4096   ,127      ,64        ,0    ,0         ,5.695         ,6.177              ,0.922 
0        ,4096   ,127      ,64        ,0    ,23        ,5.749         ,6.158              ,0.934 
0        ,4128   ,127      ,4096      ,0    ,0         ,114.74        ,115.58             ,0.993 
0        ,416    ,127      ,128       ,0    ,0         ,7.664         ,7.92               ,0.968 
0        ,416    ,127      ,256       ,0    ,0         ,13.141        ,13.309             ,0.987 
0        ,416    ,127      ,512       ,0    ,0         ,16.533        ,17.572             ,0.941 
0        ,4160   ,127      ,4096      ,0    ,0         ,114.205       ,114.581            ,0.997 
0        ,4192   ,127      ,4096      ,0    ,0         ,114.037       ,114.563            ,0.995 
0        ,4224   ,127      ,4096      ,0    ,0         ,112.83        ,113.973            ,0.99  
0        ,4256   ,127      ,4096      ,0    ,0         ,114.023       ,114.334            ,0.997 
0        ,4288   ,127      ,4096      ,0    ,0         ,114.114       ,116.138            ,0.983 
0        ,4320   ,127      ,4096      ,0    ,0         ,113.92        ,114.222            ,0.997 
0        ,4352   ,127      ,4096      ,0    ,0         ,112.957       ,114.64             ,0.985 
0        ,4384   ,127      ,4096      ,0    ,0         ,114.339       ,114.157            ,1.002 
0        ,4416   ,127      ,4096      ,0    ,0         ,113.914       ,114.599            ,0.994 
0        ,448    ,127      ,128       ,0    ,0         ,7.621         ,7.962              ,0.957 
0        ,448    ,127      ,256       ,0    ,0         ,13.26         ,13.177             ,1.006 
0        ,448    ,127      ,512       ,0    ,0         ,18.465        ,18.643             ,0.99  
0        ,48     ,127      ,16        ,0    ,0         ,3.734         ,3.646              ,1.024 
0        ,480    ,127      ,256       ,0    ,0         ,13.329        ,13.499             ,0.987 
0        ,480    ,127      ,512       ,0    ,0         ,19.341        ,18.871             ,1.025 
0        ,5      ,127      ,4         ,0    ,0         ,3.754         ,3.763              ,0.998 
0        ,5      ,127      ,4         ,0    ,23        ,3.765         ,3.751              ,1.004 
0        ,512    ,127      ,192       ,0    ,0         ,12.734        ,13.07              ,0.974 
0        ,512    ,127      ,224       ,0    ,0         ,12.673        ,12.963             ,0.978 
0        ,512    ,127      ,256       ,0    ,0         ,12.912        ,13.061             ,0.989 
0        ,512    ,127      ,256       ,0    ,23        ,13.257        ,13.38              ,0.991 
0        ,512    ,127      ,288       ,0    ,0         ,13.901        ,14.448             ,0.962 
0        ,512    ,127      ,320       ,0    ,0         ,15.205        ,15.713             ,0.968 
0        ,512    ,127      ,352       ,0    ,0         ,15.225        ,15.747             ,0.967 
0        ,512    ,127      ,384       ,0    ,0         ,15.999        ,16.379             ,0.977 
0        ,512    ,127      ,416       ,0    ,0         ,16.328        ,17.508             ,0.933 
0        ,512    ,127      ,448       ,0    ,0         ,18.029        ,18.544             ,0.972 
0        ,512    ,127      ,480       ,0    ,0         ,18.664        ,18.775             ,0.994 
0        ,512    ,127      ,544       ,0    ,0         ,18.686        ,19.64              ,0.951 
0        ,512    ,127      ,576       ,0    ,0         ,18.776        ,19.686             ,0.954 
0        ,512    ,127      ,608       ,0    ,0         ,18.678        ,19.647             ,0.951 
0        ,512    ,127      ,640       ,0    ,0         ,18.757        ,19.616             ,0.956 
0        ,512    ,127      ,672       ,0    ,0         ,18.745        ,19.624             ,0.955 
0        ,512    ,127      ,704       ,0    ,0         ,18.672        ,19.656             ,0.95  
0        ,512    ,127      ,736       ,0    ,0         ,18.718        ,19.674             ,0.951 
0        ,512    ,127      ,768       ,0    ,0         ,18.952        ,19.726             ,0.961 
0        ,512    ,127      ,800       ,0    ,0         ,18.774        ,19.765             ,0.95  
0        ,512    ,127      ,832       ,0    ,0         ,18.699        ,19.706             ,0.949 
0        ,544    ,127      ,256       ,0    ,0         ,13.279        ,13.418             ,0.99  
0        ,544    ,127      ,512       ,0    ,0         ,19.074        ,19.752             ,0.966 
0        ,576    ,127      ,256       ,0    ,0         ,13.238        ,13.385             ,0.989 
0        ,576    ,127      ,512       ,0    ,0         ,19.177        ,19.701             ,0.973 
0        ,6      ,127      ,5         ,0    ,0         ,3.836         ,3.797              ,1.01  
0        ,6      ,127      ,5         ,0    ,23        ,3.777         ,3.786              ,0.997 
0        ,608    ,127      ,512       ,0    ,0         ,19.094        ,19.804             ,0.964 
0        ,64     ,127      ,128       ,0    ,0         ,5.722         ,6.174              ,0.927 
0        ,64     ,127      ,160       ,0    ,0         ,5.765         ,6.12               ,0.942 
0        ,64     ,127      ,192       ,0    ,0         ,5.737         ,6.173              ,0.929 
0        ,64     ,127      ,224       ,0    ,0         ,5.734         ,6.125              ,0.936 
0        ,64     ,127      ,256       ,0    ,0         ,5.721         ,6.158              ,0.929 
0        ,64     ,127      ,288       ,0    ,0         ,5.718         ,6.165              ,0.928 
0        ,64     ,127      ,32        ,0    ,0         ,4.61          ,5.286              ,0.872 
0        ,64     ,127      ,320       ,0    ,0         ,5.731         ,6.134              ,0.934 
0        ,64     ,127      ,352       ,0    ,0         ,5.725         ,6.088              ,0.94  
0        ,64     ,127      ,384       ,0    ,0         ,5.681         ,6.04               ,0.94  
0        ,64     ,127      ,96        ,0    ,0         ,5.721         ,6.103              ,0.937 
0        ,640    ,127      ,512       ,0    ,0         ,18.698        ,20.357             ,0.918 
0        ,672    ,127      ,512       ,0    ,0         ,18.702        ,19.702             ,0.949 
0        ,7      ,127      ,6         ,0    ,0         ,3.805         ,3.745              ,1.016 
0        ,7      ,127      ,6         ,0    ,23        ,3.839         ,3.669              ,1.046 
0        ,704    ,127      ,1024      ,0    ,0         ,23.955        ,24.068             ,0.995 
0        ,704    ,127      ,512       ,0    ,0         ,18.759        ,19.622             ,0.956 
0        ,736    ,127      ,1024      ,0    ,0         ,24.345        ,24.028             ,1.013 
0        ,736    ,127      ,512       ,0    ,0         ,18.668        ,19.678             ,0.949 
0        ,768    ,127      ,1024      ,0    ,0         ,23.966        ,24.134             ,0.993 
0        ,768    ,127      ,512       ,0    ,0         ,18.792        ,19.694             ,0.954 
0        ,7872   ,127      ,8192      ,0    ,0         ,188.906       ,188.92             ,1.0   
0        ,7904   ,127      ,8192      ,0    ,0         ,188.558       ,189.02             ,0.998 
0        ,7936   ,127      ,8192      ,0    ,0         ,192.26        ,190.741            ,1.008 
0        ,7968   ,127      ,8192      ,0    ,0         ,193.974       ,190.979            ,1.016 
0        ,8      ,127      ,7         ,0    ,0         ,3.744         ,3.69               ,1.015 
0        ,8      ,127      ,7         ,0    ,23        ,3.796         ,3.749              ,1.013 
0        ,80     ,127      ,16        ,0    ,0         ,3.555         ,3.815              ,0.932 
0        ,800    ,127      ,1024      ,0    ,0         ,25.005        ,25.674             ,0.974 
0        ,800    ,127      ,512       ,0    ,0         ,19.018        ,19.747             ,0.963 
0        ,8000   ,127      ,8192      ,0    ,0         ,191.652       ,192.035            ,0.998 
0        ,8032   ,127      ,8192      ,0    ,0         ,191.076       ,191.566            ,0.997 
0        ,8064   ,127      ,8192      ,0    ,0         ,193.207       ,193.492            ,0.999 
0        ,8096   ,127      ,8192      ,0    ,0         ,197.26        ,193.563            ,1.019 
0        ,8128   ,127      ,8192      ,0    ,0         ,193.573       ,193.812            ,0.999 
0        ,8160   ,127      ,8192      ,0    ,0         ,193.447       ,193.887            ,0.998 
0        ,832    ,127      ,1024      ,0    ,0         ,26.586        ,27.037             ,0.983 
0        ,832    ,127      ,512       ,0    ,0         ,18.694        ,19.728             ,0.948 
0        ,864    ,127      ,1024      ,0    ,0         ,26.631        ,26.966             ,0.988 
0        ,896    ,127      ,1024      ,0    ,0         ,26.344        ,26.579             ,0.991 
0        ,9      ,127      ,8         ,0    ,0         ,3.743         ,3.787              ,0.988 
0        ,9      ,127      ,8         ,0    ,23        ,3.805         ,3.726              ,1.021 
0        ,928    ,127      ,1024      ,0    ,0         ,27.017        ,28.306             ,0.954 
0        ,96     ,127      ,128       ,0    ,0         ,6.253         ,6.449              ,0.97  
0        ,96     ,127      ,256       ,0    ,0         ,6.283         ,6.457              ,0.973 
0        ,96     ,127      ,32        ,0    ,0         ,4.546         ,5.143              ,0.884 
0        ,96     ,127      ,64        ,0    ,0         ,5.726         ,6.18               ,0.927 
0        ,960    ,127      ,1024      ,0    ,0         ,28.882        ,29.824             ,0.968 
0        ,992    ,127      ,1024      ,0    ,0         ,29.47         ,30.002             ,0.982 
1        ,2048   ,127      ,32        ,0    ,0         ,4.496         ,5.184              ,0.867 
1        ,2048   ,127      ,32        ,0    ,23        ,4.364         ,5.121              ,0.852 
1        ,256    ,127      ,64        ,0    ,0         ,5.633         ,6.061              ,0.929 
1        ,256    ,127      ,64        ,0    ,23        ,5.673         ,6.14               ,0.924 
1        ,4096   ,127      ,32        ,0    ,0         ,4.362         ,5.109              ,0.854 
1        ,4096   ,127      ,32        ,0    ,23        ,4.38          ,5.189              ,0.844 
112      ,512    ,127      ,256       ,0    ,0         ,12.498        ,13.087             ,0.955 
112      ,512    ,127      ,256       ,0    ,23        ,12.488        ,13.024             ,0.959 
16       ,512    ,127      ,256       ,0    ,0         ,13.162        ,13.325             ,0.988 
16       ,512    ,127      ,256       ,0    ,23        ,13.287        ,13.397             ,0.992 
2        ,2048   ,127      ,64        ,0    ,0         ,5.794         ,6.136              ,0.944 
2        ,2048   ,127      ,64        ,0    ,23        ,6.643         ,6.347              ,1.047 
2        ,256    ,127      ,64        ,0    ,0         ,5.804         ,6.116              ,0.949 
2        ,256    ,127      ,64        ,0    ,23        ,5.749         ,6.133              ,0.937 
2        ,4096   ,127      ,64        ,0    ,0         ,5.723         ,6.136              ,0.933 
2        ,4096   ,127      ,64        ,0    ,23        ,5.746         ,6.145              ,0.935 
3        ,2048   ,127      ,128       ,0    ,0         ,7.751         ,7.978              ,0.972 
3        ,2048   ,127      ,128       ,0    ,23        ,7.715         ,7.908              ,0.976 
3        ,256    ,127      ,64        ,0    ,0         ,5.748         ,6.085              ,0.945 
3        ,256    ,127      ,64        ,0    ,23        ,5.769         ,6.148              ,0.938 
3        ,4096   ,127      ,128       ,0    ,0         ,7.743         ,7.953              ,0.974 
3        ,4096   ,127      ,128       ,0    ,23        ,7.778         ,7.967              ,0.976 
32       ,512    ,127      ,256       ,0    ,0         ,13.969        ,14.553             ,0.96  
32       ,512    ,127      ,256       ,0    ,23        ,14.077        ,14.603             ,0.964 
4        ,2048   ,127      ,256       ,0    ,0         ,13.278        ,13.426             ,0.989 
4        ,2048   ,127      ,256       ,0    ,23        ,13.228        ,13.339             ,0.992 
4        ,256    ,127      ,64        ,0    ,0         ,5.768         ,6.178              ,0.934 
4        ,256    ,127      ,64        ,0    ,23        ,5.735         ,6.178              ,0.928 
4        ,4096   ,127      ,256       ,0    ,0         ,13.149        ,13.324             ,0.987 
4        ,4096   ,127      ,256       ,0    ,23        ,13.294        ,13.347             ,0.996 
48       ,512    ,127      ,256       ,0    ,0         ,14.041        ,14.585             ,0.963 
48       ,512    ,127      ,256       ,0    ,23        ,14.077        ,14.604             ,0.964 
5        ,2048   ,127      ,512       ,0    ,0         ,18.994        ,19.767             ,0.961 
5        ,2048   ,127      ,512       ,0    ,23        ,18.849        ,19.714             ,0.956 
5        ,256    ,127      ,64        ,0    ,0         ,5.781         ,6.154              ,0.939 
5        ,256    ,127      ,64        ,0    ,23        ,5.765         ,6.127              ,0.941 
5        ,4096   ,127      ,512       ,0    ,0         ,18.798        ,19.661             ,0.956 
5        ,4096   ,127      ,512       ,0    ,23        ,18.791        ,19.726             ,0.953 
6        ,2048   ,127      ,1024      ,0    ,0         ,29.292        ,29.622             ,0.989 
6        ,2048   ,127      ,1024      ,0    ,23        ,29.479        ,29.791             ,0.99  
6        ,256    ,127      ,64        ,0    ,0         ,5.757         ,6.182              ,0.931 
6        ,256    ,127      ,64        ,0    ,23        ,5.752         ,6.147              ,0.936 
6        ,4096   ,127      ,1024      ,0    ,0         ,29.127        ,29.948             ,0.973 
6        ,4096   ,127      ,1024      ,0    ,23        ,29.61         ,29.72              ,0.996 
64       ,512    ,127      ,256       ,0    ,0         ,15.276        ,15.847             ,0.964 
64       ,512    ,127      ,256       ,0    ,23        ,15.232        ,15.837             ,0.962 
7        ,2048   ,127      ,2048      ,0    ,0         ,49.456        ,49.464             ,1.0   
7        ,2048   ,127      ,2048      ,0    ,23        ,49.474        ,49.562             ,0.998 
7        ,256    ,127      ,64        ,0    ,0         ,5.719         ,6.138              ,0.932 
7        ,256    ,127      ,64        ,0    ,23        ,5.827         ,6.498              ,0.897 
7        ,4096   ,127      ,2048      ,0    ,0         ,49.453        ,49.364             ,1.002 
7        ,4096   ,127      ,2048      ,0    ,23        ,49.621        ,49.647             ,0.999 
80       ,512    ,127      ,256       ,0    ,0         ,15.222        ,15.796             ,0.964 
80       ,512    ,127      ,256       ,0    ,23        ,15.275        ,15.819             ,0.966 
96       ,512    ,127      ,256       ,0    ,0         ,12.366        ,12.833             ,0.964 
96       ,512    ,127      ,256       ,0    ,23        ,12.363        ,12.934             ,0.956 
alignment,branch ,length   ,perc-zero ,pos  ,rand      ,__strchr_evex ,__strchr_evex_orig ,new/old 
0        ,0      ,16       ,0         ,15   ,1         ,4.172         ,4.532              ,0.92  
0        ,0      ,16       ,0.1       ,15   ,1         ,4.151         ,4.495              ,0.923 
0        ,0      ,16       ,0.25      ,15   ,1         ,4.152         ,4.413              ,0.941 
0        ,0      ,16       ,0.33      ,15   ,1         ,4.045         ,4.481              ,0.903 
0        ,0      ,16       ,0.5       ,15   ,1         ,4.055         ,4.396              ,0.922 
0        ,0      ,16       ,0.66      ,15   ,1         ,4.113         ,4.432              ,0.928 
0        ,0      ,16       ,0.75      ,15   ,1         ,4.053         ,4.353              ,0.931 
0        ,0      ,16       ,0.9       ,15   ,1         ,4.183         ,4.467              ,0.936 
0        ,0      ,16       ,1         ,15   ,1         ,4.194         ,4.41               ,0.951 
0        ,1      ,16       ,0         ,15   ,1         ,3.834         ,4.118              ,0.931 
0        ,1      ,16       ,0.1       ,15   ,1         ,4.129         ,4.454              ,0.927 
0        ,1      ,16       ,0.25      ,15   ,1         ,4.118         ,4.446              ,0.926 
0        ,1      ,16       ,0.33      ,15   ,1         ,4.134         ,4.357              ,0.949 
0        ,1      ,16       ,0.5       ,15   ,1         ,4.073         ,4.441              ,0.917 
0        ,1      ,16       ,0.66      ,15   ,1         ,4.146         ,4.294              ,0.965 
0        ,1      ,16       ,0.75      ,15   ,1         ,4.009         ,4.295              ,0.934 
0        ,1      ,16       ,0.9       ,15   ,1         ,4.106         ,4.398              ,0.934 
0        ,1      ,16       ,1         ,15   ,1         ,4.176         ,4.474              ,0.933 
0.9711578243606377
0.9316238188843593

[-- Attachment #3: strchrnul.txt --]
[-- Type: text/plain, Size: 42407 bytes --]

Results For: strchrnul
alignment,length ,max_char ,pos  ,rand ,seek_char ,__strchrnul_evex ,__strchrnul_evex_orig ,new/old 
0        ,1      ,127      ,0    ,0    ,0         ,3.278            ,3.518                 ,0.932 
0        ,1      ,127      ,0    ,0    ,23        ,3.361            ,3.484                 ,0.965 
0        ,10     ,127      ,9    ,0    ,0         ,3.536            ,3.875                 ,0.913 
0        ,10     ,127      ,9    ,0    ,23        ,3.504            ,3.848                 ,0.911 
0        ,1024   ,127      ,1056 ,0    ,0         ,27.379           ,28.047                ,0.976 
0        ,1024   ,127      ,1088 ,0    ,0         ,27.769           ,27.969                ,0.993 
0        ,1024   ,127      ,1120 ,0    ,0         ,27.693           ,28.098                ,0.986 
0        ,1024   ,127      ,1152 ,0    ,0         ,27.515           ,27.972                ,0.984 
0        ,1024   ,127      ,1184 ,0    ,0         ,27.754           ,28.04                 ,0.99  
0        ,1024   ,127      ,1216 ,0    ,0         ,27.505           ,27.97                 ,0.983 
0        ,1024   ,127      ,1248 ,0    ,0         ,27.209           ,28.032                ,0.971 
0        ,1024   ,127      ,1280 ,0    ,0         ,27.272           ,28.006                ,0.974 
0        ,1024   ,127      ,1312 ,0    ,0         ,27.439           ,28.208                ,0.973 
0        ,1024   ,127      ,1344 ,0    ,0         ,27.735           ,28.267                ,0.981 
0        ,1024   ,127      ,704  ,0    ,0         ,23.327           ,22.756                ,1.025 
0        ,1024   ,127      ,736  ,0    ,0         ,23.564           ,22.933                ,1.028 
0        ,1024   ,127      ,768  ,0    ,0         ,22.939           ,23.247                ,0.987 
0        ,1024   ,127      ,800  ,0    ,0         ,25.721           ,23.809                ,1.08  
0        ,1024   ,127      ,832  ,0    ,0         ,26.317           ,25.581                ,1.029 
0        ,1024   ,127      ,864  ,0    ,0         ,26.403           ,25.816                ,1.023 
0        ,1024   ,127      ,896  ,0    ,0         ,25.478           ,25.882                ,0.984 
0        ,1024   ,127      ,928  ,0    ,0         ,27.202           ,26.707                ,1.019 
0        ,1024   ,127      ,960  ,0    ,0         ,28.797           ,28.491                ,1.011 
0        ,1024   ,127      ,992  ,0    ,0         ,28.914           ,28.424                ,1.017 
0        ,1056   ,127      ,1024 ,0    ,0         ,27.875           ,28.307                ,0.985 
0        ,1088   ,127      ,1024 ,0    ,0         ,27.721           ,28.452                ,0.974 
0        ,11     ,127      ,10   ,0    ,0         ,3.527            ,3.86                  ,0.914 
0        ,11     ,127      ,10   ,0    ,23        ,3.522            ,3.73                  ,0.944 
0        ,112    ,127      ,16   ,0    ,0         ,3.339            ,3.913                 ,0.853 
0        ,1120   ,127      ,1024 ,0    ,0         ,28.622           ,28.123                ,1.018 
0        ,1152   ,127      ,1024 ,0    ,0         ,27.549           ,27.931                ,0.986 
0        ,1184   ,127      ,1024 ,0    ,0         ,27.42            ,28.076                ,0.977 
0        ,12     ,127      ,11   ,0    ,0         ,3.514            ,3.837                 ,0.916 
0        ,12     ,127      ,11   ,0    ,23        ,3.429            ,3.846                 ,0.892 
0        ,1216   ,127      ,1024 ,0    ,0         ,27.451           ,28.133                ,0.976 
0        ,1248   ,127      ,1024 ,0    ,0         ,27.235           ,28.012                ,0.972 
0        ,128    ,127      ,160  ,0    ,0         ,7.064            ,7.304                 ,0.967 
0        ,128    ,127      ,192  ,0    ,0         ,7.042            ,7.337                 ,0.96  
0        ,128    ,127      ,224  ,0    ,0         ,7.075            ,7.344                 ,0.963 
0        ,128    ,127      ,256  ,0    ,0         ,7.066            ,7.338                 ,0.963 
0        ,128    ,127      ,288  ,0    ,0         ,7.054            ,7.338                 ,0.961 
0        ,128    ,127      ,32   ,0    ,0         ,4.748            ,4.967                 ,0.956 
0        ,128    ,127      ,320  ,0    ,0         ,7.061            ,7.285                 ,0.969 
0        ,128    ,127      ,352  ,0    ,0         ,7.104            ,7.342                 ,0.968 
0        ,128    ,127      ,384  ,0    ,0         ,7.133            ,7.472                 ,0.955 
0        ,128    ,127      ,416  ,0    ,0         ,7.017            ,7.321                 ,0.958 
0        ,128    ,127      ,448  ,0    ,0         ,7.065            ,7.341                 ,0.962 
0        ,128    ,127      ,64   ,0    ,0         ,5.224            ,5.743                 ,0.91  
0        ,128    ,127      ,96   ,0    ,0         ,5.969            ,6.206                 ,0.962 
0        ,1280   ,127      ,1024 ,0    ,0         ,27.438           ,27.981                ,0.981 
0        ,13     ,127      ,12   ,0    ,0         ,3.56             ,3.838                 ,0.927 
0        ,13     ,127      ,12   ,0    ,23        ,3.559            ,3.879                 ,0.917 
0        ,1312   ,127      ,1024 ,0    ,0         ,27.522           ,28.385                ,0.97  
0        ,1344   ,127      ,1024 ,0    ,0         ,27.53            ,28.04                 ,0.982 
0        ,14     ,127      ,13   ,0    ,0         ,3.483            ,3.916                 ,0.889 
0        ,14     ,127      ,13   ,0    ,23        ,3.472            ,3.916                 ,0.887 
0        ,144    ,127      ,16   ,0    ,0         ,3.485            ,3.975                 ,0.877 
0        ,15     ,127      ,14   ,0    ,0         ,3.467            ,3.859                 ,0.898 
0        ,15     ,127      ,14   ,0    ,23        ,3.533            ,3.92                  ,0.901 
0        ,16     ,127      ,112  ,0    ,0         ,3.471            ,3.855                 ,0.9   
0        ,16     ,127      ,144  ,0    ,0         ,3.417            ,3.839                 ,0.89  
0        ,16     ,127      ,15   ,0    ,0         ,3.479            ,3.924                 ,0.887 
0        ,16     ,127      ,15   ,0    ,23        ,3.482            ,3.885                 ,0.896 
0        ,16     ,127      ,176  ,0    ,0         ,3.526            ,3.868                 ,0.912 
0        ,16     ,127      ,208  ,0    ,0         ,3.574            ,4.006                 ,0.892 
0        ,16     ,127      ,240  ,0    ,0         ,3.44             ,3.907                 ,0.88  
0        ,16     ,127      ,272  ,0    ,0         ,3.473            ,3.958                 ,0.877 
0        ,16     ,127      ,304  ,0    ,0         ,3.394            ,3.924                 ,0.865 
0        ,16     ,127      ,336  ,0    ,0         ,3.499            ,3.875                 ,0.903 
0        ,16     ,127      ,48   ,0    ,0         ,3.319            ,3.769                 ,0.88  
0        ,16     ,127      ,80   ,0    ,0         ,3.392            ,3.901                 ,0.87  
0        ,160    ,127      ,128  ,0    ,0         ,7.183            ,7.334                 ,0.979 
0        ,160    ,127      ,256  ,0    ,0         ,10.851           ,10.975                ,0.989 
0        ,160    ,127      ,32   ,0    ,0         ,4.642            ,4.577                 ,1.014 
0        ,160    ,127      ,64   ,0    ,0         ,5.258            ,5.725                 ,0.918 
0        ,17     ,127      ,16   ,0    ,0         ,3.518            ,3.895                 ,0.903 
0        ,17     ,127      ,16   ,0    ,23        ,3.452            ,3.94                  ,0.876 
0        ,1728   ,127      ,2048 ,0    ,0         ,44.006           ,43.637                ,1.008 
0        ,176    ,127      ,16   ,0    ,0         ,3.397            ,3.829                 ,0.887 
0        ,1760   ,127      ,2048 ,0    ,0         ,44.426           ,43.129                ,1.03  
0        ,1792   ,127      ,2048 ,0    ,0         ,43.072           ,44.071                ,0.977 
0        ,18     ,127      ,17   ,0    ,0         ,3.514            ,3.861                 ,0.91  
0        ,18     ,127      ,17   ,0    ,23        ,3.491            ,3.839                 ,0.91  
0        ,1824   ,127      ,2048 ,0    ,0         ,44.994           ,44.676                ,1.007 
0        ,1856   ,127      ,2048 ,0    ,0         ,46.447           ,45.904                ,1.012 
0        ,1888   ,127      ,2048 ,0    ,0         ,46.266           ,47.06                 ,0.983 
0        ,19     ,127      ,18   ,0    ,0         ,3.392            ,3.884                 ,0.873 
0        ,19     ,127      ,18   ,0    ,23        ,3.572            ,3.859                 ,0.925 
0        ,192    ,127      ,128  ,0    ,0         ,7.063            ,7.296                 ,0.968 
0        ,192    ,127      ,256  ,0    ,0         ,12.113           ,12.303                ,0.985 
0        ,192    ,127      ,32   ,0    ,0         ,4.606            ,4.526                 ,1.018 
0        ,192    ,127      ,512  ,0    ,0         ,12.33            ,12.393                ,0.995 
0        ,192    ,127      ,64   ,0    ,0         ,5.249            ,5.627                 ,0.933 
0        ,1920   ,127      ,2048 ,0    ,0         ,45.169           ,45.434                ,0.994 
0        ,1952   ,127      ,2048 ,0    ,0         ,46.968           ,46.56                 ,1.009 
0        ,1984   ,127      ,2048 ,0    ,0         ,48.746           ,48.556                ,1.004 
0        ,2      ,127      ,1    ,0    ,0         ,3.322            ,3.55                  ,0.936 
0        ,2      ,127      ,1    ,0    ,23        ,3.54             ,3.6                   ,0.983 
0        ,20     ,127      ,19   ,0    ,0         ,3.424            ,3.885                 ,0.881 
0        ,20     ,127      ,19   ,0    ,23        ,3.402            ,3.925                 ,0.867 
0        ,2016   ,127      ,2048 ,0    ,0         ,48.746           ,48.314                ,1.009 
0        ,2048   ,127      ,1024 ,0    ,0         ,27.64            ,28.634                ,0.965 
0        ,2048   ,127      ,1024 ,0    ,23        ,27.678           ,28.285                ,0.979 
0        ,2048   ,127      ,128  ,0    ,0         ,7.006            ,7.284                 ,0.962 
0        ,2048   ,127      ,128  ,0    ,23        ,7.306            ,7.942                 ,0.92  
0        ,2048   ,127      ,1728 ,0    ,0         ,44.065           ,43.587                ,1.011 
0        ,2048   ,127      ,1760 ,0    ,0         ,43.92            ,43.199                ,1.017 
0        ,2048   ,127      ,1792 ,0    ,0         ,43.424           ,43.32                 ,1.002 
0        ,2048   ,127      ,1824 ,0    ,0         ,44.812           ,43.868                ,1.022 
0        ,2048   ,127      ,1856 ,0    ,0         ,46.22            ,45.548                ,1.015 
0        ,2048   ,127      ,1888 ,0    ,0         ,46.415           ,45.692                ,1.016 
0        ,2048   ,127      ,1920 ,0    ,0         ,45.27            ,45.443                ,0.996 
0        ,2048   ,127      ,1952 ,0    ,0         ,47.135           ,46.583                ,1.012 
0        ,2048   ,127      ,1984 ,0    ,0         ,49.092           ,48.104                ,1.021 
0        ,2048   ,127      ,2016 ,0    ,0         ,49.169           ,48.166                ,1.021 
0        ,2048   ,127      ,2048 ,0    ,0         ,50.101           ,49.82                 ,1.006 
0        ,2048   ,127      ,2048 ,0    ,23        ,48.06            ,48.231                ,0.996 
0        ,2048   ,127      ,2080 ,0    ,0         ,48.992           ,48.394                ,1.012 
0        ,2048   ,127      ,2112 ,0    ,0         ,48.227           ,48.29                 ,0.999 
0        ,2048   ,127      ,2144 ,0    ,0         ,48.0             ,48.069                ,0.999 
0        ,2048   ,127      ,2176 ,0    ,0         ,48.046           ,47.876                ,1.004 
0        ,2048   ,127      ,2208 ,0    ,0         ,48.293           ,50.406                ,0.958 
0        ,2048   ,127      ,2240 ,0    ,0         ,48.085           ,47.981                ,1.002 
0        ,2048   ,127      ,2272 ,0    ,0         ,47.946           ,48.359                ,0.991 
0        ,2048   ,127      ,2304 ,0    ,0         ,47.888           ,48.39                 ,0.99  
0        ,2048   ,127      ,2336 ,0    ,0         ,48.161           ,48.024                ,1.003 
0        ,2048   ,127      ,2368 ,0    ,0         ,47.996           ,47.979                ,1.0   
0        ,2048   ,127      ,256  ,0    ,0         ,12.933           ,13.158                ,0.983 
0        ,2048   ,127      ,256  ,0    ,23        ,13.021           ,13.142                ,0.991 
0        ,2048   ,127      ,32   ,0    ,0         ,4.701            ,4.704                 ,0.999 
0        ,2048   ,127      ,32   ,0    ,23        ,4.691            ,4.529                 ,1.036 
0        ,2048   ,127      ,512  ,0    ,0         ,18.465           ,18.891                ,0.977 
0        ,2048   ,127      ,512  ,0    ,23        ,18.035           ,18.838                ,0.957 
0        ,2048   ,127      ,64   ,0    ,0         ,5.34             ,5.647                 ,0.946 
0        ,2048   ,127      ,64   ,0    ,23        ,5.25             ,5.543                 ,0.947 
0        ,208    ,127      ,16   ,0    ,0         ,3.469            ,3.856                 ,0.9   
0        ,2080   ,127      ,2048 ,0    ,0         ,48.121           ,48.004                ,1.002 
0        ,21     ,127      ,20   ,0    ,0         ,3.356            ,3.874                 ,0.866 
0        ,21     ,127      ,20   ,0    ,23        ,3.382            ,3.91                  ,0.865 
0        ,2112   ,127      ,2048 ,0    ,0         ,48.191           ,48.216                ,0.999 
0        ,2144   ,127      ,2048 ,0    ,0         ,47.773           ,48.086                ,0.993 
0        ,2176   ,127      ,2048 ,0    ,0         ,48.646           ,48.115                ,1.011 
0        ,22     ,127      ,21   ,0    ,0         ,3.424            ,3.927                 ,0.872 
0        ,22     ,127      ,21   ,0    ,23        ,3.35             ,3.872                 ,0.865 
0        ,2208   ,127      ,2048 ,0    ,0         ,48.158           ,48.111                ,1.001 
0        ,224    ,127      ,128  ,0    ,0         ,7.119            ,7.292                 ,0.976 
0        ,224    ,127      ,256  ,0    ,0         ,12.252           ,12.207                ,1.004 
0        ,224    ,127      ,32   ,0    ,0         ,4.612            ,4.608                 ,1.001 
0        ,224    ,127      ,512  ,0    ,0         ,12.195           ,12.268                ,0.994 
0        ,224    ,127      ,64   ,0    ,0         ,5.215            ,5.72                  ,0.912 
0        ,2240   ,127      ,2048 ,0    ,0         ,47.999           ,47.936                ,1.001 
0        ,2272   ,127      ,2048 ,0    ,0         ,48.9             ,47.945                ,1.02  
0        ,23     ,127      ,22   ,0    ,0         ,3.421            ,3.869                 ,0.884 
0        ,23     ,127      ,22   ,0    ,23        ,3.351            ,3.935                 ,0.852 
0        ,2304   ,127      ,2048 ,0    ,0         ,48.037           ,48.658                ,0.987 
0        ,2336   ,127      ,2048 ,0    ,0         ,48.418           ,48.175                ,1.005 
0        ,2368   ,127      ,2048 ,0    ,0         ,47.82            ,48.381                ,0.988 
0        ,24     ,127      ,23   ,0    ,0         ,3.369            ,3.863                 ,0.872 
0        ,24     ,127      ,23   ,0    ,23        ,3.366            ,3.847                 ,0.875 
0        ,240    ,127      ,16   ,0    ,0         ,3.385            ,3.917                 ,0.864 
0        ,25     ,127      ,24   ,0    ,0         ,3.379            ,3.827                 ,0.883 
0        ,25     ,127      ,24   ,0    ,23        ,3.381            ,3.879                 ,0.872 
0        ,256    ,127      ,128  ,0    ,0         ,7.071            ,7.284                 ,0.971 
0        ,256    ,127      ,160  ,0    ,0         ,10.739           ,10.904                ,0.985 
0        ,256    ,127      ,192  ,0    ,0         ,12.189           ,12.179                ,1.001 
0        ,256    ,127      ,224  ,0    ,0         ,12.123           ,12.175                ,0.996 
0        ,256    ,127      ,288  ,0    ,0         ,13.003           ,13.249                ,0.981 
0        ,256    ,127      ,32   ,0    ,0         ,4.637            ,4.618                 ,1.004 
0        ,256    ,127      ,320  ,0    ,0         ,12.939           ,13.124                ,0.986 
0        ,256    ,127      ,352  ,0    ,0         ,12.896           ,13.169                ,0.979 
0        ,256    ,127      ,384  ,0    ,0         ,12.988           ,13.192                ,0.985 
0        ,256    ,127      ,416  ,0    ,0         ,13.026           ,13.225                ,0.985 
0        ,256    ,127      ,448  ,0    ,0         ,12.931           ,13.129                ,0.985 
0        ,256    ,127      ,480  ,0    ,0         ,13.054           ,13.158                ,0.992 
0        ,256    ,127      ,512  ,0    ,0         ,13.007           ,13.227                ,0.983 
0        ,256    ,127      ,544  ,0    ,0         ,12.966           ,13.177                ,0.984 
0        ,256    ,127      ,576  ,0    ,0         ,12.959           ,13.112                ,0.988 
0        ,256    ,127      ,64   ,0    ,0         ,5.231            ,5.695                 ,0.918 
0        ,256    ,127      ,96   ,0    ,0         ,5.96             ,6.268                 ,0.951 
0        ,26     ,127      ,25   ,0    ,0         ,3.347            ,3.807                 ,0.879 
0        ,26     ,127      ,25   ,0    ,23        ,3.367            ,3.813                 ,0.883 
0        ,27     ,127      ,26   ,0    ,0         ,3.424            ,3.858                 ,0.888 
0        ,27     ,127      ,26   ,0    ,23        ,3.348            ,3.806                 ,0.879 
0        ,272    ,127      ,16   ,0    ,0         ,3.49             ,3.851                 ,0.906 
0        ,28     ,127      ,27   ,0    ,0         ,3.355            ,3.779                 ,0.888 
0        ,28     ,127      ,27   ,0    ,23        ,3.353            ,3.788                 ,0.885 
0        ,288    ,127      ,128  ,0    ,0         ,7.067            ,7.332                 ,0.964 
0        ,288    ,127      ,256  ,0    ,0         ,13.022           ,13.165                ,0.989 
0        ,288    ,127      ,32   ,0    ,0         ,4.59             ,4.579                 ,1.002 
0        ,288    ,127      ,512  ,0    ,0         ,14.011           ,14.077                ,0.995 
0        ,288    ,127      ,64   ,0    ,0         ,5.217            ,5.7                   ,0.915 
0        ,29     ,127      ,28   ,0    ,0         ,3.374            ,3.861                 ,0.874 
0        ,29     ,127      ,28   ,0    ,23        ,3.343            ,3.817                 ,0.876 
0        ,3      ,127      ,2    ,0    ,0         ,3.46             ,3.611                 ,0.958 
0        ,3      ,127      ,2    ,0    ,23        ,3.457            ,3.656                 ,0.946 
0        ,30     ,127      ,29   ,0    ,0         ,3.294            ,3.832                 ,0.86  
0        ,30     ,127      ,29   ,0    ,23        ,3.255            ,3.831                 ,0.85  
0        ,304    ,127      ,16   ,0    ,0         ,3.488            ,3.933                 ,0.887 
0        ,31     ,127      ,30   ,0    ,0         ,3.261            ,3.747                 ,0.87  
0        ,31     ,127      ,30   ,0    ,23        ,3.173            ,3.781                 ,0.839 
0        ,32     ,127      ,128  ,0    ,0         ,4.607            ,4.599                 ,1.002 
0        ,32     ,127      ,160  ,0    ,0         ,4.618            ,4.622                 ,0.999 
0        ,32     ,127      ,192  ,0    ,0         ,4.592            ,4.602                 ,0.998 
0        ,32     ,127      ,224  ,0    ,0         ,4.659            ,4.554                 ,1.023 
0        ,32     ,127      ,256  ,0    ,0         ,4.627            ,4.679                 ,0.989 
0        ,32     ,127      ,288  ,0    ,0         ,4.621            ,4.613                 ,1.002 
0        ,32     ,127      ,31   ,0    ,0         ,3.261            ,3.685                 ,0.885 
0        ,32     ,127      ,31   ,0    ,23        ,3.135            ,3.756                 ,0.835 
0        ,32     ,127      ,320  ,0    ,0         ,4.605            ,4.586                 ,1.004 
0        ,32     ,127      ,352  ,0    ,0         ,4.618            ,4.614                 ,1.001 
0        ,32     ,127      ,64   ,0    ,0         ,4.668            ,4.695                 ,0.994 
0        ,32     ,127      ,96   ,0    ,0         ,4.594            ,4.567                 ,1.006 
0        ,320    ,127      ,128  ,0    ,0         ,7.089            ,7.334                 ,0.967 
0        ,320    ,127      ,256  ,0    ,0         ,12.997           ,13.234                ,0.982 
0        ,320    ,127      ,32   ,0    ,0         ,4.63             ,4.598                 ,1.007 
0        ,320    ,127      ,512  ,0    ,0         ,15.176           ,14.995                ,1.012 
0        ,320    ,127      ,64   ,0    ,0         ,5.225            ,5.715                 ,0.914 
0        ,336    ,127      ,16   ,0    ,0         ,3.409            ,3.886                 ,0.877 
0        ,352    ,127      ,128  ,0    ,0         ,7.069            ,7.33                  ,0.964 
0        ,352    ,127      ,256  ,0    ,0         ,12.825           ,13.147                ,0.975 
0        ,352    ,127      ,32   ,0    ,0         ,4.599            ,4.572                 ,1.006 
0        ,352    ,127      ,512  ,0    ,0         ,15.347           ,15.141                ,1.014 
0        ,352    ,127      ,64   ,0    ,0         ,5.441            ,5.964                 ,0.912 
0        ,3776   ,127      ,4096 ,0    ,0         ,107.135          ,107.618               ,0.996 
0        ,3808   ,127      ,4096 ,0    ,0         ,106.308          ,106.252               ,1.001 
0        ,384    ,127      ,128  ,0    ,0         ,7.067            ,7.368                 ,0.959 
0        ,384    ,127      ,256  ,0    ,0         ,12.939           ,13.235                ,0.978 
0        ,384    ,127      ,512  ,0    ,0         ,15.149           ,15.802                ,0.959 
0        ,384    ,127      ,64   ,0    ,0         ,5.521            ,6.014                 ,0.918 
0        ,3840   ,127      ,4096 ,0    ,0         ,109.957          ,107.854               ,1.019 
0        ,3872   ,127      ,4096 ,0    ,0         ,110.377          ,109.542               ,1.008 
0        ,3904   ,127      ,4096 ,0    ,0         ,109.019          ,108.598               ,1.004 
0        ,3936   ,127      ,4096 ,0    ,0         ,110.068          ,109.132               ,1.009 
0        ,3968   ,127      ,4096 ,0    ,0         ,114.341          ,112.557               ,1.016 
0        ,4      ,127      ,3    ,0    ,0         ,3.435            ,3.626                 ,0.947 
0        ,4      ,127      ,3    ,0    ,23        ,3.548            ,3.801                 ,0.933 
0        ,4000   ,127      ,4096 ,0    ,0         ,113.432          ,112.38                ,1.009 
0        ,4032   ,127      ,4096 ,0    ,0         ,111.913          ,112.306               ,0.996 
0        ,4064   ,127      ,4096 ,0    ,0         ,112.393          ,111.56                ,1.007 
0        ,4096   ,127      ,1024 ,0    ,0         ,28.261           ,28.808                ,0.981 
0        ,4096   ,127      ,1024 ,0    ,23        ,28.045           ,28.644                ,0.979 
0        ,4096   ,127      ,128  ,0    ,0         ,7.124            ,7.351                 ,0.969 
0        ,4096   ,127      ,128  ,0    ,23        ,7.016            ,7.395                 ,0.949 
0        ,4096   ,127      ,2048 ,0    ,0         ,48.17            ,48.541                ,0.992 
0        ,4096   ,127      ,2048 ,0    ,23        ,48.388           ,49.056                ,0.986 
0        ,4096   ,127      ,256  ,0    ,0         ,12.957           ,13.163                ,0.984 
0        ,4096   ,127      ,256  ,0    ,23        ,13.13            ,13.266                ,0.99  
0        ,4096   ,127      ,32   ,0    ,0         ,4.764            ,4.838                 ,0.985 
0        ,4096   ,127      ,32   ,0    ,23        ,4.754            ,4.723                 ,1.006 
0        ,4096   ,127      ,3776 ,0    ,0         ,106.852          ,106.349               ,1.005 
0        ,4096   ,127      ,3808 ,0    ,0         ,106.998          ,106.658               ,1.003 
0        ,4096   ,127      ,3840 ,0    ,0         ,110.603          ,107.567               ,1.028 
0        ,4096   ,127      ,3872 ,0    ,0         ,110.267          ,108.296               ,1.018 
0        ,4096   ,127      ,3904 ,0    ,0         ,109.43           ,109.724               ,0.997 
0        ,4096   ,127      ,3936 ,0    ,0         ,108.706          ,111.21                ,0.977 
0        ,4096   ,127      ,3968 ,0    ,0         ,110.841          ,112.385               ,0.986 
0        ,4096   ,127      ,4000 ,0    ,0         ,113.403          ,111.569               ,1.016 
0        ,4096   ,127      ,4032 ,0    ,0         ,111.23           ,111.98                ,0.993 
0        ,4096   ,127      ,4064 ,0    ,0         ,112.755          ,112.228               ,1.005 
0        ,4096   ,127      ,4128 ,0    ,0         ,115.807          ,113.662               ,1.019 
0        ,4096   ,127      ,4160 ,0    ,0         ,114.45           ,113.133               ,1.012 
0        ,4096   ,127      ,4192 ,0    ,0         ,115.81           ,113.153               ,1.023 
0        ,4096   ,127      ,4224 ,0    ,0         ,113.306          ,113.886               ,0.995 
0        ,4096   ,127      ,4256 ,0    ,0         ,115.746          ,113.454               ,1.02  
0        ,4096   ,127      ,4288 ,0    ,0         ,115.042          ,114.579               ,1.004 
0        ,4096   ,127      ,4320 ,0    ,0         ,116.733          ,113.221               ,1.031 
0        ,4096   ,127      ,4352 ,0    ,0         ,114.863          ,113.593               ,1.011 
0        ,4096   ,127      ,4384 ,0    ,0         ,116.745          ,113.405               ,1.029 
0        ,4096   ,127      ,4416 ,0    ,0         ,114.161          ,113.181               ,1.009 
0        ,4096   ,127      ,512  ,0    ,0         ,18.172           ,18.892                ,0.962 
0        ,4096   ,127      ,512  ,0    ,23        ,19.072           ,19.144                ,0.996 
0        ,4096   ,127      ,64   ,0    ,0         ,5.245            ,5.689                 ,0.922 
0        ,4096   ,127      ,64   ,0    ,23        ,5.296            ,5.714                 ,0.927 
0        ,4128   ,127      ,4096 ,0    ,0         ,115.166          ,114.833               ,1.003 
0        ,416    ,127      ,128  ,0    ,0         ,7.043            ,7.394                 ,0.953 
0        ,416    ,127      ,256  ,0    ,0         ,13.002           ,13.18                 ,0.987 
0        ,416    ,127      ,512  ,0    ,0         ,16.393           ,16.776                ,0.977 
0        ,4160   ,127      ,4096 ,0    ,0         ,119.857          ,115.093               ,1.041 
0        ,4192   ,127      ,4096 ,0    ,0         ,114.634          ,113.745               ,1.008 
0        ,4224   ,127      ,4096 ,0    ,0         ,117.198          ,113.874               ,1.029 
0        ,4256   ,127      ,4096 ,0    ,0         ,115.097          ,113.647               ,1.013 
0        ,4288   ,127      ,4096 ,0    ,0         ,113.484          ,114.45                ,0.992 
0        ,4320   ,127      ,4096 ,0    ,0         ,115.203          ,114.171               ,1.009 
0        ,4352   ,127      ,4096 ,0    ,0         ,114.364          ,113.866               ,1.004 
0        ,4384   ,127      ,4096 ,0    ,0         ,115.509          ,114.031               ,1.013 
0        ,4416   ,127      ,4096 ,0    ,0         ,118.825          ,113.969               ,1.043 
0        ,448    ,127      ,128  ,0    ,0         ,7.228            ,8.146                 ,0.887 
0        ,448    ,127      ,256  ,0    ,0         ,12.88            ,13.158                ,0.979 
0        ,448    ,127      ,512  ,0    ,0         ,19.471           ,18.169                ,1.072 
0        ,48     ,127      ,16   ,0    ,0         ,3.307            ,3.81                  ,0.868 
0        ,480    ,127      ,256  ,0    ,0         ,12.896           ,13.087                ,0.985 
0        ,480    ,127      ,512  ,0    ,0         ,18.529           ,18.077                ,1.025 
0        ,5      ,127      ,4    ,0    ,0         ,3.576            ,3.67                  ,0.974 
0        ,5      ,127      ,4    ,0    ,23        ,3.521            ,3.827                 ,0.92  
0        ,512    ,127      ,192  ,0    ,0         ,12.107           ,12.181                ,0.994 
0        ,512    ,127      ,224  ,0    ,0         ,12.093           ,12.201                ,0.991 
0        ,512    ,127      ,256  ,0    ,0         ,12.725           ,12.987                ,0.98  
0        ,512    ,127      ,256  ,0    ,23        ,13.103           ,13.716                ,0.955 
0        ,512    ,127      ,288  ,0    ,0         ,13.825           ,13.897                ,0.995 
0        ,512    ,127      ,320  ,0    ,0         ,15.178           ,14.967                ,1.014 
0        ,512    ,127      ,352  ,0    ,0         ,15.153           ,14.971                ,1.012 
0        ,512    ,127      ,384  ,0    ,0         ,15.071           ,15.901                ,0.948 
0        ,512    ,127      ,416  ,0    ,0         ,16.275           ,16.735                ,0.973 
0        ,512    ,127      ,448  ,0    ,0         ,18.028           ,17.982                ,1.003 
0        ,512    ,127      ,480  ,0    ,0         ,18.016           ,17.867                ,1.008 
0        ,512    ,127      ,544  ,0    ,0         ,18.413           ,18.819                ,0.978 
0        ,512    ,127      ,576  ,0    ,0         ,18.447           ,18.844                ,0.979 
0        ,512    ,127      ,608  ,0    ,0         ,18.033           ,18.876                ,0.955 
0        ,512    ,127      ,640  ,0    ,0         ,18.087           ,18.878                ,0.958 
0        ,512    ,127      ,672  ,0    ,0         ,18.097           ,18.809                ,0.962 
0        ,512    ,127      ,704  ,0    ,0         ,18.175           ,18.882                ,0.963 
0        ,512    ,127      ,736  ,0    ,0         ,18.202           ,18.79                 ,0.969 
0        ,512    ,127      ,768  ,0    ,0         ,18.273           ,18.979                ,0.963 
0        ,512    ,127      ,800  ,0    ,0         ,18.139           ,19.157                ,0.947 
0        ,512    ,127      ,832  ,0    ,0         ,18.662           ,18.941                ,0.985 
0        ,544    ,127      ,256  ,0    ,0         ,12.943           ,13.125                ,0.986 
0        ,544    ,127      ,512  ,0    ,0         ,18.26            ,18.993                ,0.961 
0        ,576    ,127      ,256  ,0    ,0         ,12.868           ,13.241                ,0.972 
0        ,576    ,127      ,512  ,0    ,0         ,18.084           ,18.759                ,0.964 
0        ,6      ,127      ,5    ,0    ,0         ,3.521            ,3.83                  ,0.919 
0        ,6      ,127      ,5    ,0    ,23        ,3.489            ,3.808                 ,0.916 
0        ,608    ,127      ,512  ,0    ,0         ,18.142           ,18.837                ,0.963 
0        ,64     ,127      ,128  ,0    ,0         ,5.242            ,5.718                 ,0.917 
0        ,64     ,127      ,160  ,0    ,0         ,5.217            ,5.694                 ,0.916 
0        ,64     ,127      ,192  ,0    ,0         ,5.191            ,5.697                 ,0.911 
0        ,64     ,127      ,224  ,0    ,0         ,5.197            ,5.691                 ,0.913 
0        ,64     ,127      ,256  ,0    ,0         ,5.227            ,5.723                 ,0.913 
0        ,64     ,127      ,288  ,0    ,0         ,5.223            ,5.685                 ,0.919 
0        ,64     ,127      ,32   ,0    ,0         ,4.716            ,4.683                 ,1.007 
0        ,64     ,127      ,320  ,0    ,0         ,5.438            ,5.91                  ,0.92  
0        ,64     ,127      ,352  ,0    ,0         ,5.484            ,5.959                 ,0.92  
0        ,64     ,127      ,384  ,0    ,0         ,5.291            ,5.742                 ,0.921 
0        ,64     ,127      ,96   ,0    ,0         ,5.218            ,5.718                 ,0.913 
0        ,640    ,127      ,512  ,0    ,0         ,18.136           ,18.77                 ,0.966 
0        ,672    ,127      ,512  ,0    ,0         ,18.151           ,18.806                ,0.965 
0        ,7      ,127      ,6    ,0    ,0         ,3.427            ,3.828                 ,0.895 
0        ,7      ,127      ,6    ,0    ,23        ,3.441            ,3.864                 ,0.89  
0        ,704    ,127      ,1024 ,0    ,0         ,23.45            ,22.943                ,1.022 
0        ,704    ,127      ,512  ,0    ,0         ,18.597           ,18.792                ,0.99  
0        ,736    ,127      ,1024 ,0    ,0         ,23.884           ,22.97                 ,1.04  
0        ,736    ,127      ,512  ,0    ,0         ,18.207           ,18.872                ,0.965 
0        ,768    ,127      ,1024 ,0    ,0         ,23.011           ,23.516                ,0.979 
0        ,768    ,127      ,512  ,0    ,0         ,18.216           ,18.837                ,0.967 
0        ,7872   ,127      ,8192 ,0    ,0         ,188.983          ,188.439               ,1.003 
0        ,7904   ,127      ,8192 ,0    ,0         ,190.441          ,188.431               ,1.011 
0        ,7936   ,127      ,8192 ,0    ,0         ,191.761          ,190.242               ,1.008 
0        ,7968   ,127      ,8192 ,0    ,0         ,192.509          ,190.241               ,1.012 
0        ,8      ,127      ,7    ,0    ,0         ,3.551            ,3.88                  ,0.915 
0        ,8      ,127      ,7    ,0    ,23        ,3.425            ,3.836                 ,0.893 
0        ,80     ,127      ,16   ,0    ,0         ,3.418            ,3.784                 ,0.903 
0        ,800    ,127      ,1024 ,0    ,0         ,24.447           ,23.902                ,1.023 
0        ,800    ,127      ,512  ,0    ,0         ,18.203           ,18.902                ,0.963 
0        ,8000   ,127      ,8192 ,0    ,0         ,191.608          ,189.985               ,1.009 
0        ,8032   ,127      ,8192 ,0    ,0         ,190.488          ,198.673               ,0.959 
0        ,8064   ,127      ,8192 ,0    ,0         ,193.941          ,192.12                ,1.009 
0        ,8096   ,127      ,8192 ,0    ,0         ,195.619          ,193.212               ,1.012 
0        ,8128   ,127      ,8192 ,0    ,0         ,194.105          ,193.11                ,1.005 
0        ,8160   ,127      ,8192 ,0    ,0         ,194.197          ,193.289               ,1.005 
0        ,832    ,127      ,1024 ,0    ,0         ,26.28            ,25.929                ,1.014 
0        ,832    ,127      ,512  ,0    ,0         ,19.137           ,18.783                ,1.019 
0        ,864    ,127      ,1024 ,0    ,0         ,26.167           ,25.502                ,1.026 
0        ,896    ,127      ,1024 ,0    ,0         ,25.73            ,25.839                ,0.996 
0        ,9      ,127      ,8    ,0    ,0         ,3.529            ,3.896                 ,0.906 
0        ,9      ,127      ,8    ,0    ,23        ,3.429            ,3.754                 ,0.913 
0        ,928    ,127      ,1024 ,0    ,0         ,27.35            ,26.841                ,1.019 
0        ,96     ,127      ,128  ,0    ,0         ,5.992            ,6.295                 ,0.952 
0        ,96     ,127      ,256  ,0    ,0         ,6.051            ,6.24                  ,0.97  
0        ,96     ,127      ,32   ,0    ,0         ,4.641            ,4.757                 ,0.976 
0        ,96     ,127      ,64   ,0    ,0         ,5.34             ,5.683                 ,0.94  
0        ,960    ,127      ,1024 ,0    ,0         ,29.039           ,28.286                ,1.027 
0        ,992    ,127      ,1024 ,0    ,0         ,28.868           ,28.33                 ,1.019 
1        ,2048   ,127      ,32   ,0    ,0         ,4.646            ,4.616                 ,1.006 
1        ,2048   ,127      ,32   ,0    ,23        ,4.623            ,4.581                 ,1.009 
1        ,256    ,127      ,64   ,0    ,0         ,5.294            ,5.771                 ,0.917 
1        ,256    ,127      ,64   ,0    ,23        ,5.334            ,5.759                 ,0.926 
1        ,4096   ,127      ,32   ,0    ,0         ,4.68             ,4.705                 ,0.995 
1        ,4096   ,127      ,32   ,0    ,23        ,4.711            ,4.68                  ,1.007 
112      ,512    ,127      ,256  ,0    ,0         ,12.16            ,12.221                ,0.995 
112      ,512    ,127      ,256  ,0    ,23        ,12.168           ,12.232                ,0.995 
16       ,512    ,127      ,256  ,0    ,0         ,13.435           ,13.243                ,1.015 
16       ,512    ,127      ,256  ,0    ,23        ,13.59            ,13.315                ,1.021 
2        ,2048   ,127      ,64   ,0    ,0         ,5.275            ,5.669                 ,0.93  
2        ,2048   ,127      ,64   ,0    ,23        ,5.453            ,6.489                 ,0.84  
2        ,256    ,127      ,64   ,0    ,0         ,5.212            ,5.665                 ,0.92  
2        ,256    ,127      ,64   ,0    ,23        ,5.243            ,5.709                 ,0.918 
2        ,4096   ,127      ,64   ,0    ,0         ,5.303            ,5.682                 ,0.933 
2        ,4096   ,127      ,64   ,0    ,23        ,5.222            ,5.702                 ,0.916 
3        ,2048   ,127      ,128  ,0    ,0         ,6.955            ,7.269                 ,0.957 
3        ,2048   ,127      ,128  ,0    ,23        ,7.071            ,7.28                  ,0.971 
3        ,256    ,127      ,64   ,0    ,0         ,5.232            ,5.698                 ,0.918 
3        ,256    ,127      ,64   ,0    ,23        ,5.279            ,5.701                 ,0.926 
3        ,4096   ,127      ,128  ,0    ,0         ,7.061            ,7.349                 ,0.961 
3        ,4096   ,127      ,128  ,0    ,23        ,7.077            ,7.301                 ,0.969 
32       ,512    ,127      ,256  ,0    ,0         ,13.98            ,13.911                ,1.005 
32       ,512    ,127      ,256  ,0    ,23        ,13.886           ,13.902                ,0.999 
4        ,2048   ,127      ,256  ,0    ,0         ,12.887           ,13.167                ,0.979 
4        ,2048   ,127      ,256  ,0    ,23        ,12.944           ,13.153                ,0.984 
4        ,256    ,127      ,64   ,0    ,0         ,5.204            ,5.683                 ,0.916 
4        ,256    ,127      ,64   ,0    ,23        ,5.307            ,5.74                  ,0.925 
4        ,4096   ,127      ,256  ,0    ,0         ,12.879           ,13.11                 ,0.982 
4        ,4096   ,127      ,256  ,0    ,23        ,12.951           ,13.149                ,0.985 
48       ,512    ,127      ,256  ,0    ,0         ,14.086           ,14.075                ,1.001 
48       ,512    ,127      ,256  ,0    ,23        ,14.025           ,14.042                ,0.999 
5        ,2048   ,127      ,512  ,0    ,0         ,18.365           ,18.915                ,0.971 
5        ,2048   ,127      ,512  ,0    ,23        ,18.154           ,18.828                ,0.964 
5        ,256    ,127      ,64   ,0    ,0         ,5.246            ,5.696                 ,0.921 
5        ,256    ,127      ,64   ,0    ,23        ,5.263            ,5.676                 ,0.927 
5        ,4096   ,127      ,512  ,0    ,0         ,18.352           ,19.037                ,0.964 
5        ,4096   ,127      ,512  ,0    ,23        ,18.16            ,18.894                ,0.961 
6        ,2048   ,127      ,1024 ,0    ,0         ,28.802           ,28.303                ,1.018 
6        ,2048   ,127      ,1024 ,0    ,23        ,28.685           ,27.894                ,1.028 
6        ,256    ,127      ,64   ,0    ,0         ,5.21             ,5.656                 ,0.921 
6        ,256    ,127      ,64   ,0    ,23        ,5.301            ,5.707                 ,0.929 
6        ,4096   ,127      ,1024 ,0    ,0         ,27.401           ,28.065                ,0.976 
6        ,4096   ,127      ,1024 ,0    ,23        ,27.327           ,27.901                ,0.979 
64       ,512    ,127      ,256  ,0    ,0         ,15.265           ,15.029                ,1.016 
64       ,512    ,127      ,256  ,0    ,23        ,15.297           ,15.008                ,1.019 
7        ,2048   ,127      ,2048 ,0    ,0         ,48.207           ,47.941                ,1.006 
7        ,2048   ,127      ,2048 ,0    ,23        ,48.017           ,48.214                ,0.996 
7        ,256    ,127      ,64   ,0    ,0         ,5.219            ,5.724                 ,0.912 
7        ,256    ,127      ,64   ,0    ,23        ,5.24             ,5.667                 ,0.925 
7        ,4096   ,127      ,2048 ,0    ,0         ,48.105           ,48.19                 ,0.998 
7        ,4096   ,127      ,2048 ,0    ,23        ,48.277           ,48.418                ,0.997 
80       ,512    ,127      ,256  ,0    ,0         ,15.197           ,14.997                ,1.013 
80       ,512    ,127      ,256  ,0    ,23        ,15.449           ,15.249                ,1.013 
96       ,512    ,127      ,256  ,0    ,0         ,12.192           ,12.072                ,1.01  
96       ,512    ,127      ,256  ,0    ,23        ,12.349           ,12.371                ,0.998 
Geometric mean (New Time / Old Time): 0.9651929728891862
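
As a minimal sketch of how a geometric-mean summary like the one above is
computed from the ratio column (illustrative C, not the benchtest harness;
build with -lm):

  #include <math.h>
  #include <stddef.h>
  #include <stdio.h>

  /* Geometric mean of n ratios: exponentiate the mean of the logs.
     With New Time / Old Time ratios, a result < 1.0 is a net
     improvement.  */
  static double
  geomean (const double *ratios, size_t n)
  {
    double log_sum = 0.0;
    for (size_t i = 0; i < n; i++)
      log_sum += log (ratios[i]);
    return exp (log_sum / n);
  }

  int
  main (void)
  {
    /* A few ratios taken from the table above.  */
    double r[] = { 0.88, 0.979, 1.014, 0.918 };
    printf ("%f\n", geomean (r, sizeof r / sizeof r[0]));
    return 0;
  }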


* Re: [PATCH v1 3/7] x86: Optimize strnlen-evex.S and implement with VMM headers
  2022-10-18  2:48 ` [PATCH v1 3/7] x86: Optimize strnlen-evex.S " Noah Goldstein
@ 2022-10-18  2:51   ` Noah Goldstein
  0 siblings, 0 replies; 41+ messages in thread
From: Noah Goldstein @ 2022-10-18  2:51 UTC (permalink / raw)
  To: libc-alpha; +Cc: hjl.tools, carlos

[-- Attachment #1: Type: text/plain, Size: 34350 bytes --]

On Mon, Oct 17, 2022 at 7:49 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Optimizations are:
> 1. Use the fact that bsf(0) leaves the destination unchanged to save a
>    branch in the short string case (see the sketch after this list).
> 2. Restructure code so that small strings are given the hot path.
>         - This is a net-zero on the benchmark suite but in general
>           makes sense as smaller sizes are far more common.
> 3. Use more code-size efficient instructions.
>         - tzcnt ...     -> bsf ...
>         - vpcmpb $0 ... -> vpcmpeq ...
> 4. Align labels less aggressively, especially when doing so doesn't save
>    fetch blocks or causes the basic block to span extra cache lines.
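
As referenced in point 1, a minimal illustration of the bsf trick: on
current x86-64 cores a bsf with a zero source leaves the destination
register unchanged (AMD documents this behavior; Intel leaves it undefined
but current parts behave the same way, which the patch relies on).  bsf
also encodes one byte shorter than tzcnt (tzcnt is an F3-prefixed bsf),
which is the code-size saving in point 3.  The helper below is
hypothetical, not glibc code, and needs GCC or Clang on x86-64:

  #include <stdio.h>

  /* Hypothetical helper (not from the patch): return the index of the
     lowest set bit in MASK, or DFLT when MASK is zero.  Preloading the
     destination and letting a zero-source bsf leave it untouched is
     what saves the extra branch in the short string case.  */
  static inline unsigned long
  bsf_or_default (unsigned long mask, unsigned long dflt)
  {
    unsigned long res = dflt;
    /* AT&T syntax: bsfq src, dst.  With mask == 0, res keeps dflt.  */
    __asm__ ("bsfq %1, %0" : "+r" (res) : "r" (mask));
    return res;
  }

  int
  main (void)
  {
    printf ("%lu\n", bsf_or_default (0x8, 64));  /* prints 3 */
    printf ("%lu\n", bsf_or_default (0x0, 64));  /* prints 64 */
    return 0;
  }
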
>
> The optimizations (especially for point 2) make the strnlen and
> strlen code essentially incompatible, so strnlen-evex is split out
> into a new file.
>
> Code Size Changes:
> strlen-evex.S       :  -23 bytes
> strnlen-evex.S      : -167 bytes
>
> Net perf changes:
>
> Reported as the geometric mean of all improvements / regressions from
> N=10 runs of the benchtests. Value is New Time / Old Time, so < 1.0 is
> an improvement and > 1.0 is a regression.
>
> strlen-evex.S       : 0.992 (No real change)
> strnlen-evex.S      : 0.947
>
> Full results attached in email.
>
> Full check passes on x86-64.
> ---
>  sysdeps/x86_64/multiarch/strlen-evex.S  | 544 +++++++-----------------
>  sysdeps/x86_64/multiarch/strnlen-evex.S | 427 ++++++++++++++++++-
>  sysdeps/x86_64/multiarch/wcsnlen-evex.S |   5 +-
>  3 files changed, 572 insertions(+), 404 deletions(-)
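
Before the diff: the first hunk of the new strnlen-evex.S below implements
a short-string hot path that checks one vector and folds the length bound
into a single bsf.  A rough, hypothetical C model of that shape (the
scalar loop stands in for the VPCMPEQ/KMOV pair, and the function name is
made up):

  #include <stddef.h>

  /* Illustrative model, not glibc code.  Assumes s[0..chars_per_vec)
     is readable, which the real code guarantees with its page-cross
     check before the first vector load.  */
  static size_t
  strnlen_first_vec (const char *s, size_t len, size_t chars_per_vec)
  {
    /* Scalar stand-in for VPCMPEQ + KMOV: bit i set iff s[i] == 0.  */
    unsigned long mask = 0;
    for (size_t i = 0; i < chars_per_vec; i++)
      if (s[i] == '\0')
        mask |= 1UL << i;

    /* Preload len, then bsf: if mask is zero, pos stays len (the
       bsf(0) trick sketched earlier).  */
    size_t pos = len;
    if (mask != 0)
      pos = (size_t) __builtin_ctzl (mask);

    /* pos > chars_per_vec means no null was found and len is larger
       than one vector; the real code jumps to the aligned loop here.  */
    if (pos > chars_per_vec)
      return (size_t) -1;  /* Sentinel for "take the long path".  */

    /* Otherwise return min (pos, len), as the cmp/cmovb pair does.  */
    return pos < len ? pos : len;
  }

Note that when len equals one vector of characters and no null is found,
pos equals len and is not above the bound, so the min returns len without
ever entering the aligned loop.
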
>
> diff --git a/sysdeps/x86_64/multiarch/strlen-evex.S b/sysdeps/x86_64/multiarch/strlen-evex.S
> index 2109ec2f7a..487846f098 100644
> --- a/sysdeps/x86_64/multiarch/strlen-evex.S
> +++ b/sysdeps/x86_64/multiarch/strlen-evex.S
> @@ -26,466 +26,220 @@
>  #  define STRLEN       __strlen_evex
>  # endif
>
> -# define VMOVA         vmovdqa64
> +# ifndef VEC_SIZE
> +#  include "x86-evex256-vecs.h"
> +# endif
>
>  # ifdef USE_AS_WCSLEN
> -#  define VPCMP                vpcmpd
> +#  define VPCMPEQ      vpcmpeqd
> +#  define VPCMPNEQ     vpcmpneqd
> +#  define VPTESTN      vptestnmd
> +#  define VPTEST       vptestmd
>  #  define VPMINU       vpminud
> -#  define SHIFT_REG ecx
>  #  define CHAR_SIZE    4
> +#  define CHAR_SIZE_SHIFT_REG(reg)     sar $2, %reg
>  # else
> -#  define VPCMP                vpcmpb
> +#  define VPCMPEQ      vpcmpeqb
> +#  define VPCMPNEQ     vpcmpneqb
> +#  define VPTESTN      vptestnmb
> +#  define VPTEST       vptestmb
>  #  define VPMINU       vpminub
> -#  define SHIFT_REG edx
>  #  define CHAR_SIZE    1
> +#  define CHAR_SIZE_SHIFT_REG(reg)
> +
> +#  define REG_WIDTH    VEC_SIZE
>  # endif
>
> -# define XMMZERO       xmm16
> -# define YMMZERO       ymm16
> -# define YMM1          ymm17
> -# define YMM2          ymm18
> -# define YMM3          ymm19
> -# define YMM4          ymm20
> -# define YMM5          ymm21
> -# define YMM6          ymm22
> -
> -# define VEC_SIZE 32
> -# define PAGE_SIZE 4096
> -# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
> -
> -       .section .text.evex,"ax",@progbits
> -ENTRY (STRLEN)
> -# ifdef USE_AS_STRNLEN
> -       /* Check zero length.  */
> -       test    %RSI_LP, %RSI_LP
> -       jz      L(zero)
> -#  ifdef __ILP32__
> -       /* Clear the upper 32 bits.  */
> -       movl    %esi, %esi
> -#  endif
> -       mov     %RSI_LP, %R8_LP
> +# define CHAR_PER_VEC  (VEC_SIZE / CHAR_SIZE)
> +
> +# include "reg-macros.h"
> +
> +# if CHAR_PER_VEC == 64
> +
> +#  define TAIL_RETURN_LBL      first_vec_x2
> +#  define TAIL_RETURN_OFFSET   (CHAR_PER_VEC * 2)
> +
> +#  define FALLTHROUGH_RETURN_LBL       first_vec_x3
> +#  define FALLTHROUGH_RETURN_OFFSET    (CHAR_PER_VEC * 3)
> +
> +# else
> +
> +#  define TAIL_RETURN_LBL      first_vec_x3
> +#  define TAIL_RETURN_OFFSET   (CHAR_PER_VEC * 3)
> +
> +#  define FALLTHROUGH_RETURN_LBL       first_vec_x2
> +#  define FALLTHROUGH_RETURN_OFFSET    (CHAR_PER_VEC * 2)
>  # endif
> +
> +# define XZERO VMM_128(0)
> +# define VZERO VMM(0)
> +# define PAGE_SIZE     4096
> +
> +       .section SECTION(.text), "ax", @progbits
> +ENTRY_P2ALIGN (STRLEN, 6)
>         movl    %edi, %eax
> -       vpxorq  %XMMZERO, %XMMZERO, %XMMZERO
> -       /* Clear high bits from edi. Only keeping bits relevant to page
> -          cross check.  */
> +       vpxorq  %XZERO, %XZERO, %XZERO
>         andl    $(PAGE_SIZE - 1), %eax
> -       /* Check if we may cross page boundary with one vector load.  */
>         cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
>         ja      L(cross_page_boundary)
>
>         /* Check the first VEC_SIZE bytes.  Each bit in K0 represents a
>            null byte.  */
> -       VPCMP   $0, (%rdi), %YMMZERO, %k0
> -       kmovd   %k0, %eax
> -# ifdef USE_AS_STRNLEN
> -       /* If length < CHAR_PER_VEC handle special.  */
> -       cmpq    $CHAR_PER_VEC, %rsi
> -       jbe     L(first_vec_x0)
> -# endif
> -       testl   %eax, %eax
> +       VPCMPEQ (%rdi), %VZERO, %k0
> +       KMOV    %k0, %VRAX
> +       test    %VRAX, %VRAX
>         jz      L(aligned_more)
> -       tzcntl  %eax, %eax
> -       ret
> -# ifdef USE_AS_STRNLEN
> -L(zero):
> -       xorl    %eax, %eax
> -       ret
> -
> -       .p2align 4
> -L(first_vec_x0):
> -       /* Set bit for max len so that tzcnt will return min of max len
> -          and position of first match.  */
> -       btsq    %rsi, %rax
> -       tzcntl  %eax, %eax
> -       ret
> -# endif
> -
> -       .p2align 4
> -L(first_vec_x1):
> -       tzcntl  %eax, %eax
> -       /* Safe to use 32 bit instructions as these are only called for
> -          size = [1, 159].  */
> -# ifdef USE_AS_STRNLEN
> -       /* Use ecx which was computed earlier to compute correct value.
> -        */
> -       leal    -(CHAR_PER_VEC * 4 + 1)(%rcx, %rax), %eax
> -# else
> -       subl    %edx, %edi
> -#  ifdef USE_AS_WCSLEN
> -       /* NB: Divide bytes by 4 to get the wchar_t count.  */
> -       sarl    $2, %edi
> -#  endif
> -       leal    CHAR_PER_VEC(%rdi, %rax), %eax
> -# endif
> -       ret
> -
> -       .p2align 4
> -L(first_vec_x2):
> -       tzcntl  %eax, %eax
> -       /* Safe to use 32 bit instructions as these are only called for
> -          size = [1, 159].  */
> -# ifdef USE_AS_STRNLEN
> -       /* Use ecx which was computed earlier to compute correct value.
> -        */
> -       leal    -(CHAR_PER_VEC * 3 + 1)(%rcx, %rax), %eax
> -# else
> -       subl    %edx, %edi
> -#  ifdef USE_AS_WCSLEN
> -       /* NB: Divide bytes by 4 to get the wchar_t count.  */
> -       sarl    $2, %edi
> -#  endif
> -       leal    (CHAR_PER_VEC * 2)(%rdi, %rax), %eax
> -# endif
> +       bsf     %VRAX, %VRAX
>         ret
>
> -       .p2align 4
> -L(first_vec_x3):
> -       tzcntl  %eax, %eax
> -       /* Safe to use 32 bit instructions as these are only called for
> -          size = [1, 159].  */
> -# ifdef USE_AS_STRNLEN
> -       /* Use ecx which was computed earlier to compute correct value.
> -        */
> -       leal    -(CHAR_PER_VEC * 2 + 1)(%rcx, %rax), %eax
> -# else
> -       subl    %edx, %edi
> -#  ifdef USE_AS_WCSLEN
> -       /* NB: Divide bytes by 4 to get the wchar_t count.  */
> -       sarl    $2, %edi
> -#  endif
> -       leal    (CHAR_PER_VEC * 3)(%rdi, %rax), %eax
> -# endif
> -       ret
> -
> -       .p2align 4
> +       .p2align 4,, 8
>  L(first_vec_x4):
> -       tzcntl  %eax, %eax
> -       /* Safe to use 32 bit instructions as these are only called for
> -          size = [1, 159].  */
> -# ifdef USE_AS_STRNLEN
> -       /* Use ecx which was computed earlier to compute correct value.
> -        */
> -       leal    -(CHAR_PER_VEC + 1)(%rcx, %rax), %eax
> -# else
> -       subl    %edx, %edi
> -#  ifdef USE_AS_WCSLEN
> -       /* NB: Divide bytes by 4 to get the wchar_t count.  */
> -       sarl    $2, %edi
> -#  endif
> +       bsf     %VRAX, %VRAX
> +       subl    %ecx, %edi
> +       CHAR_SIZE_SHIFT_REG (edi)
>         leal    (CHAR_PER_VEC * 4)(%rdi, %rax), %eax
> -# endif
>         ret
>
> -       .p2align 5
> +
> +
> +       /* Aligned more for strnlen compares remaining length vs 2 *
> +          CHAR_PER_VEC, 4 * CHAR_PER_VEC, and 8 * CHAR_PER_VEC before
> +          going to the loop.  */
> +       .p2align 4,, 10
>  L(aligned_more):
> -       movq    %rdi, %rdx
> -       /* Align data to VEC_SIZE.  */
> -       andq    $-(VEC_SIZE), %rdi
> +       movq    %rdi, %rcx
> +       andq    $(VEC_SIZE * -1), %rdi
>  L(cross_page_continue):
> -       /* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
> -          since data is only aligned to VEC_SIZE.  */
> -# ifdef USE_AS_STRNLEN
> -       /* + CHAR_SIZE because it simplies the logic in
> -          last_4x_vec_or_less.  */
> -       leaq    (VEC_SIZE * 5 + CHAR_SIZE)(%rdi), %rcx
> -       subq    %rdx, %rcx
> -#  ifdef USE_AS_WCSLEN
> -       /* NB: Divide bytes by 4 to get the wchar_t count.  */
> -       sarl    $2, %ecx
> -#  endif
> -# endif
> -       /* Load first VEC regardless.  */
> -       VPCMP   $0, VEC_SIZE(%rdi), %YMMZERO, %k0
> -# ifdef USE_AS_STRNLEN
> -       /* Adjust length. If near end handle specially.  */
> -       subq    %rcx, %rsi
> -       jb      L(last_4x_vec_or_less)
> -# endif
> -       kmovd   %k0, %eax
> -       testl   %eax, %eax
> +       /* Remaining length >= 2 * CHAR_PER_VEC so do VEC0/VEC1 without
> +          rechecking bounds.  */
> +       VPCMPEQ (VEC_SIZE * 1)(%rdi), %VZERO, %k0
> +       KMOV    %k0, %VRAX
> +       test    %VRAX, %VRAX
>         jnz     L(first_vec_x1)
>
> -       VPCMP   $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
> -       kmovd   %k0, %eax
> -       test    %eax, %eax
> +       VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %k0
> +       KMOV    %k0, %VRAX
> +       test    %VRAX, %VRAX
>         jnz     L(first_vec_x2)
>
> -       VPCMP   $0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
> -       kmovd   %k0, %eax
> -       testl   %eax, %eax
> +       VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %k0
> +       KMOV    %k0, %VRAX
> +       test    %VRAX, %VRAX
>         jnz     L(first_vec_x3)
>
> -       VPCMP   $0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0
> -       kmovd   %k0, %eax
> -       testl   %eax, %eax
> +       VPCMPEQ (VEC_SIZE * 4)(%rdi), %VZERO, %k0
> +       KMOV    %k0, %VRAX
> +       test    %VRAX, %VRAX
>         jnz     L(first_vec_x4)
>
> -       addq    $VEC_SIZE, %rdi
> -# ifdef USE_AS_STRNLEN
> -       /* Check if at last VEC_SIZE * 4 length.  */
> -       cmpq    $(CHAR_PER_VEC * 4 - 1), %rsi
> -       jbe     L(last_4x_vec_or_less_load)
> -       movl    %edi, %ecx
> -       andl    $(VEC_SIZE * 4 - 1), %ecx
> -#  ifdef USE_AS_WCSLEN
> -       /* NB: Divide bytes by 4 to get the wchar_t count.  */
> -       sarl    $2, %ecx
> -#  endif
> -       /* Readjust length.  */
> -       addq    %rcx, %rsi
> -# endif
> -       /* Align data to VEC_SIZE * 4.  */
> +       subq    $(VEC_SIZE * -1), %rdi
> +
> +# if CHAR_PER_VEC == 64
> +       /* No partial register stalls on processors that we use evex512
> +          on and this saves code size.  */
> +       xorb    %dil, %dil
> +# else
>         andq    $-(VEC_SIZE * 4), %rdi
> +# endif
> +
> +
>
>         /* Compare 4 * VEC at a time forward.  */
>         .p2align 4
>  L(loop_4x_vec):
> -       /* Load first VEC regardless.  */
> -       VMOVA   (VEC_SIZE * 4)(%rdi), %YMM1
> -# ifdef USE_AS_STRNLEN
> -       /* Break if at end of length.  */
> -       subq    $(CHAR_PER_VEC * 4), %rsi
> -       jb      L(last_4x_vec_or_less_cmpeq)
> -# endif
> -       /* Save some code size by microfusing VPMINU with the load. Since
> -          the matches in ymm2/ymm4 can only be returned if there where no
> -          matches in ymm1/ymm3 respectively there is no issue with overlap.
> -        */
> -       VPMINU  (VEC_SIZE * 5)(%rdi), %YMM1, %YMM2
> -       VMOVA   (VEC_SIZE * 6)(%rdi), %YMM3
> -       VPMINU  (VEC_SIZE * 7)(%rdi), %YMM3, %YMM4
> +       VMOVA   (VEC_SIZE * 4)(%rdi), %VMM(1)
> +       VPMINU  (VEC_SIZE * 5)(%rdi), %VMM(1), %VMM(2)
> +       VMOVA   (VEC_SIZE * 6)(%rdi), %VMM(3)
> +       VPMINU  (VEC_SIZE * 7)(%rdi), %VMM(3), %VMM(4)
> +       VPTESTN %VMM(2), %VMM(2), %k0
> +       VPTESTN %VMM(4), %VMM(4), %k2
>
> -       VPCMP   $0, %YMM2, %YMMZERO, %k0
> -       VPCMP   $0, %YMM4, %YMMZERO, %k1
>         subq    $-(VEC_SIZE * 4), %rdi
> -       kortestd        %k0, %k1
> +       KORTEST %k0, %k2
>         jz      L(loop_4x_vec)
>
> -       /* Check if end was in first half.  */
> -       kmovd   %k0, %eax
> -       subq    %rdx, %rdi
> -# ifdef USE_AS_WCSLEN
> -       shrq    $2, %rdi
> -# endif
> -       testl   %eax, %eax
> -       jz      L(second_vec_return)
> +       VPTESTN %VMM(1), %VMM(1), %k1
> +       KMOV    %k1, %VRAX
> +       test    %VRAX, %VRAX
> +       jnz     L(first_vec_x0)
>
> -       VPCMP   $0, %YMM1, %YMMZERO, %k2
> -       kmovd   %k2, %edx
> -       /* Combine VEC1 matches (edx) with VEC2 matches (eax).  */
> -# ifdef USE_AS_WCSLEN
> -       sall    $CHAR_PER_VEC, %eax
> -       orl     %edx, %eax
> -       tzcntl  %eax, %eax
> -# else
> -       salq    $CHAR_PER_VEC, %rax
> -       orq     %rdx, %rax
> -       tzcntq  %rax, %rax
> -# endif
> -       addq    %rdi, %rax
> -       ret
> -
> -
> -# ifdef USE_AS_STRNLEN
> -
> -L(last_4x_vec_or_less_load):
> -       /* Depending on entry adjust rdi / prepare first VEC in YMM1.  */
> -       VMOVA   (VEC_SIZE * 4)(%rdi), %YMM1
> -L(last_4x_vec_or_less_cmpeq):
> -       VPCMP   $0, %YMM1, %YMMZERO, %k0
> -       addq    $(VEC_SIZE * 3), %rdi
> -L(last_4x_vec_or_less):
> -       kmovd   %k0, %eax
> -       /* If remaining length > VEC_SIZE * 2. This works if esi is off by
> -          VEC_SIZE * 4.  */
> -       testl   $(CHAR_PER_VEC * 2), %esi
> -       jnz     L(last_4x_vec)
> -
> -       /* length may have been negative or positive by an offset of
> -          CHAR_PER_VEC * 4 depending on where this was called from. This
> -          fixes that.  */
> -       andl    $(CHAR_PER_VEC * 4 - 1), %esi
> -       testl   %eax, %eax
> -       jnz     L(last_vec_x1_check)
> +       KMOV    %k0, %VRAX
> +       test    %VRAX, %VRAX
> +       jnz     L(first_vec_x1)
>
> -       /* Check the end of data.  */
> -       subl    $CHAR_PER_VEC, %esi
> -       jb      L(max)
> +       VPTESTN %VMM(3), %VMM(3), %k0
>
> -       VPCMP   $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
> -       kmovd   %k0, %eax
> -       tzcntl  %eax, %eax
> -       /* Check the end of data.  */
> -       cmpl    %eax, %esi
> -       jb      L(max)
> -
> -       subq    %rdx, %rdi
> -#  ifdef USE_AS_WCSLEN
> -       /* NB: Divide bytes by 4 to get the wchar_t count.  */
> -       sarq    $2, %rdi
> -#  endif
> -       leaq    (CHAR_PER_VEC * 2)(%rdi, %rax), %rax
> -       ret
> -L(max):
> -       movq    %r8, %rax
> -       ret
> -# endif
> -
> -       /* Placed here in strnlen so that the jcc L(last_4x_vec_or_less)
> -          in the 4x VEC loop can use 2 byte encoding.  */
> -       .p2align 4
> -L(second_vec_return):
> -       VPCMP   $0, %YMM3, %YMMZERO, %k0
> -       /* Combine YMM3 matches (k0) with YMM4 matches (k1).  */
> -# ifdef USE_AS_WCSLEN
> -       kunpckbw        %k0, %k1, %k0
> -       kmovd   %k0, %eax
> -       tzcntl  %eax, %eax
> +# if CHAR_PER_VEC == 64
> +       KMOV    %k0, %VRAX
> +       test    %VRAX, %VRAX
> +       jnz     L(first_vec_x2)
> +       KMOV    %k2, %VRAX
>  # else
> -       kunpckdq        %k0, %k1, %k0
> -       kmovq   %k0, %rax
> -       tzcntq  %rax, %rax
> +       /* We can only combine last 2x VEC masks if CHAR_PER_VEC <= 32.
> +        */
> +       kmovd   %k2, %edx
> +       kmovd   %k0, %eax
> +       salq    $CHAR_PER_VEC, %rdx
> +       orq     %rdx, %rax
>  # endif
> -       leaq    (CHAR_PER_VEC * 2)(%rdi, %rax), %rax
> -       ret
>
> -
> -# ifdef USE_AS_STRNLEN
> -L(last_vec_x1_check):
> -       tzcntl  %eax, %eax
> -       /* Check the end of data.  */
> -       cmpl    %eax, %esi
> -       jb      L(max)
> -       subq    %rdx, %rdi
> -#  ifdef USE_AS_WCSLEN
> -       /* NB: Divide bytes by 4 to get the wchar_t count.  */
> -       sarq    $2, %rdi
> -#  endif
> -       leaq    (CHAR_PER_VEC)(%rdi, %rax), %rax
> +       /* first_vec_x3 for strlen-ZMM and first_vec_x2 for strlen-YMM.
> +        */
> +       .p2align 4,, 2
> +L(FALLTHROUGH_RETURN_LBL):
> +       bsfq    %rax, %rax
> +       subq    %rcx, %rdi
> +       CHAR_SIZE_SHIFT_REG (rdi)
> +       leaq    (FALLTHROUGH_RETURN_OFFSET)(%rdi, %rax), %rax
>         ret
>
> -       .p2align 4
> -L(last_4x_vec):
> -       /* Test first 2x VEC normally.  */
> -       testl   %eax, %eax
> -       jnz     L(last_vec_x1)
> -
> -       VPCMP   $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
> -       kmovd   %k0, %eax
> -       testl   %eax, %eax
> -       jnz     L(last_vec_x2)
> -
> -       /* Normalize length.  */
> -       andl    $(CHAR_PER_VEC * 4 - 1), %esi
> -       VPCMP   $0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
> -       kmovd   %k0, %eax
> -       testl   %eax, %eax
> -       jnz     L(last_vec_x3)
> -
> -       /* Check the end of data.  */
> -       subl    $(CHAR_PER_VEC * 3), %esi
> -       jb      L(max)
> -
> -       VPCMP   $0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0
> -       kmovd   %k0, %eax
> -       tzcntl  %eax, %eax
> -       /* Check the end of data.  */
> -       cmpl    %eax, %esi
> -       jb      L(max_end)
> -
> -       subq    %rdx, %rdi
> -#  ifdef USE_AS_WCSLEN
> -       /* NB: Divide bytes by 4 to get the wchar_t count.  */
> -       sarq    $2, %rdi
> -#  endif
> -       leaq    (CHAR_PER_VEC * 4)(%rdi, %rax), %rax
> +       .p2align 4,, 8
> +L(first_vec_x0):
> +       bsf     %VRAX, %VRAX
> +       sub     %rcx, %rdi
> +       CHAR_SIZE_SHIFT_REG (rdi)
> +       addq    %rdi, %rax
>         ret
>
> -       .p2align 4
> -L(last_vec_x1):
> -       tzcntl  %eax, %eax
> -       subq    %rdx, %rdi
> -#  ifdef USE_AS_WCSLEN
> -       /* NB: Divide bytes by 4 to get the wchar_t count.  */
> -       sarq    $2, %rdi
> -#  endif
> +       .p2align 4,, 10
> +L(first_vec_x1):
> +       bsf     %VRAX, %VRAX
> +       sub     %rcx, %rdi
> +       CHAR_SIZE_SHIFT_REG (rdi)
>         leaq    (CHAR_PER_VEC)(%rdi, %rax), %rax
>         ret
>
> -       .p2align 4
> -L(last_vec_x2):
> -       tzcntl  %eax, %eax
> -       subq    %rdx, %rdi
> -#  ifdef USE_AS_WCSLEN
> -       /* NB: Divide bytes by 4 to get the wchar_t count.  */
> -       sarq    $2, %rdi
> -#  endif
> -       leaq    (CHAR_PER_VEC * 2)(%rdi, %rax), %rax
> -       ret
> -
> -       .p2align 4
> -L(last_vec_x3):
> -       tzcntl  %eax, %eax
> -       subl    $(CHAR_PER_VEC * 2), %esi
> -       /* Check the end of data.  */
> -       cmpl    %eax, %esi
> -       jb      L(max_end)
> -       subq    %rdx, %rdi
> -#  ifdef USE_AS_WCSLEN
> -       /* NB: Divide bytes by 4 to get the wchar_t count.  */
> -       sarq    $2, %rdi
> -#  endif
> -       leaq    (CHAR_PER_VEC * 3)(%rdi, %rax), %rax
> -       ret
> -L(max_end):
> -       movq    %r8, %rax
> +       .p2align 4,, 10
> +       /* first_vec_x2 for strlen-ZMM and first_vec_x3 for strlen-YMM.
> +        */
> +L(TAIL_RETURN_LBL):
> +       bsf     %VRAX, %VRAX
> +       sub     %VRCX, %VRDI
> +       CHAR_SIZE_SHIFT_REG (VRDI)
> +       lea     (TAIL_RETURN_OFFSET)(%rdi, %rax), %VRAX
>         ret
> -# endif
>
> -       /* Cold case for crossing page with first load.  */
> -       .p2align 4
> +       .p2align 4,, 8
>  L(cross_page_boundary):
> -       movq    %rdi, %rdx
> +       movq    %rdi, %rcx
>         /* Align data to VEC_SIZE.  */
>         andq    $-VEC_SIZE, %rdi
> -       VPCMP   $0, (%rdi), %YMMZERO, %k0
> -       kmovd   %k0, %eax
> -       /* Remove the leading bytes.  */
> +
> +       VPCMPEQ (%rdi), %VZERO, %k0
> +
> +       KMOV    %k0, %VRAX
>  # ifdef USE_AS_WCSLEN
> -       /* NB: Divide shift count by 4 since each bit in K0 represent 4
> -          bytes.  */
> -       movl    %edx, %ecx
> -       shrl    $2, %ecx
> -       andl    $(CHAR_PER_VEC - 1), %ecx
> -# endif
> -       /* SHIFT_REG is ecx for USE_AS_WCSLEN and edx otherwise.  */
> -       sarxl   %SHIFT_REG, %eax, %eax
> +       movl    %ecx, %edx
> +       shrl    $2, %edx
> +       andl    $(CHAR_PER_VEC - 1), %edx
> +       shrx    %edx, %eax, %eax
>         testl   %eax, %eax
> -# ifndef USE_AS_STRNLEN
> -       jz      L(cross_page_continue)
> -       tzcntl  %eax, %eax
> -       ret
>  # else
> -       jnz     L(cross_page_less_vec)
> -#  ifndef USE_AS_WCSLEN
> -       movl    %edx, %ecx
> -       andl    $(CHAR_PER_VEC - 1), %ecx
> -#  endif
> -       movl    $CHAR_PER_VEC, %eax
> -       subl    %ecx, %eax
> -       /* Check the end of data.  */
> -       cmpq    %rax, %rsi
> -       ja      L(cross_page_continue)
> -       movl    %esi, %eax
> -       ret
> -L(cross_page_less_vec):
> -       tzcntl  %eax, %eax
> -       /* Select min of length and position of first null.  */
> -       cmpq    %rax, %rsi
> -       cmovb   %esi, %eax
> -       ret
> +       shr     %cl, %VRAX
>  # endif
> +       jz      L(cross_page_continue)
> +       bsf     %VRAX, %VRAX
> +       ret
>
>  END (STRLEN)
>  #endif
> diff --git a/sysdeps/x86_64/multiarch/strnlen-evex.S b/sysdeps/x86_64/multiarch/strnlen-evex.S
> index 64a9fc2606..443a32a749 100644
> --- a/sysdeps/x86_64/multiarch/strnlen-evex.S
> +++ b/sysdeps/x86_64/multiarch/strnlen-evex.S
> @@ -1,8 +1,423 @@
> -#ifndef STRNLEN
> -# define STRNLEN __strnlen_evex
> -#endif
> +/* strnlen/wcsnlen optimized with 256-bit EVEX instructions.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <isa-level.h>
> +#include <sysdep.h>
> +
> +#if ISA_SHOULD_BUILD (4)
> +
> +# ifndef VEC_SIZE
> +#  include "x86-evex256-vecs.h"
> +# endif
> +
> +
> +# ifndef STRNLEN
> +#  define STRNLEN      __strnlen_evex
> +# endif
> +
> +# ifdef USE_AS_WCSLEN
> +#  define VPCMPEQ      vpcmpeqd
> +#  define VPCMPNEQ     vpcmpneqd
> +#  define VPTESTN      vptestnmd
> +#  define VPTEST       vptestmd
> +#  define VPMINU       vpminud
> +#  define CHAR_SIZE    4
> +
> +# else
> +#  define VPCMPEQ      vpcmpeqb
> +#  define VPCMPNEQ     vpcmpneqb
> +#  define VPTESTN      vptestnmb
> +#  define VPTEST       vptestmb
> +#  define VPMINU       vpminub
> +#  define CHAR_SIZE    1
> +
> +#  define REG_WIDTH    VEC_SIZE
> +# endif
> +
> +# define CHAR_PER_VEC  (VEC_SIZE / CHAR_SIZE)
> +
> +# include "reg-macros.h"
> +
> +# if CHAR_PER_VEC == 32
> +#  define SUB_SHORT(imm, reg)  subb $(imm), %VGPR_SZ(reg, 8)
> +# else
> +#  define SUB_SHORT(imm, reg)  subl $(imm), %VGPR_SZ(reg, 32)
> +# endif
> +
> +
> +
> +# if CHAR_PER_VEC == 64
> +#  define FALLTHROUGH_RETURN_OFFSET    (CHAR_PER_VEC * 3)
> +# else
> +#  define FALLTHROUGH_RETURN_OFFSET    (CHAR_PER_VEC * 2)
> +# endif
> +
> +
> +# define XZERO VMM_128(0)
> +# define VZERO VMM(0)
> +# define PAGE_SIZE     4096
> +
> +       .section SECTION(.text), "ax", @progbits
> +ENTRY_P2ALIGN (STRNLEN, 6)
> +       /* Check zero length.  */
> +       test    %RSI_LP, %RSI_LP
> +       jz      L(zero)
> +# ifdef __ILP32__
> +       /* Clear the upper 32 bits.  */
> +       movl    %esi, %esi
> +# endif
> +
> +       movl    %edi, %eax
> +       vpxorq  %XZERO, %XZERO, %XZERO
> +       andl    $(PAGE_SIZE - 1), %eax
> +       cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
> +       ja      L(cross_page_boundary)
> +
> +       /* Check the first VEC_SIZE bytes.  Each bit in K0 represents a
> +          null byte.  */
> +       VPCMPEQ (%rdi), %VZERO, %k0
> +
> +       KMOV    %k0, %VRCX
> +       movq    %rsi, %rax
> +
> +       /* If src (rcx) is zero, bsf does not change the result.  NB:
> +          Must use 64-bit bsf here so that upper bits of len are not
> +          cleared.  */
> +       bsfq    %rcx, %rax
> +       /* If rax > CHAR_PER_VEC then rcx must have been zero (no null
> +          CHAR) and rsi must be > CHAR_PER_VEC.  */
> +       cmpq    $CHAR_PER_VEC, %rax
> +       ja      L(more_1x_vec)
> +       /* Check if first match in bounds.  */
> +       cmpq    %rax, %rsi
> +       cmovb   %esi, %eax
> +       ret
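
Side note for readers following along: a rough C model of this
first-vector path, with illustrative names (this is not glibc code).
`mask' stands in for the VPCMPEQ/KMOV result, one bit per zero CHAR,
and __builtin_ctzll models bsf.  The trick is that bsf leaves its
destination unchanged for a zero source, so preloading maxlen yields
"position of first null, else maxlen" without a branch:

#include <stddef.h>
#include <stdint.h>

static size_t
first_vec_len (uint64_t mask, size_t maxlen, size_t chars_per_vec,
               int *need_more_vecs)
{
  size_t len = maxlen;                  /* movq  %rsi, %rax */
  if (mask != 0)                        /* bsfq: no-op when mask == 0 */
    len = (size_t) __builtin_ctzll (mask);
  /* len > chars_per_vec is only possible if mask was 0 and
     maxlen > chars_per_vec, i.e. the search must continue.  */
  *need_more_vecs = len > chars_per_vec;
  /* Otherwise clamp the match position to maxlen.  */
  return maxlen < len ? maxlen : len;
}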
> +
> +
> +# if CHAR_PER_VEC != 32
> +       .p2align 4,, 2
> +L(zero):
> +L(max_0):
> +       movl    %esi, %eax
> +       ret
> +# endif
> +
> +       /* Aligned more aggressively because strnlen compares the
> +          remaining length against 2 * CHAR_PER_VEC, 4 * CHAR_PER_VEC,
> +          and 8 * CHAR_PER_VEC before reaching the loop.  */
> +       .p2align 4,, 10
> +L(more_1x_vec):
> +L(cross_page_continue):
> +       /* Compute number of words checked after aligning.  */
> +# ifdef USE_AS_WCSLEN
> +       /* Need to compute directly for wcslen as CHAR_SIZE * rsi can
> +          overflow.  */
> +       movq    %rdi, %rax
> +       andq    $(VEC_SIZE * -1), %rdi
> +       subq    %rdi, %rax
> +       sarq    $2, %rax
> +       leaq    -(CHAR_PER_VEC * 1)(%rax, %rsi), %rax
> +# else
> +       leaq    (VEC_SIZE * -1)(%rsi, %rdi), %rax
> +       andq    $(VEC_SIZE * -1), %rdi
> +       subq    %rdi, %rax
> +# endif
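
To spell out the overflow concern in the comment above: wcsnlen may
legally be called with maxlen up to SIZE_MAX, so maxlen * CHAR_SIZE
can wrap a 64-bit register.  A sketch of what the wide-char branch
computes instead, keeping everything in character units (hypothetical
helper, not the actual code; vec_size must be a power of two):

#include <stddef.h>
#include <stdint.h>

static size_t
chars_remaining (uintptr_t src, size_t maxlen,
                 size_t vec_size, size_t char_size)
{
  uintptr_t aligned = src & ~(uintptr_t) (vec_size - 1);
  size_t misalign_chars = (src - aligned) / char_size;  /* sarq $2 */
  /* Matches the lea: wraps modulo 2^64 exactly like the asm.  */
  return maxlen + misalign_chars - vec_size / char_size;
}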
> +
> +
> +       VPCMPEQ VEC_SIZE(%rdi), %VZERO, %k0
> +
> +       cmpq    $(CHAR_PER_VEC * 2), %rax
> +       ja      L(more_2x_vec)
> +
> +L(last_2x_vec_or_less):
> +       KMOV    %k0, %VRDX
> +       test    %VRDX, %VRDX
> +       jnz     L(last_vec_check)
> +
> +       /* Check the end of data.  */
> +       SUB_SHORT (CHAR_PER_VEC, rax)
> +       jbe     L(max_0)
> +       VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %k0
> +       KMOV    %k0, %VRDX
> +       test    %VRDX, %VRDX
> +       jz      L(max_0)
> +       /* Best place for LAST_VEC_CHECK if ZMM.  */
> +       .p2align 4,, 8
> +L(last_vec_check):
> +       bsf     %VRDX, %VRDX
> +       sub     %eax, %edx
> +       lea     (%rsi, %rdx), %eax
> +       cmovae  %esi, %eax
> +       ret
> +
> +# if CHAR_PER_VEC == 32
> +       .p2align 4,, 2
> +L(zero):
> +L(max_0):
> +       movl    %esi, %eax
> +       ret
> +# endif
> +
> +       .p2align 4,, 8
> +L(last_4x_vec_or_less):
> +       addl    $(CHAR_PER_VEC * -4), %eax
> +       VPCMPEQ (VEC_SIZE * 5)(%rdi), %VZERO, %k0
> +       subq    $(VEC_SIZE * -4), %rdi
> +       cmpl    $(CHAR_PER_VEC * 2), %eax
> +       jbe     L(last_2x_vec_or_less)
> +
> +       .p2align 4,, 6
> +L(more_2x_vec):
> +       /* Remaining length >= 2 * CHAR_PER_VEC so do VEC0/VEC1 without
> +          rechecking bounds.  */
>
> -#define USE_AS_STRNLEN 1
> -#define STRLEN STRNLEN
> +       KMOV    %k0, %VRDX
>
> -#include "strlen-evex.S"
> +       test    %VRDX, %VRDX
> +       jnz     L(first_vec_x1)
> +
> +       VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %k0
> +       KMOV    %k0, %VRDX
> +       test    %VRDX, %VRDX
> +       jnz     L(first_vec_x2)
> +
> +       cmpq    $(CHAR_PER_VEC * 4), %rax
> +       ja      L(more_4x_vec)
> +
> +
> +       VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %k0
> +       KMOV    %k0, %VRDX
> +       addl    $(CHAR_PER_VEC * -2), %eax
> +       test    %VRDX, %VRDX
> +       jnz     L(last_vec_check)
> +
> +       subl    $(CHAR_PER_VEC), %eax
> +       jbe     L(max_1)
> +
> +       VPCMPEQ (VEC_SIZE * 4)(%rdi), %VZERO, %k0
> +       KMOV    %k0, %VRDX
> +
> +       test    %VRDX, %VRDX
> +       jnz     L(last_vec_check)
> +L(max_1):
> +       movl    %esi, %eax
> +       ret
> +
> +       .p2align 4,, 3
> +L(first_vec_x2):
> +# if VEC_SIZE == 64
> +       /* If VEC_SIZE == 64 we can fit logic for full return label in
> +          spare bytes before next cache line.  */
> +       bsf     %VRDX, %VRDX
> +       sub     %eax, %esi
> +       leal    (CHAR_PER_VEC * 1)(%rsi, %rdx), %eax
> +       ret
> +       .p2align 4,, 6
> +# else
> +       addl    $CHAR_PER_VEC, %esi
> +# endif
> +L(first_vec_x1):
> +       bsf     %VRDX, %VRDX
> +       sub     %eax, %esi
> +       leal    (CHAR_PER_VEC * 0)(%rsi, %rdx), %eax
> +       ret
> +
> +
> +       .p2align 4,, 6
> +L(first_vec_x4):
> +# if VEC_SIZE == 64
> +       /* If VEC_SIZE == 64 we can fit logic for full return label in
> +          spare bytes before next cache line.  */
> +       bsf     %VRDX, %VRDX
> +       sub     %eax, %esi
> +       leal    (CHAR_PER_VEC * 3)(%rsi, %rdx), %eax
> +       ret
> +       .p2align 4,, 6
> +# else
> +       addl    $CHAR_PER_VEC, %esi
> +# endif
> +L(first_vec_x3):
> +       bsf     %VRDX, %VRDX
> +       sub     %eax, %esi
> +       leal    (CHAR_PER_VEC * 2)(%rsi, %rdx), %eax
> +       ret
> +
> +       .p2align 4,, 5
> +L(more_4x_vec):
> +       VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %k0
> +       KMOV    %k0, %VRDX
> +       test    %VRDX, %VRDX
> +       jnz     L(first_vec_x3)
> +
> +       VPCMPEQ (VEC_SIZE * 4)(%rdi), %VZERO, %k0
> +       KMOV    %k0, %VRDX
> +       test    %VRDX, %VRDX
> +       jnz     L(first_vec_x4)
> +
> +       /* Check if within the last VEC_SIZE * 4 of the length before
> +          aligning for the loop.  */
> +       cmpq    $(CHAR_PER_VEC * 8), %rax
> +       jbe     L(last_4x_vec_or_less)
> +
> +
> +       /* Compute number of words checked after aligning.  */
> +# ifdef USE_AS_WCSLEN
> +       /* Need to compute directly for wcslen as CHAR_SIZE * rsi can
> +          overflow.  */
> +       leaq    (VEC_SIZE * -3)(%rdi), %rdx
> +# else
> +       leaq    (VEC_SIZE * -3)(%rdi, %rax), %rax
> +# endif
> +
> +       subq    $(VEC_SIZE * -1), %rdi
> +
> +       /* Align data to VEC_SIZE * 4.  */
> +# if VEC_SIZE == 64
> +       /* Saves code size.  No evex512 processor has partial register
> +          stalls.  If that changes, this can be replaced with `andq
> +          $-(VEC_SIZE * 4), %rdi`.  */
> +       xorb    %dil, %dil
> +# else
> +       andq    $-(VEC_SIZE * 4), %rdi
> +# endif
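
A note on the xorb: %rdi was aligned to VEC_SIZE earlier on this
path, so with VEC_SIZE == 64 its low byte is one of 0/64/128/192.
Zeroing that byte therefore rounds down to a 256-byte boundary,
exactly what `andq $-(VEC_SIZE * 4), %rdi` would do, in a shorter
encoding.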
> +
> +# ifdef USE_AS_WCSLEN
> +       subq    %rdi, %rdx
> +       sarq    $2, %rdx
> +       addq    %rdx, %rax
> +# else
> +       subq    %rdi, %rax
> +# endif
> +       /* Compare 4 * VEC at a time forward.  */
> +       .p2align 4,, 11
> +L(loop_4x_vec):
> +       VMOVA   (VEC_SIZE * 4)(%rdi), %VMM(1)
> +       VPMINU  (VEC_SIZE * 5)(%rdi), %VMM(1), %VMM(2)
> +       VMOVA   (VEC_SIZE * 6)(%rdi), %VMM(3)
> +       VPMINU  (VEC_SIZE * 7)(%rdi), %VMM(3), %VMM(4)
> +       VPTESTN %VMM(2), %VMM(2), %k0
> +       VPTESTN %VMM(4), %VMM(4), %k2
> +       subq    $-(VEC_SIZE * 4), %rdi
> +       /* Break if at end of length.  */
> +       subq    $(CHAR_PER_VEC * 4), %rax
> +       jbe     L(loop_len_end)
> +
> +
> +       KORTEST %k0, %k2
> +       jz      L(loop_4x_vec)
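
The VPMINU folding above is what lets two VPTESTN checks cover four
vectors: min(a, b) has a zero CHAR exactly when a or b does, so k0
and k2 between them witness a null anywhere in the 4x group.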
> +
> +
> +L(loop_last_4x_vec):
> +       movq    %rsi, %rcx
> +       subq    %rax, %rsi
> +       VPTESTN %VMM(1), %VMM(1), %k1
> +       KMOV    %k1, %VRDX
> +       test    %VRDX, %VRDX
> +       jnz     L(last_vec_x0)
> +
> +       KMOV    %k0, %VRDX
> +       test    %VRDX, %VRDX
> +       jnz     L(last_vec_x1)
> +
> +       VPTESTN %VMM(3), %VMM(3), %k0
> +
> +       /* Separate logic for VEC_SIZE == 64 and VEC_SIZE == 32 for
> +          returning the last 2x VEC.  For VEC_SIZE == 64 we test each
> +          VEC individually; for VEC_SIZE == 32 we combine them in a
> +          single 64-bit GPR.  */
> +# if CHAR_PER_VEC == 64
> +       KMOV    %k0, %VRDX
> +       test    %VRDX, %VRDX
> +       jnz     L(last_vec_x2)
> +       KMOV    %k2, %VRDX
> +# else
> +       /* We can only combine last 2x VEC masks if CHAR_PER_VEC <= 32.
> +        */
> +       kmovd   %k2, %edx
> +       kmovd   %k0, %eax
> +       salq    $CHAR_PER_VEC, %rdx
> +       orq     %rax, %rdx
> +# endif
> +
> +       /* first_vec_x3 for strlen-ZMM and first_vec_x2 for strlen-YMM.
> +        */
> +       bsfq    %rdx, %rdx
> +       leaq    (FALLTHROUGH_RETURN_OFFSET - CHAR_PER_VEC * 4)(%rsi, %rdx), %rax
> +       cmpq    %rax, %rcx
> +       cmovb   %rcx, %rax
> +       ret
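
A sketch of the CHAR_PER_VEC <= 32 combining step (illustrative
names, not the actual code): the two 32-bit zero-masks are glued into
one 64-bit word so that a single bsf finds the earliest null across
both vectors.  A null in the earlier vector also sets a bit in the
min(vec3, vec4) mask, but its bit in the low half is smaller, so bsf
still reports the right position:

#include <stdint.h>

/* Assumes a null exists somewhere in the last two vectors, as the
   asm has already established via KORTEST.  */
static unsigned
first_null_last_2x (uint32_t mask_vec3, uint32_t mask_min_vec34)
{
  uint64_t both = ((uint64_t) mask_min_vec34 << 32) | mask_vec3;
  return (unsigned) __builtin_ctzll (both);   /* bsfq %rdx, %rdx */
}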
> +
> +       /* Handle last 4x VEC after loop. All VECs have been loaded.  */
> +       .p2align 4,, 4
> +L(loop_len_end):
> +       KORTEST %k0, %k2
> +       jnz     L(loop_last_4x_vec)
> +       movq    %rsi, %rax
> +       ret
> +
> +
> +# if CHAR_PER_VEC == 64
> +       /* Since we can't combine the last 2x VEC for VEC_SIZE == 64,
> +          we need a separate return label for it.  */
> +       .p2align 4,, 8
> +L(last_vec_x2):
> +       bsf     %VRDX, %VRDX
> +       leaq    (CHAR_PER_VEC * -2)(%rsi, %rdx), %rax
> +       cmpq    %rax, %rcx
> +       cmovb   %rcx, %rax
> +       ret
> +# endif
> +
> +
> +       .p2align 4,, 10
> +L(last_vec_x1):
> +       addq    $CHAR_PER_VEC, %rsi
> +L(last_vec_x0):
> +       bsf     %VRDX, %VRDX
> +       leaq    (CHAR_PER_VEC * -4)(%rsi, %rdx), %rax
> +       cmpq    %rax, %rcx
> +       cmovb   %rcx, %rax
> +       ret
> +
> +
> +       .p2align 4,, 8
> +L(cross_page_boundary):
> +       /* Align data to VEC_SIZE.  */
> +       movq    %rdi, %rcx
> +       andq    $-VEC_SIZE, %rcx
> +       VPCMPEQ (%rcx), %VZERO, %k0
> +
> +       KMOV    %k0, %VRCX
> +# ifdef USE_AS_WCSLEN
> +       shrl    $2, %eax
> +       andl    $(CHAR_PER_VEC - 1), %eax
> +# endif
> +       shrx    %VRAX, %VRCX, %VRCX
> +
> +       negl    %eax
> +       andl    $(CHAR_PER_VEC - 1), %eax
> +       movq    %rsi, %rdx
> +       bsf     %VRCX, %VRDX
> +       cmpq    %rax, %rdx
> +       ja      L(cross_page_continue)
> +       movl    %edx, %eax
> +       cmpq    %rdx, %rsi
> +       cmovb   %esi, %eax
> +       ret
> +END (STRNLEN)
> +#endif
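
The page-cross entry above reuses the bsf-preload idea from the start
of the function.  A rough C model (again illustrative, with the mask
standing in for VPCMPEQ/KMOV): compare the whole aligned vector,
shift off the CHARs that precede the real start, then clamp against
both the end of the vector and maxlen:

#include <stddef.h>
#include <stdint.h>

static size_t
cross_page_len (uint64_t mask, unsigned misalign_chars, size_t maxlen,
                unsigned chars_per_vec, int *keep_going)
{
  mask >>= misalign_chars;      /* shrx: drop CHARs before the start */
  /* Number of CHARs of this vector that belong to the string.  */
  unsigned valid = (0u - misalign_chars) & (chars_per_vec - 1);
  size_t pos = maxlen;          /* bsf keeps maxlen if mask == 0 */
  if (mask != 0)
    pos = (size_t) __builtin_ctzll (mask);
  /* Null (if any) lies beyond this vector: take the aligned path;
     the return value is unused in that case.  */
  *keep_going = pos > valid;
  return maxlen < pos ? maxlen : pos;
}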
> diff --git a/sysdeps/x86_64/multiarch/wcsnlen-evex.S b/sysdeps/x86_64/multiarch/wcsnlen-evex.S
> index e2aad94c1e..57a7e93fbf 100644
> --- a/sysdeps/x86_64/multiarch/wcsnlen-evex.S
> +++ b/sysdeps/x86_64/multiarch/wcsnlen-evex.S
> @@ -2,8 +2,7 @@
>  # define WCSNLEN       __wcsnlen_evex
>  #endif
>
> -#define STRLEN WCSNLEN
> +#define STRNLEN        WCSNLEN
>  #define USE_AS_WCSLEN 1
> -#define USE_AS_STRNLEN 1
>
> -#include "strlen-evex.S"
> +#include "strnlen-evex.S"
> --
> 2.34.1
>

[-- Attachment #2: strlen.txt --]
[-- Type: text/plain, Size: 5792 bytes --]

Results For: strlen
alignment,length ,__strlen_evex ,__strlen_evex_orig ,new/orig 
0        ,0      ,2.789         ,2.836              ,0.983 
0        ,1      ,2.576         ,2.59               ,0.995 
0        ,1024   ,18.366        ,18.987             ,0.967 
0        ,1152   ,19.69         ,20.571             ,0.957 
0        ,128    ,5.532         ,5.481              ,1.009 
0        ,1280   ,21.278        ,22.211             ,0.958 
0        ,1408   ,22.981        ,23.668             ,0.971 
0        ,1536   ,25.244        ,24.822             ,1.017 
0        ,16     ,2.832         ,2.832              ,1.0   
0        ,160    ,8.36          ,8.71               ,0.96  
0        ,1664   ,26.608        ,26.666             ,0.998 
0        ,1792   ,28.21         ,28.953             ,0.974 
0        ,192    ,9.399         ,8.475              ,1.109 
0        ,1920   ,29.609        ,30.389             ,0.974 
0        ,2      ,3.652         ,3.779              ,0.966 
0        ,2048   ,31.087        ,32.884             ,0.945 
0        ,224    ,9.305         ,8.356              ,1.114 
0        ,2304   ,34.284        ,35.183             ,0.974 
0        ,256    ,9.083         ,10.019             ,0.907 
0        ,2560   ,36.909        ,40.442             ,0.913 
0        ,2816   ,43.14         ,48.723             ,0.885 
0        ,288    ,9.432         ,9.851              ,0.957 
0        ,3      ,2.636         ,2.608              ,1.011 
0        ,3072   ,58.749        ,66.729             ,0.88  
0        ,32     ,4.239         ,4.272              ,0.992 
0        ,320    ,10.685        ,9.969              ,1.072 
0        ,3328   ,69.222        ,68.331             ,1.013 
0        ,352    ,10.704        ,9.7                ,1.104 
0        ,3584   ,72.488        ,72.329             ,1.002 
0        ,384    ,10.635        ,11.528             ,0.923 
0        ,3840   ,74.933        ,76.039             ,0.985 
0        ,4      ,2.777         ,2.743              ,1.013 
0        ,4096   ,79.241        ,77.521             ,1.022 
0        ,416    ,11.036        ,11.535             ,0.957 
0        ,448    ,12.466        ,11.544             ,1.08  
0        ,4608   ,84.571        ,84.503             ,1.001 
0        ,480    ,12.479        ,11.472             ,1.088 
0        ,5      ,2.923         ,2.784              ,1.05  
0        ,512    ,12.12         ,12.888             ,0.94  
0        ,5120   ,91.334        ,91.435             ,0.999 
0        ,5632   ,98.695        ,95.914             ,1.029 
0        ,576    ,13.732        ,12.493             ,1.099 
0        ,6      ,2.928         ,2.75               ,1.064 
0        ,6144   ,104.673       ,102.746            ,1.019 
0        ,64     ,4.372         ,4.281              ,1.021 
0        ,640    ,13.884        ,14.217             ,0.977 
0        ,6656   ,112.122       ,110.392            ,1.016 
0        ,7      ,2.798         ,2.763              ,1.012 
0        ,704    ,15.31         ,14.697             ,1.042 
0        ,7168   ,117.652       ,114.757            ,1.025 
0        ,768    ,15.406        ,16.286             ,0.946 
0        ,7680   ,122.809       ,121.845            ,1.008 
0        ,8      ,2.83          ,2.818              ,1.004 
0        ,832    ,17.179        ,16.597             ,1.035 
0        ,896    ,16.906        ,17.978             ,0.94  
0        ,96     ,4.933         ,4.884              ,1.01  
0        ,960    ,18.548        ,18.041             ,1.028 
1        ,1      ,2.594         ,2.619              ,0.991 
10       ,1024   ,18.161        ,19.003             ,0.956 
10       ,682    ,14.286        ,14.158             ,1.009 
11       ,1365   ,23.596        ,21.917             ,1.077 
11       ,2048   ,31.044        ,32.299             ,0.961 
12       ,2730   ,50.067        ,52.292             ,0.957 
12       ,4096   ,79.161        ,78.804             ,1.005 
2        ,2      ,3.055         ,3.22               ,0.949 
2        ,4      ,2.818         ,2.836              ,0.994 
3        ,3      ,2.699         ,2.896              ,0.932 
3        ,5      ,2.843         ,2.852              ,0.997 
3        ,8      ,2.837         ,2.839              ,0.999 
4        ,10     ,2.84          ,2.825              ,1.005 
4        ,16     ,2.811         ,2.826              ,0.994 
4        ,4      ,2.715         ,2.714              ,1.0   
5        ,21     ,2.782         ,2.824              ,0.985 
5        ,32     ,4.189         ,4.222              ,0.992 
5        ,5      ,2.721         ,2.701              ,1.007 
6        ,42     ,4.295         ,4.211              ,1.02  
6        ,6      ,2.775         ,2.81               ,0.988 
6        ,64     ,4.224         ,4.27               ,0.989 
7        ,1024   ,18.286        ,18.987             ,0.963 
7        ,128    ,5.4           ,5.343              ,1.011 
7        ,16     ,2.846         ,2.836              ,1.003 
7        ,2048   ,31.003        ,32.319             ,0.959 
7        ,256    ,9.04          ,9.946              ,0.909 
7        ,32     ,4.219         ,4.218              ,1.0   
7        ,4      ,2.909         ,2.906              ,1.001 
7        ,4096   ,79.073        ,78.896             ,1.002 
7        ,512    ,12.178        ,12.742             ,0.956 
7        ,64     ,4.368         ,4.519              ,0.967 
7        ,7      ,2.762         ,2.771              ,0.997 
7        ,8      ,2.867         ,2.839              ,1.01  
7        ,85     ,4.187         ,4.336              ,0.966 
8        ,170    ,7.993         ,8.372              ,0.955 
8        ,256    ,9.016         ,9.91               ,0.91  
9        ,341    ,10.593        ,9.577              ,1.106 
9        ,512    ,11.939        ,12.694             ,0.941 
geomean (new/orig): 0.9925909850217739
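
For reference, one way to recompute that trailing geomean from the
new/orig column, as exp of the mean of logs (plain C, link with -lm):

#include <math.h>
#include <stddef.h>

static double
geomean (const double *ratios, size_t n)
{
  double acc = 0.0;
  for (size_t i = 0; i < n; i++)
    acc += log (ratios[i]);
  return exp (acc / n);
}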

[-- Attachment #3: strnlen.txt --]
[-- Type: text/plain, Size: 95789 bytes --]

Results For: strnlen
align,len  ,max_char ,maxlen ,__strnlen_evex ,__strnlen_evex_orig ,new/orig 
0    ,1    ,127      ,0      ,8.826          ,10.545              ,0.837 
0    ,1    ,127      ,1      ,8.36           ,9.794               ,0.854 
0    ,1    ,127      ,128    ,8.707          ,8.733               ,0.997 
0    ,1    ,127      ,2      ,8.43           ,9.042               ,0.932 
0    ,1    ,127      ,5000   ,8.226          ,8.442               ,0.974 
0    ,1024 ,127      ,1024   ,50.898         ,54.809              ,0.929 
0    ,1024 ,127      ,1056   ,61.814         ,56.289              ,1.098 
0    ,1024 ,127      ,1088   ,61.941         ,57.059              ,1.086 
0    ,1024 ,127      ,1120   ,61.708         ,57.166              ,1.079 
0    ,1024 ,127      ,1152   ,61.88          ,57.664              ,1.073 
0    ,1024 ,127      ,1184   ,62.084         ,60.571              ,1.025 
0    ,1024 ,127      ,1216   ,61.799         ,60.38               ,1.023 
0    ,1024 ,127      ,1248   ,61.836         ,60.313              ,1.025 
0    ,1024 ,127      ,1280   ,61.829         ,60.038              ,1.03  
0    ,1024 ,127      ,1312   ,61.932         ,60.317              ,1.027 
0    ,1024 ,127      ,1344   ,61.726         ,60.014              ,1.029 
0    ,1024 ,127      ,1376   ,62.018         ,60.242              ,1.029 
0    ,1024 ,127      ,1408   ,61.944         ,60.107              ,1.031 
0    ,1024 ,127      ,1440   ,61.799         ,59.875              ,1.032 
0    ,1024 ,127      ,1472   ,61.891         ,60.589              ,1.021 
0    ,1024 ,127      ,1504   ,61.95          ,59.84               ,1.035 
0    ,1024 ,127      ,1536   ,61.757         ,59.769              ,1.033 
0    ,1024 ,127      ,1568   ,61.685         ,60.345              ,1.022 
0    ,1024 ,127      ,1600   ,61.986         ,60.672              ,1.022 
0    ,1024 ,127      ,1632   ,61.845         ,60.189              ,1.028 
0    ,1024 ,127      ,1664   ,61.971         ,61.093              ,1.014 
0    ,1024 ,127      ,1696   ,61.855         ,60.162              ,1.028 
0    ,1024 ,127      ,1728   ,63.386         ,59.919              ,1.058 
0    ,1024 ,127      ,320    ,26.779         ,30.15               ,0.888 
0    ,1024 ,127      ,352    ,26.779         ,30.898              ,0.867 
0    ,1024 ,127      ,384    ,26.768         ,32.851              ,0.815 
0    ,1024 ,127      ,416    ,31.668         ,31.878              ,0.993 
0    ,1024 ,127      ,448    ,31.654         ,33.63               ,0.941 
0    ,1024 ,127      ,480    ,31.685         ,34.387              ,0.921 
0    ,1024 ,127      ,5000   ,61.853         ,62.0                ,0.998 
0    ,1024 ,127      ,512    ,31.67          ,37.012              ,0.856 
0    ,1024 ,127      ,544    ,36.553         ,37.076              ,0.986 
0    ,1024 ,127      ,576    ,36.533         ,38.968              ,0.938 
0    ,1024 ,127      ,608    ,36.527         ,40.962              ,0.892 
0    ,1024 ,127      ,640    ,36.512         ,41.935              ,0.871 
0    ,1024 ,127      ,672    ,41.601         ,40.159              ,1.036 
0    ,1024 ,127      ,704    ,43.111         ,43.128              ,1.0   
0    ,1024 ,127      ,736    ,41.645         ,44.285              ,0.94  
0    ,1024 ,127      ,768    ,41.631         ,46.597              ,0.893 
0    ,1024 ,127      ,800    ,46.671         ,46.504              ,1.004 
0    ,1024 ,127      ,832    ,46.815         ,47.772              ,0.98  
0    ,1024 ,127      ,864    ,46.688         ,51.689              ,0.903 
0    ,1024 ,127      ,896    ,46.743         ,52.56               ,0.889 
0    ,1024 ,127      ,928    ,51.212         ,51.64               ,0.992 
0    ,1024 ,127      ,960    ,51.243         ,53.334              ,0.961 
0    ,1024 ,127      ,992    ,51.256         ,54.768              ,0.936 
0    ,1056 ,127      ,1024   ,51.215         ,55.52               ,0.922 
0    ,1056 ,127      ,512    ,31.646         ,36.902              ,0.858 
0    ,1088 ,127      ,1024   ,51.259         ,56.534              ,0.907 
0    ,1088 ,127      ,512    ,31.647         ,36.903              ,0.858 
0    ,112  ,127      ,16     ,8.512          ,9.287               ,0.917 
0    ,1120 ,127      ,1024   ,51.303         ,55.574              ,0.923 
0    ,1120 ,127      ,512    ,31.644         ,37.558              ,0.843 
0    ,1152 ,127      ,1024   ,51.252         ,56.372              ,0.909 
0    ,1152 ,127      ,512    ,31.647         ,37.888              ,0.835 
0    ,1184 ,127      ,1024   ,51.223         ,56.414              ,0.908 
0    ,1184 ,127      ,512    ,31.635         ,36.859              ,0.858 
0    ,1216 ,127      ,1024   ,51.243         ,55.82               ,0.918 
0    ,1216 ,127      ,512    ,31.66          ,36.881              ,0.858 
0    ,1248 ,127      ,1024   ,51.211         ,55.607              ,0.921 
0    ,128  ,127      ,1      ,8.815          ,8.894               ,0.991 
0    ,128  ,127      ,128    ,15.165         ,17.562              ,0.863 
0    ,128  ,127      ,160    ,18.865         ,20.212              ,0.933 
0    ,128  ,127      ,192    ,17.618         ,16.757              ,1.051 
0    ,128  ,127      ,224    ,17.609         ,16.766              ,1.05  
0    ,128  ,127      ,256    ,17.597         ,16.589              ,1.061 
0    ,128  ,127      ,288    ,17.592         ,17.272              ,1.019 
0    ,128  ,127      ,32     ,8.262          ,8.96                ,0.922 
0    ,128  ,127      ,320    ,17.6           ,16.518              ,1.065 
0    ,128  ,127      ,352    ,17.601         ,16.965              ,1.037 
0    ,128  ,127      ,384    ,17.595         ,16.917              ,1.04  
0    ,128  ,127      ,416    ,17.608         ,16.805              ,1.048 
0    ,128  ,127      ,448    ,17.599         ,17.616              ,0.999 
0    ,128  ,127      ,480    ,17.604         ,16.925              ,1.04  
0    ,128  ,127      ,5000   ,17.6           ,17.169              ,1.025 
0    ,128  ,127      ,512    ,17.617         ,16.877              ,1.044 
0    ,128  ,127      ,544    ,17.618         ,16.679              ,1.056 
0    ,128  ,127      ,576    ,17.588         ,17.283              ,1.018 
0    ,128  ,127      ,608    ,17.611         ,17.113              ,1.029 
0    ,128  ,127      ,64     ,11.588         ,16.35               ,0.709 
0    ,128  ,127      ,640    ,17.596         ,16.752              ,1.05  
0    ,128  ,127      ,672    ,17.606         ,16.778              ,1.049 
0    ,128  ,127      ,704    ,17.591         ,17.232              ,1.021 
0    ,128  ,127      ,736    ,17.605         ,16.987              ,1.036 
0    ,128  ,127      ,768    ,17.619         ,17.879              ,0.985 
0    ,128  ,127      ,800    ,17.605         ,17.371              ,1.013 
0    ,128  ,127      ,832    ,17.603         ,16.967              ,1.037 
0    ,128  ,127      ,96     ,12.339         ,16.454              ,0.75  
0    ,1280 ,127      ,1024   ,51.193         ,55.361              ,0.925 
0    ,1312 ,127      ,1024   ,51.2           ,56.589              ,0.905 
0    ,1344 ,127      ,1024   ,51.203         ,55.915              ,0.916 
0    ,1344 ,127      ,2048   ,75.041         ,70.123              ,1.07  
0    ,1376 ,127      ,1024   ,51.251         ,55.31               ,0.927 
0    ,1376 ,127      ,2048   ,75.027         ,70.119              ,1.07  
0    ,1408 ,127      ,1024   ,51.199         ,56.591              ,0.905 
0    ,1408 ,127      ,2048   ,75.92          ,74.458              ,1.02  
0    ,144  ,127      ,16     ,8.276          ,9.446               ,0.876 
0    ,1440 ,127      ,1024   ,51.278         ,55.935              ,0.917 
0    ,1440 ,127      ,2048   ,76.43          ,72.711              ,1.051 
0    ,1472 ,127      ,1024   ,51.257         ,56.579              ,0.906 
0    ,1472 ,127      ,2048   ,79.523         ,74.993              ,1.06  
0    ,1504 ,127      ,1024   ,51.191         ,56.314              ,0.909 
0    ,1504 ,127      ,2048   ,79.489         ,74.554              ,1.066 
0    ,1536 ,127      ,1024   ,51.204         ,55.617              ,0.921 
0    ,1536 ,127      ,2048   ,80.762         ,80.577              ,1.002 
0    ,1568 ,127      ,1024   ,51.231         ,55.206              ,0.928 
0    ,1568 ,127      ,2048   ,81.672         ,77.45               ,1.055 
0    ,16   ,127      ,112    ,8.028          ,7.947               ,1.01  
0    ,16   ,127      ,144    ,8.253          ,7.179               ,1.15  
0    ,16   ,127      ,16     ,7.711          ,8.782               ,0.878 
0    ,16   ,127      ,176    ,7.765          ,7.904               ,0.982 
0    ,16   ,127      ,208    ,7.985          ,7.606               ,1.05  
0    ,16   ,127      ,240    ,7.872          ,8.401               ,0.937 
0    ,16   ,127      ,272    ,7.991          ,7.467               ,1.07  
0    ,16   ,127      ,304    ,7.872          ,7.737               ,1.018 
0    ,16   ,127      ,336    ,7.981          ,7.474               ,1.068 
0    ,16   ,127      ,368    ,7.985          ,8.093               ,0.987 
0    ,16   ,127      ,400    ,8.134          ,7.181               ,1.133 
0    ,16   ,127      ,432    ,7.913          ,8.09                ,0.978 
0    ,16   ,127      ,464    ,7.873          ,8.062               ,0.976 
0    ,16   ,127      ,48     ,8.523          ,7.473               ,1.14  
0    ,16   ,127      ,496    ,7.872          ,7.469               ,1.054 
0    ,16   ,127      ,5000   ,8.014          ,7.552               ,1.061 
0    ,16   ,127      ,528    ,8.103          ,7.766               ,1.043 
0    ,16   ,127      ,560    ,7.77           ,7.495               ,1.037 
0    ,16   ,127      ,592    ,7.872          ,7.779               ,1.012 
0    ,16   ,127      ,624    ,7.877          ,7.929               ,0.993 
0    ,16   ,127      ,656    ,8.207          ,8.078               ,1.016 
0    ,16   ,127      ,688    ,8.081          ,8.243               ,0.98  
0    ,16   ,127      ,720    ,7.895          ,7.96                ,0.992 
0    ,16   ,127      ,80     ,7.766          ,8.232               ,0.943 
0    ,160  ,127      ,128    ,15.154         ,18.801              ,0.806 
0    ,160  ,127      ,256    ,20.798         ,22.397              ,0.929 
0    ,160  ,127      ,32     ,8.391          ,9.465               ,0.887 
0    ,160  ,127      ,512    ,28.453         ,27.335              ,1.041 
0    ,160  ,127      ,64     ,11.772         ,16.048              ,0.734 
0    ,1600 ,127      ,1024   ,51.248         ,56.536              ,0.906 
0    ,1600 ,127      ,2048   ,83.783         ,79.095              ,1.059 
0    ,1632 ,127      ,1024   ,51.209         ,55.354              ,0.925 
0    ,1632 ,127      ,2048   ,83.795         ,80.783              ,1.037 
0    ,1664 ,127      ,1024   ,51.231         ,55.463              ,0.924 
0    ,1664 ,127      ,2048   ,84.843         ,81.011              ,1.047 
0    ,1696 ,127      ,1024   ,51.224         ,55.806              ,0.918 
0    ,1696 ,127      ,2048   ,85.355         ,81.067              ,1.053 
0    ,1728 ,127      ,1024   ,51.24          ,55.575              ,0.922 
0    ,1728 ,127      ,2048   ,88.35          ,85.182              ,1.037 
0    ,176  ,127      ,16     ,7.848          ,9.112               ,0.861 
0    ,1760 ,127      ,2048   ,88.324         ,86.607              ,1.02  
0    ,1792 ,127      ,2048   ,89.051         ,89.539              ,0.995 
0    ,1824 ,127      ,2048   ,89.869         ,89.569              ,1.003 
0    ,1856 ,127      ,2048   ,92.812         ,92.592              ,1.002 
0    ,1888 ,127      ,2048   ,92.888         ,89.784              ,1.035 
0    ,192  ,127      ,128    ,16.134         ,19.141              ,0.843 
0    ,192  ,127      ,256    ,22.552         ,23.728              ,0.95  
0    ,192  ,127      ,32     ,7.771          ,8.878               ,0.875 
0    ,192  ,127      ,512    ,30.556         ,27.211              ,1.123 
0    ,192  ,127      ,64     ,11.901         ,15.859              ,0.75  
0    ,1920 ,127      ,2048   ,93.42          ,87.672              ,1.066 
0    ,1952 ,127      ,2048   ,94.412         ,89.887              ,1.05  
0    ,1984 ,127      ,2048   ,97.4           ,95.328              ,1.022 
0    ,2    ,127      ,1      ,8.372          ,8.943               ,0.936 
0    ,2    ,127      ,2      ,8.219          ,9.107               ,0.902 
0    ,2    ,127      ,3      ,8.136          ,9.115               ,0.893 
0    ,2    ,127      ,5000   ,8.244          ,7.468               ,1.104 
0    ,2016 ,127      ,2048   ,97.397         ,93.516              ,1.042 
0    ,2048 ,127      ,1344   ,65.155         ,65.144              ,1.0   
0    ,2048 ,127      ,1376   ,65.218         ,68.192              ,0.956 
0    ,2048 ,127      ,1408   ,65.129         ,69.788              ,0.933 
0    ,2048 ,127      ,1440   ,69.729         ,69.167              ,1.008 
0    ,2048 ,127      ,1472   ,69.858         ,70.173              ,0.996 
0    ,2048 ,127      ,1504   ,69.811         ,76.589              ,0.912 
0    ,2048 ,127      ,1536   ,69.755         ,71.866              ,0.971 
0    ,2048 ,127      ,1568   ,74.011         ,72.649              ,1.019 
0    ,2048 ,127      ,1600   ,74.101         ,73.454              ,1.009 
0    ,2048 ,127      ,1632   ,74.022         ,78.453              ,0.944 
0    ,2048 ,127      ,1664   ,74.022         ,76.724              ,0.965 
0    ,2048 ,127      ,1696   ,78.328         ,77.968              ,1.005 
0    ,2048 ,127      ,1728   ,78.165         ,79.1                ,0.988 
0    ,2048 ,127      ,1760   ,78.292         ,86.051              ,0.91  
0    ,2048 ,127      ,1792   ,78.238         ,82.325              ,0.95  
0    ,2048 ,127      ,1824   ,82.681         ,91.502              ,0.904 
0    ,2048 ,127      ,1856   ,82.708         ,90.495              ,0.914 
0    ,2048 ,127      ,1888   ,82.688         ,90.966              ,0.909 
0    ,2048 ,127      ,1920   ,82.953         ,88.146              ,0.941 
0    ,2048 ,127      ,1952   ,88.907         ,86.354              ,1.03  
0    ,2048 ,127      ,1984   ,87.401         ,89.249              ,0.979 
0    ,2048 ,127      ,2016   ,87.451         ,93.03               ,0.94  
0    ,2048 ,127      ,2048   ,87.085         ,87.77               ,0.992 
0    ,2048 ,127      ,2080   ,97.034         ,91.859              ,1.056 
0    ,2048 ,127      ,2112   ,97.241         ,89.463              ,1.087 
0    ,2048 ,127      ,2144   ,97.439         ,91.745              ,1.062 
0    ,2048 ,127      ,2176   ,97.365         ,91.434              ,1.065 
0    ,2048 ,127      ,2208   ,97.29          ,94.349              ,1.031 
0    ,2048 ,127      ,2240   ,97.514         ,94.828              ,1.028 
0    ,2048 ,127      ,2272   ,97.354         ,96.468              ,1.009 
0    ,2048 ,127      ,2304   ,97.463         ,95.07               ,1.025 
0    ,2048 ,127      ,2336   ,97.521         ,93.862              ,1.039 
0    ,2048 ,127      ,2368   ,97.458         ,91.991              ,1.059 
0    ,2048 ,127      ,2400   ,97.462         ,95.001              ,1.026 
0    ,2048 ,127      ,2432   ,97.431         ,94.729              ,1.029 
0    ,2048 ,127      ,2464   ,98.059         ,96.648              ,1.015 
0    ,2048 ,127      ,2496   ,98.201         ,94.299              ,1.041 
0    ,2048 ,127      ,2528   ,97.463         ,92.872              ,1.049 
0    ,2048 ,127      ,2560   ,97.224         ,92.746              ,1.048 
0    ,2048 ,127      ,2592   ,97.552         ,92.734              ,1.052 
0    ,2048 ,127      ,2624   ,97.225         ,94.323              ,1.031 
0    ,2048 ,127      ,2656   ,97.533         ,92.955              ,1.049 
0    ,2048 ,127      ,2688   ,97.286         ,92.563              ,1.051 
0    ,2048 ,127      ,2720   ,97.663         ,93.009              ,1.05  
0    ,2048 ,127      ,2752   ,97.566         ,92.544              ,1.054 
0    ,208  ,127      ,16     ,8.269          ,9.636               ,0.858 
0    ,2080 ,127      ,2048   ,87.327         ,88.36               ,0.988 
0    ,2112 ,127      ,2048   ,87.295         ,88.916              ,0.982 
0    ,2144 ,127      ,2048   ,87.303         ,88.041              ,0.992 
0    ,2176 ,127      ,2048   ,87.271         ,92.076              ,0.948 
0    ,2208 ,127      ,2048   ,87.277         ,88.826              ,0.983 
0    ,224  ,127      ,128    ,15.744         ,18.486              ,0.852 
0    ,224  ,127      ,256    ,25.117         ,24.473              ,1.026 
0    ,224  ,127      ,32     ,8.188          ,9.108               ,0.899 
0    ,224  ,127      ,512    ,30.598         ,27.231              ,1.124 
0    ,224  ,127      ,64     ,11.588         ,14.368              ,0.807 
0    ,2240 ,127      ,2048   ,87.264         ,92.115              ,0.947 
0    ,2272 ,127      ,2048   ,87.337         ,93.49               ,0.934 
0    ,2304 ,127      ,2048   ,89.4           ,88.821              ,1.007 
0    ,2336 ,127      ,2048   ,87.416         ,91.319              ,0.957 
0    ,2368 ,127      ,2048   ,87.567         ,91.481              ,0.957 
0    ,240  ,127      ,16     ,7.919          ,9.446               ,0.838 
0    ,2400 ,127      ,2048   ,87.283         ,91.766              ,0.951 
0    ,2432 ,127      ,2048   ,87.24          ,88.452              ,0.986 
0    ,2464 ,127      ,2048   ,87.265         ,89.14               ,0.979 
0    ,2496 ,127      ,2048   ,87.269         ,90.857              ,0.961 
0    ,2528 ,127      ,2048   ,87.281         ,88.188              ,0.99  
0    ,256  ,127      ,128    ,15.801         ,18.709              ,0.845 
0    ,256  ,127      ,160    ,16.748         ,19.81               ,0.845 
0    ,256  ,127      ,192    ,20.426         ,22.021              ,0.928 
0    ,256  ,127      ,224    ,21.854         ,25.135              ,0.869 
0    ,256  ,127      ,256    ,24.458         ,23.601              ,1.036 
0    ,256  ,127      ,288    ,27.505         ,26.207              ,1.05  
0    ,256  ,127      ,32     ,8.482          ,8.969               ,0.946 
0    ,256  ,127      ,320    ,32.108         ,29.16               ,1.101 
0    ,256  ,127      ,352    ,32.026         ,27.815              ,1.151 
0    ,256  ,127      ,384    ,32.05          ,27.73               ,1.156 
0    ,256  ,127      ,416    ,31.946         ,31.99               ,0.999 
0    ,256  ,127      ,448    ,32.078         ,32.051              ,1.001 
0    ,256  ,127      ,480    ,32.029         ,31.955              ,1.002 
0    ,256  ,127      ,5000   ,32.099         ,32.119              ,0.999 
0    ,256  ,127      ,512    ,32.106         ,31.981              ,1.004 
0    ,256  ,127      ,544    ,32.112         ,32.085              ,1.001 
0    ,256  ,127      ,576    ,32.102         ,32.016              ,1.003 
0    ,256  ,127      ,608    ,32.129         ,32.028              ,1.003 
0    ,256  ,127      ,64     ,11.543         ,16.009              ,0.721 
0    ,256  ,127      ,640    ,32.065         ,32.097              ,0.999 
0    ,256  ,127      ,672    ,32.034         ,31.884              ,1.005 
0    ,256  ,127      ,704    ,33.044         ,32.017              ,1.032 
0    ,256  ,127      ,736    ,32.079         ,31.959              ,1.004 
0    ,256  ,127      ,768    ,32.121         ,32.047              ,1.002 
0    ,256  ,127      ,800    ,32.118         ,31.976              ,1.004 
0    ,256  ,127      ,832    ,32.062         ,31.96               ,1.003 
0    ,256  ,127      ,864    ,32.031         ,31.882              ,1.005 
0    ,256  ,127      ,896    ,32.091         ,31.986              ,1.003 
0    ,256  ,127      ,928    ,32.001         ,31.985              ,1.001 
0    ,256  ,127      ,96     ,12.448         ,16.698              ,0.745 
0    ,256  ,127      ,960    ,32.025         ,32.087              ,0.998 
0    ,2560 ,127      ,2048   ,87.253         ,88.383              ,0.987 
0    ,2592 ,127      ,2048   ,87.302         ,88.626              ,0.985 
0    ,2624 ,127      ,2048   ,87.315         ,93.108              ,0.938 
0    ,2656 ,127      ,2048   ,88.187         ,88.823              ,0.993 
0    ,2688 ,127      ,2048   ,87.345         ,88.174              ,0.991 
0    ,272  ,127      ,16     ,7.93           ,9.626               ,0.824 
0    ,2720 ,127      ,2048   ,87.285         ,88.878              ,0.982 
0    ,2752 ,127      ,2048   ,87.233         ,88.579              ,0.985 
0    ,288  ,127      ,128    ,15.364         ,18.403              ,0.835 
0    ,288  ,127      ,256    ,24.552         ,24.252              ,1.012 
0    ,288  ,127      ,32     ,8.017          ,9.577               ,0.837 
0    ,288  ,127      ,512    ,33.191         ,32.165              ,1.032 
0    ,288  ,127      ,64     ,11.494         ,15.185              ,0.757 
0    ,3    ,127      ,2      ,8.285          ,8.966               ,0.924 
0    ,3    ,127      ,3      ,8.167          ,8.983               ,0.909 
0    ,3    ,127      ,4      ,8.01           ,9.069               ,0.883 
0    ,3    ,127      ,5000   ,8.128          ,7.766               ,1.047 
0    ,304  ,127      ,16     ,8.096          ,9.454               ,0.856 
0    ,32   ,127      ,128    ,12.311         ,16.153              ,0.762 
0    ,32   ,127      ,160    ,12.336         ,16.172              ,0.763 
0    ,32   ,127      ,192    ,12.305         ,13.279              ,0.927 
0    ,32   ,127      ,224    ,12.308         ,13.091              ,0.94  
0    ,32   ,127      ,256    ,12.632         ,13.381              ,0.944 
0    ,32   ,127      ,288    ,12.294         ,12.47               ,0.986 
0    ,32   ,127      ,32     ,7.66           ,8.781               ,0.872 
0    ,32   ,127      ,320    ,12.333         ,13.122              ,0.94  
0    ,32   ,127      ,352    ,12.339         ,12.464              ,0.99  
0    ,32   ,127      ,384    ,12.304         ,12.46               ,0.987 
0    ,32   ,127      ,416    ,12.336         ,13.574              ,0.909 
0    ,32   ,127      ,448    ,12.354         ,12.306              ,1.004 
0    ,32   ,127      ,480    ,12.304         ,12.304              ,1.0   
0    ,32   ,127      ,5000   ,12.306         ,13.123              ,0.938 
0    ,32   ,127      ,512    ,12.32          ,13.246              ,0.93  
0    ,32   ,127      ,544    ,12.34          ,13.222              ,0.933 
0    ,32   ,127      ,576    ,12.339         ,12.918              ,0.955 
0    ,32   ,127      ,608    ,12.343         ,12.805              ,0.964 
0    ,32   ,127      ,64     ,12.98          ,14.809              ,0.877 
0    ,32   ,127      ,640    ,12.304         ,12.471              ,0.987 
0    ,32   ,127      ,672    ,12.303         ,12.464              ,0.987 
0    ,32   ,127      ,704    ,12.3           ,12.804              ,0.961 
0    ,32   ,127      ,736    ,12.298         ,12.464              ,0.987 
0    ,32   ,127      ,96     ,12.424         ,14.9                ,0.834 
0    ,320  ,127      ,1024   ,35.324         ,31.788              ,1.111 
0    ,320  ,127      ,128    ,15.262         ,18.518              ,0.824 
0    ,320  ,127      ,256    ,24.669         ,25.17               ,0.98  
0    ,320  ,127      ,32     ,7.999          ,9.123               ,0.877 
0    ,320  ,127      ,512    ,35.3           ,31.824              ,1.109 
0    ,320  ,127      ,64     ,11.522         ,15.007              ,0.768 
0    ,336  ,127      ,16     ,7.981          ,8.948               ,0.892 
0    ,3392 ,127      ,4096   ,150.235        ,190.301             ,0.789 
0    ,3424 ,127      ,4096   ,144.605        ,190.131             ,0.761 
0    ,3456 ,127      ,4096   ,142.366        ,193.997             ,0.734 
0    ,3488 ,127      ,4096   ,145.561        ,196.579             ,0.74  
0    ,352  ,127      ,1024   ,35.334         ,31.77               ,1.112 
0    ,352  ,127      ,128    ,16.03          ,18.485              ,0.867 
0    ,352  ,127      ,256    ,24.505         ,24.607              ,0.996 
0    ,352  ,127      ,32     ,8.016          ,9.285               ,0.863 
0    ,352  ,127      ,512    ,35.297         ,31.777              ,1.111 
0    ,352  ,127      ,64     ,11.594         ,16.022              ,0.724 
0    ,3520 ,127      ,4096   ,149.189        ,187.86              ,0.794 
0    ,3552 ,127      ,4096   ,148.896        ,189.592             ,0.785 
0    ,3584 ,127      ,4096   ,146.434        ,195.891             ,0.748 
0    ,3616 ,127      ,4096   ,149.628        ,194.825             ,0.768 
0    ,3648 ,127      ,4096   ,153.47         ,190.168             ,0.807 
0    ,368  ,127      ,16     ,8.17           ,9.113               ,0.897 
0    ,3680 ,127      ,4096   ,155.436        ,191.619             ,0.811 
0    ,3712 ,127      ,4096   ,149.822        ,203.939             ,0.735 
0    ,3744 ,127      ,4096   ,153.881        ,196.519             ,0.783 
0    ,3776 ,127      ,4096   ,158.302        ,200.946             ,0.788 
0    ,3808 ,127      ,4096   ,158.081        ,209.14              ,0.756 
0    ,384  ,127      ,1024   ,37.181         ,36.796              ,1.01  
0    ,384  ,127      ,128    ,16.028         ,18.65               ,0.859 
0    ,384  ,127      ,256    ,24.866         ,24.507              ,1.015 
0    ,384  ,127      ,32     ,8.429          ,8.943               ,0.943 
0    ,384  ,127      ,512    ,37.171         ,32.643              ,1.139 
0    ,384  ,127      ,64     ,11.473         ,15.68               ,0.732 
0    ,3840 ,127      ,4096   ,155.507        ,200.042             ,0.777 
0    ,3872 ,127      ,4096   ,158.122        ,199.468             ,0.793 
0    ,3904 ,127      ,4096   ,163.552        ,199.163             ,0.821 
0    ,3936 ,127      ,4096   ,162.695        ,204.503             ,0.796 
0    ,3968 ,127      ,4096   ,173.435        ,177.618             ,0.976 
0    ,4    ,127      ,3      ,8.129          ,9.283               ,0.876 
0    ,4    ,127      ,4      ,7.918          ,9.049               ,0.875 
0    ,4    ,127      ,5      ,8.122          ,9.107               ,0.892 
0    ,4    ,127      ,5000   ,7.665          ,7.321               ,1.047 
0    ,400  ,127      ,16     ,8.183          ,8.943               ,0.915 
0    ,4000 ,127      ,4096   ,182.372        ,176.806             ,1.031 
0    ,4032 ,127      ,4096   ,173.531        ,176.896             ,0.981 
0    ,4064 ,127      ,4096   ,170.429        ,188.202             ,0.906 
0    ,4096 ,127      ,3392   ,134.112        ,159.888             ,0.839 
0    ,4096 ,127      ,3424   ,134.255        ,171.495             ,0.783 
0    ,4096 ,127      ,3456   ,134.558        ,165.724             ,0.812 
0    ,4096 ,127      ,3488   ,138.429        ,166.295             ,0.832 
0    ,4096 ,127      ,3520   ,138.508        ,163.608             ,0.847 
0    ,4096 ,127      ,3552   ,138.455        ,167.833             ,0.825 
0    ,4096 ,127      ,3584   ,139.393        ,165.671             ,0.841 
0    ,4096 ,127      ,3616   ,142.563        ,170.198             ,0.838 
0    ,4096 ,127      ,3648   ,142.746        ,169.878             ,0.84  
0    ,4096 ,127      ,3680   ,142.798        ,171.673             ,0.832 
0    ,4096 ,127      ,3712   ,142.619        ,173.275             ,0.823 
0    ,4096 ,127      ,3744   ,147.268        ,170.217             ,0.865 
0    ,4096 ,127      ,3776   ,147.036        ,169.047             ,0.87  
0    ,4096 ,127      ,3808   ,146.977        ,172.515             ,0.852 
0    ,4096 ,127      ,3840   ,147.399        ,175.952             ,0.838 
0    ,4096 ,127      ,3872   ,151.254        ,178.702             ,0.846 
0    ,4096 ,127      ,3904   ,151.309        ,177.89              ,0.851 
0    ,4096 ,127      ,3936   ,151.626        ,181.201             ,0.837 
0    ,4096 ,127      ,3968   ,151.281        ,177.809             ,0.851 
0    ,4096 ,127      ,4000   ,155.566        ,176.872             ,0.88  
0    ,4096 ,127      ,4032   ,156.314        ,178.469             ,0.876 
0    ,4096 ,127      ,4064   ,156.323        ,191.263             ,0.817 
0    ,4096 ,127      ,4096   ,155.278        ,175.579             ,0.884 
0    ,4096 ,127      ,4128   ,163.473        ,187.974             ,0.87  
0    ,4096 ,127      ,4160   ,166.296        ,182.482             ,0.911 
0    ,4096 ,127      ,4192   ,162.559        ,178.45              ,0.911 
0    ,4096 ,127      ,4224   ,164.064        ,179.153             ,0.916 
0    ,4096 ,127      ,4256   ,181.209        ,212.238             ,0.854 
0    ,4096 ,127      ,4288   ,167.509        ,206.898             ,0.81  
0    ,4096 ,127      ,4320   ,162.726        ,210.745             ,0.772 
0    ,4096 ,127      ,4352   ,163.294        ,215.134             ,0.759 
0    ,4096 ,127      ,4384   ,163.785        ,208.764             ,0.785 
0    ,4096 ,127      ,4416   ,164.439        ,207.951             ,0.791 
0    ,4096 ,127      ,4448   ,163.662        ,206.41              ,0.793 
0    ,4096 ,127      ,4480   ,164.414        ,205.231             ,0.801 
0    ,4096 ,127      ,4512   ,163.637        ,214.655             ,0.762 
0    ,4096 ,127      ,4544   ,162.945        ,207.81              ,0.784 
0    ,4096 ,127      ,4576   ,162.81         ,212.317             ,0.767 
0    ,4096 ,127      ,4608   ,167.929        ,207.966             ,0.807 
0    ,4096 ,127      ,4640   ,162.01         ,207.893             ,0.779 
0    ,4096 ,127      ,4672   ,172.59         ,209.725             ,0.823 
0    ,4096 ,127      ,4704   ,168.842        ,209.017             ,0.808 
0    ,4096 ,127      ,4736   ,172.708        ,221.116             ,0.781 
0    ,4096 ,127      ,4768   ,163.522        ,209.261             ,0.781 
0    ,4096 ,127      ,4800   ,162.52         ,213.294             ,0.762 
0    ,4128 ,127      ,4096   ,155.478        ,182.694             ,0.851 
0    ,416  ,127      ,1024   ,38.324         ,37.116              ,1.033 
0    ,416  ,127      ,128    ,15.347         ,18.663              ,0.822 
0    ,416  ,127      ,256    ,24.518         ,24.291              ,1.009 
0    ,416  ,127      ,32     ,8.096          ,9.275               ,0.873 
0    ,416  ,127      ,512    ,38.394         ,34.173              ,1.124 
0    ,416  ,127      ,64     ,11.255         ,14.832              ,0.759 
0    ,4160 ,127      ,4096   ,155.74         ,184.944             ,0.842 
0    ,4192 ,127      ,4096   ,155.272        ,183.359             ,0.847 
0    ,4224 ,127      ,4096   ,155.427        ,181.21              ,0.858 
0    ,4256 ,127      ,4096   ,155.675        ,180.996             ,0.86  
0    ,4288 ,127      ,4096   ,156.771        ,179.921             ,0.871 
0    ,432  ,127      ,16     ,8.512          ,8.949               ,0.951 
0    ,4320 ,127      ,4096   ,157.846        ,181.116             ,0.872 
0    ,4352 ,127      ,4096   ,155.56         ,185.393             ,0.839 
0    ,4384 ,127      ,4096   ,155.489        ,186.039             ,0.836 
0    ,4416 ,127      ,4096   ,155.707        ,182.402             ,0.854 
0    ,4448 ,127      ,4096   ,155.77         ,181.283             ,0.859 
0    ,448  ,127      ,1024   ,40.651         ,36.497              ,1.114 
0    ,448  ,127      ,128    ,15.182         ,19.331              ,0.785 
0    ,448  ,127      ,256    ,24.505         ,24.898              ,0.984 
0    ,448  ,127      ,32     ,7.933          ,8.788               ,0.903 
0    ,448  ,127      ,512    ,40.662         ,37.111              ,1.096 
0    ,448  ,127      ,64     ,11.556         ,16.163              ,0.715 
0    ,4480 ,127      ,4096   ,156.429        ,184.441             ,0.848 
0    ,4512 ,127      ,4096   ,155.53         ,180.857             ,0.86  
0    ,4544 ,127      ,4096   ,156.2          ,183.916             ,0.849 
0    ,4576 ,127      ,4096   ,155.654        ,180.911             ,0.86  
0    ,4608 ,127      ,4096   ,155.66         ,185.312             ,0.84  
0    ,464  ,127      ,16     ,8.127          ,9.619               ,0.845 
0    ,4640 ,127      ,4096   ,155.667        ,179.762             ,0.866 
0    ,4672 ,127      ,4096   ,155.61         ,186.585             ,0.834 
0    ,4704 ,127      ,4096   ,155.664        ,189.499             ,0.821 
0    ,4736 ,127      ,4096   ,155.896        ,187.151             ,0.833 
0    ,4768 ,127      ,4096   ,155.663        ,185.39              ,0.84  
0    ,48   ,127      ,16     ,8.181          ,8.943               ,0.915 
0    ,480  ,127      ,1024   ,40.736         ,36.551              ,1.115 
0    ,480  ,127      ,128    ,15.69          ,18.342              ,0.855 
0    ,480  ,127      ,256    ,24.684         ,24.586              ,1.004 
0    ,480  ,127      ,32     ,8.127          ,9.456               ,0.859 
0    ,480  ,127      ,512    ,40.643         ,37.968              ,1.07  
0    ,480  ,127      ,64     ,11.367         ,15.192              ,0.748 
0    ,4800 ,127      ,4096   ,155.66         ,185.849             ,0.838 
0    ,496  ,127      ,16     ,8.395          ,9.28                ,0.905 
0    ,5    ,127      ,4      ,8.201          ,9.108               ,0.9   
0    ,5    ,127      ,5      ,8.085          ,9.107               ,0.888 
0    ,5    ,127      ,5000   ,8.128          ,7.622               ,1.066 
0    ,5    ,127      ,6      ,8.156          ,9.28                ,0.879 
0    ,5000 ,127      ,1      ,8.628          ,8.806               ,0.98  
0    ,5000 ,127      ,1024   ,51.209         ,56.867              ,0.901 
0    ,5000 ,127      ,128    ,17.026         ,18.619              ,0.914 
0    ,5000 ,127      ,16     ,8.186          ,9.38                ,0.873 
0    ,5000 ,127      ,2      ,8.136          ,9.123               ,0.892 
0    ,5000 ,127      ,256    ,24.936         ,24.81               ,1.005 
0    ,5000 ,127      ,3      ,8.277          ,9.624               ,0.86  
0    ,5000 ,127      ,32     ,8.417          ,9.114               ,0.924 
0    ,5000 ,127      ,4      ,7.665          ,8.788               ,0.872 
0    ,5000 ,127      ,5      ,7.872          ,8.943               ,0.88  
0    ,5000 ,127      ,512    ,31.663         ,37.085              ,0.854 
0    ,5000 ,127      ,6      ,8.644          ,9.052               ,0.955 
0    ,5000 ,127      ,64     ,11.542         ,15.94               ,0.724 
0    ,5000 ,127      ,7      ,8.02           ,9.011               ,0.89  
0    ,5000 ,127      ,8      ,8.026          ,8.952               ,0.897 
0    ,512  ,127      ,1024   ,41.887         ,41.549              ,1.008 
0    ,512  ,127      ,1056   ,41.851         ,41.465              ,1.009 
0    ,512  ,127      ,1088   ,41.795         ,42.078              ,0.993 
0    ,512  ,127      ,1120   ,41.903         ,41.43               ,1.011 
0    ,512  ,127      ,1152   ,42.096         ,41.437              ,1.016 
0    ,512  ,127      ,1184   ,41.949         ,41.367              ,1.014 
0    ,512  ,127      ,1216   ,42.025         ,41.343              ,1.016 
0    ,512  ,127      ,128    ,16.134         ,18.676              ,0.864 
0    ,512  ,127      ,160    ,16.73          ,19.325              ,0.866 
0    ,512  ,127      ,192    ,20.227         ,22.514              ,0.898 
0    ,512  ,127      ,224    ,21.703         ,23.175              ,0.936 
0    ,512  ,127      ,256    ,24.883         ,25.43               ,0.978 
0    ,512  ,127      ,288    ,26.298         ,26.515              ,0.992 
0    ,512  ,127      ,32     ,8.456          ,9.142               ,0.925 
0    ,512  ,127      ,320    ,26.787         ,30.445              ,0.88  
0    ,512  ,127      ,352    ,26.768         ,31.235              ,0.857 
0    ,512  ,127      ,384    ,26.813         ,32.966              ,0.813 
0    ,512  ,127      ,416    ,31.659         ,32.359              ,0.978 
0    ,512  ,127      ,448    ,31.659         ,34.141              ,0.927 
0    ,512  ,127      ,480    ,31.653         ,33.596              ,0.942 
0    ,512  ,127      ,5000   ,41.891         ,41.417              ,1.011 
0    ,512  ,127      ,512    ,31.538         ,36.786              ,0.857 
0    ,512  ,127      ,544    ,41.989         ,37.363              ,1.124 
0    ,512  ,127      ,576    ,42.276         ,37.994              ,1.113 
0    ,512  ,127      ,608    ,42.033         ,37.045              ,1.135 
0    ,512  ,127      ,64     ,11.594         ,15.701              ,0.738 
0    ,512  ,127      ,640    ,41.864         ,37.692              ,1.111 
0    ,512  ,127      ,672    ,41.934         ,41.474              ,1.011 
0    ,512  ,127      ,704    ,41.944         ,41.419              ,1.013 
0    ,512  ,127      ,736    ,41.991         ,41.586              ,1.01  
0    ,512  ,127      ,768    ,41.921         ,41.356              ,1.014 
0    ,512  ,127      ,800    ,41.983         ,41.394              ,1.014 
0    ,512  ,127      ,832    ,42.518         ,41.454              ,1.026 
0    ,512  ,127      ,864    ,41.914         ,41.342              ,1.014 
0    ,512  ,127      ,896    ,41.8           ,41.642              ,1.004 
0    ,512  ,127      ,928    ,42.012         ,41.354              ,1.016 
0    ,512  ,127      ,96     ,12.48          ,16.392              ,0.761 
0    ,512  ,127      ,960    ,41.87          ,43.373              ,0.965 
0    ,512  ,127      ,992    ,41.867         ,41.742              ,1.003 
0    ,528  ,127      ,16     ,8.391          ,9.293               ,0.903 
0    ,544  ,127      ,1024   ,43.101         ,41.449              ,1.04  
0    ,544  ,127      ,128    ,15.444         ,19.018              ,0.812 
0    ,544  ,127      ,256    ,24.483         ,25.001              ,0.979 
0    ,544  ,127      ,32     ,8.179          ,9.353               ,0.874 
0    ,544  ,127      ,512    ,31.643         ,36.862              ,0.858 
0    ,544  ,127      ,64     ,11.256         ,15.206              ,0.74  
0    ,560  ,127      ,16     ,7.766          ,9.446               ,0.822 
0    ,576  ,127      ,1024   ,45.631         ,41.479              ,1.1   
0    ,576  ,127      ,128    ,15.526         ,19.48               ,0.797 
0    ,576  ,127      ,256    ,24.474         ,24.807              ,0.987 
0    ,576  ,127      ,32     ,8.244          ,9.45                ,0.872 
0    ,576  ,127      ,512    ,31.66          ,37.825              ,0.837 
0    ,576  ,127      ,64     ,11.602         ,15.611              ,0.743 
0    ,592  ,127      ,16     ,7.991          ,9.556               ,0.836 
0    ,6    ,127      ,5      ,8.498          ,9.134               ,0.93  
0    ,6    ,127      ,5000   ,7.999          ,7.767               ,1.03  
0    ,6    ,127      ,6      ,8.148          ,8.948               ,0.911 
0    ,6    ,127      ,7      ,7.877          ,9.218               ,0.855 
0    ,608  ,127      ,1024   ,45.647         ,41.482              ,1.1   
0    ,608  ,127      ,128    ,15.588         ,19.387              ,0.804 
0    ,608  ,127      ,256    ,24.653         ,24.723              ,0.997 
0    ,608  ,127      ,32     ,8.028          ,8.953               ,0.897 
0    ,608  ,127      ,512    ,31.66          ,37.302              ,0.849 
0    ,608  ,127      ,64     ,11.819         ,14.897              ,0.793 
0    ,624  ,127      ,16     ,8.175          ,9.101               ,0.898 
0    ,64   ,127      ,128    ,14.215         ,18.247              ,0.779 
0    ,64   ,127      ,160    ,14.242         ,18.062              ,0.788 
0    ,64   ,127      ,192    ,14.176         ,14.246              ,0.995 
0    ,64   ,127      ,224    ,14.199         ,14.057              ,1.01  
0    ,64   ,127      ,256    ,14.202         ,13.852              ,1.025 
0    ,64   ,127      ,288    ,14.208         ,14.229              ,0.999 
0    ,64   ,127      ,32     ,8.243          ,9.068               ,0.909 
0    ,64   ,127      ,320    ,14.18          ,14.165              ,1.001 
0    ,64   ,127      ,352    ,14.164         ,14.056              ,1.008 
0    ,64   ,127      ,384    ,14.185         ,13.535              ,1.048 
0    ,64   ,127      ,416    ,14.203         ,14.318              ,0.992 
0    ,64   ,127      ,448    ,14.183         ,13.366              ,1.061 
0    ,64   ,127      ,480    ,14.178         ,13.852              ,1.024 
0    ,64   ,127      ,5000   ,14.273         ,14.58               ,0.979 
0    ,64   ,127      ,512    ,14.219         ,14.24               ,0.999 
0    ,64   ,127      ,544    ,14.156         ,13.952              ,1.015 
0    ,64   ,127      ,576    ,14.158         ,14.481              ,0.978 
0    ,64   ,127      ,608    ,14.189         ,14.159              ,1.002 
0    ,64   ,127      ,64     ,11.14          ,14.05               ,0.793 
0    ,64   ,127      ,640    ,14.171         ,13.543              ,1.046 
0    ,64   ,127      ,672    ,14.193         ,13.751              ,1.032 
0    ,64   ,127      ,704    ,14.182         ,13.959              ,1.016 
0    ,64   ,127      ,736    ,14.171         ,14.055              ,1.008 
0    ,64   ,127      ,768    ,14.157         ,14.204              ,0.997 
0    ,64   ,127      ,96     ,14.456         ,17.141              ,0.843 
0    ,640  ,127      ,1024   ,47.142         ,46.073              ,1.023 
0    ,640  ,127      ,128    ,15.872         ,18.998              ,0.835 
0    ,640  ,127      ,256    ,24.671         ,24.487              ,1.008 
0    ,640  ,127      ,32     ,8.396          ,9.055               ,0.927 
0    ,640  ,127      ,512    ,31.646         ,37.804              ,0.837 
0    ,640  ,127      ,64     ,11.552         ,14.921              ,0.774 
0    ,656  ,127      ,16     ,8.022          ,9.28                ,0.864 
0    ,672  ,127      ,1024   ,47.939         ,46.177              ,1.038 
0    ,672  ,127      ,128    ,16.03          ,19.0                ,0.844 
0    ,672  ,127      ,256    ,24.487         ,25.587              ,0.957 
0    ,672  ,127      ,32     ,7.765          ,9.282               ,0.837 
0    ,672  ,127      ,512    ,31.655         ,37.045              ,0.855 
0    ,672  ,127      ,64     ,11.707         ,15.716              ,0.745 
0    ,688  ,127      ,16     ,8.176          ,9.109               ,0.898 
0    ,7    ,127      ,5000   ,7.908          ,7.778               ,1.017 
0    ,7    ,127      ,6      ,8.239          ,9.113               ,0.904 
0    ,7    ,127      ,7      ,8.091          ,8.943               ,0.905 
0    ,7    ,127      ,8      ,8.495          ,9.113               ,0.932 
0    ,704  ,127      ,1024   ,50.512         ,46.444              ,1.088 
0    ,704  ,127      ,128    ,15.947         ,18.644              ,0.855 
0    ,704  ,127      ,256    ,24.475         ,24.618              ,0.994 
0    ,704  ,127      ,32     ,8.339          ,8.943               ,0.932 
0    ,704  ,127      ,512    ,31.672         ,37.016              ,0.856 
0    ,704  ,127      ,64     ,11.676         ,16.287              ,0.717 
0    ,720  ,127      ,16     ,8.073          ,9.451               ,0.854 
0    ,736  ,127      ,1024   ,50.557         ,46.873              ,1.079 
0    ,736  ,127      ,128    ,15.519         ,19.137              ,0.811 
0    ,736  ,127      ,256    ,24.493         ,24.042              ,1.019 
0    ,736  ,127      ,32     ,7.963          ,9.314               ,0.855 
0    ,736  ,127      ,512    ,31.674         ,37.365              ,0.848 
0    ,736  ,127      ,64     ,11.588         ,15.9                ,0.729 
0    ,7488 ,127      ,8192   ,328.179        ,308.263             ,1.065 
0    ,7520 ,127      ,8192   ,329.61         ,306.088             ,1.077 
0    ,7552 ,127      ,8192   ,337.338        ,308.477             ,1.094 
0    ,7584 ,127      ,8192   ,331.688        ,309.124             ,1.073 
0    ,7616 ,127      ,8192   ,336.799        ,308.588             ,1.091 
0    ,7648 ,127      ,8192   ,335.838        ,309.37              ,1.086 
0    ,768  ,127      ,1024   ,51.751         ,51.583              ,1.003 
0    ,768  ,127      ,128    ,15.601         ,19.449              ,0.802 
0    ,768  ,127      ,256    ,24.518         ,24.414              ,1.004 
0    ,768  ,127      ,512    ,31.647         ,36.928              ,0.857 
0    ,768  ,127      ,64     ,11.269         ,15.894              ,0.709 
0    ,7680 ,127      ,8192   ,337.088        ,310.192             ,1.087 
0    ,7712 ,127      ,8192   ,335.836        ,312.243             ,1.076 
0    ,7744 ,127      ,8192   ,341.67         ,313.952             ,1.088 
0    ,7776 ,127      ,8192   ,337.677        ,312.114             ,1.082 
0    ,7808 ,127      ,8192   ,338.394        ,313.933             ,1.078 
0    ,7840 ,127      ,8192   ,337.827        ,318.984             ,1.059 
0    ,7872 ,127      ,8192   ,338.106        ,315.827             ,1.071 
0    ,7904 ,127      ,8192   ,341.94         ,319.556             ,1.07  
0    ,7936 ,127      ,8192   ,345.793        ,319.103             ,1.084 
0    ,7968 ,127      ,8192   ,343.159        ,323.411             ,1.061 
0    ,8    ,127      ,5000   ,8.327          ,7.9                 ,1.054 
0    ,80   ,127      ,16     ,7.876          ,8.949               ,0.88  
0    ,800  ,127      ,1024   ,53.244         ,52.011              ,1.024 
0    ,800  ,127      ,128    ,15.693         ,19.293              ,0.813 
0    ,800  ,127      ,256    ,24.473         ,24.437              ,1.001 
0    ,800  ,127      ,512    ,31.654         ,36.836              ,0.859 
0    ,8000 ,127      ,8192   ,344.845        ,321.799             ,1.072 
0    ,8032 ,127      ,8192   ,343.376        ,322.474             ,1.065 
0    ,8064 ,127      ,8192   ,326.536        ,296.036             ,1.103 
0    ,8096 ,127      ,8192   ,328.024        ,301.152             ,1.089 
0    ,8128 ,127      ,8192   ,331.53         ,297.397             ,1.115 
0    ,8160 ,127      ,8192   ,331.008        ,303.453             ,1.091 
0    ,832  ,127      ,1024   ,57.15          ,51.405              ,1.112 
0    ,832  ,127      ,128    ,15.531         ,19.35               ,0.803 
0    ,832  ,127      ,256    ,24.545         ,24.501              ,1.002 
0    ,832  ,127      ,512    ,31.643         ,38.15               ,0.829 
0    ,864  ,127      ,1024   ,55.392         ,51.462              ,1.076 
0    ,864  ,127      ,256    ,24.472         ,24.553              ,0.997 
0    ,864  ,127      ,512    ,31.672         ,37.169              ,0.852 
0    ,896  ,127      ,1024   ,56.578         ,52.206              ,1.084 
0    ,896  ,127      ,256    ,24.485         ,24.586              ,0.996 
0    ,896  ,127      ,512    ,31.659         ,37.055              ,0.854 
0    ,928  ,127      ,1024   ,58.075         ,54.221              ,1.071 
0    ,928  ,127      ,256    ,24.829         ,24.799              ,1.001 
0    ,928  ,127      ,512    ,31.663         ,36.843              ,0.859 
0    ,96   ,127      ,128    ,17.064         ,17.918              ,0.952 
0    ,96   ,127      ,256    ,16.1           ,15.861              ,1.015 
0    ,96   ,127      ,32     ,8.507          ,9.108               ,0.934 
0    ,96   ,127      ,512    ,15.739         ,15.943              ,0.987 
0    ,96   ,127      ,64     ,11.63          ,14.875              ,0.782 
0    ,960  ,127      ,1024   ,60.301         ,56.801              ,1.062 
0    ,960  ,127      ,256    ,24.872         ,25.147              ,0.989 
0    ,960  ,127      ,512    ,31.651         ,36.958              ,0.856 
0    ,992  ,127      ,1024   ,60.336         ,57.422              ,1.051 
0    ,992  ,127      ,512    ,31.738         ,36.905              ,0.86  
1    ,1    ,127      ,0      ,8.786          ,10.542              ,0.833 
1    ,1    ,127      ,1      ,8.823          ,9.62                ,0.917 
1    ,1    ,127      ,128    ,8.579          ,8.57                ,1.001 
1    ,1    ,127      ,2      ,7.938          ,9.048               ,0.877 
1    ,1    ,127      ,5000   ,8.662          ,7.751               ,1.118 
1    ,1024 ,127      ,5000   ,61.941         ,61.077              ,1.014 
1    ,128  ,127      ,1      ,8.993          ,8.961               ,1.004 
1    ,128  ,127      ,5000   ,17.592         ,16.919              ,1.04  
1    ,16   ,127      ,5000   ,8.352          ,7.627               ,1.095 
1    ,2    ,127      ,1      ,8.51           ,8.819               ,0.965 
1    ,256  ,127      ,5000   ,32.189         ,32.041              ,1.005 
1    ,32   ,127      ,5000   ,12.297         ,12.844              ,0.957 
1    ,4    ,127      ,5000   ,8.183          ,7.857               ,1.041 
1    ,5000 ,127      ,1      ,8.134          ,9.275               ,0.877 
1    ,5000 ,127      ,1024   ,55.756         ,57.004              ,0.978 
1    ,5000 ,127      ,128    ,17.023         ,18.302              ,0.93  
1    ,5000 ,127      ,16     ,7.98           ,9.495               ,0.84  
1    ,5000 ,127      ,256    ,26.307         ,26.073              ,1.009 
1    ,5000 ,127      ,32     ,8.057          ,9.253               ,0.871 
1    ,5000 ,127      ,4      ,8.057          ,9.275               ,0.869 
1    ,5000 ,127      ,512    ,36.553         ,38.267              ,0.955 
1    ,5000 ,127      ,64     ,12.699         ,17.646              ,0.72  
1    ,5000 ,127      ,8      ,8.276          ,9.449               ,0.876 
1    ,512  ,127      ,5000   ,41.962         ,41.559              ,1.01  
1    ,64   ,127      ,5000   ,14.202         ,14.256              ,0.996 
1    ,8    ,127      ,5000   ,8.383          ,7.613               ,1.101 
2    ,2    ,127      ,1      ,8.013          ,8.943               ,0.896 
2    ,2    ,127      ,2      ,8.014          ,8.943               ,0.896 
2    ,2    ,127      ,3      ,8.214          ,8.953               ,0.917 
2    ,2    ,127      ,5000   ,8.133          ,7.479               ,1.087 
2    ,5000 ,127      ,2      ,7.872          ,9.802               ,0.803 
3    ,3    ,127      ,2      ,8.54           ,8.965               ,0.953 
3    ,3    ,127      ,3      ,8.26           ,8.943               ,0.924 
3    ,3    ,127      ,4      ,8.314          ,8.966               ,0.927 
3    ,3    ,127      ,5000   ,8.127          ,7.177               ,1.132 
3    ,5000 ,127      ,3      ,7.952          ,9.648               ,0.824 
32   ,1    ,127      ,128    ,8.566          ,8.881               ,0.964 
32   ,1    ,127      ,2      ,8.76           ,9.099               ,0.963 
32   ,128  ,127      ,1      ,8.717          ,8.944               ,0.975 
32   ,2    ,127      ,1      ,8.889          ,9.109               ,0.976 
33   ,1    ,127      ,128    ,8.826          ,8.419               ,1.048 
33   ,1    ,127      ,2      ,8.587          ,9.136               ,0.94  
33   ,128  ,127      ,1      ,8.82           ,8.973               ,0.983 
33   ,2    ,127      ,1      ,8.91           ,8.952               ,0.995 
4    ,4    ,127      ,3      ,8.127          ,8.943               ,0.909 
4    ,4    ,127      ,4      ,7.993          ,8.948               ,0.893 
4    ,4    ,127      ,5      ,8.6            ,9.107               ,0.944 
4    ,4    ,127      ,5000   ,8.232          ,7.626               ,1.079 
4    ,5000 ,127      ,4      ,7.77           ,9.413               ,0.825 
5    ,5    ,127      ,4      ,7.872          ,9.446               ,0.833 
5    ,5    ,127      ,5      ,7.872          ,8.915               ,0.883 
5    ,5    ,127      ,5000   ,7.98           ,7.329               ,1.089 
5    ,5    ,127      ,6      ,8.178          ,9.446               ,0.866 
5    ,5000 ,127      ,5      ,8.255          ,9.456               ,0.873 
6    ,5000 ,127      ,6      ,8.068          ,9.62                ,0.839 
6    ,6    ,127      ,5      ,7.77           ,8.943               ,0.869 
6    ,6    ,127      ,5000   ,8.362          ,7.463               ,1.12  
6    ,6    ,127      ,6      ,7.987          ,8.949               ,0.893 
6    ,6    ,127      ,7      ,8.097          ,9.107               ,0.889 
64   ,1024 ,127      ,1024   ,64.971         ,55.783              ,1.165 
64   ,1024 ,127      ,1056   ,65.377         ,63.04               ,1.037 
64   ,1024 ,127      ,1088   ,65.398         ,62.278              ,1.05  
64   ,1024 ,127      ,1120   ,65.333         ,61.871              ,1.056 
64   ,1024 ,127      ,1152   ,65.387         ,60.977              ,1.072 
64   ,1024 ,127      ,1184   ,65.403         ,61.454              ,1.064 
64   ,1024 ,127      ,1216   ,65.416         ,62.302              ,1.05  
64   ,1024 ,127      ,1248   ,65.408         ,60.884              ,1.074 
64   ,1024 ,127      ,1280   ,65.39          ,62.096              ,1.053 
64   ,1024 ,127      ,1312   ,65.439         ,60.899              ,1.075 
64   ,1024 ,127      ,1344   ,65.408         ,61.893              ,1.057 
64   ,1024 ,127      ,1376   ,65.416         ,61.402              ,1.065 
64   ,1024 ,127      ,1408   ,65.419         ,61.418              ,1.065 
64   ,1024 ,127      ,1440   ,65.391         ,62.334              ,1.049 
64   ,1024 ,127      ,1472   ,65.463         ,61.948              ,1.057 
64   ,1024 ,127      ,1504   ,65.411         ,62.018              ,1.055 
64   ,1024 ,127      ,1536   ,65.417         ,61.016              ,1.072 
64   ,1024 ,127      ,1568   ,65.892         ,61.578              ,1.07  
64   ,1024 ,127      ,1600   ,65.384         ,61.727              ,1.059 
64   ,1024 ,127      ,1632   ,65.415         ,60.985              ,1.073 
64   ,1024 ,127      ,1664   ,65.416         ,61.007              ,1.072 
64   ,1024 ,127      ,1696   ,65.424         ,60.987              ,1.073 
64   ,1024 ,127      ,1728   ,65.373         ,61.051              ,1.071 
64   ,1024 ,127      ,320    ,26.766         ,33.089              ,0.809 
64   ,1024 ,127      ,352    ,31.673         ,32.153              ,0.985 
64   ,1024 ,127      ,384    ,31.643         ,33.68               ,0.94  
64   ,1024 ,127      ,416    ,31.774         ,34.205              ,0.929 
64   ,1024 ,127      ,448    ,31.646         ,36.928              ,0.857 
64   ,1024 ,127      ,480    ,36.544         ,38.926              ,0.939 
64   ,1024 ,127      ,512    ,36.515         ,38.739              ,0.943 
64   ,1024 ,127      ,544    ,36.517         ,40.344              ,0.905 
64   ,1024 ,127      ,576    ,36.509         ,42.023              ,0.869 
64   ,1024 ,127      ,608    ,41.605         ,40.212              ,1.035 
64   ,1024 ,127      ,640    ,41.74          ,44.206              ,0.944 
64   ,1024 ,127      ,672    ,41.64          ,44.05               ,0.945 
64   ,1024 ,127      ,704    ,41.663         ,46.577              ,0.894 
64   ,1024 ,127      ,736    ,46.661         ,47.867              ,0.975 
64   ,1024 ,127      ,768    ,46.684         ,48.378              ,0.965 
64   ,1024 ,127      ,800    ,46.629         ,50.581              ,0.922 
64   ,1024 ,127      ,832    ,46.701         ,52.198              ,0.895 
64   ,1024 ,127      ,864    ,51.219         ,50.305              ,1.018 
64   ,1024 ,127      ,896    ,51.27          ,52.707              ,0.973 
64   ,1024 ,127      ,928    ,51.218         ,54.9                ,0.933 
64   ,1024 ,127      ,960    ,53.183         ,55.933              ,0.951 
64   ,1024 ,127      ,992    ,65.381         ,58.381              ,1.12  
64   ,1056 ,127      ,1024   ,65.39          ,56.236              ,1.163 
64   ,1056 ,127      ,512    ,36.552         ,38.571              ,0.948 
64   ,1088 ,127      ,1024   ,55.746         ,58.405              ,0.954 
64   ,1088 ,127      ,512    ,36.516         ,38.472              ,0.949 
64   ,112  ,127      ,16     ,7.765          ,9.107               ,0.853 
64   ,1120 ,127      ,1024   ,55.837         ,57.316              ,0.974 
64   ,1120 ,127      ,512    ,36.537         ,38.32               ,0.953 
64   ,1152 ,127      ,1024   ,55.772         ,57.132              ,0.976 
64   ,1152 ,127      ,512    ,36.667         ,38.258              ,0.958 
64   ,1184 ,127      ,1024   ,55.83          ,57.747              ,0.967 
64   ,1184 ,127      ,512    ,36.546         ,38.311              ,0.954 
64   ,1216 ,127      ,1024   ,55.75          ,57.945              ,0.962 
64   ,1216 ,127      ,512    ,36.52          ,38.478              ,0.949 
64   ,1248 ,127      ,1024   ,55.72          ,56.268              ,0.99  
64   ,128  ,127      ,128    ,14.951         ,17.567              ,0.851 
64   ,128  ,127      ,160    ,18.82          ,19.533              ,0.963 
64   ,128  ,127      ,192    ,17.623         ,17.414              ,1.012 
64   ,128  ,127      ,224    ,17.614         ,17.218              ,1.023 
64   ,128  ,127      ,256    ,17.599         ,16.681              ,1.055 
64   ,128  ,127      ,288    ,17.587         ,17.555              ,1.002 
64   ,128  ,127      ,32     ,8.451          ,9.044               ,0.934 
64   ,128  ,127      ,320    ,17.591         ,16.707              ,1.053 
64   ,128  ,127      ,352    ,17.603         ,16.752              ,1.051 
64   ,128  ,127      ,384    ,17.581         ,17.236              ,1.02  
64   ,128  ,127      ,416    ,17.593         ,16.641              ,1.057 
64   ,128  ,127      ,448    ,17.571         ,16.475              ,1.067 
64   ,128  ,127      ,480    ,17.636         ,16.765              ,1.052 
64   ,128  ,127      ,512    ,17.594         ,16.557              ,1.063 
64   ,128  ,127      ,544    ,17.601         ,16.699              ,1.054 
64   ,128  ,127      ,576    ,17.587         ,16.917              ,1.04  
64   ,128  ,127      ,608    ,17.58          ,16.869              ,1.042 
64   ,128  ,127      ,64     ,11.533         ,15.234              ,0.757 
64   ,128  ,127      ,640    ,17.605         ,16.752              ,1.051 
64   ,128  ,127      ,672    ,17.598         ,16.915              ,1.04  
64   ,128  ,127      ,704    ,17.584         ,16.814              ,1.046 
64   ,128  ,127      ,736    ,17.604         ,16.323              ,1.078 
64   ,128  ,127      ,768    ,17.607         ,17.409              ,1.011 
64   ,128  ,127      ,800    ,17.617         ,16.328              ,1.079 
64   ,128  ,127      ,832    ,17.609         ,16.614              ,1.06  
64   ,128  ,127      ,96     ,12.296         ,16.585              ,0.741 
64   ,1280 ,127      ,1024   ,55.662         ,56.854              ,0.979 
64   ,1312 ,127      ,1024   ,55.745         ,56.286              ,0.99  
64   ,1344 ,127      ,1024   ,55.767         ,56.956              ,0.979 
64   ,1344 ,127      ,2048   ,76.337         ,73.18               ,1.043 
64   ,1376 ,127      ,1024   ,55.742         ,56.238              ,0.991 
64   ,1376 ,127      ,2048   ,76.411         ,72.703              ,1.051 
64   ,1408 ,127      ,1024   ,55.742         ,56.724              ,0.983 
64   ,1408 ,127      ,2048   ,79.436         ,75.642              ,1.05  
64   ,144  ,127      ,16     ,7.986          ,8.943               ,0.893 
64   ,1440 ,127      ,1024   ,55.829         ,56.224              ,0.993 
64   ,1440 ,127      ,2048   ,79.46          ,75.995              ,1.046 
64   ,1472 ,127      ,1024   ,55.69          ,57.423              ,0.97  
64   ,1472 ,127      ,2048   ,80.68          ,78.008              ,1.034 
64   ,1504 ,127      ,1024   ,55.713         ,56.294              ,0.99  
64   ,1504 ,127      ,2048   ,80.871         ,77.185              ,1.048 
64   ,1536 ,127      ,1024   ,55.79          ,56.47               ,0.988 
64   ,1536 ,127      ,2048   ,84.926         ,79.761              ,1.065 
64   ,1568 ,127      ,1024   ,55.771         ,57.598              ,0.968 
64   ,1568 ,127      ,2048   ,83.826         ,80.832              ,1.037 
64   ,16   ,127      ,112    ,8.045          ,7.761               ,1.037 
64   ,16   ,127      ,144    ,8.06           ,7.325               ,1.1   
64   ,16   ,127      ,16     ,7.659          ,8.781               ,0.872 
64   ,16   ,127      ,176    ,8.018          ,7.925               ,1.012 
64   ,16   ,127      ,208    ,8.175          ,7.767               ,1.052 
64   ,16   ,127      ,240    ,8.392          ,7.472               ,1.123 
64   ,16   ,127      ,272    ,7.988          ,7.458               ,1.071 
64   ,16   ,127      ,304    ,8.391          ,7.469               ,1.123 
64   ,16   ,127      ,336    ,7.987          ,7.611               ,1.05  
64   ,16   ,127      ,368    ,8.207          ,7.466               ,1.099 
64   ,16   ,127      ,400    ,7.982          ,7.631               ,1.046 
64   ,16   ,127      ,432    ,8.017          ,7.692               ,1.042 
64   ,16   ,127      ,464    ,7.986          ,8.078               ,0.989 
64   ,16   ,127      ,48     ,8.09           ,7.976               ,1.014 
64   ,16   ,127      ,496    ,8.022          ,7.466               ,1.074 
64   ,16   ,127      ,528    ,7.77           ,7.904               ,0.983 
64   ,16   ,127      ,560    ,7.872          ,7.785               ,1.011 
64   ,16   ,127      ,592    ,8.124          ,7.612               ,1.067 
64   ,16   ,127      ,624    ,7.999          ,7.642               ,1.047 
64   ,16   ,127      ,656    ,8.09           ,8.08                ,1.001 
64   ,16   ,127      ,688    ,8.017          ,7.911               ,1.013 
64   ,16   ,127      ,720    ,7.77           ,8.901               ,0.873 
64   ,16   ,127      ,80     ,8.277          ,8.178               ,1.012 
64   ,160  ,127      ,128    ,15.611         ,18.063              ,0.864 
64   ,160  ,127      ,256    ,20.686         ,21.588              ,0.958 
64   ,160  ,127      ,32     ,8.055          ,9.108               ,0.884 
64   ,160  ,127      ,512    ,30.56          ,27.229              ,1.122 
64   ,160  ,127      ,64     ,11.603         ,15.072              ,0.77  
64   ,1600 ,127      ,1024   ,55.78          ,57.748              ,0.966 
64   ,1600 ,127      ,2048   ,84.93          ,81.269              ,1.045 
64   ,1632 ,127      ,1024   ,55.708         ,56.24               ,0.991 
64   ,1632 ,127      ,2048   ,85.373         ,84.345              ,1.012 
64   ,1664 ,127      ,1024   ,55.749         ,57.664              ,0.967 
64   ,1664 ,127      ,2048   ,88.322         ,83.584              ,1.057 
64   ,1696 ,127      ,1024   ,55.741         ,56.289              ,0.99  
64   ,1696 ,127      ,2048   ,88.315         ,83.622              ,1.056 
64   ,1728 ,127      ,1024   ,55.793         ,56.68               ,0.984 
64   ,1728 ,127      ,2048   ,89.163         ,87.273              ,1.022 
64   ,176  ,127      ,16     ,7.985          ,8.943               ,0.893 
64   ,1760 ,127      ,2048   ,89.93          ,89.578              ,1.004 
64   ,1792 ,127      ,2048   ,92.85          ,88.006              ,1.055 
64   ,1824 ,127      ,2048   ,92.89          ,99.222              ,0.936 
64   ,1856 ,127      ,2048   ,93.394         ,94.794              ,0.985 
64   ,1888 ,127      ,2048   ,94.411         ,90.86               ,1.039 
64   ,192  ,127      ,128    ,16.037         ,18.753              ,0.855 
64   ,192  ,127      ,256    ,22.517         ,23.013              ,0.978 
64   ,192  ,127      ,32     ,8.018          ,8.948               ,0.896 
64   ,192  ,127      ,512    ,32.004         ,32.347              ,0.989 
64   ,192  ,127      ,64     ,11.592         ,14.371              ,0.807 
64   ,1920 ,127      ,2048   ,97.241         ,93.101              ,1.044 
64   ,1952 ,127      ,2048   ,97.428         ,94.797              ,1.028 
64   ,1984 ,127      ,2048   ,97.133         ,90.174              ,1.077 
64   ,2016 ,127      ,2048   ,98.795         ,91.341              ,1.082 
64   ,2048 ,127      ,1344   ,65.138         ,68.223              ,0.955 
64   ,2048 ,127      ,1376   ,69.864         ,67.995              ,1.027 
64   ,2048 ,127      ,1408   ,69.756         ,69.011              ,1.011 
64   ,2048 ,127      ,1440   ,69.704         ,73.781              ,0.945 
64   ,2048 ,127      ,1472   ,69.865         ,74.31               ,0.94  
64   ,2048 ,127      ,1504   ,73.951         ,76.322              ,0.969 
64   ,2048 ,127      ,1536   ,74.002         ,73.291              ,1.01  
64   ,2048 ,127      ,1568   ,73.91          ,77.498              ,0.954 
64   ,2048 ,127      ,1600   ,74.011         ,76.796              ,0.964 
64   ,2048 ,127      ,1632   ,78.217         ,80.64               ,0.97  
64   ,2048 ,127      ,1664   ,78.286         ,77.664              ,1.008 
64   ,2048 ,127      ,1696   ,78.253         ,81.062              ,0.965 
64   ,2048 ,127      ,1728   ,78.262         ,82.213              ,0.952 
64   ,2048 ,127      ,1760   ,82.727         ,87.374              ,0.947 
64   ,2048 ,127      ,1792   ,84.58          ,82.094              ,1.03  
64   ,2048 ,127      ,1824   ,83.053         ,85.553              ,0.971 
64   ,2048 ,127      ,1856   ,84.106         ,89.528              ,0.939 
64   ,2048 ,127      ,1888   ,87.284         ,89.447              ,0.976 
64   ,2048 ,127      ,1920   ,87.312         ,88.408              ,0.988 
64   ,2048 ,127      ,1952   ,87.307         ,99.35               ,0.879 
64   ,2048 ,127      ,1984   ,87.275         ,91.012              ,0.959 
64   ,2048 ,127      ,2016   ,101.886        ,91.884              ,1.109 
64   ,2048 ,127      ,2048   ,101.515        ,89.226              ,1.138 
64   ,2048 ,127      ,2080   ,101.94         ,99.396              ,1.026 
64   ,2048 ,127      ,2112   ,101.904        ,96.903              ,1.052 
64   ,2048 ,127      ,2144   ,101.87         ,99.579              ,1.023 
64   ,2048 ,127      ,2176   ,101.849        ,96.54               ,1.055 
64   ,2048 ,127      ,2208   ,101.879        ,98.68               ,1.032 
64   ,2048 ,127      ,2240   ,101.91         ,102.184             ,0.997 
64   ,2048 ,127      ,2272   ,101.87         ,104.041             ,0.979 
64   ,2048 ,127      ,2304   ,101.912        ,96.477              ,1.056 
64   ,2048 ,127      ,2336   ,101.909        ,98.526              ,1.034 
64   ,2048 ,127      ,2368   ,101.899        ,96.566              ,1.055 
64   ,2048 ,127      ,2400   ,101.916        ,96.489              ,1.056 
64   ,2048 ,127      ,2432   ,101.903        ,96.423              ,1.057 
64   ,2048 ,127      ,2464   ,101.905        ,99.235              ,1.027 
64   ,2048 ,127      ,2496   ,104.879        ,96.592              ,1.086 
64   ,2048 ,127      ,2528   ,101.86         ,96.762              ,1.053 
64   ,2048 ,127      ,2560   ,101.881        ,96.481              ,1.056 
64   ,2048 ,127      ,2592   ,101.88         ,96.514              ,1.056 
64   ,2048 ,127      ,2624   ,101.892        ,98.573              ,1.034 
64   ,2048 ,127      ,2656   ,101.857        ,96.487              ,1.056 
64   ,2048 ,127      ,2688   ,101.889        ,98.711              ,1.032 
64   ,2048 ,127      ,2720   ,101.908        ,96.524              ,1.056 
64   ,2048 ,127      ,2752   ,101.91         ,96.637              ,1.055 
64   ,208  ,127      ,16     ,7.981          ,9.125               ,0.875 
64   ,2080 ,127      ,2048   ,102.44         ,89.479              ,1.145 
64   ,2112 ,127      ,2048   ,91.705         ,89.65               ,1.023 
64   ,2144 ,127      ,2048   ,91.734         ,89.971              ,1.02  
64   ,2176 ,127      ,2048   ,91.835         ,89.61               ,1.025 
64   ,2208 ,127      ,2048   ,91.823         ,92.301              ,0.995 
64   ,224  ,127      ,128    ,15.289         ,18.061              ,0.847 
64   ,224  ,127      ,256    ,25.093         ,24.496              ,1.024 
64   ,224  ,127      ,32     ,7.985          ,8.786               ,0.909 
64   ,224  ,127      ,512    ,33.216         ,31.969              ,1.039 
64   ,224  ,127      ,64     ,11.702         ,15.55               ,0.753 
64   ,2240 ,127      ,2048   ,91.783         ,89.771              ,1.022 
64   ,2272 ,127      ,2048   ,91.741         ,95.858              ,0.957 
64   ,2304 ,127      ,2048   ,91.698         ,92.925              ,0.987 
64   ,2336 ,127      ,2048   ,91.693         ,91.869              ,0.998 
64   ,2368 ,127      ,2048   ,91.767         ,89.529              ,1.025 
64   ,240  ,127      ,16     ,7.95           ,9.458               ,0.841 
64   ,2400 ,127      ,2048   ,91.725         ,92.771              ,0.989 
64   ,2432 ,127      ,2048   ,93.544         ,89.835              ,1.041 
64   ,2464 ,127      ,2048   ,91.614         ,91.863              ,0.997 
64   ,2496 ,127      ,2048   ,91.719         ,92.649              ,0.99  
64   ,2528 ,127      ,2048   ,91.735         ,89.808              ,1.021 
64   ,256  ,127      ,128    ,16.115         ,18.733              ,0.86  
64   ,256  ,127      ,160    ,17.265         ,18.078              ,0.955 
64   ,256  ,127      ,192    ,20.265         ,21.571              ,0.939 
64   ,256  ,127      ,224    ,22.035         ,22.455              ,0.981 
64   ,256  ,127      ,256    ,24.459         ,23.401              ,1.045 
64   ,256  ,127      ,288    ,27.516         ,26.141              ,1.053 
64   ,256  ,127      ,32     ,8.08           ,8.966               ,0.901 
64   ,256  ,127      ,320    ,35.289         ,32.263              ,1.094 
64   ,256  ,127      ,352    ,35.299         ,31.886              ,1.107 
64   ,256  ,127      ,384    ,35.324         ,31.95               ,1.106 
64   ,256  ,127      ,416    ,35.299         ,33.723              ,1.047 
64   ,256  ,127      ,448    ,35.356         ,31.778              ,1.113 
64   ,256  ,127      ,480    ,35.331         ,31.728              ,1.114 
64   ,256  ,127      ,512    ,35.305         ,31.78               ,1.111 
64   ,256  ,127      ,544    ,35.298         ,31.759              ,1.111 
64   ,256  ,127      ,576    ,35.276         ,31.757              ,1.111 
64   ,256  ,127      ,608    ,35.297         ,31.762              ,1.111 
64   ,256  ,127      ,64     ,11.657         ,15.239              ,0.765 
64   ,256  ,127      ,640    ,35.423         ,31.735              ,1.116 
64   ,256  ,127      ,672    ,35.299         ,31.788              ,1.11  
64   ,256  ,127      ,704    ,35.371         ,31.742              ,1.114 
64   ,256  ,127      ,736    ,35.294         ,31.754              ,1.111 
64   ,256  ,127      ,768    ,35.314         ,31.747              ,1.112 
64   ,256  ,127      ,800    ,35.289         ,31.731              ,1.112 
64   ,256  ,127      ,832    ,35.291         ,31.744              ,1.112 
64   ,256  ,127      ,864    ,35.304         ,31.789              ,1.111 
64   ,256  ,127      ,896    ,35.312         ,31.775              ,1.111 
64   ,256  ,127      ,928    ,35.306         ,31.767              ,1.111 
64   ,256  ,127      ,96     ,12.99          ,15.93               ,0.815 
64   ,256  ,127      ,960    ,35.303         ,31.738              ,1.112 
64   ,2560 ,127      ,2048   ,91.702         ,89.771              ,1.022 
64   ,2592 ,127      ,2048   ,91.794         ,89.687              ,1.023 
64   ,2624 ,127      ,2048   ,91.692         ,96.465              ,0.951 
64   ,2656 ,127      ,2048   ,91.783         ,89.551              ,1.025 
64   ,2688 ,127      ,2048   ,91.787         ,89.754              ,1.023 
64   ,272  ,127      ,16     ,7.873          ,9.114               ,0.864 
64   ,2720 ,127      ,2048   ,92.755         ,89.501              ,1.036 
64   ,2752 ,127      ,2048   ,91.612         ,89.613              ,1.022 
64   ,288  ,127      ,128    ,15.772         ,17.991              ,0.877 
64   ,288  ,127      ,256    ,24.482         ,24.561              ,0.997 
64   ,288  ,127      ,32     ,7.995          ,8.788               ,0.91  
64   ,288  ,127      ,512    ,35.334         ,31.832              ,1.11  
64   ,288  ,127      ,64     ,11.718         ,15.726              ,0.745 
64   ,304  ,127      ,16     ,8.038          ,9.446               ,0.851 
64   ,32   ,127      ,128    ,12.379         ,16.075              ,0.77  
64   ,32   ,127      ,160    ,12.299         ,16.699              ,0.737 
64   ,32   ,127      ,192    ,12.308         ,13.155              ,0.936 
64   ,32   ,127      ,224    ,12.301         ,12.975              ,0.948 
64   ,32   ,127      ,256    ,12.307         ,13.601              ,0.905 
64   ,32   ,127      ,288    ,12.294         ,13.125              ,0.937 
64   ,32   ,127      ,32     ,7.66           ,8.781               ,0.872 
64   ,32   ,127      ,320    ,12.345         ,12.939              ,0.954 
64   ,32   ,127      ,352    ,12.299         ,13.598              ,0.904 
64   ,32   ,127      ,384    ,12.294         ,13.147              ,0.935 
64   ,32   ,127      ,416    ,12.342         ,12.807              ,0.964 
64   ,32   ,127      ,448    ,12.336         ,12.772              ,0.966 
64   ,32   ,127      ,480    ,12.294         ,13.09               ,0.939 
64   ,32   ,127      ,512    ,12.302         ,12.968              ,0.949 
64   ,32   ,127      ,544    ,12.299         ,12.801              ,0.961 
64   ,32   ,127      ,576    ,12.335         ,12.474              ,0.989 
64   ,32   ,127      ,608    ,12.336         ,12.551              ,0.983 
64   ,32   ,127      ,64     ,12.454         ,15.309              ,0.814 
64   ,32   ,127      ,640    ,12.318         ,13.428              ,0.917 
64   ,32   ,127      ,672    ,12.294         ,12.801              ,0.96  
64   ,32   ,127      ,704    ,12.317         ,12.469              ,0.988 
64   ,32   ,127      ,736    ,12.299         ,12.824              ,0.959 
64   ,32   ,127      ,96     ,12.414         ,15.582              ,0.797 
64   ,320  ,127      ,1024   ,37.12          ,36.922              ,1.005 
64   ,320  ,127      ,128    ,15.531         ,18.152              ,0.856 
64   ,320  ,127      ,256    ,24.657         ,24.923              ,0.989 
64   ,320  ,127      ,32     ,8.09           ,9.282               ,0.872 
64   ,320  ,127      ,512    ,37.521         ,36.809              ,1.019 
64   ,320  ,127      ,64     ,11.702         ,15.282              ,0.766 
64   ,336  ,127      ,16     ,7.872          ,9.208               ,0.855 
64   ,3392 ,127      ,4096   ,142.48         ,192.837             ,0.739 
64   ,3424 ,127      ,4096   ,145.42         ,192.324             ,0.756 
64   ,3456 ,127      ,4096   ,149.279        ,197.89              ,0.754 
64   ,3488 ,127      ,4096   ,149.085        ,194.594             ,0.766 
64   ,352  ,127      ,1024   ,38.319         ,36.879              ,1.039 
64   ,352  ,127      ,128    ,15.359         ,18.48               ,0.831 
64   ,352  ,127      ,256    ,24.475         ,24.979              ,0.98  
64   ,352  ,127      ,32     ,8.277          ,9.275               ,0.892 
64   ,352  ,127      ,512    ,38.31          ,36.782              ,1.042 
64   ,352  ,127      ,64     ,11.704         ,15.576              ,0.751 
64   ,3520 ,127      ,4096   ,146.936        ,196.64              ,0.747 
64   ,3552 ,127      ,4096   ,149.58         ,194.169             ,0.77  
64   ,3584 ,127      ,4096   ,153.647        ,192.594             ,0.798 
64   ,3616 ,127      ,4096   ,153.753        ,194.453             ,0.791 
64   ,3648 ,127      ,4096   ,151.528        ,194.552             ,0.779 
64   ,368  ,127      ,16     ,8.371          ,8.948               ,0.936 
64   ,3680 ,127      ,4096   ,153.849        ,195.728             ,0.786 
64   ,3712 ,127      ,4096   ,158.049        ,201.301             ,0.785 
64   ,3744 ,127      ,4096   ,158.077        ,199.971             ,0.79  
64   ,3776 ,127      ,4096   ,155.904        ,199.662             ,0.781 
64   ,3808 ,127      ,4096   ,159.441        ,204.15              ,0.781 
64   ,384  ,127      ,1024   ,40.642         ,36.491              ,1.114 
64   ,384  ,127      ,128    ,15.605         ,19.163              ,0.814 
64   ,384  ,127      ,256    ,24.459         ,24.134              ,1.013 
64   ,384  ,127      ,32     ,8.239          ,9.035               ,0.912 
64   ,384  ,127      ,512    ,40.663         ,36.5                ,1.114 
64   ,384  ,127      ,64     ,11.714         ,15.514              ,0.755 
64   ,3840 ,127      ,4096   ,162.766        ,205.416             ,0.792 
64   ,3872 ,127      ,4096   ,162.281        ,204.965             ,0.792 
64   ,3904 ,127      ,4096   ,162.984        ,204.368             ,0.798 
64   ,3936 ,127      ,4096   ,166.82         ,205.068             ,0.813 
64   ,3968 ,127      ,4096   ,166.561        ,205.982             ,0.809 
64   ,400  ,127      ,16     ,8.277          ,9.275               ,0.892 
64   ,4000 ,127      ,4096   ,166.61         ,205.727             ,0.81  
64   ,4032 ,127      ,4096   ,166.001        ,182.025             ,0.912 
64   ,4064 ,127      ,4096   ,170.568        ,183.146             ,0.931 
64   ,4096 ,127      ,3392   ,134.25         ,167.121             ,0.803 
64   ,4096 ,127      ,3424   ,138.383        ,170.362             ,0.812 
64   ,4096 ,127      ,3456   ,138.382        ,169.139             ,0.818 
64   ,4096 ,127      ,3488   ,138.307        ,175.368             ,0.789 
64   ,4096 ,127      ,3520   ,138.249        ,167.96              ,0.823 
64   ,4096 ,127      ,3552   ,142.7          ,170.201             ,0.838 
64   ,4096 ,127      ,3584   ,142.6          ,171.287             ,0.833 
64   ,4096 ,127      ,3616   ,142.872        ,177.928             ,0.803 
64   ,4096 ,127      ,3648   ,142.755        ,168.606             ,0.847 
64   ,4096 ,127      ,3680   ,146.907        ,172.935             ,0.849 
64   ,4096 ,127      ,3712   ,146.919        ,170.171             ,0.863 
64   ,4096 ,127      ,3744   ,149.022        ,176.907             ,0.842 
64   ,4096 ,127      ,3776   ,146.889        ,179.14              ,0.82  
64   ,4096 ,127      ,3808   ,151.458        ,175.67              ,0.862 
64   ,4096 ,127      ,3840   ,152.743        ,177.074             ,0.863 
64   ,4096 ,127      ,3872   ,151.354        ,179.163             ,0.845 
64   ,4096 ,127      ,3904   ,151.249        ,176.688             ,0.856 
64   ,4096 ,127      ,3936   ,164.341        ,187.46              ,0.877 
64   ,4096 ,127      ,3968   ,155.67         ,180.712             ,0.861 
64   ,4096 ,127      ,4000   ,155.521        ,186.318             ,0.835 
64   ,4096 ,127      ,4032   ,158.276        ,184.134             ,0.86  
64   ,4096 ,127      ,4064   ,170.873        ,182.524             ,0.936 
64   ,4096 ,127      ,4096   ,170.591        ,176.172             ,0.968 
64   ,4096 ,127      ,4128   ,170.902        ,182.988             ,0.934 
64   ,4096 ,127      ,4160   ,176.928        ,180.486             ,0.98  
64   ,4096 ,127      ,4192   ,206.3          ,209.978             ,0.982 
64   ,4096 ,127      ,4224   ,195.332        ,209.087             ,0.934 
64   ,4096 ,127      ,4256   ,180.675        ,212.911             ,0.849 
64   ,4096 ,127      ,4288   ,175.117        ,223.157             ,0.785 
64   ,4096 ,127      ,4320   ,170.977        ,209.594             ,0.816 
64   ,4096 ,127      ,4352   ,170.872        ,214.023             ,0.798 
64   ,4096 ,127      ,4384   ,171.021        ,210.292             ,0.813 
64   ,4096 ,127      ,4416   ,171.041        ,208.592             ,0.82  
64   ,4096 ,127      ,4448   ,170.921        ,213.242             ,0.802 
64   ,4096 ,127      ,4480   ,175.356        ,209.464             ,0.837 
64   ,4096 ,127      ,4512   ,170.966        ,207.261             ,0.825 
64   ,4096 ,127      ,4544   ,174.869        ,209.292             ,0.836 
64   ,4096 ,127      ,4576   ,170.922        ,207.908             ,0.822 
64   ,4096 ,127      ,4608   ,175.563        ,211.322             ,0.831 
64   ,4096 ,127      ,4640   ,170.879        ,217.164             ,0.787 
64   ,4096 ,127      ,4672   ,184.643        ,209.815             ,0.88  
64   ,4096 ,127      ,4704   ,171.124        ,215.689             ,0.793 
64   ,4096 ,127      ,4736   ,180.595        ,217.091             ,0.832 
64   ,4096 ,127      ,4768   ,170.862        ,212.905             ,0.803 
64   ,4096 ,127      ,4800   ,171.022        ,210.846             ,0.811 
64   ,4128 ,127      ,4096   ,171.589        ,185.468             ,0.925 
64   ,416  ,127      ,1024   ,40.673         ,36.987              ,1.1   
64   ,416  ,127      ,128    ,15.241         ,18.563              ,0.821 
64   ,416  ,127      ,256    ,24.472         ,25.273              ,0.968 
64   ,416  ,127      ,32     ,8.169          ,8.955               ,0.912 
64   ,416  ,127      ,512    ,40.665         ,36.518              ,1.114 
64   ,416  ,127      ,64     ,11.51          ,15.211              ,0.757 
64   ,4160 ,127      ,4096   ,159.801        ,186.421             ,0.857 
64   ,4192 ,127      ,4096   ,159.891        ,181.758             ,0.88  
64   ,4224 ,127      ,4096   ,159.878        ,182.758             ,0.875 
64   ,4256 ,127      ,4096   ,161.458        ,184.396             ,0.876 
64   ,4288 ,127      ,4096   ,161.191        ,183.008             ,0.881 
64   ,432  ,127      ,16     ,7.766          ,9.107               ,0.853 
64   ,4320 ,127      ,4096   ,159.789        ,182.923             ,0.874 
64   ,4352 ,127      ,4096   ,165.247        ,185.383             ,0.891 
64   ,4384 ,127      ,4096   ,161.743        ,182.33              ,0.887 
64   ,4416 ,127      ,4096   ,160.01         ,183.496             ,0.872 
64   ,4448 ,127      ,4096   ,162.218        ,181.611             ,0.893 
64   ,448  ,127      ,1024   ,42.065         ,41.352              ,1.017 
64   ,448  ,127      ,128    ,15.186         ,18.339              ,0.828 
64   ,448  ,127      ,256    ,24.512         ,24.805              ,0.988 
64   ,448  ,127      ,32     ,8.277          ,8.787               ,0.942 
64   ,448  ,127      ,512    ,42.013         ,37.53               ,1.119 
64   ,448  ,127      ,64     ,11.596         ,14.871              ,0.78  
64   ,4480 ,127      ,4096   ,163.127        ,183.933             ,0.887 
64   ,4512 ,127      ,4096   ,164.937        ,186.469             ,0.885 
64   ,4544 ,127      ,4096   ,161.351        ,181.491             ,0.889 
64   ,4576 ,127      ,4096   ,162.731        ,177.275             ,0.918 
64   ,4608 ,127      ,4096   ,164.463        ,181.843             ,0.904 
64   ,464  ,127      ,16     ,8.047          ,9.629               ,0.836 
64   ,4640 ,127      ,4096   ,160.0          ,181.165             ,0.883 
64   ,4672 ,127      ,4096   ,161.484        ,185.762             ,0.869 
64   ,4704 ,127      ,4096   ,159.8          ,183.296             ,0.872 
64   ,4736 ,127      ,4096   ,161.754        ,188.718             ,0.857 
64   ,4768 ,127      ,4096   ,161.781        ,183.893             ,0.88  
64   ,48   ,127      ,16     ,8.071          ,8.96                ,0.901 
64   ,480  ,127      ,1024   ,43.12          ,41.388              ,1.042 
64   ,480  ,127      ,128    ,15.086         ,19.072              ,0.791 
64   ,480  ,127      ,256    ,24.499         ,24.671              ,0.993 
64   ,480  ,127      ,32     ,8.095          ,9.281               ,0.872 
64   ,480  ,127      ,512    ,43.09          ,39.439              ,1.093 
64   ,480  ,127      ,64     ,11.823         ,15.559              ,0.76  
64   ,4800 ,127      ,4096   ,159.781        ,188.592             ,0.847 
64   ,496  ,127      ,16     ,8.164          ,9.113               ,0.896 
64   ,512  ,127      ,1024   ,45.669         ,41.583              ,1.098 
64   ,512  ,127      ,1056   ,45.632         ,41.507              ,1.099 
64   ,512  ,127      ,1088   ,45.69          ,41.573              ,1.099 
64   ,512  ,127      ,1120   ,45.63          ,41.96               ,1.087 
64   ,512  ,127      ,1152   ,45.679         ,41.513              ,1.1   
64   ,512  ,127      ,1184   ,45.643         ,41.485              ,1.1   
64   ,512  ,127      ,1216   ,45.634         ,41.481              ,1.1   
64   ,512  ,127      ,128    ,16.158         ,19.405              ,0.833 
64   ,512  ,127      ,160    ,16.667         ,18.292              ,0.911 
64   ,512  ,127      ,192    ,19.931         ,22.015              ,0.905 
64   ,512  ,127      ,224    ,21.976         ,23.494              ,0.935 
64   ,512  ,127      ,256    ,24.613         ,24.359              ,1.01  
64   ,512  ,127      ,288    ,26.287         ,25.949              ,1.013 
64   ,512  ,127      ,32     ,8.278          ,9.032               ,0.916 
64   ,512  ,127      ,320    ,26.777         ,33.628              ,0.796 
64   ,512  ,127      ,352    ,31.718         ,32.567              ,0.974 
64   ,512  ,127      ,384    ,31.644         ,33.668              ,0.94  
64   ,512  ,127      ,416    ,31.653         ,34.469              ,0.918 
64   ,512  ,127      ,448    ,31.661         ,37.227              ,0.85  
64   ,512  ,127      ,480    ,45.612         ,37.516              ,1.216 
64   ,512  ,127      ,512    ,45.445         ,38.135              ,1.192 
64   ,512  ,127      ,544    ,45.647         ,41.604              ,1.097 
64   ,512  ,127      ,576    ,45.612         ,42.634              ,1.07  
64   ,512  ,127      ,608    ,47.646         ,41.531              ,1.147 
64   ,512  ,127      ,64     ,11.82          ,14.893              ,0.794 
64   ,512  ,127      ,640    ,45.647         ,41.497              ,1.1   
64   ,512  ,127      ,672    ,45.657         ,41.698              ,1.095 
64   ,512  ,127      ,704    ,45.635         ,41.689              ,1.095 
64   ,512  ,127      ,736    ,45.631         ,41.568              ,1.098 
64   ,512  ,127      ,768    ,45.623         ,41.785              ,1.092 
64   ,512  ,127      ,800    ,45.639         ,41.499              ,1.1   
64   ,512  ,127      ,832    ,45.747         ,41.524              ,1.102 
64   ,512  ,127      ,864    ,45.628         ,41.5                ,1.099 
64   ,512  ,127      ,896    ,45.754         ,41.626              ,1.099 
64   ,512  ,127      ,928    ,45.66          ,42.102              ,1.085 
64   ,512  ,127      ,96     ,12.466         ,16.513              ,0.755 
64   ,512  ,127      ,960    ,46.467         ,41.564              ,1.118 
64   ,512  ,127      ,992    ,45.639         ,41.484              ,1.1   
64   ,528  ,127      ,16     ,8.146          ,9.17                ,0.888 
64   ,544  ,127      ,1024   ,45.65          ,41.55               ,1.099 
64   ,544  ,127      ,128    ,16.205         ,17.925              ,0.904 
64   ,544  ,127      ,256    ,24.496         ,23.797              ,1.029 
64   ,544  ,127      ,32     ,7.986          ,9.463               ,0.844 
64   ,544  ,127      ,512    ,45.613         ,38.692              ,1.179 
64   ,544  ,127      ,64     ,11.702         ,15.498              ,0.755 
64   ,560  ,127      ,16     ,7.931          ,9.284               ,0.854 
64   ,576  ,127      ,1024   ,47.078         ,46.296              ,1.017 
64   ,576  ,127      ,128    ,15.685         ,19.175              ,0.818 
64   ,576  ,127      ,256    ,24.469         ,24.79               ,0.987 
64   ,576  ,127      ,32     ,7.98           ,9.112               ,0.876 
64   ,576  ,127      ,512    ,36.553         ,39.743              ,0.92  
64   ,576  ,127      ,64     ,11.704         ,15.692              ,0.746 
64   ,592  ,127      ,16     ,7.987          ,9.275               ,0.861 
64   ,608  ,127      ,1024   ,47.954         ,46.277              ,1.036 
64   ,608  ,127      ,128    ,15.56          ,19.322              ,0.805 
64   ,608  ,127      ,256    ,24.471         ,24.902              ,0.983 
64   ,608  ,127      ,32     ,8.128          ,8.786               ,0.925 
64   ,608  ,127      ,512    ,36.538         ,38.313              ,0.954 
64   ,608  ,127      ,64     ,11.823         ,15.789              ,0.749 
64   ,624  ,127      ,16     ,8.239          ,8.949               ,0.921 
64   ,64   ,127      ,128    ,14.211         ,18.094              ,0.785 
64   ,64   ,127      ,160    ,14.186         ,17.917              ,0.792 
64   ,64   ,127      ,192    ,14.198         ,13.693              ,1.037 
64   ,64   ,127      ,224    ,14.167         ,13.813              ,1.026 
64   ,64   ,127      ,256    ,14.197         ,14.137              ,1.004 
64   ,64   ,127      ,288    ,14.179         ,13.739              ,1.032 
64   ,64   ,127      ,32     ,8.413          ,9.124               ,0.922 
64   ,64   ,127      ,320    ,14.193         ,14.154              ,1.003 
64   ,64   ,127      ,352    ,14.178         ,13.659              ,1.038 
64   ,64   ,127      ,384    ,14.17          ,13.727              ,1.032 
64   ,64   ,127      ,416    ,14.198         ,13.955              ,1.017 
64   ,64   ,127      ,448    ,14.183         ,14.055              ,1.009 
64   ,64   ,127      ,480    ,14.159         ,14.482              ,0.978 
64   ,64   ,127      ,512    ,14.209         ,14.355              ,0.99  
64   ,64   ,127      ,544    ,14.185         ,13.543              ,1.047 
64   ,64   ,127      ,576    ,14.186         ,13.246              ,1.071 
64   ,64   ,127      ,608    ,14.178         ,13.644              ,1.039 
64   ,64   ,127      ,64     ,11.14          ,14.054              ,0.793 
64   ,64   ,127      ,640    ,14.179         ,13.408              ,1.058 
64   ,64   ,127      ,672    ,14.162         ,13.757              ,1.029 
64   ,64   ,127      ,704    ,14.185         ,13.754              ,1.031 
64   ,64   ,127      ,736    ,14.187         ,14.247              ,0.996 
64   ,64   ,127      ,768    ,14.186         ,13.961              ,1.016 
64   ,64   ,127      ,96     ,14.472         ,16.34               ,0.886 
64   ,640  ,127      ,1024   ,50.528         ,46.462              ,1.088 
64   ,640  ,127      ,128    ,15.77          ,19.27               ,0.818 
64   ,640  ,127      ,256    ,24.464         ,23.926              ,1.022 
64   ,640  ,127      ,32     ,8.127          ,9.275               ,0.876 
64   ,640  ,127      ,512    ,36.539         ,38.523              ,0.949 
64   ,640  ,127      ,64     ,11.48          ,15.885              ,0.723 
64   ,656  ,127      ,16     ,8.09           ,9.604               ,0.842 
64   ,672  ,127      ,1024   ,50.522         ,46.415              ,1.088 
64   ,672  ,127      ,128    ,15.433         ,19.327              ,0.799 
64   ,672  ,127      ,256    ,24.473         ,24.043              ,1.018 
64   ,672  ,127      ,32     ,8.022          ,9.275               ,0.865 
64   ,672  ,127      ,512    ,36.526         ,38.373              ,0.952 
64   ,672  ,127      ,64     ,11.504         ,16.131              ,0.713 
64   ,688  ,127      ,16     ,7.772          ,9.446               ,0.823 
64   ,704  ,127      ,1024   ,51.762         ,51.756              ,1.0   
64   ,704  ,127      ,128    ,14.994         ,19.119              ,0.784 
64   ,704  ,127      ,256    ,24.468         ,24.094              ,1.016 
64   ,704  ,127      ,32     ,7.879          ,9.06                ,0.87  
64   ,704  ,127      ,512    ,36.534         ,38.902              ,0.939 
64   ,704  ,127      ,64     ,11.593         ,15.055              ,0.77  
64   ,720  ,127      ,16     ,8.176          ,9.409               ,0.869 
64   ,736  ,127      ,1024   ,53.036         ,51.776              ,1.024 
64   ,736  ,127      ,128    ,15.694         ,18.752              ,0.837 
64   ,736  ,127      ,256    ,24.559         ,24.598              ,0.998 
64   ,736  ,127      ,32     ,8.245          ,9.112               ,0.905 
64   ,736  ,127      ,512    ,36.553         ,38.249              ,0.956 
64   ,736  ,127      ,64     ,11.474         ,15.868              ,0.723 
64   ,7488 ,127      ,8192   ,335.579        ,309.388             ,1.085 
64   ,7520 ,127      ,8192   ,333.901        ,310.714             ,1.075 
64   ,7552 ,127      ,8192   ,337.389        ,309.396             ,1.09  
64   ,7584 ,127      ,8192   ,332.885        ,309.217             ,1.077 
64   ,7616 ,127      ,8192   ,340.286        ,312.344             ,1.089 
64   ,7648 ,127      ,8192   ,334.582        ,313.115             ,1.069 
64   ,768  ,127      ,1024   ,55.575         ,52.117              ,1.066 
64   ,768  ,127      ,128    ,16.385         ,19.115              ,0.857 
64   ,768  ,127      ,256    ,24.476         ,24.578              ,0.996 
64   ,768  ,127      ,512    ,36.545         ,38.263              ,0.955 
64   ,768  ,127      ,64     ,11.371         ,16.618              ,0.684 
64   ,7680 ,127      ,8192   ,339.98         ,319.364             ,1.065 
64   ,7712 ,127      ,8192   ,340.797        ,315.618             ,1.08  
64   ,7744 ,127      ,8192   ,337.328        ,318.327             ,1.06  
64   ,7776 ,127      ,8192   ,339.25         ,318.181             ,1.066 
64   ,7808 ,127      ,8192   ,342.429        ,315.16              ,1.087 
64   ,7840 ,127      ,8192   ,343.402        ,315.784             ,1.087 
64   ,7872 ,127      ,8192   ,345.654        ,322.79              ,1.071 
64   ,7904 ,127      ,8192   ,343.039        ,325.493             ,1.054 
64   ,7936 ,127      ,8192   ,346.126        ,321.615             ,1.076 
64   ,7968 ,127      ,8192   ,345.973        ,324.619             ,1.066 
64   ,80   ,127      ,16     ,8.127          ,9.107               ,0.892 
64   ,800  ,127      ,1024   ,55.356         ,51.892              ,1.067 
64   ,800  ,127      ,128    ,15.682         ,18.887              ,0.83  
64   ,800  ,127      ,256    ,24.659         ,23.97               ,1.029 
64   ,800  ,127      ,512    ,36.56          ,38.423              ,0.951 
64   ,8000 ,127      ,8192   ,351.063        ,324.66              ,1.081 
64   ,8032 ,127      ,8192   ,348.884        ,328.445             ,1.062 
64   ,8064 ,127      ,8192   ,345.907        ,330.401             ,1.047 
64   ,8096 ,127      ,8192   ,347.606        ,330.126             ,1.053 
64   ,832  ,127      ,1024   ,58.47          ,57.029              ,1.025 
64   ,832  ,127      ,128    ,15.717         ,18.825              ,0.835 
64   ,832  ,127      ,256    ,25.07          ,24.481              ,1.024 
64   ,832  ,127      ,512    ,36.517         ,38.58               ,0.947 
64   ,864  ,127      ,1024   ,58.007         ,56.577              ,1.025 
64   ,864  ,127      ,256    ,24.5           ,24.807              ,0.988 
64   ,864  ,127      ,512    ,36.539         ,38.751              ,0.943 
64   ,896  ,127      ,1024   ,60.306         ,57.951              ,1.041 
64   ,896  ,127      ,256    ,24.476         ,24.648              ,0.993 
64   ,896  ,127      ,512    ,36.547         ,38.437              ,0.951 
64   ,928  ,127      ,1024   ,60.318         ,56.638              ,1.065 
64   ,928  ,127      ,256    ,24.563         ,25.382              ,0.968 
64   ,928  ,127      ,512    ,36.51          ,39.339              ,0.928 
64   ,96   ,127      ,128    ,16.791         ,17.59               ,0.955 
64   ,96   ,127      ,256    ,15.692         ,15.259              ,1.028 
64   ,96   ,127      ,32     ,7.898          ,8.966               ,0.881 
64   ,96   ,127      ,512    ,15.754         ,15.908              ,0.99  
64   ,96   ,127      ,64     ,11.94          ,14.555              ,0.82  
64   ,960  ,127      ,1024   ,62.045         ,57.457              ,1.08  
64   ,960  ,127      ,256    ,24.646         ,24.87               ,0.991 
64   ,960  ,127      ,512    ,36.512         ,38.429              ,0.95  
64   ,992  ,127      ,1024   ,62.779         ,59.056              ,1.063 
64   ,992  ,127      ,512    ,36.502         ,38.5                ,0.948 
7    ,5000 ,127      ,7      ,7.766          ,9.279               ,0.837 
7    ,7    ,127      ,5000   ,8.023          ,7.318               ,1.096 
7    ,7    ,127      ,6      ,8.017          ,8.829               ,0.908 
7    ,7    ,127      ,7      ,7.873          ,8.958               ,0.879 
7    ,7    ,127      ,8      ,7.992          ,8.781               ,0.91  
0.9468079980272118
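
A minimal sketch of how a summary figure like the trailing value above
(presumably the geometric mean over the ratio column, matching how
results are summarized in the commit messages) can be reproduced from
the attached results. It assumes the last comma-separated column of
each row is the New Time / Old Time ratio; feed the CSV rows on stdin
and link with -lm.

#include <math.h>
#include <stdio.h>
#include <string.h>

int
main (void)
{
  double log_sum = 0.0, ratio;
  unsigned long n = 0;
  char line[256];

  /* Geometric mean = exp (mean (log (ratio))), accumulated over the
     last column of each row.  */
  while (fgets (line, sizeof line, stdin) != NULL)
    {
      char *comma = strrchr (line, ',');
      if (comma != NULL && sscanf (comma + 1, "%lf", &ratio) == 1
          && ratio > 0.0)
        {
          log_sum += log (ratio);
          n++;
        }
    }
  if (n > 0)
    printf ("%.16f\n", exp (log_sum / n));
  return 0;
}
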


* Re: [PATCH v1 4/7] x86: Optimize memrchr-evex.S
  2022-10-18  2:48 ` [PATCH v1 4/7] x86: Optimize memrchr-evex.S Noah Goldstein
@ 2022-10-18  2:51   ` Noah Goldstein
  0 siblings, 0 replies; 41+ messages in thread
From: Noah Goldstein @ 2022-10-18  2:51 UTC (permalink / raw)
  To: libc-alpha; +Cc: hjl.tools, carlos

[-- Attachment #1: Type: text/plain, Size: 22947 bytes --]

On Mon, Oct 17, 2022 at 7:49 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Optimizations are:
> 1. Use the fact that lzcnt(0) -> VEC_SIZE for memrchr to save a branch
>    in the short string case (see the sketch after this list).
> 2. Save several instructions in len = [VEC_SIZE, 4 * VEC_SIZE] case.
> 3. Use more code-size efficient instructions.
>         - tzcnt ...     -> bsf ...
>         - vpcmpb $0 ... -> vpcmpeq ...
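
A minimal C sketch of the lzcnt property point 1 relies on: unlike bsr,
lzcnt of an all-zero mask is defined and yields the operand width, so
with a 32-bit one-bit-per-byte mask of a 32-byte vector it returns
exactly VEC_SIZE, and a single length compare covers both the no-match
and the match-out-of-bounds cases. The last_match helper here is
hypothetical, not the patch's code; compile with -mlzcnt.

#include <immintrin.h>
#include <stddef.h>
#include <stdio.h>

#define VEC_SIZE 32

/* Hypothetical helper mirroring the short-string path: END points one
   past the window, MASK has one bit per byte (bit j set means the byte
   at END - VEC_SIZE + j matched), LEN is the remaining length.  */
static const char *
last_match (const char *end, unsigned int mask, size_t len)
{
  unsigned int shift = _lzcnt_u32 (mask);  /* == VEC_SIZE when mask == 0.  */
  /* One compare handles both cases: mask == 0 (shift == VEC_SIZE >= len
     for short strings) and a match that lies before the window start.  */
  if (len <= shift)
    return NULL;
  return end - 1 - shift;
}

int
main (void)
{
  char buf[VEC_SIZE] = { 0 };
  printf ("%u\n", _lzcnt_u32 (0));  /* Prints 32, i.e. VEC_SIZE.  */
  /* No match bits and len <= VEC_SIZE: the single compare returns NULL.  */
  return last_match (buf + VEC_SIZE, 0, 3) != NULL;
}
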
>
> Code Size Changes:
> memrchr-evex.S      :  -29 bytes
>
> Net perf changes:
>
> Reported as geometric mean of all improvements / regressions from N=10
> runs of the benchtests. Value as New Time / Old Time so < 1.0 is
> improvement and > 1.0 is regression.
>
> memrchr-evex.S      : 0.949 (Mostly from improvements in small strings)
>
> Full results attached in email.
>
> Full check passes on x86-64.
> ---
>  sysdeps/x86_64/multiarch/memrchr-evex.S | 538 ++++++++++++++----------
>  1 file changed, 324 insertions(+), 214 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/memrchr-evex.S b/sysdeps/x86_64/multiarch/memrchr-evex.S
> index 550b328c5a..dbcf52808f 100644
> --- a/sysdeps/x86_64/multiarch/memrchr-evex.S
> +++ b/sysdeps/x86_64/multiarch/memrchr-evex.S
> @@ -21,17 +21,19 @@
>  #if ISA_SHOULD_BUILD (4)
>
>  # include <sysdep.h>
> -# include "x86-evex256-vecs.h"
> -# if VEC_SIZE != 32
> -#  error "VEC_SIZE != 32 unimplemented"
> +
> +# ifndef VEC_SIZE
> +#  include "x86-evex256-vecs.h"
>  # endif
>
> +# include "reg-macros.h"
> +
>  # ifndef MEMRCHR
> -#  define MEMRCHR                              __memrchr_evex
> +#  define MEMRCHR      __memrchr_evex
>  # endif
>
> -# define PAGE_SIZE                     4096
> -# define VMMMATCH                      VMM(0)
> +# define PAGE_SIZE     4096
> +# define VMATCH        VMM(0)
>
>         .section SECTION(.text), "ax", @progbits
>  ENTRY_P2ALIGN(MEMRCHR, 6)
> @@ -43,294 +45,402 @@ ENTRY_P2ALIGN(MEMRCHR, 6)
>  # endif
>         jz      L(zero_0)
>
> -       /* Get end pointer. Minus one for two reasons. 1) It is necessary for a
> -          correct page cross check and 2) it correctly sets up end ptr to be
> -          subtract by lzcnt aligned.  */
> +       /* Get end pointer. Minus one for three reasons. 1) It is
> +          necessary for a correct page cross check, 2) it correctly
> +          sets up the end ptr to be adjusted by subtracting lzcnt,
> +          and 3) it is a necessary step in aligning ptr.  */
>         leaq    -1(%rdi, %rdx), %rax
> -       vpbroadcastb %esi, %VMMMATCH
> +       vpbroadcastb %esi, %VMATCH
>
>         /* Check if we can load 1x VEC without cross a page.  */
>         testl   $(PAGE_SIZE - VEC_SIZE), %eax
>         jz      L(page_cross)
>
> -       /* Don't use rax for pointer here because EVEX has better encoding with
> -          offset % VEC_SIZE == 0.  */
> -       vpcmpb  $0, -(VEC_SIZE)(%rdi, %rdx), %VMMMATCH, %k0
> -       kmovd   %k0, %ecx
> -
> -       /* Fall through for rdx (len) <= VEC_SIZE (expect small sizes).  */
> -       cmpq    $VEC_SIZE, %rdx
> -       ja      L(more_1x_vec)
> -L(ret_vec_x0_test):
> -
> -       /* If ecx is zero (no matches) lzcnt will set it 32 (VEC_SIZE) which
> -          will guarantee edx (len) is less than it.  */
> -       lzcntl  %ecx, %ecx
> -       cmpl    %ecx, %edx
> -       jle     L(zero_0)
> -       subq    %rcx, %rax
> +       /* Don't use rax for pointer here because EVEX has better
> +          encoding with offset % VEC_SIZE == 0.  */
> +       vpcmpeqb (VEC_SIZE * -1)(%rdi, %rdx), %VMATCH, %k0
> +       KMOV    %k0, %VRCX
> +
> +       /* If rcx is zero then lzcnt -> VEC_SIZE.  NB: there is
> +          already a dependency between rcx and rsi so no worries about
> +          false-dep here.  */
> +       lzcnt   %VRCX, %VRSI
> +       /* If rdx <= rsi then either 1) rcx was non-zero (there was a
> +          match) but it was out of bounds or 2) rcx was zero and rdx
> +          was <= VEC_SIZE so we are done scanning.  */
> +       cmpq    %rsi, %rdx
> +       /* NB: Use branch to return zero/non-zero.  Common usage will
> +          branch on result of function (if return is null/non-null).
> +          This branch can be used to predict the ensuing one so there
> +          is no reason to extend the data-dependency with cmovcc.  */
> +       jbe     L(zero_0)
> +
> +       /* If rcx is zero then len must be > VEC_SIZE, otherwise since
> +          we already tested len vs lzcnt(rcx) (in rsi) we are good to
> +          return this match.  */
> +       test    %VRCX, %VRCX
> +       jz      L(more_1x_vec)
> +       subq    %rsi, %rax
>         ret
>
> -       /* Fits in aligning bytes of first cache line.  */
> +       /* Fits in aligning bytes of first cache line for VEC_SIZE ==
> +          32.  */
> +# if VEC_SIZE == 32
> +       .p2align 4,, 2
>  L(zero_0):
>         xorl    %eax, %eax
>         ret
> -
> -       .p2align 4,, 9
> -L(ret_vec_x0_dec):
> -       decq    %rax
> -L(ret_vec_x0):
> -       lzcntl  %ecx, %ecx
> -       subq    %rcx, %rax
> -       ret
> +# endif
>
>         .p2align 4,, 10
>  L(more_1x_vec):
> -       testl   %ecx, %ecx
> -       jnz     L(ret_vec_x0)
> -
>         /* Align rax (pointer to string).  */
>         andq    $-VEC_SIZE, %rax
> -
> +L(page_cross_continue):
>         /* Recompute length after aligning.  */
> -       movq    %rax, %rdx
> +       subq    %rdi, %rax
>
> -       /* Need no matter what.  */
> -       vpcmpb  $0, -(VEC_SIZE)(%rax), %VMMMATCH, %k0
> -       kmovd   %k0, %ecx
> -
> -       subq    %rdi, %rdx
> -
> -       cmpq    $(VEC_SIZE * 2), %rdx
> +       cmpq    $(VEC_SIZE * 2), %rax
>         ja      L(more_2x_vec)
> +
>  L(last_2x_vec):
> +       vpcmpeqb (VEC_SIZE * -1)(%rdi, %rax), %VMATCH, %k0
> +       KMOV    %k0, %VRCX
>
> -       /* Must dec rax because L(ret_vec_x0_test) expects it.  */
> -       decq    %rax
> -       cmpl    $VEC_SIZE, %edx
> -       jbe     L(ret_vec_x0_test)
> +       test    %VRCX, %VRCX
> +       jnz     L(ret_vec_x0_test)
>
> -       testl   %ecx, %ecx
> -       jnz     L(ret_vec_x0)
> +       /* If VEC_SIZE == 64 we need to subtract because lzcntq won't
> +          implicitly add VEC_SIZE to the match position.  */
> +# if VEC_SIZE == 64
> +       subl    $VEC_SIZE, %eax
> +# else
> +       cmpb    $VEC_SIZE, %al
> +# endif
> +       jle     L(zero_2)
>
> -       /* Don't use rax for pointer here because EVEX has better encoding with
> -          offset % VEC_SIZE == 0.  */
> -       vpcmpb  $0, -(VEC_SIZE * 2)(%rdi, %rdx), %VMMMATCH, %k0
> -       kmovd   %k0, %ecx
> -       /* NB: 64-bit lzcnt. This will naturally add 32 to position.  */
> +       /* We adjusted rax (length) for VEC_SIZE == 64 so we need
> +          separate offsets.  */
> +# if VEC_SIZE == 64
> +       vpcmpeqb (VEC_SIZE * -1)(%rdi, %rax), %VMATCH, %k0
> +# else
> +       vpcmpeqb (VEC_SIZE * -2)(%rdi, %rax), %VMATCH, %k0
> +# endif
> +       KMOV    %k0, %VRCX
> +       /* NB: 64-bit lzcnt. This will naturally add 32 to position for
> +          VEC_SIZE == 32.  */
>         lzcntq  %rcx, %rcx
> -       cmpl    %ecx, %edx
> -       jle     L(zero_0)
> -       subq    %rcx, %rax
> -       ret
> -
> -       /* Inexpensive place to put this regarding code size / target alignments
> -          / ICache NLP. Necessary for 2-byte encoding of jump to page cross
> -          case which in turn is necessary for hot path (len <= VEC_SIZE) to fit
> -          in first cache line.  */
> -L(page_cross):
> -       movq    %rax, %rsi
> -       andq    $-VEC_SIZE, %rsi
> -       vpcmpb  $0, (%rsi), %VMMMATCH, %k0
> -       kmovd   %k0, %r8d
> -       /* Shift out negative alignment (because we are starting from endptr and
> -          working backwards).  */
> -       movl    %eax, %ecx
> -       /* notl because eax already has endptr - 1.  (-x = ~(x - 1)).  */
> -       notl    %ecx
> -       shlxl   %ecx, %r8d, %ecx
> -       cmpq    %rdi, %rsi
> -       ja      L(more_1x_vec)
> -       lzcntl  %ecx, %ecx
> -       cmpl    %ecx, %edx
> -       jle     L(zero_1)
> -       subq    %rcx, %rax
> +       subl    %ecx, %eax
> +       ja      L(first_vec_x1_ret)
> +       /* If VEC_SIZE == 64 put L(zero_0) here as it can't fit in the
> +          first cache line (this is the second cache line).  */
> +# if VEC_SIZE == 64
> +L(zero_0):
> +# endif
> +L(zero_2):
> +       xorl    %eax, %eax
>         ret
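A note on the lzcntq above, as a hedged C sketch (GCC builtins,
hypothetical name; not the actual code): KMOV zero-extends the 32-bit
mask into the 64-bit register, so the 64-bit leading-zero count comes
out exactly VEC_SIZE (32) larger than the 32-bit one, folding the
offset adjustment into the count for free:

    #include <stdint.h>

    /* Requires mask != 0 (__builtin_clzll is undefined at 0; the
       lzcnt instruction itself would return 64 there).  */
    static inline unsigned
    lzcnt64_of_32bit_mask (uint32_t mask)
    {
      uint64_t wide = mask;   /* zero-extended: top 32 bits clear */
      /* == 32 + __builtin_clz (mask), i.e. the VEC_SIZE == 32
         adjustment comes along automatically.  */
      return (unsigned) __builtin_clzll (wide);
    }
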
>
> -       /* Continue creating zero labels that fit in aligning bytes and get
> -          2-byte encoding / are in the same cache line as condition.  */
> -L(zero_1):
> -       xorl    %eax, %eax
> +       /* NB: Fits in aligning bytes before next cache line for
> +          VEC_SIZE == 32.  For VEC_SIZE == 64 this is attached to
> +          L(ret_vec_x0_test).  */
> +# if VEC_SIZE == 32
> +L(first_vec_x1_ret):
> +       leaq    -1(%rdi, %rax), %rax
>         ret
> +# endif
>
> -       .p2align 4,, 8
> -L(ret_vec_x1):
> -       /* This will naturally add 32 to position.  */
> -       bsrl    %ecx, %ecx
> -       leaq    -(VEC_SIZE * 2)(%rcx, %rax), %rax
> +       .p2align 4,, 6
> +L(ret_vec_x0_test):
> +       lzcnt   %VRCX, %VRCX
> +       subl    %ecx, %eax
> +       jle     L(zero_2)
> +# if VEC_SIZE == 64
> +       /* Reuse code at the end of L(ret_vec_x0_test) as we can't fit
> +          L(first_vec_x1_ret) in the same cache line as its jmp base
> +          so we might as well save code size.  */
> +L(first_vec_x1_ret):
> +# endif
> +       leaq    -1(%rdi, %rax), %rax
>         ret
>
> -       .p2align 4,, 8
> +       .p2align 4,, 6
> +L(loop_last_4x_vec):
> +       /* Compute remaining length.  */
> +       subl    %edi, %eax
> +L(last_4x_vec):
> +       cmpl    $(VEC_SIZE * 2), %eax
> +       jle     L(last_2x_vec)
> +# if VEC_SIZE == 32
> +       /* Only align for VEC_SIZE == 32.  For VEC_SIZE == 64 we need
> +          the spare bytes to align the loop properly.  */
> +       .p2align 4,, 10
> +# endif
>  L(more_2x_vec):
> -       testl   %ecx, %ecx
> -       jnz     L(ret_vec_x0_dec)
>
> -       vpcmpb  $0, -(VEC_SIZE * 2)(%rax), %VMMMATCH, %k0
> -       kmovd   %k0, %ecx
> -       testl   %ecx, %ecx
> -       jnz     L(ret_vec_x1)
> +       /* Length > VEC_SIZE * 2 so check the first 2x VEC for a match
> +          and return if either hits.  */
> +       vpcmpeqb (VEC_SIZE * -1)(%rdi, %rax), %VMATCH, %k0
> +       KMOV    %k0, %VRCX
> +
> +       test    %VRCX, %VRCX
> +       jnz     L(first_vec_x0)
> +
> +       vpcmpeqb (VEC_SIZE * -2)(%rdi, %rax), %VMATCH, %k0
> +       KMOV    %k0, %VRCX
> +       test    %VRCX, %VRCX
> +       jnz     L(first_vec_x1)
>
>         /* Need no matter what.  */
> -       vpcmpb  $0, -(VEC_SIZE * 3)(%rax), %VMMMATCH, %k0
> -       kmovd   %k0, %ecx
> +       vpcmpeqb (VEC_SIZE * -3)(%rdi, %rax), %VMATCH, %k0
> +       KMOV    %k0, %VRCX
>
> -       subq    $(VEC_SIZE * 4), %rdx
> +       /* Check if we are near the end.  */
> +       subq    $(VEC_SIZE * 4), %rax
>         ja      L(more_4x_vec)
>
> -       cmpl    $(VEC_SIZE * -1), %edx
> -       jle     L(ret_vec_x2_test)
> -L(last_vec):
> -       testl   %ecx, %ecx
> -       jnz     L(ret_vec_x2)
> +       test    %VRCX, %VRCX
> +       jnz     L(first_vec_x2_test)
>
> +       /* Adjust length for final check and check if we are at the end.
> +        */
> +       addl    $(VEC_SIZE * 1), %eax
> +       jle     L(zero_1)
>
> -       /* Need no matter what.  */
> -       vpcmpb  $0, -(VEC_SIZE * 4)(%rax), %VMMMATCH, %k0
> -       kmovd   %k0, %ecx
> -       lzcntl  %ecx, %ecx
> -       subq    $(VEC_SIZE * 3 + 1), %rax
> -       subq    %rcx, %rax
> -       cmpq    %rax, %rdi
> -       ja      L(zero_1)
> +       vpcmpeqb (VEC_SIZE * -1)(%rdi, %rax), %VMATCH, %k0
> +       KMOV    %k0, %VRCX
> +
> +       lzcnt   %VRCX, %VRCX
> +       subl    %ecx, %eax
> +       ja      L(first_vec_x3_ret)
> +L(zero_1):
> +       xorl    %eax, %eax
> +       ret
> +L(first_vec_x3_ret):
> +       leaq    -1(%rdi, %rax), %rax
>         ret
>
> -       .p2align 4,, 8
> -L(ret_vec_x2_test):
> -       lzcntl  %ecx, %ecx
> -       subq    $(VEC_SIZE * 2 + 1), %rax
> -       subq    %rcx, %rax
> -       cmpq    %rax, %rdi
> -       ja      L(zero_1)
> +       .p2align 4,, 6
> +L(first_vec_x2_test):
> +       /* Must adjust length before check.  */
> +       subl    $-(VEC_SIZE * 2 - 1), %eax
> +       lzcnt   %VRCX, %VRCX
> +       subl    %ecx, %eax
> +       jl      L(zero_4)
> +       addq    %rdi, %rax
>         ret
>
> -       .p2align 4,, 8
> -L(ret_vec_x2):
> -       bsrl    %ecx, %ecx
> -       leaq    -(VEC_SIZE * 3)(%rcx, %rax), %rax
> +
> +       .p2align 4,, 10
> +L(first_vec_x0):
> +       bsr     %VRCX, %VRCX
> +       leaq    (VEC_SIZE * -1)(%rdi, %rax), %rax
> +       addq    %rcx, %rax
>         ret
>
> -       .p2align 4,, 8
> -L(ret_vec_x3):
> -       bsrl    %ecx, %ecx
> -       leaq    -(VEC_SIZE * 4)(%rcx, %rax), %rax
> +       /* Fits unobtrusively here.  */
> +L(zero_4):
> +       xorl    %eax, %eax
> +       ret
> +
> +       .p2align 4,, 10
> +L(first_vec_x1):
> +       bsr     %VRCX, %VRCX
> +       leaq    (VEC_SIZE * -2)(%rdi, %rax), %rax
> +       addq    %rcx, %rax
>         ret
>
>         .p2align 4,, 8
> +L(first_vec_x3):
> +       bsr     %VRCX, %VRCX
> +       addq    %rdi, %rax
> +       addq    %rcx, %rax
> +       ret
> +
> +       .p2align 4,, 6
> +L(first_vec_x2):
> +       bsr     %VRCX, %VRCX
> +       leaq    (VEC_SIZE * 1)(%rdi, %rax), %rax
> +       addq    %rcx, %rax
> +       ret
> +
> +       .p2align 4,, 2
>  L(more_4x_vec):
> -       testl   %ecx, %ecx
> -       jnz     L(ret_vec_x2)
> +       test    %VRCX, %VRCX
> +       jnz     L(first_vec_x2)
>
> -       vpcmpb  $0, -(VEC_SIZE * 4)(%rax), %VMMMATCH, %k0
> -       kmovd   %k0, %ecx
> +       vpcmpeqb (%rdi, %rax), %VMATCH, %k0
> +       KMOV    %k0, %VRCX
>
> -       testl   %ecx, %ecx
> -       jnz     L(ret_vec_x3)
> +       test    %VRCX, %VRCX
> +       jnz     L(first_vec_x3)
>
>         /* Check if near end before re-aligning (otherwise might do an
>            unnecessary loop iteration).  */
> -       addq    $-(VEC_SIZE * 4), %rax
> -       cmpq    $(VEC_SIZE * 4), %rdx
> +       cmpq    $(VEC_SIZE * 4), %rax
>         jbe     L(last_4x_vec)
>
> -       decq    %rax
> -       andq    $-(VEC_SIZE * 4), %rax
> -       movq    %rdi, %rdx
> -       /* Get endptr for loop in rdx. NB: Can't just do while rax > rdi because
> -          lengths that overflow can be valid and break the comparison.  */
> -       andq    $-(VEC_SIZE * 4), %rdx
> +
> +       /* NB: We set up the loop to NOT use index-address-mode for
> +          the buffer.  This costs some instructions & code size but
> +          avoids the loop's micro-fused instructions being unlaminated
> +          and forced to issue in the same group (essentially narrowing
> +          the backend width).  */
> +
> +       /* Get endptr for loop in rdx. NB: Can't just do while rax > rdi
> +          because lengths that overflow can be valid and break the
> +          comparison.  */
> +# if VEC_SIZE == 64
> +       /* Use rdx as an intermediate to compute rax; this gets us
> +          imm8 encoding which just allows the L(more_4x_vec) block to
> +          fit in 1 cache-line.  */
> +       leaq    (VEC_SIZE * 4)(%rdi), %rdx
> +       leaq    (VEC_SIZE * -1)(%rdx, %rax), %rax
> +
> +       /* No evex machine has partial register stalls. This can be
> +          replaced with: `andq $(VEC_SIZE * -4), %rax/%rdx` if that
> +          changes.  */
> +       xorb    %al, %al
> +       xorb    %dl, %dl
> +# else
> +       leaq    (VEC_SIZE * 3)(%rdi, %rax), %rax
> +       andq    $(VEC_SIZE * -4), %rax
> +       leaq    (VEC_SIZE * 4)(%rdi), %rdx
> +       andq    $(VEC_SIZE * -4), %rdx
> +# endif
> +
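The xorb trick above relies on VEC_SIZE * 4 == 256 when VEC_SIZE == 64:
rounding down to a 256-byte boundary is just clearing the low byte,
which avoids the imm32 that andq would need.  A throwaway C check of
the equivalence (assumes little-endian, as on x86; not part of the
patch):

    #include <assert.h>
    #include <stdint.h>
    #include <string.h>

    int
    main (void)
    {
      for (uintptr_t p = 0; p < (1u << 16); p++)
        {
          uintptr_t x = p;
          unsigned char zero = 0;
          /* xorb %al, %al: clear the low byte (little-endian).  */
          memcpy (&x, &zero, 1);
          /* andq $(VEC_SIZE * -4), i.e. $-256.  */
          assert (x == (p & (uintptr_t) -(64 * 4)));
        }
      return 0;
    }
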
>
>         .p2align 4
>  L(loop_4x_vec):
> -       /* Store 1 were not-equals and 0 where equals in k1 (used to mask later
> -          on).  */
> -       vpcmpb  $4, (VEC_SIZE * 3)(%rax), %VMMMATCH, %k1
> +       /* NB: We could do the same optimization here as we do for
> +          memchr/rawmemchr by using VEX encoding in the loop for access
> +          to VEX vpcmpeqb + vpternlogd.  Since memrchr is not as hot as
> +          memchr it may not be worth the extra code size, but if the
> +          need arises it is an easy ~15% perf improvement to the loop.  */
> +
> +       cmpq    %rdx, %rax
> +       je      L(loop_last_4x_vec)
> +       /* Store 1 where not-equals and 0 where equals in k1 (used to
> +          mask later on).  */
> +       vpcmpb  $4, (VEC_SIZE * -1)(%rax), %VMATCH, %k1
>
>         /* VEC(2/3) will have zero-byte where we found a CHAR.  */
> -       vpxorq  (VEC_SIZE * 2)(%rax), %VMMMATCH, %VMM(2)
> -       vpxorq  (VEC_SIZE * 1)(%rax), %VMMMATCH, %VMM(3)
> -       vpcmpb  $0, (VEC_SIZE * 0)(%rax), %VMMMATCH, %k4
> +       vpxorq  (VEC_SIZE * -2)(%rax), %VMATCH, %VMM(2)
> +       vpxorq  (VEC_SIZE * -3)(%rax), %VMATCH, %VMM(3)
> +       vpcmpeqb (VEC_SIZE * -4)(%rax), %VMATCH, %k4
>
> -       /* Combine VEC(2/3) with min and maskz with k1 (k1 has zero bit where
> -          CHAR is found and VEC(2/3) have zero-byte where CHAR is found.  */
> +       /* Combine VEC(2/3) with min and maskz with k1 (k1 has zero bit
> +          where CHAR is found and VEC(2/3) have zero-byte where CHAR
> +          is found).  */
>         vpminub %VMM(2), %VMM(3), %VMM(3){%k1}{z}
>         vptestnmb %VMM(3), %VMM(3), %k2
>
> -       /* Any 1s and we found CHAR.  */
> -       kortestd %k2, %k4
> -       jnz     L(loop_end)
> -
>         addq    $-(VEC_SIZE * 4), %rax
> -       cmpq    %rdx, %rax
> -       jne     L(loop_4x_vec)
>
> -       /* Need to re-adjust rdx / rax for L(last_4x_vec).  */
> -       subq    $-(VEC_SIZE * 4), %rdx
> -       movq    %rdx, %rax
> -       subl    %edi, %edx
> -L(last_4x_vec):
> +       /* Any 1s and we found CHAR.  */
> +       KORTEST %k2, %k4
> +       jz      L(loop_4x_vec)
> +
>
> -       /* Used no matter what.  */
> -       vpcmpb  $0, (VEC_SIZE * -1)(%rax), %VMMMATCH, %k0
> -       kmovd   %k0, %ecx
> +       /* K1 has non-matches for first VEC. inc; jz will overflow rcx
> +          iff all bytes were non-matches.  */
> +       KMOV    %k1, %VRCX
> +       inc     %VRCX
> +       jnz     L(first_vec_x0_end)
>
> -       cmpl    $(VEC_SIZE * 2), %edx
> -       jbe     L(last_2x_vec)
> +       vptestnmb %VMM(2), %VMM(2), %k0
> +       KMOV    %k0, %VRCX
> +       test    %VRCX, %VRCX
> +       jnz     L(first_vec_x1_end)
> +       KMOV    %k2, %VRCX
> +
> +       /* Separate logic for VEC_SIZE == 64 and VEC_SIZE == 32 for
> +          returning last 2x VEC. For VEC_SIZE == 64 we test each VEC
> +          individually, for VEC_SIZE == 32 we combine them in a single
> +          64-bit GPR.  */
> +# if VEC_SIZE == 64
> +       test    %VRCX, %VRCX
> +       jnz     L(first_vec_x2_end)
> +       KMOV    %k4, %VRCX
> +# else
> +       /* Combine last 2 VEC matches for VEC_SIZE == 32. If rcx (from
> +          VEC(3)) is zero (no CHAR in VEC(3)) then it won't affect the
> +          result in rsi (from VEC(4)). If rcx is non-zero then there
> +          is a CHAR in VEC(3) and bsrq will use that position.  */
> +       KMOV    %k4, %VRSI
> +       salq    $32, %rcx
> +       orq     %rsi, %rcx
> +# endif
> +       bsrq    %rcx, %rcx
> +       addq    %rcx, %rax
> +       ret
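Two of the mask tricks above, spelled out in a hedged C sketch
(made-up names, GCC builtins; not the actual code).  k1 holds 1s at
NON-matches, so `inc` overflows to zero iff every byte was a
non-match, a later `neg` completes the `not` (because -(x + 1) == ~x),
and for VEC_SIZE == 32 the two final masks are glued into one 64-bit
value so a single bsrq picks the match closest to the end:

    #include <stdint.h>

    /* inc; jnz: there is a match iff the non-match mask is not
       all-ones.  */
    static inline int
    any_match (uint32_t nonmatch_mask)
    {
      return (uint32_t) (nonmatch_mask + 1) != 0;
    }

    /* neg after the inc recovers the match bits: -(x + 1) == ~x.  */
    static inline uint32_t
    match_mask (uint32_t nonmatch_mask)
    {
      return (uint32_t) - (nonmatch_mask + 1);
    }

    /* salq $32; orq; bsrq: hi_mask comes from the VEC at higher
       addresses so its bits dominate.  At least one of the masks must
       be non-zero.  */
    static inline int
    highest_match_bit (uint32_t hi_mask, uint32_t lo_mask)
    {
      uint64_t both = ((uint64_t) hi_mask << 32) | lo_mask;
      return 63 - __builtin_clzll (both);   /* bsr semantics */
    }
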
>
> -       testl   %ecx, %ecx
> -       jnz     L(ret_vec_x0_dec)
> +       .p2align 4,, 4
> +L(first_vec_x0_end):
> +       /* rcx has 1s at non-matches so we need to `not` it. We used
> +          `inc` to test if zero so use `neg` to complete the `not` so
> +          the last 1 bit represents a match.  NB: (-(x + 1) == ~x).  */
> +       neg     %VRCX
> +       bsr     %VRCX, %VRCX
> +       leaq    (VEC_SIZE * 3)(%rcx, %rax), %rax
> +       ret
>
> +       .p2align 4,, 10
> +L(first_vec_x1_end):
> +       bsr     %VRCX, %VRCX
> +       leaq    (VEC_SIZE * 2)(%rcx, %rax), %rax
> +       ret
>
> -       vpcmpb  $0, (VEC_SIZE * -2)(%rax), %VMMMATCH, %k0
> -       kmovd   %k0, %ecx
> +# if VEC_SIZE == 64
> +       /* Since we can't combine the last 2x VEC for VEC_SIZE == 64
> +          we need a return label for it.  */
> +       .p2align 4,, 4
> +L(first_vec_x2_end):
> +       bsr     %VRCX, %VRCX
> +       leaq    (VEC_SIZE * 1)(%rcx, %rax), %rax
> +       ret
> +# endif
>
> -       testl   %ecx, %ecx
> -       jnz     L(ret_vec_x1)
>
> -       /* Used no matter what.  */
> -       vpcmpb  $0, (VEC_SIZE * -3)(%rax), %VMMMATCH, %k0
> -       kmovd   %k0, %ecx
> +       .p2align 4,, 4
> +L(page_cross):
> +       /* Only the lower bits eax[log2(VEC_SIZE):0] can be set so we
> +          can use movzbl to get the number of bytes we are checking
> +          here.  */
> +       movzbl  %al, %ecx
> +       andq    $-VEC_SIZE, %rax
> +       vpcmpeqb (%rax), %VMATCH, %k0
> +       KMOV    %k0, %VRSI
>
> -       cmpl    $(VEC_SIZE * 3), %edx
> -       ja      L(last_vec)
> +       /* eax was computed as %rdi + %rdx - 1 so need to add back 1
> +          here.  */
> +       leal    1(%rcx), %r8d
>
> -       lzcntl  %ecx, %ecx
> -       subq    $(VEC_SIZE * 2 + 1), %rax
> -       subq    %rcx, %rax
> -       cmpq    %rax, %rdi
> -       jbe     L(ret_1)
> +       /* Invert ecx to get the shift count for shifting byte matches
> +          that are out of range out of the mask.  */
> +       notl    %ecx
> +       shlx    %VRCX, %VRSI, %VRSI
> +
> +       /* If rdx <= r8 then the entire [buf, buf + len] is handled in
> +          the page cross case.  NB: we can't use the trick here we use
> +          in the non page-cross case because we aren't checking full
> +          VEC_SIZE.  */
> +       cmpq    %r8, %rdx
> +       ja      L(page_cross_check)
> +       lzcnt   %VRSI, %VRSI
> +       subl    %esi, %edx
> +       ja      L(page_cross_ret)
>         xorl    %eax, %eax
> -L(ret_1):
>         ret
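To make the page-cross masking concrete, a minimal C sketch
(hypothetical names; not the actual code).  The load was aligned down
to VEC_SIZE, so match bits for bytes past endptr - 1 must be
discarded: shifting the mask left by VEC_SIZE - 1 - offset does that
and parks the last valid byte at the MSB, and `not` of the low bits of
endptr - 1 is exactly that count once shlx masks it to the operand
width:

    #include <stdint.h>

    #define VEC_SIZE 32

    static inline uint32_t
    mask_out_of_range (uint32_t eq_mask, uintptr_t endptr_minus_1)
    {
      uint32_t last = endptr_minus_1 % VEC_SIZE; /* offset of last byte */
      /* shlx only uses the low log2(32) bits of the count, so ~last
         acts as (VEC_SIZE - 1) - last.  */
      uint32_t shift = (uint32_t) ~last & (VEC_SIZE - 1);
      return eq_mask << shift; /* last valid byte's bit is now the MSB */
    }

lzcnt of the result is then the distance back from endptr - 1, just as
in the non page-cross paths.
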
>
> -       .p2align 4,, 6
> -L(loop_end):
> -       kmovd   %k1, %ecx
> -       notl    %ecx
> -       testl   %ecx, %ecx
> -       jnz     L(ret_vec_x0_end)
> +L(page_cross_check):
> +       test    %VRSI, %VRSI
> +       jz      L(page_cross_continue)
>
> -       vptestnmb %VMM(2), %VMM(2), %k0
> -       kmovd   %k0, %ecx
> -       testl   %ecx, %ecx
> -       jnz     L(ret_vec_x1_end)
> -
> -       kmovd   %k2, %ecx
> -       kmovd   %k4, %esi
> -       /* Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3)
> -          then it won't affect the result in esi (VEC4). If ecx is non-zero
> -          then CHAR in VEC3 and bsrq will use that position.  */
> -       salq    $32, %rcx
> -       orq     %rsi, %rcx
> -       bsrq    %rcx, %rcx
> -       addq    %rcx, %rax
> -       ret
> -       .p2align 4,, 4
> -L(ret_vec_x0_end):
> -       addq    $(VEC_SIZE), %rax
> -L(ret_vec_x1_end):
> -       bsrl    %ecx, %ecx
> -       leaq    (VEC_SIZE * 2)(%rax, %rcx), %rax
> +       lzcnt   %VRSI, %VRSI
> +       subl    %esi, %edx
> +L(page_cross_ret):
> +       leaq    -1(%rdi, %rdx), %rax
>         ret
> -
>  END(MEMRCHR)
>  #endif
> --
> 2.34.1
>

[-- Attachment #2: memrchr.txt --]
[-- Type: text/plain, Size: 263958 bytes --]

Results For: memrchr
align,invert_pos ,len  ,pos  ,seek_char ,__memrchr_evex ,__memrchr_evex_orig ,new/old 
0    ,0          ,0    ,1    ,0         ,3.418          ,3.438               ,0.994 
0    ,0          ,0    ,1    ,23        ,3.422          ,3.401               ,1.006 
0    ,0          ,1    ,2    ,0         ,4.189          ,4.163               ,1.006 
0    ,0          ,1    ,2    ,23        ,4.154          ,4.346               ,0.956 
0    ,0          ,10   ,1    ,23        ,4.217          ,4.475               ,0.942 
0    ,0          ,10   ,11   ,0         ,4.195          ,4.46                ,0.941 
0    ,0          ,10   ,11   ,23        ,4.205          ,4.536               ,0.927 
0    ,0          ,10   ,2    ,0         ,4.186          ,4.507               ,0.929 
0    ,0          ,10   ,9    ,0         ,4.194          ,4.481               ,0.936 
0    ,0          ,10   ,9    ,23        ,4.153          ,4.426               ,0.938 
0    ,0          ,1024 ,1024 ,23        ,23.769         ,23.165              ,1.026 
0    ,0          ,1024 ,1056 ,23        ,24.087         ,23.455              ,1.027 
0    ,0          ,1024 ,1088 ,23        ,24.147         ,23.502              ,1.027 
0    ,0          ,1024 ,1120 ,23        ,25.172         ,23.543              ,1.069 
0    ,0          ,1024 ,1152 ,23        ,24.004         ,23.58               ,1.018 
0    ,0          ,1024 ,1184 ,23        ,24.429         ,23.461              ,1.041 
0    ,0          ,1024 ,1216 ,23        ,23.918         ,23.489              ,1.018 
0    ,0          ,1024 ,1248 ,23        ,24.118         ,23.375              ,1.032 
0    ,0          ,1024 ,1280 ,23        ,23.877         ,23.557              ,1.014 
0    ,0          ,1024 ,1312 ,23        ,23.889         ,23.481              ,1.017 
0    ,0          ,1024 ,1344 ,23        ,23.984         ,23.555              ,1.018 
0    ,0          ,1024 ,1376 ,23        ,24.068         ,23.516              ,1.023 
0    ,0          ,1024 ,1408 ,23        ,23.827         ,23.483              ,1.015 
0    ,0          ,1024 ,640  ,23        ,13.643         ,13.138              ,1.038 
0    ,0          ,1024 ,672  ,23        ,13.559         ,13.102              ,1.035 
0    ,0          ,1024 ,704  ,23        ,12.635         ,12.372              ,1.021 
0    ,0          ,1024 ,736  ,23        ,12.095         ,12.017              ,1.007 
0    ,0          ,1024 ,768  ,23        ,11.178         ,10.992              ,1.017 
0    ,0          ,1024 ,800  ,23        ,11.121         ,11.027              ,1.009 
0    ,0          ,1024 ,832  ,23        ,10.295         ,10.158              ,1.013 
0    ,0          ,1024 ,864  ,23        ,6.441          ,6.427               ,1.002 
0    ,0          ,1024 ,896  ,23        ,5.856          ,5.855               ,1.0   
0    ,0          ,1024 ,928  ,23        ,5.196          ,5.254               ,0.989 
0    ,0          ,1024 ,960  ,23        ,4.723          ,5.233               ,0.902 
0    ,0          ,1024 ,992  ,23        ,3.543          ,4.472               ,0.792 
0    ,0          ,1056 ,1024 ,23        ,3.325          ,4.322               ,0.769 
0    ,0          ,1088 ,1024 ,23        ,4.646          ,5.054               ,0.919 
0    ,0          ,11   ,1    ,23        ,4.205          ,4.538               ,0.927 
0    ,0          ,11   ,10   ,0         ,4.186          ,4.5                 ,0.93  
0    ,0          ,11   ,10   ,23        ,4.195          ,4.461               ,0.94  
0    ,0          ,11   ,12   ,0         ,4.192          ,4.58                ,0.915 
0    ,0          ,11   ,12   ,23        ,4.19           ,4.51                ,0.929 
0    ,0          ,11   ,2    ,0         ,4.187          ,4.506               ,0.929 
0    ,0          ,112  ,16   ,23        ,5.98           ,6.449               ,0.927 
0    ,0          ,1120 ,1024 ,23        ,5.18           ,5.246               ,0.987 
0    ,0          ,1152 ,1024 ,23        ,5.842          ,5.936               ,0.984 
0    ,0          ,1184 ,1024 ,23        ,6.452          ,6.508               ,0.992 
0    ,0          ,12   ,1    ,23        ,4.195          ,4.605               ,0.911 
0    ,0          ,12   ,11   ,0         ,4.184          ,4.469               ,0.936 
0    ,0          ,12   ,11   ,23        ,4.173          ,4.454               ,0.937 
0    ,0          ,12   ,13   ,0         ,4.184          ,4.527               ,0.924 
0    ,0          ,12   ,13   ,23        ,4.199          ,4.48                ,0.937 
0    ,0          ,12   ,2    ,0         ,4.174          ,4.475               ,0.933 
0    ,0          ,1216 ,1024 ,23        ,11.653         ,13.008              ,0.896 
0    ,0          ,1248 ,1024 ,23        ,11.125         ,10.955              ,1.016 
0    ,0          ,128  ,128  ,23        ,5.494          ,6.219               ,0.883 
0    ,0          ,128  ,160  ,23        ,5.583          ,6.388               ,0.874 
0    ,0          ,128  ,192  ,23        ,5.585          ,6.29                ,0.888 
0    ,0          ,128  ,224  ,23        ,5.7            ,6.495               ,0.878 
0    ,0          ,128  ,256  ,23        ,5.614          ,6.363               ,0.882 
0    ,0          ,128  ,288  ,23        ,5.896          ,6.294               ,0.937 
0    ,0          ,128  ,32   ,23        ,5.534          ,5.598               ,0.989 
0    ,0          ,128  ,320  ,23        ,5.806          ,6.433               ,0.903 
0    ,0          ,128  ,352  ,23        ,5.833          ,6.408               ,0.91  
0    ,0          ,128  ,384  ,23        ,5.735          ,6.268               ,0.915 
0    ,0          ,128  ,416  ,23        ,5.71           ,6.371               ,0.896 
0    ,0          ,128  ,448  ,23        ,5.789          ,6.429               ,0.9   
0    ,0          ,128  ,480  ,23        ,5.77           ,6.257               ,0.922 
0    ,0          ,128  ,512  ,23        ,5.998          ,6.57                ,0.913 
0    ,0          ,128  ,64   ,23        ,4.476          ,4.992               ,0.897 
0    ,0          ,128  ,96   ,23        ,3.252          ,4.281               ,0.76  
0    ,0          ,1280 ,1024 ,23        ,11.057         ,10.923              ,1.012 
0    ,0          ,13   ,1    ,23        ,4.208          ,4.542               ,0.926 
0    ,0          ,13   ,12   ,0         ,4.184          ,4.445               ,0.941 
0    ,0          ,13   ,12   ,23        ,4.172          ,4.463               ,0.935 
0    ,0          ,13   ,14   ,0         ,4.191          ,4.514               ,0.929 
0    ,0          ,13   ,14   ,23        ,4.216          ,4.526               ,0.932 
0    ,0          ,13   ,2    ,0         ,4.174          ,4.45                ,0.938 
0    ,0          ,1312 ,1024 ,23        ,11.061         ,10.852              ,1.019 
0    ,0          ,1344 ,1024 ,23        ,13.465         ,13.228              ,1.018 
0    ,0          ,1376 ,1024 ,23        ,13.411         ,13.115              ,1.023 
0    ,0          ,14   ,1    ,23        ,4.197          ,4.498               ,0.933 
0    ,0          ,14   ,13   ,0         ,4.207          ,4.517               ,0.931 
0    ,0          ,14   ,13   ,23        ,4.179          ,4.477               ,0.933 
0    ,0          ,14   ,15   ,0         ,4.196          ,4.564               ,0.919 
0    ,0          ,14   ,15   ,23        ,4.199          ,4.587               ,0.915 
0    ,0          ,14   ,2    ,0         ,4.156          ,4.496               ,0.924 
0    ,0          ,1408 ,1024 ,23        ,13.409         ,13.148              ,1.02  
0    ,0          ,144  ,16   ,23        ,6.901          ,7.075               ,0.975 
0    ,0          ,15   ,1    ,23        ,4.196          ,4.515               ,0.929 
0    ,0          ,15   ,14   ,0         ,4.179          ,4.52                ,0.925 
0    ,0          ,15   ,14   ,23        ,4.176          ,4.504               ,0.927 
0    ,0          ,15   ,16   ,0         ,4.159          ,4.736               ,0.878 
0    ,0          ,15   ,16   ,23        ,4.158          ,4.749               ,0.876 
0    ,0          ,15   ,2    ,0         ,4.137          ,4.425               ,0.935 
0    ,0          ,16   ,1    ,23        ,5.281          ,5.219               ,1.012 
0    ,0          ,16   ,112  ,23        ,4.115          ,4.617               ,0.891 
0    ,0          ,16   ,144  ,23        ,4.061          ,4.684               ,0.867 
0    ,0          ,16   ,15   ,0         ,4.158          ,4.145               ,1.003 
0    ,0          ,16   ,15   ,23        ,4.232          ,4.469               ,0.947 
0    ,0          ,16   ,16   ,23        ,4.003          ,4.376               ,0.915 
0    ,0          ,16   ,17   ,0         ,4.195          ,4.799               ,0.874 
0    ,0          ,16   ,17   ,23        ,4.229          ,4.909               ,0.862 
0    ,0          ,16   ,176  ,23        ,4.217          ,4.777               ,0.883 
0    ,0          ,16   ,2    ,0         ,4.538          ,5.01                ,0.906 
0    ,0          ,16   ,208  ,23        ,3.961          ,4.537               ,0.873 
0    ,0          ,16   ,240  ,23        ,4.05           ,4.54                ,0.892 
0    ,0          ,16   ,272  ,23        ,3.989          ,4.484               ,0.89  
0    ,0          ,16   ,304  ,23        ,3.959          ,4.366               ,0.907 
0    ,0          ,16   ,336  ,23        ,3.978          ,4.395               ,0.905 
0    ,0          ,16   ,368  ,23        ,3.985          ,4.44                ,0.898 
0    ,0          ,16   ,400  ,23        ,3.987          ,4.447               ,0.897 
0    ,0          ,16   ,48   ,23        ,4.266          ,4.654               ,0.917 
0    ,0          ,16   ,80   ,23        ,4.182          ,4.625               ,0.904 
0    ,0          ,160  ,128  ,23        ,3.255          ,4.341               ,0.75  
0    ,0          ,160  ,256  ,23        ,6.323          ,6.88                ,0.919 
0    ,0          ,160  ,32   ,23        ,7.131          ,7.717               ,0.924 
0    ,0          ,160  ,512  ,23        ,6.336          ,6.859               ,0.924 
0    ,0          ,160  ,64   ,23        ,5.431          ,5.07                ,1.071 
0    ,0          ,1664 ,2048 ,23        ,35.08          ,33.206              ,1.056 
0    ,0          ,1696 ,2048 ,23        ,35.042         ,33.087              ,1.059 
0    ,0          ,17   ,1    ,23        ,4.56           ,4.797               ,0.951 
0    ,0          ,17   ,16   ,0         ,4.133          ,4.507               ,0.917 
0    ,0          ,17   ,16   ,23        ,4.131          ,4.558               ,0.906 
0    ,0          ,17   ,18   ,0         ,4.218          ,4.905               ,0.86  
0    ,0          ,17   ,18   ,23        ,4.235          ,4.885               ,0.867 
0    ,0          ,17   ,2    ,0         ,4.15           ,4.69                ,0.885 
0    ,0          ,1728 ,2048 ,23        ,37.448         ,35.051              ,1.068 
0    ,0          ,176  ,16   ,23        ,7.794          ,8.079               ,0.965 
0    ,0          ,1760 ,2048 ,23        ,37.532         ,36.158              ,1.038 
0    ,0          ,1792 ,2048 ,23        ,37.367         ,34.982              ,1.068 
0    ,0          ,18   ,1    ,23        ,4.874          ,4.858               ,1.003 
0    ,0          ,18   ,17   ,0         ,4.153          ,4.146               ,1.002 
0    ,0          ,18   ,17   ,23        ,4.142          ,4.23                ,0.979 
0    ,0          ,18   ,19   ,0         ,4.241          ,5.171               ,0.82  
0    ,0          ,18   ,19   ,23        ,4.264          ,5.273               ,0.809 
0    ,0          ,18   ,2    ,0         ,4.248          ,4.713               ,0.901 
0    ,0          ,1824 ,2048 ,23        ,37.377         ,34.957              ,1.069 
0    ,0          ,1856 ,2048 ,23        ,39.768         ,36.956              ,1.076 
0    ,0          ,1888 ,2048 ,23        ,39.755         ,36.868              ,1.078 
0    ,0          ,19   ,1    ,23        ,4.293          ,4.876               ,0.88  
0    ,0          ,19   ,18   ,0         ,4.267          ,4.245               ,1.005 
0    ,0          ,19   ,18   ,23        ,4.253          ,4.25                ,1.001 
0    ,0          ,19   ,2    ,0         ,4.135          ,4.886               ,0.846 
0    ,0          ,19   ,20   ,0         ,4.143          ,4.899               ,0.846 
0    ,0          ,19   ,20   ,23        ,4.171          ,4.906               ,0.85  
0    ,0          ,192  ,128  ,23        ,4.577          ,4.988               ,0.918 
0    ,0          ,192  ,256  ,23        ,7.741          ,8.016               ,0.966 
0    ,0          ,192  ,32   ,23        ,7.599          ,7.85                ,0.968 
0    ,0          ,192  ,512  ,23        ,7.626          ,7.975               ,0.956 
0    ,0          ,192  ,64   ,23        ,5.756          ,5.79                ,0.994 
0    ,0          ,1920 ,2048 ,23        ,39.772         ,36.82               ,1.08  
0    ,0          ,1952 ,2048 ,23        ,39.921         ,36.885              ,1.082 
0    ,0          ,1984 ,2048 ,23        ,41.242         ,39.169              ,1.053 
0    ,0          ,2    ,1    ,0         ,4.017          ,4.433               ,0.906 
0    ,0          ,2    ,1    ,23        ,4.119          ,4.065               ,1.013 
0    ,0          ,2    ,2    ,0         ,4.107          ,4.409               ,0.931 
0    ,0          ,2    ,3    ,0         ,4.205          ,4.284               ,0.981 
0    ,0          ,2    ,3    ,23        ,4.206          ,4.457               ,0.944 
0    ,0          ,20   ,1    ,23        ,4.882          ,5.595               ,0.872 
0    ,0          ,20   ,19   ,0         ,4.166          ,4.266               ,0.977 
0    ,0          ,20   ,19   ,23        ,4.162          ,4.608               ,0.903 
0    ,0          ,20   ,2    ,0         ,4.678          ,4.665               ,1.003 
0    ,0          ,20   ,21   ,0         ,4.166          ,4.784               ,0.871 
0    ,0          ,20   ,21   ,23        ,4.179          ,4.784               ,0.874 
0    ,0          ,2016 ,2048 ,23        ,41.093         ,39.284              ,1.046 
0    ,0          ,2048 ,1024 ,0         ,24.885         ,24.022              ,1.036 
0    ,0          ,2048 ,1024 ,23        ,24.798         ,23.978              ,1.034 
0    ,0          ,2048 ,128  ,0         ,40.325         ,37.87               ,1.065 
0    ,0          ,2048 ,128  ,23        ,40.457         ,37.87               ,1.068 
0    ,0          ,2048 ,1664 ,23        ,13.382         ,13.063              ,1.024 
0    ,0          ,2048 ,1696 ,23        ,13.434         ,13.155              ,1.021 
0    ,0          ,2048 ,1728 ,23        ,12.592         ,12.27               ,1.026 
0    ,0          ,2048 ,1760 ,23        ,11.962         ,11.962              ,1.0   
0    ,0          ,2048 ,1792 ,23        ,11.011         ,10.902              ,1.01  
0    ,0          ,2048 ,1824 ,23        ,10.995         ,10.938              ,1.005 
0    ,0          ,2048 ,1856 ,23        ,10.201         ,10.131              ,1.007 
0    ,0          ,2048 ,1888 ,23        ,6.404          ,6.297               ,1.017 
0    ,0          ,2048 ,1920 ,23        ,5.714          ,5.748               ,0.994 
0    ,0          ,2048 ,1952 ,23        ,5.074          ,5.14                ,0.987 
0    ,0          ,2048 ,1984 ,23        ,4.564          ,4.995               ,0.914 
0    ,0          ,2048 ,2016 ,23        ,3.396          ,4.376               ,0.776 
0    ,0          ,2048 ,2048 ,0         ,40.658         ,39.074              ,1.041 
0    ,0          ,2048 ,2048 ,23        ,40.347         ,38.805              ,1.04  
0    ,0          ,2048 ,2080 ,23        ,41.989         ,39.193              ,1.071 
0    ,0          ,2048 ,2112 ,23        ,40.936         ,39.121              ,1.046 
0    ,0          ,2048 ,2144 ,23        ,41.161         ,39.188              ,1.05  
0    ,0          ,2048 ,2176 ,23        ,40.989         ,39.219              ,1.045 
0    ,0          ,2048 ,2208 ,23        ,41.301         ,39.104              ,1.056 
0    ,0          ,2048 ,2240 ,23        ,41.369         ,39.175              ,1.056 
0    ,0          ,2048 ,2272 ,23        ,41.537         ,39.286              ,1.057 
0    ,0          ,2048 ,2304 ,23        ,41.71          ,39.172              ,1.065 
0    ,0          ,2048 ,2336 ,23        ,41.176         ,39.057              ,1.054 
0    ,0          ,2048 ,2368 ,23        ,41.224         ,39.21               ,1.051 
0    ,0          ,2048 ,2400 ,23        ,41.119         ,39.128              ,1.051 
0    ,0          ,2048 ,2432 ,23        ,41.143         ,39.093              ,1.052 
0    ,0          ,2048 ,256  ,0         ,38.247         ,35.695              ,1.071 
0    ,0          ,2048 ,256  ,23        ,38.579         ,35.723              ,1.08  
0    ,0          ,2048 ,32   ,0         ,42.099         ,38.795              ,1.085 
0    ,0          ,2048 ,32   ,23        ,42.233         ,38.449              ,1.098 
0    ,0          ,2048 ,512  ,0         ,34.046         ,31.725              ,1.073 
0    ,0          ,2048 ,512  ,23        ,34.009         ,31.905              ,1.066 
0    ,0          ,2048 ,64   ,0         ,40.225         ,37.885              ,1.062 
0    ,0          ,2048 ,64   ,23        ,40.111         ,37.916              ,1.058 
0    ,0          ,208  ,16   ,23        ,8.706          ,8.964               ,0.971 
0    ,0          ,2080 ,2048 ,23        ,3.357          ,4.439               ,0.756 
0    ,0          ,21   ,1    ,23        ,4.667          ,5.023               ,0.929 
0    ,0          ,21   ,2    ,0         ,4.458          ,4.805               ,0.928 
0    ,0          ,21   ,20   ,0         ,4.133          ,4.503               ,0.918 
0    ,0          ,21   ,20   ,23        ,4.095          ,4.581               ,0.894 
0    ,0          ,21   ,22   ,0         ,4.178          ,4.819               ,0.867 
0    ,0          ,21   ,22   ,23        ,4.393          ,4.926               ,0.892 
0    ,0          ,2112 ,2048 ,23        ,4.632          ,5.076               ,0.913 
0    ,0          ,2144 ,2048 ,23        ,5.248          ,5.21                ,1.007 
0    ,0          ,2176 ,2048 ,23        ,5.902          ,5.979               ,0.987 
0    ,0          ,22   ,1    ,23        ,4.42           ,4.878               ,0.906 
0    ,0          ,22   ,2    ,0         ,4.384          ,4.881               ,0.898 
0    ,0          ,22   ,21   ,0         ,4.124          ,4.492               ,0.918 
0    ,0          ,22   ,21   ,23        ,4.122          ,4.854               ,0.849 
0    ,0          ,22   ,23   ,0         ,4.153          ,4.901               ,0.847 
0    ,0          ,22   ,23   ,23        ,4.231          ,5.081               ,0.833 
0    ,0          ,2208 ,2048 ,23        ,6.535          ,6.545               ,0.998 
0    ,0          ,224  ,128  ,23        ,5.022          ,5.113               ,0.982 
0    ,0          ,224  ,256  ,23        ,8.421          ,8.514               ,0.989 
0    ,0          ,224  ,32   ,23        ,8.467          ,8.063               ,1.05  
0    ,0          ,224  ,512  ,23        ,8.37           ,8.561               ,0.978 
0    ,0          ,224  ,64   ,23        ,6.351          ,6.307               ,1.007 
0    ,0          ,2240 ,2048 ,23        ,11.274         ,11.262              ,1.001 
0    ,0          ,2272 ,2048 ,23        ,11.408         ,12.951              ,0.881 
0    ,0          ,23   ,1    ,23        ,4.669          ,4.752               ,0.983 
0    ,0          ,23   ,2    ,0         ,4.335          ,4.581               ,0.946 
0    ,0          ,23   ,22   ,0         ,4.244          ,4.494               ,0.944 
0    ,0          ,23   ,22   ,23        ,4.233          ,4.686               ,0.903 
0    ,0          ,23   ,24   ,0         ,4.282          ,5.074               ,0.844 
0    ,0          ,23   ,24   ,23        ,4.348          ,5.185               ,0.839 
0    ,0          ,2304 ,2048 ,23        ,11.17          ,11.033              ,1.012 
0    ,0          ,2336 ,2048 ,23        ,11.212         ,11.015              ,1.018 
0    ,0          ,2368 ,2048 ,23        ,13.644         ,13.338              ,1.023 
0    ,0          ,24   ,1    ,23        ,4.491          ,4.865               ,0.923 
0    ,0          ,24   ,2    ,0         ,4.392          ,4.602               ,0.954 
0    ,0          ,24   ,23   ,0         ,4.367          ,4.125               ,1.059 
0    ,0          ,24   ,23   ,23        ,4.345          ,4.43                ,0.981 
0    ,0          ,24   ,25   ,0         ,4.235          ,4.93                ,0.859 
0    ,0          ,24   ,25   ,23        ,4.256          ,4.982               ,0.854 
0    ,0          ,240  ,16   ,23        ,9.724          ,9.198               ,1.057 
0    ,0          ,2400 ,2048 ,23        ,13.547         ,13.255              ,1.022 
0    ,0          ,2432 ,2048 ,23        ,13.511         ,13.073              ,1.034 
0    ,0          ,25   ,1    ,23        ,4.577          ,4.864               ,0.941 
0    ,0          ,25   ,2    ,0         ,4.484          ,4.924               ,0.911 
0    ,0          ,25   ,24   ,0         ,4.232          ,4.232               ,1.0   
0    ,0          ,25   ,24   ,23        ,4.258          ,4.546               ,0.936 
0    ,0          ,25   ,26   ,0         ,4.21           ,5.123               ,0.822 
0    ,0          ,25   ,26   ,23        ,4.275          ,5.077               ,0.842 
0    ,0          ,256  ,1    ,0         ,9.374          ,9.03                ,1.038 
0    ,0          ,256  ,1    ,23        ,9.42           ,9.104               ,1.035 
0    ,0          ,256  ,128  ,23        ,5.885          ,5.754               ,1.023 
0    ,0          ,256  ,160  ,23        ,5.017          ,5.037               ,0.996 
0    ,0          ,256  ,192  ,23        ,4.491          ,5.003               ,0.898 
0    ,0          ,256  ,2    ,0         ,9.37           ,9.08                ,1.032 
0    ,0          ,256  ,2    ,23        ,9.393          ,9.14                ,1.028 
0    ,0          ,256  ,224  ,23        ,3.294          ,4.214               ,0.782 
0    ,0          ,256  ,256  ,23        ,9.583          ,9.046               ,1.059 
0    ,0          ,256  ,288  ,23        ,9.911          ,9.147               ,1.084 
0    ,0          ,256  ,3    ,0         ,9.49           ,9.037               ,1.05  
0    ,0          ,256  ,3    ,23        ,9.481          ,9.085               ,1.044 
0    ,0          ,256  ,32   ,23        ,8.183          ,8.242               ,0.993 
0    ,0          ,256  ,320  ,23        ,10.085         ,9.136               ,1.104 
0    ,0          ,256  ,352  ,23        ,9.925          ,9.159               ,1.084 
0    ,0          ,256  ,384  ,23        ,9.788          ,9.133               ,1.072 
0    ,0          ,256  ,4    ,0         ,9.491          ,9.113               ,1.042 
0    ,0          ,256  ,4    ,23        ,9.462          ,9.09                ,1.041 
0    ,0          ,256  ,416  ,23        ,9.473          ,9.202               ,1.029 
0    ,0          ,256  ,448  ,23        ,9.785          ,9.19                ,1.065 
0    ,0          ,256  ,480  ,23        ,9.976          ,9.139               ,1.092 
0    ,0          ,256  ,5    ,0         ,9.455          ,9.037               ,1.046 
0    ,0          ,256  ,5    ,23        ,9.456          ,9.109               ,1.038 
0    ,0          ,256  ,512  ,23        ,10.303         ,9.137               ,1.128 
0    ,0          ,256  ,544  ,23        ,9.939          ,9.129               ,1.089 
0    ,0          ,256  ,576  ,23        ,9.972          ,9.208               ,1.083 
0    ,0          ,256  ,6    ,0         ,9.499          ,9.108               ,1.043 
0    ,0          ,256  ,6    ,23        ,9.527          ,9.122               ,1.044 
0    ,0          ,256  ,608  ,23        ,10.028         ,9.077               ,1.105 
0    ,0          ,256  ,64   ,23        ,7.529          ,7.676               ,0.981 
0    ,0          ,256  ,640  ,23        ,9.809          ,9.212               ,1.065 
0    ,0          ,256  ,7    ,0         ,9.347          ,9.11                ,1.026 
0    ,0          ,256  ,7    ,23        ,9.335          ,9.097               ,1.026 
0    ,0          ,256  ,96   ,23        ,6.396          ,6.311               ,1.013 
0    ,0          ,26   ,1    ,23        ,4.898          ,4.83                ,1.014 
0    ,0          ,26   ,2    ,0         ,4.35           ,4.506               ,0.965 
0    ,0          ,26   ,25   ,0         ,4.364          ,4.514               ,0.967 
0    ,0          ,26   ,25   ,23        ,4.278          ,4.753               ,0.9   
0    ,0          ,26   ,27   ,0         ,4.26           ,5.174               ,0.823 
0    ,0          ,26   ,27   ,23        ,4.346          ,5.225               ,0.832 
0    ,0          ,27   ,1    ,23        ,4.464          ,4.715               ,0.947 
0    ,0          ,27   ,2    ,0         ,4.282          ,4.628               ,0.925 
0    ,0          ,27   ,26   ,0         ,4.207          ,4.202               ,1.001 
0    ,0          ,27   ,26   ,23        ,4.212          ,4.463               ,0.944 
0    ,0          ,27   ,28   ,0         ,4.34           ,5.334               ,0.814 
0    ,0          ,27   ,28   ,23        ,4.412          ,5.311               ,0.831 
0    ,0          ,272  ,16   ,23        ,10.238         ,9.82                ,1.043 
0    ,0          ,28   ,1    ,23        ,4.983          ,4.597               ,1.084 
0    ,0          ,28   ,2    ,0         ,4.273          ,4.505               ,0.948 
0    ,0          ,28   ,27   ,0         ,4.113          ,4.321               ,0.952 
0    ,0          ,28   ,27   ,23        ,4.31           ,4.624               ,0.932 
0    ,0          ,28   ,29   ,0         ,4.225          ,5.028               ,0.84  
0    ,0          ,28   ,29   ,23        ,4.324          ,5.174               ,0.836 
0    ,0          ,288  ,128  ,23        ,6.538          ,6.34                ,1.031 
0    ,0          ,288  ,256  ,23        ,3.252          ,4.295               ,0.757 
0    ,0          ,288  ,32   ,23        ,9.603          ,9.187               ,1.045 
0    ,0          ,288  ,512  ,23        ,9.928          ,9.707               ,1.023 
0    ,0          ,288  ,64   ,23        ,8.104          ,8.175               ,0.991 
0    ,0          ,29   ,1    ,23        ,4.634          ,4.633               ,1.0   
0    ,0          ,29   ,2    ,0         ,4.298          ,4.656               ,0.923 
0    ,0          ,29   ,28   ,0         ,4.34           ,4.079               ,1.064 
0    ,0          ,29   ,28   ,23        ,4.308          ,4.52                ,0.953 
0    ,0          ,29   ,30   ,0         ,4.385          ,5.098               ,0.86  
0    ,0          ,29   ,30   ,23        ,4.542          ,5.193               ,0.875 
0    ,0          ,3    ,1    ,23        ,4.282          ,4.267               ,1.003 
0    ,0          ,3    ,2    ,0         ,4.201          ,4.472               ,0.939 
0    ,0          ,3    ,2    ,23        ,4.138          ,4.287               ,0.965 
0    ,0          ,3    ,4    ,0         ,4.207          ,4.563               ,0.922 
0    ,0          ,3    ,4    ,23        ,4.205          ,4.58                ,0.918 
0    ,0          ,30   ,1    ,23        ,4.4            ,4.555               ,0.966 
0    ,0          ,30   ,2    ,0         ,4.243          ,4.524               ,0.938 
0    ,0          ,30   ,29   ,0         ,4.707          ,4.227               ,1.114 
0    ,0          ,30   ,29   ,23        ,4.347          ,4.45                ,0.977 
0    ,0          ,30   ,31   ,0         ,4.101          ,5.022               ,0.817 
0    ,0          ,30   ,31   ,23        ,4.339          ,5.042               ,0.861 
0    ,0          ,304  ,16   ,23        ,12.804         ,12.704              ,1.008 
0    ,0          ,31   ,1    ,23        ,4.109          ,4.504               ,0.912 
0    ,0          ,31   ,2    ,0         ,4.08           ,4.378               ,0.932 
0    ,0          ,31   ,30   ,0         ,4.338          ,4.182               ,1.037 
0    ,0          ,31   ,30   ,23        ,4.303          ,4.371               ,0.984 
0    ,0          ,32   ,1    ,23        ,4.324          ,4.733               ,0.914 
0    ,0          ,32   ,128  ,23        ,3.876          ,4.41                ,0.879 
0    ,0          ,32   ,160  ,23        ,4.523          ,5.469               ,0.827 
0    ,0          ,32   ,192  ,23        ,4.559          ,5.435               ,0.839 
0    ,0          ,32   ,2    ,0         ,4.176          ,4.574               ,0.913 
0    ,0          ,32   ,224  ,23        ,3.898          ,4.405               ,0.885 
0    ,0          ,32   ,256  ,23        ,3.881          ,4.445               ,0.873 
0    ,0          ,32   ,288  ,23        ,3.876          ,4.443               ,0.872 
0    ,0          ,32   ,31   ,0         ,4.183          ,4.037               ,1.036 
0    ,0          ,32   ,31   ,23        ,4.283          ,4.072               ,1.052 
0    ,0          ,32   ,32   ,23        ,3.849          ,4.277               ,0.9   
0    ,0          ,32   ,320  ,23        ,3.891          ,4.443               ,0.876 
0    ,0          ,32   ,352  ,23        ,3.909          ,4.464               ,0.876 
0    ,0          ,32   ,384  ,23        ,3.955          ,4.498               ,0.879 
0    ,0          ,32   ,416  ,23        ,3.914          ,4.477               ,0.874 
0    ,0          ,32   ,64   ,23        ,3.894          ,4.427               ,0.879 
0    ,0          ,32   ,96   ,23        ,3.847          ,4.398               ,0.875 
0    ,0          ,320  ,128  ,23        ,11.123         ,11.053              ,1.006 
0    ,0          ,320  ,256  ,23        ,4.555          ,5.023               ,0.907 
0    ,0          ,320  ,32   ,23        ,12.421         ,11.808              ,1.052 
0    ,0          ,320  ,512  ,23        ,12.29          ,12.421              ,0.989 
0    ,0          ,320  ,64   ,23        ,11.301         ,11.207              ,1.008 
0    ,0          ,336  ,16   ,23        ,12.884         ,12.657              ,1.018 
0    ,0          ,352  ,128  ,23        ,11.08          ,11.015              ,1.006 
0    ,0          ,352  ,256  ,23        ,5.075          ,5.061               ,1.003 
0    ,0          ,352  ,32   ,23        ,12.231         ,11.846              ,1.033 
0    ,0          ,352  ,512  ,23        ,12.373         ,12.446              ,0.994 
0    ,0          ,352  ,64   ,23        ,11.425         ,11.232              ,1.017 
0    ,0          ,368  ,16   ,23        ,12.707         ,12.713              ,1.0   
0    ,0          ,3712 ,4096 ,23        ,68.838         ,66.167              ,1.04  
0    ,0          ,3744 ,4096 ,23        ,68.55          ,66.121              ,1.037 
0    ,0          ,3776 ,4096 ,23        ,71.251         ,68.184              ,1.045 
0    ,0          ,3808 ,4096 ,23        ,70.52          ,68.04               ,1.036 
0    ,0          ,384  ,128  ,23        ,11.138         ,10.911              ,1.021 
0    ,0          ,384  ,256  ,23        ,5.763          ,5.706               ,1.01  
0    ,0          ,384  ,32   ,23        ,12.496         ,11.787              ,1.06  
0    ,0          ,384  ,512  ,23        ,12.413         ,12.429              ,0.999 
0    ,0          ,384  ,64   ,23        ,11.346         ,11.194              ,1.014 
0    ,0          ,3840 ,4096 ,23        ,70.921         ,67.982              ,1.043 
0    ,0          ,3872 ,4096 ,23        ,70.855         ,68.093              ,1.041 
0    ,0          ,3904 ,4096 ,23        ,73.578         ,70.116              ,1.049 
0    ,0          ,3936 ,4096 ,23        ,72.436         ,70.06               ,1.034 
0    ,0          ,3968 ,4096 ,23        ,72.531         ,70.268              ,1.032 
0    ,0          ,4    ,1    ,23        ,4.268          ,4.29                ,0.995 
0    ,0          ,4    ,2    ,0         ,4.207          ,4.482               ,0.939 
0    ,0          ,4    ,3    ,0         ,4.215          ,4.494               ,0.938 
0    ,0          ,4    ,3    ,23        ,4.226          ,4.544               ,0.93  
0    ,0          ,4    ,5    ,0         ,4.203          ,4.52                ,0.93  
0    ,0          ,4    ,5    ,23        ,4.197          ,4.487               ,0.935 
0    ,0          ,400  ,16   ,23        ,12.812         ,12.571              ,1.019 
0    ,0          ,4000 ,4096 ,23        ,71.085         ,70.296              ,1.011 
0    ,0          ,4032 ,4096 ,23        ,74.803         ,73.621              ,1.016 
0    ,0          ,4064 ,4096 ,23        ,74.815         ,73.484              ,1.018 
0    ,0          ,4096 ,3712 ,23        ,14.069         ,13.773              ,1.021 
0    ,0          ,4096 ,3744 ,23        ,13.516         ,13.066              ,1.034 
0    ,0          ,4096 ,3776 ,23        ,12.518         ,12.377              ,1.011 
0    ,0          ,4096 ,3808 ,23        ,13.046         ,12.03               ,1.084 
0    ,0          ,4096 ,3840 ,23        ,10.975         ,10.873              ,1.009 
0    ,0          ,4096 ,3872 ,23        ,11.041         ,10.854              ,1.017 
0    ,0          ,4096 ,3904 ,23        ,10.191         ,10.07               ,1.012 
0    ,0          ,4096 ,3936 ,23        ,6.324          ,6.358               ,0.995 
0    ,0          ,4096 ,3968 ,23        ,5.714          ,5.649               ,1.012 
0    ,0          ,4096 ,4000 ,23        ,5.02           ,5.056               ,0.993 
0    ,0          ,4096 ,4032 ,23        ,4.488          ,5.032               ,0.892 
0    ,0          ,4096 ,4064 ,23        ,3.286          ,4.359               ,0.754 
0    ,0          ,4096 ,4096 ,23        ,74.788         ,78.609              ,0.951 
0    ,0          ,4096 ,4128 ,23        ,75.277         ,75.964              ,0.991 
0    ,0          ,4096 ,4160 ,23        ,74.6           ,74.628              ,1.0   
0    ,0          ,4096 ,4192 ,23        ,74.965         ,74.891              ,1.001 
0    ,0          ,4096 ,4224 ,23        ,74.557         ,74.726              ,0.998 
0    ,0          ,4096 ,4256 ,23        ,74.298         ,72.462              ,1.025 
0    ,0          ,4096 ,4288 ,23        ,74.739         ,72.21               ,1.035 
0    ,0          ,4096 ,4320 ,23        ,74.699         ,72.557              ,1.03  
0    ,0          ,4096 ,4352 ,23        ,74.596         ,72.514              ,1.029 
0    ,0          ,4096 ,4384 ,23        ,74.688         ,73.66               ,1.014 
0    ,0          ,4096 ,4416 ,23        ,74.296         ,72.451              ,1.025 
0    ,0          ,4096 ,4448 ,23        ,74.662         ,73.314              ,1.018 
0    ,0          ,4096 ,4480 ,23        ,74.891         ,72.29               ,1.036 
0    ,0          ,4128 ,4096 ,23        ,4.344          ,5.058               ,0.859 
0    ,0          ,416  ,128  ,23        ,11.221         ,10.957              ,1.024 
0    ,0          ,416  ,256  ,23        ,6.449          ,6.366               ,1.013 
0    ,0          ,416  ,32   ,23        ,12.327         ,11.748              ,1.049 
0    ,0          ,416  ,512  ,23        ,12.347         ,12.404              ,0.995 
0    ,0          ,416  ,64   ,23        ,11.289         ,11.135              ,1.014 
0    ,0          ,4160 ,4096 ,23        ,4.579          ,4.988               ,0.918 
0    ,0          ,4192 ,4096 ,23        ,5.138          ,5.187               ,0.991 
0    ,0          ,4224 ,4096 ,23        ,5.824          ,5.86                ,0.994 
0    ,0          ,4256 ,4096 ,23        ,6.424          ,6.516               ,0.986 
0    ,0          ,4288 ,4096 ,23        ,11.161         ,10.926              ,1.022 
0    ,0          ,4320 ,4096 ,23        ,11.105         ,10.958              ,1.013 
0    ,0          ,4352 ,4096 ,23        ,11.083         ,10.992              ,1.008 
0    ,0          ,4384 ,4096 ,23        ,10.99          ,10.838              ,1.014 
0    ,0          ,4416 ,4096 ,23        ,13.484         ,13.163              ,1.024 
0    ,0          ,4448 ,4096 ,23        ,13.418         ,13.091              ,1.025 
0    ,0          ,448  ,128  ,23        ,13.755         ,13.221              ,1.04  
0    ,0          ,448  ,256  ,23        ,11.26          ,11.111              ,1.013 
0    ,0          ,448  ,512  ,23        ,14.859         ,14.757              ,1.007 
0    ,0          ,448  ,64   ,23        ,13.641         ,13.489              ,1.011 
0    ,0          ,4480 ,4096 ,23        ,13.435         ,12.977              ,1.035 
0    ,0          ,48   ,16   ,23        ,3.391          ,4.449               ,0.762 
0    ,0          ,480  ,128  ,23        ,13.621         ,13.257              ,1.028 
0    ,0          ,480  ,256  ,23        ,11.281         ,11.051              ,1.021 
0    ,0          ,480  ,512  ,23        ,14.865         ,15.011              ,0.99  
0    ,0          ,5    ,1    ,23        ,4.198          ,4.338               ,0.968 
0    ,0          ,5    ,2    ,0         ,4.212          ,4.544               ,0.927 
0    ,0          ,5    ,4    ,0         ,4.205          ,4.594               ,0.915 
0    ,0          ,5    ,4    ,23        ,4.367          ,4.492               ,0.972 
0    ,0          ,5    ,6    ,0         ,4.188          ,4.574               ,0.916 
0    ,0          ,5    ,6    ,23        ,4.232          ,4.577               ,0.925 
0    ,0          ,512  ,128  ,23        ,13.613         ,13.219              ,1.03  
0    ,0          ,512  ,160  ,23        ,13.463         ,13.139              ,1.025 
0    ,0          ,512  ,192  ,23        ,12.682         ,12.373              ,1.025 
0    ,0          ,512  ,224  ,23        ,12.232         ,12.093              ,1.011 
0    ,0          ,512  ,256  ,23        ,11.181         ,10.969              ,1.019 
0    ,0          ,512  ,288  ,23        ,11.153         ,11.019              ,1.012 
0    ,0          ,512  ,320  ,23        ,10.398         ,10.24               ,1.015 
0    ,0          ,512  ,352  ,23        ,6.571          ,6.463               ,1.017 
0    ,0          ,512  ,384  ,23        ,5.884          ,5.854               ,1.005 
0    ,0          ,512  ,416  ,23        ,5.257          ,5.37                ,0.979 
0    ,0          ,512  ,448  ,23        ,4.481          ,5.013               ,0.894 
0    ,0          ,512  ,480  ,23        ,3.253          ,4.284               ,0.759 
0    ,0          ,512  ,512  ,23        ,14.58          ,14.476              ,1.007 
0    ,0          ,512  ,544  ,23        ,14.905         ,14.811              ,1.006 
0    ,0          ,512  ,576  ,23        ,14.782         ,14.686              ,1.007 
0    ,0          ,512  ,608  ,23        ,14.788         ,14.688              ,1.007 
0    ,0          ,512  ,640  ,23        ,14.832         ,14.711              ,1.008 
0    ,0          ,512  ,672  ,23        ,14.735         ,14.701              ,1.002 
0    ,0          ,512  ,704  ,23        ,14.843         ,14.758              ,1.006 
0    ,0          ,512  ,736  ,23        ,14.797         ,14.788              ,1.001 
0    ,0          ,512  ,768  ,23        ,14.959         ,14.667              ,1.02  
0    ,0          ,512  ,800  ,23        ,14.847         ,14.752              ,1.006 
0    ,0          ,512  ,832  ,23        ,14.836         ,14.812              ,1.002 
0    ,0          ,512  ,864  ,23        ,14.877         ,14.654              ,1.015 
0    ,0          ,512  ,896  ,23        ,14.891         ,14.634              ,1.018 
0    ,0          ,544  ,256  ,23        ,11.142         ,10.962              ,1.016 
0    ,0          ,544  ,512  ,23        ,3.3            ,4.358               ,0.757 
0    ,0          ,576  ,256  ,23        ,13.721         ,13.415              ,1.023 
0    ,0          ,576  ,512  ,23        ,4.544          ,4.96                ,0.916 
0    ,0          ,6    ,1    ,23        ,4.206          ,4.487               ,0.937 
0    ,0          ,6    ,2    ,0         ,4.197          ,4.454               ,0.942 
0    ,0          ,6    ,5    ,0         ,4.194          ,4.481               ,0.936 
0    ,0          ,6    ,5    ,23        ,4.189          ,4.53                ,0.925 
0    ,0          ,6    ,7    ,0         ,4.15           ,4.462               ,0.93  
0    ,0          ,6    ,7    ,23        ,4.198          ,4.576               ,0.917 
0    ,0          ,608  ,256  ,23        ,13.576         ,13.227              ,1.026 
0    ,0          ,608  ,512  ,23        ,5.103          ,5.174               ,0.986 
0    ,0          ,64   ,128  ,23        ,4.805          ,5.81                ,0.827 
0    ,0          ,64   ,160  ,23        ,4.72           ,5.852               ,0.807 
0    ,0          ,64   ,192  ,23        ,4.298          ,5.782               ,0.743 
0    ,0          ,64   ,224  ,23        ,4.403          ,5.79                ,0.761 
0    ,0          ,64   ,256  ,23        ,4.814          ,5.863               ,0.821 
0    ,0          ,64   ,288  ,23        ,4.314          ,5.684               ,0.759 
0    ,0          ,64   ,32   ,23        ,3.23           ,4.32                ,0.748 
0    ,0          ,64   ,320  ,23        ,4.499          ,5.676               ,0.792 
0    ,0          ,64   ,352  ,23        ,4.398          ,5.594               ,0.786 
0    ,0          ,64   ,384  ,23        ,4.453          ,5.79                ,0.769 
0    ,0          ,64   ,416  ,23        ,4.469          ,5.651               ,0.791 
0    ,0          ,64   ,448  ,23        ,4.034          ,5.665               ,0.712 
0    ,0          ,64   ,64   ,23        ,3.957          ,5.519               ,0.717 
0    ,0          ,64   ,96   ,23        ,4.271          ,5.786               ,0.738 
0    ,0          ,640  ,1024 ,23        ,17.874         ,17.001              ,1.051 
0    ,0          ,640  ,256  ,23        ,13.524         ,13.242              ,1.021 
0    ,0          ,640  ,512  ,23        ,5.775          ,5.943               ,0.972 
0    ,0          ,672  ,1024 ,23        ,18.025         ,16.955              ,1.063 
0    ,0          ,672  ,512  ,23        ,6.429          ,6.397               ,1.005 
0    ,0          ,7    ,1    ,23        ,4.195          ,4.535               ,0.925 
0    ,0          ,7    ,2    ,0         ,4.18           ,4.472               ,0.935 
0    ,0          ,7    ,6    ,0         ,4.172          ,4.461               ,0.935 
0    ,0          ,7    ,6    ,23        ,4.189          ,4.473               ,0.937 
0    ,0          ,7    ,8    ,0         ,4.195          ,4.487               ,0.935 
0    ,0          ,7    ,8    ,23        ,4.185          ,4.552               ,0.919 
0    ,0          ,704  ,1024 ,23        ,20.053         ,18.9                ,1.061 
0    ,0          ,704  ,512  ,23        ,11.132         ,10.917              ,1.02  
0    ,0          ,736  ,1024 ,23        ,20.077         ,19.034              ,1.055 
0    ,0          ,736  ,512  ,23        ,11.187         ,10.924              ,1.024 
0    ,0          ,768  ,1024 ,23        ,20.147         ,18.892              ,1.066 
0    ,0          ,768  ,512  ,23        ,11.013         ,10.847              ,1.015 
0    ,0          ,7808 ,8192 ,23        ,149.174        ,145.361             ,1.026 
0    ,0          ,7840 ,8192 ,23        ,147.627        ,144.985             ,1.018 
0    ,0          ,7872 ,8192 ,23        ,148.71         ,146.938             ,1.012 
0    ,0          ,7904 ,8192 ,23        ,146.995        ,147.014             ,1.0   
0    ,0          ,7936 ,8192 ,23        ,149.794        ,149.606             ,1.001 
0    ,0          ,7968 ,8192 ,23        ,148.543        ,146.792             ,1.012 
0    ,0          ,8    ,1    ,23        ,4.193          ,4.549               ,0.922 
0    ,0          ,8    ,2    ,0         ,4.173          ,4.499               ,0.928 
0    ,0          ,8    ,7    ,0         ,4.174          ,4.478               ,0.932 
0    ,0          ,8    ,7    ,23        ,4.143          ,4.44                ,0.933 
0    ,0          ,8    ,9    ,0         ,4.217          ,4.556               ,0.926 
0    ,0          ,8    ,9    ,23        ,4.185          ,4.504               ,0.929 
0    ,0          ,80   ,16   ,23        ,5.475          ,5.803               ,0.943 
0    ,0          ,800  ,1024 ,23        ,20.093         ,19.033              ,1.056 
0    ,0          ,800  ,512  ,23        ,10.965         ,10.79               ,1.016 
0    ,0          ,8000 ,8192 ,23        ,152.245        ,149.161             ,1.021 
0    ,0          ,8032 ,8192 ,23        ,153.62         ,148.513             ,1.034 
0    ,0          ,8064 ,8192 ,23        ,153.401        ,148.791             ,1.031 
0    ,0          ,8096 ,8192 ,23        ,153.071        ,149.409             ,1.025 
0    ,0          ,8128 ,8192 ,23        ,154.793        ,151.832             ,1.02  
0    ,0          ,8160 ,8192 ,23        ,156.429        ,151.318             ,1.034 
0    ,0          ,832  ,1024 ,23        ,22.263         ,21.374              ,1.042 
0    ,0          ,832  ,512  ,23        ,13.56          ,13.143              ,1.032 
0    ,0          ,864  ,1024 ,23        ,23.632         ,21.362              ,1.106 
0    ,0          ,864  ,512  ,23        ,13.695         ,13.194              ,1.038 
0    ,0          ,896  ,1024 ,23        ,22.51          ,21.245              ,1.06  
0    ,0          ,896  ,512  ,23        ,13.551         ,13.068              ,1.037 
0    ,0          ,9    ,1    ,23        ,4.182          ,4.447               ,0.94  
0    ,0          ,9    ,10   ,0         ,4.195          ,4.52                ,0.928 
0    ,0          ,9    ,10   ,23        ,4.186          ,4.553               ,0.919 
0    ,0          ,9    ,2    ,0         ,4.186          ,4.648               ,0.901 
0    ,0          ,9    ,8    ,0         ,4.165          ,4.435               ,0.939 
0    ,0          ,9    ,8    ,23        ,4.155          ,4.487               ,0.926 
0    ,0          ,928  ,1024 ,23        ,22.386         ,21.379              ,1.047 
0    ,0          ,96   ,128  ,23        ,4.658          ,5.522               ,0.843 
0    ,0          ,96   ,256  ,23        ,4.674          ,5.461               ,0.856 
0    ,0          ,96   ,32   ,23        ,4.411          ,5.283               ,0.835 
0    ,0          ,96   ,64   ,23        ,3.239          ,4.274               ,0.758 
0    ,0          ,960  ,1024 ,23        ,24.293         ,23.534              ,1.032 
0    ,0          ,992  ,1024 ,23        ,24.041         ,23.542              ,1.021 
0    ,1          ,0    ,1    ,0         ,3.423          ,3.428               ,0.999 
0    ,1          ,0    ,1    ,23        ,3.39           ,3.385               ,1.001 
0    ,1          ,1    ,2    ,0         ,4.211          ,4.157               ,1.013 
0    ,1          ,1    ,2    ,23        ,4.161          ,4.278               ,0.973 
0    ,1          ,10   ,1    ,23        ,4.161          ,4.154               ,1.002 
0    ,1          ,10   ,11   ,0         ,4.133          ,4.44                ,0.931 
0    ,1          ,10   ,11   ,23        ,4.121          ,4.464               ,0.923 
0    ,1          ,10   ,2    ,0         ,4.207          ,4.529               ,0.929 
0    ,1          ,10   ,9    ,0         ,4.156          ,4.505               ,0.922 
0    ,1          ,10   ,9    ,23        ,4.187          ,4.704               ,0.89  
0    ,1          ,1024 ,1024 ,23        ,23.738         ,23.259              ,1.021 
0    ,1          ,1024 ,1056 ,23        ,23.87          ,23.586              ,1.012 
0    ,1          ,1024 ,1088 ,23        ,23.857         ,23.546              ,1.013 
0    ,1          ,1024 ,1120 ,23        ,23.905         ,23.621              ,1.012 
0    ,1          ,1024 ,1152 ,23        ,23.997         ,23.459              ,1.023 
0    ,1          ,1024 ,1184 ,23        ,23.946         ,23.534              ,1.018 
0    ,1          ,1024 ,1216 ,23        ,23.982         ,23.428              ,1.024 
0    ,1          ,1024 ,1248 ,23        ,23.95          ,23.434              ,1.022 
0    ,1          ,1024 ,1280 ,23        ,23.935         ,23.544              ,1.017 
0    ,1          ,1024 ,1312 ,23        ,23.985         ,23.532              ,1.019 
0    ,1          ,1024 ,1344 ,23        ,24.067         ,23.475              ,1.025 
0    ,1          ,1024 ,1376 ,23        ,23.987         ,23.513              ,1.02  
0    ,1          ,1024 ,1408 ,23        ,24.006         ,23.494              ,1.022 
0    ,1          ,1024 ,640  ,23        ,19.057         ,18.032              ,1.057 
0    ,1          ,1024 ,672  ,23        ,19.163         ,18.794              ,1.02  
0    ,1          ,1024 ,704  ,23        ,19.599         ,19.03               ,1.03  
0    ,1          ,1024 ,736  ,23        ,21.345         ,20.606              ,1.036 
0    ,1          ,1024 ,768  ,23        ,21.385         ,20.711              ,1.033 
0    ,1          ,1024 ,800  ,23        ,21.301         ,20.988              ,1.015 
0    ,1          ,1024 ,832  ,23        ,23.064         ,22.265              ,1.036 
0    ,1          ,1024 ,864  ,23        ,22.926         ,22.395              ,1.024 
0    ,1          ,1024 ,896  ,23        ,22.998         ,22.413              ,1.026 
0    ,1          ,1024 ,928  ,23        ,22.761         ,21.987              ,1.035 
0    ,1          ,1024 ,960  ,23        ,22.991         ,22.411              ,1.026 
0    ,1          ,1024 ,992  ,23        ,24.487         ,22.796              ,1.074 
0    ,1          ,1056 ,1024 ,23        ,24.485         ,22.767              ,1.075 
0    ,1          ,1088 ,1024 ,23        ,25.28          ,24.354              ,1.038 
0    ,1          ,11   ,1    ,23        ,4.15           ,4.123               ,1.007 
0    ,1          ,11   ,10   ,0         ,4.153          ,4.487               ,0.926 
0    ,1          ,11   ,10   ,23        ,4.143          ,4.627               ,0.895 
0    ,1          ,11   ,12   ,0         ,4.137          ,4.416               ,0.937 
0    ,1          ,11   ,12   ,23        ,4.113          ,4.477               ,0.919 
0    ,1          ,11   ,2    ,0         ,4.186          ,4.511               ,0.928 
0    ,1          ,112  ,16   ,23        ,3.444          ,4.487               ,0.767 
0    ,1          ,1120 ,1024 ,23        ,24.853         ,23.996              ,1.036 
0    ,1          ,1152 ,1024 ,23        ,24.836         ,24.105              ,1.03  
0    ,1          ,1184 ,1024 ,23        ,24.769         ,24.118              ,1.027 
0    ,1          ,12   ,1    ,23        ,4.155          ,4.134               ,1.005 
0    ,1          ,12   ,11   ,0         ,4.135          ,4.489               ,0.921 
0    ,1          ,12   ,11   ,23        ,4.154          ,4.479               ,0.927 
0    ,1          ,12   ,13   ,0         ,4.122          ,4.421               ,0.932 
0    ,1          ,12   ,13   ,23        ,4.121          ,4.45                ,0.926 
0    ,1          ,12   ,2    ,0         ,4.234          ,4.525               ,0.936 
0    ,1          ,1216 ,1024 ,23        ,26.159         ,25.295              ,1.034 
0    ,1          ,1248 ,1024 ,23        ,25.724         ,25.002              ,1.029 
0    ,1          ,128  ,128  ,23        ,5.527          ,6.211               ,0.89  
0    ,1          ,128  ,160  ,23        ,5.632          ,6.358               ,0.886 
0    ,1          ,128  ,192  ,23        ,5.651          ,6.349               ,0.89  
0    ,1          ,128  ,224  ,23        ,5.845          ,6.607               ,0.885 
0    ,1          ,128  ,256  ,23        ,5.707          ,6.284               ,0.908 
0    ,1          ,128  ,288  ,23        ,5.811          ,6.321               ,0.919 
0    ,1          ,128  ,32   ,23        ,3.323          ,4.304               ,0.772 
0    ,1          ,128  ,320  ,23        ,5.638          ,6.332               ,0.89  
0    ,1          ,128  ,352  ,23        ,5.655          ,6.361               ,0.889 
0    ,1          ,128  ,384  ,23        ,5.699          ,6.283               ,0.907 
0    ,1          ,128  ,416  ,23        ,5.656          ,6.459               ,0.876 
0    ,1          ,128  ,448  ,23        ,5.635          ,6.363               ,0.885 
0    ,1          ,128  ,480  ,23        ,5.71           ,6.318               ,0.904 
0    ,1          ,128  ,512  ,23        ,5.703          ,6.359               ,0.897 
0    ,1          ,128  ,64   ,23        ,4.51           ,4.951               ,0.911 
0    ,1          ,128  ,96   ,23        ,5.346          ,5.354               ,0.999 
0    ,1          ,1280 ,1024 ,23        ,24.919         ,24.039              ,1.037 
0    ,1          ,13   ,1    ,23        ,4.143          ,4.125               ,1.004 
0    ,1          ,13   ,12   ,0         ,4.144          ,4.573               ,0.906 
0    ,1          ,13   ,12   ,23        ,4.135          ,4.602               ,0.899 
0    ,1          ,13   ,14   ,0         ,4.113          ,4.432               ,0.928 
0    ,1          ,13   ,14   ,23        ,4.115          ,4.429               ,0.929 
0    ,1          ,13   ,2    ,0         ,4.179          ,4.492               ,0.93  
0    ,1          ,1312 ,1024 ,23        ,25.275         ,24.133              ,1.047 
0    ,1          ,1344 ,1024 ,23        ,26.075         ,25.323              ,1.03  
0    ,1          ,1376 ,1024 ,23        ,25.723         ,25.122              ,1.024 
0    ,1          ,14   ,1    ,23        ,4.142          ,4.135               ,1.002 
0    ,1          ,14   ,13   ,0         ,4.103          ,4.449               ,0.922 
0    ,1          ,14   ,13   ,23        ,4.133          ,4.516               ,0.915 
0    ,1          ,14   ,15   ,0         ,4.123          ,4.406               ,0.936 
0    ,1          ,14   ,15   ,23        ,4.104          ,4.462               ,0.92  
0    ,1          ,14   ,2    ,0         ,4.173          ,4.457               ,0.936 
0    ,1          ,1408 ,1024 ,23        ,24.907         ,24.409              ,1.02  
0    ,1          ,144  ,16   ,23        ,3.43           ,4.454               ,0.77  
0    ,1          ,15   ,1    ,23        ,4.113          ,4.226               ,0.973 
0    ,1          ,15   ,14   ,0         ,4.147          ,4.499               ,0.922 
0    ,1          ,15   ,14   ,23        ,4.152          ,4.5                 ,0.923 
0    ,1          ,15   ,16   ,0         ,4.182          ,4.829               ,0.866 
0    ,1          ,15   ,16   ,23        ,4.133          ,4.804               ,0.86  
0    ,1          ,15   ,2    ,0         ,4.154          ,4.51                ,0.921 
0    ,1          ,16   ,1    ,23        ,4.502          ,4.956               ,0.909 
0    ,1          ,16   ,112  ,23        ,4.221          ,4.853               ,0.87  
0    ,1          ,16   ,144  ,23        ,4.201          ,4.701               ,0.894 
0    ,1          ,16   ,15   ,0         ,4.15           ,4.484               ,0.926 
0    ,1          ,16   ,15   ,23        ,4.126          ,4.496               ,0.918 
0    ,1          ,16   ,16   ,23        ,4.064          ,4.366               ,0.931 
0    ,1          ,16   ,17   ,0         ,4.113          ,4.803               ,0.856 
0    ,1          ,16   ,17   ,23        ,4.103          ,4.838               ,0.848 
0    ,1          ,16   ,176  ,23        ,4.23           ,4.688               ,0.902 
0    ,1          ,16   ,2    ,0         ,4.133          ,4.495               ,0.919 
0    ,1          ,16   ,208  ,23        ,4.159          ,4.737               ,0.878 
0    ,1          ,16   ,240  ,23        ,4.151          ,4.539               ,0.915 
0    ,1          ,16   ,272  ,23        ,4.158          ,4.497               ,0.925 
0    ,1          ,16   ,304  ,23        ,4.148          ,4.605               ,0.901 
0    ,1          ,16   ,336  ,23        ,4.108          ,4.553               ,0.902 
0    ,1          ,16   ,368  ,23        ,4.087          ,4.525               ,0.903 
0    ,1          ,16   ,400  ,23        ,4.103          ,4.517               ,0.908 
0    ,1          ,16   ,48   ,23        ,4.311          ,4.799               ,0.898 
0    ,1          ,16   ,80   ,23        ,4.274          ,4.821               ,0.887 
0    ,1          ,160  ,128  ,23        ,5.927          ,6.816               ,0.87  
0    ,1          ,160  ,256  ,23        ,6.323          ,6.878               ,0.919 
0    ,1          ,160  ,32   ,23        ,3.232          ,4.293               ,0.753 
0    ,1          ,160  ,512  ,23        ,6.283          ,6.836               ,0.919 
0    ,1          ,160  ,64   ,23        ,4.532          ,4.987               ,0.909 
0    ,1          ,1664 ,2048 ,23        ,35.104         ,33.034              ,1.063 
0    ,1          ,1696 ,2048 ,23        ,35.24          ,33.046              ,1.066 
0    ,1          ,17   ,1    ,23        ,4.418          ,4.512               ,0.979 
0    ,1          ,17   ,16   ,0         ,4.145          ,4.556               ,0.91  
0    ,1          ,17   ,16   ,23        ,4.182          ,4.727               ,0.885 
0    ,1          ,17   ,18   ,0         ,4.233          ,4.978               ,0.85  
0    ,1          ,17   ,18   ,23        ,4.254          ,5.039               ,0.844 
0    ,1          ,17   ,2    ,0         ,4.266          ,4.555               ,0.937 
0    ,1          ,1728 ,2048 ,23        ,37.491         ,35.015              ,1.071 
0    ,1          ,176  ,16   ,23        ,3.411          ,4.396               ,0.776 
0    ,1          ,1760 ,2048 ,23        ,37.37          ,34.912              ,1.07  
0    ,1          ,1792 ,2048 ,23        ,37.53          ,34.93               ,1.074 
0    ,1          ,18   ,1    ,23        ,4.763          ,4.398               ,1.083 
0    ,1          ,18   ,17   ,0         ,4.202          ,4.47                ,0.94  
0    ,1          ,18   ,17   ,23        ,4.24           ,4.736               ,0.895 
0    ,1          ,18   ,19   ,0         ,4.348          ,5.223               ,0.832 
0    ,1          ,18   ,19   ,23        ,4.364          ,5.198               ,0.84  
0    ,1          ,18   ,2    ,0         ,4.273          ,4.648               ,0.919 
0    ,1          ,1824 ,2048 ,23        ,37.273         ,34.977              ,1.066 
0    ,1          ,1856 ,2048 ,23        ,39.795         ,36.698              ,1.084 
0    ,1          ,1888 ,2048 ,23        ,39.8           ,36.793              ,1.082 
0    ,1          ,19   ,1    ,23        ,4.833          ,4.384               ,1.102 
0    ,1          ,19   ,18   ,0         ,4.112          ,4.681               ,0.878 
0    ,1          ,19   ,18   ,23        ,4.265          ,4.725               ,0.903 
0    ,1          ,19   ,2    ,0         ,4.419          ,4.585               ,0.964 
0    ,1          ,19   ,20   ,0         ,4.307          ,5.214               ,0.826 
0    ,1          ,19   ,20   ,23        ,4.281          ,5.201               ,0.823 
0    ,1          ,192  ,128  ,23        ,5.85           ,6.052               ,0.967 
0    ,1          ,192  ,256  ,23        ,7.698          ,8.011               ,0.961 
0    ,1          ,192  ,32   ,23        ,3.23           ,4.351               ,0.742 
0    ,1          ,192  ,512  ,23        ,7.812          ,7.959               ,0.982 
0    ,1          ,192  ,64   ,23        ,4.526          ,4.986               ,0.908 
0    ,1          ,1920 ,2048 ,23        ,39.832         ,36.887              ,1.08  
0    ,1          ,1952 ,2048 ,23        ,39.905         ,36.739              ,1.086 
0    ,1          ,1984 ,2048 ,23        ,41.095         ,39.225              ,1.048 
0    ,1          ,2    ,1    ,0         ,4.004          ,4.311               ,0.929 
0    ,1          ,2    ,1    ,23        ,4.172          ,4.119               ,1.013 
0    ,1          ,2    ,2    ,0         ,4.117          ,4.409               ,0.934 
0    ,1          ,2    ,3    ,0         ,4.184          ,4.265               ,0.981 
0    ,1          ,2    ,3    ,23        ,4.175          ,4.43                ,0.942 
0    ,1          ,20   ,1    ,23        ,5.188          ,4.264               ,1.217 
0    ,1          ,20   ,19   ,0         ,4.262          ,4.615               ,0.924 
0    ,1          ,20   ,19   ,23        ,4.254          ,4.553               ,0.934 
0    ,1          ,20   ,2    ,0         ,4.115          ,4.541               ,0.906 
0    ,1          ,20   ,21   ,0         ,4.287          ,5.104               ,0.84  
0    ,1          ,20   ,21   ,23        ,4.279          ,5.073               ,0.843 
0    ,1          ,2016 ,2048 ,23        ,41.129         ,39.183              ,1.05  
0    ,1          ,2048 ,1024 ,0         ,24.953         ,23.937              ,1.042 
0    ,1          ,2048 ,1024 ,23        ,24.711         ,24.006              ,1.029 
0    ,1          ,2048 ,128  ,0         ,5.71           ,5.745               ,0.994 
0    ,1          ,2048 ,128  ,23        ,5.843          ,5.794               ,1.009 
0    ,1          ,2048 ,1664 ,23        ,36.351         ,33.822              ,1.075 
0    ,1          ,2048 ,1696 ,23        ,36.765         ,34.673              ,1.06  
0    ,1          ,2048 ,1728 ,23        ,37.168         ,34.949              ,1.064 
0    ,1          ,2048 ,1760 ,23        ,38.289         ,35.715              ,1.072 
0    ,1          ,2048 ,1792 ,23        ,38.23          ,35.861              ,1.066 
0    ,1          ,2048 ,1824 ,23        ,38.586         ,36.795              ,1.049 
0    ,1          ,2048 ,1856 ,23        ,39.26          ,36.861              ,1.065 
0    ,1          ,2048 ,1888 ,23        ,40.437         ,37.994              ,1.064 
0    ,1          ,2048 ,1920 ,23        ,40.284         ,38.07               ,1.058 
0    ,1          ,2048 ,1952 ,23        ,40.586         ,37.813              ,1.073 
0    ,1          ,2048 ,1984 ,23        ,39.965         ,38.118              ,1.048 
0    ,1          ,2048 ,2016 ,23        ,41.727         ,38.99               ,1.07  
0    ,1          ,2048 ,2048 ,0         ,41.134         ,39.137              ,1.051 
0    ,1          ,2048 ,2048 ,23        ,40.753         ,38.875              ,1.048 
0    ,1          ,2048 ,2080 ,23        ,41.152         ,39.138              ,1.051 
0    ,1          ,2048 ,2112 ,23        ,40.93          ,39.184              ,1.045 
0    ,1          ,2048 ,2144 ,23        ,41.138         ,39.172              ,1.05  
0    ,1          ,2048 ,2176 ,23        ,40.9           ,39.152              ,1.045 
0    ,1          ,2048 ,2208 ,23        ,41.23          ,39.201              ,1.052 
0    ,1          ,2048 ,2240 ,23        ,41.061         ,39.443              ,1.041 
0    ,1          ,2048 ,2272 ,23        ,40.641         ,39.067              ,1.04  
0    ,1          ,2048 ,2304 ,23        ,41.158         ,39.063              ,1.054 
0    ,1          ,2048 ,2336 ,23        ,41.312         ,39.073              ,1.057 
0    ,1          ,2048 ,2368 ,23        ,41.658         ,39.009              ,1.068 
0    ,1          ,2048 ,2400 ,23        ,41.859         ,39.006              ,1.073 
0    ,1          ,2048 ,2432 ,23        ,40.989         ,39.167              ,1.047 
0    ,1          ,2048 ,256  ,0         ,10.94          ,10.82               ,1.011 
0    ,1          ,2048 ,256  ,23        ,11.102         ,10.942              ,1.015 
0    ,1          ,2048 ,32   ,0         ,3.304          ,4.286               ,0.771 
0    ,1          ,2048 ,32   ,23        ,3.378          ,4.353               ,0.776 
0    ,1          ,2048 ,512  ,0         ,16.189         ,15.214              ,1.064 
0    ,1          ,2048 ,512  ,23        ,16.122         ,15.213              ,1.06  
0    ,1          ,2048 ,64   ,0         ,4.514          ,5.003               ,0.902 
0    ,1          ,2048 ,64   ,23        ,4.628          ,5.153               ,0.898 
0    ,1          ,208  ,16   ,23        ,3.405          ,4.47                ,0.762 
0    ,1          ,2080 ,2048 ,23        ,42.232         ,38.967              ,1.084 
0    ,1          ,21   ,1    ,23        ,4.363          ,4.352               ,1.002 
0    ,1          ,21   ,2    ,0         ,4.18           ,4.6                 ,0.909 
0    ,1          ,21   ,20   ,0         ,4.103          ,4.667               ,0.879 
0    ,1          ,21   ,20   ,23        ,4.103          ,4.596               ,0.893 
0    ,1          ,21   ,22   ,0         ,4.178          ,4.871               ,0.858 
0    ,1          ,21   ,22   ,23        ,4.197          ,4.934               ,0.851 
0    ,1          ,2112 ,2048 ,23        ,42.324         ,41.347              ,1.024 
0    ,1          ,2144 ,2048 ,23        ,42.793         ,41.406              ,1.034 
0    ,1          ,2176 ,2048 ,23        ,42.943         ,40.143              ,1.07  
0    ,1          ,22   ,1    ,23        ,4.443          ,4.35                ,1.021 
0    ,1          ,22   ,2    ,0         ,4.115          ,4.639               ,0.887 
0    ,1          ,22   ,21   ,0         ,4.122          ,4.609               ,0.894 
0    ,1          ,22   ,21   ,23        ,4.345          ,4.551               ,0.955 
0    ,1          ,22   ,23   ,0         ,4.094          ,5.107               ,0.802 
0    ,1          ,22   ,23   ,23        ,4.204          ,5.181               ,0.811 
0    ,1          ,2208 ,2048 ,23        ,43.068         ,40.052              ,1.075 
0    ,1          ,224  ,128  ,23        ,5.845          ,5.742               ,1.018 
0    ,1          ,224  ,256  ,23        ,8.518          ,8.509               ,1.001 
0    ,1          ,224  ,32   ,23        ,3.239          ,4.35                ,0.745 
0    ,1          ,224  ,512  ,23        ,9.729          ,10.551              ,0.922 
0    ,1          ,224  ,64   ,23        ,4.526          ,4.966               ,0.912 
0    ,1          ,2240 ,2048 ,23        ,43.725         ,48.78               ,0.896 
0    ,1          ,2272 ,2048 ,23        ,42.924         ,47.349              ,0.907 
0    ,1          ,23   ,1    ,23        ,4.543          ,4.266               ,1.065 
0    ,1          ,23   ,2    ,0         ,4.277          ,4.532               ,0.944 
0    ,1          ,23   ,22   ,0         ,4.099          ,4.802               ,0.854 
0    ,1          ,23   ,22   ,23        ,4.133          ,4.778               ,0.865 
0    ,1          ,23   ,24   ,0         ,4.215          ,5.108               ,0.825 
0    ,1          ,23   ,24   ,23        ,4.268          ,5.171               ,0.825 
0    ,1          ,2304 ,2048 ,23        ,42.852         ,40.161              ,1.067 
0    ,1          ,2336 ,2048 ,23        ,43.491         ,40.145              ,1.083 
0    ,1          ,2368 ,2048 ,23        ,43.902         ,41.263              ,1.064 
0    ,1          ,24   ,1    ,23        ,4.538          ,4.25                ,1.068 
0    ,1          ,24   ,2    ,0         ,4.094          ,4.433               ,0.924 
0    ,1          ,24   ,23   ,0         ,4.073          ,4.562               ,0.893 
0    ,1          ,24   ,23   ,23        ,4.269          ,4.623               ,0.923 
0    ,1          ,24   ,25   ,0         ,4.21           ,4.716               ,0.893 
0    ,1          ,24   ,25   ,23        ,4.201          ,4.773               ,0.88  
0    ,1          ,240  ,16   ,23        ,3.403          ,4.445               ,0.765 
0    ,1          ,2400 ,2048 ,23        ,43.301         ,41.17               ,1.052 
0    ,1          ,2432 ,2048 ,23        ,42.931         ,40.218              ,1.067 
0    ,1          ,25   ,1    ,23        ,4.084          ,4.096               ,0.997 
0    ,1          ,25   ,2    ,0         ,4.083          ,4.336               ,0.942 
0    ,1          ,25   ,24   ,0         ,4.094          ,4.463               ,0.917 
0    ,1          ,25   ,24   ,23        ,4.12           ,4.513               ,0.913 
0    ,1          ,25   ,26   ,0         ,4.192          ,4.686               ,0.895 
0    ,1          ,25   ,26   ,23        ,4.252          ,4.744               ,0.896 
0    ,1          ,256  ,1    ,0         ,3.356          ,4.017               ,0.835 
0    ,1          ,256  ,1    ,23        ,3.311          ,4.038               ,0.82  
0    ,1          ,256  ,128  ,23        ,5.787          ,5.696               ,1.016 
0    ,1          ,256  ,160  ,23        ,6.457          ,6.393               ,1.01  
0    ,1          ,256  ,192  ,23        ,7.539          ,7.665               ,0.984 
0    ,1          ,256  ,2    ,0         ,3.387          ,4.086               ,0.829 
0    ,1          ,256  ,2    ,23        ,3.403          ,4.286               ,0.794 
0    ,1          ,256  ,224  ,23        ,8.117          ,8.164               ,0.994 
0    ,1          ,256  ,256  ,23        ,9.231          ,9.059               ,1.019 
0    ,1          ,256  ,288  ,23        ,9.817          ,9.142               ,1.074 
0    ,1          ,256  ,3    ,0         ,3.418          ,4.104               ,0.833 
0    ,1          ,256  ,3    ,23        ,3.324          ,4.306               ,0.772 
0    ,1          ,256  ,32   ,23        ,3.299          ,4.298               ,0.767 
0    ,1          ,256  ,320  ,23        ,10.067         ,9.128               ,1.103 
0    ,1          ,256  ,352  ,23        ,10.186         ,9.095               ,1.12  
0    ,1          ,256  ,384  ,23        ,9.765          ,9.16                ,1.066 
0    ,1          ,256  ,4    ,0         ,3.37           ,4.044               ,0.833 
0    ,1          ,256  ,4    ,23        ,3.421          ,4.386               ,0.78  
0    ,1          ,256  ,416  ,23        ,9.52           ,9.215               ,1.033 
0    ,1          ,256  ,448  ,23        ,9.746          ,9.154               ,1.065 
0    ,1          ,256  ,480  ,23        ,9.654          ,9.181               ,1.052 
0    ,1          ,256  ,5    ,0         ,3.347          ,4.388               ,0.763 
0    ,1          ,256  ,5    ,23        ,3.355          ,4.282               ,0.784 
0    ,1          ,256  ,512  ,23        ,10.728         ,9.886               ,1.085 
0    ,1          ,256  ,544  ,23        ,10.01          ,9.196               ,1.089 
0    ,1          ,256  ,576  ,23        ,9.991          ,9.187               ,1.088 
0    ,1          ,256  ,6    ,0         ,3.343          ,4.283               ,0.78  
0    ,1          ,256  ,6    ,23        ,3.403          ,4.253               ,0.8   
0    ,1          ,256  ,608  ,23        ,10.027         ,9.153               ,1.095 
0    ,1          ,256  ,64   ,23        ,4.534          ,5.013               ,0.904 
0    ,1          ,256  ,640  ,23        ,9.959          ,9.175               ,1.085 
0    ,1          ,256  ,7    ,0         ,3.341          ,4.217               ,0.792 
0    ,1          ,256  ,7    ,23        ,3.317          ,4.258               ,0.779 
0    ,1          ,256  ,96   ,23        ,5.124          ,5.187               ,0.988 
0    ,1          ,26   ,1    ,23        ,4.104          ,4.127               ,0.994 
0    ,1          ,26   ,2    ,0         ,4.133          ,4.44                ,0.931 
0    ,1          ,26   ,25   ,0         ,4.102          ,4.428               ,0.926 
0    ,1          ,26   ,25   ,23        ,4.11           ,4.455               ,0.923 
0    ,1          ,26   ,27   ,0         ,4.207          ,4.744               ,0.887 
0    ,1          ,26   ,27   ,23        ,4.251          ,4.863               ,0.874 
0    ,1          ,27   ,1    ,23        ,4.065          ,4.304               ,0.945 
0    ,1          ,27   ,2    ,0         ,4.116          ,4.688               ,0.878 
0    ,1          ,27   ,26   ,0         ,4.087          ,4.5                 ,0.908 
0    ,1          ,27   ,26   ,23        ,4.103          ,4.473               ,0.917 
0    ,1          ,27   ,28   ,0         ,4.215          ,4.685               ,0.9   
0    ,1          ,27   ,28   ,23        ,4.252          ,4.813               ,0.884 
0    ,1          ,272  ,16   ,23        ,3.419          ,4.415               ,0.774 
0    ,1          ,28   ,1    ,23        ,4.271          ,4.385               ,0.974 
0    ,1          ,28   ,2    ,0         ,4.103          ,4.637               ,0.885 
0    ,1          ,28   ,27   ,0         ,4.071          ,4.628               ,0.88  
0    ,1          ,28   ,27   ,23        ,4.234          ,4.75                ,0.891 
0    ,1          ,28   ,29   ,0         ,4.091          ,4.833               ,0.846 
0    ,1          ,28   ,29   ,23        ,4.176          ,4.851               ,0.861 
0    ,1          ,288  ,128  ,23        ,5.877          ,5.743               ,1.023 
0    ,1          ,288  ,256  ,23        ,9.433          ,9.16                ,1.03  
0    ,1          ,288  ,32   ,23        ,3.261          ,4.375               ,0.745 
0    ,1          ,288  ,512  ,23        ,11.343         ,11.207              ,1.012 
0    ,1          ,288  ,64   ,23        ,4.79           ,5.198               ,0.921 
0    ,1          ,29   ,1    ,23        ,4.275          ,4.51                ,0.948 
0    ,1          ,29   ,2    ,0         ,4.095          ,4.775               ,0.858 
0    ,1          ,29   ,28   ,0         ,4.069          ,4.641               ,0.877 
0    ,1          ,29   ,28   ,23        ,4.08           ,4.644               ,0.879 
0    ,1          ,29   ,30   ,0         ,4.188          ,4.678               ,0.895 
0    ,1          ,29   ,30   ,23        ,4.227          ,4.868               ,0.868 
0    ,1          ,3    ,1    ,23        ,4.279          ,4.261               ,1.004 
0    ,1          ,3    ,2    ,0         ,4.18           ,4.494               ,0.93  
0    ,1          ,3    ,2    ,23        ,4.139          ,4.418               ,0.937 
0    ,1          ,3    ,4    ,0         ,4.172          ,4.544               ,0.918 
0    ,1          ,3    ,4    ,23        ,4.162          ,4.447               ,0.936 
0    ,1          ,30   ,1    ,23        ,4.38           ,4.376               ,1.001 
0    ,1          ,30   ,2    ,0         ,4.226          ,4.556               ,0.928 
0    ,1          ,30   ,29   ,0         ,4.003          ,4.607               ,0.869 
0    ,1          ,30   ,29   ,23        ,4.121          ,4.721               ,0.873 
0    ,1          ,30   ,31   ,0         ,4.15           ,4.623               ,0.898 
0    ,1          ,30   ,31   ,23        ,4.213          ,4.766               ,0.884 
0    ,1          ,304  ,16   ,23        ,3.386          ,4.402               ,0.769 
0    ,1          ,31   ,1    ,23        ,4.391          ,4.049               ,1.085 
0    ,1          ,31   ,2    ,0         ,4.23           ,4.371               ,0.968 
0    ,1          ,31   ,30   ,0         ,4.103          ,4.42                ,0.928 
0    ,1          ,31   ,30   ,23        ,4.126          ,4.568               ,0.903 
0    ,1          ,32   ,1    ,23        ,4.365          ,4.4                 ,0.992 
0    ,1          ,32   ,128  ,23        ,3.883          ,4.314               ,0.9   
0    ,1          ,32   ,160  ,23        ,3.847          ,4.281               ,0.899 
0    ,1          ,32   ,192  ,23        ,3.868          ,4.344               ,0.89  
0    ,1          ,32   ,2    ,0         ,4.214          ,4.446               ,0.948 
0    ,1          ,32   ,224  ,23        ,3.891          ,4.307               ,0.904 
0    ,1          ,32   ,256  ,23        ,3.895          ,4.302               ,0.905 
0    ,1          ,32   ,288  ,23        ,3.883          ,4.224               ,0.919 
0    ,1          ,32   ,31   ,0         ,3.939          ,4.319               ,0.912 
0    ,1          ,32   ,31   ,23        ,3.97           ,4.4                 ,0.902 
0    ,1          ,32   ,32   ,23        ,3.885          ,4.254               ,0.913 
0    ,1          ,32   ,320  ,23        ,3.88           ,4.343               ,0.893 
0    ,1          ,32   ,352  ,23        ,3.861          ,4.31                ,0.896 
0    ,1          ,32   ,384  ,23        ,3.855          ,4.266               ,0.904 
0    ,1          ,32   ,416  ,23        ,3.862          ,4.263               ,0.906 
0    ,1          ,32   ,64   ,23        ,3.864          ,4.27                ,0.905 
0    ,1          ,32   ,96   ,23        ,3.883          ,4.237               ,0.916 
0    ,1          ,320  ,128  ,23        ,5.828          ,5.735               ,1.016 
0    ,1          ,320  ,256  ,23        ,11.634         ,12.101              ,0.961 
0    ,1          ,320  ,32   ,23        ,3.23           ,4.328               ,0.746 
0    ,1          ,320  ,512  ,23        ,13.992         ,13.316              ,1.051 
0    ,1          ,320  ,64   ,23        ,4.754          ,5.21                ,0.913 
0    ,1          ,336  ,16   ,23        ,3.421          ,4.404               ,0.777 
0    ,1          ,352  ,128  ,23        ,5.833          ,5.701               ,1.023 
0    ,1          ,352  ,256  ,23        ,10.79          ,10.713              ,1.007 
0    ,1          ,352  ,32   ,23        ,3.225          ,4.265               ,0.756 
0    ,1          ,352  ,512  ,23        ,12.328         ,12.392              ,0.995 
0    ,1          ,352  ,64   ,23        ,4.531          ,4.963               ,0.913 
0    ,1          ,368  ,16   ,23        ,3.413          ,4.456               ,0.766 
0    ,1          ,3712 ,4096 ,23        ,69.333         ,66.099              ,1.049 
0    ,1          ,3744 ,4096 ,23        ,69.039         ,66.061              ,1.045 
0    ,1          ,3776 ,4096 ,23        ,70.827         ,68.046              ,1.041 
0    ,1          ,3808 ,4096 ,23        ,70.877         ,68.138              ,1.04  
0    ,1          ,384  ,128  ,23        ,5.878          ,5.766               ,1.019 
0    ,1          ,384  ,256  ,23        ,11.108         ,10.965              ,1.013 
0    ,1          ,384  ,32   ,23        ,3.223          ,4.328               ,0.745 
0    ,1          ,384  ,512  ,23        ,12.444         ,12.442              ,1.0   
0    ,1          ,384  ,64   ,23        ,4.462          ,4.966               ,0.898 
0    ,1          ,3840 ,4096 ,23        ,70.751         ,68.629              ,1.031 
0    ,1          ,3872 ,4096 ,23        ,70.721         ,68.153              ,1.038 
0    ,1          ,3904 ,4096 ,23        ,72.867         ,69.83               ,1.043 
0    ,1          ,3936 ,4096 ,23        ,72.949         ,70.132              ,1.04  
0    ,1          ,3968 ,4096 ,23        ,72.668         ,69.874              ,1.04  
0    ,1          ,4    ,1    ,23        ,4.191          ,4.174               ,1.004 
0    ,1          ,4    ,2    ,0         ,4.217          ,4.484               ,0.94  
0    ,1          ,4    ,3    ,0         ,4.211          ,4.492               ,0.937 
0    ,1          ,4    ,3    ,23        ,4.226          ,4.515               ,0.936 
0    ,1          ,4    ,5    ,0         ,4.169          ,4.444               ,0.938 
0    ,1          ,4    ,5    ,23        ,4.133          ,4.466               ,0.925 
0    ,1          ,400  ,16   ,23        ,3.411          ,4.437               ,0.769 
0    ,1          ,4000 ,4096 ,23        ,72.637         ,70.087              ,1.036 
0    ,1          ,4032 ,4096 ,23        ,75.206         ,75.888              ,0.991 
0    ,1          ,4064 ,4096 ,23        ,74.927         ,76.127              ,0.984 
0    ,1          ,4096 ,3712 ,23        ,70.072         ,79.779              ,0.878 
0    ,1          ,4096 ,3744 ,23        ,72.571         ,85.249              ,0.851 
0    ,1          ,4096 ,3776 ,23        ,70.557         ,89.008              ,0.793 
0    ,1          ,4096 ,3808 ,23        ,72.002         ,84.525              ,0.852 
0    ,1          ,4096 ,3840 ,23        ,70.951         ,86.909              ,0.816 
0    ,1          ,4096 ,3872 ,23        ,72.268         ,90.092              ,0.802 
0    ,1          ,4096 ,3904 ,23        ,72.303         ,94.795              ,0.763 
0    ,1          ,4096 ,3936 ,23        ,73.649         ,89.655              ,0.821 
0    ,1          ,4096 ,3968 ,23        ,74.059         ,91.75               ,0.807 
0    ,1          ,4096 ,4000 ,23        ,75.714         ,79.387              ,0.954 
0    ,1          ,4096 ,4032 ,23        ,75.261         ,80.768              ,0.932 
0    ,1          ,4096 ,4064 ,23        ,75.595         ,77.004              ,0.982 
0    ,1          ,4096 ,4096 ,23        ,73.916         ,81.257              ,0.91  
0    ,1          ,4096 ,4128 ,23        ,76.574         ,77.095              ,0.993 
0    ,1          ,4096 ,4160 ,23        ,74.592         ,74.521              ,1.001 
0    ,1          ,4096 ,4192 ,23        ,74.809         ,75.758              ,0.987 
0    ,1          ,4096 ,4224 ,23        ,74.591         ,72.318              ,1.031 
0    ,1          ,4096 ,4256 ,23        ,74.684         ,72.269              ,1.033 
0    ,1          ,4096 ,4288 ,23        ,75.003         ,72.485              ,1.035 
0    ,1          ,4096 ,4320 ,23        ,75.238         ,72.553              ,1.037 
0    ,1          ,4096 ,4352 ,23        ,75.364         ,73.469              ,1.026 
0    ,1          ,4096 ,4384 ,23        ,74.274         ,72.128              ,1.03  
0    ,1          ,4096 ,4416 ,23        ,74.736         ,73.636              ,1.015 
0    ,1          ,4096 ,4448 ,23        ,74.785         ,73.923              ,1.012 
0    ,1          ,4096 ,4480 ,23        ,74.957         ,73.608              ,1.018 
0    ,1          ,4128 ,4096 ,23        ,81.208         ,84.484              ,0.961 
0    ,1          ,416  ,128  ,23        ,5.886          ,5.77                ,1.02  
0    ,1          ,416  ,256  ,23        ,10.978         ,10.936              ,1.004 
0    ,1          ,416  ,32   ,23        ,3.253          ,4.344               ,0.749 
0    ,1          ,416  ,512  ,23        ,13.349         ,12.419              ,1.075 
0    ,1          ,416  ,64   ,23        ,4.481          ,4.994               ,0.897 
0    ,1          ,4160 ,4096 ,23        ,80.335         ,87.261              ,0.921 
0    ,1          ,4192 ,4096 ,23        ,78.199         ,83.011              ,0.942 
0    ,1          ,4224 ,4096 ,23        ,77.686         ,96.108              ,0.808 
0    ,1          ,4256 ,4096 ,23        ,77.544         ,93.649              ,0.828 
0    ,1          ,4288 ,4096 ,23        ,79.398         ,96.434              ,0.823 
0    ,1          ,4320 ,4096 ,23        ,77.602         ,95.176              ,0.815 
0    ,1          ,4352 ,4096 ,23        ,75.837         ,92.704              ,0.818 
0    ,1          ,4384 ,4096 ,23        ,75.98          ,88.653              ,0.857 
0    ,1          ,4416 ,4096 ,23        ,77.536         ,94.707              ,0.819 
0    ,1          ,4448 ,4096 ,23        ,76.732         ,93.319              ,0.822 
0    ,1          ,448  ,128  ,23        ,6.04           ,5.886               ,1.026 
0    ,1          ,448  ,256  ,23        ,12.743         ,12.451              ,1.023 
0    ,1          ,448  ,512  ,23        ,14.784         ,14.702              ,1.006 
0    ,1          ,448  ,64   ,23        ,4.738          ,5.094               ,0.93  
0    ,1          ,4480 ,4096 ,23        ,76.162         ,91.738              ,0.83  
0    ,1          ,48   ,16   ,23        ,3.459          ,4.481               ,0.772 
0    ,1          ,480  ,128  ,23        ,6.028          ,5.989               ,1.007 
0    ,1          ,480  ,256  ,23        ,12.342         ,12.101              ,1.02  
0    ,1          ,480  ,512  ,23        ,14.825         ,14.735              ,1.006 
0    ,1          ,5    ,1    ,23        ,4.199          ,4.194               ,1.001 
0    ,1          ,5    ,2    ,0         ,4.241          ,4.485               ,0.946 
0    ,1          ,5    ,4    ,0         ,4.367          ,4.499               ,0.971 
0    ,1          ,5    ,4    ,23        ,4.416          ,4.694               ,0.941 
0    ,1          ,5    ,6    ,0         ,4.114          ,4.433               ,0.928 
0    ,1          ,5    ,6    ,23        ,4.139          ,4.462               ,0.928 
0    ,1          ,512  ,128  ,23        ,5.961          ,5.89                ,1.012 
0    ,1          ,512  ,160  ,23        ,6.461          ,6.469               ,0.999 
0    ,1          ,512  ,192  ,23        ,10.406         ,10.231              ,1.017 
0    ,1          ,512  ,224  ,23        ,12.542         ,12.734              ,0.985 
0    ,1          ,512  ,256  ,23        ,11.787         ,11.84               ,0.995 
0    ,1          ,512  ,288  ,23        ,14.081         ,14.495              ,0.971 
0    ,1          ,512  ,320  ,23        ,14.661         ,14.783              ,0.992 
0    ,1          ,512  ,352  ,23        ,13.625         ,13.207              ,1.032 
0    ,1          ,512  ,384  ,23        ,13.584         ,13.191              ,1.03  
0    ,1          ,512  ,416  ,23        ,13.168         ,12.934              ,1.018 
0    ,1          ,512  ,448  ,23        ,13.648         ,13.43               ,1.016 
0    ,1          ,512  ,480  ,23        ,14.762         ,13.753              ,1.073 
0    ,1          ,512  ,512  ,23        ,14.579         ,14.468              ,1.008 
0    ,1          ,512  ,544  ,23        ,14.807         ,14.665              ,1.01  
0    ,1          ,512  ,576  ,23        ,14.853         ,14.683              ,1.012 
0    ,1          ,512  ,608  ,23        ,14.739         ,14.655              ,1.006 
0    ,1          ,512  ,640  ,23        ,14.866         ,14.745              ,1.008 
0    ,1          ,512  ,672  ,23        ,14.932         ,14.777              ,1.011 
0    ,1          ,512  ,704  ,23        ,16.389         ,15.939              ,1.028 
0    ,1          ,512  ,736  ,23        ,16.393         ,16.39               ,1.0   
0    ,1          ,512  ,768  ,23        ,16.447         ,16.377              ,1.004 
0    ,1          ,512  ,800  ,23        ,16.298         ,16.318              ,0.999 
0    ,1          ,512  ,832  ,23        ,16.54          ,16.351              ,1.012 
0    ,1          ,512  ,864  ,23        ,14.867         ,14.846              ,1.001 
0    ,1          ,512  ,896  ,23        ,14.895         ,14.653              ,1.017 
0    ,1          ,544  ,256  ,23        ,11.198         ,11.012              ,1.017 
0    ,1          ,544  ,512  ,23        ,14.803         ,13.777              ,1.074 
0    ,1          ,576  ,256  ,23        ,13.002         ,12.636              ,1.029 
0    ,1          ,576  ,512  ,23        ,16.55          ,15.432              ,1.072 
0    ,1          ,6    ,1    ,23        ,4.259          ,4.164               ,1.023 
0    ,1          ,6    ,2    ,0         ,4.338          ,4.47                ,0.97  
0    ,1          ,6    ,5    ,0         ,4.184          ,4.499               ,0.93  
0    ,1          ,6    ,5    ,23        ,4.206          ,4.514               ,0.932 
0    ,1          ,6    ,7    ,0         ,4.125          ,4.407               ,0.936 
0    ,1          ,6    ,7    ,23        ,4.113          ,4.412               ,0.932 
0    ,1          ,608  ,256  ,23        ,12.281         ,12.127              ,1.013 
0    ,1          ,608  ,512  ,23        ,15.739         ,15.225              ,1.034 
0    ,1          ,64   ,128  ,23        ,4.853          ,5.804               ,0.836 
0    ,1          ,64   ,160  ,23        ,5.392          ,5.607               ,0.962 
0    ,1          ,64   ,192  ,23        ,4.525          ,5.79                ,0.782 
0    ,1          ,64   ,224  ,23        ,4.579          ,5.846               ,0.783 
0    ,1          ,64   ,256  ,23        ,4.617          ,5.902               ,0.782 
0    ,1          ,64   ,288  ,23        ,4.651          ,6.111               ,0.761 
0    ,1          ,64   ,32   ,23        ,3.238          ,4.312               ,0.751 
0    ,1          ,64   ,320  ,23        ,5.048          ,5.939               ,0.85  
0    ,1          ,64   ,352  ,23        ,4.62           ,5.914               ,0.781 
0    ,1          ,64   ,384  ,23        ,4.521          ,5.66                ,0.799 
0    ,1          ,64   ,416  ,23        ,4.346          ,5.729               ,0.759 
0    ,1          ,64   ,448  ,23        ,4.72           ,5.674               ,0.832 
0    ,1          ,64   ,64   ,23        ,3.925          ,5.463               ,0.719 
0    ,1          ,64   ,96   ,23        ,4.204          ,5.606               ,0.75  
0    ,1          ,640  ,1024 ,23        ,17.952         ,16.891              ,1.063 
0    ,1          ,640  ,256  ,23        ,11.257         ,11.097              ,1.014 
0    ,1          ,640  ,512  ,23        ,16.238         ,15.429              ,1.052 
0    ,1          ,672  ,1024 ,23        ,18.042         ,16.859              ,1.07  
0    ,1          ,672  ,512  ,23        ,16.17          ,15.357              ,1.053 
0    ,1          ,7    ,1    ,23        ,4.138          ,4.137               ,1.0   
0    ,1          ,7    ,2    ,0         ,4.197          ,4.512               ,0.93  
0    ,1          ,7    ,6    ,0         ,4.146          ,4.461               ,0.929 
0    ,1          ,7    ,6    ,23        ,4.186          ,4.504               ,0.929 
0    ,1          ,7    ,8    ,0         ,4.133          ,4.435               ,0.932 
0    ,1          ,7    ,8    ,23        ,4.117          ,4.477               ,0.919 
0    ,1          ,704  ,1024 ,23        ,19.989         ,18.893              ,1.058 
0    ,1          ,704  ,512  ,23        ,20.077         ,20.141              ,0.997 
0    ,1          ,736  ,1024 ,23        ,19.845         ,18.957              ,1.047 
0    ,1          ,736  ,512  ,23        ,19.561         ,19.732              ,0.991 
0    ,1          ,768  ,1024 ,23        ,20.058         ,18.981              ,1.057 
0    ,1          ,768  ,512  ,23        ,17.689         ,17.897              ,0.988 
0    ,1          ,7808 ,8192 ,23        ,149.36         ,144.988             ,1.03  
0    ,1          ,7840 ,8192 ,23        ,148.532        ,145.166             ,1.023 
0    ,1          ,7872 ,8192 ,23        ,147.767        ,146.593             ,1.008 
0    ,1          ,7904 ,8192 ,23        ,147.914        ,147.239             ,1.005 
0    ,1          ,7936 ,8192 ,23        ,149.197        ,146.703             ,1.017 
0    ,1          ,7968 ,8192 ,23        ,148.072        ,146.851             ,1.008 
0    ,1          ,8    ,1    ,23        ,4.163          ,4.146               ,1.004 
0    ,1          ,8    ,2    ,0         ,4.164          ,4.425               ,0.941 
0    ,1          ,8    ,7    ,0         ,4.131          ,4.43                ,0.932 
0    ,1          ,8    ,7    ,23        ,4.149          ,4.518               ,0.918 
0    ,1          ,8    ,9    ,0         ,4.164          ,4.415               ,0.943 
0    ,1          ,8    ,9    ,23        ,4.136          ,4.398               ,0.94  
0    ,1          ,80   ,16   ,23        ,3.421          ,4.386               ,0.78  
0    ,1          ,800  ,1024 ,23        ,19.99          ,18.946              ,1.055 
0    ,1          ,800  ,512  ,23        ,17.675         ,17.901              ,0.987 
0    ,1          ,8000 ,8192 ,23        ,153.636        ,148.856             ,1.032 
0    ,1          ,8032 ,8192 ,23        ,152.158        ,148.255             ,1.026 
0    ,1          ,8064 ,8192 ,23        ,152.037        ,148.839             ,1.021 
0    ,1          ,8096 ,8192 ,23        ,154.054        ,148.571             ,1.037 
0    ,1          ,8128 ,8192 ,23        ,155.183        ,151.288             ,1.026 
0    ,1          ,8160 ,8192 ,23        ,155.711        ,151.495             ,1.028 
0    ,1          ,832  ,1024 ,23        ,22.165         ,21.361              ,1.038 
0    ,1          ,832  ,512  ,23        ,19.59          ,19.99               ,0.98  
0    ,1          ,864  ,1024 ,23        ,22.101         ,21.311              ,1.037 
0    ,1          ,864  ,512  ,23        ,17.21          ,16.442              ,1.047 
0    ,1          ,896  ,1024 ,23        ,22.107         ,21.415              ,1.032 
0    ,1          ,896  ,512  ,23        ,16.326         ,15.414              ,1.059 
0    ,1          ,9    ,1    ,23        ,4.395          ,4.354               ,1.009 
0    ,1          ,9    ,10   ,0         ,4.161          ,4.489               ,0.927 
0    ,1          ,9    ,10   ,23        ,4.143          ,4.488               ,0.923 
0    ,1          ,9    ,2    ,0         ,4.387          ,4.657               ,0.942 
0    ,1          ,9    ,8    ,0         ,4.158          ,4.539               ,0.916 
0    ,1          ,9    ,8    ,23        ,4.156          ,4.578               ,0.908 
0    ,1          ,928  ,1024 ,23        ,22.059         ,21.296              ,1.036 
0    ,1          ,96   ,128  ,23        ,4.686          ,5.63                ,0.832 
0    ,1          ,96   ,256  ,23        ,4.694          ,5.464               ,0.859 
0    ,1          ,96   ,32   ,23        ,3.246          ,4.295               ,0.756 
0    ,1          ,96   ,64   ,23        ,4.504          ,5.22                ,0.863 
0    ,1          ,960  ,1024 ,23        ,24.02          ,23.368              ,1.028 
0    ,1          ,992  ,1024 ,23        ,23.956         ,23.332              ,1.027 
1    ,0          ,0    ,1    ,0         ,3.349          ,3.36                ,0.997 
1    ,0          ,0    ,1    ,23        ,3.388          ,3.386               ,1.001 
1    ,0          ,192  ,32   ,0         ,7.79           ,8.026               ,0.971 
1    ,0          ,192  ,32   ,23        ,7.959          ,8.361               ,0.952 
1    ,0          ,2    ,1    ,0         ,4.081          ,4.379               ,0.932 
1    ,0          ,2    ,1    ,23        ,4.085          ,4.446               ,0.919 
1    ,0          ,256  ,1    ,0         ,10.113         ,9.72                ,1.04  
1    ,0          ,256  ,1    ,23        ,10.068         ,9.714               ,1.036 
1    ,0          ,256  ,32   ,0         ,9.8            ,9.034               ,1.085 
1    ,0          ,256  ,32   ,23        ,9.776          ,9.058               ,1.079 
1    ,0          ,256  ,64   ,0         ,8.26           ,8.312               ,0.994 
1    ,0          ,256  ,64   ,23        ,8.315          ,8.25                ,1.008 
1    ,0          ,512  ,32   ,0         ,14.909         ,13.749              ,1.084 
1    ,0          ,512  ,32   ,23        ,14.863         ,13.751              ,1.081 
1    ,1          ,0    ,1    ,0         ,3.337          ,3.346               ,0.997 
1    ,1          ,0    ,1    ,23        ,3.364          ,3.368               ,0.999 
1    ,1          ,192  ,32   ,0         ,3.308          ,4.315               ,0.767 
1    ,1          ,192  ,32   ,23        ,3.315          ,4.295               ,0.772 
1    ,1          ,2    ,1    ,0         ,4.075          ,4.32                ,0.943 
1    ,1          ,2    ,1    ,23        ,4.041          ,4.317               ,0.936 
1    ,1          ,256  ,1    ,0         ,3.386          ,4.073               ,0.831 
1    ,1          ,256  ,1    ,23        ,3.364          ,4.028               ,0.835 
1    ,1          ,256  ,32   ,0         ,3.345          ,4.333               ,0.772 
1    ,1          ,256  ,32   ,23        ,3.329          ,4.235               ,0.786 
1    ,1          ,256  ,64   ,0         ,5.189          ,5.283               ,0.982 
1    ,1          ,256  ,64   ,23        ,5.308          ,5.38                ,0.986 
1    ,1          ,512  ,32   ,0         ,3.249          ,4.276               ,0.76  
1    ,1          ,512  ,32   ,23        ,3.265          ,4.255               ,0.767 
10   ,0          ,11   ,10   ,0         ,4.195          ,4.189               ,1.001 
10   ,0          ,11   ,10   ,23        ,4.166          ,4.45                ,0.936 
10   ,0          ,9    ,10   ,0         ,4.177          ,4.479               ,0.933 
10   ,0          ,9    ,10   ,23        ,4.185          ,4.482               ,0.934 
10   ,1          ,11   ,10   ,0         ,4.14           ,4.522               ,0.915 
10   ,1          ,11   ,10   ,23        ,4.195          ,4.477               ,0.937 
10   ,1          ,9    ,10   ,0         ,4.133          ,4.409               ,0.937 
10   ,1          ,9    ,10   ,23        ,4.184          ,4.433               ,0.944 
11   ,0          ,10   ,11   ,0         ,4.178          ,4.437               ,0.941 
11   ,0          ,10   ,11   ,23        ,4.19           ,4.559               ,0.919 
11   ,0          ,12   ,11   ,0         ,4.202          ,4.201               ,1.0   
11   ,0          ,12   ,11   ,23        ,4.191          ,4.483               ,0.935 
11   ,1          ,10   ,11   ,0         ,4.153          ,4.464               ,0.93  
11   ,1          ,10   ,11   ,23        ,4.181          ,4.466               ,0.936 
11   ,1          ,12   ,11   ,0         ,4.138          ,4.498               ,0.92  
11   ,1          ,12   ,11   ,23        ,4.138          ,4.507               ,0.918 
12   ,0          ,11   ,12   ,0         ,4.204          ,4.513               ,0.932 
12   ,0          ,11   ,12   ,23        ,4.205          ,4.505               ,0.933 
12   ,0          ,13   ,12   ,0         ,4.198          ,4.183               ,1.004 
12   ,0          ,13   ,12   ,23        ,4.173          ,4.442               ,0.939 
12   ,1          ,11   ,12   ,0         ,4.145          ,4.5                 ,0.921 
12   ,1          ,11   ,12   ,23        ,4.172          ,4.483               ,0.931 
12   ,1          ,13   ,12   ,0         ,4.131          ,4.469               ,0.924 
12   ,1          ,13   ,12   ,23        ,4.145          ,4.43                ,0.936 
13   ,0          ,12   ,13   ,0         ,4.169          ,4.443               ,0.938 
13   ,0          ,12   ,13   ,23        ,4.199          ,4.535               ,0.926 
13   ,0          ,14   ,13   ,0         ,4.191          ,4.174               ,1.004 
13   ,0          ,14   ,13   ,23        ,4.204          ,4.49                ,0.936 
13   ,1          ,12   ,13   ,0         ,4.125          ,4.425               ,0.932 
13   ,1          ,12   ,13   ,23        ,4.113          ,4.466               ,0.921 
13   ,1          ,14   ,13   ,0         ,4.123          ,4.507               ,0.915 
13   ,1          ,14   ,13   ,23        ,4.15           ,4.505               ,0.921 
14   ,0          ,13   ,14   ,0         ,4.202          ,4.466               ,0.941 
14   ,0          ,13   ,14   ,23        ,4.17           ,4.53                ,0.921 
14   ,0          ,15   ,14   ,0         ,4.186          ,4.161               ,1.006 
14   ,0          ,15   ,14   ,23        ,4.185          ,4.501               ,0.93  
14   ,1          ,13   ,14   ,0         ,4.113          ,4.432               ,0.928 
14   ,1          ,13   ,14   ,23        ,4.104          ,4.433               ,0.926 
14   ,1          ,15   ,14   ,0         ,4.112          ,4.451               ,0.924 
14   ,1          ,15   ,14   ,23        ,4.115          ,4.594               ,0.896 
15   ,0          ,14   ,15   ,0         ,4.177          ,4.53                ,0.922 
15   ,0          ,14   ,15   ,23        ,4.156          ,4.632               ,0.897 
15   ,0          ,16   ,15   ,0         ,4.184          ,4.177               ,1.002 
15   ,0          ,16   ,15   ,23        ,4.164          ,4.166               ,0.999 
15   ,1          ,14   ,15   ,0         ,4.137          ,4.463               ,0.927 
15   ,1          ,14   ,15   ,23        ,4.133          ,4.428               ,0.933 
15   ,1          ,16   ,15   ,0         ,4.106          ,4.403               ,0.933 
15   ,1          ,16   ,15   ,23        ,4.116          ,4.42                ,0.931 
16   ,0          ,15   ,16   ,0         ,4.189          ,4.708               ,0.89  
16   ,0          ,15   ,16   ,23        ,4.181          ,4.681               ,0.893 
16   ,0          ,17   ,16   ,0         ,3.555          ,3.491               ,1.018 
16   ,0          ,17   ,16   ,23        ,3.517          ,3.445               ,1.021 
16   ,1          ,15   ,16   ,0         ,4.115          ,4.687               ,0.878 
16   ,1          ,15   ,16   ,23        ,4.103          ,4.81                ,0.853 
16   ,1          ,17   ,16   ,0         ,3.421          ,3.745               ,0.914 
16   ,1          ,17   ,16   ,23        ,3.501          ,3.893               ,0.899 
17   ,0          ,16   ,17   ,0         ,3.502          ,3.792               ,0.924 
17   ,0          ,16   ,17   ,23        ,3.563          ,3.837               ,0.929 
17   ,0          ,18   ,17   ,0         ,3.617          ,3.489               ,1.037 
17   ,0          ,18   ,17   ,23        ,3.599          ,3.466               ,1.039 
17   ,1          ,16   ,17   ,0         ,3.453          ,3.799               ,0.909 
17   ,1          ,16   ,17   ,23        ,3.457          ,3.754               ,0.921 
17   ,1          ,18   ,17   ,0         ,3.426          ,3.829               ,0.895 
17   ,1          ,18   ,17   ,23        ,3.491          ,3.776               ,0.924 
18   ,0          ,17   ,18   ,0         ,3.471          ,3.744               ,0.927 
18   ,0          ,17   ,18   ,23        ,3.594          ,4.212               ,0.853 
18   ,0          ,19   ,18   ,0         ,3.574          ,3.461               ,1.033 
18   ,0          ,19   ,18   ,23        ,3.625          ,3.453               ,1.05  
18   ,1          ,17   ,18   ,0         ,3.448          ,3.792               ,0.909 
18   ,1          ,17   ,18   ,23        ,3.417          ,3.772               ,0.906 
18   ,1          ,19   ,18   ,0         ,3.438          ,3.747               ,0.917 
18   ,1          ,19   ,18   ,23        ,3.489          ,3.828               ,0.911 
19   ,0          ,18   ,19   ,0         ,3.465          ,3.738               ,0.927 
19   ,0          ,18   ,19   ,23        ,3.492          ,3.771               ,0.926 
19   ,0          ,20   ,19   ,0         ,3.48           ,3.469               ,1.003 
19   ,0          ,20   ,19   ,23        ,3.478          ,3.467               ,1.003 
19   ,1          ,18   ,19   ,0         ,3.407          ,3.901               ,0.873 
19   ,1          ,18   ,19   ,23        ,3.408          ,3.754               ,0.908 
19   ,1          ,20   ,19   ,0         ,3.41           ,3.798               ,0.898 
19   ,1          ,20   ,19   ,23        ,3.466          ,3.786               ,0.915 
2    ,0          ,1    ,2    ,0         ,4.372          ,4.196               ,1.042 
2    ,0          ,1    ,2    ,23        ,4.306          ,4.195               ,1.027 
2    ,0          ,192  ,64   ,0         ,6.455          ,6.489               ,0.995 
2    ,0          ,192  ,64   ,23        ,6.48           ,6.443               ,1.006 
2    ,0          ,256  ,2    ,0         ,10.036         ,9.667               ,1.038 
2    ,0          ,256  ,2    ,23        ,10.045         ,9.798               ,1.025 
2    ,0          ,256  ,64   ,0         ,8.226          ,8.181               ,1.006 
2    ,0          ,256  ,64   ,23        ,8.184          ,8.196               ,0.998 
2    ,0          ,3    ,2    ,0         ,4.164          ,4.49                ,0.927 
2    ,0          ,3    ,2    ,23        ,4.18           ,4.446               ,0.94  
2    ,0          ,512  ,64   ,0         ,13.641         ,13.46               ,1.013 
2    ,0          ,512  ,64   ,23        ,13.832         ,13.589              ,1.018 
2    ,1          ,1    ,2    ,0         ,4.201          ,4.197               ,1.001 
2    ,1          ,1    ,2    ,23        ,4.241          ,4.184               ,1.014 
2    ,1          ,192  ,64   ,0         ,5.238          ,5.391               ,0.972 
2    ,1          ,192  ,64   ,23        ,5.248          ,5.353               ,0.981 
2    ,1          ,256  ,2    ,0         ,3.384          ,4.073               ,0.831 
2    ,1          ,256  ,2    ,23        ,3.372          ,4.044               ,0.834 
2    ,1          ,256  ,64   ,0         ,5.25           ,5.379               ,0.976 
2    ,1          ,256  ,64   ,23        ,5.278          ,5.319               ,0.992 
2    ,1          ,3    ,2    ,0         ,4.162          ,4.499               ,0.925 
2    ,1          ,3    ,2    ,23        ,4.166          ,4.471               ,0.932 
2    ,1          ,512  ,64   ,0         ,5.206          ,5.288               ,0.984 
2    ,1          ,512  ,64   ,23        ,5.208          ,5.306               ,0.982 
20   ,0          ,19   ,20   ,0         ,3.442          ,3.753               ,0.917 
20   ,0          ,19   ,20   ,23        ,3.589          ,4.281               ,0.838 
20   ,0          ,21   ,20   ,0         ,3.47           ,3.472               ,0.999 
20   ,0          ,21   ,20   ,23        ,3.445          ,3.44                ,1.001 
20   ,1          ,19   ,20   ,0         ,3.464          ,3.799               ,0.912 
20   ,1          ,19   ,20   ,23        ,3.411          ,3.732               ,0.914 
20   ,1          ,21   ,20   ,0         ,3.427          ,3.752               ,0.913 
20   ,1          ,21   ,20   ,23        ,3.412          ,3.767               ,0.906 
2033 ,0          ,256  ,128  ,23        ,6.339          ,6.404               ,0.99  
2033 ,0          ,256  ,160  ,23        ,5.86           ,5.905               ,0.992 
2033 ,0          ,256  ,192  ,23        ,5.071          ,5.131               ,0.988 
2033 ,0          ,256  ,224  ,23        ,3.296          ,4.325               ,0.762 
2033 ,0          ,256  ,32   ,23        ,9.494          ,9.209               ,1.031 
2033 ,0          ,256  ,64   ,0         ,8.081          ,8.064               ,1.002 
2033 ,0          ,256  ,64   ,23        ,8.14           ,8.136               ,1.001 
2033 ,0          ,256  ,96   ,23        ,7.525          ,7.752               ,0.971 
2033 ,1          ,256  ,128  ,23        ,6.306          ,6.368               ,0.99  
2033 ,1          ,256  ,160  ,23        ,7.524          ,7.674               ,0.98  
2033 ,1          ,256  ,192  ,23        ,8.104          ,8.192               ,0.989 
2033 ,1          ,256  ,224  ,23        ,9.454          ,9.106               ,1.038 
2033 ,1          ,256  ,32   ,23        ,3.203          ,4.27                ,0.75  
2033 ,1          ,256  ,64   ,0         ,4.992          ,4.998               ,0.999 
2033 ,1          ,256  ,64   ,23        ,5.052          ,5.113               ,0.988 
2033 ,1          ,256  ,96   ,23        ,6.378          ,5.903               ,1.081 
2048 ,0          ,0    ,1    ,0         ,3.463          ,3.461               ,1.001 
2048 ,0          ,0    ,1    ,23        ,3.411          ,3.428               ,0.995 
2048 ,0          ,1    ,2    ,0         ,3.54           ,3.539               ,1.0   
2048 ,0          ,1    ,2    ,23        ,3.526          ,3.536               ,0.997 
2048 ,0          ,10   ,11   ,0         ,3.513          ,3.82                ,0.92  
2048 ,0          ,10   ,11   ,23        ,3.505          ,3.815               ,0.919 
2048 ,0          ,10   ,9    ,0         ,3.628          ,3.5                 ,1.037 
2048 ,0          ,10   ,9    ,23        ,3.606          ,3.471               ,1.039 
2048 ,0          ,1024 ,1024 ,23        ,23.684         ,23.169              ,1.022 
2048 ,0          ,1024 ,1056 ,23        ,24.146         ,23.329              ,1.035 
2048 ,0          ,1024 ,1088 ,23        ,23.947         ,23.398              ,1.023 
2048 ,0          ,1024 ,1120 ,23        ,24.092         ,23.35               ,1.032 
2048 ,0          ,1024 ,1152 ,23        ,23.887         ,23.474              ,1.018 
2048 ,0          ,1024 ,1184 ,23        ,24.093         ,23.382              ,1.03  
2048 ,0          ,1024 ,1216 ,23        ,23.971         ,23.45               ,1.022 
2048 ,0          ,1024 ,1248 ,23        ,23.738         ,23.327              ,1.018 
2048 ,0          ,1024 ,1280 ,23        ,24.19          ,23.303              ,1.038 
2048 ,0          ,1024 ,1312 ,23        ,23.884         ,23.469              ,1.018 
2048 ,0          ,1024 ,1344 ,23        ,24.022         ,23.396              ,1.027 
2048 ,0          ,1024 ,1376 ,23        ,24.044         ,23.464              ,1.025 
2048 ,0          ,1024 ,1408 ,23        ,24.205         ,23.387              ,1.035 
2048 ,0          ,1024 ,640  ,23        ,13.442         ,13.072              ,1.028 
2048 ,0          ,1024 ,672  ,23        ,13.435         ,13.121              ,1.024 
2048 ,0          ,1024 ,704  ,23        ,12.62          ,12.355              ,1.021 
2048 ,0          ,1024 ,736  ,23        ,12.037         ,11.918              ,1.01  
2048 ,0          ,1024 ,768  ,23        ,11.02          ,10.872              ,1.014 
2048 ,0          ,1024 ,800  ,23        ,11.008         ,10.915              ,1.008 
2048 ,0          ,1024 ,832  ,23        ,10.243         ,10.115              ,1.013 
2048 ,0          ,1024 ,864  ,23        ,6.275          ,6.314               ,0.994 
2048 ,0          ,1024 ,896  ,23        ,5.698          ,5.682               ,1.003 
2048 ,0          ,1024 ,928  ,23        ,5.052          ,5.163               ,0.979 
2048 ,0          ,1024 ,960  ,23        ,4.527          ,5.011               ,0.903 
2048 ,0          ,1024 ,992  ,23        ,3.306          ,4.366               ,0.757 
2048 ,0          ,1056 ,1024 ,23        ,3.44           ,4.398               ,0.782 
2048 ,0          ,1088 ,1024 ,23        ,4.793          ,5.114               ,0.937 
2048 ,0          ,11   ,10   ,0         ,3.507          ,3.507               ,1.0   
2048 ,0          ,11   ,10   ,23        ,3.544          ,3.466               ,1.022 
2048 ,0          ,11   ,12   ,0         ,3.513          ,3.808               ,0.923 
2048 ,0          ,11   ,12   ,23        ,3.509          ,3.829               ,0.916 
2048 ,0          ,112  ,16   ,23        ,5.87           ,6.368               ,0.922 
2048 ,0          ,1120 ,1024 ,23        ,5.272          ,5.287               ,0.997 
2048 ,0          ,1152 ,1024 ,23        ,5.875          ,5.89                ,0.997 
2048 ,0          ,1184 ,1024 ,23        ,6.431          ,6.543               ,0.983 
2048 ,0          ,12   ,11   ,0         ,3.498          ,3.498               ,1.0   
2048 ,0          ,12   ,11   ,23        ,3.499          ,3.487               ,1.003 
2048 ,0          ,12   ,13   ,0         ,3.516          ,3.831               ,0.918 
2048 ,0          ,12   ,13   ,23        ,3.517          ,3.788               ,0.928 
2048 ,0          ,1216 ,1024 ,23        ,11.136         ,11.006              ,1.012 
2048 ,0          ,1248 ,1024 ,23        ,11.191         ,10.993              ,1.018 
2048 ,0          ,128  ,128  ,23        ,5.465          ,6.191               ,0.883 
2048 ,0          ,128  ,160  ,23        ,5.556          ,6.264               ,0.887 
2048 ,0          ,128  ,192  ,23        ,5.583          ,6.338               ,0.881 
2048 ,0          ,128  ,224  ,23        ,5.654          ,6.521               ,0.867 
2048 ,0          ,128  ,256  ,23        ,5.623          ,6.318               ,0.89  
2048 ,0          ,128  ,288  ,23        ,5.709          ,6.298               ,0.907 
2048 ,0          ,128  ,32   ,23        ,4.983          ,5.05                ,0.987 
2048 ,0          ,128  ,320  ,23        ,5.631          ,6.292               ,0.895 
2048 ,0          ,128  ,352  ,23        ,5.629          ,6.368               ,0.884 
2048 ,0          ,128  ,384  ,23        ,5.641          ,6.284               ,0.898 
2048 ,0          ,128  ,416  ,23        ,5.619          ,6.342               ,0.886 
2048 ,0          ,128  ,448  ,23        ,5.776          ,6.284               ,0.919 
2048 ,0          ,128  ,480  ,23        ,5.666          ,6.492               ,0.873 
2048 ,0          ,128  ,512  ,23        ,5.8            ,6.407               ,0.905 
2048 ,0          ,128  ,64   ,23        ,4.435          ,4.895               ,0.906 
2048 ,0          ,128  ,96   ,23        ,3.254          ,4.338               ,0.75  
2048 ,0          ,1280 ,1024 ,23        ,11.096         ,10.953              ,1.013 
2048 ,0          ,13   ,12   ,0         ,3.569          ,3.504               ,1.018 
2048 ,0          ,13   ,12   ,23        ,3.538          ,3.47                ,1.019 
2048 ,0          ,13   ,14   ,0         ,3.515          ,3.817               ,0.921 
2048 ,0          ,13   ,14   ,23        ,3.51           ,3.852               ,0.911 
2048 ,0          ,1312 ,1024 ,23        ,11.092         ,10.994              ,1.009 
2048 ,0          ,1344 ,1024 ,23        ,13.447         ,13.208              ,1.018 
2048 ,0          ,1376 ,1024 ,23        ,13.525         ,13.099              ,1.033 
2048 ,0          ,14   ,13   ,0         ,3.539          ,3.464               ,1.022 
2048 ,0          ,14   ,13   ,23        ,3.589          ,3.464               ,1.036 
2048 ,0          ,14   ,15   ,0         ,3.513          ,3.835               ,0.916 
2048 ,0          ,14   ,15   ,23        ,3.496          ,3.853               ,0.907 
2048 ,0          ,1408 ,1024 ,23        ,13.448         ,13.089              ,1.027 
2048 ,0          ,144  ,16   ,23        ,6.865          ,7.049               ,0.974 
2048 ,0          ,15   ,14   ,0         ,3.494          ,3.484               ,1.003 
2048 ,0          ,15   ,14   ,23        ,3.47           ,3.459               ,1.003 
2048 ,0          ,15   ,16   ,0         ,3.508          ,3.864               ,0.908 
2048 ,0          ,15   ,16   ,23        ,3.505          ,3.779               ,0.928 
2048 ,0          ,16   ,112  ,23        ,3.312          ,3.654               ,0.906 
2048 ,0          ,16   ,144  ,23        ,3.341          ,3.651               ,0.915 
2048 ,0          ,16   ,15   ,0         ,3.56           ,3.496               ,1.018 
2048 ,0          ,16   ,15   ,23        ,3.543          ,3.473               ,1.02  
2048 ,0          ,16   ,16   ,23        ,3.312          ,3.632               ,0.912 
2048 ,0          ,16   ,17   ,0         ,3.477          ,3.864               ,0.9   
2048 ,0          ,16   ,17   ,23        ,3.498          ,3.849               ,0.909 
2048 ,0          ,16   ,176  ,23        ,3.339          ,3.831               ,0.872 
2048 ,0          ,16   ,208  ,23        ,3.334          ,3.852               ,0.866 
2048 ,0          ,16   ,240  ,23        ,3.357          ,3.691               ,0.91  
2048 ,0          ,16   ,272  ,23        ,3.328          ,3.623               ,0.919 
2048 ,0          ,16   ,304  ,23        ,3.338          ,3.609               ,0.925 
2048 ,0          ,16   ,336  ,23        ,3.303          ,3.757               ,0.879 
2048 ,0          ,16   ,368  ,23        ,3.313          ,3.658               ,0.906 
2048 ,0          ,16   ,400  ,23        ,3.338          ,3.645               ,0.916 
2048 ,0          ,16   ,48   ,23        ,3.434          ,3.768               ,0.911 
2048 ,0          ,16   ,80   ,23        ,3.362          ,3.709               ,0.906 
2048 ,0          ,160  ,128  ,23        ,3.332          ,4.353               ,0.766 
2048 ,0          ,160  ,256  ,23        ,6.361          ,6.839               ,0.93  
2048 ,0          ,160  ,32   ,23        ,5.902          ,6.567               ,0.899 
2048 ,0          ,160  ,512  ,23        ,6.388          ,6.876               ,0.929 
2048 ,0          ,160  ,64   ,23        ,5.608          ,5.445               ,1.03  
2048 ,0          ,1664 ,2048 ,23        ,35.279         ,33.179              ,1.063 
2048 ,0          ,1696 ,2048 ,23        ,35.335         ,32.926              ,1.073 
2048 ,0          ,17   ,16   ,0         ,3.544          ,3.56                ,0.996 
2048 ,0          ,17   ,16   ,23        ,3.517          ,3.453               ,1.018 
2048 ,0          ,17   ,18   ,0         ,3.476          ,3.852               ,0.902 
2048 ,0          ,17   ,18   ,23        ,3.503          ,3.907               ,0.897 
2048 ,0          ,1728 ,2048 ,23        ,37.296         ,34.931              ,1.068 
2048 ,0          ,176  ,16   ,23        ,7.832          ,7.948               ,0.985 
2048 ,0          ,1760 ,2048 ,23        ,37.417         ,34.976              ,1.07  
2048 ,0          ,1792 ,2048 ,23        ,37.407         ,34.976              ,1.07  
2048 ,0          ,18   ,17   ,0         ,3.525          ,3.463               ,1.018 
2048 ,0          ,18   ,17   ,23        ,3.582          ,3.473               ,1.031 
2048 ,0          ,18   ,19   ,0         ,3.47           ,3.852               ,0.901 
2048 ,0          ,18   ,19   ,23        ,3.58           ,3.865               ,0.926 
2048 ,0          ,1824 ,2048 ,23        ,38.219         ,34.979              ,1.093 
2048 ,0          ,1856 ,2048 ,23        ,39.896         ,36.873              ,1.082 
2048 ,0          ,1888 ,2048 ,23        ,40.946         ,36.868              ,1.111 
2048 ,0          ,19   ,18   ,0         ,3.545          ,3.481               ,1.018 
2048 ,0          ,19   ,18   ,23        ,3.523          ,3.453               ,1.02  
2048 ,0          ,19   ,20   ,0         ,3.484          ,3.88                ,0.898 
2048 ,0          ,19   ,20   ,23        ,3.506          ,3.841               ,0.913 
2048 ,0          ,192  ,128  ,23        ,4.561          ,5.013               ,0.91  
2048 ,0          ,192  ,256  ,23        ,7.593          ,8.011               ,0.948 
2048 ,0          ,192  ,32   ,23        ,6.355          ,6.332               ,1.004 
2048 ,0          ,192  ,512  ,23        ,7.653          ,8.082               ,0.947 
2048 ,0          ,192  ,64   ,23        ,6.131          ,6.291               ,0.975 
2048 ,0          ,1920 ,2048 ,23        ,39.89          ,36.853              ,1.082 
2048 ,0          ,1952 ,2048 ,23        ,39.822         ,36.855              ,1.08  
2048 ,0          ,1984 ,2048 ,23        ,41.25          ,39.162              ,1.053 
2048 ,0          ,2    ,1    ,0         ,3.347          ,3.397               ,0.985 
2048 ,0          ,2    ,1    ,23        ,3.325          ,3.385               ,0.982 
2048 ,0          ,2    ,3    ,0         ,3.54           ,3.539               ,1.0   
2048 ,0          ,2    ,3    ,23        ,3.514          ,3.522               ,0.998 
2048 ,0          ,20   ,19   ,0         ,3.47           ,3.462               ,1.002 
2048 ,0          ,20   ,19   ,23        ,3.464          ,3.436               ,1.008 
2048 ,0          ,20   ,21   ,0         ,3.467          ,3.911               ,0.886 
2048 ,0          ,20   ,21   ,23        ,3.493          ,3.865               ,0.904 
2048 ,0          ,2016 ,2048 ,23        ,41.309         ,39.339              ,1.05  
2048 ,0          ,2048 ,1024 ,0         ,24.616         ,24.068              ,1.023 
2048 ,0          ,2048 ,1024 ,23        ,24.546         ,24.028              ,1.022 
2048 ,0          ,2048 ,128  ,0         ,40.366         ,37.99               ,1.063 
2048 ,0          ,2048 ,128  ,23        ,40.404         ,38.013              ,1.063 
2048 ,0          ,2048 ,1664 ,23        ,13.369         ,13.11               ,1.02  
2048 ,0          ,2048 ,1696 ,23        ,13.513         ,13.09               ,1.032 
2048 ,0          ,2048 ,1728 ,23        ,12.621         ,12.286              ,1.027 
2048 ,0          ,2048 ,1760 ,23        ,12.034         ,11.873              ,1.014 
2048 ,0          ,2048 ,1792 ,23        ,11.031         ,10.867              ,1.015 
2048 ,0          ,2048 ,1824 ,23        ,10.976         ,10.866              ,1.01  
2048 ,0          ,2048 ,1856 ,23        ,10.294         ,10.084              ,1.021 
2048 ,0          ,2048 ,1888 ,23        ,6.301          ,6.378               ,0.988 
2048 ,0          ,2048 ,1920 ,23        ,5.694          ,5.726               ,0.994 
2048 ,0          ,2048 ,1952 ,23        ,5.134          ,5.134               ,1.0   
2048 ,0          ,2048 ,1984 ,23        ,4.512          ,4.998               ,0.903 
2048 ,0          ,2048 ,2016 ,23        ,3.279          ,4.353               ,0.753 
2048 ,0          ,2048 ,2048 ,0         ,41.113         ,39.089              ,1.052 
2048 ,0          ,2048 ,2048 ,23        ,40.748         ,38.985              ,1.045 
2048 ,0          ,2048 ,2080 ,23        ,41.151         ,39.17               ,1.051 
2048 ,0          ,2048 ,2112 ,23        ,41.662         ,39.215              ,1.062 
2048 ,0          ,2048 ,2144 ,23        ,40.926         ,39.238              ,1.043 
2048 ,0          ,2048 ,2176 ,23        ,41.05          ,39.135              ,1.049 
2048 ,0          ,2048 ,2208 ,23        ,41.111         ,39.221              ,1.048 
2048 ,0          ,2048 ,2240 ,23        ,41.765         ,39.184              ,1.066 
2048 ,0          ,2048 ,2272 ,23        ,41.716         ,39.238              ,1.063 
2048 ,0          ,2048 ,2304 ,23        ,41.438         ,39.096              ,1.06  
2048 ,0          ,2048 ,2336 ,23        ,41.485         ,39.242              ,1.057 
2048 ,0          ,2048 ,2368 ,23        ,41.24          ,39.093              ,1.055 
2048 ,0          ,2048 ,2400 ,23        ,41.013         ,39.161              ,1.047 
2048 ,0          ,2048 ,2432 ,23        ,41.102         ,39.342              ,1.045 
2048 ,0          ,2048 ,256  ,0         ,38.452         ,35.988              ,1.068 
2048 ,0          ,2048 ,256  ,23        ,38.301         ,35.758              ,1.071 
2048 ,0          ,2048 ,32   ,0         ,41.781         ,38.812              ,1.077 
2048 ,0          ,2048 ,32   ,23        ,44.281         ,39.195              ,1.13  
2048 ,0          ,2048 ,512  ,0         ,34.348         ,31.91               ,1.076 
2048 ,0          ,2048 ,512  ,23        ,33.872         ,32.116              ,1.055 
2048 ,0          ,2048 ,64   ,0         ,40.296         ,38.181              ,1.055 
2048 ,0          ,2048 ,64   ,23        ,40.361         ,38.244              ,1.055 
2048 ,0          ,208  ,16   ,23        ,8.578          ,8.634               ,0.994 
2048 ,0          ,2080 ,2048 ,23        ,4.266          ,5.08                ,0.84  
2048 ,0          ,21   ,20   ,0         ,3.465          ,3.459               ,1.002 
2048 ,0          ,21   ,20   ,23        ,3.428          ,3.424               ,1.001 
2048 ,0          ,21   ,22   ,0         ,3.475          ,3.947               ,0.88  
2048 ,0          ,21   ,22   ,23        ,3.454          ,3.821               ,0.904 
2048 ,0          ,2112 ,2048 ,23        ,4.681          ,5.082               ,0.921 
2048 ,0          ,2144 ,2048 ,23        ,5.191          ,5.178               ,1.002 
2048 ,0          ,2176 ,2048 ,23        ,5.85           ,5.898               ,0.992 
2048 ,0          ,22   ,21   ,0         ,3.452          ,3.454               ,0.999 
2048 ,0          ,22   ,21   ,23        ,3.432          ,3.43                ,1.001 
2048 ,0          ,22   ,23   ,0         ,3.368          ,3.792               ,0.888 
2048 ,0          ,22   ,23   ,23        ,3.393          ,3.863               ,0.878 
2048 ,0          ,2208 ,2048 ,23        ,6.405          ,6.453               ,0.993 
2048 ,0          ,224  ,128  ,23        ,5.058          ,5.126               ,0.987 
2048 ,0          ,224  ,256  ,23        ,8.307          ,8.51                ,0.976 
2048 ,0          ,224  ,32   ,23        ,7.691          ,7.981               ,0.964 
2048 ,0          ,224  ,512  ,23        ,8.364          ,8.613               ,0.971 
2048 ,0          ,224  ,64   ,23        ,6.404          ,6.347               ,1.009 
2048 ,0          ,2240 ,2048 ,23        ,11.093         ,11.172              ,0.993 
2048 ,0          ,2272 ,2048 ,23        ,11.228         ,10.94               ,1.026 
2048 ,0          ,23   ,22   ,0         ,3.428          ,3.419               ,1.002 
2048 ,0          ,23   ,22   ,23        ,3.399          ,3.395               ,1.001 
2048 ,0          ,23   ,24   ,0         ,3.416          ,3.751               ,0.911 
2048 ,0          ,23   ,24   ,23        ,3.385          ,3.793               ,0.892 
2048 ,0          ,2304 ,2048 ,23        ,11.049         ,10.974              ,1.007 
2048 ,0          ,2336 ,2048 ,23        ,11.05          ,10.978              ,1.007 
2048 ,0          ,2368 ,2048 ,23        ,13.475         ,13.076              ,1.031 
2048 ,0          ,24   ,23   ,0         ,3.354          ,3.356               ,0.999 
2048 ,0          ,24   ,23   ,23        ,3.368          ,3.304               ,1.019 
2048 ,0          ,24   ,25   ,0         ,3.33           ,3.767               ,0.884 
2048 ,0          ,24   ,25   ,23        ,3.354          ,3.782               ,0.887 
2048 ,0          ,240  ,16   ,23        ,9.582          ,9.159               ,1.046 
2048 ,0          ,2400 ,2048 ,23        ,13.457         ,13.079              ,1.029 
2048 ,0          ,2432 ,2048 ,23        ,13.477         ,13.159              ,1.024 
2048 ,0          ,25   ,24   ,0         ,3.433          ,3.383               ,1.015 
2048 ,0          ,25   ,24   ,23        ,3.384          ,3.347               ,1.011 
2048 ,0          ,25   ,26   ,0         ,3.314          ,3.723               ,0.89  
2048 ,0          ,25   ,26   ,23        ,3.38           ,3.814               ,0.886 
2048 ,0          ,256  ,1    ,0         ,9.557          ,9.11                ,1.049 
2048 ,0          ,256  ,1    ,23        ,9.566          ,9.122               ,1.049 
2048 ,0          ,256  ,128  ,23        ,5.78           ,5.798               ,0.997 
2048 ,0          ,256  ,160  ,23        ,4.983          ,5.054               ,0.986 
2048 ,0          ,256  ,192  ,23        ,4.45           ,4.911               ,0.906 
2048 ,0          ,256  ,2    ,0         ,9.521          ,9.135               ,1.042 
2048 ,0          ,256  ,2    ,23        ,10.222         ,9.108               ,1.122 
2048 ,0          ,256  ,224  ,23        ,3.245          ,4.215               ,0.77  
2048 ,0          ,256  ,256  ,23        ,9.442          ,8.961               ,1.054 
2048 ,0          ,256  ,288  ,23        ,9.698          ,9.134               ,1.062 
2048 ,0          ,256  ,3    ,0         ,9.608          ,9.093               ,1.057 
2048 ,0          ,256  ,3    ,23        ,9.62           ,9.126               ,1.054 
2048 ,0          ,256  ,32   ,23        ,8.114          ,8.202               ,0.989 
2048 ,0          ,256  ,320  ,23        ,9.752          ,9.123               ,1.069 
2048 ,0          ,256  ,352  ,23        ,10.174         ,9.138               ,1.113 
2048 ,0          ,256  ,384  ,23        ,10.156         ,9.144               ,1.111 
2048 ,0          ,256  ,4    ,0         ,9.474          ,9.091               ,1.042 
2048 ,0          ,256  ,4    ,23        ,9.492          ,9.078               ,1.046 
2048 ,0          ,256  ,416  ,23        ,9.699          ,9.124               ,1.063 
2048 ,0          ,256  ,448  ,23        ,9.847          ,9.108               ,1.081 
2048 ,0          ,256  ,480  ,23        ,9.726          ,9.125               ,1.066 
2048 ,0          ,256  ,5    ,0         ,9.63           ,9.157               ,1.052 
2048 ,0          ,256  ,5    ,23        ,9.593          ,9.153               ,1.048 
2048 ,0          ,256  ,512  ,23        ,10.136         ,9.089               ,1.115 
2048 ,0          ,256  ,544  ,23        ,9.845          ,9.143               ,1.077 
2048 ,0          ,256  ,576  ,23        ,9.788          ,9.088               ,1.077 
2048 ,0          ,256  ,6    ,0         ,9.447          ,9.155               ,1.032 
2048 ,0          ,256  ,6    ,23        ,9.406          ,9.105               ,1.033 
2048 ,0          ,256  ,608  ,23        ,9.896          ,9.094               ,1.088 
2048 ,0          ,256  ,64   ,23        ,7.507          ,7.706               ,0.974 
2048 ,0          ,256  ,640  ,23        ,9.947          ,9.177               ,1.084 
2048 ,0          ,256  ,7    ,0         ,9.53           ,9.15                ,1.042 
2048 ,0          ,256  ,7    ,23        ,9.433          ,9.137               ,1.032 
2048 ,0          ,256  ,96   ,23        ,6.304          ,6.35                ,0.993 
2048 ,0          ,26   ,25   ,0         ,3.338          ,3.339               ,1.0   
2048 ,0          ,26   ,25   ,23        ,3.341          ,3.357               ,0.995 
2048 ,0          ,26   ,27   ,0         ,3.316          ,3.722               ,0.891 
2048 ,0          ,26   ,27   ,23        ,3.386          ,3.82                ,0.886 
2048 ,0          ,27   ,26   ,0         ,3.326          ,3.323               ,1.001 
2048 ,0          ,27   ,26   ,23        ,3.323          ,3.343               ,0.994 
2048 ,0          ,27   ,28   ,0         ,3.317          ,3.714               ,0.893 
2048 ,0          ,27   ,28   ,23        ,3.378          ,3.728               ,0.906 
2048 ,0          ,272  ,16   ,23        ,10.169         ,9.726               ,1.045 
2048 ,0          ,28   ,27   ,0         ,3.346          ,3.348               ,1.0   
2048 ,0          ,28   ,27   ,23        ,3.336          ,3.324               ,1.004 
2048 ,0          ,28   ,29   ,0         ,3.329          ,3.782               ,0.88  
2048 ,0          ,28   ,29   ,23        ,3.382          ,3.731               ,0.907 
2048 ,0          ,288  ,128  ,23        ,6.37           ,6.387               ,0.997 
2048 ,0          ,288  ,256  ,23        ,3.324          ,4.284               ,0.776 
2048 ,0          ,288  ,32   ,23        ,9.742          ,9.09                ,1.072 
2048 ,0          ,288  ,512  ,23        ,10.021         ,9.685               ,1.035 
2048 ,0          ,288  ,64   ,23        ,8.105          ,8.149               ,0.995 
2048 ,0          ,29   ,28   ,0         ,3.366          ,3.347               ,1.006 
2048 ,0          ,29   ,28   ,23        ,3.32           ,3.313               ,1.002 
2048 ,0          ,29   ,30   ,0         ,3.292          ,3.68                ,0.895 
2048 ,0          ,29   ,30   ,23        ,3.5            ,3.946               ,0.887 
2048 ,0          ,3    ,2    ,0         ,3.555          ,3.513               ,1.012 
2048 ,0          ,3    ,2    ,23        ,3.513          ,3.517               ,0.999 
2048 ,0          ,3    ,4    ,0         ,3.51           ,3.695               ,0.95  
2048 ,0          ,3    ,4    ,23        ,3.494          ,3.649               ,0.958 
2048 ,0          ,30   ,29   ,0         ,3.354          ,3.354               ,1.0   
2048 ,0          ,30   ,29   ,23        ,3.299          ,3.358               ,0.982 
2048 ,0          ,30   ,31   ,0         ,3.314          ,3.982               ,0.832 
2048 ,0          ,30   ,31   ,23        ,3.386          ,3.756               ,0.902 
2048 ,0          ,304  ,16   ,23        ,12.566         ,12.45               ,1.009 
2048 ,0          ,31   ,30   ,0         ,3.424          ,3.352               ,1.022 
2048 ,0          ,31   ,30   ,23        ,3.451          ,3.35                ,1.03  
2048 ,0          ,32   ,128  ,23        ,3.298          ,3.807               ,0.866 
2048 ,0          ,32   ,160  ,23        ,3.226          ,3.7                 ,0.872 
2048 ,0          ,32   ,192  ,23        ,3.233          ,3.874               ,0.835 
2048 ,0          ,32   ,224  ,23        ,3.216          ,3.834               ,0.839 
2048 ,0          ,32   ,256  ,23        ,3.217          ,3.917               ,0.821 
2048 ,0          ,32   ,288  ,23        ,3.211          ,3.955               ,0.812 
2048 ,0          ,32   ,31   ,0         ,3.367          ,3.354               ,1.004 
2048 ,0          ,32   ,31   ,23        ,3.273          ,3.27                ,1.001 
2048 ,0          ,32   ,32   ,23        ,3.195          ,3.608               ,0.886 
2048 ,0          ,32   ,320  ,23        ,3.208          ,3.777               ,0.849 
2048 ,0          ,32   ,352  ,23        ,3.276          ,3.706               ,0.884 
2048 ,0          ,32   ,384  ,23        ,3.24           ,3.807               ,0.851 
2048 ,0          ,32   ,416  ,23        ,3.263          ,3.978               ,0.82  
2048 ,0          ,32   ,64   ,23        ,3.202          ,3.68                ,0.87  
2048 ,0          ,32   ,96   ,23        ,3.202          ,3.642               ,0.879 
2048 ,0          ,320  ,128  ,23        ,11.079         ,11.024              ,1.005 
2048 ,0          ,320  ,256  ,23        ,4.612          ,5.026               ,0.918 
2048 ,0          ,320  ,32   ,23        ,12.287         ,11.796              ,1.042 
2048 ,0          ,320  ,512  ,23        ,12.521         ,12.475              ,1.004 
2048 ,0          ,320  ,64   ,23        ,11.324         ,11.215              ,1.01  
2048 ,0          ,336  ,16   ,23        ,12.675         ,12.431              ,1.02  
2048 ,0          ,352  ,128  ,23        ,11.058         ,10.951              ,1.01  
2048 ,0          ,352  ,256  ,23        ,5.098          ,5.259               ,0.969 
2048 ,0          ,352  ,32   ,23        ,12.217         ,11.745              ,1.04  
2048 ,0          ,352  ,512  ,23        ,12.368         ,12.38               ,0.999 
2048 ,0          ,352  ,64   ,23        ,11.218         ,11.156              ,1.006 
2048 ,0          ,368  ,16   ,23        ,12.499         ,12.318              ,1.015 
2048 ,0          ,3712 ,4096 ,23        ,69.547         ,65.974              ,1.054 
2048 ,0          ,3744 ,4096 ,23        ,68.813         ,66.111              ,1.041 
2048 ,0          ,3776 ,4096 ,23        ,70.905         ,67.931              ,1.044 
2048 ,0          ,3808 ,4096 ,23        ,70.629         ,68.028              ,1.038 
2048 ,0          ,384  ,128  ,23        ,10.984         ,10.882              ,1.009 
2048 ,0          ,384  ,256  ,23        ,5.882          ,5.737               ,1.025 
2048 ,0          ,384  ,32   ,23        ,12.254         ,11.768              ,1.041 
2048 ,0          ,384  ,512  ,23        ,12.311         ,12.463              ,0.988 
2048 ,0          ,384  ,64   ,23        ,11.217         ,11.143              ,1.007 
2048 ,0          ,3840 ,4096 ,23        ,70.818         ,68.015              ,1.041 
2048 ,0          ,3872 ,4096 ,23        ,70.869         ,67.838              ,1.045 
2048 ,0          ,3904 ,4096 ,23        ,72.605         ,70.191              ,1.034 
2048 ,0          ,3936 ,4096 ,23        ,72.871         ,69.977              ,1.041 
2048 ,0          ,3968 ,4096 ,23        ,72.537         ,69.94               ,1.037 
2048 ,0          ,4    ,3    ,0         ,3.498          ,3.496               ,1.001 
2048 ,0          ,4    ,3    ,23        ,3.505          ,3.505               ,1.0   
2048 ,0          ,4    ,5    ,0         ,3.507          ,3.823               ,0.917 
2048 ,0          ,4    ,5    ,23        ,3.505          ,3.828               ,0.916 
2048 ,0          ,400  ,16   ,23        ,12.609         ,12.471              ,1.011 
2048 ,0          ,4000 ,4096 ,23        ,72.721         ,69.948              ,1.04  
2048 ,0          ,4032 ,4096 ,23        ,74.811         ,74.172              ,1.009 
2048 ,0          ,4064 ,4096 ,23        ,74.907         ,78.091              ,0.959 
2048 ,0          ,4096 ,3712 ,23        ,13.421         ,13.055              ,1.028 
2048 ,0          ,4096 ,3744 ,23        ,13.497         ,13.068              ,1.033 
2048 ,0          ,4096 ,3776 ,23        ,12.587         ,12.226              ,1.03  
2048 ,0          ,4096 ,3808 ,23        ,12.001         ,11.922              ,1.007 
2048 ,0          ,4096 ,3840 ,23        ,11.009         ,10.852              ,1.014 
2048 ,0          ,4096 ,3872 ,23        ,10.927         ,10.83               ,1.009 
2048 ,0          ,4096 ,3904 ,23        ,10.247         ,10.151              ,1.009 
2048 ,0          ,4096 ,3936 ,23        ,6.239          ,6.264               ,0.996 
2048 ,0          ,4096 ,3968 ,23        ,5.744          ,5.73                ,1.002 
2048 ,0          ,4096 ,4000 ,23        ,5.037          ,5.141               ,0.98  
2048 ,0          ,4096 ,4032 ,23        ,4.491          ,5.024               ,0.894 
2048 ,0          ,4096 ,4064 ,23        ,3.288          ,4.337               ,0.758 
2048 ,0          ,4096 ,4096 ,23        ,73.587         ,80.876              ,0.91  
2048 ,0          ,4096 ,4128 ,23        ,74.836         ,79.569              ,0.941 
2048 ,0          ,4096 ,4160 ,23        ,75.276         ,75.78               ,0.993 
2048 ,0          ,4096 ,4192 ,23        ,74.477         ,73.404              ,1.015 
2048 ,0          ,4096 ,4224 ,23        ,74.674         ,73.53               ,1.016 
2048 ,0          ,4096 ,4256 ,23        ,74.532         ,72.368              ,1.03  
2048 ,0          ,4096 ,4288 ,23        ,75.913         ,74.696              ,1.016 
2048 ,0          ,4096 ,4320 ,23        ,75.323         ,72.431              ,1.04  
2048 ,0          ,4096 ,4352 ,23        ,75.365         ,72.235              ,1.043 
2048 ,0          ,4096 ,4384 ,23        ,75.579         ,72.38               ,1.044 
2048 ,0          ,4096 ,4416 ,23        ,74.715         ,75.075              ,0.995 
2048 ,0          ,4096 ,4448 ,23        ,75.601         ,72.276              ,1.046 
2048 ,0          ,4096 ,4480 ,23        ,76.736         ,72.28               ,1.062 
2048 ,0          ,4128 ,4096 ,23        ,3.325          ,4.416               ,0.753 
2048 ,0          ,416  ,128  ,23        ,11.002         ,10.809              ,1.018 
2048 ,0          ,416  ,256  ,23        ,6.326          ,6.335               ,0.998 
2048 ,0          ,416  ,32   ,23        ,12.198         ,11.675              ,1.045 
2048 ,0          ,416  ,512  ,23        ,12.428         ,12.37               ,1.005 
2048 ,0          ,416  ,64   ,23        ,11.239         ,11.137              ,1.009 
2048 ,0          ,4160 ,4096 ,23        ,4.578          ,4.997               ,0.916 
2048 ,0          ,4192 ,4096 ,23        ,5.184          ,5.224               ,0.992 
2048 ,0          ,4224 ,4096 ,23        ,5.819          ,5.943               ,0.979 
2048 ,0          ,4256 ,4096 ,23        ,6.387          ,6.474               ,0.987 
2048 ,0          ,4288 ,4096 ,23        ,11.182         ,11.091              ,1.008 
2048 ,0          ,4320 ,4096 ,23        ,11.204         ,10.984              ,1.02  
2048 ,0          ,4352 ,4096 ,23        ,11.098         ,10.91               ,1.017 
2048 ,0          ,4384 ,4096 ,23        ,11.135         ,10.967              ,1.015 
2048 ,0          ,4416 ,4096 ,23        ,13.484         ,13.129              ,1.027 
2048 ,0          ,4448 ,4096 ,23        ,13.574         ,13.103              ,1.036 
2048 ,0          ,448  ,128  ,23        ,13.461         ,13.419              ,1.003 
2048 ,0          ,448  ,256  ,23        ,11.068         ,10.954              ,1.01  
2048 ,0          ,448  ,512  ,23        ,14.931         ,14.882              ,1.003 
2048 ,0          ,448  ,64   ,23        ,13.577         ,13.345              ,1.017 
2048 ,0          ,4480 ,4096 ,23        ,13.469         ,13.065              ,1.031 
2048 ,0          ,48   ,16   ,23        ,3.334          ,4.441               ,0.751 
2048 ,0          ,480  ,128  ,23        ,13.53          ,13.207              ,1.024 
2048 ,0          ,480  ,256  ,23        ,11.008         ,10.949              ,1.005 
2048 ,0          ,480  ,512  ,23        ,14.727         ,14.598              ,1.009 
2048 ,0          ,5    ,4    ,0         ,3.479          ,3.483               ,0.999 
2048 ,0          ,5    ,4    ,23        ,3.479          ,3.503               ,0.993 
2048 ,0          ,5    ,6    ,0         ,3.51           ,3.804               ,0.923 
2048 ,0          ,5    ,6    ,23        ,3.52           ,3.899               ,0.903 
2048 ,0          ,512  ,128  ,23        ,13.463         ,13.158              ,1.023 
2048 ,0          ,512  ,160  ,23        ,13.623         ,13.196              ,1.032 
2048 ,0          ,512  ,192  ,23        ,12.537         ,12.293              ,1.02  
2048 ,0          ,512  ,224  ,23        ,12.0           ,11.972              ,1.002 
2048 ,0          ,512  ,256  ,23        ,11.087         ,10.935              ,1.014 
2048 ,0          ,512  ,288  ,23        ,11.027         ,10.97               ,1.005 
2048 ,0          ,512  ,320  ,23        ,10.138         ,10.152              ,0.999 
2048 ,0          ,512  ,352  ,23        ,6.265          ,6.34                ,0.988 
2048 ,0          ,512  ,384  ,23        ,5.69           ,5.608               ,1.015 
2048 ,0          ,512  ,416  ,23        ,5.028          ,5.157               ,0.975 
2048 ,0          ,512  ,448  ,23        ,4.797          ,5.683               ,0.844 
2048 ,0          ,512  ,480  ,23        ,3.233          ,4.224               ,0.766 
2048 ,0          ,512  ,512  ,23        ,14.551         ,14.522              ,1.002 
2048 ,0          ,512  ,544  ,23        ,14.772         ,14.737              ,1.002 
2048 ,0          ,512  ,576  ,23        ,14.667         ,14.562              ,1.007 
2048 ,0          ,512  ,608  ,23        ,14.682         ,14.617              ,1.004 
2048 ,0          ,512  ,640  ,23        ,14.661         ,14.6                ,1.004 
2048 ,0          ,512  ,672  ,23        ,14.665         ,14.656              ,1.001 
2048 ,0          ,512  ,704  ,23        ,14.665         ,14.687              ,0.999 
2048 ,0          ,512  ,736  ,23        ,14.731         ,14.601              ,1.009 
2048 ,0          ,512  ,768  ,23        ,14.897         ,14.6                ,1.02  
2048 ,0          ,512  ,800  ,23        ,14.785         ,14.65               ,1.009 
2048 ,0          ,512  ,832  ,23        ,14.891         ,14.669              ,1.015 
2048 ,0          ,512  ,864  ,23        ,14.849         ,14.615              ,1.016 
2048 ,0          ,512  ,896  ,23        ,14.72          ,14.634              ,1.006 
2048 ,0          ,544  ,256  ,23        ,10.996         ,10.865              ,1.012 
2048 ,0          ,544  ,512  ,23        ,3.248          ,4.276               ,0.759 
2048 ,0          ,576  ,256  ,23        ,13.439         ,13.071              ,1.028 
2048 ,0          ,576  ,512  ,23        ,4.626          ,4.94                ,0.937 
2048 ,0          ,6    ,5    ,0         ,3.481          ,3.479               ,1.001 
2048 ,0          ,6    ,5    ,23        ,3.47           ,3.492               ,0.994 
2048 ,0          ,6    ,7    ,0         ,3.501          ,3.774               ,0.928 
2048 ,0          ,6    ,7    ,23        ,3.505          ,3.778               ,0.928 
2048 ,0          ,608  ,256  ,23        ,13.441         ,13.094              ,1.027 
2048 ,0          ,608  ,512  ,23        ,5.086          ,5.049               ,1.007 
2048 ,0          ,64   ,128  ,23        ,4.526          ,5.673               ,0.798 
2048 ,0          ,64   ,160  ,23        ,5.108          ,5.749               ,0.888 
2048 ,0          ,64   ,192  ,23        ,4.403          ,5.781               ,0.762 
2048 ,0          ,64   ,224  ,23        ,4.082          ,5.526               ,0.739 
2048 ,0          ,64   ,256  ,23        ,4.87           ,5.732               ,0.85  
2048 ,0          ,64   ,288  ,23        ,4.263          ,5.594               ,0.762 
2048 ,0          ,64   ,32   ,23        ,3.25           ,4.277               ,0.76  
2048 ,0          ,64   ,320  ,23        ,4.159          ,5.667               ,0.734 
2048 ,0          ,64   ,352  ,23        ,4.306          ,5.558               ,0.775 
2048 ,0          ,64   ,384  ,23        ,4.127          ,5.556               ,0.743 
2048 ,0          ,64   ,416  ,23        ,3.993          ,5.581               ,0.715 
2048 ,0          ,64   ,448  ,23        ,4.039          ,5.602               ,0.721 
2048 ,0          ,64   ,64   ,23        ,3.931          ,5.5                 ,0.715 
2048 ,0          ,64   ,96   ,23        ,4.169          ,5.622               ,0.742 
2048 ,0          ,640  ,1024 ,23        ,18.071         ,16.744              ,1.079 
2048 ,0          ,640  ,256  ,23        ,13.471         ,13.237              ,1.018 
2048 ,0          ,640  ,512  ,23        ,5.816          ,5.886               ,0.988 
2048 ,0          ,672  ,1024 ,23        ,17.95          ,16.844              ,1.066 
2048 ,0          ,672  ,512  ,23        ,6.391          ,6.386               ,1.001 
2048 ,0          ,7    ,6    ,0         ,3.552          ,3.481               ,1.02  
2048 ,0          ,7    ,6    ,23        ,3.551          ,3.49                ,1.017 
2048 ,0          ,7    ,8    ,0         ,3.504          ,3.815               ,0.919 
2048 ,0          ,7    ,8    ,23        ,3.51           ,3.842               ,0.914 
2048 ,0          ,704  ,1024 ,23        ,20.085         ,18.85               ,1.066 
2048 ,0          ,704  ,512  ,23        ,11.283         ,11.057              ,1.02  
2048 ,0          ,736  ,1024 ,23        ,19.965         ,18.856              ,1.059 
2048 ,0          ,736  ,512  ,23        ,11.185         ,10.965              ,1.02  
2048 ,0          ,768  ,1024 ,23        ,19.936         ,18.907              ,1.054 
2048 ,0          ,768  ,512  ,23        ,11.171         ,11.031              ,1.013 
2048 ,0          ,8    ,7    ,0         ,3.479          ,3.479               ,1.0   
2048 ,0          ,8    ,7    ,23        ,3.48           ,3.496               ,0.995 
2048 ,0          ,8    ,9    ,0         ,3.513          ,3.913               ,0.898 
2048 ,0          ,8    ,9    ,23        ,3.496          ,3.87                ,0.903 
2048 ,0          ,80   ,16   ,23        ,5.21           ,5.703               ,0.914 
2048 ,0          ,800  ,1024 ,23        ,19.938         ,18.905              ,1.055 
2048 ,0          ,800  ,512  ,23        ,11.05          ,10.895              ,1.014 
2048 ,0          ,832  ,1024 ,23        ,22.375         ,21.231              ,1.054 
2048 ,0          ,832  ,512  ,23        ,13.52          ,13.105              ,1.032 
2048 ,0          ,864  ,1024 ,23        ,22.656         ,21.233              ,1.067 
2048 ,0          ,864  ,512  ,23        ,13.516         ,13.191              ,1.025 
2048 ,0          ,896  ,1024 ,23        ,22.215         ,21.336              ,1.041 
2048 ,0          ,896  ,512  ,23        ,13.443         ,13.154              ,1.022 
2048 ,0          ,9    ,10   ,0         ,3.517          ,3.797               ,0.926 
2048 ,0          ,9    ,10   ,23        ,3.518          ,3.848               ,0.914 
2048 ,0          ,9    ,8    ,0         ,3.496          ,3.499               ,0.999 
2048 ,0          ,9    ,8    ,23        ,3.453          ,3.554               ,0.971 
2048 ,0          ,928  ,1024 ,23        ,22.273         ,21.297              ,1.046 
2048 ,0          ,96   ,128  ,23        ,4.731          ,5.592               ,0.846 
2048 ,0          ,96   ,256  ,23        ,4.71           ,5.554               ,0.848 
2048 ,0          ,96   ,32   ,23        ,4.409          ,5.313               ,0.83  
2048 ,0          ,96   ,64   ,23        ,3.207          ,4.273               ,0.75  
2048 ,0          ,960  ,1024 ,23        ,23.981         ,23.402              ,1.025 
2048 ,0          ,992  ,1024 ,23        ,24.358         ,23.441              ,1.039 
2048 ,1          ,0    ,1    ,0         ,3.437          ,3.442               ,0.999 
2048 ,1          ,0    ,1    ,23        ,3.445          ,3.436               ,1.003 
2048 ,1          ,1    ,2    ,0         ,3.513          ,3.513               ,1.0   
2048 ,1          ,1    ,2    ,23        ,3.496          ,3.496               ,1.0   
2048 ,1          ,10   ,11   ,0         ,3.501          ,3.76                ,0.931 
2048 ,1          ,10   ,11   ,23        ,3.455          ,3.75                ,0.921 
2048 ,1          ,10   ,9    ,0         ,3.455          ,3.751               ,0.921 
2048 ,1          ,10   ,9    ,23        ,3.471          ,3.815               ,0.91  
2048 ,1          ,1024 ,1024 ,23        ,23.681         ,23.172              ,1.022 
2048 ,1          ,1024 ,1056 ,23        ,24.018         ,23.409              ,1.026 
2048 ,1          ,1024 ,1088 ,23        ,23.873         ,23.464              ,1.017 
2048 ,1          ,1024 ,1120 ,23        ,23.888         ,23.422              ,1.02  
2048 ,1          ,1024 ,1152 ,23        ,24.371         ,23.484              ,1.038 
2048 ,1          ,1024 ,1184 ,23        ,23.848         ,23.405              ,1.019 
2048 ,1          ,1024 ,1216 ,23        ,24.043         ,23.451              ,1.025 
2048 ,1          ,1024 ,1248 ,23        ,24.004         ,23.38               ,1.027 
2048 ,1          ,1024 ,1280 ,23        ,23.93          ,23.469              ,1.02  
2048 ,1          ,1024 ,1312 ,23        ,24.036         ,23.463              ,1.024 
2048 ,1          ,1024 ,1344 ,23        ,23.917         ,23.598              ,1.013 
2048 ,1          ,1024 ,1376 ,23        ,24.0           ,23.448              ,1.024 
2048 ,1          ,1024 ,1408 ,23        ,23.797         ,23.41               ,1.017 
2048 ,1          ,1024 ,640  ,23        ,19.256         ,17.932              ,1.074 
2048 ,1          ,1024 ,672  ,23        ,19.274         ,18.702              ,1.031 
2048 ,1          ,1024 ,704  ,23        ,19.583         ,19.016              ,1.03  
2048 ,1          ,1024 ,736  ,23        ,21.657         ,21.181              ,1.022 
2048 ,1          ,1024 ,768  ,23        ,21.382         ,20.596              ,1.038 
2048 ,1          ,1024 ,800  ,23        ,21.327         ,20.988              ,1.016 
2048 ,1          ,1024 ,832  ,23        ,22.794         ,21.178              ,1.076 
2048 ,1          ,1024 ,864  ,23        ,25.276         ,23.911              ,1.057 
2048 ,1          ,1024 ,896  ,23        ,23.175         ,22.452              ,1.032 
2048 ,1          ,1024 ,928  ,23        ,22.915         ,21.791              ,1.052 
2048 ,1          ,1024 ,960  ,23        ,22.993         ,22.289              ,1.032 
2048 ,1          ,1024 ,992  ,23        ,24.477         ,22.764              ,1.075 
2048 ,1          ,1056 ,1024 ,23        ,24.882         ,22.996              ,1.082 
2048 ,1          ,1088 ,1024 ,23        ,25.201         ,24.201              ,1.041 
2048 ,1          ,11   ,10   ,0         ,3.474          ,3.821               ,0.909 
2048 ,1          ,11   ,10   ,23        ,3.48           ,3.84                ,0.906 
2048 ,1          ,11   ,12   ,0         ,3.483          ,3.737               ,0.932 
2048 ,1          ,11   ,12   ,23        ,3.478          ,3.723               ,0.934 
2048 ,1          ,112  ,16   ,23        ,3.405          ,4.408               ,0.772 
2048 ,1          ,1120 ,1024 ,23        ,24.984         ,23.963              ,1.043 
2048 ,1          ,1152 ,1024 ,23        ,25.071         ,24.232              ,1.035 
2048 ,1          ,1184 ,1024 ,23        ,24.877         ,23.927              ,1.04  
2048 ,1          ,12   ,11   ,0         ,3.47           ,3.782               ,0.918 
2048 ,1          ,12   ,11   ,23        ,3.462          ,3.786               ,0.914 
2048 ,1          ,12   ,13   ,0         ,3.453          ,3.751               ,0.921 
2048 ,1          ,12   ,13   ,23        ,3.447          ,3.764               ,0.916 
2048 ,1          ,1216 ,1024 ,23        ,26.061         ,25.352              ,1.028 
2048 ,1          ,1248 ,1024 ,23        ,25.572         ,24.964              ,1.024 
2048 ,1          ,128  ,128  ,23        ,5.537          ,6.198               ,0.893 
2048 ,1          ,128  ,160  ,23        ,5.578          ,6.506               ,0.857 
2048 ,1          ,128  ,192  ,23        ,6.237          ,6.392               ,0.976 
2048 ,1          ,128  ,224  ,23        ,5.869          ,6.69                ,0.877 
2048 ,1          ,128  ,256  ,23        ,5.857          ,6.772               ,0.865 
2048 ,1          ,128  ,288  ,23        ,5.87           ,6.417               ,0.915 
2048 ,1          ,128  ,32   ,23        ,3.301          ,4.298               ,0.768 
2048 ,1          ,128  ,320  ,23        ,5.815          ,6.36                ,0.914 
2048 ,1          ,128  ,352  ,23        ,5.673          ,6.415               ,0.884 
2048 ,1          ,128  ,384  ,23        ,5.8            ,6.415               ,0.904 
2048 ,1          ,128  ,416  ,23        ,5.674          ,6.395               ,0.887 
2048 ,1          ,128  ,448  ,23        ,5.755          ,6.428               ,0.895 
2048 ,1          ,128  ,480  ,23        ,5.657          ,6.361               ,0.889 
2048 ,1          ,128  ,512  ,23        ,5.611          ,6.36                ,0.882 
2048 ,1          ,128  ,64   ,23        ,4.46           ,4.934               ,0.904 
2048 ,1          ,128  ,96   ,23        ,5.021          ,4.998               ,1.005 
2048 ,1          ,1280 ,1024 ,23        ,24.908         ,23.976              ,1.039 
2048 ,1          ,13   ,12   ,0         ,3.444          ,3.755               ,0.917 
2048 ,1          ,13   ,12   ,23        ,3.472          ,3.812               ,0.911 
2048 ,1          ,13   ,14   ,0         ,3.473          ,3.816               ,0.91  
2048 ,1          ,13   ,14   ,23        ,3.434          ,3.802               ,0.903 
2048 ,1          ,1312 ,1024 ,23        ,24.608         ,23.985              ,1.026 
2048 ,1          ,1344 ,1024 ,23        ,25.881         ,25.416              ,1.018 
2048 ,1          ,1376 ,1024 ,23        ,25.645         ,25.013              ,1.025 
2048 ,1          ,14   ,13   ,0         ,3.45           ,3.834               ,0.9   
2048 ,1          ,14   ,13   ,23        ,3.469          ,3.79                ,0.915 
2048 ,1          ,14   ,15   ,0         ,3.435          ,3.776               ,0.91  
2048 ,1          ,14   ,15   ,23        ,3.438          ,3.738               ,0.92  
2048 ,1          ,1408 ,1024 ,23        ,24.834         ,23.943              ,1.037 
2048 ,1          ,144  ,16   ,23        ,3.548          ,4.576               ,0.775 
2048 ,1          ,15   ,14   ,0         ,3.438          ,3.866               ,0.889 
2048 ,1          ,15   ,14   ,23        ,3.436          ,3.798               ,0.905 
2048 ,1          ,15   ,16   ,0         ,3.433          ,3.839               ,0.894 
2048 ,1          ,15   ,16   ,23        ,3.43           ,3.778               ,0.908 
2048 ,1          ,16   ,112  ,23        ,3.391          ,3.774               ,0.899 
2048 ,1          ,16   ,144  ,23        ,3.386          ,3.972               ,0.853 
2048 ,1          ,16   ,15   ,0         ,3.447          ,3.804               ,0.906 
2048 ,1          ,16   ,15   ,23        ,3.44           ,3.795               ,0.906 
2048 ,1          ,16   ,16   ,23        ,3.315          ,3.632               ,0.913 
2048 ,1          ,16   ,17   ,0         ,3.422          ,3.758               ,0.911 
2048 ,1          ,16   ,17   ,23        ,3.419          ,3.765               ,0.908 
2048 ,1          ,16   ,176  ,23        ,3.459          ,3.786               ,0.914 
2048 ,1          ,16   ,208  ,23        ,3.413          ,4.137               ,0.825 
2048 ,1          ,16   ,240  ,23        ,3.395          ,3.752               ,0.905 
2048 ,1          ,16   ,272  ,23        ,3.425          ,3.817               ,0.897 
2048 ,1          ,16   ,304  ,23        ,3.424          ,3.723               ,0.92  
2048 ,1          ,16   ,336  ,23        ,3.419          ,3.784               ,0.904 
2048 ,1          ,16   ,368  ,23        ,3.419          ,3.736               ,0.915 
2048 ,1          ,16   ,400  ,23        ,3.538          ,4.046               ,0.874 
2048 ,1          ,16   ,48   ,23        ,3.467          ,3.696               ,0.938 
2048 ,1          ,16   ,80   ,23        ,3.403          ,4.171               ,0.816 
2048 ,1          ,160  ,128  ,23        ,5.868          ,6.426               ,0.913 
2048 ,1          ,160  ,256  ,23        ,6.337          ,6.906               ,0.918 
2048 ,1          ,160  ,32   ,23        ,3.302          ,4.334               ,0.762 
2048 ,1          ,160  ,512  ,23        ,6.306          ,6.793               ,0.928 
2048 ,1          ,160  ,64   ,23        ,4.555          ,4.97                ,0.916 
2048 ,1          ,1664 ,2048 ,23        ,34.862         ,33.197              ,1.05  
2048 ,1          ,1696 ,2048 ,23        ,35.439         ,32.938              ,1.076 
2048 ,1          ,17   ,16   ,0         ,3.435          ,3.868               ,0.888 
2048 ,1          ,17   ,16   ,23        ,3.464          ,3.803               ,0.911 
2048 ,1          ,17   ,18   ,0         ,3.421          ,3.797               ,0.901 
2048 ,1          ,17   ,18   ,23        ,3.428          ,3.745               ,0.915 
2048 ,1          ,1728 ,2048 ,23        ,37.457         ,34.958              ,1.072 
2048 ,1          ,176  ,16   ,23        ,3.403          ,4.396               ,0.774 
2048 ,1          ,1760 ,2048 ,23        ,37.347         ,35.059              ,1.065 
2048 ,1          ,1792 ,2048 ,23        ,37.752         ,34.841              ,1.084 
2048 ,1          ,18   ,17   ,0         ,3.422          ,3.802               ,0.9   
2048 ,1          ,18   ,17   ,23        ,3.434          ,3.789               ,0.906 
2048 ,1          ,18   ,19   ,0         ,3.415          ,3.846               ,0.888 
2048 ,1          ,18   ,19   ,23        ,3.404          ,3.748               ,0.908 
2048 ,1          ,1824 ,2048 ,23        ,37.784         ,35.07               ,1.077 
2048 ,1          ,1856 ,2048 ,23        ,39.798         ,37.007              ,1.075 
2048 ,1          ,1888 ,2048 ,23        ,40.312         ,36.903              ,1.092 
2048 ,1          ,19   ,18   ,0         ,3.411          ,3.741               ,0.912 
2048 ,1          ,19   ,18   ,23        ,3.469          ,3.748               ,0.925 
2048 ,1          ,19   ,20   ,0         ,3.461          ,3.876               ,0.893 
2048 ,1          ,19   ,20   ,23        ,3.405          ,3.693               ,0.922 
2048 ,1          ,192  ,128  ,23        ,5.978          ,5.848               ,1.022 
2048 ,1          ,192  ,256  ,23        ,7.589          ,7.992               ,0.949 
2048 ,1          ,192  ,32   ,23        ,3.262          ,4.205               ,0.776 
2048 ,1          ,192  ,512  ,23        ,7.709          ,7.981               ,0.966 
2048 ,1          ,192  ,64   ,23        ,4.555          ,5.075               ,0.898 
2048 ,1          ,1920 ,2048 ,23        ,39.793         ,36.902              ,1.078 
2048 ,1          ,1952 ,2048 ,23        ,41.486         ,44.016              ,0.943 
2048 ,1          ,1984 ,2048 ,23        ,41.077         ,39.209              ,1.048 
2048 ,1          ,2    ,1    ,0         ,3.371          ,3.351               ,1.006 
2048 ,1          ,2    ,1    ,23        ,3.308          ,3.327               ,0.994 
2048 ,1          ,2    ,3    ,0         ,3.485          ,3.483               ,1.001 
2048 ,1          ,2    ,3    ,23        ,3.506          ,3.482               ,1.007 
2048 ,1          ,20   ,19   ,0         ,3.406          ,3.744               ,0.91  
2048 ,1          ,20   ,19   ,23        ,3.413          ,3.812               ,0.895 
2048 ,1          ,20   ,21   ,0         ,3.411          ,3.784               ,0.901 
2048 ,1          ,20   ,21   ,23        ,3.411          ,3.793               ,0.899 
2048 ,1          ,2016 ,2048 ,23        ,41.249         ,39.02               ,1.057 
2048 ,1          ,2048 ,1024 ,0         ,25.076         ,24.999              ,1.003 
2048 ,1          ,2048 ,1024 ,23        ,25.215         ,23.937              ,1.053 
2048 ,1          ,2048 ,128  ,0         ,5.711          ,5.729               ,0.997 
2048 ,1          ,2048 ,128  ,23        ,5.856          ,5.822               ,1.006 
2048 ,1          ,2048 ,1664 ,23        ,36.215         ,33.796              ,1.072 
2048 ,1          ,2048 ,1696 ,23        ,36.431         ,34.857              ,1.045 
2048 ,1          ,2048 ,1728 ,23        ,36.928         ,35.063              ,1.053 
2048 ,1          ,2048 ,1760 ,23        ,38.583         ,35.859              ,1.076 
2048 ,1          ,2048 ,1792 ,23        ,38.764         ,35.756              ,1.084 
2048 ,1          ,2048 ,1824 ,23        ,38.618         ,36.792              ,1.05  
2048 ,1          ,2048 ,1856 ,23        ,38.976         ,37.025              ,1.053 
2048 ,1          ,2048 ,1888 ,23        ,40.443         ,37.942              ,1.066 
2048 ,1          ,2048 ,1920 ,23        ,40.582         ,38.171              ,1.063 
2048 ,1          ,2048 ,1952 ,23        ,40.151         ,37.89               ,1.06  
2048 ,1          ,2048 ,1984 ,23        ,40.186         ,38.162              ,1.053 
2048 ,1          ,2048 ,2016 ,23        ,41.892         ,38.985              ,1.075 
2048 ,1          ,2048 ,2048 ,0         ,41.081         ,39.247              ,1.047 
2048 ,1          ,2048 ,2048 ,23        ,40.686         ,38.893              ,1.046 
2048 ,1          ,2048 ,2080 ,23        ,41.042         ,39.398              ,1.042 
2048 ,1          ,2048 ,2112 ,23        ,40.904         ,39.153              ,1.045 
2048 ,1          ,2048 ,2144 ,23        ,40.781         ,39.152              ,1.042 
2048 ,1          ,2048 ,2176 ,23        ,40.768         ,39.1                ,1.043 
2048 ,1          ,2048 ,2208 ,23        ,41.123         ,39.144              ,1.051 
2048 ,1          ,2048 ,2240 ,23        ,40.927         ,39.255              ,1.043 
2048 ,1          ,2048 ,2272 ,23        ,41.538         ,39.136              ,1.061 
2048 ,1          ,2048 ,2304 ,23        ,41.543         ,38.997              ,1.065 
2048 ,1          ,2048 ,2336 ,23        ,41.158         ,39.195              ,1.05  
2048 ,1          ,2048 ,2368 ,23        ,41.253         ,39.028              ,1.057 
2048 ,1          ,2048 ,2400 ,23        ,40.961         ,39.309              ,1.042 
2048 ,1          ,2048 ,2432 ,23        ,41.071         ,39.172              ,1.048 
2048 ,1          ,2048 ,256  ,0         ,10.915         ,10.883              ,1.003 
2048 ,1          ,2048 ,256  ,23        ,11.075         ,10.971              ,1.01  
2048 ,1          ,2048 ,32   ,0         ,3.208          ,4.253               ,0.754 
2048 ,1          ,2048 ,32   ,23        ,3.279          ,4.314               ,0.76  
2048 ,1          ,2048 ,512  ,0         ,16.181         ,15.286              ,1.059 
2048 ,1          ,2048 ,512  ,23        ,16.205         ,15.351              ,1.056 
2048 ,1          ,2048 ,64   ,0         ,4.482          ,5.009               ,0.895 
2048 ,1          ,2048 ,64   ,23        ,4.603          ,5.025               ,0.916 
2048 ,1          ,208  ,16   ,23        ,3.445          ,4.426               ,0.778 
2048 ,1          ,2080 ,2048 ,23        ,41.699         ,40.366              ,1.033 
2048 ,1          ,21   ,20   ,0         ,3.404          ,3.711               ,0.917 
2048 ,1          ,21   ,20   ,23        ,3.391          ,3.749               ,0.905 
2048 ,1          ,21   ,22   ,0         ,3.384          ,3.796               ,0.892 
2048 ,1          ,21   ,22   ,23        ,3.396          ,3.748               ,0.906 
2048 ,1          ,2112 ,2048 ,23        ,42.686         ,43.257              ,0.987 
2048 ,1          ,2144 ,2048 ,23        ,42.54          ,41.21               ,1.032 
2048 ,1          ,2176 ,2048 ,23        ,43.195         ,40.169              ,1.075 
2048 ,1          ,22   ,21   ,0         ,3.416          ,3.734               ,0.915 
2048 ,1          ,22   ,21   ,23        ,3.521          ,3.868               ,0.91  
2048 ,1          ,22   ,23   ,0         ,3.317          ,3.729               ,0.89  
2048 ,1          ,22   ,23   ,23        ,3.334          ,3.767               ,0.885 
2048 ,1          ,2208 ,2048 ,23        ,42.439         ,40.008              ,1.061 
2048 ,1          ,224  ,128  ,23        ,5.957          ,5.827               ,1.022 
2048 ,1          ,224  ,256  ,23        ,8.545          ,8.508               ,1.004 
2048 ,1          ,224  ,32   ,23        ,3.207          ,4.26                ,0.753 
2048 ,1          ,224  ,512  ,23        ,8.343          ,8.497               ,0.982 
2048 ,1          ,224  ,64   ,23        ,4.448          ,4.96                ,0.897 
2048 ,1          ,2240 ,2048 ,23        ,43.571         ,45.058              ,0.967 
2048 ,1          ,2272 ,2048 ,23        ,42.912         ,42.728              ,1.004 
2048 ,1          ,23   ,22   ,0         ,3.394          ,3.785               ,0.897 
2048 ,1          ,23   ,22   ,23        ,3.384          ,3.754               ,0.902 
2048 ,1          ,23   ,24   ,0         ,3.317          ,3.668               ,0.904 
2048 ,1          ,23   ,24   ,23        ,3.314          ,3.724               ,0.89  
2048 ,1          ,2304 ,2048 ,23        ,42.771         ,39.964              ,1.07  
2048 ,1          ,2336 ,2048 ,23        ,42.913         ,40.147              ,1.069 
2048 ,1          ,2368 ,2048 ,23        ,43.474         ,41.447              ,1.049 
2048 ,1          ,24   ,23   ,0         ,3.307          ,3.725               ,0.888 
2048 ,1          ,24   ,23   ,23        ,3.323          ,3.713               ,0.895 
2048 ,1          ,24   ,25   ,0         ,3.305          ,3.678               ,0.898 
2048 ,1          ,24   ,25   ,23        ,3.325          ,3.782               ,0.879 
2048 ,1          ,240  ,16   ,23        ,3.455          ,4.437               ,0.779 
2048 ,1          ,2400 ,2048 ,23        ,42.787         ,41.418              ,1.033 
2048 ,1          ,2432 ,2048 ,23        ,42.908         ,40.099              ,1.07  
2048 ,1          ,25   ,24   ,0         ,3.307          ,3.706               ,0.892 
2048 ,1          ,25   ,24   ,23        ,3.307          ,3.68                ,0.899 
2048 ,1          ,25   ,26   ,0         ,3.289          ,3.688               ,0.892 
2048 ,1          ,25   ,26   ,23        ,3.299          ,3.718               ,0.887 
2048 ,1          ,256  ,1    ,0         ,3.324          ,4.004               ,0.83  
2048 ,1          ,256  ,1    ,23        ,3.381          ,4.196               ,0.806 
2048 ,1          ,256  ,128  ,23        ,5.894          ,5.787               ,1.018 
2048 ,1          ,256  ,160  ,23        ,6.323          ,6.406               ,0.987 
2048 ,1          ,256  ,192  ,23        ,7.529          ,7.735               ,0.973 
2048 ,1          ,256  ,2    ,0         ,3.32           ,3.977               ,0.835 
2048 ,1          ,256  ,2    ,23        ,3.348          ,4.14                ,0.809 
2048 ,1          ,256  ,224  ,23        ,8.118          ,8.128               ,0.999 
2048 ,1          ,256  ,256  ,23        ,9.182          ,8.984               ,1.022 
2048 ,1          ,256  ,288  ,23        ,9.62           ,9.058               ,1.062 
2048 ,1          ,256  ,3    ,0         ,3.354          ,4.087               ,0.821 
2048 ,1          ,256  ,3    ,23        ,3.347          ,4.316               ,0.776 
2048 ,1          ,256  ,32   ,23        ,3.264          ,4.293               ,0.76  
2048 ,1          ,256  ,320  ,23        ,9.926          ,9.136               ,1.087 
2048 ,1          ,256  ,352  ,23        ,9.658          ,9.098               ,1.062 
2048 ,1          ,256  ,384  ,23        ,9.576          ,9.107               ,1.052 
2048 ,1          ,256  ,4    ,0         ,3.4            ,4.008               ,0.848 
2048 ,1          ,256  ,4    ,23        ,3.364          ,4.342               ,0.775 
2048 ,1          ,256  ,416  ,23        ,9.663          ,9.139               ,1.057 
2048 ,1          ,256  ,448  ,23        ,9.757          ,9.102               ,1.072 
2048 ,1          ,256  ,480  ,23        ,9.625          ,9.163               ,1.05  
2048 ,1          ,256  ,5    ,0         ,3.357          ,4.362               ,0.769 
2048 ,1          ,256  ,5    ,23        ,3.34           ,4.315               ,0.774 
2048 ,1          ,256  ,512  ,23        ,9.671          ,9.06                ,1.067 
2048 ,1          ,256  ,544  ,23        ,9.883          ,9.129               ,1.083 
2048 ,1          ,256  ,576  ,23        ,9.888          ,9.119               ,1.084 
2048 ,1          ,256  ,6    ,0         ,3.39           ,4.327               ,0.783 
2048 ,1          ,256  ,6    ,23        ,3.334          ,4.319               ,0.772 
2048 ,1          ,256  ,608  ,23        ,9.895          ,9.127               ,1.084 
2048 ,1          ,256  ,64   ,23        ,4.498          ,4.943               ,0.91  
2048 ,1          ,256  ,640  ,23        ,9.786          ,9.185               ,1.065 
2048 ,1          ,256  ,7    ,0         ,3.39           ,4.375               ,0.775 
2048 ,1          ,256  ,7    ,23        ,3.293          ,4.262               ,0.773 
2048 ,1          ,256  ,96   ,23        ,5.225          ,5.151               ,1.014 
2048 ,1          ,26   ,25   ,0         ,3.328          ,3.688               ,0.902 
2048 ,1          ,26   ,25   ,23        ,3.313          ,3.762               ,0.88  
2048 ,1          ,26   ,27   ,0         ,3.287          ,3.712               ,0.885 
2048 ,1          ,26   ,27   ,23        ,3.315          ,3.767               ,0.88  
2048 ,1          ,27   ,26   ,0         ,3.322          ,3.78                ,0.879 
2048 ,1          ,27   ,26   ,23        ,3.292          ,3.665               ,0.898 
2048 ,1          ,27   ,28   ,0         ,3.296          ,3.684               ,0.895 
2048 ,1          ,27   ,28   ,23        ,3.316          ,3.698               ,0.897 
2048 ,1          ,272  ,16   ,23        ,3.494          ,4.388               ,0.796 
2048 ,1          ,28   ,27   ,0         ,3.291          ,3.672               ,0.896 
2048 ,1          ,28   ,27   ,23        ,3.297          ,3.665               ,0.9   
2048 ,1          ,28   ,29   ,0         ,3.246          ,3.679               ,0.882 
2048 ,1          ,28   ,29   ,23        ,3.261          ,3.706               ,0.88  
2048 ,1          ,288  ,128  ,23        ,5.859          ,5.684               ,1.031 
2048 ,1          ,288  ,256  ,23        ,9.42           ,9.085               ,1.037 
2048 ,1          ,288  ,32   ,23        ,3.346          ,4.272               ,0.783 
2048 ,1          ,288  ,512  ,23        ,10.938         ,9.765               ,1.12  
2048 ,1          ,288  ,64   ,23        ,4.518          ,4.978               ,0.908 
2048 ,1          ,29   ,28   ,0         ,3.302          ,3.742               ,0.882 
2048 ,1          ,29   ,28   ,23        ,3.313          ,3.767               ,0.879 
2048 ,1          ,29   ,30   ,0         ,3.242          ,3.614               ,0.897 
2048 ,1          ,29   ,30   ,23        ,3.254          ,3.675               ,0.885 
2048 ,1          ,3    ,2    ,0         ,3.503          ,3.611               ,0.97  
2048 ,1          ,3    ,2    ,23        ,3.537          ,3.667               ,0.964 
2048 ,1          ,3    ,4    ,0         ,3.482          ,3.682               ,0.946 
2048 ,1          ,3    ,4    ,23        ,3.493          ,3.751               ,0.931 
2048 ,1          ,30   ,29   ,0         ,3.333          ,3.644               ,0.915 
2048 ,1          ,30   ,29   ,23        ,3.308          ,3.649               ,0.906 
2048 ,1          ,30   ,31   ,0         ,3.246          ,3.707               ,0.876 
2048 ,1          ,30   ,31   ,23        ,3.305          ,3.811               ,0.867 
2048 ,1          ,304  ,16   ,23        ,3.646          ,4.393               ,0.83  
2048 ,1          ,31   ,30   ,0         ,3.246          ,3.692               ,0.879 
2048 ,1          ,31   ,30   ,23        ,3.263          ,3.656               ,0.892 
2048 ,1          ,32   ,128  ,23        ,3.259          ,3.794               ,0.859 
2048 ,1          ,32   ,160  ,23        ,3.228          ,3.802               ,0.849 
2048 ,1          ,32   ,192  ,23        ,3.189          ,3.771               ,0.846 
2048 ,1          ,32   ,224  ,23        ,3.195          ,3.778               ,0.846 
2048 ,1          ,32   ,256  ,23        ,3.209          ,3.76                ,0.853 
2048 ,1          ,32   ,288  ,23        ,3.194          ,3.777               ,0.845 
2048 ,1          ,32   ,31   ,0         ,3.361          ,3.726               ,0.902 
2048 ,1          ,32   ,31   ,23        ,3.262          ,3.687               ,0.885 
2048 ,1          ,32   ,32   ,23        ,3.218          ,3.603               ,0.893 
2048 ,1          ,32   ,320  ,23        ,3.206          ,3.757               ,0.853 
2048 ,1          ,32   ,352  ,23        ,3.207          ,4.206               ,0.763 
2048 ,1          ,32   ,384  ,23        ,3.192          ,3.632               ,0.879 
2048 ,1          ,32   ,416  ,23        ,3.201          ,3.664               ,0.874 
2048 ,1          ,32   ,64   ,23        ,3.198          ,3.688               ,0.867 
2048 ,1          ,32   ,96   ,23        ,3.194          ,3.604               ,0.886 
2048 ,1          ,320  ,128  ,23        ,5.891          ,5.772               ,1.021 
2048 ,1          ,320  ,256  ,23        ,11.291         ,11.226              ,1.006 
2048 ,1          ,320  ,32   ,23        ,3.245          ,4.266               ,0.761 
2048 ,1          ,320  ,512  ,23        ,12.526         ,12.428              ,1.008 
2048 ,1          ,320  ,64   ,23        ,4.582          ,5.042               ,0.909 
2048 ,1          ,336  ,16   ,23        ,3.517          ,4.497               ,0.782 
2048 ,1          ,352  ,128  ,23        ,5.85           ,5.714               ,1.024 
2048 ,1          ,352  ,256  ,23        ,10.741         ,10.742              ,1.0   
2048 ,1          ,352  ,32   ,23        ,3.249          ,4.292               ,0.757 
2048 ,1          ,352  ,512  ,23        ,12.32          ,12.408              ,0.993 
2048 ,1          ,352  ,64   ,23        ,4.457          ,4.923               ,0.905 
2048 ,1          ,368  ,16   ,23        ,3.511          ,4.434               ,0.792 
2048 ,1          ,3712 ,4096 ,23        ,68.489         ,66.198              ,1.035 
2048 ,1          ,3744 ,4096 ,23        ,68.677         ,66.109              ,1.039 
2048 ,1          ,3776 ,4096 ,23        ,71.691         ,67.799              ,1.057 
2048 ,1          ,3808 ,4096 ,23        ,71.504         ,67.805              ,1.055 
2048 ,1          ,384  ,128  ,23        ,5.875          ,5.783               ,1.016 
2048 ,1          ,384  ,256  ,23        ,11.113         ,10.895              ,1.02  
2048 ,1          ,384  ,32   ,23        ,3.254          ,4.309               ,0.755 
2048 ,1          ,384  ,512  ,23        ,12.331         ,12.499              ,0.987 
2048 ,1          ,384  ,64   ,23        ,4.464          ,5.013               ,0.89  
2048 ,1          ,3840 ,4096 ,23        ,70.694         ,68.116              ,1.038 
2048 ,1          ,3872 ,4096 ,23        ,73.877         ,68.454              ,1.079 
2048 ,1          ,3904 ,4096 ,23        ,72.541         ,70.04               ,1.036 
2048 ,1          ,3936 ,4096 ,23        ,73.383         ,70.09               ,1.047 
2048 ,1          ,3968 ,4096 ,23        ,72.701         ,69.794              ,1.042 
2048 ,1          ,4    ,3    ,0         ,3.484          ,3.619               ,0.963 
2048 ,1          ,4    ,3    ,23        ,3.502          ,3.83                ,0.914 
2048 ,1          ,4    ,5    ,0         ,3.489          ,3.843               ,0.908 
2048 ,1          ,4    ,5    ,23        ,3.472          ,3.755               ,0.925 
2048 ,1          ,400  ,16   ,23        ,3.538          ,4.395               ,0.805 
2048 ,1          ,4000 ,4096 ,23        ,72.763         ,69.933              ,1.04  
2048 ,1          ,4032 ,4096 ,23        ,75.324         ,73.786              ,1.021 
2048 ,1          ,4064 ,4096 ,23        ,75.294         ,77.265              ,0.974 
2048 ,1          ,4096 ,3712 ,23        ,69.851         ,84.762              ,0.824 
2048 ,1          ,4096 ,3744 ,23        ,70.069         ,88.333              ,0.793 
2048 ,1          ,4096 ,3776 ,23        ,70.358         ,87.416              ,0.805 
2048 ,1          ,4096 ,3808 ,23        ,71.758         ,88.088              ,0.815 
2048 ,1          ,4096 ,3840 ,23        ,71.583         ,87.556              ,0.818 
2048 ,1          ,4096 ,3872 ,23        ,72.253         ,90.508              ,0.798 
2048 ,1          ,4096 ,3904 ,23        ,72.376         ,88.634              ,0.817 
2048 ,1          ,4096 ,3936 ,23        ,73.841         ,92.295              ,0.8   
2048 ,1          ,4096 ,3968 ,23        ,73.143         ,90.489              ,0.808 
2048 ,1          ,4096 ,4000 ,23        ,75.65          ,77.651              ,0.974 
2048 ,1          ,4096 ,4032 ,23        ,74.764         ,81.749              ,0.915 
2048 ,1          ,4096 ,4064 ,23        ,76.065         ,76.817              ,0.99  
2048 ,1          ,4096 ,4096 ,23        ,74.391         ,78.379              ,0.949 
2048 ,1          ,4096 ,4128 ,23        ,73.732         ,75.892              ,0.972 
2048 ,1          ,4096 ,4160 ,23        ,75.304         ,75.832              ,0.993 
2048 ,1          ,4096 ,4192 ,23        ,74.492         ,73.606              ,1.012 
2048 ,1          ,4096 ,4224 ,23        ,74.489         ,73.615              ,1.012 
2048 ,1          ,4096 ,4256 ,23        ,74.484         ,72.2                ,1.032 
2048 ,1          ,4096 ,4288 ,23        ,75.384         ,72.181              ,1.044 
2048 ,1          ,4096 ,4320 ,23        ,74.622         ,72.361              ,1.031 
2048 ,1          ,4096 ,4352 ,23        ,75.096         ,72.422              ,1.037 
2048 ,1          ,4096 ,4384 ,23        ,76.202         ,72.374              ,1.053 
2048 ,1          ,4096 ,4416 ,23        ,75.16          ,72.391              ,1.038 
2048 ,1          ,4096 ,4448 ,23        ,74.821         ,72.219              ,1.036 
2048 ,1          ,4096 ,4480 ,23        ,75.141         ,72.577              ,1.035 
2048 ,1          ,4128 ,4096 ,23        ,75.369         ,78.484              ,0.96  
2048 ,1          ,416  ,128  ,23        ,5.832          ,5.765               ,1.012 
2048 ,1          ,416  ,256  ,23        ,11.054         ,10.929              ,1.011 
2048 ,1          ,416  ,32   ,23        ,3.241          ,4.228               ,0.766 
2048 ,1          ,416  ,512  ,23        ,12.319         ,12.347              ,0.998 
2048 ,1          ,416  ,64   ,23        ,4.45           ,4.906               ,0.907 
2048 ,1          ,4160 ,4096 ,23        ,77.359         ,83.759              ,0.924 
2048 ,1          ,4192 ,4096 ,23        ,76.357         ,84.019              ,0.909 
2048 ,1          ,4224 ,4096 ,23        ,76.074         ,91.78               ,0.829 
2048 ,1          ,4256 ,4096 ,23        ,78.019         ,94.719              ,0.824 
2048 ,1          ,4288 ,4096 ,23        ,77.286         ,97.559              ,0.792 
2048 ,1          ,4320 ,4096 ,23        ,76.485         ,92.901              ,0.823 
2048 ,1          ,4352 ,4096 ,23        ,75.637         ,95.281              ,0.794 
2048 ,1          ,4384 ,4096 ,23        ,75.698         ,93.036              ,0.814 
2048 ,1          ,4416 ,4096 ,23        ,77.48          ,95.707              ,0.81  
2048 ,1          ,4448 ,4096 ,23        ,76.454         ,91.462              ,0.836 
2048 ,1          ,448  ,128  ,23        ,5.813          ,5.711               ,1.018 
2048 ,1          ,448  ,256  ,23        ,12.816         ,12.35               ,1.038 
2048 ,1          ,448  ,512  ,23        ,14.674         ,14.628              ,1.003 
2048 ,1          ,448  ,64   ,23        ,4.442          ,4.924               ,0.902 
2048 ,1          ,4480 ,4096 ,23        ,76.075         ,93.781              ,0.811 
2048 ,1          ,48   ,16   ,23        ,3.445          ,4.502               ,0.765 
2048 ,1          ,480  ,128  ,23        ,5.814          ,5.689               ,1.022 
2048 ,1          ,480  ,256  ,23        ,12.205         ,12.044              ,1.013 
2048 ,1          ,480  ,512  ,23        ,15.914         ,14.713              ,1.082 
2048 ,1          ,5    ,4    ,0         ,3.498          ,3.674               ,0.952 
2048 ,1          ,5    ,4    ,23        ,3.483          ,3.772               ,0.923 
2048 ,1          ,5    ,6    ,0         ,3.449          ,3.775               ,0.913 
2048 ,1          ,5    ,6    ,23        ,3.428          ,3.742               ,0.916 
2048 ,1          ,512  ,128  ,23        ,5.791          ,5.722               ,1.012 
2048 ,1          ,512  ,160  ,23        ,6.318          ,6.301               ,1.003 
2048 ,1          ,512  ,192  ,23        ,10.321         ,10.168              ,1.015 
2048 ,1          ,512  ,224  ,23        ,11.147         ,10.939              ,1.019 
2048 ,1          ,512  ,256  ,23        ,11.093         ,10.978              ,1.01  
2048 ,1          ,512  ,288  ,23        ,12.125         ,11.993              ,1.011 
2048 ,1          ,512  ,320  ,23        ,12.56          ,12.279              ,1.023 
2048 ,1          ,512  ,352  ,23        ,13.506         ,13.157              ,1.027 
2048 ,1          ,512  ,384  ,23        ,13.466         ,13.174              ,1.022 
2048 ,1          ,512  ,416  ,23        ,12.965         ,12.872              ,1.007 
2048 ,1          ,512  ,448  ,23        ,13.483         ,13.375              ,1.008 
2048 ,1          ,512  ,480  ,23        ,14.817         ,13.634              ,1.087 
2048 ,1          ,512  ,512  ,23        ,14.559         ,14.493              ,1.005 
2048 ,1          ,512  ,544  ,23        ,14.822         ,14.561              ,1.018 
2048 ,1          ,512  ,576  ,23        ,14.717         ,14.66               ,1.004 
2048 ,1          ,512  ,608  ,23        ,14.66          ,14.694              ,0.998 
2048 ,1          ,512  ,640  ,23        ,14.671         ,14.562              ,1.007 
2048 ,1          ,512  ,672  ,23        ,14.723         ,14.617              ,1.007 
2048 ,1          ,512  ,704  ,23        ,14.675         ,14.615              ,1.004 
2048 ,1          ,512  ,736  ,23        ,14.707         ,14.61               ,1.007 
2048 ,1          ,512  ,768  ,23        ,14.827         ,14.53               ,1.02  
2048 ,1          ,512  ,800  ,23        ,15.494         ,14.584              ,1.062 
2048 ,1          ,512  ,832  ,23        ,14.77          ,14.653              ,1.008 
2048 ,1          ,512  ,864  ,23        ,14.695         ,14.596              ,1.007 
2048 ,1          ,512  ,896  ,23        ,14.832         ,14.61               ,1.015 
2048 ,1          ,544  ,256  ,23        ,11.006         ,10.769              ,1.022 
2048 ,1          ,544  ,512  ,23        ,15.21          ,13.694              ,1.111 
2048 ,1          ,576  ,256  ,23        ,12.478         ,12.305              ,1.014 
2048 ,1          ,576  ,512  ,23        ,16.333         ,15.302              ,1.067 
2048 ,1          ,6    ,5    ,0         ,3.461          ,3.779               ,0.916 
2048 ,1          ,6    ,5    ,23        ,3.428          ,3.78                ,0.907 
2048 ,1          ,6    ,7    ,0         ,3.479          ,3.763               ,0.924 
2048 ,1          ,6    ,7    ,23        ,3.483          ,3.803               ,0.916 
2048 ,1          ,608  ,256  ,23        ,12.176         ,11.991              ,1.015 
2048 ,1          ,608  ,512  ,23        ,15.473         ,15.183              ,1.019 
2048 ,1          ,64   ,128  ,23        ,5.12           ,5.677               ,0.902 
2048 ,1          ,64   ,160  ,23        ,4.687          ,5.677               ,0.825 
2048 ,1          ,64   ,192  ,23        ,4.7            ,5.632               ,0.835 
2048 ,1          ,64   ,224  ,23        ,4.516          ,5.655               ,0.799 
2048 ,1          ,64   ,256  ,23        ,4.817          ,5.876               ,0.82  
2048 ,1          ,64   ,288  ,23        ,4.224          ,5.693               ,0.742 
2048 ,1          ,64   ,32   ,23        ,3.247          ,4.254               ,0.763 
2048 ,1          ,64   ,320  ,23        ,4.46           ,5.626               ,0.793 
2048 ,1          ,64   ,352  ,23        ,4.276          ,5.622               ,0.761 
2048 ,1          ,64   ,384  ,23        ,4.33           ,5.617               ,0.771 
2048 ,1          ,64   ,416  ,23        ,4.271          ,5.649               ,0.756 
2048 ,1          ,64   ,448  ,23        ,4.157          ,5.673               ,0.733 
2048 ,1          ,64   ,64   ,23        ,3.91           ,5.456               ,0.717 
2048 ,1          ,64   ,96   ,23        ,4.257          ,5.556               ,0.766 
2048 ,1          ,640  ,1024 ,23        ,17.999         ,16.833              ,1.069 
2048 ,1          ,640  ,256  ,23        ,11.042         ,10.904              ,1.013 
2048 ,1          ,640  ,512  ,23        ,16.35          ,15.346              ,1.065 
2048 ,1          ,672  ,1024 ,23        ,18.041         ,16.833              ,1.072 
2048 ,1          ,672  ,512  ,23        ,16.261         ,15.334              ,1.06  
2048 ,1          ,7    ,6    ,0         ,3.465          ,3.836               ,0.903 
2048 ,1          ,7    ,6    ,23        ,3.43           ,3.826               ,0.897 
2048 ,1          ,7    ,8    ,0         ,3.71           ,4.003               ,0.927 
2048 ,1          ,7    ,8    ,23        ,3.811          ,4.17                ,0.914 
2048 ,1          ,704  ,1024 ,23        ,19.927         ,18.826              ,1.058 
2048 ,1          ,704  ,512  ,23        ,17.378         ,16.727              ,1.039 
2048 ,1          ,736  ,1024 ,23        ,20.04          ,18.797              ,1.066 
2048 ,1          ,736  ,512  ,23        ,17.213         ,16.433              ,1.047 
2048 ,1          ,768  ,1024 ,23        ,20.014         ,18.842              ,1.062 
2048 ,1          ,768  ,512  ,23        ,16.1           ,15.313              ,1.051 
2048 ,1          ,8    ,7    ,0         ,3.446          ,3.766               ,0.915 
2048 ,1          ,8    ,7    ,23        ,3.433          ,3.782               ,0.908 
2048 ,1          ,8    ,9    ,0         ,3.526          ,3.78                ,0.933 
2048 ,1          ,8    ,9    ,23        ,3.491          ,3.811               ,0.916 
2048 ,1          ,80   ,16   ,23        ,3.42           ,4.38                ,0.781 
2048 ,1          ,800  ,1024 ,23        ,19.962         ,18.903              ,1.056 
2048 ,1          ,800  ,512  ,23        ,16.225         ,15.375              ,1.055 
2048 ,1          ,832  ,1024 ,23        ,22.151         ,21.284              ,1.041 
2048 ,1          ,832  ,512  ,23        ,17.124         ,16.708              ,1.025 
2048 ,1          ,864  ,1024 ,23        ,23.339         ,24.42               ,0.956 
2048 ,1          ,864  ,512  ,23        ,17.202         ,16.328              ,1.054 
2048 ,1          ,896  ,1024 ,23        ,22.236         ,21.205              ,1.049 
2048 ,1          ,896  ,512  ,23        ,16.355         ,15.416              ,1.061 
2048 ,1          ,9    ,10   ,0         ,3.48           ,3.778               ,0.921 
2048 ,1          ,9    ,10   ,23        ,3.443          ,3.719               ,0.926 
2048 ,1          ,9    ,8    ,0         ,3.65           ,3.945               ,0.925 
2048 ,1          ,9    ,8    ,23        ,3.683          ,3.928               ,0.938 
2048 ,1          ,928  ,1024 ,23        ,22.326         ,21.2                ,1.053 
2048 ,1          ,96   ,128  ,23        ,4.704          ,5.525               ,0.851 
2048 ,1          ,96   ,256  ,23        ,4.67           ,5.495               ,0.85  
2048 ,1          ,96   ,32   ,23        ,3.248          ,4.268               ,0.761 
2048 ,1          ,96   ,64   ,23        ,4.369          ,5.302               ,0.824 
2048 ,1          ,960  ,1024 ,23        ,24.304         ,23.316              ,1.042 
2048 ,1          ,992  ,1024 ,23        ,24.035         ,23.415              ,1.026 
2049 ,0          ,0    ,1    ,0         ,3.388          ,3.386               ,1.001 
2049 ,0          ,0    ,1    ,23        ,3.411          ,3.413               ,0.999 
2049 ,0          ,192  ,32   ,0         ,8.465          ,8.054               ,1.051 
2049 ,0          ,192  ,32   ,23        ,8.623          ,8.072               ,1.068 
2049 ,0          ,2    ,1    ,0         ,3.411          ,3.474               ,0.982 
2049 ,0          ,2    ,1    ,23        ,3.402          ,3.46                ,0.983 
2049 ,0          ,256  ,1    ,0         ,10.146         ,9.762               ,1.039 
2049 ,0          ,256  ,1    ,23        ,10.207         ,9.816               ,1.04  
2049 ,0          ,256  ,32   ,0         ,9.431          ,9.176               ,1.028 
2049 ,0          ,256  ,32   ,23        ,9.5            ,9.119               ,1.042 
2049 ,0          ,256  ,64   ,0         ,8.329          ,8.337               ,0.999 
2049 ,0          ,256  ,64   ,23        ,8.292          ,8.252               ,1.005 
2049 ,0          ,512  ,32   ,0         ,14.913         ,13.763              ,1.084 
2049 ,0          ,512  ,32   ,23        ,15.176         ,13.834              ,1.097 
2049 ,1          ,0    ,1    ,0         ,3.37           ,3.369               ,1.0   
2049 ,1          ,0    ,1    ,23        ,3.427          ,3.441               ,0.996 
2049 ,1          ,192  ,32   ,0         ,3.275          ,4.343               ,0.754 
2049 ,1          ,192  ,32   ,23        ,3.4            ,4.408               ,0.771 
2049 ,1          ,2    ,1    ,0         ,3.406          ,3.411               ,0.999 
2049 ,1          ,2    ,1    ,23        ,3.405          ,3.397               ,1.002 
2049 ,1          ,256  ,1    ,0         ,3.416          ,4.089               ,0.835 
2049 ,1          ,256  ,1    ,23        ,3.374          ,4.054               ,0.832 
2049 ,1          ,256  ,32   ,0         ,3.248          ,4.24                ,0.766 
2049 ,1          ,256  ,32   ,23        ,3.245          ,4.254               ,0.763 
2049 ,1          ,256  ,64   ,0         ,5.166          ,5.233               ,0.987 
2049 ,1          ,256  ,64   ,23        ,5.213          ,5.294               ,0.985 
2049 ,1          ,512  ,32   ,0         ,3.285          ,4.229               ,0.777 
2049 ,1          ,512  ,32   ,23        ,3.212          ,4.224               ,0.76  
2050 ,0          ,1    ,2    ,0         ,3.549          ,3.548               ,1.0   
2050 ,0          ,1    ,2    ,23        ,3.55           ,3.543               ,1.002 
2050 ,0          ,192  ,64   ,0         ,6.513          ,6.456               ,1.009 
2050 ,0          ,192  ,64   ,23        ,6.544          ,6.5                 ,1.007 
2050 ,0          ,256  ,2    ,0         ,10.138         ,9.834               ,1.031 
2050 ,0          ,256  ,2    ,23        ,10.159         ,9.691               ,1.048 
2050 ,0          ,256  ,64   ,0         ,8.284          ,8.25                ,1.004 
2050 ,0          ,256  ,64   ,23        ,8.251          ,8.234               ,1.002 
2050 ,0          ,3    ,2    ,0         ,3.531          ,3.524               ,1.002 
2050 ,0          ,3    ,2    ,23        ,3.513          ,3.504               ,1.003 
2050 ,0          ,512  ,64   ,0         ,13.657         ,13.549              ,1.008 
2050 ,0          ,512  ,64   ,23        ,13.823         ,13.575              ,1.018 
2050 ,1          ,1    ,2    ,0         ,3.531          ,3.537               ,0.998 
2050 ,1          ,1    ,2    ,23        ,3.515          ,3.513               ,1.001 
2050 ,1          ,192  ,64   ,0         ,5.306          ,5.413               ,0.98  
2050 ,1          ,192  ,64   ,23        ,5.277          ,5.358               ,0.985 
2050 ,1          ,256  ,2    ,0         ,3.396          ,4.192               ,0.81  
2050 ,1          ,256  ,2    ,23        ,3.346          ,4.06                ,0.824 
2050 ,1          ,256  ,64   ,0         ,5.222          ,5.321               ,0.981 
2050 ,1          ,256  ,64   ,23        ,5.228          ,5.349               ,0.977 
2050 ,1          ,3    ,2    ,0         ,3.496          ,3.498               ,0.999 
2050 ,1          ,3    ,2    ,23        ,3.506          ,3.507               ,1.0   
2050 ,1          ,512  ,64   ,0         ,5.208          ,5.408               ,0.963 
2050 ,1          ,512  ,64   ,23        ,5.184          ,5.293               ,0.979 
2051 ,0          ,192  ,96   ,0         ,6.0            ,6.047               ,0.992 
2051 ,0          ,192  ,96   ,23        ,5.958          ,5.968               ,0.998 
2051 ,0          ,2    ,3    ,0         ,3.53           ,3.541               ,0.997 
2051 ,0          ,2    ,3    ,23        ,3.534          ,3.522               ,1.003 
2051 ,0          ,256  ,3    ,0         ,10.095         ,9.692               ,1.042 
2051 ,0          ,256  ,3    ,23        ,10.193         ,9.724               ,1.048 
2051 ,0          ,256  ,64   ,0         ,8.314          ,8.239               ,1.009 
2051 ,0          ,256  ,64   ,23        ,8.326          ,8.256               ,1.008 
2051 ,0          ,256  ,96   ,0         ,7.603          ,7.747               ,0.981 
2051 ,0          ,256  ,96   ,23        ,7.573          ,7.725               ,0.98  
2051 ,0          ,4    ,3    ,0         ,3.504          ,3.504               ,1.0   
2051 ,0          ,4    ,3    ,23        ,3.496          ,3.496               ,1.0   
2051 ,0          ,512  ,96   ,0         ,13.211         ,12.936              ,1.021 
2051 ,0          ,512  ,96   ,23        ,13.208         ,13.085              ,1.009 
2051 ,1          ,192  ,96   ,0         ,5.949          ,6.016               ,0.989 
2051 ,1          ,192  ,96   ,23        ,5.904          ,6.076               ,0.972 
2051 ,1          ,2    ,3    ,0         ,3.497          ,3.495               ,1.001 
2051 ,1          ,2    ,3    ,23        ,3.497          ,3.517               ,0.994 
2051 ,1          ,256  ,3    ,0         ,3.381          ,4.172               ,0.81  
2051 ,1          ,256  ,3    ,23        ,3.341          ,4.091               ,0.817 
2051 ,1          ,256  ,64   ,0         ,5.222          ,5.316               ,0.982 
2051 ,1          ,256  ,64   ,23        ,5.23           ,5.37                ,0.974 
2051 ,1          ,256  ,96   ,0         ,5.887          ,6.006               ,0.98  
2051 ,1          ,256  ,96   ,23        ,5.906          ,5.991               ,0.986 
2051 ,1          ,4    ,3    ,0         ,3.52           ,3.558               ,0.989 
2051 ,1          ,4    ,3    ,23        ,3.484          ,3.791               ,0.919 
2051 ,1          ,512  ,96   ,0         ,5.905          ,6.02                ,0.981 
2051 ,1          ,512  ,96   ,23        ,5.908          ,5.878               ,1.005 
2052 ,0          ,192  ,128  ,0         ,5.272          ,5.41                ,0.975 
2052 ,0          ,192  ,128  ,23        ,5.28           ,5.402               ,0.977 
2052 ,0          ,256  ,128  ,0         ,6.532          ,6.439               ,1.014 
2052 ,0          ,256  ,128  ,23        ,6.509          ,6.455               ,1.008 
2052 ,0          ,256  ,4    ,0         ,10.128         ,9.792               ,1.034 
2052 ,0          ,256  ,4    ,23        ,10.096         ,9.741               ,1.036 
2052 ,0          ,256  ,64   ,0         ,8.399          ,8.293               ,1.013 
2052 ,0          ,256  ,64   ,23        ,8.299          ,8.262               ,1.005 
2052 ,0          ,3    ,4    ,0         ,3.492          ,3.55                ,0.984 
2052 ,0          ,3    ,4    ,23        ,3.504          ,3.546               ,0.988 
2052 ,0          ,5    ,4    ,0         ,3.496          ,3.496               ,1.0   
2052 ,0          ,5    ,4    ,23        ,3.504          ,3.505               ,1.0   
2052 ,0          ,512  ,128  ,0         ,13.597         ,13.283              ,1.024 
2052 ,0          ,512  ,128  ,23        ,13.652         ,13.299              ,1.027 
2052 ,1          ,192  ,128  ,0         ,6.488          ,6.387               ,1.016 
2052 ,1          ,192  ,128  ,23        ,6.483          ,6.481               ,1.0   
2052 ,1          ,256  ,128  ,0         ,6.522          ,6.501               ,1.003 
2052 ,1          ,256  ,128  ,23        ,6.518          ,6.495               ,1.004 
2052 ,1          ,256  ,4    ,0         ,3.361          ,4.15                ,0.81  
2052 ,1          ,256  ,4    ,23        ,3.379          ,4.086               ,0.827 
2052 ,1          ,256  ,64   ,0         ,5.265          ,5.251               ,1.003 
2052 ,1          ,256  ,64   ,23        ,5.238          ,5.33                ,0.983 
2052 ,1          ,3    ,4    ,0         ,3.508          ,3.514               ,0.998 
2052 ,1          ,3    ,4    ,23        ,3.462          ,3.479               ,0.995 
2052 ,1          ,5    ,4    ,0         ,3.487          ,3.545               ,0.984 
2052 ,1          ,5    ,4    ,23        ,3.484          ,3.597               ,0.969 
2052 ,1          ,512  ,128  ,0         ,6.45           ,6.437               ,1.002 
2052 ,1          ,512  ,128  ,23        ,6.444          ,6.466               ,0.997 
2053 ,0          ,192  ,160  ,0         ,3.24           ,4.324               ,0.749 
2053 ,0          ,192  ,160  ,23        ,3.274          ,4.253               ,0.77  
2053 ,0          ,256  ,160  ,0         ,5.867          ,5.982               ,0.981 
2053 ,0          ,256  ,160  ,23        ,5.975          ,5.956               ,1.003 
2053 ,0          ,256  ,5    ,0         ,10.122         ,9.718               ,1.042 
2053 ,0          ,256  ,5    ,23        ,10.101         ,9.755               ,1.036 
2053 ,0          ,256  ,64   ,0         ,8.357          ,8.414               ,0.993 
2053 ,0          ,256  ,64   ,23        ,8.361          ,8.378               ,0.998 
2053 ,0          ,4    ,5    ,0         ,3.496          ,3.869               ,0.904 
2053 ,0          ,4    ,5    ,23        ,3.473          ,3.894               ,0.892 
2053 ,0          ,512  ,160  ,0         ,13.614         ,13.214              ,1.03  
2053 ,0          ,512  ,160  ,23        ,13.638         ,13.225              ,1.031 
2053 ,0          ,6    ,5    ,0         ,3.516          ,3.504               ,1.003 
2053 ,0          ,6    ,5    ,23        ,3.496          ,3.495               ,1.0   
2053 ,1          ,192  ,160  ,0         ,7.838          ,8.059               ,0.972 
2053 ,1          ,192  ,160  ,23        ,7.827          ,8.053               ,0.972 
2053 ,1          ,256  ,160  ,0         ,7.583          ,7.746               ,0.979 
2053 ,1          ,256  ,160  ,23        ,7.592          ,7.749               ,0.98  
2053 ,1          ,256  ,5    ,0         ,3.464          ,4.369               ,0.793 
2053 ,1          ,256  ,5    ,23        ,3.348          ,4.324               ,0.774 
2053 ,1          ,256  ,64   ,0         ,5.315          ,5.335               ,0.996 
2053 ,1          ,256  ,64   ,23        ,5.31           ,5.424               ,0.979 
2053 ,1          ,4    ,5    ,0         ,3.464          ,3.863               ,0.897 
2053 ,1          ,4    ,5    ,23        ,3.455          ,3.87                ,0.893 
2053 ,1          ,512  ,160  ,0         ,10.158         ,10.017              ,1.014 
2053 ,1          ,512  ,160  ,23        ,10.075         ,9.991               ,1.008 
2053 ,1          ,6    ,5    ,0         ,3.47           ,3.821               ,0.908 
2053 ,1          ,6    ,5    ,23        ,3.476          ,3.876               ,0.897 
2054 ,0          ,192  ,192  ,0         ,8.426          ,8.567               ,0.984 
2054 ,0          ,192  ,192  ,23        ,8.405          ,8.561               ,0.982 
2054 ,0          ,256  ,192  ,0         ,5.233          ,5.382               ,0.972 
2054 ,0          ,256  ,192  ,23        ,5.214          ,5.354               ,0.974 
2054 ,0          ,256  ,6    ,0         ,10.145         ,9.765               ,1.039 
2054 ,0          ,256  ,6    ,23        ,10.062         ,9.814               ,1.025 
2054 ,0          ,256  ,64   ,0         ,8.312          ,8.227               ,1.01  
2054 ,0          ,256  ,64   ,23        ,8.273          ,8.11                ,1.02  
2054 ,0          ,5    ,6    ,0         ,3.499          ,3.73                ,0.938 
2054 ,0          ,5    ,6    ,23        ,3.513          ,3.756               ,0.935 
2054 ,0          ,512  ,192  ,0         ,12.795         ,12.541              ,1.02  
2054 ,0          ,512  ,192  ,23        ,12.761         ,12.412              ,1.028 
2054 ,0          ,7    ,6    ,0         ,3.56           ,3.496               ,1.018 
2054 ,0          ,7    ,6    ,23        ,3.566          ,3.517               ,1.014 
2054 ,1          ,192  ,192  ,0         ,8.48           ,8.552               ,0.992 
2054 ,1          ,192  ,192  ,23        ,8.451          ,8.551               ,0.988 
2054 ,1          ,256  ,192  ,0         ,8.24           ,8.209               ,1.004 
2054 ,1          ,256  ,192  ,23        ,8.225          ,8.177               ,1.006 
2054 ,1          ,256  ,6    ,0         ,3.427          ,4.402               ,0.779 
2054 ,1          ,256  ,6    ,23        ,3.369          ,4.361               ,0.773 
2054 ,1          ,256  ,64   ,0         ,5.376          ,5.333               ,1.008 
2054 ,1          ,256  ,64   ,23        ,5.295          ,5.229               ,1.012 
2054 ,1          ,5    ,6    ,0         ,3.436          ,3.685               ,0.932 
2054 ,1          ,5    ,6    ,23        ,3.463          ,3.72                ,0.931 
2054 ,1          ,512  ,192  ,0         ,10.367         ,10.211              ,1.015 
2054 ,1          ,512  ,192  ,23        ,10.37          ,10.282              ,1.008 
2054 ,1          ,7    ,6    ,0         ,3.444          ,3.807               ,0.905 
2054 ,1          ,7    ,6    ,23        ,3.444          ,3.792               ,0.908 
2055 ,0          ,192  ,224  ,0         ,8.429          ,8.533               ,0.988 
2055 ,0          ,192  ,224  ,23        ,8.472          ,8.531               ,0.993 
2055 ,0          ,256  ,224  ,0         ,3.249          ,4.21                ,0.772 
2055 ,0          ,256  ,224  ,23        ,3.84           ,4.417               ,0.869 
2055 ,0          ,256  ,64   ,0         ,8.361          ,8.366               ,0.999 
2055 ,0          ,256  ,64   ,23        ,8.348          ,8.399               ,0.994 
2055 ,0          ,256  ,7    ,0         ,10.118         ,9.814               ,1.031 
2055 ,0          ,256  ,7    ,23        ,10.168         ,9.769               ,1.041 
2055 ,0          ,512  ,224  ,0         ,12.336         ,12.213              ,1.01  
2055 ,0          ,512  ,224  ,23        ,12.279         ,12.07               ,1.017 
2055 ,0          ,6    ,7    ,0         ,3.497          ,3.816               ,0.916 
2055 ,0          ,6    ,7    ,23        ,3.484          ,3.852               ,0.905 
2055 ,0          ,8    ,7    ,0         ,3.506          ,3.504               ,1.001 
2055 ,0          ,8    ,7    ,23        ,3.493          ,3.53                ,0.989 
2055 ,1          ,192  ,224  ,0         ,8.352          ,8.496               ,0.983 
2055 ,1          ,192  ,224  ,23        ,8.384          ,8.532               ,0.983 
2055 ,1          ,256  ,224  ,0         ,9.445          ,9.069               ,1.041 
2055 ,1          ,256  ,224  ,23        ,9.443          ,9.126               ,1.035 
2055 ,1          ,256  ,64   ,0         ,5.354          ,5.363               ,0.998 
2055 ,1          ,256  ,64   ,23        ,5.319          ,5.354               ,0.993 
2055 ,1          ,256  ,7    ,0         ,3.388          ,4.359               ,0.777 
2055 ,1          ,256  ,7    ,23        ,3.374          ,4.319               ,0.781 
2055 ,1          ,512  ,224  ,0         ,11.048         ,11.11               ,0.994 
2055 ,1          ,512  ,224  ,23        ,11.151         ,11.079              ,1.007 
2055 ,1          ,6    ,7    ,0         ,3.447          ,3.83                ,0.9   
2055 ,1          ,6    ,7    ,23        ,3.447          ,3.798               ,0.908 
2055 ,1          ,8    ,7    ,0         ,3.433          ,3.783               ,0.907 
2055 ,1          ,8    ,7    ,23        ,3.455          ,3.733               ,0.925 
2056 ,0          ,7    ,8    ,0         ,3.481          ,3.779               ,0.921 
2056 ,0          ,7    ,8    ,23        ,3.496          ,3.8                 ,0.92  
2056 ,0          ,9    ,8    ,0         ,3.511          ,3.504               ,1.002 
2056 ,0          ,9    ,8    ,23        ,3.496          ,3.496               ,1.0   
2056 ,1          ,7    ,8    ,0         ,3.621          ,3.933               ,0.921 
2056 ,1          ,7    ,8    ,23        ,3.598          ,3.914               ,0.919 
2056 ,1          ,9    ,8    ,0         ,3.666          ,3.972               ,0.923 
2056 ,1          ,9    ,8    ,23        ,3.658          ,3.983               ,0.918 
2057 ,0          ,10   ,9    ,0         ,3.638          ,3.496               ,1.041 
2057 ,0          ,10   ,9    ,23        ,3.625          ,3.496               ,1.037 
2057 ,0          ,8    ,9    ,0         ,3.497          ,3.858               ,0.906 
2057 ,0          ,8    ,9    ,23        ,3.513          ,3.811               ,0.922 
2057 ,1          ,10   ,9    ,0         ,3.471          ,3.794               ,0.915 
2057 ,1          ,10   ,9    ,23        ,3.472          ,3.769               ,0.921 
2057 ,1          ,8    ,9    ,0         ,3.478          ,3.864               ,0.9   
2057 ,1          ,8    ,9    ,23        ,3.496          ,3.862               ,0.905 
2058 ,0          ,11   ,10   ,0         ,3.504          ,3.506               ,0.999 
2058 ,0          ,11   ,10   ,23        ,3.507          ,3.496               ,1.003 
2058 ,0          ,9    ,10   ,0         ,3.512          ,3.828               ,0.917 
2058 ,0          ,9    ,10   ,23        ,3.515          ,3.8                 ,0.925 
2058 ,1          ,11   ,10   ,0         ,3.489          ,3.794               ,0.92  
2058 ,1          ,11   ,10   ,23        ,3.453          ,3.791               ,0.911 
2058 ,1          ,9    ,10   ,0         ,3.487          ,3.845               ,0.907 
2058 ,1          ,9    ,10   ,23        ,3.468          ,3.756               ,0.923 
2059 ,0          ,10   ,11   ,0         ,3.489          ,3.82                ,0.913 
2059 ,0          ,10   ,11   ,23        ,3.513          ,3.813               ,0.921 
2059 ,0          ,12   ,11   ,0         ,3.505          ,3.506               ,1.0   
2059 ,0          ,12   ,11   ,23        ,3.487          ,3.487               ,1.0   
2059 ,1          ,10   ,11   ,0         ,3.461          ,3.781               ,0.915 
2059 ,1          ,10   ,11   ,23        ,3.446          ,3.746               ,0.92  
2059 ,1          ,12   ,11   ,0         ,3.454          ,3.762               ,0.918 
2059 ,1          ,12   ,11   ,23        ,3.461          ,3.811               ,0.908 
2060 ,0          ,11   ,12   ,0         ,3.503          ,3.818               ,0.918 
2060 ,0          ,11   ,12   ,23        ,3.515          ,3.825               ,0.919 
2060 ,0          ,13   ,12   ,0         ,3.578          ,3.515               ,1.018 
2060 ,0          ,13   ,12   ,23        ,3.56           ,3.496               ,1.018 
2060 ,1          ,11   ,12   ,0         ,3.471          ,3.81                ,0.911 
2060 ,1          ,11   ,12   ,23        ,3.467          ,3.772               ,0.919 
2060 ,1          ,13   ,12   ,0         ,3.453          ,3.829               ,0.902 
2060 ,1          ,13   ,12   ,23        ,3.452          ,3.79                ,0.911 
2061 ,0          ,12   ,13   ,0         ,3.562          ,4.023               ,0.885 
2061 ,0          ,12   ,13   ,23        ,3.504          ,3.865               ,0.907 
2061 ,0          ,14   ,13   ,0         ,3.56           ,3.496               ,1.018 
2061 ,0          ,14   ,13   ,23        ,3.554          ,3.479               ,1.022 
2061 ,1          ,12   ,13   ,0         ,3.461          ,3.79                ,0.913 
2061 ,1          ,12   ,13   ,23        ,3.48           ,3.792               ,0.918 
2061 ,1          ,14   ,13   ,0         ,3.436          ,3.84                ,0.895 
2061 ,1          ,14   ,13   ,23        ,3.47           ,3.837               ,0.904 
2062 ,0          ,13   ,14   ,0         ,3.503          ,3.819               ,0.917 
2062 ,0          ,13   ,14   ,23        ,3.504          ,3.834               ,0.914 
2062 ,0          ,15   ,14   ,0         ,3.496          ,3.493               ,1.001 
2062 ,0          ,15   ,14   ,23        ,3.504          ,3.498               ,1.002 
2062 ,1          ,13   ,14   ,0         ,3.461          ,3.783               ,0.915 
2062 ,1          ,13   ,14   ,23        ,3.474          ,3.8                 ,0.914 
2062 ,1          ,15   ,14   ,0         ,3.453          ,3.812               ,0.906 
2062 ,1          ,15   ,14   ,23        ,3.428          ,3.762               ,0.911 
2063 ,0          ,14   ,15   ,0         ,3.49           ,3.872               ,0.901 
2063 ,0          ,14   ,15   ,23        ,3.483          ,3.818               ,0.912 
2063 ,0          ,16   ,15   ,0         ,3.495          ,3.532               ,0.99  
2063 ,0          ,16   ,15   ,23        ,3.525          ,3.486               ,1.011 
2063 ,1          ,14   ,15   ,0         ,3.44           ,3.736               ,0.921 
2063 ,1          ,14   ,15   ,23        ,3.469          ,3.807               ,0.911 
2063 ,1          ,16   ,15   ,0         ,3.428          ,3.747               ,0.915 
2063 ,1          ,16   ,15   ,23        ,3.448          ,3.754               ,0.919 
2064 ,0          ,15   ,16   ,0         ,3.472          ,3.783               ,0.918 
2064 ,0          ,15   ,16   ,23        ,3.487          ,3.89                ,0.896 
2064 ,0          ,17   ,16   ,0         ,3.483          ,3.476               ,1.002 
2064 ,0          ,17   ,16   ,23        ,3.534          ,3.47                ,1.018 
2064 ,1          ,15   ,16   ,0         ,3.446          ,3.762               ,0.916 
2064 ,1          ,15   ,16   ,23        ,3.413          ,3.703               ,0.922 
2064 ,1          ,17   ,16   ,0         ,3.427          ,3.773               ,0.908 
2064 ,1          ,17   ,16   ,23        ,3.429          ,3.755               ,0.913 
2065 ,0          ,16   ,17   ,0         ,3.48           ,3.805               ,0.915 
2065 ,0          ,16   ,17   ,23        ,3.575          ,3.861               ,0.926 
2065 ,0          ,18   ,17   ,0         ,3.576          ,3.504               ,1.02  
2065 ,0          ,18   ,17   ,23        ,3.542          ,3.47                ,1.021 
2065 ,1          ,16   ,17   ,0         ,3.412          ,3.755               ,0.909 
2065 ,1          ,16   ,17   ,23        ,3.427          ,3.758               ,0.912 
2065 ,1          ,18   ,17   ,0         ,3.404          ,3.738               ,0.911 
2065 ,1          ,18   ,17   ,23        ,3.416          ,3.919               ,0.871 
2066 ,0          ,17   ,18   ,0         ,3.465          ,3.787               ,0.915 
2066 ,0          ,17   ,18   ,23        ,3.47           ,3.82                ,0.908 
2066 ,0          ,19   ,18   ,0         ,3.544          ,3.477               ,1.019 
2066 ,0          ,19   ,18   ,23        ,3.549          ,3.47                ,1.023 
2066 ,1          ,17   ,18   ,0         ,3.466          ,3.726               ,0.93  
2066 ,1          ,17   ,18   ,23        ,3.43           ,3.762               ,0.912 
2066 ,1          ,19   ,18   ,0         ,3.419          ,3.851               ,0.888 
2066 ,1          ,19   ,18   ,23        ,3.418          ,3.719               ,0.919 
2067 ,0          ,18   ,19   ,0         ,3.472          ,3.868               ,0.898 
2067 ,0          ,18   ,19   ,23        ,3.484          ,4.045               ,0.861 
2067 ,0          ,20   ,19   ,0         ,3.489          ,3.487               ,1.0   
2067 ,0          ,20   ,19   ,23        ,3.461          ,3.464               ,0.999 
2067 ,1          ,18   ,19   ,0         ,3.404          ,3.725               ,0.914 
2067 ,1          ,18   ,19   ,23        ,3.43           ,3.831               ,0.895 
2067 ,1          ,20   ,19   ,0         ,3.428          ,3.832               ,0.895 
2067 ,1          ,20   ,19   ,23        ,3.429          ,3.845               ,0.892 
2068 ,0          ,19   ,20   ,0         ,3.453          ,3.838               ,0.9   
2068 ,0          ,19   ,20   ,23        ,3.48           ,3.844               ,0.905 
2068 ,0          ,21   ,20   ,0         ,3.497          ,3.497               ,1.0   
2068 ,0          ,21   ,20   ,23        ,3.479          ,3.477               ,1.0   
2068 ,1          ,19   ,20   ,0         ,3.424          ,3.765               ,0.91  
2068 ,1          ,19   ,20   ,23        ,3.412          ,3.735               ,0.914 
2068 ,1          ,21   ,20   ,0         ,3.427          ,3.913               ,0.876 
2068 ,1          ,21   ,20   ,23        ,3.406          ,3.786               ,0.9   
2069 ,0          ,20   ,21   ,0         ,3.436          ,3.796               ,0.905 
2069 ,0          ,20   ,21   ,23        ,3.487          ,3.781               ,0.922 
2069 ,0          ,22   ,21   ,0         ,3.508          ,3.496               ,1.003 
2069 ,0          ,22   ,21   ,23        ,3.489          ,3.479               ,1.003 
2069 ,1          ,20   ,21   ,0         ,3.394          ,3.792               ,0.895 
2069 ,1          ,20   ,21   ,23        ,3.397          ,3.781               ,0.898 
2069 ,1          ,22   ,21   ,0         ,3.418          ,3.828               ,0.893 
2069 ,1          ,22   ,21   ,23        ,3.403          ,3.76                ,0.905 
2070 ,0          ,21   ,22   ,0         ,3.451          ,3.855               ,0.895 
2070 ,0          ,21   ,22   ,23        ,3.428          ,3.835               ,0.894 
2070 ,0          ,23   ,22   ,0         ,3.487          ,3.485               ,1.001 
2070 ,0          ,23   ,22   ,23        ,3.471          ,3.466               ,1.002 
2070 ,1          ,21   ,22   ,0         ,3.394          ,3.784               ,0.897 
2070 ,1          ,21   ,22   ,23        ,3.387          ,3.752               ,0.903 
2070 ,1          ,23   ,22   ,0         ,3.435          ,3.774               ,0.91  
2070 ,1          ,23   ,22   ,23        ,3.403          ,3.708               ,0.918 
2071 ,0          ,22   ,23   ,0         ,3.276          ,3.639               ,0.9   
2071 ,0          ,22   ,23   ,23        ,3.33           ,3.764               ,0.885 
2071 ,0          ,24   ,23   ,0         ,3.419          ,3.429               ,0.997 
2071 ,0          ,24   ,23   ,23        ,3.403          ,3.401               ,1.0   
2071 ,1          ,22   ,23   ,0         ,3.277          ,3.616               ,0.906 
2071 ,1          ,22   ,23   ,23        ,3.276          ,3.854               ,0.85  
2071 ,1          ,24   ,23   ,0         ,3.348          ,3.665               ,0.913 
2071 ,1          ,24   ,23   ,23        ,3.322          ,3.811               ,0.872 
2072 ,0          ,23   ,24   ,0         ,3.473          ,3.881               ,0.895 
2072 ,0          ,23   ,24   ,23        ,3.483          ,3.846               ,0.906 
2072 ,0          ,25   ,24   ,0         ,3.402          ,3.419               ,0.995 
2072 ,0          ,25   ,24   ,23        ,3.485          ,3.384               ,1.03  
2072 ,1          ,23   ,24   ,0         ,3.263          ,3.647               ,0.895 
2072 ,1          ,23   ,24   ,23        ,3.276          ,3.68                ,0.89  
2072 ,1          ,25   ,24   ,0         ,3.35           ,3.731               ,0.898 
2072 ,1          ,25   ,24   ,23        ,3.354          ,3.683               ,0.911 
2073 ,0          ,24   ,25   ,0         ,3.264          ,3.662               ,0.891 
2073 ,0          ,24   ,25   ,23        ,3.301          ,3.751               ,0.88  
2073 ,0          ,26   ,25   ,0         ,3.431          ,3.411               ,1.006 
2073 ,0          ,26   ,25   ,23        ,3.436          ,3.436               ,1.0   
2073 ,1          ,24   ,25   ,0         ,3.268          ,3.694               ,0.885 
2073 ,1          ,24   ,25   ,23        ,3.294          ,3.617               ,0.911 
2073 ,1          ,26   ,25   ,0         ,3.369          ,3.776               ,0.892 
2073 ,1          ,26   ,25   ,23        ,3.388          ,3.786               ,0.895 
2074 ,0          ,25   ,26   ,0         ,3.276          ,3.654               ,0.897 
2074 ,0          ,25   ,26   ,23        ,3.329          ,3.683               ,0.904 
2074 ,0          ,27   ,26   ,0         ,3.419          ,3.424               ,0.999 
2074 ,0          ,27   ,26   ,23        ,3.402          ,3.394               ,1.002 
2074 ,1          ,25   ,26   ,0         ,3.261          ,3.634               ,0.897 
2074 ,1          ,25   ,26   ,23        ,3.26           ,3.648               ,0.894 
2074 ,1          ,27   ,26   ,0         ,3.334          ,3.822               ,0.872 
2074 ,1          ,27   ,26   ,23        ,3.334          ,3.865               ,0.863 
2075 ,0          ,26   ,27   ,0         ,3.271          ,3.72                ,0.879 
2075 ,0          ,26   ,27   ,23        ,3.291          ,3.673               ,0.896 
2075 ,0          ,28   ,27   ,0         ,3.423          ,3.415               ,1.002 
2075 ,0          ,28   ,27   ,23        ,3.402          ,3.386               ,1.005 
2075 ,1          ,26   ,27   ,0         ,3.248          ,3.762               ,0.863 
2075 ,1          ,26   ,27   ,23        ,3.261          ,3.658               ,0.892 
2075 ,1          ,28   ,27   ,0         ,3.358          ,3.749               ,0.896 
2075 ,1          ,28   ,27   ,23        ,3.373          ,3.82                ,0.883 
2076 ,0          ,27   ,28   ,0         ,3.261          ,3.709               ,0.879 
2076 ,0          ,27   ,28   ,23        ,3.268          ,3.627               ,0.901 
2076 ,0          ,29   ,28   ,0         ,3.427          ,3.422               ,1.001 
2076 ,0          ,29   ,28   ,23        ,3.381          ,3.39                ,0.997 
2076 ,1          ,27   ,28   ,0         ,3.273          ,3.618               ,0.904 
2076 ,1          ,27   ,28   ,23        ,3.248          ,3.641               ,0.892 
2076 ,1          ,29   ,28   ,0         ,3.34           ,3.761               ,0.888 
2076 ,1          ,29   ,28   ,23        ,3.353          ,3.824               ,0.877 
2077 ,0          ,28   ,29   ,0         ,3.273          ,3.713               ,0.881 
2077 ,0          ,28   ,29   ,23        ,3.312          ,3.725               ,0.889 
2077 ,0          ,30   ,29   ,0         ,3.42           ,3.43                ,0.997 
2077 ,0          ,30   ,29   ,23        ,3.368          ,3.365               ,1.001 
2077 ,1          ,28   ,29   ,0         ,3.259          ,3.656               ,0.891 
2077 ,1          ,28   ,29   ,23        ,3.284          ,3.626               ,0.906 
2077 ,1          ,30   ,29   ,0         ,3.352          ,3.624               ,0.925 
2077 ,1          ,30   ,29   ,23        ,3.392          ,3.608               ,0.94  
2078 ,0          ,29   ,30   ,0         ,3.241          ,3.74                ,0.867 
2078 ,0          ,29   ,30   ,23        ,3.329          ,3.738               ,0.891 
2078 ,0          ,31   ,30   ,0         ,3.503          ,3.438               ,1.019 
2078 ,0          ,31   ,30   ,23        ,3.464          ,3.394               ,1.021 
2078 ,1          ,29   ,30   ,0         ,3.226          ,3.57                ,0.904 
2078 ,1          ,29   ,30   ,23        ,3.247          ,3.626               ,0.896 
2078 ,1          ,31   ,30   ,0         ,3.273          ,3.643               ,0.898 
2078 ,1          ,31   ,30   ,23        ,3.248          ,3.582               ,0.907 
2079 ,0          ,30   ,31   ,0         ,3.255          ,3.636               ,0.895 
2079 ,0          ,30   ,31   ,23        ,3.283          ,3.665               ,0.896 
2079 ,0          ,32   ,31   ,0         ,3.436          ,3.444               ,0.997 
2079 ,0          ,32   ,31   ,23        ,3.423          ,3.417               ,1.002 
2079 ,1          ,30   ,31   ,0         ,3.31           ,3.66                ,0.904 
2079 ,1          ,30   ,31   ,23        ,3.292          ,3.876               ,0.849 
2079 ,1          ,32   ,31   ,0         ,3.257          ,3.627               ,0.898 
2079 ,1          ,32   ,31   ,23        ,3.245          ,3.728               ,0.871 
21   ,0          ,20   ,21   ,0         ,3.465          ,3.812               ,0.909 
21   ,0          ,20   ,21   ,23        ,3.466          ,3.805               ,0.911 
21   ,0          ,22   ,21   ,0         ,3.464          ,3.45                ,1.004 
21   ,0          ,22   ,21   ,23        ,3.457          ,3.436               ,1.006 
21   ,1          ,20   ,21   ,0         ,3.411          ,3.801               ,0.897 
21   ,1          ,20   ,21   ,23        ,3.415          ,3.805               ,0.897 
21   ,1          ,22   ,21   ,0         ,3.403          ,3.76                ,0.905 
21   ,1          ,22   ,21   ,23        ,3.428          ,3.741               ,0.916 
22   ,0          ,21   ,22   ,0         ,3.523          ,3.95                ,0.892 
22   ,0          ,21   ,22   ,23        ,3.556          ,3.995               ,0.89  
22   ,0          ,23   ,22   ,0         ,3.473          ,3.465               ,1.002 
22   ,0          ,23   ,22   ,23        ,3.51           ,3.446               ,1.018 
22   ,1          ,21   ,22   ,0         ,3.37           ,3.698               ,0.911 
22   ,1          ,21   ,22   ,23        ,3.373          ,3.759               ,0.897 
22   ,1          ,23   ,22   ,0         ,3.414          ,3.718               ,0.918 
22   ,1          ,23   ,22   ,23        ,3.436          ,3.8                 ,0.904 
23   ,0          ,22   ,23   ,0         ,3.275          ,3.709               ,0.883 
23   ,0          ,22   ,23   ,23        ,3.309          ,3.803               ,0.87  
23   ,0          ,24   ,23   ,0         ,3.466          ,3.47                ,0.999 
23   ,0          ,24   ,23   ,23        ,3.431          ,3.428               ,1.001 
23   ,1          ,22   ,23   ,0         ,3.267          ,3.657               ,0.894 
23   ,1          ,22   ,23   ,23        ,3.305          ,3.615               ,0.914 
23   ,1          ,24   ,23   ,0         ,3.37           ,3.752               ,0.898 
23   ,1          ,24   ,23   ,23        ,3.401          ,3.827               ,0.889 
24   ,0          ,23   ,24   ,0         ,3.31           ,3.67                ,0.902 
24   ,0          ,23   ,24   ,23        ,3.321          ,3.74                ,0.888 
24   ,0          ,25   ,24   ,0         ,3.439          ,3.438               ,1.0   
24   ,0          ,25   ,24   ,23        ,3.535          ,3.438               ,1.028 
24   ,1          ,23   ,24   ,0         ,3.28           ,3.61                ,0.909 
24   ,1          ,23   ,24   ,23        ,3.284          ,3.661               ,0.897 
24   ,1          ,25   ,24   ,0         ,3.384          ,3.736               ,0.906 
24   ,1          ,25   ,24   ,23        ,3.424          ,3.787               ,0.904 
25   ,0          ,24   ,25   ,0         ,3.269          ,3.667               ,0.891 
25   ,0          ,24   ,25   ,23        ,3.312          ,3.687               ,0.898 
25   ,0          ,26   ,25   ,0         ,3.411          ,3.436               ,0.993 
25   ,0          ,26   ,25   ,23        ,3.562          ,3.589               ,0.992 
25   ,1          ,24   ,25   ,0         ,3.272          ,3.672               ,0.891 
25   ,1          ,24   ,25   ,23        ,3.317          ,3.68                ,0.901 
25   ,1          ,26   ,25   ,0         ,3.362          ,3.81                ,0.882 
25   ,1          ,26   ,25   ,23        ,3.444          ,3.735               ,0.922 
26   ,0          ,25   ,26   ,0         ,3.268          ,3.693               ,0.885 
26   ,0          ,25   ,26   ,23        ,3.302          ,3.781               ,0.873 
26   ,0          ,27   ,26   ,0         ,3.452          ,3.444               ,1.002 
26   ,0          ,27   ,26   ,23        ,3.45           ,3.439               ,1.003 
26   ,1          ,25   ,26   ,0         ,3.247          ,3.653               ,0.889 
26   ,1          ,25   ,26   ,23        ,3.277          ,3.872               ,0.846 
26   ,1          ,27   ,26   ,0         ,3.442          ,3.868               ,0.89  
26   ,1          ,27   ,26   ,23        ,3.391          ,3.725               ,0.91  
27   ,0          ,26   ,27   ,0         ,3.261          ,3.705               ,0.88  
27   ,0          ,26   ,27   ,23        ,3.305          ,3.738               ,0.884 
27   ,0          ,28   ,27   ,0         ,3.436          ,3.449               ,0.996 
27   ,0          ,28   ,27   ,23        ,3.492          ,3.421               ,1.021 
27   ,1          ,26   ,27   ,0         ,3.264          ,3.63                ,0.899 
27   ,1          ,26   ,27   ,23        ,3.262          ,3.694               ,0.883 
27   ,1          ,28   ,27   ,0         ,3.404          ,3.806               ,0.895 
27   ,1          ,28   ,27   ,23        ,3.406          ,3.845               ,0.886 
28   ,0          ,27   ,28   ,0         ,3.292          ,3.615               ,0.911 
28   ,0          ,27   ,28   ,23        ,3.323          ,3.718               ,0.894 
28   ,0          ,29   ,28   ,0         ,3.439          ,3.444               ,0.998 
28   ,0          ,29   ,28   ,23        ,3.413          ,3.395               ,1.005 
28   ,1          ,27   ,28   ,0         ,3.261          ,3.596               ,0.907 
28   ,1          ,27   ,28   ,23        ,3.265          ,3.853               ,0.847 
28   ,1          ,29   ,28   ,0         ,3.372          ,3.676               ,0.917 
28   ,1          ,29   ,28   ,23        ,3.375          ,3.738               ,0.903 
29   ,0          ,28   ,29   ,0         ,3.246          ,3.689               ,0.88  
29   ,0          ,28   ,29   ,23        ,3.299          ,3.705               ,0.89  
29   ,0          ,30   ,29   ,0         ,3.447          ,3.438               ,1.003 
29   ,0          ,30   ,29   ,23        ,3.425          ,3.422               ,1.001 
29   ,1          ,28   ,29   ,0         ,3.239          ,3.61                ,0.897 
29   ,1          ,28   ,29   ,23        ,3.347          ,3.768               ,0.888 
29   ,1          ,30   ,29   ,0         ,3.261          ,3.658               ,0.892 
29   ,1          ,30   ,29   ,23        ,3.303          ,3.631               ,0.91  
3    ,0          ,192  ,96   ,0         ,6.483          ,5.935               ,1.092 
3    ,0          ,192  ,96   ,23        ,5.869          ,5.858               ,1.002 
3    ,0          ,2    ,3    ,0         ,4.233          ,4.2                 ,1.008 
3    ,0          ,2    ,3    ,23        ,4.233          ,4.242               ,0.998 
3    ,0          ,256  ,3    ,0         ,10.072         ,9.729               ,1.035 
3    ,0          ,256  ,3    ,23        ,10.679         ,10.612              ,1.006 
3    ,0          ,256  ,64   ,0         ,8.268          ,8.29                ,0.997 
3    ,0          ,256  ,64   ,23        ,8.247          ,8.186               ,1.007 
3    ,0          ,256  ,96   ,0         ,7.513          ,7.67                ,0.979 
3    ,0          ,256  ,96   ,23        ,7.592          ,7.747               ,0.98  
3    ,0          ,4    ,3    ,0         ,4.207          ,4.522               ,0.93  
3    ,0          ,4    ,3    ,23        ,4.221          ,4.512               ,0.935 
3    ,0          ,512  ,96   ,0         ,13.282         ,12.996              ,1.022 
3    ,0          ,512  ,96   ,23        ,13.327         ,13.035              ,1.022 
3    ,1          ,192  ,96   ,0         ,5.932          ,5.933               ,1.0   
3    ,1          ,192  ,96   ,23        ,5.886          ,5.938               ,0.991 
3    ,1          ,2    ,3    ,0         ,4.164          ,4.218               ,0.987 
3    ,1          ,2    ,3    ,23        ,4.194          ,4.244               ,0.988 
3    ,1          ,256  ,3    ,0         ,3.393          ,4.06                ,0.836 
3    ,1          ,256  ,3    ,23        ,3.314          ,4.018               ,0.825 
3    ,1          ,256  ,64   ,0         ,5.346          ,5.323               ,1.004 
3    ,1          ,256  ,64   ,23        ,5.401          ,5.379               ,1.004 
3    ,1          ,256  ,96   ,0         ,5.907          ,6.015               ,0.982 
3    ,1          ,256  ,96   ,23        ,5.894          ,5.994               ,0.983 
3    ,1          ,4    ,3    ,0         ,4.222          ,4.462               ,0.946 
3    ,1          ,4    ,3    ,23        ,4.202          ,4.457               ,0.943 
3    ,1          ,512  ,96   ,0         ,6.332          ,6.458               ,0.98  
3    ,1          ,512  ,96   ,23        ,6.048          ,6.186               ,0.978 
30   ,0          ,29   ,30   ,0         ,3.267          ,3.692               ,0.885 
30   ,0          ,29   ,30   ,23        ,3.302          ,3.689               ,0.895 
30   ,0          ,31   ,30   ,0         ,3.436          ,3.436               ,1.0   
30   ,0          ,31   ,30   ,23        ,3.491          ,3.427               ,1.019 
30   ,1          ,29   ,30   ,0         ,3.253          ,3.624               ,0.898 
30   ,1          ,29   ,30   ,23        ,3.271          ,3.603               ,0.908 
30   ,1          ,31   ,30   ,0         ,3.248          ,3.711               ,0.875 
30   ,1          ,31   ,30   ,23        ,3.247          ,3.671               ,0.885 
31   ,0          ,30   ,31   ,0         ,3.289          ,3.637               ,0.904 
31   ,0          ,30   ,31   ,23        ,3.286          ,3.717               ,0.884 
31   ,0          ,32   ,31   ,0         ,3.414          ,3.412               ,1.001 
31   ,0          ,32   ,31   ,23        ,3.457          ,3.388               ,1.02  
31   ,1          ,30   ,31   ,0         ,3.255          ,3.799               ,0.857 
31   ,1          ,30   ,31   ,23        ,3.253          ,3.822               ,0.851 
31   ,1          ,32   ,31   ,0         ,3.263          ,3.634               ,0.898 
31   ,1          ,32   ,31   ,23        ,3.255          ,3.69                ,0.882 
4    ,0          ,192  ,128  ,0         ,5.21           ,5.286               ,0.986 
4    ,0          ,192  ,128  ,23        ,5.27           ,5.233               ,1.007 
4    ,0          ,256  ,128  ,0         ,6.462          ,6.416               ,1.007 
4    ,0          ,256  ,128  ,23        ,6.477          ,6.394               ,1.013 
4    ,0          ,256  ,4    ,0         ,10.073         ,9.654               ,1.043 
4    ,0          ,256  ,4    ,23        ,10.041         ,9.734               ,1.031 
4    ,0          ,256  ,64   ,0         ,8.241          ,8.268               ,0.997 
4    ,0          ,256  ,64   ,23        ,8.268          ,8.297               ,0.996 
4    ,0          ,3    ,4    ,0         ,4.184          ,4.469               ,0.936 
4    ,0          ,3    ,4    ,23        ,4.208          ,4.45                ,0.946 
4    ,0          ,5    ,4    ,0         ,4.371          ,4.509               ,0.969 
4    ,0          ,5    ,4    ,23        ,4.211          ,4.508               ,0.934 
4    ,0          ,512  ,128  ,0         ,13.634         ,13.36               ,1.02  
4    ,0          ,512  ,128  ,23        ,13.683         ,13.272              ,1.031 
4    ,1          ,192  ,128  ,0         ,6.457          ,6.41                ,1.007 
4    ,1          ,192  ,128  ,23        ,6.451          ,6.345               ,1.017 
4    ,1          ,256  ,128  ,0         ,6.449          ,6.421               ,1.004 
4    ,1          ,256  ,128  ,23        ,6.448          ,6.405               ,1.007 
4    ,1          ,256  ,4    ,0         ,3.429          ,4.108               ,0.835 
4    ,1          ,256  ,4    ,23        ,3.362          ,4.021               ,0.836 
4    ,1          ,256  ,64   ,0         ,5.265          ,5.317               ,0.99  
4    ,1          ,256  ,64   ,23        ,5.263          ,5.326               ,0.988 
4    ,1          ,3    ,4    ,0         ,4.21           ,4.562               ,0.923 
4    ,1          ,3    ,4    ,23        ,4.161          ,4.446               ,0.936 
4    ,1          ,5    ,4    ,0         ,4.235          ,4.46                ,0.95  
4    ,1          ,5    ,4    ,23        ,4.204          ,4.69                ,0.896 
4    ,1          ,512  ,128  ,0         ,6.594          ,6.506               ,1.014 
4    ,1          ,512  ,128  ,23        ,6.558          ,6.454               ,1.016 
4081 ,0          ,0    ,1    ,0         ,3.328          ,3.334               ,0.998 
4081 ,0          ,0    ,1    ,23        ,3.421          ,3.473               ,0.985 
4081 ,0          ,1    ,2    ,0         ,3.568          ,3.567               ,1.0   
4081 ,0          ,1    ,2    ,23        ,3.552          ,3.548               ,1.001 
4081 ,0          ,10   ,11   ,0         ,3.488          ,3.796               ,0.919 
4081 ,0          ,10   ,11   ,23        ,3.507          ,3.81                ,0.92  
4081 ,0          ,10   ,9    ,0         ,3.634          ,3.504               ,1.037 
4081 ,0          ,10   ,9    ,23        ,3.59           ,3.453               ,1.04  
4081 ,0          ,11   ,10   ,0         ,3.551          ,3.487               ,1.018 
4081 ,0          ,11   ,10   ,23        ,3.588          ,3.442               ,1.042 
4081 ,0          ,11   ,12   ,0         ,3.491          ,3.805               ,0.917 
4081 ,0          ,11   ,12   ,23        ,3.507          ,3.889               ,0.902 
4081 ,0          ,12   ,11   ,0         ,3.608          ,3.487               ,1.035 
4081 ,0          ,12   ,11   ,23        ,3.691          ,3.476               ,1.062 
4081 ,0          ,12   ,13   ,0         ,3.522          ,3.813               ,0.924 
4081 ,0          ,12   ,13   ,23        ,3.507          ,3.807               ,0.921 
4081 ,0          ,13   ,12   ,0         ,3.608          ,3.479               ,1.037 
4081 ,0          ,13   ,12   ,23        ,3.584          ,3.448               ,1.039 
4081 ,0          ,13   ,14   ,0         ,3.495          ,3.849               ,0.908 
4081 ,0          ,13   ,14   ,23        ,3.526          ,3.806               ,0.927 
4081 ,0          ,14   ,13   ,0         ,3.56           ,3.496               ,1.018 
4081 ,0          ,14   ,13   ,23        ,3.545          ,3.472               ,1.021 
4081 ,0          ,14   ,15   ,0         ,3.453          ,3.832               ,0.901 
4081 ,0          ,14   ,15   ,23        ,3.47           ,3.83                ,0.906 
4081 ,0          ,15   ,14   ,0         ,3.612          ,3.629               ,0.995 
4081 ,0          ,15   ,14   ,23        ,3.605          ,3.48                ,1.036 
4081 ,0          ,15   ,16   ,0         ,3.456          ,3.789               ,0.912 
4081 ,0          ,15   ,16   ,23        ,3.485          ,3.813               ,0.914 
4081 ,0          ,16   ,15   ,0         ,4.358          ,5.092               ,0.856 
4081 ,0          ,16   ,15   ,23        ,4.293          ,4.833               ,0.888 
4081 ,0          ,16   ,17   ,0         ,5.493          ,6.653               ,0.826 
4081 ,0          ,16   ,17   ,23        ,5.899          ,6.629               ,0.89  
4081 ,0          ,17   ,16   ,0         ,4.313          ,5.001               ,0.863 
4081 ,0          ,17   ,16   ,23        ,4.27           ,4.804               ,0.889 
4081 ,0          ,17   ,18   ,0         ,5.592          ,6.589               ,0.849 
4081 ,0          ,17   ,18   ,23        ,5.535          ,6.561               ,0.844 
4081 ,0          ,18   ,17   ,0         ,4.304          ,4.956               ,0.868 
4081 ,0          ,18   ,17   ,23        ,4.269          ,4.925               ,0.867 
4081 ,0          ,18   ,19   ,0         ,5.584          ,6.491               ,0.86  
4081 ,0          ,18   ,19   ,23        ,5.477          ,6.531               ,0.839 
4081 ,0          ,19   ,18   ,0         ,4.269          ,4.875               ,0.876 
4081 ,0          ,19   ,18   ,23        ,4.28           ,4.996               ,0.857 
4081 ,0          ,19   ,20   ,0         ,5.576          ,6.572               ,0.849 
4081 ,0          ,19   ,20   ,23        ,5.478          ,6.529               ,0.839 
4081 ,0          ,2    ,1    ,0         ,3.335          ,3.353               ,0.995 
4081 ,0          ,2    ,1    ,23        ,3.299          ,3.362               ,0.981 
4081 ,0          ,2    ,3    ,0         ,3.583          ,3.523               ,1.017 
4081 ,0          ,2    ,3    ,23        ,3.816          ,3.518               ,1.085 
4081 ,0          ,20   ,19   ,0         ,4.259          ,4.889               ,0.871 
4081 ,0          ,20   ,19   ,23        ,4.261          ,4.978               ,0.856 
4081 ,0          ,20   ,21   ,0         ,5.467          ,6.426               ,0.851 
4081 ,0          ,20   ,21   ,23        ,5.516          ,6.597               ,0.836 
4081 ,0          ,21   ,20   ,0         ,4.275          ,4.977               ,0.859 
4081 ,0          ,21   ,20   ,23        ,4.272          ,4.964               ,0.861 
4081 ,0          ,21   ,22   ,0         ,5.518          ,6.467               ,0.853 
4081 ,0          ,21   ,22   ,23        ,5.442          ,6.613               ,0.823 
4081 ,0          ,22   ,21   ,0         ,4.269          ,4.914               ,0.869 
4081 ,0          ,22   ,21   ,23        ,4.258          ,4.938               ,0.862 
4081 ,0          ,22   ,23   ,0         ,5.204          ,6.275               ,0.829 
4081 ,0          ,22   ,23   ,23        ,5.249          ,6.268               ,0.837 
4081 ,0          ,23   ,22   ,0         ,4.252          ,4.912               ,0.866 
4081 ,0          ,23   ,22   ,23        ,4.36           ,4.926               ,0.885 
4081 ,0          ,23   ,24   ,0         ,5.544          ,6.542               ,0.848 
4081 ,0          ,23   ,24   ,23        ,5.578          ,6.614               ,0.843 
4081 ,0          ,24   ,23   ,0         ,4.167          ,4.923               ,0.846 
4081 ,0          ,24   ,23   ,23        ,4.157          ,4.787               ,0.868 
4081 ,0          ,24   ,25   ,0         ,5.189          ,6.246               ,0.831 
4081 ,0          ,24   ,25   ,23        ,5.29           ,6.232               ,0.849 
4081 ,0          ,25   ,24   ,0         ,4.456          ,5.051               ,0.882 
4081 ,0          ,25   ,24   ,23        ,4.461          ,4.982               ,0.895 
4081 ,0          ,25   ,26   ,0         ,5.197          ,6.236               ,0.833 
4081 ,0          ,25   ,26   ,23        ,5.218          ,6.284               ,0.83  
4081 ,0          ,256  ,128  ,23        ,6.345          ,6.425               ,0.987 
4081 ,0          ,256  ,160  ,23        ,5.787          ,5.736               ,1.009 
4081 ,0          ,256  ,192  ,23        ,5.127          ,5.074               ,1.011 
4081 ,0          ,256  ,224  ,23        ,3.228          ,4.286               ,0.753 
4081 ,0          ,256  ,32   ,23        ,9.424          ,9.114               ,1.034 
4081 ,0          ,256  ,64   ,0         ,8.057          ,8.034               ,1.003 
4081 ,0          ,256  ,64   ,23        ,8.139          ,8.21                ,0.991 
4081 ,0          ,256  ,96   ,23        ,7.521          ,7.755               ,0.97  
4081 ,0          ,26   ,25   ,0         ,4.261          ,4.895               ,0.87  
4081 ,0          ,26   ,25   ,23        ,4.15           ,4.738               ,0.876 
4081 ,0          ,26   ,27   ,0         ,5.226          ,6.194               ,0.844 
4081 ,0          ,26   ,27   ,23        ,5.208          ,6.221               ,0.837 
4081 ,0          ,27   ,26   ,0         ,4.198          ,4.959               ,0.847 
4081 ,0          ,27   ,26   ,23        ,4.207          ,4.838               ,0.87  
4081 ,0          ,27   ,28   ,0         ,5.165          ,6.236               ,0.828 
4081 ,0          ,27   ,28   ,23        ,5.2            ,6.273               ,0.829 
4081 ,0          ,28   ,27   ,0         ,4.175          ,4.877               ,0.856 
4081 ,0          ,28   ,27   ,23        ,4.212          ,4.816               ,0.875 
4081 ,0          ,28   ,29   ,0         ,5.182          ,6.259               ,0.828 
4081 ,0          ,28   ,29   ,23        ,5.202          ,6.294               ,0.827 
4081 ,0          ,29   ,28   ,0         ,4.188          ,4.718               ,0.888 
4081 ,0          ,29   ,28   ,23        ,4.188          ,4.742               ,0.883 
4081 ,0          ,29   ,30   ,0         ,5.184          ,6.245               ,0.83  
4081 ,0          ,29   ,30   ,23        ,5.155          ,6.227               ,0.828 
4081 ,0          ,3    ,2    ,0         ,3.568          ,3.579               ,0.997 
4081 ,0          ,3    ,2    ,23        ,3.576          ,3.567               ,1.002 
4081 ,0          ,3    ,4    ,0         ,3.478          ,3.537               ,0.983 
4081 ,0          ,3    ,4    ,23        ,3.495          ,3.569               ,0.979 
4081 ,0          ,30   ,29   ,0         ,4.319          ,4.964               ,0.87  
4081 ,0          ,30   ,29   ,23        ,4.139          ,4.838               ,0.856 
4081 ,0          ,30   ,31   ,0         ,5.255          ,6.292               ,0.835 
4081 ,0          ,30   ,31   ,23        ,5.205          ,6.271               ,0.83  
4081 ,0          ,31   ,30   ,0         ,4.193          ,4.675               ,0.897 
4081 ,0          ,31   ,30   ,23        ,4.19           ,4.816               ,0.87  
4081 ,0          ,32   ,31   ,0         ,4.277          ,4.788               ,0.893 
4081 ,0          ,32   ,31   ,23        ,4.308          ,4.793               ,0.899 
4081 ,0          ,4    ,3    ,0         ,3.539          ,3.539               ,1.0   
4081 ,0          ,4    ,3    ,23        ,3.535          ,3.541               ,0.998 
4081 ,0          ,4    ,5    ,0         ,3.502          ,3.849               ,0.91  
4081 ,0          ,4    ,5    ,23        ,3.487          ,3.792               ,0.92  
4081 ,0          ,5    ,4    ,0         ,3.515          ,3.513               ,1.001 
4081 ,0          ,5    ,4    ,23        ,3.546          ,3.477               ,1.02  
4081 ,0          ,5    ,6    ,0         ,3.444          ,3.718               ,0.926 
4081 ,0          ,5    ,6    ,23        ,3.519          ,3.728               ,0.944 
4081 ,0          ,6    ,5    ,0         ,3.568          ,3.503               ,1.019 
4081 ,0          ,6    ,5    ,23        ,3.545          ,3.488               ,1.016 
4081 ,0          ,6    ,7    ,0         ,3.495          ,3.789               ,0.922 
4081 ,0          ,6    ,7    ,23        ,3.509          ,3.773               ,0.93  
4081 ,0          ,7    ,6    ,0         ,3.543          ,3.47                ,1.021 
4081 ,0          ,7    ,6    ,23        ,3.51           ,3.43                ,1.023 
4081 ,0          ,7    ,8    ,0         ,3.46           ,3.777               ,0.916 
4081 ,0          ,7    ,8    ,23        ,3.48           ,3.8                 ,0.916 
4081 ,0          ,8    ,7    ,0         ,3.478          ,3.484               ,0.998 
4081 ,0          ,8    ,7    ,23        ,3.437          ,3.447               ,0.997 
4081 ,0          ,8    ,9    ,0         ,3.496          ,3.794               ,0.921 
4081 ,0          ,8    ,9    ,23        ,3.496          ,3.775               ,0.926 
4081 ,0          ,9    ,10   ,0         ,3.499          ,3.8                 ,0.921 
4081 ,0          ,9    ,10   ,23        ,3.496          ,3.813               ,0.917 
4081 ,0          ,9    ,8    ,0         ,3.59           ,3.461               ,1.037 
4081 ,0          ,9    ,8    ,23        ,3.581          ,3.45                ,1.038 
4081 ,1          ,0    ,1    ,0         ,3.322          ,3.322               ,1.0   
4081 ,1          ,0    ,1    ,23        ,3.345          ,3.355               ,0.997 
4081 ,1          ,1    ,2    ,0         ,3.522          ,3.524               ,0.999 
4081 ,1          ,1    ,2    ,23        ,3.531          ,3.523               ,1.002 
4081 ,1          ,10   ,11   ,0         ,3.478          ,3.818               ,0.911 
4081 ,1          ,10   ,11   ,23        ,3.47           ,3.774               ,0.919 
4081 ,1          ,10   ,9    ,0         ,3.487          ,3.826               ,0.911 
4081 ,1          ,10   ,9    ,23        ,3.493          ,3.8                 ,0.919 
4081 ,1          ,11   ,10   ,0         ,3.478          ,3.814               ,0.912 
4081 ,1          ,11   ,10   ,23        ,3.483          ,3.812               ,0.914 
4081 ,1          ,11   ,12   ,0         ,3.463          ,3.771               ,0.918 
4081 ,1          ,11   ,12   ,23        ,3.463          ,3.804               ,0.91  
4081 ,1          ,12   ,11   ,0         ,3.486          ,3.895               ,0.895 
4081 ,1          ,12   ,11   ,23        ,3.644          ,3.801               ,0.959 
4081 ,1          ,12   ,13   ,0         ,3.454          ,3.818               ,0.905 
4081 ,1          ,12   ,13   ,23        ,3.465          ,3.808               ,0.91  
4081 ,1          ,13   ,12   ,0         ,3.461          ,3.791               ,0.913 
4081 ,1          ,13   ,12   ,23        ,3.473          ,3.809               ,0.912 
4081 ,1          ,13   ,14   ,0         ,3.461          ,3.757               ,0.921 
4081 ,1          ,13   ,14   ,23        ,3.444          ,3.726               ,0.924 
4081 ,1          ,14   ,13   ,0         ,3.472          ,3.856               ,0.9   
4081 ,1          ,14   ,13   ,23        ,3.47           ,3.8                 ,0.913 
4081 ,1          ,14   ,15   ,0         ,3.452          ,3.751               ,0.92  
4081 ,1          ,14   ,15   ,23        ,3.453          ,3.739               ,0.923 
4081 ,1          ,15   ,14   ,0         ,3.457          ,3.776               ,0.916 
4081 ,1          ,15   ,14   ,23        ,3.462          ,3.809               ,0.909 
4081 ,1          ,15   ,16   ,0         ,3.453          ,3.796               ,0.91  
4081 ,1          ,15   ,16   ,23        ,3.436          ,3.8                 ,0.904 
4081 ,1          ,16   ,15   ,0         ,5.506          ,6.168               ,0.893 
4081 ,1          ,16   ,15   ,23        ,5.485          ,6.22                ,0.882 
4081 ,1          ,16   ,17   ,0         ,5.501          ,6.576               ,0.837 
4081 ,1          ,16   ,17   ,23        ,6.139          ,6.505               ,0.944 
4081 ,1          ,17   ,16   ,0         ,5.489          ,6.26                ,0.877 
4081 ,1          ,17   ,16   ,23        ,5.488          ,6.179               ,0.888 
4081 ,1          ,17   ,18   ,0         ,5.792          ,6.478               ,0.894 
4081 ,1          ,17   ,18   ,23        ,5.989          ,6.565               ,0.912 
4081 ,1          ,18   ,17   ,0         ,5.48           ,6.292               ,0.871 
4081 ,1          ,18   ,17   ,23        ,5.471          ,6.216               ,0.88  
4081 ,1          ,18   ,19   ,0         ,5.471          ,6.459               ,0.847 
4081 ,1          ,18   ,19   ,23        ,5.632          ,6.597               ,0.854 
4081 ,1          ,19   ,18   ,0         ,5.582          ,6.172               ,0.904 
4081 ,1          ,19   ,18   ,23        ,5.591          ,6.198               ,0.902 
4081 ,1          ,19   ,20   ,0         ,5.474          ,6.548               ,0.836 
4081 ,1          ,19   ,20   ,23        ,5.929          ,6.513               ,0.91  
4081 ,1          ,2    ,1    ,0         ,3.407          ,3.322               ,1.026 
4081 ,1          ,2    ,1    ,23        ,3.346          ,3.291               ,1.017 
4081 ,1          ,2    ,3    ,0         ,3.504          ,3.506               ,0.999 
4081 ,1          ,2    ,3    ,23        ,3.721          ,3.504               ,1.062 
4081 ,1          ,20   ,19   ,0         ,5.43           ,6.217               ,0.873 
4081 ,1          ,20   ,19   ,23        ,5.467          ,6.141               ,0.89  
4081 ,1          ,20   ,21   ,0         ,5.474          ,6.456               ,0.848 
4081 ,1          ,20   ,21   ,23        ,6.175          ,6.515               ,0.948 
4081 ,1          ,21   ,20   ,0         ,5.465          ,6.235               ,0.876 
4081 ,1          ,21   ,20   ,23        ,5.418          ,6.206               ,0.873 
4081 ,1          ,21   ,22   ,0         ,5.401          ,6.443               ,0.838 
4081 ,1          ,21   ,22   ,23        ,5.636          ,6.478               ,0.87  
4081 ,1          ,22   ,21   ,0         ,5.452          ,6.25                ,0.872 
4081 ,1          ,22   ,21   ,23        ,5.589          ,6.216               ,0.899 
4081 ,1          ,22   ,23   ,0         ,5.384          ,6.197               ,0.869 
4081 ,1          ,22   ,23   ,23        ,5.376          ,6.252               ,0.86  
4081 ,1          ,23   ,22   ,0         ,5.519          ,6.078               ,0.908 
4081 ,1          ,23   ,22   ,23        ,5.642          ,6.109               ,0.924 
4081 ,1          ,23   ,24   ,0         ,5.251          ,6.195               ,0.848 
4081 ,1          ,23   ,24   ,23        ,5.572          ,6.267               ,0.889 
4081 ,1          ,24   ,23   ,0         ,5.368          ,6.16                ,0.871 
4081 ,1          ,24   ,23   ,23        ,5.402          ,6.088               ,0.887 
4081 ,1          ,24   ,25   ,0         ,5.345          ,6.252               ,0.855 
4081 ,1          ,24   ,25   ,23        ,5.546          ,6.208               ,0.893 
4081 ,1          ,25   ,24   ,0         ,5.324          ,6.039               ,0.882 
4081 ,1          ,25   ,24   ,23        ,5.375          ,6.101               ,0.881 
4081 ,1          ,25   ,26   ,0         ,5.344          ,6.321               ,0.845 
4081 ,1          ,25   ,26   ,23        ,5.374          ,6.246               ,0.86  
4081 ,1          ,256  ,128  ,23        ,6.281          ,6.339               ,0.991 
4081 ,1          ,256  ,160  ,23        ,7.477          ,7.663               ,0.976 
4081 ,1          ,256  ,192  ,23        ,8.116          ,8.147               ,0.996 
4081 ,1          ,256  ,224  ,23        ,9.5            ,9.063               ,1.048 
4081 ,1          ,256  ,32   ,23        ,3.232          ,4.25                ,0.76  
4081 ,1          ,256  ,64   ,0         ,4.977          ,4.975               ,1.0   
4081 ,1          ,256  ,64   ,23        ,5.114          ,5.19                ,0.985 
4081 ,1          ,256  ,96   ,23        ,5.733          ,5.714               ,1.003 
4081 ,1          ,26   ,25   ,0         ,5.416          ,6.007               ,0.902 
4081 ,1          ,26   ,25   ,23        ,5.249          ,6.035               ,0.87  
4081 ,1          ,26   ,27   ,0         ,5.325          ,6.228               ,0.855 
4081 ,1          ,26   ,27   ,23        ,5.374          ,6.324               ,0.85  
4081 ,1          ,27   ,26   ,0         ,5.611          ,6.046               ,0.928 
4081 ,1          ,27   ,26   ,23        ,5.404          ,6.115               ,0.884 
4081 ,1          ,27   ,28   ,0         ,5.621          ,6.306               ,0.891 
4081 ,1          ,27   ,28   ,23        ,5.453          ,6.267               ,0.87  
4081 ,1          ,28   ,27   ,0         ,5.385          ,6.124               ,0.879 
4081 ,1          ,28   ,27   ,23        ,5.539          ,6.022               ,0.92  
4081 ,1          ,28   ,29   ,0         ,5.337          ,6.177               ,0.864 
4081 ,1          ,28   ,29   ,23        ,5.511          ,6.237               ,0.884 
4081 ,1          ,29   ,28   ,0         ,5.351          ,6.095               ,0.878 
4081 ,1          ,29   ,28   ,23        ,5.318          ,6.129               ,0.868 
4081 ,1          ,29   ,30   ,0         ,5.171          ,6.267               ,0.825 
4081 ,1          ,29   ,30   ,23        ,5.327          ,6.211               ,0.858 
4081 ,1          ,3    ,2    ,0         ,3.549          ,3.799               ,0.934 
4081 ,1          ,3    ,2    ,23        ,3.535          ,3.531               ,1.001 
4081 ,1          ,3    ,4    ,0         ,3.459          ,3.524               ,0.982 
4081 ,1          ,3    ,4    ,23        ,3.472          ,3.674               ,0.945 
4081 ,1          ,30   ,29   ,0         ,5.237          ,5.939               ,0.882 
4081 ,1          ,30   ,29   ,23        ,5.445          ,5.969               ,0.912 
4081 ,1          ,30   ,31   ,0         ,5.192          ,6.329               ,0.82  
4081 ,1          ,30   ,31   ,23        ,5.317          ,6.325               ,0.841 
4081 ,1          ,31   ,30   ,0         ,5.214          ,5.965               ,0.874 
4081 ,1          ,31   ,30   ,23        ,5.407          ,5.984               ,0.904 
4081 ,1          ,32   ,31   ,0         ,5.297          ,5.845               ,0.906 
4081 ,1          ,32   ,31   ,23        ,5.336          ,5.931               ,0.9   
4081 ,1          ,4    ,3    ,0         ,3.531          ,3.67                ,0.962 
4081 ,1          ,4    ,3    ,23        ,3.508          ,3.506               ,1.0   
4081 ,1          ,4    ,5    ,0         ,3.463          ,3.868               ,0.895 
4081 ,1          ,4    ,5    ,23        ,3.47           ,3.788               ,0.916 
4081 ,1          ,5    ,4    ,0         ,3.507          ,3.656               ,0.959 
4081 ,1          ,5    ,4    ,23        ,3.528          ,3.477               ,1.015 
4081 ,1          ,5    ,6    ,0         ,3.473          ,3.738               ,0.929 
4081 ,1          ,5    ,6    ,23        ,3.461          ,3.819               ,0.906 
4081 ,1          ,6    ,5    ,0         ,3.551          ,3.909               ,0.908 
4081 ,1          ,6    ,5    ,23        ,3.599          ,3.901               ,0.923 
4081 ,1          ,6    ,7    ,0         ,3.452          ,3.765               ,0.917 
4081 ,1          ,6    ,7    ,23        ,3.461          ,3.792               ,0.913 
4081 ,1          ,7    ,6    ,0         ,3.427          ,3.755               ,0.913 
4081 ,1          ,7    ,6    ,23        ,3.537          ,3.775               ,0.937 
4081 ,1          ,7    ,8    ,0         ,3.636          ,4.008               ,0.907 
4081 ,1          ,7    ,8    ,23        ,3.625          ,3.944               ,0.919 
4081 ,1          ,8    ,7    ,0         ,3.506          ,3.79                ,0.925 
4081 ,1          ,8    ,7    ,23        ,3.585          ,3.774               ,0.95  
4081 ,1          ,8    ,9    ,0         ,3.52           ,3.924               ,0.897 
4081 ,1          ,8    ,9    ,23        ,3.512          ,3.873               ,0.907 
4081 ,1          ,9    ,10   ,0         ,3.478          ,3.764               ,0.924 
4081 ,1          ,9    ,10   ,23        ,3.487          ,3.822               ,0.912 
4081 ,1          ,9    ,8    ,0         ,3.603          ,3.902               ,0.923 
4081 ,1          ,9    ,8    ,23        ,3.627          ,3.883               ,0.934 
5    ,0          ,192  ,160  ,0         ,3.234          ,4.329               ,0.747 
5    ,0          ,192  ,160  ,23        ,3.358          ,4.273               ,0.786 
5    ,0          ,256  ,160  ,0         ,5.896          ,5.802               ,1.016 
5    ,0          ,256  ,160  ,23        ,6.013          ,6.081               ,0.989 
5    ,0          ,256  ,5    ,0         ,10.077         ,9.818               ,1.026 
5    ,0          ,256  ,5    ,23        ,10.08          ,9.683               ,1.041 
5    ,0          ,256  ,64   ,0         ,8.217          ,8.225               ,0.999 
5    ,0          ,256  ,64   ,23        ,8.213          ,8.278               ,0.992 
5    ,0          ,4    ,5    ,0         ,4.201          ,4.468               ,0.94  
5    ,0          ,4    ,5    ,23        ,4.221          ,4.469               ,0.944 
5    ,0          ,512  ,160  ,0         ,13.621         ,13.395              ,1.017 
5    ,0          ,512  ,160  ,23        ,13.603         ,13.138              ,1.035 
5    ,0          ,6    ,5    ,0         ,4.175          ,4.345               ,0.961 
5    ,0          ,6    ,5    ,23        ,4.216          ,4.49                ,0.939 
5    ,1          ,192  ,160  ,0         ,7.866          ,8.066               ,0.975 
5    ,1          ,192  ,160  ,23        ,7.847          ,8.033               ,0.977 
5    ,1          ,256  ,160  ,0         ,7.574          ,7.718               ,0.981 
5    ,1          ,256  ,160  ,23        ,7.543          ,7.721               ,0.977 
5    ,1          ,256  ,5    ,0         ,3.39           ,4.381               ,0.774 
5    ,1          ,256  ,5    ,23        ,3.396          ,4.384               ,0.775 
5    ,1          ,256  ,64   ,0         ,5.329          ,5.4                 ,0.987 
5    ,1          ,256  ,64   ,23        ,5.284          ,5.306               ,0.996 
5    ,1          ,4    ,5    ,0         ,4.153          ,4.485               ,0.926 
5    ,1          ,4    ,5    ,23        ,4.205          ,4.489               ,0.937 
5    ,1          ,512  ,160  ,0         ,10.136         ,10.086              ,1.005 
5    ,1          ,512  ,160  ,23        ,10.16          ,10.029              ,1.013 
5    ,1          ,6    ,5    ,0         ,4.141          ,4.45                ,0.931 
5    ,1          ,6    ,5    ,23        ,4.159          ,4.407               ,0.944 
6    ,0          ,192  ,192  ,0         ,8.466          ,8.568               ,0.988 
6    ,0          ,192  ,192  ,23        ,8.381          ,8.541               ,0.981 
6    ,0          ,256  ,192  ,0         ,5.349          ,5.326               ,1.004 
6    ,0          ,256  ,192  ,23        ,5.313          ,5.372               ,0.989 
6    ,0          ,256  ,6    ,0         ,10.052         ,9.778               ,1.028 
6    ,0          ,256  ,6    ,23        ,10.047         ,9.727               ,1.033 
6    ,0          ,256  ,64   ,0         ,8.271          ,8.25                ,1.003 
6    ,0          ,256  ,64   ,23        ,8.258          ,8.246               ,1.001 
6    ,0          ,5    ,6    ,0         ,4.174          ,4.491               ,0.929 
6    ,0          ,5    ,6    ,23        ,4.194          ,4.476               ,0.937 
6    ,0          ,512  ,192  ,0         ,13.042         ,13.456              ,0.969 
6    ,0          ,512  ,192  ,23        ,12.808         ,12.509              ,1.024 
6    ,0          ,7    ,6    ,0         ,4.204          ,4.499               ,0.934 
6    ,0          ,7    ,6    ,23        ,4.174          ,4.509               ,0.926 
6    ,1          ,192  ,192  ,0         ,8.472          ,8.607               ,0.984 
6    ,1          ,192  ,192  ,23        ,8.554          ,8.574               ,0.998 
6    ,1          ,256  ,192  ,0         ,8.208          ,8.187               ,1.003 
6    ,1          ,256  ,192  ,23        ,8.208          ,8.232               ,0.997 
6    ,1          ,256  ,6    ,0         ,3.384          ,4.388               ,0.771 
6    ,1          ,256  ,6    ,23        ,3.361          ,4.371               ,0.769 
6    ,1          ,256  ,64   ,0         ,5.348          ,5.409               ,0.989 
6    ,1          ,256  ,64   ,23        ,5.348          ,5.415               ,0.988 
6    ,1          ,5    ,6    ,0         ,4.127          ,4.483               ,0.92  
6    ,1          ,5    ,6    ,23        ,4.113          ,4.404               ,0.934 
6    ,1          ,512  ,192  ,0         ,10.449         ,10.309              ,1.014 
6    ,1          ,512  ,192  ,23        ,10.502         ,10.355              ,1.014 
6    ,1          ,7    ,6    ,0         ,4.147          ,4.488               ,0.924 
6    ,1          ,7    ,6    ,23        ,4.128          ,4.552               ,0.907 
7    ,0          ,192  ,224  ,0         ,8.453          ,8.551               ,0.989 
7    ,0          ,192  ,224  ,23        ,8.44           ,8.569               ,0.985 
7    ,0          ,256  ,224  ,0         ,3.235          ,4.249               ,0.761 
7    ,0          ,256  ,224  ,23        ,3.231          ,4.29                ,0.753 
7    ,0          ,256  ,64   ,0         ,8.219          ,8.284               ,0.992 
7    ,0          ,256  ,64   ,23        ,8.332          ,8.302               ,1.004 
7    ,0          ,256  ,7    ,0         ,10.106         ,9.71                ,1.041 
7    ,0          ,256  ,7    ,23        ,10.066         ,9.764               ,1.031 
7    ,0          ,512  ,224  ,0         ,12.335         ,12.171              ,1.013 
7    ,0          ,512  ,224  ,23        ,12.275         ,12.182              ,1.008 
7    ,0          ,6    ,7    ,0         ,4.185          ,4.495               ,0.931 
7    ,0          ,6    ,7    ,23        ,4.164          ,4.44                ,0.938 
7    ,0          ,8    ,7    ,0         ,4.184          ,4.505               ,0.929 
7    ,0          ,8    ,7    ,23        ,4.171          ,4.415               ,0.945 
7    ,1          ,192  ,224  ,0         ,8.406          ,8.524               ,0.986 
7    ,1          ,192  ,224  ,23        ,8.382          ,8.572               ,0.978 
7    ,1          ,256  ,224  ,0         ,9.559          ,9.064               ,1.055 
7    ,1          ,256  ,224  ,23        ,9.579          ,9.117               ,1.051 
7    ,1          ,256  ,64   ,0         ,5.316          ,5.677               ,0.936 
7    ,1          ,256  ,64   ,23        ,5.332          ,5.502               ,0.969 
7    ,1          ,256  ,7    ,0         ,3.386          ,4.357               ,0.777 
7    ,1          ,256  ,7    ,23        ,3.34           ,4.305               ,0.776 
7    ,1          ,512  ,224  ,0         ,11.153         ,11.117              ,1.003 
7    ,1          ,512  ,224  ,23        ,11.268         ,11.236              ,1.003 
7    ,1          ,6    ,7    ,0         ,4.146          ,4.433               ,0.935 
7    ,1          ,6    ,7    ,23        ,4.13           ,4.414               ,0.936 
7    ,1          ,8    ,7    ,0         ,4.093          ,4.547               ,0.9   
7    ,1          ,8    ,7    ,23        ,4.133          ,4.452               ,0.928 
8    ,0          ,7    ,8    ,0         ,4.17           ,4.412               ,0.945 
8    ,0          ,7    ,8    ,23        ,4.182          ,4.49                ,0.931 
8    ,0          ,9    ,8    ,0         ,4.195          ,4.183               ,1.003 
8    ,0          ,9    ,8    ,23        ,4.318          ,4.436               ,0.973 
8    ,1          ,7    ,8    ,0         ,4.136          ,4.474               ,0.925 
8    ,1          ,7    ,8    ,23        ,4.143          ,4.439               ,0.933 
8    ,1          ,9    ,8    ,0         ,4.155          ,4.435               ,0.937 
8    ,1          ,9    ,8    ,23        ,4.154          ,4.462               ,0.931 
9    ,0          ,10   ,9    ,0         ,4.159          ,4.139               ,1.005 
9    ,0          ,10   ,9    ,23        ,4.166          ,4.519               ,0.922 
9    ,0          ,8    ,9    ,0         ,4.222          ,4.596               ,0.919 
9    ,0          ,8    ,9    ,23        ,4.175          ,4.51                ,0.926 
9    ,1          ,10   ,9    ,0         ,4.143          ,4.474               ,0.926 
9    ,1          ,10   ,9    ,23        ,4.155          ,4.534               ,0.916 
9    ,1          ,8    ,9    ,0         ,4.168          ,4.554               ,0.915 
9    ,1          ,8    ,9    ,23        ,4.172          ,4.452               ,0.937 
geometric mean (new/old): 0.9494839567637894
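
(A standalone C sketch of how that geometric-mean summary can be
computed -- not part of the glibc benchtests; the helper name and the
sample ratios, taken from a few rows above, are illustrative only:

#include <math.h>
#include <stddef.h>
#include <stdio.h>

/* Illustrative helper: geometric mean of per-run new/old ratios,
   accumulated in log-space for numeric safety.  */
static double
geomean (const double *r, size_t n)
{
  double sum = 0.0;
  for (size_t i = 0; i < n; i++)
    sum += log (r[i]);
  return exp (sum / n);
}

int
main (void)
{
  /* Sample new/old ratios from a few rows above.  */
  double r[] = { 0.988, 0.92, 0.934, 1.014 };
  printf ("%f\n", geomean (r, sizeof r / sizeof r[0]));
  return 0;
}
)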

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH v1 5/7] x86: Optimize strrchr-evex.S and implement with VMM headers
  2022-10-18  2:48 ` [PATCH v1 5/7] x86: Optimize strrchr-evex.S and implement with VMM headers Noah Goldstein
@ 2022-10-18  2:52   ` Noah Goldstein
  0 siblings, 0 replies; 41+ messages in thread
From: Noah Goldstein @ 2022-10-18  2:52 UTC (permalink / raw)
  To: libc-alpha; +Cc: hjl.tools, carlos

[-- Attachment #1: Type: text/plain, Size: 18830 bytes --]

On Mon, Oct 17, 2022 at 7:49 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Optimization is:
> 1. Cache latest result in "fast path" loop with `vmovdqu` instead of
>   `kunpckdq`.  This helps if there is more than one match.
>
> Code Size Changes:
> strrchr-evex.S       :  +30 bytes (Same number of cache lines)
>
> Net perf changes:
>
> Reported as geometric mean of all improvements / regressions from N=10
> runs of the benchtests. Value as New Time / Old Time so < 1.0 is
> improvement and > 1.0 is regression.
>
> strrchr-evex.S       : 0.932 (From cases with higher match frequency)
>
> Full results attached in email.
>
> Full check passes on x86-64.
> ---
>  sysdeps/x86_64/multiarch/strrchr-evex.S | 371 +++++++++++++-----------
>  1 file changed, 200 insertions(+), 171 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/strrchr-evex.S b/sysdeps/x86_64/multiarch/strrchr-evex.S
> index 992b45fb47..45487dc87a 100644
> --- a/sysdeps/x86_64/multiarch/strrchr-evex.S
> +++ b/sysdeps/x86_64/multiarch/strrchr-evex.S
> @@ -26,25 +26,30 @@
>  #  define STRRCHR      __strrchr_evex
>  # endif
>
> -# define VMOVU vmovdqu64
> -# define VMOVA vmovdqa64
> +# include "x86-evex256-vecs.h"
>
>  # ifdef USE_AS_WCSRCHR
> -#  define SHIFT_REG    esi
> -
> -#  define kunpck       kunpckbw
> +#  define RCX_M        cl
> +#  define SHIFT_REG    rcx
> +#  define VPCOMPRESS   vpcompressd
> +#  define kunpck_2x    kunpckbw
>  #  define kmov_2x      kmovd
>  #  define maskz_2x     ecx
>  #  define maskm_2x     eax
>  #  define CHAR_SIZE    4
>  #  define VPMIN        vpminud
>  #  define VPTESTN      vptestnmd
> +#  define VPTEST       vptestmd
>  #  define VPBROADCAST  vpbroadcastd
> +#  define VPCMPEQ      vpcmpeqd
>  #  define VPCMP        vpcmpd
> -# else
> -#  define SHIFT_REG    edi
>
> -#  define kunpck       kunpckdq
> +#  define USE_WIDE_CHAR
> +# else
> +#  define RCX_M        ecx
> +#  define SHIFT_REG    rdi
> +#  define VPCOMPRESS   vpcompressb
> +#  define kunpck_2x    kunpckdq
>  #  define kmov_2x      kmovq
>  #  define maskz_2x     rcx
>  #  define maskm_2x     rax
> @@ -52,58 +57,48 @@
>  #  define CHAR_SIZE    1
>  #  define VPMIN        vpminub
>  #  define VPTESTN      vptestnmb
> +#  define VPTEST       vptestmb
>  #  define VPBROADCAST  vpbroadcastb
> +#  define VPCMPEQ      vpcmpeqb
>  #  define VPCMP        vpcmpb
>  # endif
>
> -# define XMMZERO       xmm16
> -# define YMMZERO       ymm16
> -# define YMMMATCH      ymm17
> -# define YMMSAVE       ymm18
> +# include "reg-macros.h"
>
> -# define YMM1  ymm19
> -# define YMM2  ymm20
> -# define YMM3  ymm21
> -# define YMM4  ymm22
> -# define YMM5  ymm23
> -# define YMM6  ymm24
> -# define YMM7  ymm25
> -# define YMM8  ymm26
> -
> -
> -# define VEC_SIZE      32
> +# define VMATCH        VMM(0)
> +# define CHAR_PER_VEC  (VEC_SIZE / CHAR_SIZE)
>  # define PAGE_SIZE     4096
> -       .section .text.evex, "ax", @progbits
> -ENTRY(STRRCHR)
> +
> +       .section SECTION(.text), "ax", @progbits
> +ENTRY_P2ALIGN(STRRCHR, 6)
>         movl    %edi, %eax
> -       /* Broadcast CHAR to YMMMATCH.  */
> -       VPBROADCAST %esi, %YMMMATCH
> +       /* Broadcast CHAR to VMATCH.  */
> +       VPBROADCAST %esi, %VMATCH
>
>         andl    $(PAGE_SIZE - 1), %eax
>         cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
>         jg      L(cross_page_boundary)
>
> -L(page_cross_continue):
> -       VMOVU   (%rdi), %YMM1
> -       /* k0 has a 1 for each zero CHAR in YMM1.  */
> -       VPTESTN %YMM1, %YMM1, %k0
> -       kmovd   %k0, %ecx
> -       testl   %ecx, %ecx
> +       VMOVU   (%rdi), %VMM(1)
> +       /* k0 has a 1 for each zero CHAR in VEC(1).  */
> +       VPTESTN %VMM(1), %VMM(1), %k0
> +       KMOV    %k0, %VRSI
> +       test    %VRSI, %VRSI
>         jz      L(aligned_more)
>         /* fallthrough: zero CHAR in first VEC.  */
> -
> -       /* K1 has a 1 for each search CHAR match in YMM1.  */
> -       VPCMP   $0, %YMMMATCH, %YMM1, %k1
> -       kmovd   %k1, %eax
> +L(page_cross_return):
> +       /* K1 has a 1 for each search CHAR match in VEC(1).  */
> +       VPCMPEQ %VMATCH, %VMM(1), %k1
> +       KMOV    %k1, %VRAX
>         /* Build mask up until first zero CHAR (used to mask of
>            potential search CHAR matches past the end of the string).
>          */
> -       blsmskl %ecx, %ecx
> -       andl    %ecx, %eax
> +       blsmsk  %VRSI, %VRSI
> +       and     %VRSI, %VRAX
>         jz      L(ret0)
> -       /* Get last match (the `andl` removed any out of bounds
> -          matches).  */
> -       bsrl    %eax, %eax
> +       /* Get last match (the `and` removed any out of bounds matches).
> +        */
> +       bsr     %VRAX, %VRAX
>  # ifdef USE_AS_WCSRCHR
>         leaq    (%rdi, %rax, CHAR_SIZE), %rax
>  # else
> @@ -116,22 +111,22 @@ L(ret0):
>            search path for earlier matches.  */
>         .p2align 4,, 6
>  L(first_vec_x1):
> -       VPCMP   $0, %YMMMATCH, %YMM2, %k1
> -       kmovd   %k1, %eax
> -       blsmskl %ecx, %ecx
> +       VPCMPEQ %VMATCH, %VMM(2), %k1
> +       KMOV    %k1, %VRAX
> +       blsmsk  %VRCX, %VRCX
>         /* eax non-zero if search CHAR in range.  */
> -       andl    %ecx, %eax
> +       and     %VRCX, %VRAX
>         jnz     L(first_vec_x1_return)
>
> -       /* fallthrough: no match in YMM2 then need to check for earlier
> -          matches (in YMM1).  */
> +       /* fallthrough: no match in VEC(2) then need to check for
> +          earlier matches (in VEC(1)).  */
>         .p2align 4,, 4
>  L(first_vec_x0_test):
> -       VPCMP   $0, %YMMMATCH, %YMM1, %k1
> -       kmovd   %k1, %eax
> -       testl   %eax, %eax
> +       VPCMPEQ %VMATCH, %VMM(1), %k1
> +       KMOV    %k1, %VRAX
> +       test    %VRAX, %VRAX
>         jz      L(ret1)
> -       bsrl    %eax, %eax
> +       bsr     %VRAX, %VRAX
>  # ifdef USE_AS_WCSRCHR
>         leaq    (%rsi, %rax, CHAR_SIZE), %rax
>  # else
> @@ -142,129 +137,144 @@ L(ret1):
>
>         .p2align 4,, 10
>  L(first_vec_x1_or_x2):
> -       VPCMP   $0, %YMM3, %YMMMATCH, %k3
> -       VPCMP   $0, %YMM2, %YMMMATCH, %k2
> +       VPCMPEQ %VMM(3), %VMATCH, %k3
> +       VPCMPEQ %VMM(2), %VMATCH, %k2
>         /* K2 and K3 have 1 for any search CHAR match. Test if any
> -          matches between either of them. Otherwise check YMM1.  */
> -       kortestd %k2, %k3
> +          matches between either of them. Otherwise check VEC(1).  */
> +       KORTEST %k2, %k3
>         jz      L(first_vec_x0_test)
>
> -       /* Guranteed that YMM2 and YMM3 are within range so merge the
> -          two bitmasks then get last result.  */
> -       kunpck  %k2, %k3, %k3
> -       kmovq   %k3, %rax
> -       bsrq    %rax, %rax
> -       leaq    (VEC_SIZE)(%r8, %rax, CHAR_SIZE), %rax
> +       /* Guaranteed that VEC(2) and VEC(3) are within range so merge
> +          the two bitmasks then get last result.  */
> +       kunpck_2x %k2, %k3, %k3
> +       kmov_2x %k3, %maskm_2x
> +       bsr     %maskm_2x, %maskm_2x
> +       leaq    (VEC_SIZE * 1)(%r8, %rax, CHAR_SIZE), %rax
>         ret
>
> -       .p2align 4,, 6
> +       .p2align 4,, 7
>  L(first_vec_x3):
> -       VPCMP   $0, %YMMMATCH, %YMM4, %k1
> -       kmovd   %k1, %eax
> -       blsmskl %ecx, %ecx
> -       /* If no search CHAR match in range check YMM1/YMM2/YMM3.  */
> -       andl    %ecx, %eax
> +       VPCMPEQ %VMATCH, %VMM(4), %k1
> +       KMOV    %k1, %VRAX
> +       blsmsk  %VRCX, %VRCX
> +       /* If no search CHAR match in range check VEC(1)/VEC(2)/VEC(3).
> +        */
> +       and     %VRCX, %VRAX
>         jz      L(first_vec_x1_or_x2)
> -       bsrl    %eax, %eax
> +       bsr     %VRAX, %VRAX
>         leaq    (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
>         ret
>
> +
>         .p2align 4,, 6
>  L(first_vec_x0_x1_test):
> -       VPCMP   $0, %YMMMATCH, %YMM2, %k1
> -       kmovd   %k1, %eax
> -       /* Check YMM2 for last match first. If no match try YMM1.  */
> -       testl   %eax, %eax
> +       VPCMPEQ %VMATCH, %VMM(2), %k1
> +       KMOV    %k1, %VRAX
> +       /* Check VEC(2) for last match first. If no match try VEC(1).
> +        */
> +       test    %VRAX, %VRAX
>         jz      L(first_vec_x0_test)
>         .p2align 4,, 4
>  L(first_vec_x1_return):
> -       bsrl    %eax, %eax
> +       bsr     %VRAX, %VRAX
>         leaq    (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
>         ret
>
> +
>         .p2align 4,, 10
>  L(first_vec_x2):
> -       VPCMP   $0, %YMMMATCH, %YMM3, %k1
> -       kmovd   %k1, %eax
> -       blsmskl %ecx, %ecx
> -       /* Check YMM3 for last match first. If no match try YMM2/YMM1.
> -        */
> -       andl    %ecx, %eax
> +       VPCMPEQ %VMATCH, %VMM(3), %k1
> +       KMOV    %k1, %VRAX
> +       blsmsk  %VRCX, %VRCX
> +       /* Check VEC(3) for last match first. If no match try
> +          VEC(2)/VEC(1).  */
> +       and     %VRCX, %VRAX
>         jz      L(first_vec_x0_x1_test)
> -       bsrl    %eax, %eax
> +       bsr     %VRAX, %VRAX
>         leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
>         ret
>
>
> -       .p2align 4
> +       .p2align 4,, 12
>  L(aligned_more):
> -       /* Need to keep original pointer incase YMM1 has last match.  */
> +L(page_cross_continue):
> +       /* Need to keep original pointer in case VEC(1) has last match.
> +        */
>         movq    %rdi, %rsi
>         andq    $-VEC_SIZE, %rdi
> -       VMOVU   VEC_SIZE(%rdi), %YMM2
> -       VPTESTN %YMM2, %YMM2, %k0
> -       kmovd   %k0, %ecx
> -       testl   %ecx, %ecx
> +
> +       VMOVU   VEC_SIZE(%rdi), %VMM(2)
> +       VPTESTN %VMM(2), %VMM(2), %k0
> +       KMOV    %k0, %VRCX
> +
> +       test    %VRCX, %VRCX
>         jnz     L(first_vec_x1)
>
> -       VMOVU   (VEC_SIZE * 2)(%rdi), %YMM3
> -       VPTESTN %YMM3, %YMM3, %k0
> -       kmovd   %k0, %ecx
> -       testl   %ecx, %ecx
> +       VMOVU   (VEC_SIZE * 2)(%rdi), %VMM(3)
> +       VPTESTN %VMM(3), %VMM(3), %k0
> +       KMOV    %k0, %VRCX
> +
> +       test    %VRCX, %VRCX
>         jnz     L(first_vec_x2)
>
> -       VMOVU   (VEC_SIZE * 3)(%rdi), %YMM4
> -       VPTESTN %YMM4, %YMM4, %k0
> -       kmovd   %k0, %ecx
> +       VMOVU   (VEC_SIZE * 3)(%rdi), %VMM(4)
> +       VPTESTN %VMM(4), %VMM(4), %k0
> +       KMOV    %k0, %VRCX
>         movq    %rdi, %r8
> -       testl   %ecx, %ecx
> +       test    %VRCX, %VRCX
>         jnz     L(first_vec_x3)
>
>         andq    $-(VEC_SIZE * 2), %rdi
> -       .p2align 4
> +       .p2align 4,, 10
>  L(first_aligned_loop):
> -       /* Preserve YMM1, YMM2, YMM3, and YMM4 until we can gurantee
> -          they don't store a match.  */
> -       VMOVA   (VEC_SIZE * 4)(%rdi), %YMM5
> -       VMOVA   (VEC_SIZE * 5)(%rdi), %YMM6
> +       /* Preserve VEC(1), VEC(2), VEC(3), and VEC(4) until we can
> +          guarantee they don't store a match.  */
> +       VMOVA   (VEC_SIZE * 4)(%rdi), %VMM(5)
> +       VMOVA   (VEC_SIZE * 5)(%rdi), %VMM(6)
>
> -       VPCMP   $0, %YMM5, %YMMMATCH, %k2
> -       vpxord  %YMM6, %YMMMATCH, %YMM7
> +       VPCMPEQ %VMM(5), %VMATCH, %k2
> +       vpxord  %VMM(6), %VMATCH, %VMM(7)
>
> -       VPMIN   %YMM5, %YMM6, %YMM8
> -       VPMIN   %YMM8, %YMM7, %YMM7
> +       VPMIN   %VMM(5), %VMM(6), %VMM(8)
> +       VPMIN   %VMM(8), %VMM(7), %VMM(7)
>
> -       VPTESTN %YMM7, %YMM7, %k1
> +       VPTESTN %VMM(7), %VMM(7), %k1
>         subq    $(VEC_SIZE * -2), %rdi
> -       kortestd %k1, %k2
> +       KORTEST %k1, %k2
>         jz      L(first_aligned_loop)
>
> -       VPCMP   $0, %YMM6, %YMMMATCH, %k3
> -       VPTESTN %YMM8, %YMM8, %k1
> -       ktestd  %k1, %k1
> +       VPCMPEQ %VMM(6), %VMATCH, %k3
> +       VPTESTN %VMM(8), %VMM(8), %k1
> +
> +       /* If k1 is zero, then we found a CHAR match but no null-term.
> +          We can now safely throw out VEC1-4.  */
> +       KTEST   %k1, %k1
>         jz      L(second_aligned_loop_prep)
>
> -       kortestd %k2, %k3
> +       KORTEST %k2, %k3
>         jnz     L(return_first_aligned_loop)
>
> +
>         .p2align 4,, 6
>  L(first_vec_x1_or_x2_or_x3):
> -       VPCMP   $0, %YMM4, %YMMMATCH, %k4
> -       kmovd   %k4, %eax
> -       testl   %eax, %eax
> +       VPCMPEQ %VMM(4), %VMATCH, %k4
> +       KMOV    %k4, %VRAX
> +       bsr     %VRAX, %VRAX
>         jz      L(first_vec_x1_or_x2)
> -       bsrl    %eax, %eax
>         leaq    (VEC_SIZE * 3)(%r8, %rax, CHAR_SIZE), %rax
>         ret
>
> +
>         .p2align 4,, 8
>  L(return_first_aligned_loop):
> -       VPTESTN %YMM5, %YMM5, %k0
> -       kunpck  %k0, %k1, %k0
> +       VPTESTN %VMM(5), %VMM(5), %k0
> +
> +       /* Combined results from VEC5/6.  */
> +       kunpck_2x %k0, %k1, %k0
>         kmov_2x %k0, %maskz_2x
>
>         blsmsk  %maskz_2x, %maskz_2x
> -       kunpck  %k2, %k3, %k3
> +       kunpck_2x %k2, %k3, %k3
>         kmov_2x %k3, %maskm_2x
>         and     %maskz_2x, %maskm_2x
>         jz      L(first_vec_x1_or_x2_or_x3)
> @@ -280,47 +290,62 @@ L(return_first_aligned_loop):
>  L(second_aligned_loop_prep):
>  L(second_aligned_loop_set_furthest_match):
>         movq    %rdi, %rsi
> -       kunpck  %k2, %k3, %k4
> -
> +       /* Ideally we would save k2/k3 but `kmov/kunpck` take uops on
> +          port0 and have noticeable overhead in the loop.  */
> +       VMOVA   %VMM(5), %VMM(7)
> +       VMOVA   %VMM(6), %VMM(8)
>         .p2align 4
>  L(second_aligned_loop):
> -       VMOVU   (VEC_SIZE * 4)(%rdi), %YMM1
> -       VMOVU   (VEC_SIZE * 5)(%rdi), %YMM2
> -
> -       VPCMP   $0, %YMM1, %YMMMATCH, %k2
> -       vpxord  %YMM2, %YMMMATCH, %YMM3
> +       VMOVU   (VEC_SIZE * 4)(%rdi), %VMM(5)
> +       VMOVU   (VEC_SIZE * 5)(%rdi), %VMM(6)
> +       VPCMPEQ %VMM(5), %VMATCH, %k2
> +       vpxord  %VMM(6), %VMATCH, %VMM(3)
>
> -       VPMIN   %YMM1, %YMM2, %YMM4
> -       VPMIN   %YMM3, %YMM4, %YMM3
> +       VPMIN   %VMM(5), %VMM(6), %VMM(4)
> +       VPMIN   %VMM(3), %VMM(4), %VMM(3)
>
> -       VPTESTN %YMM3, %YMM3, %k1
> +       VPTESTN %VMM(3), %VMM(3), %k1
>         subq    $(VEC_SIZE * -2), %rdi
> -       kortestd %k1, %k2
> +       KORTEST %k1, %k2
>         jz      L(second_aligned_loop)
> -
> -       VPCMP   $0, %YMM2, %YMMMATCH, %k3
> -       VPTESTN %YMM4, %YMM4, %k1
> -       ktestd  %k1, %k1
> +       VPCMPEQ %VMM(6), %VMATCH, %k3
> +       VPTESTN %VMM(4), %VMM(4), %k1
> +       KTEST   %k1, %k1
>         jz      L(second_aligned_loop_set_furthest_match)
>
> -       kortestd %k2, %k3
> -       /* branch here because there is a significant advantage interms
> -          of output dependency chance in using edx.  */
> +       /* branch here because we know we have a match in VEC7/8 but
> +          might not in VEC5/6 so the latter is expected to be less
> +          likely.  */
> +       KORTEST %k2, %k3
>         jnz     L(return_new_match)
> +
>  L(return_old_match):
> -       kmovq   %k4, %rax
> -       bsrq    %rax, %rax
> -       leaq    (VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %rax
> +       VPCMPEQ %VMM(8), %VMATCH, %k0
> +       KMOV    %k0, %VRCX
> +       bsr     %VRCX, %VRCX
> +       jnz     L(return_old_match_ret)
> +
> +       VPCMPEQ %VMM(7), %VMATCH, %k0
> +       KMOV    %k0, %VRCX
> +       bsr     %VRCX, %VRCX
> +       subq    $VEC_SIZE, %rsi
> +L(return_old_match_ret):
> +       leaq    (VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %rax
>         ret
>
> +       .p2align 4,, 10
>  L(return_new_match):
> -       VPTESTN %YMM1, %YMM1, %k0
> -       kunpck  %k0, %k1, %k0
> +       VPTESTN %VMM(5), %VMM(5), %k0
> +
> +       /* Combined results from VEC5/6.  */
> +       kunpck_2x %k0, %k1, %k0
>         kmov_2x %k0, %maskz_2x
>
>         blsmsk  %maskz_2x, %maskz_2x
> -       kunpck  %k2, %k3, %k3
> +       kunpck_2x %k2, %k3, %k3
>         kmov_2x %k3, %maskm_2x
> +
> +       /* Match at end was out-of-bounds so use last known match.  */
>         and     %maskz_2x, %maskm_2x
>         jz      L(return_old_match)
>
> @@ -328,49 +353,53 @@ L(return_new_match):
>         leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
>         ret
>
> +       .p2align 4,, 4
>  L(cross_page_boundary):
> -       /* eax contains all the page offset bits of src (rdi). `xor rdi,
> -          rax` sets pointer will all page offset bits cleared so
> -          offset of (PAGE_SIZE - VEC_SIZE) will get last aligned VEC
> -          before page cross (guranteed to be safe to read). Doing this
> -          as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax` saves
> -          a bit of code size.  */
>         xorq    %rdi, %rax
> -       VMOVU   (PAGE_SIZE - VEC_SIZE)(%rax), %YMM1
> -       VPTESTN %YMM1, %YMM1, %k0
> -       kmovd   %k0, %ecx
> +       mov     $-1, %VRDX
> +       VMOVU   (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(6)
> +       VPTESTN %VMM(6), %VMM(6), %k0
> +       KMOV    %k0, %VRSI
> +
> +# ifdef USE_AS_WCSRCHR
> +       movl    %edi, %ecx
> +       and     $(VEC_SIZE - 1), %ecx
> +       shrl    $2, %ecx
> +# endif
> +       shlx    %VGPR(SHIFT_REG), %VRDX, %VRDX
>
> -       /* Shift out zero CHAR matches that are before the begining of
> -          src (rdi).  */
>  # ifdef USE_AS_WCSRCHR
> -       movl    %edi, %esi
> -       andl    $(VEC_SIZE - 1), %esi
> -       shrl    $2, %esi
> +       kmovb   %edx, %k1
> +# else
> +       KMOV    %VRDX, %k1
>  # endif
> -       shrxl   %SHIFT_REG, %ecx, %ecx
>
> -       testl   %ecx, %ecx
> +       /* Need to adjust result to VEC(1) so it can be re-used by
> +          L(return_vec_x0_test).  The alternative is to collect VEC(1)
> +          with a page cross load which is far more expensive.  */
> +       VPCOMPRESS %VMM(6), %VMM(1){%k1}{z}
> +
> +       /* We could technically just jmp back after the vpcompress but
> +          it doesn't save any 16-byte blocks.  */
> +       shrx    %VGPR(SHIFT_REG), %VRSI, %VRSI
> +       test    %VRSI, %VRSI
>         jz      L(page_cross_continue)
>
> -       /* Found zero CHAR so need to test for search CHAR.  */
> -       VPCMP   $0, %YMMMATCH, %YMM1, %k1
> -       kmovd   %k1, %eax
> -       /* Shift out search CHAR matches that are before the begining of
> -          src (rdi).  */
> -       shrxl   %SHIFT_REG, %eax, %eax
> -
> -       /* Check if any search CHAR match in range.  */
> -       blsmskl %ecx, %ecx
> -       andl    %ecx, %eax
> -       jz      L(ret3)
> -       bsrl    %eax, %eax
> +       /* Duplicate of return logic from ENTRY. Doesn't cause spill to
> +          next cache line so might as well copy it here.  */
> +       VPCMPEQ %VMATCH, %VMM(1), %k1
> +       KMOV    %k1, %VRAX
> +       blsmsk  %VRSI, %VRSI
> +       and     %VRSI, %VRAX
> +       jz      L(ret_page_cross)
> +       bsr     %VRAX, %VRAX
>  # ifdef USE_AS_WCSRCHR
>         leaq    (%rdi, %rax, CHAR_SIZE), %rax
>  # else
>         addq    %rdi, %rax
>  # endif
> -L(ret3):
> +L(ret_page_cross):
>         ret
> -
> +       /* 1 byte till next cache line.  */
>  END(STRRCHR)
>  #endif
> --
> 2.34.1
>
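
The "cache latest result" idea above is easier to see in scalar form.
Below is a minimal C sketch of the overall strrchr strategy the vector
code implements (illustration only, not the vectorized implementation):
scan forward, remember the most recent match, and stop at the null
terminator. In the EVEX code the "latest match" state is a pair of
vectors (VEC(7)/VEC(8)) saved with a plain vector move (VMOVA) rather
than a pointer or a kunpck'ed mask pair.

#include <stddef.h>

/* Scalar sketch.  Note that if C is '\0' this returns a pointer to the
   terminator, matching standard strrchr semantics.  */
static char *
strrchr_sketch (const char *s, int c)
{
  const char *last = NULL;
  for (;; s++)
    {
      if (*s == (char) c)
        last = s;               /* Cache the latest ("furthest") match.  */
      if (*s == '\0')
        return (char *) last;
    }
}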

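The in-bounds masking used throughout the patch (blsmsk of the
null-terminator mask ANDed with the match mask, then bsr) also has a
compact scalar analogue. A standalone C sketch using the blsmsk
identity z ^ (z - 1) and __builtin_clz in place of bsr (the function
name and test constants here are illustrative, not from the patch):

#include <stdint.h>
#include <stdio.h>

/* Return the bit index of the last match at or before the first null
   CHAR, or 32 if there is none.  blsmsk(z) = z ^ (z - 1) sets every
   bit up to and including the lowest set bit of z, so it bounds the
   search to positions inside the string (blsmsk(0) = all-ones, i.e. no
   null CHAR in this vector).  */
static unsigned
last_in_bounds_match (uint32_t match_mask, uint32_t zero_mask)
{
  uint32_t bound = zero_mask ^ (zero_mask - 1);  /* blsmsk  */
  uint32_t m = match_mask & bound;
  if (m == 0)
    return 32;
  return 31 - (unsigned) __builtin_clz (m);      /* bsr  */
}

int
main (void)
{
  /* Matches at bits 1 and 5; null terminator at bit 3, so only the
     match at bit 1 is in bounds.  Prints 1.  */
  printf ("%u\n", last_in_bounds_match ((1u << 1) | (1u << 5), 1u << 3));
  return 0;
}
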
[-- Attachment #2: strrchr.txt --]
[-- Type: text/plain, Size: 146277 bytes --]

Results For: strrchr
align,freq ,len  ,max_char ,pos  ,seek ,__strrchr_evex ,__strrchr_evex_orig ,new/old
0    ,1    ,1    ,127      ,0    ,0    ,3.544          ,3.517               ,1.007 
0    ,1    ,1    ,127      ,0    ,23   ,3.516          ,3.443               ,1.021 
0    ,1    ,10   ,127      ,9    ,0    ,3.696          ,3.703               ,0.998 
0    ,1    ,10   ,127      ,9    ,23   ,3.686          ,3.694               ,0.998 
0    ,1    ,1024 ,127      ,0    ,0    ,3.541          ,3.424               ,1.034 
0    ,1    ,1024 ,127      ,0    ,23   ,33.578         ,34.335              ,0.978 
0    ,1    ,1024 ,127      ,1024 ,0    ,32.236         ,33.703              ,0.956 
0    ,1    ,1024 ,127      ,1024 ,23   ,32.255         ,33.207              ,0.971 
0    ,1    ,1024 ,127      ,144  ,0    ,9.681          ,10.761              ,0.9   
0    ,1    ,1024 ,127      ,144  ,23   ,32.131         ,33.112              ,0.97  
0    ,1    ,1024 ,127      ,192  ,0    ,11.173         ,12.72               ,0.878 
0    ,1    ,1024 ,127      ,192  ,23   ,32.363         ,33.122              ,0.977 
0    ,1    ,1024 ,127      ,240  ,0    ,11.068         ,12.692              ,0.872 
0    ,1    ,1024 ,127      ,240  ,23   ,33.127         ,34.004              ,0.974 
0    ,1    ,1024 ,127      ,288  ,0    ,12.579         ,14.452              ,0.87  
0    ,1    ,1024 ,127      ,288  ,23   ,32.186         ,33.59               ,0.958 
0    ,1    ,1024 ,127      ,48   ,0    ,4.728          ,5.669               ,0.834 
0    ,1    ,1024 ,127      ,48   ,23   ,33.103         ,35.064              ,0.944 
0    ,1    ,1024 ,127      ,736  ,0    ,23.846         ,25.818              ,0.924 
0    ,1    ,1024 ,127      ,736  ,23   ,32.278         ,34.687              ,0.931 
0    ,1    ,1024 ,127      ,784  ,0    ,25.566         ,27.51               ,0.929 
0    ,1    ,1024 ,127      ,784  ,23   ,34.801         ,35.27               ,0.987 
0    ,1    ,1024 ,127      ,832  ,0    ,27.218         ,29.017              ,0.938 
0    ,1    ,1024 ,127      ,832  ,23   ,35.255         ,36.369              ,0.969 
0    ,1    ,1024 ,127      ,880  ,0    ,27.413         ,29.169              ,0.94  
0    ,1    ,1024 ,127      ,880  ,23   ,35.739         ,38.252              ,0.934 
0    ,1    ,1024 ,127      ,928  ,0    ,29.056         ,30.715              ,0.946 
0    ,1    ,1024 ,127      ,928  ,23   ,34.646         ,36.301              ,0.954 
0    ,1    ,1024 ,127      ,96   ,0    ,5.923          ,7.163               ,0.827 
0    ,1    ,1024 ,127      ,96   ,23   ,31.587         ,33.085              ,0.955 
0    ,1    ,1024 ,127      ,976  ,0    ,30.784         ,32.313              ,0.953 
0    ,1    ,1024 ,127      ,976  ,23   ,35.76          ,36.116              ,0.99  
0    ,1    ,1072 ,127      ,1024 ,0    ,32.421         ,33.863              ,0.957 
0    ,1    ,1072 ,127      ,1024 ,23   ,35.049         ,37.781              ,0.928 
0    ,1    ,11   ,127      ,10   ,0    ,3.7            ,3.705               ,0.999 
0    ,1    ,11   ,127      ,10   ,23   ,3.697          ,3.699               ,0.999 
0    ,1    ,112  ,127      ,144  ,0    ,5.914          ,7.112               ,0.832 
0    ,1    ,112  ,127      ,144  ,23   ,6.616          ,8.659               ,0.764 
0    ,1    ,112  ,127      ,16   ,0    ,3.565          ,3.576               ,0.997 
0    ,1    ,112  ,127      ,16   ,23   ,7.888          ,8.337               ,0.946 
0    ,1    ,112  ,127      ,256  ,0    ,5.916          ,6.985               ,0.847 
0    ,1    ,112  ,127      ,256  ,23   ,6.581          ,8.642               ,0.762 
0    ,1    ,112  ,127      ,64   ,0    ,5.352          ,6.955               ,0.77  
0    ,1    ,112  ,127      ,64   ,23   ,7.401          ,8.403               ,0.881 
0    ,1    ,112  ,127      ,96   ,0    ,7.084          ,7.239               ,0.979 
0    ,1    ,112  ,127      ,96   ,23   ,6.146          ,7.451               ,0.825 
0    ,1    ,1120 ,127      ,1024 ,0    ,32.469         ,34.045              ,0.954 
0    ,1    ,1120 ,127      ,1024 ,23   ,37.041         ,37.475              ,0.988 
0    ,1    ,1168 ,127      ,1024 ,0    ,32.342         ,34.027              ,0.95  
0    ,1    ,1168 ,127      ,1024 ,23   ,41.858         ,42.644              ,0.982 
0    ,1    ,12   ,127      ,11   ,0    ,3.723          ,3.729               ,0.998 
0    ,1    ,12   ,127      ,11   ,23   ,3.729          ,3.714               ,1.004 
0    ,1    ,1216 ,127      ,1024 ,0    ,32.631         ,34.11               ,0.957 
0    ,1    ,1216 ,127      ,1024 ,23   ,39.96          ,40.846              ,0.978 
0    ,1    ,1264 ,127      ,1024 ,0    ,32.411         ,33.803              ,0.959 
0    ,1    ,1264 ,127      ,1024 ,23   ,40.231         ,41.592              ,0.967 
0    ,1    ,128  ,127      ,0    ,0    ,3.492          ,3.474               ,1.005 
0    ,1    ,128  ,127      ,0    ,23   ,9.893          ,11.676              ,0.847 
0    ,1    ,128  ,127      ,112  ,0    ,5.956          ,6.956               ,0.856 
0    ,1    ,128  ,127      ,112  ,23   ,10.68          ,12.006              ,0.89  
0    ,1    ,128  ,127      ,128  ,0    ,9.81           ,10.775              ,0.91  
0    ,1    ,128  ,127      ,128  ,23   ,8.669          ,9.809               ,0.884 
0    ,1    ,128  ,127      ,144  ,0    ,10.021         ,10.81               ,0.927 
0    ,1    ,128  ,127      ,144  ,23   ,9.02           ,10.357              ,0.871 
0    ,1    ,128  ,127      ,192  ,0    ,9.76           ,10.873              ,0.898 
0    ,1    ,128  ,127      ,192  ,23   ,8.848          ,10.049              ,0.88  
0    ,1    ,128  ,127      ,240  ,0    ,9.979          ,10.941              ,0.912 
0    ,1    ,128  ,127      ,240  ,23   ,8.658          ,10.027              ,0.863 
0    ,1    ,128  ,127      ,288  ,0    ,10.064         ,11.29               ,0.891 
0    ,1    ,128  ,127      ,288  ,23   ,8.632          ,9.855               ,0.876 
0    ,1    ,128  ,127      ,32   ,0    ,4.695          ,5.663               ,0.829 
0    ,1    ,128  ,127      ,32   ,23   ,9.803          ,11.638              ,0.842 
0    ,1    ,128  ,127      ,48   ,0    ,4.642          ,5.951               ,0.78  
0    ,1    ,128  ,127      ,48   ,23   ,9.872          ,11.125              ,0.887 
0    ,1    ,128  ,127      ,80   ,0    ,5.074          ,6.474               ,0.784 
0    ,1    ,128  ,127      ,80   ,23   ,9.745          ,11.426              ,0.853 
0    ,1    ,128  ,127      ,96   ,0    ,6.501          ,7.481               ,0.869 
0    ,1    ,128  ,127      ,96   ,23   ,8.717          ,10.366              ,0.841 
0    ,1    ,13   ,127      ,12   ,0    ,3.737          ,3.739               ,1.0   
0    ,1    ,13   ,127      ,12   ,23   ,3.715          ,3.723               ,0.998 
0    ,1    ,1312 ,127      ,1024 ,0    ,32.427         ,34.083              ,0.951 
0    ,1    ,1312 ,127      ,1024 ,23   ,43.614         ,44.317              ,0.984 
0    ,1    ,14   ,127      ,13   ,0    ,3.739          ,3.748               ,0.997 
0    ,1    ,14   ,127      ,13   ,23   ,3.721          ,3.723               ,0.999 
0    ,1    ,144  ,127      ,128  ,0    ,10.539         ,10.65               ,0.99  
0    ,1    ,144  ,127      ,128  ,23   ,9.751          ,11.0                ,0.886 
0    ,1    ,15   ,127      ,14   ,0    ,3.727          ,3.723               ,1.001 
0    ,1    ,15   ,127      ,14   ,23   ,3.746          ,3.761               ,0.996 
0    ,1    ,16   ,127      ,0    ,0    ,3.416          ,3.418               ,0.999 
0    ,1    ,16   ,127      ,0    ,23   ,3.436          ,3.446               ,0.997 
0    ,1    ,16   ,127      ,144  ,0    ,3.619          ,3.628               ,0.997 
0    ,1    ,16   ,127      ,144  ,23   ,4.019          ,4.016               ,1.001 
0    ,1    ,16   ,127      ,15   ,0    ,3.727          ,3.728               ,1.0   
0    ,1    ,16   ,127      ,15   ,23   ,3.733          ,3.737               ,0.999 
0    ,1    ,16   ,127      ,16   ,0    ,3.42           ,3.416               ,1.001 
0    ,1    ,16   ,127      ,16   ,23   ,3.974          ,3.956               ,1.005 
0    ,1    ,16   ,127      ,192  ,0    ,3.592          ,3.595               ,0.999 
0    ,1    ,16   ,127      ,192  ,23   ,3.981          ,3.992               ,0.997 
0    ,1    ,16   ,127      ,240  ,0    ,3.562          ,3.58                ,0.995 
0    ,1    ,16   ,127      ,240  ,23   ,3.973          ,3.91                ,1.016 
0    ,1    ,16   ,127      ,256  ,0    ,3.529          ,3.523               ,1.002 
0    ,1    ,16   ,127      ,256  ,23   ,4.114          ,4.057               ,1.014 
0    ,1    ,16   ,127      ,288  ,0    ,3.634          ,3.672               ,0.99  
0    ,1    ,16   ,127      ,288  ,23   ,4.061          ,4.511               ,0.9   
0    ,1    ,16   ,127      ,48   ,0    ,3.539          ,3.546               ,0.998 
0    ,1    ,16   ,127      ,48   ,23   ,4.021          ,4.002               ,1.005 
0    ,1    ,16   ,127      ,64   ,0    ,3.514          ,3.512               ,1.001 
0    ,1    ,16   ,127      ,64   ,23   ,3.973          ,3.968               ,1.001 
0    ,1    ,16   ,127      ,96   ,0    ,3.523          ,3.528               ,0.998 
0    ,1    ,16   ,127      ,96   ,23   ,4.019          ,4.016               ,1.001 
0    ,1    ,160  ,127      ,144  ,0    ,9.98           ,10.889              ,0.917 
0    ,1    ,160  ,127      ,144  ,23   ,9.568          ,11.164              ,0.857 
0    ,1    ,160  ,127      ,16   ,0    ,3.6            ,3.606               ,0.998 
0    ,1    ,160  ,127      ,16   ,23   ,12.466         ,13.468              ,0.926 
0    ,1    ,160  ,127      ,256  ,0    ,9.782          ,10.864              ,0.9   
0    ,1    ,160  ,127      ,256  ,23   ,8.74           ,10.167              ,0.86  
0    ,1    ,160  ,127      ,64   ,0    ,5.308          ,6.514               ,0.815 
0    ,1    ,160  ,127      ,64   ,23   ,10.168         ,11.726              ,0.867 
0    ,1    ,160  ,127      ,96   ,0    ,6.254          ,7.071               ,0.885 
0    ,1    ,160  ,127      ,96   ,23   ,8.791          ,10.18               ,0.864 
0    ,1    ,17   ,127      ,16   ,0    ,3.755          ,3.747               ,1.002 
0    ,1    ,17   ,127      ,16   ,23   ,3.749          ,3.742               ,1.002 
0    ,1    ,176  ,127      ,128  ,0    ,10.071         ,10.816              ,0.931 
0    ,1    ,176  ,127      ,128  ,23   ,9.842          ,10.987              ,0.896 
0    ,1    ,176  ,127      ,160  ,0    ,9.943          ,10.917              ,0.911 
0    ,1    ,176  ,127      ,160  ,23   ,9.401          ,10.855              ,0.866 
0    ,1    ,176  ,127      ,32   ,0    ,4.799          ,5.122               ,0.937 
0    ,1    ,176  ,127      ,32   ,23   ,9.912          ,11.536              ,0.859 
0    ,1    ,1760 ,127      ,2048 ,0    ,72.585         ,67.02               ,1.083 
0    ,1    ,1760 ,127      ,2048 ,23   ,72.354         ,70.53               ,1.026 
0    ,1    ,1760 ,127      ,288  ,0    ,13.012         ,14.359              ,0.906 
0    ,1    ,1760 ,127      ,288  ,23   ,53.579         ,75.066              ,0.714 
0    ,1    ,18   ,127      ,17   ,0    ,3.723          ,3.734               ,0.997 
0    ,1    ,18   ,127      ,17   ,23   ,3.719          ,3.723               ,0.999 
0    ,1    ,1808 ,127      ,2048 ,0    ,74.738         ,72.933              ,1.025 
0    ,1    ,1808 ,127      ,2048 ,23   ,73.708         ,69.817              ,1.056 
0    ,1    ,1808 ,127      ,240  ,0    ,11.466         ,12.936              ,0.886 
0    ,1    ,1808 ,127      ,240  ,23   ,52.31          ,74.244              ,0.705 
0    ,1    ,1856 ,127      ,192  ,0    ,11.493         ,12.777              ,0.9   
0    ,1    ,1856 ,127      ,192  ,23   ,56.637         ,75.554              ,0.75  
0    ,1    ,1856 ,127      ,2048 ,0    ,76.752         ,75.005              ,1.023 
0    ,1    ,1856 ,127      ,2048 ,23   ,75.763         ,75.647              ,1.002 
0    ,1    ,19   ,127      ,18   ,0    ,3.733          ,3.737               ,0.999 
0    ,1    ,19   ,127      ,18   ,23   ,3.709          ,3.721               ,0.997 
0    ,1    ,1904 ,127      ,144  ,0    ,9.908          ,10.968              ,0.903 
0    ,1    ,1904 ,127      ,144  ,23   ,54.074         ,73.718              ,0.734 
0    ,1    ,1904 ,127      ,2048 ,0    ,76.284         ,74.221              ,1.028 
0    ,1    ,1904 ,127      ,2048 ,23   ,76.677         ,75.992              ,1.009 
0    ,1    ,192  ,127      ,176  ,0    ,9.472          ,10.994              ,0.862 
0    ,1    ,192  ,127      ,176  ,23   ,11.123         ,12.382              ,0.898 
0    ,1    ,1952 ,127      ,2048 ,0    ,78.545         ,76.143              ,1.032 
0    ,1    ,1952 ,127      ,2048 ,23   ,77.067         ,79.001              ,0.976 
0    ,1    ,1952 ,127      ,96   ,0    ,5.921          ,7.117               ,0.832 
0    ,1    ,1952 ,127      ,96   ,23   ,78.832         ,77.729              ,1.014 
0    ,1    ,2    ,127      ,1    ,0    ,3.567          ,3.564               ,1.001 
0    ,1    ,2    ,127      ,1    ,23   ,3.474          ,3.479               ,0.999 
0    ,1    ,20   ,127      ,19   ,0    ,3.735          ,3.735               ,1.0   
0    ,1    ,20   ,127      ,19   ,23   ,3.724          ,3.732               ,0.998 
0    ,1    ,2000 ,127      ,2048 ,0    ,79.702         ,75.814              ,1.051 
0    ,1    ,2000 ,127      ,2048 ,23   ,82.662         ,80.546              ,1.026 
0    ,1    ,2000 ,127      ,48   ,0    ,4.641          ,5.922               ,0.784 
0    ,1    ,2000 ,127      ,48   ,23   ,78.879         ,80.783              ,0.976 
0    ,1    ,2048 ,127      ,0    ,0    ,3.782          ,3.626               ,1.043 
0    ,1    ,2048 ,127      ,0    ,23   ,88.244         ,81.961              ,1.077 
0    ,1    ,2048 ,127      ,1024 ,0    ,32.647         ,34.351              ,0.95  
0    ,1    ,2048 ,127      ,1024 ,23   ,63.059         ,63.135              ,0.999 
0    ,1    ,2048 ,127      ,128  ,0    ,10.048         ,10.918              ,0.92  
0    ,1    ,2048 ,127      ,128  ,23   ,59.186         ,76.42               ,0.774 
0    ,1    ,2048 ,127      ,144  ,0    ,9.887          ,10.926              ,0.905 
0    ,1    ,2048 ,127      ,144  ,23   ,58.821         ,78.417              ,0.75  
0    ,1    ,2048 ,127      ,1760 ,0    ,72.29          ,65.89               ,1.097 
0    ,1    ,2048 ,127      ,1760 ,23   ,67.317         ,70.046              ,0.961 
0    ,1    ,2048 ,127      ,1808 ,0    ,74.84          ,72.362              ,1.034 
0    ,1    ,2048 ,127      ,1808 ,23   ,66.716         ,67.123              ,0.994 
0    ,1    ,2048 ,127      ,1856 ,0    ,76.885         ,73.468              ,1.047 
0    ,1    ,2048 ,127      ,1856 ,23   ,65.625         ,65.874              ,0.996 
0    ,1    ,2048 ,127      ,1904 ,0    ,76.289         ,73.484              ,1.038 
0    ,1    ,2048 ,127      ,1904 ,23   ,68.284         ,72.223              ,0.945 
0    ,1    ,2048 ,127      ,192  ,0    ,11.595         ,12.834              ,0.903 
0    ,1    ,2048 ,127      ,192  ,23   ,83.689         ,78.987              ,1.06  
0    ,1    ,2048 ,127      ,1952 ,0    ,77.785         ,75.969              ,1.024 
0    ,1    ,2048 ,127      ,1952 ,23   ,66.755         ,70.554              ,0.946 
0    ,1    ,2048 ,127      ,2000 ,0    ,78.418         ,75.695              ,1.036 
0    ,1    ,2048 ,127      ,2000 ,23   ,71.465         ,70.069              ,1.02  
0    ,1    ,2048 ,127      ,2048 ,0    ,77.747         ,79.151              ,0.982 
0    ,1    ,2048 ,127      ,2048 ,23   ,79.178         ,75.595              ,1.047 
0    ,1    ,2048 ,127      ,240  ,0    ,11.527         ,12.989              ,0.887 
0    ,1    ,2048 ,127      ,240  ,23   ,69.117         ,80.272              ,0.861 
0    ,1    ,2048 ,127      ,256  ,0    ,13.007         ,14.606              ,0.891 
0    ,1    ,2048 ,127      ,256  ,23   ,83.298         ,73.27               ,1.137 
0    ,1    ,2048 ,127      ,288  ,0    ,12.933         ,14.597              ,0.886 
0    ,1    ,2048 ,127      ,288  ,23   ,82.111         ,83.34               ,0.985 
0    ,1    ,2048 ,127      ,32   ,0    ,4.737          ,4.775               ,0.992 
0    ,1    ,2048 ,127      ,32   ,23   ,78.505         ,82.154              ,0.956 
0    ,1    ,2048 ,127      ,4096 ,0    ,81.364         ,81.725              ,0.996 
0    ,1    ,2048 ,127      ,4096 ,23   ,80.334         ,74.208              ,1.083 
0    ,1    ,2048 ,127      ,48   ,0    ,4.651          ,5.776               ,0.805 
0    ,1    ,2048 ,127      ,48   ,23   ,85.128         ,83.639              ,1.018 
0    ,1    ,2048 ,127      ,512  ,0    ,19.777         ,20.435              ,0.968 
0    ,1    ,2048 ,127      ,512  ,23   ,72.706         ,76.171              ,0.955 
0    ,1    ,2048 ,127      ,64   ,0    ,7.221          ,8.125               ,0.889 
0    ,1    ,2048 ,127      ,64   ,23   ,78.534         ,81.715              ,0.961 
0    ,1    ,2048 ,127      ,96   ,0    ,6.214          ,7.207               ,0.862 
0    ,1    ,2048 ,127      ,96   ,23   ,82.681         ,82.298              ,1.005 
0    ,1    ,208  ,127      ,16   ,0    ,3.591          ,3.628               ,0.99  
0    ,1    ,208  ,127      ,16   ,23   ,11.938         ,13.415              ,0.89  
0    ,1    ,208  ,127      ,192  ,0    ,11.149         ,12.75               ,0.874 
0    ,1    ,208  ,127      ,192  ,23   ,11.712         ,13.415              ,0.873 
0    ,1    ,208  ,127      ,256  ,0    ,11.173         ,12.8                ,0.873 
0    ,1    ,208  ,127      ,256  ,23   ,10.529         ,12.765              ,0.825 
0    ,1    ,208  ,127      ,48   ,0    ,4.648          ,5.711               ,0.814 
0    ,1    ,208  ,127      ,48   ,23   ,11.612         ,13.052              ,0.89  
0    ,1    ,208  ,127      ,64   ,0    ,5.235          ,6.257               ,0.837 
0    ,1    ,208  ,127      ,64   ,23   ,11.809         ,13.386              ,0.882 
0    ,1    ,2096 ,127      ,2048 ,0    ,80.173         ,79.245              ,1.012 
0    ,1    ,2096 ,127      ,2048 ,23   ,71.71          ,74.296              ,0.965 
0    ,1    ,21   ,127      ,20   ,0    ,3.705          ,3.715               ,0.997 
0    ,1    ,21   ,127      ,20   ,23   ,3.699          ,3.703               ,0.999 
0    ,1    ,2144 ,127      ,2048 ,0    ,81.623         ,79.436              ,1.028 
0    ,1    ,2144 ,127      ,2048 ,23   ,69.342         ,69.534              ,0.997 
0    ,1    ,2192 ,127      ,2048 ,0    ,79.678         ,81.904              ,0.973 
0    ,1    ,2192 ,127      ,2048 ,23   ,70.794         ,72.831              ,0.972 
0    ,1    ,22   ,127      ,21   ,0    ,3.69           ,3.695               ,0.999 
0    ,1    ,22   ,127      ,21   ,23   ,3.691          ,3.7                 ,0.997 
0    ,1    ,224  ,127      ,128  ,0    ,10.675         ,10.772              ,0.991 
0    ,1    ,224  ,127      ,128  ,23   ,12.065         ,12.436              ,0.97  
0    ,1    ,224  ,127      ,208  ,0    ,11.13          ,12.897              ,0.863 
0    ,1    ,224  ,127      ,208  ,23   ,11.075         ,12.8                ,0.865 
0    ,1    ,224  ,127      ,288  ,0    ,11.031         ,12.636              ,0.873 
0    ,1    ,224  ,127      ,288  ,23   ,10.654         ,12.748              ,0.836 
0    ,1    ,224  ,127      ,32   ,0    ,4.819          ,4.812               ,1.001 
0    ,1    ,224  ,127      ,32   ,23   ,11.908         ,13.53               ,0.88  
0    ,1    ,224  ,127      ,512  ,0    ,11.124         ,12.704              ,0.876 
0    ,1    ,224  ,127      ,512  ,23   ,10.477         ,12.634              ,0.829 
0    ,1    ,2240 ,127      ,2048 ,0    ,80.648         ,76.87               ,1.049 
0    ,1    ,2240 ,127      ,2048 ,23   ,71.957         ,72.525              ,0.992 
0    ,1    ,2288 ,127      ,2048 ,0    ,80.434         ,76.598              ,1.05  
0    ,1    ,2288 ,127      ,2048 ,23   ,72.331         ,73.156              ,0.989 
0    ,1    ,23   ,127      ,22   ,0    ,3.704          ,3.709               ,0.999 
0    ,1    ,23   ,127      ,22   ,23   ,3.733          ,3.727               ,1.002 
0    ,1    ,2336 ,127      ,2048 ,0    ,79.963         ,74.387              ,1.075 
0    ,1    ,2336 ,127      ,2048 ,23   ,72.296         ,72.702              ,0.994 
0    ,1    ,24   ,127      ,23   ,0    ,3.71           ,3.719               ,0.998 
0    ,1    ,24   ,127      ,23   ,23   ,3.704          ,3.759               ,0.985 
0    ,1    ,240  ,127      ,224  ,0    ,11.049         ,12.695              ,0.87  
0    ,1    ,240  ,127      ,224  ,23   ,11.624         ,13.28               ,0.875 
0    ,1    ,25   ,127      ,24   ,0    ,3.733          ,3.749               ,0.996 
0    ,1    ,25   ,127      ,24   ,23   ,3.698          ,3.695               ,1.001 
0    ,1    ,256  ,127      ,0    ,0    ,3.37           ,3.366               ,1.001 
0    ,1    ,256  ,127      ,0    ,23   ,13.22          ,14.761              ,0.896 
0    ,1    ,256  ,127      ,112  ,0    ,6.14           ,7.417               ,0.828 
0    ,1    ,256  ,127      ,112  ,23   ,11.638         ,13.641              ,0.853 
0    ,1    ,256  ,127      ,144  ,0    ,9.767          ,10.759              ,0.908 
0    ,1    ,256  ,127      ,144  ,23   ,13.663         ,14.306              ,0.955 
0    ,1    ,256  ,127      ,16   ,0    ,3.506          ,3.647               ,0.961 
0    ,1    ,256  ,127      ,16   ,23   ,14.096         ,16.019              ,0.88  
0    ,1    ,256  ,127      ,160  ,0    ,9.716          ,10.87               ,0.894 
0    ,1    ,256  ,127      ,160  ,23   ,12.678         ,14.429              ,0.879 
0    ,1    ,256  ,127      ,192  ,0    ,11.156         ,12.571              ,0.887 
0    ,1    ,256  ,127      ,192  ,23   ,13.887         ,14.705              ,0.944 
0    ,1    ,256  ,127      ,208  ,0    ,11.074         ,12.458              ,0.889 
0    ,1    ,256  ,127      ,208  ,23   ,13.711         ,14.331              ,0.957 
0    ,1    ,256  ,127      ,240  ,0    ,11.224         ,12.857              ,0.873 
0    ,1    ,256  ,127      ,240  ,23   ,13.125         ,14.22               ,0.923 
0    ,1    ,256  ,127      ,256  ,0    ,12.597         ,14.283              ,0.882 
0    ,1    ,256  ,127      ,256  ,23   ,12.154         ,14.09               ,0.863 
0    ,1    ,256  ,127      ,288  ,0    ,12.885         ,14.712              ,0.876 
0    ,1    ,256  ,127      ,288  ,23   ,12.466         ,14.646              ,0.851 
0    ,1    ,256  ,127      ,48   ,0    ,4.674          ,5.7                 ,0.82  
0    ,1    ,256  ,127      ,48   ,23   ,13.01          ,14.715              ,0.884 
0    ,1    ,256  ,127      ,64   ,0    ,5.267          ,6.387               ,0.825 
0    ,1    ,256  ,127      ,64   ,23   ,13.105         ,15.062              ,0.87  
0    ,1    ,256  ,127      ,96   ,0    ,6.543          ,6.909               ,0.947 
0    ,1    ,256  ,127      ,96   ,23   ,11.623         ,13.606              ,0.854 
0    ,1    ,26   ,127      ,25   ,0    ,3.7            ,3.709               ,0.998 
0    ,1    ,26   ,127      ,25   ,23   ,3.766          ,4.352               ,0.865 
0    ,1    ,27   ,127      ,26   ,0    ,3.681          ,3.947               ,0.933 
0    ,1    ,27   ,127      ,26   ,23   ,3.668          ,4.398               ,0.834 
0    ,1    ,272  ,127      ,128  ,0    ,10.053         ,10.934              ,0.919 
0    ,1    ,272  ,127      ,128  ,23   ,13.747         ,14.33               ,0.959 
0    ,1    ,272  ,127      ,240  ,0    ,11.105         ,12.569              ,0.884 
0    ,1    ,272  ,127      ,240  ,23   ,13.305         ,14.284              ,0.931 
0    ,1    ,272  ,127      ,256  ,0    ,12.583         ,14.344              ,0.877 
0    ,1    ,272  ,127      ,256  ,23   ,12.561         ,14.402              ,0.872 
0    ,1    ,272  ,127      ,32   ,0    ,4.822          ,4.83                ,0.998 
0    ,1    ,272  ,127      ,32   ,23   ,13.325         ,14.978              ,0.89  
0    ,1    ,272  ,127      ,512  ,0    ,12.503         ,14.272              ,0.876 
0    ,1    ,272  ,127      ,512  ,23   ,12.508         ,14.244              ,0.878 
0    ,1    ,28   ,127      ,27   ,0    ,3.672          ,4.37                ,0.84  
0    ,1    ,28   ,127      ,27   ,23   ,3.659          ,4.465               ,0.82  
0    ,1    ,288  ,127      ,272  ,0    ,12.534         ,14.043              ,0.893 
0    ,1    ,288  ,127      ,272  ,23   ,13.204         ,15.618              ,0.845 
0    ,1    ,29   ,127      ,28   ,0    ,3.635          ,4.457               ,0.816 
0    ,1    ,29   ,127      ,28   ,23   ,3.67           ,4.827               ,0.76  
0    ,1    ,3    ,127      ,2    ,0    ,3.584          ,3.592               ,0.998 
0    ,1    ,3    ,127      ,2    ,23   ,3.577          ,3.569               ,1.002 
0    ,1    ,30   ,127      ,29   ,0    ,3.596          ,4.365               ,0.824 
0    ,1    ,30   ,127      ,29   ,23   ,3.62           ,4.661               ,0.777 
0    ,1    ,304  ,127      ,16   ,0    ,3.653          ,3.666               ,0.996 
0    ,1    ,304  ,127      ,16   ,23   ,13.304         ,15.457              ,0.861 
0    ,1    ,304  ,127      ,256  ,0    ,12.587         ,14.097              ,0.893 
0    ,1    ,304  ,127      ,256  ,23   ,12.562         ,14.481              ,0.867 
0    ,1    ,304  ,127      ,64   ,0    ,5.279          ,6.555               ,0.805 
0    ,1    ,304  ,127      ,64   ,23   ,13.121         ,14.861              ,0.883 
0    ,1    ,31   ,127      ,30   ,0    ,3.586          ,4.467               ,0.803 
0    ,1    ,31   ,127      ,30   ,23   ,3.568          ,4.776               ,0.747 
0    ,1    ,32   ,127      ,0    ,0    ,3.645          ,3.648               ,0.999 
0    ,1    ,32   ,127      ,0    ,23   ,4.841          ,5.816               ,0.832 
0    ,1    ,32   ,127      ,128  ,0    ,4.627          ,5.595               ,0.827 
0    ,1    ,32   ,127      ,128  ,23   ,4.649          ,5.333               ,0.872 
0    ,1    ,32   ,127      ,144  ,0    ,4.776          ,5.245               ,0.911 
0    ,1    ,32   ,127      ,144  ,23   ,4.805          ,5.254               ,0.915 
0    ,1    ,32   ,127      ,16   ,0    ,3.53           ,3.534               ,0.999 
0    ,1    ,32   ,127      ,16   ,23   ,4.931          ,5.879               ,0.839 
0    ,1    ,32   ,127      ,192  ,0    ,4.807          ,5.028               ,0.956 
0    ,1    ,32   ,127      ,192  ,23   ,4.66           ,5.322               ,0.876 
0    ,1    ,32   ,127      ,240  ,0    ,4.811          ,4.83                ,0.996 
0    ,1    ,32   ,127      ,240  ,23   ,4.627          ,5.454               ,0.848 
0    ,1    ,32   ,127      ,288  ,0    ,4.822          ,4.831               ,0.998 
0    ,1    ,32   ,127      ,288  ,23   ,4.566          ,4.729               ,0.966 
0    ,1    ,32   ,127      ,31   ,0    ,3.564          ,4.746               ,0.751 
0    ,1    ,32   ,127      ,31   ,23   ,4.92           ,6.186               ,0.795 
0    ,1    ,32   ,127      ,32   ,0    ,4.869          ,5.902               ,0.825 
0    ,1    ,32   ,127      ,32   ,23   ,4.656          ,5.193               ,0.897 
0    ,1    ,32   ,127      ,48   ,0    ,4.859          ,6.062               ,0.802 
0    ,1    ,32   ,127      ,48   ,23   ,4.69           ,5.338               ,0.879 
0    ,1    ,32   ,127      ,96   ,0    ,4.681          ,5.639               ,0.83  
0    ,1    ,32   ,127      ,96   ,23   ,4.766          ,5.533               ,0.861 
0    ,1    ,320  ,127      ,128  ,0    ,9.939          ,10.727              ,0.927 
0    ,1    ,320  ,127      ,128  ,23   ,15.298         ,15.876              ,0.964 
0    ,1    ,320  ,127      ,192  ,0    ,11.04          ,12.492              ,0.884 
0    ,1    ,320  ,127      ,192  ,23   ,16.42          ,17.013              ,0.965 
0    ,1    ,320  ,127      ,32   ,0    ,4.835          ,4.834               ,1.0   
0    ,1    ,320  ,127      ,32   ,23   ,14.782         ,16.742              ,0.883 
0    ,1    ,320  ,127      ,512  ,0    ,14.196         ,15.898              ,0.893 
0    ,1    ,320  ,127      ,512  ,23   ,14.053         ,15.998              ,0.878 
0    ,1    ,352  ,127      ,256  ,0    ,12.7           ,14.27               ,0.89  
0    ,1    ,352  ,127      ,256  ,23   ,15.426         ,15.609              ,0.988 
0    ,1    ,352  ,127      ,64   ,0    ,5.422          ,6.416               ,0.845 
0    ,1    ,352  ,127      ,64   ,23   ,14.687         ,16.523              ,0.889 
0    ,1    ,368  ,127      ,128  ,0    ,9.913          ,10.831              ,0.915 
0    ,1    ,368  ,127      ,128  ,23   ,15.186         ,15.821              ,0.96  
0    ,1    ,368  ,127      ,144  ,0    ,9.51           ,11.059              ,0.86  
0    ,1    ,368  ,127      ,144  ,23   ,15.085         ,15.558              ,0.97  
0    ,1    ,368  ,127      ,512  ,0    ,14.234         ,15.896              ,0.895 
0    ,1    ,368  ,127      ,512  ,23   ,13.945         ,15.618              ,0.893 
0    ,1    ,4    ,127      ,3    ,0    ,3.632          ,3.617               ,1.004 
0    ,1    ,4    ,127      ,3    ,23   ,3.637          ,3.643               ,0.999 
0    ,1    ,400  ,127      ,256  ,0    ,12.559         ,14.284              ,0.879 
0    ,1    ,400  ,127      ,256  ,23   ,16.876         ,17.411              ,0.969 
0    ,1    ,416  ,127      ,128  ,0    ,9.837          ,10.848              ,0.907 
0    ,1    ,416  ,127      ,128  ,23   ,16.675         ,17.163              ,0.972 
0    ,1    ,416  ,127      ,512  ,0    ,16.474         ,17.564              ,0.938 
0    ,1    ,416  ,127      ,512  ,23   ,15.701         ,17.825              ,0.881 
0    ,1    ,416  ,127      ,96   ,0    ,5.938          ,7.292               ,0.814 
0    ,1    ,416  ,127      ,96   ,23   ,15.155         ,17.112              ,0.886 
0    ,1    ,448  ,127      ,256  ,0    ,12.521         ,14.474              ,0.865 
0    ,1    ,448  ,127      ,256  ,23   ,18.264         ,19.268              ,0.948 
0    ,1    ,464  ,127      ,48   ,0    ,4.874          ,5.896               ,0.827 
0    ,1    ,464  ,127      ,48   ,23   ,17.857         ,19.505              ,0.916 
0    ,1    ,464  ,127      ,512  ,0    ,17.821         ,19.223              ,0.927 
0    ,1    ,464  ,127      ,512  ,23   ,17.551         ,19.683              ,0.892 
0    ,1    ,48   ,127      ,32   ,0    ,4.791          ,5.543               ,0.864 
0    ,1    ,48   ,127      ,32   ,23   ,4.744          ,5.872               ,0.808 
0    ,1    ,496  ,127      ,256  ,0    ,12.58          ,14.438              ,0.871 
0    ,1    ,496  ,127      ,256  ,23   ,18.256         ,19.055              ,0.958 
0    ,1    ,5    ,127      ,4    ,0    ,3.653          ,3.663               ,0.997 
0    ,1    ,5    ,127      ,4    ,23   ,3.622          ,3.608               ,1.004 
0    ,1    ,512  ,127      ,0    ,0    ,3.573          ,3.4                 ,1.051 
0    ,1    ,512  ,127      ,0    ,23   ,20.054         ,21.502              ,0.933 
0    ,1    ,512  ,127      ,144  ,0    ,9.761          ,10.997              ,0.888 
0    ,1    ,512  ,127      ,144  ,23   ,19.22          ,20.417              ,0.941 
0    ,1    ,512  ,127      ,192  ,0    ,11.159         ,12.818              ,0.871 
0    ,1    ,512  ,127      ,192  ,23   ,19.607         ,20.825              ,0.941 
0    ,1    ,512  ,127      ,224  ,0    ,11.047         ,12.752              ,0.866 
0    ,1    ,512  ,127      ,224  ,23   ,19.094         ,20.966              ,0.911 
0    ,1    ,512  ,127      ,240  ,0    ,11.033         ,12.452              ,0.886 
0    ,1    ,512  ,127      ,240  ,23   ,19.086         ,20.511              ,0.931 
0    ,1    ,512  ,127      ,272  ,0    ,12.532         ,13.969              ,0.897 
0    ,1    ,512  ,127      ,272  ,23   ,19.826         ,20.773              ,0.954 
0    ,1    ,512  ,127      ,288  ,0    ,12.496         ,14.261              ,0.876 
0    ,1    ,512  ,127      ,288  ,23   ,19.445         ,20.762              ,0.937 
0    ,1    ,512  ,127      ,320  ,0    ,14.078         ,16.046              ,0.877 
0    ,1    ,512  ,127      ,320  ,23   ,20.954         ,21.579              ,0.971 
0    ,1    ,512  ,127      ,368  ,0    ,14.253         ,15.703              ,0.908 
0    ,1    ,512  ,127      ,368  ,23   ,19.604         ,20.677              ,0.948 
0    ,1    ,512  ,127      ,416  ,0    ,16.554         ,17.606              ,0.94  
0    ,1    ,512  ,127      ,416  ,23   ,19.012         ,21.311              ,0.892 
0    ,1    ,512  ,127      ,464  ,0    ,17.839         ,19.299              ,0.924 
0    ,1    ,512  ,127      ,464  ,23   ,22.135         ,22.763              ,0.972 
0    ,1    ,512  ,127      ,48   ,0    ,4.661          ,6.264               ,0.744 
0    ,1    ,512  ,127      ,48   ,23   ,19.491         ,21.405              ,0.911 
0    ,1    ,512  ,127      ,512  ,0    ,19.234         ,20.714              ,0.929 
0    ,1    ,512  ,127      ,512  ,23   ,19.288         ,20.435              ,0.944 
0    ,1    ,512  ,127      ,96   ,0    ,5.952          ,6.848               ,0.869 
0    ,1    ,512  ,127      ,96   ,23   ,18.592         ,20.281              ,0.917 
0    ,1    ,544  ,127      ,256  ,0    ,12.612         ,14.768              ,0.854 
0    ,1    ,544  ,127      ,256  ,23   ,20.733         ,21.594              ,0.96  
0    ,1    ,560  ,127      ,512  ,0    ,19.59          ,20.681              ,0.947 
0    ,1    ,560  ,127      ,512  ,23   ,19.847         ,21.351              ,0.93  
0    ,1    ,6    ,127      ,5    ,0    ,3.671          ,3.658               ,1.003 
0    ,1    ,6    ,127      ,5    ,23   ,3.663          ,3.665               ,1.0   
0    ,1    ,608  ,127      ,512  ,0    ,19.259         ,20.855              ,0.923 
0    ,1    ,608  ,127      ,512  ,23   ,21.831         ,22.177              ,0.984 
0    ,1    ,64   ,127      ,0    ,0    ,3.591          ,3.584               ,1.002 
0    ,1    ,64   ,127      ,0    ,23   ,6.11           ,7.48                ,0.817 
0    ,1    ,64   ,127      ,144  ,0    ,5.299          ,6.624               ,0.8   
0    ,1    ,64   ,127      ,144  ,23   ,6.103          ,7.954               ,0.767 
0    ,1    ,64   ,127      ,16   ,0    ,3.547          ,3.54                ,1.002 
0    ,1    ,64   ,127      ,16   ,23   ,6.239          ,7.433               ,0.839 
0    ,1    ,64   ,127      ,192  ,0    ,5.228          ,6.494               ,0.805 
0    ,1    ,64   ,127      ,192  ,23   ,6.089          ,7.405               ,0.822 
0    ,1    ,64   ,127      ,240  ,0    ,5.337          ,6.325               ,0.844 
0    ,1    ,64   ,127      ,240  ,23   ,5.933          ,7.424               ,0.799 
0    ,1    ,64   ,127      ,256  ,0    ,5.153          ,6.461               ,0.798 
0    ,1    ,64   ,127      ,256  ,23   ,6.065          ,7.338               ,0.827 
0    ,1    ,64   ,127      ,288  ,0    ,5.255          ,6.329               ,0.83  
0    ,1    ,64   ,127      ,288  ,23   ,6.204          ,7.661               ,0.81  
0    ,1    ,64   ,127      ,48   ,0    ,4.77           ,6.012               ,0.793 
0    ,1    ,64   ,127      ,48   ,23   ,6.232          ,6.901               ,0.903 
0    ,1    ,64   ,127      ,64   ,0    ,5.194          ,6.384               ,0.814 
0    ,1    ,64   ,127      ,64   ,23   ,6.168          ,7.698               ,0.801 
0    ,1    ,64   ,127      ,96   ,0    ,5.355          ,6.648               ,0.805 
0    ,1    ,64   ,127      ,96   ,23   ,6.225          ,7.239               ,0.86  
0    ,1    ,656  ,127      ,512  ,0    ,19.237         ,20.849              ,0.923 
0    ,1    ,656  ,127      ,512  ,23   ,25.157         ,25.417              ,0.99  
0    ,1    ,7    ,127      ,6    ,0    ,3.682          ,3.676               ,1.002 
0    ,1    ,7    ,127      ,6    ,23   ,3.681          ,3.686               ,0.999 
0    ,1    ,704  ,127      ,512  ,0    ,19.416         ,20.861              ,0.931 
0    ,1    ,704  ,127      ,512  ,23   ,26.922         ,27.455              ,0.981 
0    ,1    ,736  ,127      ,1024 ,0    ,23.724         ,25.647              ,0.925 
0    ,1    ,736  ,127      ,1024 ,23   ,24.28          ,25.158              ,0.965 
0    ,1    ,736  ,127      ,288  ,0    ,12.557         ,14.263              ,0.88  
0    ,1    ,736  ,127      ,288  ,23   ,24.1           ,25.359              ,0.95  
0    ,1    ,752  ,127      ,512  ,0    ,19.288         ,20.77               ,0.929 
0    ,1    ,752  ,127      ,512  ,23   ,24.721         ,25.492              ,0.97  
0    ,1    ,784  ,127      ,1024 ,0    ,25.562         ,27.595              ,0.926 
0    ,1    ,784  ,127      ,1024 ,23   ,25.672         ,26.564              ,0.966 
0    ,1    ,784  ,127      ,240  ,0    ,11.077         ,12.808              ,0.865 
0    ,1    ,784  ,127      ,240  ,23   ,25.631         ,27.289              ,0.939 
0    ,1    ,8    ,127      ,7    ,0    ,3.672          ,3.684               ,0.997 
0    ,1    ,8    ,127      ,7    ,23   ,3.673          ,3.677               ,0.999 
0    ,1    ,80   ,127      ,128  ,0    ,5.302          ,6.554               ,0.809 
0    ,1    ,80   ,127      ,128  ,23   ,6.077          ,7.276               ,0.835 
0    ,1    ,80   ,127      ,32   ,0    ,4.847          ,5.65                ,0.858 
0    ,1    ,80   ,127      ,32   ,23   ,5.625          ,7.083               ,0.794 
0    ,1    ,80   ,127      ,48   ,0    ,4.631          ,6.016               ,0.77  
0    ,1    ,80   ,127      ,48   ,23   ,5.831          ,7.136               ,0.817 
0    ,1    ,80   ,127      ,64   ,0    ,5.316          ,6.333               ,0.84  
0    ,1    ,80   ,127      ,64   ,23   ,5.267          ,6.441               ,0.818 
0    ,1    ,800  ,127      ,512  ,0    ,19.236         ,20.914              ,0.92  
0    ,1    ,800  ,127      ,512  ,23   ,27.817         ,28.042              ,0.992 
0    ,1    ,832  ,127      ,1024 ,0    ,27.381         ,28.889              ,0.948 
0    ,1    ,832  ,127      ,1024 ,23   ,27.413         ,28.736              ,0.954 
0    ,1    ,832  ,127      ,192  ,0    ,11.178         ,12.692              ,0.881 
0    ,1    ,832  ,127      ,192  ,23   ,27.812         ,28.661              ,0.97  
0    ,1    ,880  ,127      ,1024 ,0    ,27.289         ,29.033              ,0.94  
0    ,1    ,880  ,127      ,1024 ,23   ,27.349         ,28.569              ,0.957 
0    ,1    ,880  ,127      ,144  ,0    ,9.651          ,10.911              ,0.885 
0    ,1    ,880  ,127      ,144  ,23   ,27.463         ,28.561              ,0.962 
0    ,1    ,9    ,127      ,8    ,0    ,3.673          ,3.681               ,0.998 
0    ,1    ,9    ,127      ,8    ,23   ,3.677          ,3.681               ,0.999 
0    ,1    ,928  ,127      ,1024 ,0    ,29.345         ,30.681              ,0.956 
0    ,1    ,928  ,127      ,1024 ,23   ,28.9           ,30.428              ,0.95  
0    ,1    ,928  ,127      ,96   ,0    ,5.816          ,7.183               ,0.81  
0    ,1    ,928  ,127      ,96   ,23   ,28.379         ,30.276              ,0.937 
0    ,1    ,96   ,127      ,80   ,0    ,5.241          ,6.312               ,0.83  
0    ,1    ,96   ,127      ,80   ,23   ,6.822          ,8.131               ,0.839 
0    ,1    ,976  ,127      ,1024 ,0    ,30.829         ,32.366              ,0.953 
0    ,1    ,976  ,127      ,1024 ,23   ,30.546         ,32.039              ,0.953 
0    ,1    ,976  ,127      ,48   ,0    ,4.598          ,5.709               ,0.805 
0    ,1    ,976  ,127      ,48   ,23   ,31.183         ,32.569              ,0.957 
0    ,16   ,1    ,127      ,0    ,23   ,3.423          ,3.476               ,0.985 
0    ,16   ,10   ,127      ,9    ,23   ,3.672          ,3.68                ,0.998 
0    ,16   ,1024 ,127      ,0    ,23   ,35.156         ,35.625              ,0.987 
0    ,16   ,1024 ,127      ,1024 ,23   ,46.115         ,51.936              ,0.888 
0    ,16   ,1024 ,127      ,144  ,23   ,32.346         ,33.068              ,0.978 
0    ,16   ,1024 ,127      ,192  ,23   ,33.999         ,34.024              ,0.999 
0    ,16   ,1024 ,127      ,240  ,23   ,33.276         ,33.903              ,0.981 
0    ,16   ,1024 ,127      ,288  ,23   ,34.194         ,35.141              ,0.973 
0    ,16   ,1024 ,127      ,48   ,23   ,33.991         ,35.283              ,0.963 
0    ,16   ,1024 ,127      ,736  ,23   ,39.45          ,43.531              ,0.906 
0    ,16   ,1024 ,127      ,784  ,23   ,41.73          ,44.826              ,0.931 
0    ,16   ,1024 ,127      ,832  ,23   ,42.524         ,46.58               ,0.913 
0    ,16   ,1024 ,127      ,880  ,23   ,41.334         ,46.709              ,0.885 
0    ,16   ,1024 ,127      ,928  ,23   ,43.223         ,48.508              ,0.891 
0    ,16   ,1024 ,127      ,96   ,23   ,31.878         ,32.982              ,0.967 
0    ,16   ,1024 ,127      ,976  ,23   ,45.181         ,50.271              ,0.899 
0    ,16   ,1072 ,127      ,1024 ,23   ,44.743         ,52.252              ,0.856 
0    ,16   ,11   ,127      ,10   ,23   ,3.676          ,3.681               ,0.999 
0    ,16   ,112  ,127      ,144  ,23   ,6.145          ,7.762               ,0.792 
0    ,16   ,112  ,127      ,16   ,23   ,7.397          ,8.975               ,0.824 
0    ,16   ,112  ,127      ,256  ,23   ,6.223          ,7.516               ,0.828 
0    ,16   ,112  ,127      ,64   ,23   ,6.857          ,8.175               ,0.839 
0    ,16   ,112  ,127      ,96   ,23   ,6.036          ,8.011               ,0.753 
0    ,16   ,1120 ,127      ,1024 ,23   ,47.607         ,53.044              ,0.898 
0    ,16   ,1168 ,127      ,1024 ,23   ,49.148         ,54.622              ,0.9   
0    ,16   ,12   ,127      ,11   ,23   ,3.724          ,3.714               ,1.003 
0    ,16   ,1216 ,127      ,1024 ,23   ,50.295         ,56.053              ,0.897 
0    ,16   ,1264 ,127      ,1024 ,23   ,50.214         ,56.184              ,0.894 
0    ,16   ,128  ,127      ,0    ,23   ,10.092         ,11.285              ,0.894 
0    ,16   ,128  ,127      ,112  ,23   ,10.241         ,12.219              ,0.838 
0    ,16   ,128  ,127      ,128  ,23   ,8.506          ,9.882               ,0.861 
0    ,16   ,128  ,127      ,144  ,23   ,8.673          ,10.425              ,0.832 
0    ,16   ,128  ,127      ,192  ,23   ,8.597          ,10.005              ,0.859 
0    ,16   ,128  ,127      ,240  ,23   ,8.82           ,9.903               ,0.891 
0    ,16   ,128  ,127      ,288  ,23   ,9.115          ,9.831               ,0.927 
0    ,16   ,128  ,127      ,32   ,23   ,9.89           ,11.123              ,0.889 
0    ,16   ,128  ,127      ,48   ,23   ,9.811          ,10.943              ,0.897 
0    ,16   ,128  ,127      ,80   ,23   ,9.694          ,11.279              ,0.859 
0    ,16   ,128  ,127      ,96   ,23   ,8.558          ,10.178              ,0.841 
0    ,16   ,13   ,127      ,12   ,23   ,3.717          ,3.723               ,0.998 
0    ,16   ,1312 ,127      ,1024 ,23   ,51.791         ,56.858              ,0.911 
0    ,16   ,14   ,127      ,13   ,23   ,3.727          ,3.742               ,0.996 
0    ,16   ,144  ,127      ,128  ,23   ,9.626          ,11.354              ,0.848 
0    ,16   ,15   ,127      ,14   ,23   ,3.757          ,3.761               ,0.999 
0    ,16   ,16   ,127      ,0    ,23   ,3.44           ,3.463               ,0.994 
0    ,16   ,16   ,127      ,144  ,23   ,3.494          ,3.485               ,1.002 
0    ,16   ,16   ,127      ,15   ,23   ,3.759          ,3.737               ,1.006 
0    ,16   ,16   ,127      ,16   ,23   ,3.443          ,3.436               ,1.002 
0    ,16   ,16   ,127      ,192  ,23   ,3.441          ,3.446               ,0.998 
0    ,16   ,16   ,127      ,240  ,23   ,3.507          ,3.437               ,1.02  
0    ,16   ,16   ,127      ,256  ,23   ,4.063          ,4.208               ,0.965 
0    ,16   ,16   ,127      ,288  ,23   ,3.942          ,5.13                ,0.768 
0    ,16   ,16   ,127      ,48   ,23   ,3.506          ,3.49                ,1.004 
0    ,16   ,16   ,127      ,64   ,23   ,3.463          ,3.45                ,1.004 
0    ,16   ,16   ,127      ,96   ,23   ,3.556          ,3.59                ,0.991 
0    ,16   ,160  ,127      ,144  ,23   ,9.543          ,10.925              ,0.873 
0    ,16   ,160  ,127      ,16   ,23   ,11.937         ,13.496              ,0.884 
0    ,16   ,160  ,127      ,256  ,23   ,9.598          ,10.946              ,0.877 
0    ,16   ,160  ,127      ,64   ,23   ,9.934          ,11.743              ,0.846 
0    ,16   ,160  ,127      ,96   ,23   ,8.443          ,10.136              ,0.833 
0    ,16   ,17   ,127      ,16   ,23   ,3.727          ,3.732               ,0.999 
0    ,16   ,176  ,127      ,128  ,23   ,9.582          ,11.262              ,0.851 
0    ,16   ,176  ,127      ,160  ,23   ,9.554          ,11.055              ,0.864 
0    ,16   ,176  ,127      ,32   ,23   ,9.86           ,11.081              ,0.89  
0    ,16   ,1760 ,127      ,2048 ,23   ,63.084         ,66.584              ,0.947 
0    ,16   ,1760 ,127      ,288  ,23   ,53.245         ,69.121              ,0.77  
0    ,16   ,18   ,127      ,17   ,23   ,3.742          ,3.752               ,0.997 
0    ,16   ,1808 ,127      ,2048 ,23   ,68.785         ,84.159              ,0.817 
0    ,16   ,1808 ,127      ,240  ,23   ,54.581         ,72.741              ,0.75  
0    ,16   ,1856 ,127      ,192  ,23   ,56.278         ,75.587              ,0.745 
0    ,16   ,1856 ,127      ,2048 ,23   ,73.053         ,80.743              ,0.905 
0    ,16   ,19   ,127      ,18   ,23   ,3.737          ,3.733               ,1.001 
0    ,16   ,1904 ,127      ,144  ,23   ,54.328         ,73.507              ,0.739 
0    ,16   ,1904 ,127      ,2048 ,23   ,74.909         ,92.431              ,0.81  
0    ,16   ,192  ,127      ,176  ,23   ,11.059         ,12.261              ,0.902 
0    ,16   ,1952 ,127      ,2048 ,23   ,78.504         ,90.135              ,0.871 
0    ,16   ,1952 ,127      ,96   ,23   ,79.378         ,74.225              ,1.069 
0    ,16   ,2    ,127      ,1    ,23   ,3.537          ,3.558               ,0.994 
0    ,16   ,20   ,127      ,19   ,23   ,3.709          ,3.708               ,1.0   
0    ,16   ,2000 ,127      ,2048 ,23   ,99.918         ,98.989              ,1.009 
0    ,16   ,2000 ,127      ,48   ,23   ,76.753         ,80.542              ,0.953 
0    ,16   ,2048 ,127      ,0    ,23   ,87.485         ,82.133              ,1.065 
0    ,16   ,2048 ,127      ,1024 ,23   ,72.783         ,76.22               ,0.955 
0    ,16   ,2048 ,127      ,128  ,23   ,59.247         ,78.764              ,0.752 
0    ,16   ,2048 ,127      ,144  ,23   ,59.192         ,79.441              ,0.745 
0    ,16   ,2048 ,127      ,1760 ,23   ,73.833         ,77.765              ,0.949 
0    ,16   ,2048 ,127      ,1808 ,23   ,74.339         ,77.085              ,0.964 
0    ,16   ,2048 ,127      ,1856 ,23   ,77.191         ,81.074              ,0.952 
0    ,16   ,2048 ,127      ,1904 ,23   ,76.824         ,82.066              ,0.936 
0    ,16   ,2048 ,127      ,192  ,23   ,86.203         ,80.613              ,1.069 
0    ,16   ,2048 ,127      ,1952 ,23   ,77.058         ,82.31               ,0.936 
0    ,16   ,2048 ,127      ,2000 ,23   ,77.78          ,81.103              ,0.959 
0    ,16   ,2048 ,127      ,2048 ,23   ,89.228         ,98.592              ,0.905 
0    ,16   ,2048 ,127      ,240  ,23   ,89.15          ,80.75               ,1.104 
0    ,16   ,2048 ,127      ,256  ,23   ,76.621         ,80.8                ,0.948 
0    ,16   ,2048 ,127      ,288  ,23   ,76.627         ,81.19               ,0.944 
0    ,16   ,2048 ,127      ,32   ,23   ,83.681         ,83.687              ,1.0   
0    ,16   ,2048 ,127      ,4096 ,23   ,66.343         ,66.725              ,0.994 
0    ,16   ,2048 ,127      ,48   ,23   ,85.866         ,82.182              ,1.045 
0    ,16   ,2048 ,127      ,512  ,23   ,66.063         ,86.839              ,0.761 
0    ,16   ,2048 ,127      ,64   ,23   ,81.378         ,82.892              ,0.982 
0    ,16   ,2048 ,127      ,96   ,23   ,83.878         ,82.449              ,1.017 
0    ,16   ,208  ,127      ,16   ,23   ,13.435         ,15.442              ,0.87  
0    ,16   ,208  ,127      ,192  ,23   ,11.717         ,13.484              ,0.869 
0    ,16   ,208  ,127      ,256  ,23   ,11.884         ,13.591              ,0.874 
0    ,16   ,208  ,127      ,48   ,23   ,11.524         ,12.662              ,0.91  
0    ,16   ,208  ,127      ,64   ,23   ,11.472         ,13.313              ,0.862 
0    ,16   ,2096 ,127      ,2048 ,23   ,93.442         ,101.974             ,0.916 
0    ,16   ,21   ,127      ,20   ,23   ,3.709          ,3.718               ,0.998 
0    ,16   ,2144 ,127      ,2048 ,23   ,80.888         ,84.367              ,0.959 
0    ,16   ,2192 ,127      ,2048 ,23   ,82.046         ,86.499              ,0.949 
0    ,16   ,22   ,127      ,21   ,23   ,3.733          ,3.746               ,0.997 
0    ,16   ,224  ,127      ,128  ,23   ,12.25          ,12.453              ,0.984 
0    ,16   ,224  ,127      ,208  ,23   ,11.703         ,13.484              ,0.868 
0    ,16   ,224  ,127      ,288  ,23   ,11.943         ,13.419              ,0.89  
0    ,16   ,224  ,127      ,32   ,23   ,11.735         ,13.127              ,0.894 
0    ,16   ,224  ,127      ,512  ,23   ,11.823         ,13.665              ,0.865 
0    ,16   ,2240 ,127      ,2048 ,23   ,82.929         ,86.557              ,0.958 
0    ,16   ,2288 ,127      ,2048 ,23   ,84.119         ,88.351              ,0.952 
0    ,16   ,23   ,127      ,22   ,23   ,3.697          ,3.701               ,0.999 
0    ,16   ,2336 ,127      ,2048 ,23   ,86.203         ,91.177              ,0.945 
0    ,16   ,24   ,127      ,23   ,23   ,3.708          ,3.727               ,0.995 
0    ,16   ,240  ,127      ,224  ,23   ,11.697         ,13.366              ,0.875 
0    ,16   ,25   ,127      ,24   ,23   ,3.686          ,3.691               ,0.998 
0    ,16   ,256  ,127      ,0    ,23   ,13.148         ,14.87               ,0.884 
0    ,16   ,256  ,127      ,112  ,23   ,11.602         ,13.654              ,0.85  
0    ,16   ,256  ,127      ,144  ,23   ,13.744         ,14.255              ,0.964 
0    ,16   ,256  ,127      ,16   ,23   ,14.073         ,15.819              ,0.89  
0    ,16   ,256  ,127      ,160  ,23   ,13.033         ,13.983              ,0.932 
0    ,16   ,256  ,127      ,192  ,23   ,15.173         ,15.751              ,0.963 
0    ,16   ,256  ,127      ,208  ,23   ,15.162         ,15.174              ,0.999 
0    ,16   ,256  ,127      ,240  ,23   ,14.087         ,15.432              ,0.913 
0    ,16   ,256  ,127      ,256  ,23   ,13.781         ,14.937              ,0.923 
0    ,16   ,256  ,127      ,288  ,23   ,14.053         ,15.58               ,0.902 
0    ,16   ,256  ,127      ,48   ,23   ,12.989         ,14.678              ,0.885 
0    ,16   ,256  ,127      ,64   ,23   ,13.104         ,14.815              ,0.885 
0    ,16   ,256  ,127      ,96   ,23   ,12.296         ,13.633              ,0.902 
0    ,16   ,26   ,127      ,25   ,23   ,3.694          ,4.014               ,0.92  
0    ,16   ,27   ,127      ,26   ,23   ,3.667          ,4.435               ,0.827 
0    ,16   ,272  ,127      ,128  ,23   ,14.104         ,14.31               ,0.986 
0    ,16   ,272  ,127      ,240  ,23   ,13.943         ,15.081              ,0.925 
0    ,16   ,272  ,127      ,256  ,23   ,14.874         ,16.778              ,0.886 
0    ,16   ,272  ,127      ,32   ,23   ,13.353         ,14.858              ,0.899 
0    ,16   ,272  ,127      ,512  ,23   ,15.361         ,16.809              ,0.914 
0    ,16   ,28   ,127      ,27   ,23   ,3.61           ,4.249               ,0.85  
0    ,16   ,288  ,127      ,272  ,23   ,14.759         ,16.717              ,0.883 
0    ,16   ,29   ,127      ,28   ,23   ,3.586          ,4.717               ,0.76  
0    ,16   ,3    ,127      ,2    ,23   ,3.55           ,3.552               ,1.0   
0    ,16   ,30   ,127      ,29   ,23   ,3.556          ,4.517               ,0.787 
0    ,16   ,304  ,127      ,16   ,23   ,13.226         ,15.267              ,0.866 
0    ,16   ,304  ,127      ,256  ,23   ,14.781         ,16.331              ,0.905 
0    ,16   ,304  ,127      ,64   ,23   ,13.231         ,14.884              ,0.889 
0    ,16   ,31   ,127      ,30   ,23   ,3.557          ,4.682               ,0.76  
0    ,16   ,32   ,127      ,0    ,23   ,4.746          ,5.742               ,0.827 
0    ,16   ,32   ,127      ,128  ,23   ,4.737          ,5.81                ,0.815 
0    ,16   ,32   ,127      ,144  ,23   ,4.794          ,5.435               ,0.882 
0    ,16   ,32   ,127      ,16   ,23   ,5.018          ,5.978               ,0.839 
0    ,16   ,32   ,127      ,192  ,23   ,4.794          ,5.239               ,0.915 
0    ,16   ,32   ,127      ,240  ,23   ,4.836          ,5.23                ,0.925 
0    ,16   ,32   ,127      ,288  ,23   ,4.991          ,5.088               ,0.981 
0    ,16   ,32   ,127      ,31   ,23   ,4.977          ,6.171               ,0.807 
0    ,16   ,32   ,127      ,32   ,23   ,4.722          ,5.156               ,0.916 
0    ,16   ,32   ,127      ,48   ,23   ,4.839          ,5.381               ,0.899 
0    ,16   ,32   ,127      ,96   ,23   ,4.767          ,5.31                ,0.898 
0    ,16   ,320  ,127      ,128  ,23   ,15.782         ,15.903              ,0.992 
0    ,16   ,320  ,127      ,192  ,23   ,16.323         ,16.866              ,0.968 
0    ,16   ,320  ,127      ,32   ,23   ,14.879         ,16.277              ,0.914 
0    ,16   ,320  ,127      ,512  ,23   ,16.451         ,17.92               ,0.918 
0    ,16   ,352  ,127      ,256  ,23   ,17.511         ,17.819              ,0.983 
0    ,16   ,352  ,127      ,64   ,23   ,14.815         ,16.341              ,0.907 
0    ,16   ,368  ,127      ,128  ,23   ,15.423         ,15.674              ,0.984 
0    ,16   ,368  ,127      ,144  ,23   ,15.187         ,15.589              ,0.974 
0    ,16   ,368  ,127      ,512  ,23   ,17.211         ,18.847              ,0.913 
0    ,16   ,4    ,127      ,3    ,23   ,3.626          ,3.626               ,1.0   
0    ,16   ,400  ,127      ,256  ,23   ,18.667         ,19.108              ,0.977 
0    ,16   ,416  ,127      ,128  ,23   ,17.039         ,17.143              ,0.994 
0    ,16   ,416  ,127      ,512  ,23   ,19.609         ,22.131              ,0.886 
0    ,16   ,416  ,127      ,96   ,23   ,15.149         ,16.801              ,0.902 
0    ,16   ,448  ,127      ,256  ,23   ,20.406         ,20.88               ,0.977 
0    ,16   ,464  ,127      ,48   ,23   ,17.838         ,19.562              ,0.912 
0    ,16   ,464  ,127      ,512  ,23   ,21.966         ,24.893              ,0.882 
0    ,16   ,48   ,127      ,32   ,23   ,4.756          ,5.976               ,0.796 
0    ,16   ,496  ,127      ,256  ,23   ,20.015         ,20.996              ,0.953 
0    ,16   ,5    ,127      ,4    ,23   ,3.64           ,3.645               ,0.999 
0    ,16   ,512  ,127      ,0    ,23   ,20.138         ,21.422              ,0.94  
0    ,16   ,512  ,127      ,144  ,23   ,19.299         ,20.491              ,0.942 
0    ,16   ,512  ,127      ,192  ,23   ,20.641         ,21.228              ,0.972 
0    ,16   ,512  ,127      ,224  ,23   ,19.919         ,21.418              ,0.93  
0    ,16   ,512  ,127      ,240  ,23   ,19.886         ,21.227              ,0.937 
0    ,16   ,512  ,127      ,272  ,23   ,21.673         ,22.245              ,0.974 
0    ,16   ,512  ,127      ,288  ,23   ,20.776         ,22.48               ,0.924 
0    ,16   ,512  ,127      ,320  ,23   ,22.71          ,23.222              ,0.978 
0    ,16   ,512  ,127      ,368  ,23   ,21.441         ,23.129              ,0.927 
0    ,16   ,512  ,127      ,416  ,23   ,22.708         ,24.682              ,0.92  
0    ,16   ,512  ,127      ,464  ,23   ,24.446         ,26.164              ,0.934 
0    ,16   ,512  ,127      ,48   ,23   ,19.545         ,21.413              ,0.913 
0    ,16   ,512  ,127      ,512  ,23   ,23.411         ,25.834              ,0.906 
0    ,16   ,512  ,127      ,96   ,23   ,18.371         ,20.211              ,0.909 
0    ,16   ,544  ,127      ,256  ,23   ,21.697         ,22.356              ,0.971 
0    ,16   ,560  ,127      ,512  ,23   ,23.957         ,27.865              ,0.86  
0    ,16   ,6    ,127      ,5    ,23   ,3.632          ,3.631               ,1.0   
0    ,16   ,608  ,127      ,512  ,23   ,27.142         ,28.995              ,0.936 
0    ,16   ,64   ,127      ,0    ,23   ,5.987          ,7.428               ,0.806 
0    ,16   ,64   ,127      ,144  ,23   ,5.619          ,7.135               ,0.787 
0    ,16   ,64   ,127      ,16   ,23   ,6.311          ,7.747               ,0.815 
0    ,16   ,64   ,127      ,192  ,23   ,5.832          ,6.837               ,0.853 
0    ,16   ,64   ,127      ,240  ,23   ,5.56           ,6.734               ,0.826 
0    ,16   ,64   ,127      ,256  ,23   ,5.577          ,7.227               ,0.772 
0    ,16   ,64   ,127      ,288  ,23   ,5.588          ,6.94                ,0.805 
0    ,16   ,64   ,127      ,48   ,23   ,6.014          ,6.969               ,0.863 
0    ,16   ,64   ,127      ,64   ,23   ,5.462          ,6.745               ,0.81  
0    ,16   ,64   ,127      ,96   ,23   ,5.712          ,6.962               ,0.82  
0    ,16   ,656  ,127      ,512  ,23   ,28.971         ,30.462              ,0.951 
0    ,16   ,7    ,127      ,6    ,23   ,3.667          ,3.676               ,0.998 
0    ,16   ,704  ,127      ,512  ,23   ,30.073         ,32.169              ,0.935 
0    ,16   ,736  ,127      ,1024 ,23   ,32.012         ,37.302              ,0.858 
0    ,16   ,736  ,127      ,288  ,23   ,25.764         ,26.994              ,0.954 
0    ,16   ,752  ,127      ,512  ,23   ,29.962         ,31.965              ,0.937 
0    ,16   ,784  ,127      ,1024 ,23   ,35.242         ,39.835              ,0.885 
0    ,16   ,784  ,127      ,240  ,23   ,26.927         ,27.763              ,0.97  
0    ,16   ,8    ,127      ,7    ,23   ,3.673          ,3.675               ,0.999 
0    ,16   ,80   ,127      ,128  ,23   ,5.101          ,6.469               ,0.789 
0    ,16   ,80   ,127      ,32   ,23   ,5.631          ,6.966               ,0.808 
0    ,16   ,80   ,127      ,48   ,23   ,5.592          ,7.347               ,0.761 
0    ,16   ,80   ,127      ,64   ,23   ,5.51           ,6.519               ,0.845 
0    ,16   ,800  ,127      ,512  ,23   ,31.451         ,33.341              ,0.943 
0    ,16   ,832  ,127      ,1024 ,23   ,37.751         ,41.145              ,0.918 
0    ,16   ,832  ,127      ,192  ,23   ,29.179         ,29.61               ,0.985 
0    ,16   ,880  ,127      ,1024 ,23   ,36.699         ,43.216              ,0.849 
0    ,16   ,880  ,127      ,144  ,23   ,27.501         ,28.428              ,0.967 
0    ,16   ,9    ,127      ,8    ,23   ,3.697          ,3.69                ,1.002 
0    ,16   ,928  ,127      ,1024 ,23   ,39.492         ,45.945              ,0.86  
0    ,16   ,928  ,127      ,96   ,23   ,28.492         ,29.903              ,0.953 
0    ,16   ,96   ,127      ,80   ,23   ,6.925          ,8.89                ,0.779 
0    ,16   ,976  ,127      ,1024 ,23   ,42.324         ,49.155              ,0.861 
0    ,16   ,976  ,127      ,48   ,23   ,30.692         ,33.073              ,0.928 
0    ,256  ,1    ,127      ,0    ,23   ,3.411          ,3.412               ,1.0   
0    ,256  ,10   ,127      ,9    ,23   ,3.686          ,3.684               ,1.0   
0    ,256  ,1024 ,127      ,0    ,23   ,34.709         ,35.556              ,0.976 
0    ,256  ,1024 ,127      ,1024 ,23   ,45.274         ,52.188              ,0.868 
0    ,256  ,1024 ,127      ,144  ,23   ,32.07          ,33.091              ,0.969 
0    ,256  ,1024 ,127      ,192  ,23   ,33.703         ,33.832              ,0.996 
0    ,256  ,1024 ,127      ,240  ,23   ,33.109         ,33.946              ,0.975 
0    ,256  ,1024 ,127      ,288  ,23   ,34.084         ,34.92               ,0.976 
0    ,256  ,1024 ,127      ,48   ,23   ,34.475         ,35.329              ,0.976 
0    ,256  ,1024 ,127      ,736  ,23   ,39.61          ,43.488              ,0.911 
0    ,256  ,1024 ,127      ,784  ,23   ,41.782         ,45.546              ,0.917 
0    ,256  ,1024 ,127      ,832  ,23   ,43.057         ,47.044              ,0.915 
0    ,256  ,1024 ,127      ,880  ,23   ,41.487         ,46.535              ,0.892 
0    ,256  ,1024 ,127      ,928  ,23   ,43.168         ,48.525              ,0.89  
0    ,256  ,1024 ,127      ,96   ,23   ,33.652         ,34.474              ,0.976 
0    ,256  ,1024 ,127      ,976  ,23   ,46.616         ,52.153              ,0.894 
0    ,256  ,1072 ,127      ,1024 ,23   ,44.604         ,52.102              ,0.856 
0    ,256  ,11   ,127      ,10   ,23   ,3.676          ,3.693               ,0.995 
0    ,256  ,112  ,127      ,144  ,23   ,5.759          ,8.036               ,0.717 
0    ,256  ,112  ,127      ,16   ,23   ,7.152          ,8.592               ,0.832 
0    ,256  ,112  ,127      ,256  ,23   ,6.546          ,7.122               ,0.919 
0    ,256  ,112  ,127      ,64   ,23   ,7.357          ,8.239               ,0.893 
0    ,256  ,112  ,127      ,96   ,23   ,6.03           ,7.705               ,0.783 
0    ,256  ,1120 ,127      ,1024 ,23   ,48.567         ,55.012              ,0.883 
0    ,256  ,1168 ,127      ,1024 ,23   ,50.174         ,56.468              ,0.889 
0    ,256  ,12   ,127      ,11   ,23   ,3.694          ,3.7                 ,0.999 
0    ,256  ,1216 ,127      ,1024 ,23   ,52.104         ,62.223              ,0.837 
0    ,256  ,1264 ,127      ,1024 ,23   ,51.973         ,61.665              ,0.843 
0    ,256  ,128  ,127      ,0    ,23   ,10.291         ,11.387              ,0.904 
0    ,256  ,128  ,127      ,112  ,23   ,10.013         ,12.404              ,0.807 
0    ,256  ,128  ,127      ,128  ,23   ,8.508          ,9.901               ,0.859 
0    ,256  ,128  ,127      ,144  ,23   ,8.673          ,10.408              ,0.833 
0    ,256  ,128  ,127      ,192  ,23   ,9.455          ,10.24               ,0.923 
0    ,256  ,128  ,127      ,240  ,23   ,9.477          ,9.951               ,0.952 
0    ,256  ,128  ,127      ,288  ,23   ,8.708          ,10.013              ,0.87  
0    ,256  ,128  ,127      ,32   ,23   ,10.207         ,11.374              ,0.897 
0    ,256  ,128  ,127      ,48   ,23   ,9.979          ,11.239              ,0.888 
0    ,256  ,128  ,127      ,80   ,23   ,9.717          ,11.026              ,0.881 
0    ,256  ,128  ,127      ,96   ,23   ,8.574          ,10.069              ,0.852 
0    ,256  ,13   ,127      ,12   ,23   ,3.728          ,3.724               ,1.001 
0    ,256  ,1312 ,127      ,1024 ,23   ,53.612         ,58.805              ,0.912 
0    ,256  ,14   ,127      ,13   ,23   ,3.758          ,3.727               ,1.008 
0    ,256  ,144  ,127      ,128  ,23   ,9.776          ,11.126              ,0.879 
0    ,256  ,15   ,127      ,14   ,23   ,3.741          ,3.747               ,0.999 
0    ,256  ,16   ,127      ,0    ,23   ,3.42           ,3.427               ,0.998 
0    ,256  ,16   ,127      ,144  ,23   ,3.522          ,3.526               ,0.999 
0    ,256  ,16   ,127      ,15   ,23   ,3.727          ,3.723               ,1.001 
0    ,256  ,16   ,127      ,16   ,23   ,3.44           ,3.436               ,1.001 
0    ,256  ,16   ,127      ,192  ,23   ,3.456          ,3.459               ,0.999 
0    ,256  ,16   ,127      ,240  ,23   ,3.449          ,3.422               ,1.008 
0    ,256  ,16   ,127      ,256  ,23   ,3.465          ,3.458               ,1.002 
0    ,256  ,16   ,127      ,288  ,23   ,3.449          ,3.969               ,0.869 
0    ,256  ,16   ,127      ,48   ,23   ,3.512          ,3.505               ,1.002 
0    ,256  ,16   ,127      ,64   ,23   ,3.448          ,3.455               ,0.998 
0    ,256  ,16   ,127      ,96   ,23   ,3.583          ,3.552               ,1.009 
0    ,256  ,160  ,127      ,144  ,23   ,9.741          ,11.038              ,0.883 
0    ,256  ,160  ,127      ,16   ,23   ,12.077         ,13.522              ,0.893 
0    ,256  ,160  ,127      ,256  ,23   ,9.67           ,10.917              ,0.886 
0    ,256  ,160  ,127      ,64   ,23   ,10.052         ,12.105              ,0.83  
0    ,256  ,160  ,127      ,96   ,23   ,8.69           ,10.045              ,0.865 
0    ,256  ,17   ,127      ,16   ,23   ,3.741          ,3.747               ,0.998 
0    ,256  ,176  ,127      ,128  ,23   ,9.853          ,10.804              ,0.912 
0    ,256  ,176  ,127      ,160  ,23   ,9.652          ,10.972              ,0.88  
0    ,256  ,176  ,127      ,32   ,23   ,9.892          ,11.296              ,0.876 
0    ,256  ,1760 ,127      ,2048 ,23   ,96.6           ,108.661             ,0.889 
0    ,256  ,1760 ,127      ,288  ,23   ,53.499         ,70.529              ,0.759 
0    ,256  ,18   ,127      ,17   ,23   ,3.694          ,3.699               ,0.999 
0    ,256  ,1808 ,127      ,2048 ,23   ,99.662         ,111.351             ,0.895 
0    ,256  ,1808 ,127      ,240  ,23   ,55.199         ,73.631              ,0.75  
0    ,256  ,1856 ,127      ,192  ,23   ,56.473         ,75.74               ,0.746 
0    ,256  ,1856 ,127      ,2048 ,23   ,100.381        ,112.404             ,0.893 
0    ,256  ,19   ,127      ,18   ,23   ,3.725          ,3.73                ,0.999 
0    ,256  ,1904 ,127      ,144  ,23   ,54.26          ,72.948              ,0.744 
0    ,256  ,1904 ,127      ,2048 ,23   ,100.691        ,115.118             ,0.875 
0    ,256  ,192  ,127      ,176  ,23   ,11.119         ,12.39               ,0.897 
0    ,256  ,1952 ,127      ,2048 ,23   ,124.851        ,122.253             ,1.021 
0    ,256  ,1952 ,127      ,96   ,23   ,78.921         ,74.065              ,1.066 
0    ,256  ,2    ,127      ,1    ,23   ,3.481          ,3.487               ,0.998 
0    ,256  ,20   ,127      ,19   ,23   ,3.699          ,3.714               ,0.996 
0    ,256  ,2000 ,127      ,2048 ,23   ,106.015        ,121.298             ,0.874 
0    ,256  ,2000 ,127      ,48   ,23   ,77.55          ,80.627              ,0.962 
0    ,256  ,2048 ,127      ,0    ,23   ,89.12          ,83.645              ,1.065 
0    ,256  ,2048 ,127      ,1024 ,23   ,72.634         ,75.941              ,0.956 
0    ,256  ,2048 ,127      ,128  ,23   ,59.134         ,79.509              ,0.744 
0    ,256  ,2048 ,127      ,144  ,23   ,75.153         ,97.646              ,0.77  
0    ,256  ,2048 ,127      ,1760 ,23   ,102.92         ,118.472             ,0.869 
0    ,256  ,2048 ,127      ,1808 ,23   ,106.085        ,118.723             ,0.894 
0    ,256  ,2048 ,127      ,1856 ,23   ,105.164        ,120.566             ,0.872 
0    ,256  ,2048 ,127      ,1904 ,23   ,103.725        ,120.612             ,0.86  
0    ,256  ,2048 ,127      ,192  ,23   ,85.861         ,80.948              ,1.061 
0    ,256  ,2048 ,127      ,1952 ,23   ,108.137        ,121.018             ,0.894 
0    ,256  ,2048 ,127      ,2000 ,23   ,107.436        ,122.155             ,0.88  
0    ,256  ,2048 ,127      ,2048 ,23   ,105.609        ,121.073             ,0.872 
0    ,256  ,2048 ,127      ,240  ,23   ,86.38          ,81.444              ,1.061 
0    ,256  ,2048 ,127      ,256  ,23   ,77.811         ,81.742              ,0.952 
0    ,256  ,2048 ,127      ,288  ,23   ,82.038         ,82.261              ,0.997 
0    ,256  ,2048 ,127      ,32   ,23   ,83.405         ,82.95               ,1.005 
0    ,256  ,2048 ,127      ,4096 ,23   ,108.251        ,121.632             ,0.89  
0    ,256  ,2048 ,127      ,48   ,23   ,80.984         ,79.95               ,1.013 
0    ,256  ,2048 ,127      ,512  ,23   ,65.482         ,85.708              ,0.764 
0    ,256  ,2048 ,127      ,64   ,23   ,81.546         ,81.911              ,0.996 
0    ,256  ,2048 ,127      ,96   ,23   ,81.451         ,80.689              ,1.009 
0    ,256  ,208  ,127      ,16   ,23   ,13.609         ,15.418              ,0.883 
0    ,256  ,208  ,127      ,192  ,23   ,11.816         ,13.444              ,0.879 
0    ,256  ,208  ,127      ,256  ,23   ,11.905         ,13.285              ,0.896 
0    ,256  ,208  ,127      ,48   ,23   ,11.446         ,12.963              ,0.883 
0    ,256  ,208  ,127      ,64   ,23   ,11.48          ,13.384              ,0.858 
0    ,256  ,2096 ,127      ,2048 ,23   ,109.087        ,126.066             ,0.865 
0    ,256  ,21   ,127      ,20   ,23   ,3.704          ,3.707               ,0.999 
0    ,256  ,2144 ,127      ,2048 ,23   ,109.486        ,125.478             ,0.873 
0    ,256  ,2192 ,127      ,2048 ,23   ,135.983        ,130.385             ,1.043 
0    ,256  ,22   ,127      ,21   ,23   ,3.689          ,3.704               ,0.996 
0    ,256  ,224  ,127      ,128  ,23   ,12.611         ,12.678              ,0.995 
0    ,256  ,224  ,127      ,208  ,23   ,11.663         ,13.529              ,0.862 
0    ,256  ,224  ,127      ,288  ,23   ,11.764         ,13.531              ,0.869 
0    ,256  ,224  ,127      ,32   ,23   ,11.551         ,13.114              ,0.881 
0    ,256  ,224  ,127      ,512  ,23   ,12.103         ,13.777              ,0.879 
0    ,256  ,2240 ,127      ,2048 ,23   ,115.602        ,129.427             ,0.893 
0    ,256  ,2288 ,127      ,2048 ,23   ,113.611        ,131.4               ,0.865 
0    ,256  ,23   ,127      ,22   ,23   ,3.697          ,3.706               ,0.998 
0    ,256  ,2336 ,127      ,2048 ,23   ,116.71         ,131.11              ,0.89  
0    ,256  ,24   ,127      ,23   ,23   ,3.69           ,3.699               ,0.998 
0    ,256  ,240  ,127      ,224  ,23   ,11.676         ,13.415              ,0.87  
0    ,256  ,25   ,127      ,24   ,23   ,3.703          ,3.707               ,0.999 
0    ,256  ,256  ,127      ,0    ,23   ,13.206         ,14.905              ,0.886 
0    ,256  ,256  ,127      ,112  ,23   ,11.793         ,13.555              ,0.87  
0    ,256  ,256  ,127      ,144  ,23   ,14.229         ,14.538              ,0.979 
0    ,256  ,256  ,127      ,16   ,23   ,14.062         ,15.819              ,0.889 
0    ,256  ,256  ,127      ,160  ,23   ,13.551         ,13.99               ,0.969 
0    ,256  ,256  ,127      ,192  ,23   ,15.303         ,15.686              ,0.976 
0    ,256  ,256  ,127      ,208  ,23   ,15.876         ,15.401              ,1.031 
0    ,256  ,256  ,127      ,240  ,23   ,14.077         ,15.305              ,0.92  
0    ,256  ,256  ,127      ,256  ,23   ,13.849         ,15.122              ,0.916 
0    ,256  ,256  ,127      ,288  ,23   ,14.039         ,15.513              ,0.905 
0    ,256  ,256  ,127      ,48   ,23   ,13.063         ,15.006              ,0.87  
0    ,256  ,256  ,127      ,64   ,23   ,13.097         ,14.9                ,0.879 
0    ,256  ,256  ,127      ,96   ,23   ,12.041         ,13.591              ,0.886 
0    ,256  ,26   ,127      ,25   ,23   ,3.684          ,3.919               ,0.94  
0    ,256  ,27   ,127      ,26   ,23   ,3.687          ,4.32                ,0.854 
0    ,256  ,272  ,127      ,128  ,23   ,14.05          ,14.025              ,1.002 
0    ,256  ,272  ,127      ,240  ,23   ,13.934         ,15.159              ,0.919 
0    ,256  ,272  ,127      ,256  ,23   ,14.791         ,16.623              ,0.89  
0    ,256  ,272  ,127      ,32   ,23   ,13.133         ,14.673              ,0.895 
0    ,256  ,272  ,127      ,512  ,23   ,15.565         ,17.073              ,0.912 
0    ,256  ,28   ,127      ,27   ,23   ,3.635          ,4.447               ,0.817 
0    ,256  ,288  ,127      ,272  ,23   ,14.816         ,16.57               ,0.894 
0    ,256  ,29   ,127      ,28   ,23   ,3.599          ,4.758               ,0.756 
0    ,256  ,3    ,127      ,2    ,23   ,3.601          ,3.605               ,0.999 
0    ,256  ,30   ,127      ,29   ,23   ,3.555          ,4.478               ,0.794 
0    ,256  ,304  ,127      ,16   ,23   ,13.191         ,15.236              ,0.866 
0    ,256  ,304  ,127      ,256  ,23   ,14.764         ,16.62               ,0.888 
0    ,256  ,304  ,127      ,64   ,23   ,12.998         ,14.505              ,0.896 
0    ,256  ,31   ,127      ,30   ,23   ,3.566          ,4.646               ,0.767 
0    ,256  ,32   ,127      ,0    ,23   ,4.846          ,5.699               ,0.85  
0    ,256  ,32   ,127      ,128  ,23   ,4.771          ,5.539               ,0.861 
0    ,256  ,32   ,127      ,144  ,23   ,4.874          ,5.301               ,0.919 
0    ,256  ,32   ,127      ,16   ,23   ,4.943          ,6.15                ,0.804 
0    ,256  ,32   ,127      ,192  ,23   ,4.817          ,5.222               ,0.922 
0    ,256  ,32   ,127      ,240  ,23   ,4.759          ,5.217               ,0.912 
0    ,256  ,32   ,127      ,288  ,23   ,4.935          ,4.967               ,0.994 
0    ,256  ,32   ,127      ,31   ,23   ,4.958          ,6.072               ,0.817 
0    ,256  ,32   ,127      ,32   ,23   ,4.797          ,5.177               ,0.927 
0    ,256  ,32   ,127      ,48   ,23   ,4.848          ,5.526               ,0.877 
0    ,256  ,32   ,127      ,96   ,23   ,4.846          ,5.738               ,0.845 
0    ,256  ,320  ,127      ,128  ,23   ,17.561         ,17.135              ,1.025 
0    ,256  ,320  ,127      ,192  ,23   ,16.239         ,16.659              ,0.975 
0    ,256  ,320  ,127      ,32   ,23   ,16.288         ,18.004              ,0.905 
0    ,256  ,320  ,127      ,512  ,23   ,16.393         ,17.792              ,0.921 
0    ,256  ,352  ,127      ,256  ,23   ,17.41          ,17.634              ,0.987 
0    ,256  ,352  ,127      ,64   ,23   ,16.285         ,17.774              ,0.916 
0    ,256  ,368  ,127      ,128  ,23   ,16.755         ,16.958              ,0.988 
0    ,256  ,368  ,127      ,144  ,23   ,15.238         ,15.994              ,0.953 
0    ,256  ,368  ,127      ,512  ,23   ,17.195         ,18.66               ,0.921 
0    ,256  ,4    ,127      ,3    ,23   ,3.618          ,3.613               ,1.001 
0    ,256  ,400  ,127      ,256  ,23   ,20.116         ,20.573              ,0.978 
0    ,256  ,416  ,127      ,128  ,23   ,18.265         ,18.429              ,0.991 
0    ,256  ,416  ,127      ,512  ,23   ,19.667         ,22.126              ,0.889 
0    ,256  ,416  ,127      ,96   ,23   ,15.156         ,17.022              ,0.89  
0    ,256  ,448  ,127      ,256  ,23   ,21.112         ,22.16               ,0.953 
0    ,256  ,464  ,127      ,48   ,23   ,17.832         ,19.4                ,0.919 
0    ,256  ,464  ,127      ,512  ,23   ,21.967         ,24.716              ,0.889 
0    ,256  ,48   ,127      ,32   ,23   ,4.753          ,5.809               ,0.818 
0    ,256  ,496  ,127      ,256  ,23   ,21.503         ,22.245              ,0.967 
0    ,256  ,5    ,127      ,4    ,23   ,3.618          ,3.638               ,0.994 
0    ,256  ,512  ,127      ,0    ,23   ,21.293         ,22.854              ,0.932 
0    ,256  ,512  ,127      ,144  ,23   ,19.517         ,20.58               ,0.948 
0    ,256  ,512  ,127      ,192  ,23   ,20.597         ,21.183              ,0.972 
0    ,256  ,512  ,127      ,224  ,23   ,19.726         ,21.519              ,0.917 
0    ,256  ,512  ,127      ,240  ,23   ,19.967         ,21.324              ,0.936 
0    ,256  ,512  ,127      ,272  ,23   ,21.963         ,22.42               ,0.98  
0    ,256  ,512  ,127      ,288  ,23   ,21.693         ,22.394              ,0.969 
0    ,256  ,512  ,127      ,320  ,23   ,22.427         ,23.135              ,0.969 
0    ,256  ,512  ,127      ,368  ,23   ,21.574         ,23.369              ,0.923 
0    ,256  ,512  ,127      ,416  ,23   ,22.767         ,24.602              ,0.925 
0    ,256  ,512  ,127      ,464  ,23   ,25.806         ,27.773              ,0.929 
0    ,256  ,512  ,127      ,48   ,23   ,20.766         ,22.559              ,0.92  
0    ,256  ,512  ,127      ,512  ,23   ,24.747         ,27.864              ,0.888 
0    ,256  ,512  ,127      ,96   ,23   ,20.261         ,21.807              ,0.929 
0    ,256  ,544  ,127      ,256  ,23   ,22.772         ,23.891              ,0.953 
0    ,256  ,560  ,127      ,512  ,23   ,23.875         ,28.033              ,0.852 
0    ,256  ,6    ,127      ,5    ,23   ,3.635          ,3.652               ,0.995 
0    ,256  ,608  ,127      ,512  ,23   ,27.933         ,30.862              ,0.905 
0    ,256  ,64   ,127      ,0    ,23   ,6.107          ,7.522               ,0.812 
0    ,256  ,64   ,127      ,144  ,23   ,5.573          ,7.086               ,0.787 
0    ,256  ,64   ,127      ,16   ,23   ,6.229          ,7.81                ,0.798 
0    ,256  ,64   ,127      ,192  ,23   ,5.633          ,6.975               ,0.808 
0    ,256  ,64   ,127      ,240  ,23   ,5.521          ,6.79                ,0.813 
0    ,256  ,64   ,127      ,256  ,23   ,5.61           ,6.727               ,0.834 
0    ,256  ,64   ,127      ,288  ,23   ,5.601          ,7.077               ,0.791 
0    ,256  ,64   ,127      ,48   ,23   ,5.888          ,6.835               ,0.861 
0    ,256  ,64   ,127      ,64   ,23   ,5.486          ,6.603               ,0.831 
0    ,256  ,64   ,127      ,96   ,23   ,5.794          ,6.914               ,0.838 
0    ,256  ,656  ,127      ,512  ,23   ,29.612         ,32.328              ,0.916 
0    ,256  ,7    ,127      ,6    ,23   ,3.667          ,3.671               ,0.999 
0    ,256  ,704  ,127      ,512  ,23   ,31.313         ,33.468              ,0.936 
0    ,256  ,736  ,127      ,1024 ,23   ,32.047         ,36.808              ,0.871 
0    ,256  ,736  ,127      ,288  ,23   ,25.736         ,27.07               ,0.951 
0    ,256  ,752  ,127      ,512  ,23   ,31.08          ,33.516              ,0.927 
0    ,256  ,784  ,127      ,1024 ,23   ,34.631         ,39.876              ,0.868 
0    ,256  ,784  ,127      ,240  ,23   ,26.905         ,27.933              ,0.963 
0    ,256  ,8    ,127      ,7    ,23   ,3.68           ,3.69                ,0.997 
0    ,256  ,80   ,127      ,128  ,23   ,5.225          ,6.739               ,0.775 
0    ,256  ,80   ,127      ,32   ,23   ,5.733          ,6.858               ,0.836 
0    ,256  ,80   ,127      ,48   ,23   ,5.568          ,7.196               ,0.774 
0    ,256  ,80   ,127      ,64   ,23   ,5.386          ,7.077               ,0.761 
0    ,256  ,800  ,127      ,512  ,23   ,33.027         ,34.808              ,0.949 
0    ,256  ,832  ,127      ,1024 ,23   ,36.505         ,41.175              ,0.887 
0    ,256  ,832  ,127      ,192  ,23   ,29.101         ,29.286              ,0.994 
0    ,256  ,880  ,127      ,1024 ,23   ,36.353         ,42.688              ,0.852 
0    ,256  ,880  ,127      ,144  ,23   ,27.392         ,28.384              ,0.965 
0    ,256  ,9    ,127      ,8    ,23   ,3.672          ,3.676               ,0.999 
0    ,256  ,928  ,127      ,1024 ,23   ,39.49          ,46.211              ,0.855 
0    ,256  ,928  ,127      ,96   ,23   ,28.702         ,30.43               ,0.943 
0    ,256  ,96   ,127      ,80   ,23   ,6.891          ,8.695               ,0.793 
0    ,256  ,976  ,127      ,1024 ,23   ,42.081         ,49.033              ,0.858 
0    ,256  ,976  ,127      ,48   ,23   ,31.039         ,32.831              ,0.945 
0    ,4    ,1    ,127      ,0    ,23   ,3.487          ,3.483               ,1.001 
0    ,4    ,10   ,127      ,9    ,23   ,3.672          ,3.676               ,0.999 
0    ,4    ,1024 ,127      ,0    ,23   ,35.088         ,35.799              ,0.98  
0    ,4    ,1024 ,127      ,1024 ,23   ,36.532         ,36.781              ,0.993 
0    ,4    ,1024 ,127      ,144  ,23   ,32.306         ,33.063              ,0.977 
0    ,4    ,1024 ,127      ,192  ,23   ,33.842         ,34.018              ,0.995 
0    ,4    ,1024 ,127      ,240  ,23   ,32.997         ,33.75               ,0.978 
0    ,4    ,1024 ,127      ,288  ,23   ,34.319         ,35.103              ,0.978 
0    ,4    ,1024 ,127      ,48   ,23   ,34.234         ,35.567              ,0.963 
0    ,4    ,1024 ,127      ,736  ,23   ,36.362         ,38.79               ,0.937 
0    ,4    ,1024 ,127      ,784  ,23   ,37.155         ,37.523              ,0.99  
0    ,4    ,1024 ,127      ,832  ,23   ,37.483         ,41.873              ,0.895 
0    ,4    ,1024 ,127      ,880  ,23   ,38.046         ,40.079              ,0.949 
0    ,4    ,1024 ,127      ,928  ,23   ,36.544         ,38.781              ,0.942 
0    ,4    ,1024 ,127      ,96   ,23   ,32.139         ,33.166              ,0.969 
0    ,4    ,1024 ,127      ,976  ,23   ,39.354         ,39.628              ,0.993 
0    ,4    ,1072 ,127      ,1024 ,23   ,38.495         ,39.141              ,0.984 
0    ,4    ,11   ,127      ,10   ,23   ,3.695          ,3.691               ,1.001 
0    ,4    ,112  ,127      ,144  ,23   ,5.946          ,7.881               ,0.755 
0    ,4    ,112  ,127      ,16   ,23   ,7.018          ,8.82                ,0.796 
0    ,4    ,112  ,127      ,256  ,23   ,7.148          ,8.811               ,0.811 
0    ,4    ,112  ,127      ,64   ,23   ,7.008          ,8.515               ,0.823 
0    ,4    ,112  ,127      ,96   ,23   ,6.209          ,7.835               ,0.793 
0    ,4    ,1120 ,127      ,1024 ,23   ,41.472         ,42.441              ,0.977 
0    ,4    ,1168 ,127      ,1024 ,23   ,42.698         ,44.814              ,0.953 
0    ,4    ,12   ,127      ,11   ,23   ,3.753          ,3.733               ,1.005 
0    ,4    ,1216 ,127      ,1024 ,23   ,44.024         ,45.246              ,0.973 
0    ,4    ,1264 ,127      ,1024 ,23   ,45.369         ,47.162              ,0.962 
0    ,4    ,128  ,127      ,0    ,23   ,9.94           ,11.416              ,0.871 
0    ,4    ,128  ,127      ,112  ,23   ,9.966          ,12.549              ,0.794 
0    ,4    ,128  ,127      ,128  ,23   ,8.554          ,9.86                ,0.868 
0    ,4    ,128  ,127      ,144  ,23   ,8.637          ,10.054              ,0.859 
0    ,4    ,128  ,127      ,192  ,23   ,8.596          ,10.029              ,0.857 
0    ,4    ,128  ,127      ,240  ,23   ,8.663          ,10.525              ,0.823 
0    ,4    ,128  ,127      ,288  ,23   ,9.681          ,11.264              ,0.859 
0    ,4    ,128  ,127      ,32   ,23   ,10.167         ,11.199              ,0.908 
0    ,4    ,128  ,127      ,48   ,23   ,9.971          ,11.3                ,0.882 
0    ,4    ,128  ,127      ,80   ,23   ,9.776          ,10.966              ,0.892 
0    ,4    ,128  ,127      ,96   ,23   ,8.636          ,10.134              ,0.852 
0    ,4    ,13   ,127      ,12   ,23   ,3.756          ,3.761               ,0.999 
0    ,4    ,1312 ,127      ,1024 ,23   ,44.547         ,45.754              ,0.974 
0    ,4    ,14   ,127      ,13   ,23   ,3.732          ,3.747               ,0.996 
0    ,4    ,144  ,127      ,128  ,23   ,10.127         ,10.97               ,0.923 
0    ,4    ,15   ,127      ,14   ,23   ,3.765          ,3.775               ,0.997 
0    ,4    ,16   ,127      ,0    ,23   ,3.486          ,3.454               ,1.009 
0    ,4    ,16   ,127      ,144  ,23   ,4.053          ,4.049               ,1.001 
0    ,4    ,16   ,127      ,15   ,23   ,3.748          ,3.746               ,1.0   
0    ,4    ,16   ,127      ,16   ,23   ,3.495          ,3.484               ,1.003 
0    ,4    ,16   ,127      ,192  ,23   ,3.997          ,3.997               ,1.0   
0    ,4    ,16   ,127      ,240  ,23   ,4.024          ,3.986               ,1.009 
0    ,4    ,16   ,127      ,256  ,23   ,4.065          ,4.184               ,0.972 
0    ,4    ,16   ,127      ,288  ,23   ,3.979          ,4.677               ,0.851 
0    ,4    ,16   ,127      ,48   ,23   ,3.554          ,3.506               ,1.014 
0    ,4    ,16   ,127      ,64   ,23   ,3.98           ,4.086               ,0.974 
0    ,4    ,16   ,127      ,96   ,23   ,4.047          ,4.051               ,0.999 
0    ,4    ,160  ,127      ,144  ,23   ,9.849          ,11.009              ,0.895 
0    ,4    ,160  ,127      ,16   ,23   ,11.933         ,13.626              ,0.876 
0    ,4    ,160  ,127      ,256  ,23   ,9.79           ,11.149              ,0.878 
0    ,4    ,160  ,127      ,64   ,23   ,9.971          ,11.581              ,0.861 
0    ,4    ,160  ,127      ,96   ,23   ,8.583          ,10.209              ,0.841 
0    ,4    ,17   ,127      ,16   ,23   ,3.734          ,3.748               ,0.996 
0    ,4    ,176  ,127      ,128  ,23   ,9.491          ,11.309              ,0.839 
0    ,4    ,176  ,127      ,160  ,23   ,9.467          ,11.071              ,0.855 
0    ,4    ,176  ,127      ,32   ,23   ,9.901          ,11.688              ,0.847 
0    ,4    ,1760 ,127      ,2048 ,23   ,53.712         ,54.559              ,0.984 
0    ,4    ,1760 ,127      ,288  ,23   ,53.455         ,68.237              ,0.783 
0    ,4    ,18   ,127      ,17   ,23   ,3.708          ,3.72                ,0.997 
0    ,4    ,1808 ,127      ,2048 ,23   ,55.458         ,55.699              ,0.996 
0    ,4    ,1808 ,127      ,240  ,23   ,55.07          ,72.605              ,0.758 
0    ,4    ,1856 ,127      ,192  ,23   ,57.064         ,75.144              ,0.759 
0    ,4    ,1856 ,127      ,2048 ,23   ,57.648         ,57.765              ,0.998 
0    ,4    ,19   ,127      ,18   ,23   ,3.727          ,3.728               ,1.0   
0    ,4    ,1904 ,127      ,144  ,23   ,53.758         ,75.311              ,0.714 
0    ,4    ,1904 ,127      ,2048 ,23   ,58.335         ,57.869              ,1.008 
0    ,4    ,192  ,127      ,176  ,23   ,11.282         ,12.456              ,0.906 
0    ,4    ,1952 ,127      ,2048 ,23   ,59.45          ,59.715              ,0.996 
0    ,4    ,1952 ,127      ,96   ,23   ,78.648         ,74.899              ,1.05  
0    ,4    ,2    ,127      ,1    ,23   ,3.523          ,3.519               ,1.001 
0    ,4    ,20   ,127      ,19   ,23   ,3.699          ,3.701               ,0.999 
0    ,4    ,2000 ,127      ,2048 ,23   ,61.658         ,61.447              ,1.003 
0    ,4    ,2000 ,127      ,48   ,23   ,78.569         ,79.898              ,0.983 
0    ,4    ,2048 ,127      ,0    ,23   ,88.014         ,84.246              ,1.045 
0    ,4    ,2048 ,127      ,1024 ,23   ,64.842         ,65.933              ,0.983 
0    ,4    ,2048 ,127      ,128  ,23   ,58.983         ,78.869              ,0.748 
0    ,4    ,2048 ,127      ,144  ,23   ,59.302         ,80.038              ,0.741 
0    ,4    ,2048 ,127      ,1760 ,23   ,66.634         ,68.68               ,0.97  
0    ,4    ,2048 ,127      ,1808 ,23   ,67.537         ,67.691              ,0.998 
0    ,4    ,2048 ,127      ,1856 ,23   ,68.247         ,69.206              ,0.986 
0    ,4    ,2048 ,127      ,1904 ,23   ,68.615         ,69.063              ,0.994 
0    ,4    ,2048 ,127      ,192  ,23   ,82.408         ,80.87               ,1.019 
0    ,4    ,2048 ,127      ,1952 ,23   ,69.018         ,70.204              ,0.983 
0    ,4    ,2048 ,127      ,2000 ,23   ,71.407         ,71.917              ,0.993 
0    ,4    ,2048 ,127      ,2048 ,23   ,62.308         ,62.297              ,1.0   
0    ,4    ,2048 ,127      ,240  ,23   ,85.405         ,81.052              ,1.054 
0    ,4    ,2048 ,127      ,256  ,23   ,65.475         ,81.985              ,0.799 
0    ,4    ,2048 ,127      ,288  ,23   ,83.898         ,81.127              ,1.034 
0    ,4    ,2048 ,127      ,32   ,23   ,81.603         ,83.495              ,0.977 
0    ,4    ,2048 ,127      ,4096 ,23   ,59.799         ,60.581              ,0.987 
0    ,4    ,2048 ,127      ,48   ,23   ,86.377         ,83.332              ,1.037 
0    ,4    ,2048 ,127      ,512  ,23   ,81.995         ,89.633              ,0.915 
0    ,4    ,2048 ,127      ,64   ,23   ,80.486         ,82.866              ,0.971 
0    ,4    ,2048 ,127      ,96   ,23   ,83.451         ,82.836              ,1.007 
0    ,4    ,208  ,127      ,16   ,23   ,11.882         ,13.43               ,0.885 
0    ,4    ,208  ,127      ,192  ,23   ,11.884         ,13.32               ,0.892 
0    ,4    ,208  ,127      ,256  ,23   ,12.054         ,13.906              ,0.867 
0    ,4    ,208  ,127      ,48   ,23   ,11.496         ,13.008              ,0.884 
0    ,4    ,208  ,127      ,64   ,23   ,11.587         ,13.549              ,0.855 
0    ,4    ,2096 ,127      ,2048 ,23   ,71.654         ,72.334              ,0.991 
0    ,4    ,21   ,127      ,20   ,23   ,3.71           ,3.726               ,0.996 
0    ,4    ,2144 ,127      ,2048 ,23   ,78.692         ,78.067              ,1.008 
0    ,4    ,2192 ,127      ,2048 ,23   ,73.591         ,74.875              ,0.983 
0    ,4    ,22   ,127      ,21   ,23   ,3.733          ,3.731               ,1.0   
0    ,4    ,224  ,127      ,128  ,23   ,12.304         ,12.664              ,0.972 
0    ,4    ,224  ,127      ,208  ,23   ,11.724         ,13.224              ,0.887 
0    ,4    ,224  ,127      ,288  ,23   ,11.811         ,13.797              ,0.856 
0    ,4    ,224  ,127      ,32   ,23   ,11.783         ,13.401              ,0.879 
0    ,4    ,224  ,127      ,512  ,23   ,12.283         ,12.816              ,0.958 
0    ,4    ,2240 ,127      ,2048 ,23   ,71.418         ,71.731              ,0.996 
0    ,4    ,2288 ,127      ,2048 ,23   ,73.787         ,73.959              ,0.998 
0    ,4    ,23   ,127      ,22   ,23   ,3.71           ,3.714               ,0.999 
0    ,4    ,2336 ,127      ,2048 ,23   ,76.417         ,77.051              ,0.992 
0    ,4    ,24   ,127      ,23   ,23   ,3.69           ,3.695               ,0.999 
0    ,4    ,240  ,127      ,224  ,23   ,11.699         ,13.407              ,0.873 
0    ,4    ,25   ,127      ,24   ,23   ,3.696          ,3.7                 ,0.999 
0    ,4    ,256  ,127      ,0    ,23   ,13.175         ,14.916              ,0.883 
0    ,4    ,256  ,127      ,112  ,23   ,11.733         ,13.597              ,0.863 
0    ,4    ,256  ,127      ,144  ,23   ,13.758         ,14.227              ,0.967 
0    ,4    ,256  ,127      ,16   ,23   ,14.011         ,16.088              ,0.871 
0    ,4    ,256  ,127      ,160  ,23   ,12.872         ,14.245              ,0.904 
0    ,4    ,256  ,127      ,192  ,23   ,14.946         ,15.606              ,0.958 
0    ,4    ,256  ,127      ,208  ,23   ,14.959         ,15.547              ,0.962 
0    ,4    ,256  ,127      ,240  ,23   ,14.08          ,15.414              ,0.914 
0    ,4    ,256  ,127      ,256  ,23   ,14.899         ,14.784              ,1.008 
0    ,4    ,256  ,127      ,288  ,23   ,15.122         ,15.701              ,0.963 
0    ,4    ,256  ,127      ,48   ,23   ,13.193         ,14.705              ,0.897 
0    ,4    ,256  ,127      ,64   ,23   ,13.232         ,14.869              ,0.89  
0    ,4    ,256  ,127      ,96   ,23   ,12.125         ,13.907              ,0.872 
0    ,4    ,26   ,127      ,25   ,23   ,3.784          ,3.794               ,0.997 
0    ,4    ,27   ,127      ,26   ,23   ,3.811          ,4.602               ,0.828 
0    ,4    ,272  ,127      ,128  ,23   ,13.842         ,14.291              ,0.969 
0    ,4    ,272  ,127      ,240  ,23   ,14.653         ,15.561              ,0.942 
0    ,4    ,272  ,127      ,256  ,23   ,14.838         ,16.751              ,0.886 
0    ,4    ,272  ,127      ,32   ,23   ,13.459         ,15.027              ,0.896 
0    ,4    ,272  ,127      ,512  ,23   ,13.731         ,15.173              ,0.905 
0    ,4    ,28   ,127      ,27   ,23   ,3.636          ,4.669               ,0.779 
0    ,4    ,288  ,127      ,272  ,23   ,14.789         ,16.787              ,0.881 
0    ,4    ,29   ,127      ,28   ,23   ,3.719          ,4.744               ,0.784 
0    ,4    ,3    ,127      ,2    ,23   ,3.573          ,3.568               ,1.001 
0    ,4    ,30   ,127      ,29   ,23   ,3.614          ,4.445               ,0.813 
0    ,4    ,304  ,127      ,16   ,23   ,13.289         ,15.266              ,0.871 
0    ,4    ,304  ,127      ,256  ,23   ,15.052         ,16.634              ,0.905 
0    ,4    ,304  ,127      ,64   ,23   ,13.123         ,14.957              ,0.877 
0    ,4    ,31   ,127      ,30   ,23   ,3.543          ,4.653               ,0.761 
0    ,4    ,32   ,127      ,0    ,23   ,4.84           ,5.883               ,0.823 
0    ,4    ,32   ,127      ,128  ,23   ,4.882          ,6.274               ,0.778 
0    ,4    ,32   ,127      ,144  ,23   ,4.997          ,5.352               ,0.934 
0    ,4    ,32   ,127      ,16   ,23   ,5.05           ,5.737               ,0.88  
0    ,4    ,32   ,127      ,192  ,23   ,4.696          ,5.292               ,0.887 
0    ,4    ,32   ,127      ,240  ,23   ,4.824          ,5.315               ,0.908 
0    ,4    ,32   ,127      ,288  ,23   ,4.576          ,5.298               ,0.864 
0    ,4    ,32   ,127      ,31   ,23   ,4.885          ,6.199               ,0.788 
0    ,4    ,32   ,127      ,32   ,23   ,4.831          ,5.242               ,0.922 
0    ,4    ,32   ,127      ,48   ,23   ,4.873          ,5.114               ,0.953 
0    ,4    ,32   ,127      ,96   ,23   ,4.817          ,5.54                ,0.87  
0    ,4    ,320  ,127      ,128  ,23   ,15.269         ,16.163              ,0.945 
0    ,4    ,320  ,127      ,192  ,23   ,16.373         ,16.909              ,0.968 
0    ,4    ,320  ,127      ,32   ,23   ,14.899         ,16.804              ,0.887 
0    ,4    ,320  ,127      ,512  ,23   ,16.514         ,16.873              ,0.979 
0    ,4    ,352  ,127      ,256  ,23   ,17.465         ,18.182              ,0.961 
0    ,4    ,352  ,127      ,64   ,23   ,14.578         ,16.396              ,0.889 
0    ,4    ,368  ,127      ,128  ,23   ,15.143         ,15.633              ,0.969 
0    ,4    ,368  ,127      ,144  ,23   ,15.407         ,15.945              ,0.966 
0    ,4    ,368  ,127      ,512  ,23   ,16.472         ,16.683              ,0.987 
0    ,4    ,4    ,127      ,3    ,23   ,3.651          ,3.63                ,1.006 
0    ,4    ,400  ,127      ,256  ,23   ,18.655         ,19.162              ,0.974 
0    ,4    ,416  ,127      ,128  ,23   ,16.666         ,17.079              ,0.976 
0    ,4    ,416  ,127      ,512  ,23   ,17.694         ,19.41               ,0.912 
0    ,4    ,416  ,127      ,96   ,23   ,15.159         ,17.093              ,0.887 
0    ,4    ,448  ,127      ,256  ,23   ,20.259         ,20.835              ,0.972 
0    ,4    ,464  ,127      ,48   ,23   ,17.868         ,19.735              ,0.905 
0    ,4    ,464  ,127      ,512  ,23   ,20.457         ,20.864              ,0.981 
0    ,4    ,48   ,127      ,32   ,23   ,4.758          ,6.032               ,0.789 
0    ,4    ,496  ,127      ,256  ,23   ,20.028         ,21.012              ,0.953 
0    ,4    ,5    ,127      ,4    ,23   ,3.656          ,3.657               ,1.0   
0    ,4    ,512  ,127      ,0    ,23   ,20.213         ,21.638              ,0.934 
0    ,4    ,512  ,127      ,144  ,23   ,19.229         ,20.362              ,0.944 
0    ,4    ,512  ,127      ,192  ,23   ,20.727         ,21.305              ,0.973 
0    ,4    ,512  ,127      ,224  ,23   ,19.978         ,21.341              ,0.936 
0    ,4    ,512  ,127      ,240  ,23   ,19.722         ,21.646              ,0.911 
0    ,4    ,512  ,127      ,272  ,23   ,21.835         ,22.32               ,0.978 
0    ,4    ,512  ,127      ,288  ,23   ,20.791         ,22.309              ,0.932 
0    ,4    ,512  ,127      ,320  ,23   ,21.715         ,22.348              ,0.972 
0    ,4    ,512  ,127      ,368  ,23   ,20.717         ,22.498              ,0.921 
0    ,4    ,512  ,127      ,416  ,23   ,21.48          ,23.417              ,0.917 
0    ,4    ,512  ,127      ,464  ,23   ,21.983         ,22.526              ,0.976 
0    ,4    ,512  ,127      ,48   ,23   ,19.467         ,21.459              ,0.907 
0    ,4    ,512  ,127      ,512  ,23   ,21.556         ,22.18               ,0.972 
0    ,4    ,512  ,127      ,96   ,23   ,18.418         ,20.613              ,0.894 
0    ,4    ,544  ,127      ,256  ,23   ,21.565         ,22.443              ,0.961 
0    ,4    ,560  ,127      ,512  ,23   ,22.296         ,24.882              ,0.896 
0    ,4    ,6    ,127      ,5    ,23   ,3.641          ,3.644               ,0.999 
0    ,4    ,608  ,127      ,512  ,23   ,25.329         ,26.104              ,0.97  
0    ,4    ,64   ,127      ,0    ,23   ,6.115          ,7.521               ,0.813 
0    ,4    ,64   ,127      ,144  ,23   ,5.601          ,7.039               ,0.796 
0    ,4    ,64   ,127      ,16   ,23   ,6.232          ,7.576               ,0.823 
0    ,4    ,64   ,127      ,192  ,23   ,5.577          ,7.049               ,0.791 
0    ,4    ,64   ,127      ,240  ,23   ,5.614          ,7.129               ,0.788 
0    ,4    ,64   ,127      ,256  ,23   ,5.995          ,7.719               ,0.777 
0    ,4    ,64   ,127      ,288  ,23   ,6.475          ,7.773               ,0.833 
0    ,4    ,64   ,127      ,48   ,23   ,5.991          ,7.007               ,0.855 
0    ,4    ,64   ,127      ,64   ,23   ,5.507          ,6.613               ,0.833 
0    ,4    ,64   ,127      ,96   ,23   ,5.654          ,7.243               ,0.781 
0    ,4    ,656  ,127      ,512  ,23   ,26.114         ,26.541              ,0.984 
0    ,4    ,7    ,127      ,6    ,23   ,3.672          ,3.677               ,0.999 
0    ,4    ,704  ,127      ,512  ,23   ,28.874         ,29.88               ,0.966 
0    ,4    ,736  ,127      ,1024 ,23   ,25.831         ,26.654              ,0.969 
0    ,4    ,736  ,127      ,288  ,23   ,25.879         ,27.268              ,0.949 
0    ,4    ,752  ,127      ,512  ,23   ,29.208         ,30.644              ,0.953 
0    ,4    ,784  ,127      ,1024 ,23   ,27.628         ,29.265              ,0.944 
0    ,4    ,784  ,127      ,240  ,23   ,26.804         ,27.815              ,0.964 
0    ,4    ,8    ,127      ,7    ,23   ,3.672          ,3.676               ,0.999 
0    ,4    ,80   ,127      ,128  ,23   ,5.119          ,6.532               ,0.784 
0    ,4    ,80   ,127      ,32   ,23   ,5.646          ,7.295               ,0.774 
0    ,4    ,80   ,127      ,48   ,23   ,5.663          ,7.321               ,0.773 
0    ,4    ,80   ,127      ,64   ,23   ,5.202          ,6.616               ,0.786 
0    ,4    ,800  ,127      ,512  ,23   ,30.149         ,31.294              ,0.963 
0    ,4    ,832  ,127      ,1024 ,23   ,30.554         ,30.594              ,0.999 
0    ,4    ,832  ,127      ,192  ,23   ,29.195         ,29.332              ,0.995 
0    ,4    ,880  ,127      ,1024 ,23   ,30.665         ,31.135              ,0.985 
0    ,4    ,880  ,127      ,144  ,23   ,27.531         ,28.4                ,0.969 
0    ,4    ,9    ,127      ,8    ,23   ,3.676          ,3.68                ,0.999 
0    ,4    ,928  ,127      ,1024 ,23   ,32.165         ,32.648              ,0.985 
0    ,4    ,928  ,127      ,96   ,23   ,29.012         ,29.832              ,0.973 
0    ,4    ,96   ,127      ,80   ,23   ,7.17           ,8.775               ,0.817 
0    ,4    ,976  ,127      ,1024 ,23   ,33.58          ,34.26               ,0.98  
0    ,4    ,976  ,127      ,48   ,23   ,30.879         ,32.767              ,0.942 
0    ,64   ,1    ,127      ,0    ,23   ,3.477          ,3.46                ,1.005 
0    ,64   ,10   ,127      ,9    ,23   ,3.68           ,3.686               ,0.999 
0    ,64   ,1024 ,127      ,0    ,23   ,35.631         ,35.876              ,0.993 
0    ,64   ,1024 ,127      ,1024 ,23   ,45.181         ,51.777              ,0.873 
0    ,64   ,1024 ,127      ,144  ,23   ,32.18          ,33.228              ,0.968 
0    ,64   ,1024 ,127      ,192  ,23   ,34.128         ,34.105              ,1.001 
0    ,64   ,1024 ,127      ,240  ,23   ,33.026         ,33.942              ,0.973 
0    ,64   ,1024 ,127      ,288  ,23   ,34.054         ,35.033              ,0.972 
0    ,64   ,1024 ,127      ,48   ,23   ,34.209         ,35.787              ,0.956 
0    ,64   ,1024 ,127      ,736  ,23   ,39.5           ,43.527              ,0.907 
0    ,64   ,1024 ,127      ,784  ,23   ,41.692         ,44.959              ,0.927 
0    ,64   ,1024 ,127      ,832  ,23   ,42.868         ,46.789              ,0.916 
0    ,64   ,1024 ,127      ,880  ,23   ,41.615         ,46.687              ,0.891 
0    ,64   ,1024 ,127      ,928  ,23   ,42.763         ,48.59               ,0.88  
0    ,64   ,1024 ,127      ,96   ,23   ,31.607         ,33.517              ,0.943 
0    ,64   ,1024 ,127      ,976  ,23   ,45.214         ,50.379              ,0.897 
0    ,64   ,1072 ,127      ,1024 ,23   ,44.69          ,52.169              ,0.857 
0    ,64   ,11   ,127      ,10   ,23   ,3.69           ,3.695               ,0.999 
0    ,64   ,112  ,127      ,144  ,23   ,5.793          ,7.603               ,0.762 
0    ,64   ,112  ,127      ,16   ,23   ,7.068          ,8.932               ,0.791 
0    ,64   ,112  ,127      ,256  ,23   ,6.182          ,7.265               ,0.851 
0    ,64   ,112  ,127      ,64   ,23   ,6.858          ,8.588               ,0.799 
0    ,64   ,112  ,127      ,96   ,23   ,5.911          ,7.981               ,0.741 
0    ,64   ,1120 ,127      ,1024 ,23   ,48.057         ,53.429              ,0.899 
0    ,64   ,1168 ,127      ,1024 ,23   ,48.896         ,54.819              ,0.892 
0    ,64   ,12   ,127      ,11   ,23   ,3.686          ,3.69                ,0.999 
0    ,64   ,1216 ,127      ,1024 ,23   ,52.335         ,60.571              ,0.864 
0    ,64   ,1264 ,127      ,1024 ,23   ,50.943         ,55.742              ,0.914 
0    ,64   ,128  ,127      ,0    ,23   ,10.083         ,11.255              ,0.896 
0    ,64   ,128  ,127      ,112  ,23   ,10.139         ,12.492              ,0.812 
0    ,64   ,128  ,127      ,128  ,23   ,8.529          ,9.939               ,0.858 
0    ,64   ,128  ,127      ,144  ,23   ,8.655          ,10.129              ,0.855 
0    ,64   ,128  ,127      ,192  ,23   ,8.766          ,10.197              ,0.86  
0    ,64   ,128  ,127      ,240  ,23   ,8.902          ,10.167              ,0.876 
0    ,64   ,128  ,127      ,288  ,23   ,8.909          ,10.185              ,0.875 
0    ,64   ,128  ,127      ,32   ,23   ,10.029         ,11.241              ,0.892 
0    ,64   ,128  ,127      ,48   ,23   ,9.828          ,11.021              ,0.892 
0    ,64   ,128  ,127      ,80   ,23   ,9.741          ,10.845              ,0.898 
0    ,64   ,128  ,127      ,96   ,23   ,8.592          ,10.001              ,0.859 
0    ,64   ,13   ,127      ,12   ,23   ,3.724          ,3.726               ,0.999 
0    ,64   ,1312 ,127      ,1024 ,23   ,52.227         ,57.168              ,0.914 
0    ,64   ,14   ,127      ,13   ,23   ,3.732          ,3.733               ,1.0   
0    ,64   ,144  ,127      ,128  ,23   ,9.849          ,11.39               ,0.865 
0    ,64   ,15   ,127      ,14   ,23   ,3.752          ,3.76                ,0.998 
0    ,64   ,16   ,127      ,0    ,23   ,3.444          ,3.423               ,1.006 
0    ,64   ,16   ,127      ,144  ,23   ,3.555          ,3.509               ,1.013 
0    ,64   ,16   ,127      ,15   ,23   ,3.756          ,3.76                ,0.999 
0    ,64   ,16   ,127      ,16   ,23   ,3.467          ,3.473               ,0.998 
0    ,64   ,16   ,127      ,192  ,23   ,3.464          ,3.467               ,0.999 
0    ,64   ,16   ,127      ,240  ,23   ,3.429          ,3.432               ,0.999 
0    ,64   ,16   ,127      ,256  ,23   ,3.533          ,3.541               ,0.998 
0    ,64   ,16   ,127      ,288  ,23   ,3.431          ,3.914               ,0.877 
0    ,64   ,16   ,127      ,48   ,23   ,3.498          ,3.491               ,1.002 
0    ,64   ,16   ,127      ,64   ,23   ,3.462          ,3.488               ,0.993 
0    ,64   ,16   ,127      ,96   ,23   ,3.498          ,3.503               ,0.998 
0    ,64   ,160  ,127      ,144  ,23   ,9.941          ,10.922              ,0.91  
0    ,64   ,160  ,127      ,16   ,23   ,12.535         ,13.57               ,0.924 
0    ,64   ,160  ,127      ,256  ,23   ,9.738          ,11.169              ,0.872 
0    ,64   ,160  ,127      ,64   ,23   ,9.907          ,11.578              ,0.856 
0    ,64   ,160  ,127      ,96   ,23   ,8.759          ,10.112              ,0.866 
0    ,64   ,17   ,127      ,16   ,23   ,3.727          ,3.734               ,0.998 
0    ,64   ,176  ,127      ,128  ,23   ,9.609          ,11.025              ,0.872 
0    ,64   ,176  ,127      ,160  ,23   ,9.717          ,10.7                ,0.908 
0    ,64   ,176  ,127      ,32   ,23   ,9.898          ,11.644              ,0.85  
0    ,64   ,1760 ,127      ,2048 ,23   ,95.593         ,108.833             ,0.878 
0    ,64   ,1760 ,127      ,288  ,23   ,53.134         ,68.014              ,0.781 
0    ,64   ,18   ,127      ,17   ,23   ,3.706          ,3.704               ,1.0   
0    ,64   ,1808 ,127      ,2048 ,23   ,98.905         ,112.156             ,0.882 
0    ,64   ,1808 ,127      ,240  ,23   ,54.856         ,72.125              ,0.761 
0    ,64   ,1856 ,127      ,192  ,23   ,57.624         ,75.466              ,0.764 
0    ,64   ,1856 ,127      ,2048 ,23   ,100.943        ,113.966             ,0.886 
0    ,64   ,19   ,127      ,18   ,23   ,3.723          ,3.734               ,0.997 
0    ,64   ,1904 ,127      ,144  ,23   ,53.812         ,72.327              ,0.744 
0    ,64   ,1904 ,127      ,2048 ,23   ,101.654        ,115.491             ,0.88  
0    ,64   ,192  ,127      ,176  ,23   ,11.159         ,12.463              ,0.895 
0    ,64   ,1952 ,127      ,2048 ,23   ,103.488        ,118.261             ,0.875 
0    ,64   ,1952 ,127      ,96   ,23   ,79.301         ,73.929              ,1.073 
0    ,64   ,2    ,127      ,1    ,23   ,3.515          ,3.483               ,1.009 
0    ,64   ,20   ,127      ,19   ,23   ,3.728          ,3.742               ,0.996 
0    ,64   ,2000 ,127      ,2048 ,23   ,105.812        ,121.233             ,0.873 
0    ,64   ,2000 ,127      ,48   ,23   ,80.643         ,81.529              ,0.989 
0    ,64   ,2048 ,127      ,0    ,23   ,86.262         ,81.948              ,1.053 
0    ,64   ,2048 ,127      ,1024 ,23   ,86.661         ,89.785              ,0.965 
0    ,64   ,2048 ,127      ,128  ,23   ,75.644         ,97.954              ,0.772 
0    ,64   ,2048 ,127      ,144  ,23   ,59.987         ,78.445              ,0.765 
0    ,64   ,2048 ,127      ,1760 ,23   ,102.241        ,118.125             ,0.866 
0    ,64   ,2048 ,127      ,1808 ,23   ,106.846        ,118.735             ,0.9   
0    ,64   ,2048 ,127      ,1856 ,23   ,105.991        ,120.37              ,0.881 
0    ,64   ,2048 ,127      ,1904 ,23   ,105.956        ,120.117             ,0.882 
0    ,64   ,2048 ,127      ,192  ,23   ,85.502         ,80.687              ,1.06  
0    ,64   ,2048 ,127      ,1952 ,23   ,110.138        ,120.182             ,0.916 
0    ,64   ,2048 ,127      ,2000 ,23   ,106.915        ,121.95              ,0.877 
0    ,64   ,2048 ,127      ,2048 ,23   ,105.578        ,120.927             ,0.873 
0    ,64   ,2048 ,127      ,240  ,23   ,83.414         ,80.906              ,1.031 
0    ,64   ,2048 ,127      ,256  ,23   ,93.172         ,100.421             ,0.928 
0    ,64   ,2048 ,127      ,288  ,23   ,78.99          ,81.159              ,0.973 
0    ,64   ,2048 ,127      ,32   ,23   ,83.089         ,81.832              ,1.015 
0    ,64   ,2048 ,127      ,4096 ,23   ,106.049        ,121.237             ,0.875 
0    ,64   ,2048 ,127      ,48   ,23   ,83.57          ,83.426              ,1.002 
0    ,64   ,2048 ,127      ,512  ,23   ,82.349         ,104.994             ,0.784 
0    ,64   ,2048 ,127      ,64   ,23   ,83.331         ,85.268              ,0.977 
0    ,64   ,2048 ,127      ,96   ,23   ,83.696         ,82.087              ,1.02  
0    ,64   ,208  ,127      ,16   ,23   ,13.753         ,15.158              ,0.907 
0    ,64   ,208  ,127      ,192  ,23   ,11.843         ,13.269              ,0.893 
0    ,64   ,208  ,127      ,256  ,23   ,11.776         ,13.823              ,0.852 
0    ,64   ,208  ,127      ,48   ,23   ,11.55          ,13.355              ,0.865 
0    ,64   ,208  ,127      ,64   ,23   ,11.594         ,13.322              ,0.87  
0    ,64   ,2096 ,127      ,2048 ,23   ,107.816        ,124.266             ,0.868 
0    ,64   ,21   ,127      ,20   ,23   ,3.724          ,3.731               ,0.998 
0    ,64   ,2144 ,127      ,2048 ,23   ,109.099        ,125.101             ,0.872 
0    ,64   ,2192 ,127      ,2048 ,23   ,112.059        ,126.594             ,0.885 
0    ,64   ,22   ,127      ,21   ,23   ,3.7            ,3.712               ,0.997 
0    ,64   ,224  ,127      ,128  ,23   ,12.329         ,12.832              ,0.961 
0    ,64   ,224  ,127      ,208  ,23   ,11.74          ,13.336              ,0.88  
0    ,64   ,224  ,127      ,288  ,23   ,11.795         ,13.64               ,0.865 
0    ,64   ,224  ,127      ,32   ,23   ,11.841         ,13.709              ,0.864 
0    ,64   ,224  ,127      ,512  ,23   ,11.797         ,13.576              ,0.869 
0    ,64   ,2240 ,127      ,2048 ,23   ,115.099        ,130.109             ,0.885 
0    ,64   ,2288 ,127      ,2048 ,23   ,115.993        ,130.564             ,0.888 
0    ,64   ,23   ,127      ,22   ,23   ,3.694          ,3.688               ,1.001 
0    ,64   ,2336 ,127      ,2048 ,23   ,116.474        ,131.123             ,0.888 
0    ,64   ,24   ,127      ,23   ,23   ,3.704          ,3.704               ,1.0   
0    ,64   ,240  ,127      ,224  ,23   ,11.727         ,13.426              ,0.873 
0    ,64   ,25   ,127      ,24   ,23   ,3.689          ,3.699               ,0.997 
0    ,64   ,256  ,127      ,0    ,23   ,13.312         ,14.788              ,0.9   
0    ,64   ,256  ,127      ,112  ,23   ,11.622         ,13.572              ,0.856 
0    ,64   ,256  ,127      ,144  ,23   ,13.655         ,14.327              ,0.953 
0    ,64   ,256  ,127      ,16   ,23   ,14.022         ,15.824              ,0.886 
0    ,64   ,256  ,127      ,160  ,23   ,12.666         ,13.875              ,0.913 
0    ,64   ,256  ,127      ,192  ,23   ,15.27          ,15.362              ,0.994 
0    ,64   ,256  ,127      ,208  ,23   ,15.9           ,15.162              ,1.049 
0    ,64   ,256  ,127      ,240  ,23   ,14.188         ,15.325              ,0.926 
0    ,64   ,256  ,127      ,256  ,23   ,13.844         ,14.895              ,0.929 
0    ,64   ,256  ,127      ,288  ,23   ,14.001         ,15.395              ,0.909 
0    ,64   ,256  ,127      ,48   ,23   ,13.079         ,14.809              ,0.883 
0    ,64   ,256  ,127      ,64   ,23   ,13.093         ,14.756              ,0.887 
0    ,64   ,256  ,127      ,96   ,23   ,11.849         ,13.573              ,0.873 
0    ,64   ,26   ,127      ,25   ,23   ,3.693          ,3.791               ,0.974 
0    ,64   ,27   ,127      ,26   ,23   ,3.684          ,4.445               ,0.829 
0    ,64   ,272  ,127      ,128  ,23   ,13.84          ,14.229              ,0.973 
0    ,64   ,272  ,127      ,240  ,23   ,14.123         ,15.343              ,0.92  
0    ,64   ,272  ,127      ,256  ,23   ,15.111         ,16.873              ,0.896 
0    ,64   ,272  ,127      ,32   ,23   ,13.101         ,14.83               ,0.883 
0    ,64   ,272  ,127      ,512  ,23   ,15.437         ,16.894              ,0.914 
0    ,64   ,28   ,127      ,27   ,23   ,3.627          ,4.543               ,0.798 
0    ,64   ,288  ,127      ,272  ,23   ,14.97          ,16.801              ,0.891 
0    ,64   ,29   ,127      ,28   ,23   ,3.621          ,4.85                ,0.746 
0    ,64   ,3    ,127      ,2    ,23   ,3.57           ,3.607               ,0.99  
0    ,64   ,30   ,127      ,29   ,23   ,3.565          ,4.593               ,0.776 
0    ,64   ,304  ,127      ,16   ,23   ,13.418         ,15.238              ,0.881 
0    ,64   ,304  ,127      ,256  ,23   ,14.792         ,16.665              ,0.888 
0    ,64   ,304  ,127      ,64   ,23   ,12.953         ,14.355              ,0.902 
0    ,64   ,31   ,127      ,30   ,23   ,3.56           ,4.69                ,0.759 
0    ,64   ,32   ,127      ,0    ,23   ,4.862          ,5.751               ,0.845 
0    ,64   ,32   ,127      ,128  ,23   ,4.827          ,5.497               ,0.878 
0    ,64   ,32   ,127      ,144  ,23   ,4.828          ,5.031               ,0.96  
0    ,64   ,32   ,127      ,16   ,23   ,4.956          ,5.831               ,0.85  
0    ,64   ,32   ,127      ,192  ,23   ,4.827          ,5.696               ,0.847 
0    ,64   ,32   ,127      ,240  ,23   ,4.771          ,5.059               ,0.943 
0    ,64   ,32   ,127      ,288  ,23   ,4.807          ,5.232               ,0.919 
0    ,64   ,32   ,127      ,31   ,23   ,4.951          ,6.084               ,0.814 
0    ,64   ,32   ,127      ,32   ,23   ,4.786          ,5.448               ,0.878 
0    ,64   ,32   ,127      ,48   ,23   ,4.868          ,5.681               ,0.857 
0    ,64   ,32   ,127      ,96   ,23   ,4.822          ,5.475               ,0.881 
0    ,64   ,320  ,127      ,128  ,23   ,15.677         ,15.721              ,0.997 
0    ,64   ,320  ,127      ,192  ,23   ,16.694         ,17.08               ,0.977 
0    ,64   ,320  ,127      ,32   ,23   ,14.719         ,16.668              ,0.883 
0    ,64   ,320  ,127      ,512  ,23   ,16.663         ,18.256              ,0.913 
0    ,64   ,352  ,127      ,256  ,23   ,17.748         ,17.855              ,0.994 
0    ,64   ,352  ,127      ,64   ,23   ,14.732         ,16.748              ,0.88  
0    ,64   ,368  ,127      ,128  ,23   ,15.716         ,15.943              ,0.986 
0    ,64   ,368  ,127      ,144  ,23   ,15.335         ,15.757              ,0.973 
0    ,64   ,368  ,127      ,512  ,23   ,17.381         ,19.018              ,0.914 
0    ,64   ,4    ,127      ,3    ,23   ,3.602          ,3.604               ,0.999 
0    ,64   ,400  ,127      ,256  ,23   ,19.026         ,19.222              ,0.99  
0    ,64   ,416  ,127      ,128  ,23   ,16.741         ,17.173              ,0.975 
0    ,64   ,416  ,127      ,512  ,23   ,19.657         ,22.136              ,0.888 
0    ,64   ,416  ,127      ,96   ,23   ,14.974         ,16.767              ,0.893 
0    ,64   ,448  ,127      ,256  ,23   ,21.109         ,22.279              ,0.947 
0    ,64   ,464  ,127      ,48   ,23   ,17.829         ,19.286              ,0.924 
0    ,64   ,464  ,127      ,512  ,23   ,22.083         ,25.044              ,0.882 
0    ,64   ,48   ,127      ,32   ,23   ,4.734          ,5.789               ,0.818 
0    ,64   ,496  ,127      ,256  ,23   ,20.453         ,20.97               ,0.975 
0    ,64   ,5    ,127      ,4    ,23   ,3.606          ,3.607               ,1.0   
0    ,64   ,512  ,127      ,0    ,23   ,20.014         ,21.348              ,0.938 
0    ,64   ,512  ,127      ,144  ,23   ,19.349         ,20.692              ,0.935 
0    ,64   ,512  ,127      ,192  ,23   ,20.569         ,21.411              ,0.961 
0    ,64   ,512  ,127      ,224  ,23   ,19.958         ,21.504              ,0.928 
0    ,64   ,512  ,127      ,240  ,23   ,19.893         ,21.299              ,0.934 
0    ,64   ,512  ,127      ,272  ,23   ,21.985         ,22.42               ,0.981 
0    ,64   ,512  ,127      ,288  ,23   ,20.742         ,22.523              ,0.921 
0    ,64   ,512  ,127      ,320  ,23   ,22.376         ,23.266              ,0.962 
0    ,64   ,512  ,127      ,368  ,23   ,21.452         ,23.471              ,0.914 
0    ,64   ,512  ,127      ,416  ,23   ,22.458         ,24.594              ,0.913 
0    ,64   ,512  ,127      ,464  ,23   ,24.658         ,26.085              ,0.945 
0    ,64   ,512  ,127      ,48   ,23   ,19.463         ,21.332              ,0.912 
0    ,64   ,512  ,127      ,512  ,23   ,23.517         ,25.921              ,0.907 
0    ,64   ,512  ,127      ,96   ,23   ,18.336         ,20.142              ,0.91  
0    ,64   ,544  ,127      ,256  ,23   ,21.941         ,22.492              ,0.975 
0    ,64   ,560  ,127      ,512  ,23   ,23.841         ,27.595              ,0.864 
0    ,64   ,6    ,127      ,5    ,23   ,3.681          ,3.685               ,0.999 
0    ,64   ,608  ,127      ,512  ,23   ,27.294         ,29.2                ,0.935 
0    ,64   ,64   ,127      ,0    ,23   ,6.033          ,7.373               ,0.818 
0    ,64   ,64   ,127      ,144  ,23   ,5.558          ,7.033               ,0.79  
0    ,64   ,64   ,127      ,16   ,23   ,6.095          ,7.241               ,0.842 
0    ,64   ,64   ,127      ,192  ,23   ,5.791          ,7.44                ,0.778 
0    ,64   ,64   ,127      ,240  ,23   ,5.507          ,6.615               ,0.833 
0    ,64   ,64   ,127      ,256  ,23   ,5.591          ,7.123               ,0.785 
0    ,64   ,64   ,127      ,288  ,23   ,5.582          ,6.965               ,0.801 
0    ,64   ,64   ,127      ,48   ,23   ,5.855          ,6.783               ,0.863 
0    ,64   ,64   ,127      ,64   ,23   ,5.442          ,6.561               ,0.829 
0    ,64   ,64   ,127      ,96   ,23   ,5.666          ,6.827               ,0.83  
0    ,64   ,656  ,127      ,512  ,23   ,29.126         ,30.657              ,0.95  
0    ,64   ,7    ,127      ,6    ,23   ,3.668          ,3.667               ,1.0   
0    ,64   ,704  ,127      ,512  ,23   ,31.35          ,33.496              ,0.936 
0    ,64   ,736  ,127      ,1024 ,23   ,32.03          ,37.032              ,0.865 
0    ,64   ,736  ,127      ,288  ,23   ,25.77          ,27.312              ,0.944 
0    ,64   ,752  ,127      ,512  ,23   ,30.567         ,32.325              ,0.946 
0    ,64   ,784  ,127      ,1024 ,23   ,34.95          ,39.598              ,0.883 
0    ,64   ,784  ,127      ,240  ,23   ,26.742         ,27.676              ,0.966 
0    ,64   ,8    ,127      ,7    ,23   ,3.673          ,3.676               ,0.999 
0    ,64   ,80   ,127      ,128  ,23   ,5.149          ,6.69                ,0.77  
0    ,64   ,80   ,127      ,32   ,23   ,5.633          ,6.993               ,0.806 
0    ,64   ,80   ,127      ,48   ,23   ,5.498          ,6.875               ,0.8   
0    ,64   ,80   ,127      ,64   ,23   ,5.662          ,6.599               ,0.858 
0    ,64   ,800  ,127      ,512  ,23   ,31.594         ,33.21               ,0.951 
0    ,64   ,832  ,127      ,1024 ,23   ,36.503         ,41.066              ,0.889 
0    ,64   ,832  ,127      ,192  ,23   ,29.027         ,29.238              ,0.993 
0    ,64   ,880  ,127      ,1024 ,23   ,36.586         ,42.805              ,0.855 
0    ,64   ,880  ,127      ,144  ,23   ,27.332         ,28.541              ,0.958 
0    ,64   ,9    ,127      ,8    ,23   ,3.684          ,3.676               ,1.002 
0    ,64   ,928  ,127      ,1024 ,23   ,39.856         ,45.936              ,0.868 
0    ,64   ,928  ,127      ,96   ,23   ,28.458         ,29.941              ,0.95  
0    ,64   ,96   ,127      ,80   ,23   ,7.176          ,8.65                ,0.83  
0    ,64   ,976  ,127      ,1024 ,23   ,42.183         ,49.163              ,0.858 
0    ,64   ,976  ,127      ,48   ,23   ,30.857         ,32.39               ,0.953 
1    ,1    ,2048 ,127      ,32   ,0    ,4.685          ,5.861               ,0.799 
1    ,1    ,2048 ,127      ,32   ,23   ,78.6           ,82.253              ,0.956 
1    ,1    ,256  ,127      ,64   ,0    ,5.233          ,6.673               ,0.784 
1    ,1    ,256  ,127      ,64   ,23   ,13.465         ,14.706              ,0.916 
1    ,16   ,2048 ,127      ,32   ,23   ,82.704         ,83.403              ,0.992 
1    ,16   ,256  ,127      ,64   ,23   ,15.204         ,16.726              ,0.909 
1    ,256  ,2048 ,127      ,32   ,23   ,81.618         ,82.322              ,0.991 
1    ,256  ,256  ,127      ,64   ,23   ,15.03          ,16.798              ,0.895 
1    ,4    ,2048 ,127      ,32   ,23   ,83.859         ,82.807              ,1.013 
1    ,4    ,256  ,127      ,64   ,23   ,13.23          ,15.045              ,0.879 
1    ,64   ,2048 ,127      ,32   ,23   ,81.43          ,82.62               ,0.986 
1    ,64   ,256  ,127      ,64   ,23   ,15.01          ,16.573              ,0.906 
105  ,1    ,256  ,127      ,64   ,0    ,5.352          ,6.474               ,0.827 
105  ,1    ,256  ,127      ,64   ,23   ,13.077         ,14.624              ,0.894 
105  ,16   ,256  ,127      ,64   ,23   ,13.041         ,14.829              ,0.879 
105  ,256  ,256  ,127      ,64   ,23   ,14.802         ,16.29               ,0.909 
105  ,4    ,256  ,127      ,64   ,23   ,12.997         ,14.595              ,0.89  
105  ,64   ,256  ,127      ,64   ,23   ,13.035         ,14.716              ,0.886 
15   ,1    ,256  ,127      ,64   ,0    ,5.226          ,6.566               ,0.796 
15   ,1    ,256  ,127      ,64   ,23   ,13.119         ,14.785              ,0.887 
15   ,16   ,256  ,127      ,64   ,23   ,13.1           ,14.861              ,0.881 
15   ,256  ,256  ,127      ,64   ,23   ,14.709         ,16.324              ,0.901 
15   ,4    ,256  ,127      ,64   ,23   ,13.123         ,14.835              ,0.885 
15   ,64   ,256  ,127      ,64   ,23   ,13.167         ,14.712              ,0.895 
2    ,1    ,2048 ,127      ,64   ,0    ,7.626          ,8.793               ,0.867 
2    ,1    ,2048 ,127      ,64   ,23   ,77.366         ,82.498              ,0.938 
2    ,1    ,256  ,127      ,64   ,0    ,5.227          ,6.64                ,0.787 
2    ,1    ,256  ,127      ,64   ,23   ,12.981         ,14.799              ,0.877 
2    ,16   ,2048 ,127      ,64   ,23   ,83.108         ,81.199              ,1.024 
2    ,16   ,256  ,127      ,64   ,23   ,13.098         ,14.731              ,0.889 
2    ,256  ,2048 ,127      ,64   ,23   ,81.901         ,82.507              ,0.993 
2    ,256  ,256  ,127      ,64   ,23   ,14.835         ,16.387              ,0.905 
2    ,4    ,2048 ,127      ,64   ,23   ,75.757         ,80.933              ,0.936 
2    ,4    ,256  ,127      ,64   ,23   ,13.078         ,14.639              ,0.893 
2    ,64   ,2048 ,127      ,64   ,23   ,82.172         ,83.939              ,0.979 
2    ,64   ,256  ,127      ,64   ,23   ,13.102         ,14.615              ,0.896 
3    ,1    ,2048 ,127      ,128  ,0    ,9.898          ,11.049              ,0.896 
3    ,1    ,2048 ,127      ,128  ,23   ,59.115         ,76.422              ,0.774 
3    ,1    ,256  ,127      ,64   ,0    ,5.225          ,6.408               ,0.815 
3    ,1    ,256  ,127      ,64   ,23   ,13.076         ,14.678              ,0.891 
3    ,16   ,2048 ,127      ,128  ,23   ,59.384         ,78.46               ,0.757 
3    ,16   ,256  ,127      ,64   ,23   ,13.018         ,14.459              ,0.9   
3    ,256  ,2048 ,127      ,128  ,23   ,59.252         ,79.934              ,0.741 
3    ,256  ,256  ,127      ,64   ,23   ,14.763         ,16.14               ,0.915 
3    ,4    ,2048 ,127      ,128  ,23   ,59.275         ,79.176              ,0.749 
3    ,4    ,256  ,127      ,64   ,23   ,13.088         ,14.694              ,0.891 
3    ,64   ,2048 ,127      ,128  ,23   ,76.52          ,98.562              ,0.776 
3    ,64   ,256  ,127      ,64   ,23   ,13.063         ,14.775              ,0.884 
30   ,1    ,256  ,127      ,64   ,0    ,5.241          ,6.562               ,0.799 
30   ,1    ,256  ,127      ,64   ,23   ,12.995         ,14.582              ,0.891 
30   ,16   ,256  ,127      ,64   ,23   ,13.097         ,14.652              ,0.894 
30   ,256  ,256  ,127      ,64   ,23   ,14.91          ,16.136              ,0.924 
30   ,4    ,256  ,127      ,64   ,23   ,13.003         ,14.628              ,0.889 
30   ,64   ,256  ,127      ,64   ,23   ,12.969         ,14.458              ,0.897 
4    ,1    ,2048 ,127      ,256  ,0    ,12.666         ,14.449              ,0.877 
4    ,1    ,2048 ,127      ,256  ,23   ,85.886         ,84.646              ,1.015 
4    ,1    ,256  ,127      ,64   ,0    ,5.269          ,6.734               ,0.783 
4    ,1    ,256  ,127      ,64   ,23   ,13.011         ,14.621              ,0.89  
4    ,16   ,2048 ,127      ,256  ,23   ,75.435         ,80.758              ,0.934 
4    ,16   ,256  ,127      ,64   ,23   ,13.119         ,14.838              ,0.884 
4    ,256  ,2048 ,127      ,256  ,23   ,75.436         ,81.42               ,0.927 
4    ,256  ,256  ,127      ,64   ,23   ,14.658         ,16.187              ,0.905 
4    ,4    ,2048 ,127      ,256  ,23   ,63.209         ,81.426              ,0.776 
4    ,4    ,256  ,127      ,64   ,23   ,13.057         ,14.749              ,0.885 
4    ,64   ,2048 ,127      ,256  ,23   ,88.512         ,101.06              ,0.876 
4    ,64   ,256  ,127      ,64   ,23   ,13.01          ,14.584              ,0.892 
4080 ,1    ,31   ,127      ,30   ,0    ,6.155          ,6.297               ,0.977 
4080 ,1    ,31   ,127      ,30   ,23   ,6.037          ,6.162               ,0.98  
4080 ,1    ,32   ,127      ,31   ,0    ,6.109          ,6.575               ,0.929 
4080 ,1    ,32   ,127      ,31   ,23   ,6.023          ,7.74                ,0.778 
4080 ,16   ,31   ,127      ,30   ,23   ,6.11           ,6.175               ,0.989 
4080 ,16   ,32   ,127      ,31   ,23   ,6.097          ,7.955               ,0.766 
4080 ,256  ,31   ,127      ,30   ,23   ,6.081          ,6.327               ,0.961 
4080 ,256  ,32   ,127      ,31   ,23   ,6.049          ,7.932               ,0.763 
4080 ,4    ,31   ,127      ,30   ,23   ,6.091          ,6.381               ,0.955 
4080 ,4    ,32   ,127      ,31   ,23   ,6.111          ,7.706               ,0.793 
4080 ,64   ,31   ,127      ,30   ,23   ,6.125          ,6.272               ,0.977 
4080 ,64   ,32   ,127      ,31   ,23   ,6.073          ,7.895               ,0.769 
4081 ,1    ,29   ,127      ,28   ,0    ,6.238          ,6.291               ,0.991 
4081 ,1    ,29   ,127      ,28   ,23   ,6.155          ,5.567               ,1.106 
4081 ,1    ,30   ,127      ,29   ,0    ,6.169          ,5.816               ,1.061 
4081 ,1    ,30   ,127      ,29   ,23   ,6.08           ,6.068               ,1.002 
4081 ,16   ,29   ,127      ,28   ,23   ,6.169          ,5.642               ,1.093 
4081 ,16   ,30   ,127      ,29   ,23   ,6.066          ,5.876               ,1.032 
4081 ,256  ,29   ,127      ,28   ,23   ,6.169          ,5.577               ,1.106 
4081 ,256  ,30   ,127      ,29   ,23   ,6.148          ,5.812               ,1.058 
4081 ,4    ,29   ,127      ,28   ,23   ,6.178          ,5.484               ,1.127 
4081 ,4    ,30   ,127      ,29   ,23   ,6.125          ,6.238               ,0.982 
4081 ,64   ,29   ,127      ,28   ,23   ,6.192          ,6.002               ,1.032 
4081 ,64   ,30   ,127      ,29   ,23   ,6.125          ,6.381               ,0.96  
4082 ,1    ,27   ,127      ,26   ,0    ,6.339          ,5.755               ,1.102 
4082 ,1    ,27   ,127      ,26   ,23   ,6.293          ,5.593               ,1.125 
4082 ,1    ,28   ,127      ,27   ,0    ,6.285          ,5.665               ,1.109 
4082 ,1    ,28   ,127      ,27   ,23   ,6.215          ,6.327               ,0.982 
4082 ,16   ,27   ,127      ,26   ,23   ,6.292          ,5.593               ,1.125 
4082 ,16   ,28   ,127      ,27   ,23   ,6.208          ,5.963               ,1.041 
4082 ,256  ,27   ,127      ,26   ,23   ,6.292          ,5.882               ,1.07  
4082 ,256  ,28   ,127      ,27   ,23   ,6.199          ,6.268               ,0.989 
4082 ,4    ,27   ,127      ,26   ,23   ,6.287          ,6.009               ,1.046 
4082 ,4    ,28   ,127      ,27   ,23   ,6.215          ,5.995               ,1.037 
4082 ,64   ,27   ,127      ,26   ,23   ,6.324          ,5.593               ,1.131 
4082 ,64   ,28   ,127      ,27   ,23   ,6.215          ,6.053               ,1.027 
4083 ,1    ,25   ,127      ,24   ,0    ,6.356          ,5.67                ,1.121 
4083 ,1    ,25   ,127      ,24   ,23   ,6.374          ,5.649               ,1.128 
4083 ,1    ,26   ,127      ,25   ,0    ,6.369          ,5.649               ,1.127 
4083 ,1    ,26   ,127      ,25   ,23   ,6.323          ,5.621               ,1.125 
4083 ,16   ,25   ,127      ,24   ,23   ,6.355          ,5.649               ,1.125 
4083 ,16   ,26   ,127      ,25   ,23   ,6.346          ,5.621               ,1.129 
4083 ,256  ,25   ,127      ,24   ,23   ,6.355          ,5.65                ,1.125 
4083 ,256  ,26   ,127      ,25   ,23   ,6.339          ,5.94                ,1.067 
4083 ,4    ,25   ,127      ,24   ,23   ,6.356          ,5.649               ,1.125 
4083 ,4    ,26   ,127      ,25   ,23   ,6.34           ,5.635               ,1.125 
4083 ,64   ,25   ,127      ,24   ,23   ,6.383          ,5.65                ,1.13  
4083 ,64   ,26   ,127      ,25   ,23   ,6.36           ,5.629               ,1.13  
4084 ,1    ,23   ,127      ,22   ,0    ,6.387          ,5.697               ,1.121 
4084 ,1    ,23   ,127      ,22   ,23   ,6.371          ,5.664               ,1.125 
4084 ,1    ,24   ,127      ,23   ,0    ,6.404          ,5.692               ,1.125 
4084 ,1    ,24   ,127      ,23   ,23   ,6.371          ,5.68                ,1.122 
4084 ,16   ,23   ,127      ,22   ,23   ,6.388          ,5.678               ,1.125 
4084 ,16   ,24   ,127      ,23   ,23   ,6.379          ,5.664               ,1.126 
4084 ,256  ,23   ,127      ,22   ,23   ,6.372          ,5.664               ,1.125 
4084 ,256  ,24   ,127      ,23   ,23   ,6.371          ,5.65                ,1.128 
4084 ,4    ,23   ,127      ,22   ,23   ,6.389          ,5.678               ,1.125 
4084 ,4    ,24   ,127      ,23   ,23   ,6.356          ,5.649               ,1.125 
4084 ,64   ,23   ,127      ,22   ,23   ,6.355          ,5.665               ,1.122 
4084 ,64   ,24   ,127      ,23   ,23   ,6.356          ,5.651               ,1.125 
4085 ,1    ,21   ,127      ,20   ,0    ,6.371          ,5.688               ,1.12  
4085 ,1    ,21   ,127      ,20   ,23   ,6.396          ,5.678               ,1.126 
4085 ,1    ,22   ,127      ,21   ,0    ,6.365          ,5.65                ,1.127 
4085 ,1    ,22   ,127      ,21   ,23   ,6.419          ,5.692               ,1.128 
4085 ,16   ,21   ,127      ,20   ,23   ,6.397          ,5.663               ,1.13  
4085 ,16   ,22   ,127      ,21   ,23   ,6.422          ,5.707               ,1.125 
4085 ,256  ,21   ,127      ,20   ,23   ,6.388          ,5.678               ,1.125 
4085 ,256  ,22   ,127      ,21   ,23   ,6.356          ,5.649               ,1.125 
4085 ,4    ,21   ,127      ,20   ,23   ,6.395          ,5.657               ,1.13  
4085 ,4    ,22   ,127      ,21   ,23   ,6.449          ,5.729               ,1.126 
4085 ,64   ,21   ,127      ,20   ,23   ,6.405          ,5.706               ,1.123 
4085 ,64   ,22   ,127      ,21   ,23   ,6.372          ,5.673               ,1.123 
4086 ,1    ,19   ,127      ,18   ,0    ,6.404          ,5.693               ,1.125 
4086 ,1    ,19   ,127      ,18   ,23   ,6.397          ,5.686               ,1.125 
4086 ,1    ,20   ,127      ,19   ,0    ,6.436          ,5.721               ,1.125 
4086 ,1    ,20   ,127      ,19   ,23   ,6.404          ,5.692               ,1.125 
4086 ,16   ,19   ,127      ,18   ,23   ,6.388          ,5.678               ,1.125 
4086 ,16   ,20   ,127      ,19   ,23   ,6.387          ,5.678               ,1.125 
4086 ,256  ,19   ,127      ,18   ,23   ,6.389          ,5.664               ,1.128 
4086 ,256  ,20   ,127      ,19   ,23   ,6.388          ,5.678               ,1.125 
4086 ,4    ,19   ,127      ,18   ,23   ,6.398          ,5.687               ,1.125 
4086 ,4    ,20   ,127      ,19   ,23   ,6.382          ,5.697               ,1.12  
4086 ,64   ,19   ,127      ,18   ,23   ,6.425          ,5.693               ,1.129 
4086 ,64   ,20   ,127      ,19   ,23   ,6.388          ,5.678               ,1.125 
4087 ,1    ,17   ,127      ,16   ,0    ,6.437          ,5.744               ,1.121 
4087 ,1    ,17   ,127      ,16   ,23   ,6.42           ,5.707               ,1.125 
4087 ,1    ,18   ,127      ,17   ,0    ,6.436          ,5.733               ,1.123 
4087 ,1    ,18   ,127      ,17   ,23   ,6.404          ,5.692               ,1.125 
4087 ,16   ,17   ,127      ,16   ,23   ,6.437          ,5.721               ,1.125 
4087 ,16   ,18   ,127      ,17   ,23   ,6.42           ,5.707               ,1.125 
4087 ,256  ,17   ,127      ,16   ,23   ,6.425          ,5.747               ,1.118 
4087 ,256  ,18   ,127      ,17   ,23   ,6.412          ,5.703               ,1.124 
4087 ,4    ,17   ,127      ,16   ,23   ,6.435          ,5.707               ,1.128 
4087 ,4    ,18   ,127      ,17   ,23   ,6.44           ,5.707               ,1.128 
4087 ,64   ,17   ,127      ,16   ,23   ,6.387          ,5.678               ,1.125 
4087 ,64   ,18   ,127      ,17   ,23   ,6.388          ,5.687               ,1.123 
4088 ,1    ,15   ,127      ,14   ,0    ,6.388          ,5.686               ,1.123 
4088 ,1    ,15   ,127      ,14   ,23   ,6.429          ,5.726               ,1.123 
4088 ,1    ,16   ,127      ,15   ,0    ,6.452          ,5.736               ,1.125 
4088 ,1    ,16   ,127      ,15   ,23   ,6.42           ,5.707               ,1.125 
4088 ,16   ,15   ,127      ,14   ,23   ,6.469          ,5.75                ,1.125 
4088 ,16   ,16   ,127      ,15   ,23   ,6.388          ,5.704               ,1.12  
4088 ,256  ,15   ,127      ,14   ,23   ,6.436          ,5.721               ,1.125 
4088 ,256  ,16   ,127      ,15   ,23   ,6.453          ,5.736               ,1.125 
4088 ,4    ,15   ,127      ,14   ,23   ,6.469          ,5.751               ,1.125 
4088 ,4    ,16   ,127      ,15   ,23   ,6.387          ,5.678               ,1.125 
4088 ,64   ,15   ,127      ,14   ,23   ,6.469          ,5.75                ,1.125 
4088 ,64   ,16   ,127      ,15   ,23   ,6.42           ,5.719               ,1.123 
4089 ,1    ,13   ,127      ,12   ,0    ,6.42           ,5.707               ,1.125 
4089 ,1    ,13   ,127      ,12   ,23   ,6.421          ,5.707               ,1.125 
4089 ,1    ,14   ,127      ,13   ,0    ,6.404          ,5.692               ,1.125 
4089 ,1    ,14   ,127      ,13   ,23   ,6.404          ,5.692               ,1.125 
4089 ,16   ,13   ,127      ,12   ,23   ,6.395          ,5.672               ,1.128 
4089 ,16   ,14   ,127      ,13   ,23   ,6.436          ,5.734               ,1.122 
4089 ,256  ,13   ,127      ,12   ,23   ,6.388          ,5.686               ,1.123 
4089 ,256  ,14   ,127      ,13   ,23   ,6.437          ,5.721               ,1.125 
4089 ,4    ,13   ,127      ,12   ,23   ,6.454          ,5.754               ,1.122 
4089 ,4    ,14   ,127      ,13   ,23   ,6.452          ,5.729               ,1.126 
4089 ,64   ,13   ,127      ,12   ,23   ,6.403          ,5.692               ,1.125 
4089 ,64   ,14   ,127      ,13   ,23   ,6.453          ,5.745               ,1.123 
4090 ,1    ,11   ,127      ,10   ,0    ,6.371          ,5.664               ,1.125 
4090 ,1    ,11   ,127      ,10   ,23   ,6.372          ,5.664               ,1.125 
4090 ,1    ,12   ,127      ,11   ,0    ,6.435          ,5.707               ,1.128 
4090 ,1    ,12   ,127      ,11   ,23   ,6.378          ,5.649               ,1.129 
4090 ,16   ,11   ,127      ,10   ,23   ,6.355          ,5.657               ,1.123 
4090 ,16   ,12   ,127      ,11   ,23   ,6.411          ,5.692               ,1.126 
4090 ,256  ,11   ,127      ,10   ,23   ,6.356          ,5.649               ,1.125 
4090 ,256  ,12   ,127      ,11   ,23   ,6.404          ,5.7                 ,1.123 
4090 ,4    ,11   ,127      ,10   ,23   ,6.404          ,5.708               ,1.122 
4090 ,4    ,12   ,127      ,11   ,23   ,6.436          ,5.721               ,1.125 
4090 ,64   ,11   ,127      ,10   ,23   ,6.356          ,5.649               ,1.125 
4090 ,64   ,12   ,127      ,11   ,23   ,6.421          ,5.707               ,1.125 
4091 ,1    ,10   ,127      ,9    ,0    ,6.356          ,5.649               ,1.125 
4091 ,1    ,10   ,127      ,9    ,23   ,6.354          ,5.649               ,1.125 
4091 ,1    ,9    ,127      ,8    ,0    ,6.323          ,5.621               ,1.125 
4091 ,1    ,9    ,127      ,8    ,23   ,6.366          ,5.655               ,1.126 
4091 ,16   ,10   ,127      ,9    ,23   ,6.339          ,5.635               ,1.125 
4091 ,16   ,9    ,127      ,8    ,23   ,6.324          ,5.621               ,1.125 
4091 ,256  ,10   ,127      ,9    ,23   ,6.339          ,5.643               ,1.123 
4091 ,256  ,9    ,127      ,8    ,23   ,6.339          ,5.635               ,1.125 
4091 ,4    ,10   ,127      ,9    ,23   ,6.332          ,5.621               ,1.127 
4091 ,4    ,9    ,127      ,8    ,23   ,6.323          ,5.621               ,1.125 
4091 ,64   ,10   ,127      ,9    ,23   ,6.355          ,5.649               ,1.125 
4091 ,64   ,9    ,127      ,8    ,23   ,6.324          ,5.621               ,1.125 
4092 ,1    ,7    ,127      ,6    ,0    ,6.331          ,5.621               ,1.126 
4092 ,1    ,7    ,127      ,6    ,23   ,6.332          ,5.621               ,1.126 
4092 ,1    ,8    ,127      ,7    ,0    ,6.323          ,5.621               ,1.125 
4092 ,1    ,8    ,127      ,7    ,23   ,6.334          ,5.621               ,1.127 
4092 ,16   ,7    ,127      ,6    ,23   ,6.323          ,5.621               ,1.125 
4092 ,16   ,8    ,127      ,7    ,23   ,6.323          ,5.629               ,1.123 
4092 ,256  ,7    ,127      ,6    ,23   ,6.35           ,5.621               ,1.13  
4092 ,256  ,8    ,127      ,7    ,23   ,6.324          ,5.63                ,1.123 
4092 ,4    ,7    ,127      ,6    ,23   ,6.324          ,5.621               ,1.125 
4092 ,4    ,8    ,127      ,7    ,23   ,6.323          ,5.621               ,1.125 
4092 ,64   ,7    ,127      ,6    ,23   ,6.307          ,5.607               ,1.125 
4092 ,64   ,8    ,127      ,7    ,23   ,6.324          ,5.621               ,1.125 
4093 ,1    ,5    ,127      ,4    ,0    ,6.298          ,5.593               ,1.126 
4093 ,1    ,5    ,127      ,4    ,23   ,6.278          ,5.606               ,1.12  
4093 ,1    ,6    ,127      ,5    ,0    ,6.339          ,5.662               ,1.12  
4093 ,1    ,6    ,127      ,5    ,23   ,6.359          ,5.659               ,1.124 
4093 ,16   ,5    ,127      ,4    ,23   ,6.215          ,5.535               ,1.123 
4093 ,16   ,6    ,127      ,5    ,23   ,6.293          ,5.593               ,1.125 
4093 ,256  ,5    ,127      ,4    ,23   ,6.245          ,5.552               ,1.125 
4093 ,256  ,6    ,127      ,5    ,23   ,6.325          ,5.647               ,1.12  
4093 ,4    ,5    ,127      ,4    ,23   ,6.277          ,5.606               ,1.12  
4093 ,4    ,6    ,127      ,5    ,23   ,6.261          ,5.566               ,1.125 
4093 ,64   ,5    ,127      ,4    ,23   ,6.259          ,5.589               ,1.12  
4093 ,64   ,6    ,127      ,5    ,23   ,6.366          ,5.635               ,1.13  
4094 ,1    ,3    ,127      ,2    ,0    ,6.185          ,5.499               ,1.125 
4094 ,1    ,3    ,127      ,2    ,23   ,6.199          ,5.483               ,1.131 
4094 ,1    ,4    ,127      ,3    ,0    ,6.278          ,5.606               ,1.12  
4094 ,1    ,4    ,127      ,3    ,23   ,6.263          ,5.601               ,1.118 
4094 ,16   ,3    ,127      ,2    ,23   ,6.133          ,5.471               ,1.121 
4094 ,16   ,4    ,127      ,3    ,23   ,6.2            ,5.56                ,1.115 
4094 ,256  ,3    ,127      ,2    ,23   ,6.195          ,5.526               ,1.121 
4094 ,256  ,4    ,127      ,3    ,23   ,6.244          ,5.562               ,1.123 
4094 ,4    ,3    ,127      ,2    ,23   ,6.214          ,5.479               ,1.134 
4094 ,4    ,4    ,127      ,3    ,23   ,6.236          ,5.525               ,1.129 
4094 ,64   ,3    ,127      ,2    ,23   ,6.284          ,5.555               ,1.131 
4094 ,64   ,4    ,127      ,3    ,23   ,6.213          ,5.498               ,1.13  
4095 ,1    ,1    ,127      ,0    ,0    ,4.929          ,4.205               ,1.172 
4095 ,1    ,1    ,127      ,0    ,23   ,6.295          ,5.356               ,1.175 
4095 ,1    ,2    ,127      ,1    ,0    ,6.179          ,5.473               ,1.129 
4095 ,1    ,2    ,127      ,1    ,23   ,6.04           ,5.368               ,1.125 
4095 ,16   ,1    ,127      ,0    ,23   ,6.421          ,5.472               ,1.173 
4095 ,16   ,2    ,127      ,1    ,23   ,6.148          ,5.433               ,1.132 
4095 ,256  ,1    ,127      ,0    ,23   ,6.21           ,5.237               ,1.186 
4095 ,256  ,2    ,127      ,1    ,23   ,6.178          ,5.394               ,1.145 
4095 ,4    ,1    ,127      ,0    ,23   ,6.433          ,5.447               ,1.181 
4095 ,4    ,2    ,127      ,1    ,23   ,6.104          ,5.403               ,1.13  
4095 ,64   ,1    ,127      ,0    ,23   ,6.385          ,5.443               ,1.173 
4095 ,64   ,2    ,127      ,1    ,23   ,6.044          ,5.356               ,1.129 
45   ,1    ,256  ,127      ,64   ,0    ,5.423          ,6.636               ,0.817 
45   ,1    ,256  ,127      ,64   ,23   ,13.085         ,14.98               ,0.873 
45   ,16   ,256  ,127      ,64   ,23   ,13.131         ,14.652              ,0.896 
45   ,256  ,256  ,127      ,64   ,23   ,14.91          ,16.404              ,0.909 
45   ,4    ,256  ,127      ,64   ,23   ,13.095         ,14.858              ,0.881 
45   ,64   ,256  ,127      ,64   ,23   ,13.063         ,14.828              ,0.881 
5    ,1    ,2048 ,127      ,512  ,0    ,19.362         ,20.923              ,0.925 
5    ,1    ,2048 ,127      ,512  ,23   ,84.057         ,83.9                ,1.002 
5    ,1    ,256  ,127      ,64   ,0    ,5.28           ,6.531               ,0.808 
5    ,1    ,256  ,127      ,64   ,23   ,13.172         ,14.676              ,0.898 
5    ,16   ,2048 ,127      ,512  ,23   ,65.801         ,86.021              ,0.765 
5    ,16   ,256  ,127      ,64   ,23   ,13.018         ,14.652              ,0.888 
5    ,256  ,2048 ,127      ,512  ,23   ,65.78          ,85.645              ,0.768 
5    ,256  ,256  ,127      ,64   ,23   ,13.007         ,14.684              ,0.886 
5    ,4    ,2048 ,127      ,512  ,23   ,69.663         ,83.343              ,0.836 
5    ,4    ,256  ,127      ,64   ,23   ,13.015         ,14.68               ,0.887 
5    ,64   ,2048 ,127      ,512  ,23   ,81.575         ,103.302             ,0.79  
5    ,64   ,256  ,127      ,64   ,23   ,13.035         ,14.543              ,0.896 
6    ,1    ,2048 ,127      ,1024 ,0    ,32.464         ,33.905              ,0.957 
6    ,1    ,2048 ,127      ,1024 ,23   ,62.443         ,62.809              ,0.994 
6    ,1    ,256  ,127      ,64   ,0    ,5.3            ,6.445               ,0.822 
6    ,1    ,256  ,127      ,64   ,23   ,13.008         ,14.468              ,0.899 
6    ,16   ,2048 ,127      ,1024 ,23   ,72.848         ,76.0                ,0.959 
6    ,16   ,256  ,127      ,64   ,23   ,12.983         ,14.459              ,0.898 
6    ,256  ,2048 ,127      ,1024 ,23   ,72.617         ,75.992              ,0.956 
6    ,256  ,256  ,127      ,64   ,23   ,13.031         ,14.626              ,0.891 
6    ,4    ,2048 ,127      ,1024 ,23   ,64.053         ,65.069              ,0.984 
6    ,4    ,256  ,127      ,64   ,23   ,13.051         ,14.561              ,0.896 
6    ,64   ,2048 ,127      ,1024 ,23   ,72.693         ,76.084              ,0.955 
6    ,64   ,256  ,127      ,64   ,23   ,13.037         ,14.607              ,0.892 
60   ,1    ,256  ,127      ,64   ,0    ,5.395          ,6.594               ,0.818 
60   ,1    ,256  ,127      ,64   ,23   ,13.131         ,14.686              ,0.894 
60   ,16   ,256  ,127      ,64   ,23   ,13.114         ,14.83               ,0.884 
60   ,256  ,256  ,127      ,64   ,23   ,13.192         ,14.717              ,0.896 
60   ,4    ,256  ,127      ,64   ,23   ,13.162         ,15.038              ,0.875 
60   ,64   ,256  ,127      ,64   ,23   ,13.117         ,14.73               ,0.89  
7    ,1    ,2048 ,127      ,2048 ,0    ,80.932         ,81.343              ,0.995 
7    ,1    ,2048 ,127      ,2048 ,23   ,80.463         ,74.44               ,1.081 
7    ,1    ,256  ,127      ,64   ,0    ,5.323          ,6.552               ,0.812 
7    ,1    ,256  ,127      ,64   ,23   ,13.13          ,14.673              ,0.895 
7    ,16   ,2048 ,127      ,2048 ,23   ,94.479         ,99.784              ,0.947 
7    ,16   ,256  ,127      ,64   ,23   ,13.142         ,14.68               ,0.895 
7    ,256  ,2048 ,127      ,2048 ,23   ,107.288        ,121.184             ,0.885 
7    ,256  ,256  ,127      ,64   ,23   ,13.012         ,14.641              ,0.889 
7    ,4    ,2048 ,127      ,2048 ,23   ,62.841         ,62.668              ,1.003 
7    ,4    ,256  ,127      ,64   ,23   ,13.038         ,14.618              ,0.892 
7    ,64   ,2048 ,127      ,2048 ,23   ,105.909        ,121.875             ,0.869 
7    ,64   ,256  ,127      ,64   ,23   ,13.02          ,14.491              ,0.899 
75   ,1    ,256  ,127      ,64   ,0    ,5.24           ,6.448               ,0.813 
75   ,1    ,256  ,127      ,64   ,23   ,13.071         ,14.458              ,0.904 
75   ,16   ,256  ,127      ,64   ,23   ,13.04          ,14.67               ,0.889 
75   ,256  ,256  ,127      ,64   ,23   ,14.747         ,16.432              ,0.897 
75   ,4    ,256  ,127      ,64   ,23   ,13.025         ,14.718              ,0.885 
75   ,64   ,256  ,127      ,64   ,23   ,13.032         ,14.728              ,0.885 
8    ,1    ,2048 ,127      ,4096 ,0    ,80.369         ,81.326              ,0.988 
8    ,1    ,2048 ,127      ,4096 ,23   ,80.042         ,73.995              ,1.082 
8    ,16   ,2048 ,127      ,4096 ,23   ,66.75          ,67.302              ,0.992 
8    ,256  ,2048 ,127      ,4096 ,23   ,107.316        ,121.695             ,0.882 
8    ,4    ,2048 ,127      ,4096 ,23   ,59.898         ,60.757              ,0.986 
8    ,64   ,2048 ,127      ,4096 ,23   ,105.027        ,121.241             ,0.866 
90   ,1    ,256  ,127      ,64   ,0    ,5.205          ,6.318               ,0.824 
90   ,1    ,256  ,127      ,64   ,23   ,13.045         ,14.626              ,0.892 
90   ,16   ,256  ,127      ,64   ,23   ,13.004         ,14.613              ,0.89  
90   ,256  ,256  ,127      ,64   ,23   ,14.72          ,16.383              ,0.899 
90   ,4    ,256  ,127      ,64   ,23   ,13.025         ,14.953              ,0.871 
90   ,64   ,256  ,127      ,64   ,23   ,12.978         ,14.562              ,0.891 
Geometric mean (New Time / Old Time): 0.9316776326414105
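
For reference, a geometric mean like the figure above can be computed
as the exponential of the mean of the logs, which avoids overflow on
long result lists.  A minimal sketch (hypothetical sample ratios, not
the full data set or the actual postprocessing script):

    #include <math.h>
    #include <stdio.h>

    int
    main (void)
    {
      /* Hypothetical sample of New Time / Old Time ratios.  */
      const double ratios[] = { 0.871, 0.904, 1.008, 0.928 };
      const size_t n = sizeof (ratios) / sizeof (ratios[0]);

      double log_sum = 0.0;
      for (size_t i = 0; i < n; ++i)
        log_sum += log (ratios[i]);

      /* Values < 1.0 indicate a net improvement.  */
      printf ("geometric mean: %f\n", exp (log_sum / n));
      return 0;
    }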

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH v1 7/7] Bench: Improve benchtests for memchr, strchr, strnlen, strrchr
  2022-10-18  2:49 ` [PATCH v1 7/7] Bench: Improve benchtests for memchr, strchr, strnlen, strrchr Noah Goldstein
@ 2022-10-18 21:00   ` H.J. Lu
  2022-10-18 21:05     ` Noah Goldstein
  0 siblings, 1 reply; 41+ messages in thread
From: H.J. Lu @ 2022-10-18 21:00 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: libc-alpha, carlos

On Mon, Oct 17, 2022 at 7:49 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> 1. Add more complete coverage in the medium-size range.
> 2. In strnlen remove the `1 << i`, which was undefined behavior
>    (`i` could exceed the shift width of 32/64); see the sketch
>    below.
> 3. Add a timer for total benchmark runtime (useful for deciding
>    about the tradeoff between coverage and runtime).
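
For illustration, a minimal, hypothetical sketch of the shift hazard
item 2 refers to (not the actual benchtests code; the safe form shown
is just one possible replacement):

    #include <stdint.h>
    #include <stdio.h>

    int
    main (void)
    {
      for (unsigned i = 0; i < 40; ++i)
        {
          /* The removed form: `1` is a plain int, so `1 << i` is
             undefined behavior once i reaches 31 (shifting into the
             sign bit) and for any i >= 32 (shift count >= width).  */
          /* size_t bad = 1 << i;  */

          /* A safe form: widen the value before shifting and guard
             the shift count.  */
          uint64_t ok = i < 64 ? (uint64_t) 1 << i : 0;
          printf ("%u -> %llu\n", i, (unsigned long long) ok);
        }
      return 0;
    }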

So this is only used for total runtime and won't be used for performance
comparison.  Will "time ./bench" be sufficient?
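
For context, the total-runtime timer added in the patch below reduces
to the following pattern, shown here as a sketch in terms of the
benchtests helpers (timing_t, TIMING_NOW and TIMING_DIFF from
bench-timing.h, json_attr_double from json-lib.h); the function name
and placeholder loop are hypothetical:

    #include "bench-timing.h"
    #include "json-lib.h"

    static void
    run_and_report (json_ctx_t *json_ctx)
    {
      timing_t bench_start, bench_stop, bench_total_time;

      TIMING_NOW (bench_start);
      /* ... all do_test invocations for this benchmark ...  */
      TIMING_NOW (bench_stop);

      TIMING_DIFF (bench_total_time, bench_start, bench_stop);
      /* Emitted once per benchmark run, next to the per-test
         results, so the total cost of added coverage is visible.  */
      json_attr_double (json_ctx, "benchtime", bench_total_time);
    }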

> ---
>  benchtests/bench-memchr.c    | 83 +++++++++++++++++++++++++-----------
>  benchtests/bench-rawmemchr.c | 36 ++++++++++++++--
>  benchtests/bench-strchr.c    | 42 +++++++++++++-----
>  benchtests/bench-strnlen.c   | 19 ++++++---
>  benchtests/bench-strrchr.c   | 33 +++++++++++++-
>  5 files changed, 166 insertions(+), 47 deletions(-)
>
> diff --git a/benchtests/bench-memchr.c b/benchtests/bench-memchr.c
> index 0facda2fa0..c4d758ae61 100644
> --- a/benchtests/bench-memchr.c
> +++ b/benchtests/bench-memchr.c
> @@ -126,9 +126,10 @@ do_test (json_ctx_t *json_ctx, size_t align, size_t pos, size_t len,
>  int
>  test_main (void)
>  {
> -  size_t i;
> +  size_t i, j, al, al_max;
>    int repeats;
>    json_ctx_t json_ctx;
> +  timing_t bench_start, bench_stop, bench_total_time;
>    test_init ();
>
>    json_init (&json_ctx, 0, stdout);
> @@ -147,35 +148,47 @@ test_main (void)
>
>    json_array_begin (&json_ctx, "results");
>
> +  TIMING_NOW (bench_start);
> +  al_max = 0;
> +#ifdef USE_AS_MEMRCHR
> +  al_max = getpagesize () / 2;
> +#endif
> +
>    for (repeats = 0; repeats < 2; ++repeats)
>      {
> -      for (i = 1; i < 8; ++i)
> +      for (al = 0; al <= al_max; al += getpagesize () / 2)
>         {
> -         do_test (&json_ctx, 0, 16 << i, 2048, 23, repeats);
> -         do_test (&json_ctx, i, 64, 256, 23, repeats);
> -         do_test (&json_ctx, 0, 16 << i, 2048, 0, repeats);
> -         do_test (&json_ctx, i, 64, 256, 0, repeats);
> -
> -         do_test (&json_ctx, getpagesize () - 15, 64, 256, 0, repeats);
> +         for (i = 1; i < 8; ++i)
> +           {
> +             do_test (&json_ctx, al, 16 << i, 2048, 23, repeats);
> +             do_test (&json_ctx, al + i, 64, 256, 23, repeats);
> +             do_test (&json_ctx, al, 16 << i, 2048, 0, repeats);
> +             do_test (&json_ctx, al + i, 64, 256, 0, repeats);
> +
> +             do_test (&json_ctx, al + getpagesize () - 15, 64, 256, 0,
> +                      repeats);
>  #ifdef USE_AS_MEMRCHR
> -         /* Also test the position close to the beginning for memrchr.  */
> -         do_test (&json_ctx, 0, i, 256, 23, repeats);
> -         do_test (&json_ctx, 0, i, 256, 0, repeats);
> -         do_test (&json_ctx, i, i, 256, 23, repeats);
> -         do_test (&json_ctx, i, i, 256, 0, repeats);
> +             /* Also test the position close to the beginning for memrchr.  */
> +             do_test (&json_ctx, al, i, 256, 23, repeats);
> +             do_test (&json_ctx, al, i, 256, 0, repeats);
> +             do_test (&json_ctx, al + i, i, 256, 23, repeats);
> +             do_test (&json_ctx, al + i, i, 256, 0, repeats);
>  #endif
> +           }
> +         for (i = 1; i < 8; ++i)
> +           {
> +             do_test (&json_ctx, al + i, i << 5, 192, 23, repeats);
> +             do_test (&json_ctx, al + i, i << 5, 192, 0, repeats);
> +             do_test (&json_ctx, al + i, i << 5, 256, 23, repeats);
> +             do_test (&json_ctx, al + i, i << 5, 256, 0, repeats);
> +             do_test (&json_ctx, al + i, i << 5, 512, 23, repeats);
> +             do_test (&json_ctx, al + i, i << 5, 512, 0, repeats);
> +
> +             do_test (&json_ctx, al + getpagesize () - 15, i << 5, 256, 23,
> +                      repeats);
> +           }
>         }
> -      for (i = 1; i < 8; ++i)
> -       {
> -         do_test (&json_ctx, i, i << 5, 192, 23, repeats);
> -         do_test (&json_ctx, i, i << 5, 192, 0, repeats);
> -         do_test (&json_ctx, i, i << 5, 256, 23, repeats);
> -         do_test (&json_ctx, i, i << 5, 256, 0, repeats);
> -         do_test (&json_ctx, i, i << 5, 512, 23, repeats);
> -         do_test (&json_ctx, i, i << 5, 512, 0, repeats);
> -
> -         do_test (&json_ctx, getpagesize () - 15, i << 5, 256, 23, repeats);
> -       }
> +
>        for (i = 1; i < 32; ++i)
>         {
>           do_test (&json_ctx, 0, i, i + 1, 23, repeats);
> @@ -207,11 +220,33 @@ test_main (void)
>           do_test (&json_ctx, 0, 2, i + 1, 0, repeats);
>  #endif
>         }
> +      for (al = 0; al <= al_max; al += getpagesize () / 2)
> +       {
> +         for (i = (16 / sizeof (CHAR)); i <= (8192 / sizeof (CHAR)); i += i)
> +           {
> +             for (j = 0; j <= (384 / sizeof (CHAR));
> +                  j += (32 / sizeof (CHAR)))
> +               {
> +                 do_test (&json_ctx, al, i + j, i, 23, repeats);
> +                 do_test (&json_ctx, al, i, i + j, 23, repeats);
> +                 if (j < i)
> +                   {
> +                     do_test (&json_ctx, al, i - j, i, 23, repeats);
> +                     do_test (&json_ctx, al, i, i - j, 23, repeats);
> +                   }
> +               }
> +           }
> +       }
> +
>  #ifndef USE_AS_MEMRCHR
>        break;
>  #endif
>      }
>
> +  TIMING_NOW (bench_stop);
> +  TIMING_DIFF (bench_total_time, bench_start, bench_stop);
> +  json_attr_double (&json_ctx, "benchtime", bench_total_time);
> +
>    json_array_end (&json_ctx);
>    json_attr_object_end (&json_ctx);
>    json_attr_object_end (&json_ctx);
> diff --git a/benchtests/bench-rawmemchr.c b/benchtests/bench-rawmemchr.c
> index b1803afc14..667ecd48f9 100644
> --- a/benchtests/bench-rawmemchr.c
> +++ b/benchtests/bench-rawmemchr.c
> @@ -70,7 +70,7 @@ do_test (json_ctx_t *json_ctx, size_t align, size_t pos, size_t len, int seek_ch
>    size_t i;
>    char *result;
>
> -  align &= 7;
> +  align &= getpagesize () - 1;
>    if (align + len >= page_size)
>      return;
>
> @@ -106,7 +106,7 @@ test_main (void)
>  {
>    json_ctx_t json_ctx;
>    size_t i;
> -
> +  timing_t bench_start, bench_stop, bench_total_time;
>    test_init ();
>
>    json_init (&json_ctx, 0, stdout);
> @@ -120,11 +120,12 @@ test_main (void)
>
>    json_array_begin (&json_ctx, "ifuncs");
>    FOR_EACH_IMPL (impl, 0)
> -      json_element_string (&json_ctx, impl->name);
> +    json_element_string (&json_ctx, impl->name);
>    json_array_end (&json_ctx);
>
>    json_array_begin (&json_ctx, "results");
>
> +  TIMING_NOW (bench_start);
>    for (i = 1; i < 7; ++i)
>      {
>        do_test (&json_ctx, 0, 16 << i, 2048, 23);
> @@ -137,6 +138,35 @@ test_main (void)
>        do_test (&json_ctx, 0, i, i + 1, 23);
>        do_test (&json_ctx, 0, i, i + 1, 0);
>      }
> +  for (; i < 256; i += 32)
> +    {
> +      do_test (&json_ctx, 0, i, i + 1, 23);
> +      do_test (&json_ctx, 0, i - 1, i, 23);
> +    }
> +  for (; i < 512; i += 64)
> +    {
> +      do_test (&json_ctx, 0, i, i + 1, 23);
> +      do_test (&json_ctx, 0, i - 1, i, 23);
> +    }
> +  for (; i < 1024; i += 128)
> +    {
> +      do_test (&json_ctx, 0, i, i + 1, 23);
> +      do_test (&json_ctx, 0, i - 1, i, 23);
> +    }
> +  for (; i < 2048; i += 256)
> +    {
> +      do_test (&json_ctx, 0, i, i + 1, 23);
> +      do_test (&json_ctx, 0, i - 1, i, 23);
> +    }
> +  for (; i < 4096; i += 512)
> +    {
> +      do_test (&json_ctx, 0, i, i + 1, 23);
> +      do_test (&json_ctx, 0, i - 1, i, 23);
> +    }
> +
> +  TIMING_NOW (bench_stop);
> +  TIMING_DIFF (bench_total_time, bench_start, bench_stop);
> +  json_attr_double (&json_ctx, "benchtime", bench_total_time);
>
>    json_array_end (&json_ctx);
>    json_attr_object_end (&json_ctx);
> diff --git a/benchtests/bench-strchr.c b/benchtests/bench-strchr.c
> index 54640bde7e..af325806ce 100644
> --- a/benchtests/bench-strchr.c
> +++ b/benchtests/bench-strchr.c
> @@ -287,8 +287,8 @@ int
>  test_main (void)
>  {
>    json_ctx_t json_ctx;
> -  size_t i;
> -
> +  size_t i, j;
> +  timing_t bench_start, bench_stop, bench_total_time;
>    test_init ();
>
>    json_init (&json_ctx, 0, stdout);
> @@ -307,6 +307,7 @@ test_main (void)
>
>    json_array_begin (&json_ctx, "results");
>
> +  TIMING_NOW (bench_start);
>    for (i = 1; i < 8; ++i)
>      {
>        do_test (&json_ctx, 0, 16 << i, 2048, SMALL_CHAR, MIDDLE_CHAR);
> @@ -367,15 +368,34 @@ test_main (void)
>        do_test (&json_ctx, 0, i, i + 1, 0, BIG_CHAR);
>      }
>
> -  DO_RAND_TEST(&json_ctx, 0, 15, 16, 0.0);
> -  DO_RAND_TEST(&json_ctx, 0, 15, 16, 0.1);
> -  DO_RAND_TEST(&json_ctx, 0, 15, 16, 0.25);
> -  DO_RAND_TEST(&json_ctx, 0, 15, 16, 0.33);
> -  DO_RAND_TEST(&json_ctx, 0, 15, 16, 0.5);
> -  DO_RAND_TEST(&json_ctx, 0, 15, 16, 0.66);
> -  DO_RAND_TEST(&json_ctx, 0, 15, 16, 0.75);
> -  DO_RAND_TEST(&json_ctx, 0, 15, 16, 0.9);
> -  DO_RAND_TEST(&json_ctx, 0, 15, 16, 1.0);
> +  for (i = 16 / sizeof (CHAR); i <= 8192 / sizeof (CHAR); i += i)
> +    {
> +      for (j = 32 / sizeof (CHAR); j <= 320 / sizeof (CHAR);
> +          j += 32 / sizeof (CHAR))
> +       {
> +         do_test (&json_ctx, 0, i, i + j, 0, MIDDLE_CHAR);
> +         do_test (&json_ctx, 0, i + j, i, 0, MIDDLE_CHAR);
> +         if (i > j)
> +           {
> +             do_test (&json_ctx, 0, i, i - j, 0, MIDDLE_CHAR);
> +             do_test (&json_ctx, 0, i - j, i, 0, MIDDLE_CHAR);
> +           }
> +       }
> +    }
> +
> +  DO_RAND_TEST (&json_ctx, 0, 15, 16, 0.0);
> +  DO_RAND_TEST (&json_ctx, 0, 15, 16, 0.1);
> +  DO_RAND_TEST (&json_ctx, 0, 15, 16, 0.25);
> +  DO_RAND_TEST (&json_ctx, 0, 15, 16, 0.33);
> +  DO_RAND_TEST (&json_ctx, 0, 15, 16, 0.5);
> +  DO_RAND_TEST (&json_ctx, 0, 15, 16, 0.66);
> +  DO_RAND_TEST (&json_ctx, 0, 15, 16, 0.75);
> +  DO_RAND_TEST (&json_ctx, 0, 15, 16, 0.9);
> +  DO_RAND_TEST (&json_ctx, 0, 15, 16, 1.0);
> +
> +  TIMING_NOW (bench_stop);
> +  TIMING_DIFF (bench_total_time, bench_start, bench_stop);
> +  json_attr_double (&json_ctx, "benchtime", bench_total_time);
>
>    json_array_end (&json_ctx);
>    json_attr_object_end (&json_ctx);
> diff --git a/benchtests/bench-strnlen.c b/benchtests/bench-strnlen.c
> index 13b46b3f57..c6281b6373 100644
> --- a/benchtests/bench-strnlen.c
> +++ b/benchtests/bench-strnlen.c
> @@ -117,7 +117,7 @@ test_main (void)
>  {
>    size_t i, j;
>    json_ctx_t json_ctx;
> -
> +  timing_t bench_start, bench_stop, bench_total_time;
>    test_init ();
>
>    json_init (&json_ctx, 0, stdout);
> @@ -136,6 +136,7 @@ test_main (void)
>
>    json_array_begin (&json_ctx, "results");
>
> +  TIMING_NOW (bench_start);
>    for (i = 0; i <= 1; ++i)
>      {
>        do_test (&json_ctx, i, 1, 128, MIDDLE_CHAR);
> @@ -195,23 +196,27 @@ test_main (void)
>      {
>        for (j = 0; j <= (704 / sizeof (CHAR)); j += (32 / sizeof (CHAR)))
>         {
> -         do_test (&json_ctx, 0, 1 << i, (i + j), BIG_CHAR);
>           do_test (&json_ctx, 0, i + j, i, BIG_CHAR);
> -
> -         do_test (&json_ctx, 64, 1 << i, (i + j), BIG_CHAR);
>           do_test (&json_ctx, 64, i + j, i, BIG_CHAR);
>
> +         do_test (&json_ctx, 0, i, i + j, BIG_CHAR);
> +         do_test (&json_ctx, 64, i, i + j, BIG_CHAR);
> +
>           if (j < i)
>             {
> -             do_test (&json_ctx, 0, 1 << i, i - j, BIG_CHAR);
>               do_test (&json_ctx, 0, i - j, i, BIG_CHAR);
> -
> -             do_test (&json_ctx, 64, 1 << i, i - j, BIG_CHAR);
>               do_test (&json_ctx, 64, i - j, i, BIG_CHAR);
> +
> +             do_test (&json_ctx, 0, i, i - j, BIG_CHAR);
> +             do_test (&json_ctx, 64, i, i - j, BIG_CHAR);
>             }
>         }
>      }
>
> +  TIMING_NOW (bench_stop);
> +  TIMING_DIFF (bench_total_time, bench_start, bench_stop);
> +  json_attr_double (&json_ctx, "benchtime", bench_total_time);
> +
>    json_array_end (&json_ctx);
>    json_attr_object_end (&json_ctx);
>    json_attr_object_end (&json_ctx);
> diff --git a/benchtests/bench-strrchr.c b/benchtests/bench-strrchr.c
> index 7cd2a15484..e6d8163047 100644
> --- a/benchtests/bench-strrchr.c
> +++ b/benchtests/bench-strrchr.c
> @@ -151,8 +151,9 @@ int
>  test_main (void)
>  {
>    json_ctx_t json_ctx;
> -  size_t i, j;
> +  size_t i, j, k;
>    int seek;
> +  timing_t bench_start, bench_stop, bench_total_time;
>
>    test_init ();
>    json_init (&json_ctx, 0, stdout);
> @@ -171,9 +172,10 @@ test_main (void)
>
>    json_array_begin (&json_ctx, "results");
>
> +  TIMING_NOW (bench_start);
>    for (seek = 0; seek <= 23; seek += 23)
>      {
> -      for (j = 1; j < 32; j += j)
> +      for (j = 1; j <= 256; j = (j * 4))
>         {
>           for (i = 1; i < 9; ++i)
>             {
> @@ -197,12 +199,39 @@ test_main (void)
>               do_test (&json_ctx, getpagesize () - i / 2 - 1, i, i + 1, seek,
>                        SMALL_CHAR, j);
>             }
> +
> +         for (i = (16 / sizeof (CHAR)); i <= (288 / sizeof (CHAR)); i += 32)
> +           {
> +             do_test (&json_ctx, 0, i - 16, i, seek, SMALL_CHAR, j);
> +             do_test (&json_ctx, 0, i, i + 16, seek, SMALL_CHAR, j);
> +           }
> +
> +         for (i = (16 / sizeof (CHAR)); i <= (2048 / sizeof (CHAR)); i += i)
> +           {
> +             for (k = 0; k <= (288 / sizeof (CHAR));
> +                  k += (48 / sizeof (CHAR)))
> +               {
> +                 do_test (&json_ctx, 0, k, i, seek, SMALL_CHAR, j);
> +                 do_test (&json_ctx, 0, i, i + k, seek, SMALL_CHAR, j);
> +
> +                 if (k < i)
> +                   {
> +                     do_test (&json_ctx, 0, i - k, i, seek, SMALL_CHAR, j);
> +                     do_test (&json_ctx, 0, k, i - k, seek, SMALL_CHAR, j);
> +                     do_test (&json_ctx, 0, i, i - k, seek, SMALL_CHAR, j);
> +                   }
> +               }
> +           }
> +
>           if (seek == 0)
>             {
>               break;
>             }
>         }
>      }
> +  TIMING_NOW (bench_stop);
> +  TIMING_DIFF (bench_total_time, bench_start, bench_stop);
> +  json_attr_double (&json_ctx, "benchtime", bench_total_time);
>
>    json_array_end (&json_ctx);
>    json_attr_object_end (&json_ctx);
> --
> 2.34.1
>
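
As an aside, the strnlen hunk above is the one that drops the `1 << i`
arguments the commit message flags as undefined behavior: in that loop
`i` is a character count that can grow far past the bit width of int.
A minimal standalone illustration (hypothetical example, not code from
the patch):

#include <stdio.h>

int
main (void)
{
  /* Mirror the benchtest loop shape: i doubles up to a size in
     characters (8192 in bench-strnlen).  */
  for (size_t i = 16; i <= 8192; i += i)
    {
      if (i < sizeof (int) * 8)
	/* Well defined: the shift count is below the width of int.  */
	printf ("1 << %zu == %d\n", i, 1 << i);
      else
	/* Here `1 << i` would be undefined behavior (C11 6.5.7p3: the
	   shift count must be less than the width of the promoted
	   left operand), which is exactly what the patch removes.  */
	printf ("1 << %zu: shift count too large, UB\n", i);
    }
  return 0;
}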


-- 
H.J.

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH v1 7/7] Bench: Improve benchtests for memchr, strchr, strnlen, strrchr
  2022-10-18 21:00   ` H.J. Lu
@ 2022-10-18 21:05     ` Noah Goldstein
  2022-10-18 21:53       ` H.J. Lu
  0 siblings, 1 reply; 41+ messages in thread
From: Noah Goldstein @ 2022-10-18 21:05 UTC (permalink / raw)
  To: H.J. Lu; +Cc: libc-alpha, carlos

On Tue, Oct 18, 2022 at 4:01 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Mon, Oct 17, 2022 at 7:49 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > 1. Add more complete coverage in the medium size range.
> > 2. In strnlen remove the `1 << i` which was UB (`i` could go beyond
> >    32/64)
> > 3. Add timer for total benchmark runtime (useful for deciding about
> >    tradeoff between coverage and runtime).
>
> So this is only used for total runtime and won't be used for performance
> comparison.  Will "time ./bench" be sufficient?

Yes, but when running something like:
make bench BENCHSET="string-benchset"
it's hard to get the time of an individual test. We also apply the
timeout on a per-test basis, so it seems reasonable to report total
runtime on a per-test basis as well.
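
For reference, the addition reduces to this pattern in each bench's
test_main (a minimal sketch; the names are taken from the patch quoted
in full up-thread and assume the usual benchtest headers that provide
timing_t, TIMING_NOW, TIMING_DIFF and the json helpers):

  timing_t bench_start, bench_stop, bench_total_time;

  TIMING_NOW (bench_start);

  /* ... every do_test () invocation for this bench ... */

  TIMING_NOW (bench_stop);
  TIMING_DIFF (bench_total_time, bench_start, bench_stop);

  /* Report the wall time of the whole bench as a top-level JSON
     attribute next to the per-test "results" array.  */
  json_attr_double (&json_ctx, "benchtime", bench_total_time);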


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH v1 7/7] Bench: Improve benchtests for memchr, strchr, strnlen, strrchr
  2022-10-18 21:05     ` Noah Goldstein
@ 2022-10-18 21:53       ` H.J. Lu
  2022-10-18 22:58         ` Noah Goldstein
  0 siblings, 1 reply; 41+ messages in thread
From: H.J. Lu @ 2022-10-18 21:53 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: libc-alpha, carlos

On Tue, Oct 18, 2022 at 2:05 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Tue, Oct 18, 2022 at 4:01 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Mon, Oct 17, 2022 at 7:49 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > >
> > > 1. Add more complete coverage in the medium size range.
> > > 2. In strnlen remove the `1 << i` which was UB (`i` could go beyond
> > >    32/64)
> > > 3. Add timer for total benchmark runtime (useful for deciding about
> > >    tradeoff between coverage and runtime).
> >
> > So this is only used for total runtime and won't be used for performance
> > comparison.  Will "time ./bench" be sufficient?
>
> Yes, but when running something like:
> make bench BENCHSET="string-benchset"
> it's hard to get the time of an individual test. We also apply the
> timeout on a per-test basis, so it seems reasonable to report total
> runtime on a per-test basis as well.

Can you make the total-time addition a separate patch?  One can tell
which bench takes too much time when running

make bench BENCHSET="string-benchset"

and then use "time ./bench" on that bench.


-- 
H.J.

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH v1 7/7] Bench: Improve benchtests for memchr, strchr, strnlen, strrchr
  2022-10-18 21:53       ` H.J. Lu
@ 2022-10-18 22:58         ` Noah Goldstein
  0 siblings, 0 replies; 41+ messages in thread
From: Noah Goldstein @ 2022-10-18 22:58 UTC (permalink / raw)
  To: H.J. Lu; +Cc: libc-alpha, carlos

On Tue, Oct 18, 2022 at 2:54 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Tue, Oct 18, 2022 at 2:05 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > On Tue, Oct 18, 2022 at 4:01 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > On Mon, Oct 17, 2022 at 7:49 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > > >
> > > > 1. Add more complete coverage in the medium size range.
> > > > 2. In strnlen remove the `1 << i` which was UB (`i` could go beyond
> > > >    32/64)
> > > > 3. Add timer for total benchmark runtime (useful for deciding about
> > > >    tradeoff between coverage and runtime).
> > >
> > > So this is only used for total runtime and won't be used for performance
> > > comparison.  Will "time ./bench" be sufficient?
> >
> > Yes, but when running something like:
> > make bench BENCHSET="string-benchset"
> > it's hard to get the time of an individual test. We also apply the
> > timeout on a per-test basis, so it seems reasonable to report total
> > runtime on a per-test basis as well.
>
> Can you make the total-time addition a separate patch?  One can tell
> which bench takes too much time when running
>
> make bench BENCHSET="string-benchset"
>
> and then use "time ./bench" on that bench.

It's not that it's not doable; this is just something I found
convenient when adding benchtests and figured others might find
it useful as well.

I'll remove it from this patchset for now.

Once this stuff is done I can resubmit it and we can weigh
the pros/cons.
> > > >
> > > >    json_array_end (&json_ctx);
> > > >    json_attr_object_end (&json_ctx);
> > > > --
> > > > 2.34.1
> > > >
> > >
> > >
> > > --
> > > H.J.
>
>
>
> --
> H.J.

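The shape of the change the quoted benchtest hunks make is to time the
whole of test_main and emit the total as one extra JSON attribute.  As a
standalone analog (a sketch only: it uses clock_gettime instead of the
benchtests' internal bench-timing.h macros, and the names are
illustrative):

#include <stdio.h>
#include <time.h>

static double
now_seconds (void)
{
  struct timespec ts;
  clock_gettime (CLOCK_MONOTONIC, &ts);
  return ts.tv_sec + ts.tv_nsec * 1e-9;
}

int
main (void)
{
  double bench_start = now_seconds ();
  /* ... all do_test / DO_RAND_TEST cases would run here ...  */
  double bench_total_time = now_seconds () - bench_start;
  /* Emitted alongside the "results" array, as "benchtime" is above.  */
  printf ("\"benchtime\": %f\n", bench_total_time);
  return 0;
}
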
^ permalink raw reply	[flat|nested] 41+ messages in thread

* [PATCH v2 1/7] x86: Optimize memchr-evex.S and implement with VMM headers
  2022-10-18  2:48 [PATCH v1 1/7] x86: Optimize memchr-evex.S and implement with VMM headers Noah Goldstein
                   ` (6 preceding siblings ...)
  2022-10-18  2:50 ` [PATCH v1 1/7] x86: Optimize memchr-evex.S and implement with VMM headers Noah Goldstein
@ 2022-10-18 23:19 ` Noah Goldstein
  2022-10-18 23:19   ` [PATCH v2 2/7] x86: Shrink / minorly optimize strchr-evex " Noah Goldstein
                     ` (5 more replies)
  2022-10-19  0:44 ` [PATCH v3 1/7] x86: Optimize memchr-evex.S and implement with VMM headers Noah Goldstein
  8 siblings, 6 replies; 41+ messages in thread
From: Noah Goldstein @ 2022-10-18 23:19 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

Optimizations are:

1. Use the fact that tzcnt(0) -> VEC_SIZE for memchr to save a branch
   in the short-string case (see the C sketch after the diffstat below).
2. Restructure code so that small strings are given the hot path.
	- This is a net-zero on the benchmark suite but in general makes
      sense as smaller sizes are far more common.
3. Use more code-size efficient instructions.
	- tzcnt ...     -> bsf ...
	- vpcmpb $0 ... -> vpcmpeq ...
4. Align labels less aggressively, especially if it doesn't save fetch
   blocks / causes the basic-block to span extra cache-lines.

The optimizations (especially for point 2) make the memchr and
rawmemchr code essentially incompatible so split rawmemchr-evex
to a new file.

Code Size Changes:
memchr-evex.S       : -107 bytes
rawmemchr-evex.S    :  -53 bytes

Net perf changes:

Reported as the geometric mean of all improvements / regressions from
N=10 runs of the benchtests.  Value is New Time / Old Time, so < 1.0 is
an improvement and > 1.0 is a regression.

memchr-evex.S       : 0.928
rawmemchr-evex.S    : 0.986 (Fewer targets cross cache lines)

Full results attached in email.

Full check passes on x86-64.
---
 sysdeps/x86_64/multiarch/memchr-evex.S        | 939 ++++++++++--------
 sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S |   9 +-
 sysdeps/x86_64/multiarch/rawmemchr-evex.S     | 313 +++++-
 3 files changed, 851 insertions(+), 410 deletions(-)
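
A C sketch of optimization (1)'s tzcnt trick (the function name,
VEC_BYTES constant and the -1/-2 return convention are illustrative,
not from the patch): tzcnt of a zero mask returns the operand width,
so a single unsigned compare covers both "no match and length
exhausted" and "match out of bounds":

#include <stdint.h>

#define VEC_BYTES 32	/* VEC_SIZE; one mask bit per byte compared.  */

/* -1: done with no match; -2: keep scanning; >= 0: match offset.  */
static long
classify_first_vec (uint32_t mask, uint64_t len)
{
  /* Model tzcnt: it returns 32 for a zero 32-bit input, unlike
     __builtin_ctz, which is undefined at 0.  */
  uint64_t i = mask ? (uint64_t) __builtin_ctz (mask) : VEC_BYTES;
  if (len <= i)		/* Single branch for both failure cases.  */
    return -1;
  return mask ? (long) i : -2;
}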

diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S
index 0dd4f1dcce..23a1c0018e 100644
--- a/sysdeps/x86_64/multiarch/memchr-evex.S
+++ b/sysdeps/x86_64/multiarch/memchr-evex.S
@@ -21,17 +21,27 @@
 
 #if ISA_SHOULD_BUILD (4)
 
+# ifndef VEC_SIZE
+#  include "x86-evex256-vecs.h"
+# endif
+
 # ifndef MEMCHR
 #  define MEMCHR	__memchr_evex
 # endif
 
 # ifdef USE_AS_WMEMCHR
+#  define PC_SHIFT_GPR	rcx
+#  define VPTESTN	vptestnmd
 #  define VPBROADCAST	vpbroadcastd
 #  define VPMINU	vpminud
 #  define VPCMP	vpcmpd
 #  define VPCMPEQ	vpcmpeqd
 #  define CHAR_SIZE	4
+
+#  define USE_WIDE_CHAR
 # else
+#  define PC_SHIFT_GPR	rdi
+#  define VPTESTN	vptestnmb
 #  define VPBROADCAST	vpbroadcastb
 #  define VPMINU	vpminub
 #  define VPCMP	vpcmpb
@@ -39,534 +49,661 @@
 #  define CHAR_SIZE	1
 # endif
 
-	/* In the 4x loop the RTM and non-RTM versions have data pointer
-	   off by VEC_SIZE * 4 with RTM version being VEC_SIZE * 4 greater.
-	   This is represented by BASE_OFFSET. As well because the RTM
-	   version uses vpcmp which stores a bit per element compared where
-	   the non-RTM version uses vpcmpeq which stores a bit per byte
-	   compared RET_SCALE of CHAR_SIZE is only relevant for the RTM
-	   version.  */
-# ifdef USE_IN_RTM
+# include "reg-macros.h"
+
+
+/* If not in RTM and VEC_SIZE != 64 (VEC_SIZE == 64 has no VEX
+   encoding), use VEX encoding in the loop so we can use vpcmpeqb +
+   vptern, which is more efficient than the EVEX alternative.  */
+# if defined USE_IN_RTM || VEC_SIZE == 64
+#  undef COND_VZEROUPPER
+#  undef VZEROUPPER_RETURN
+#  undef VZEROUPPER
+
+#  define COND_VZEROUPPER
+#  define VZEROUPPER_RETURN	ret
 #  define VZEROUPPER
-#  define BASE_OFFSET	(VEC_SIZE * 4)
-#  define RET_SCALE	CHAR_SIZE
+
+#  define USE_TERN_IN_LOOP	0
 # else
+#  define USE_TERN_IN_LOOP	1
+#  undef VZEROUPPER
 #  define VZEROUPPER	vzeroupper
-#  define BASE_OFFSET	0
-#  define RET_SCALE	1
 # endif
 
-	/* In the return from 4x loop memchr and rawmemchr versions have
-	   data pointers off by VEC_SIZE * 4 with memchr version being
-	   VEC_SIZE * 4 greater.  */
-# ifdef USE_AS_RAWMEMCHR
-#  define RET_OFFSET	(BASE_OFFSET - (VEC_SIZE * 4))
-#  define RAW_PTR_REG	rcx
-#  define ALGN_PTR_REG	rdi
+# if USE_TERN_IN_LOOP
+	/* Resulting bitmask for vpmovmskb has 4 bits set for each wchar
+	   so we don't want to multiply the resulting index.  */
+#  define TERN_CHAR_MULT	1
+
+#  ifdef USE_AS_WMEMCHR
+#   define TEST_END()	inc %VRCX
+#  else
+#   define TEST_END()	add %rdx, %rcx
+#  endif
 # else
-#  define RET_OFFSET	BASE_OFFSET
-#  define RAW_PTR_REG	rdi
-#  define ALGN_PTR_REG	rcx
+#  define TERN_CHAR_MULT	CHAR_SIZE
+#  define TEST_END()	KORTEST %k2, %k3
 # endif
 
-# define XMMZERO	xmm23
-# define YMMZERO	ymm23
-# define XMMMATCH	xmm16
-# define YMMMATCH	ymm16
-# define YMM1		ymm17
-# define YMM2		ymm18
-# define YMM3		ymm19
-# define YMM4		ymm20
-# define YMM5		ymm21
-# define YMM6		ymm22
+# if defined USE_AS_WMEMCHR || !USE_TERN_IN_LOOP
+#  ifndef USE_AS_WMEMCHR
+#   define GPR_X0_IS_RET	1
+#  else
+#   define GPR_X0_IS_RET	0
+#  endif
+#  define GPR_X0	rax
+# else
+#  define GPR_X0_IS_RET	0
+#  define GPR_X0	rdx
+# endif
+
+# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
 
-# ifndef SECTION
-#  define SECTION(p)	p##.evex
+# if CHAR_PER_VEC == 64
+#  define LAST_VEC_OFFSET	(VEC_SIZE * 3)
+# else
+#  define LAST_VEC_OFFSET	(VEC_SIZE * 2)
+# endif
+# if CHAR_PER_VEC >= 32
+#  define MASK_GPR(...)	VGPR(__VA_ARGS__)
+# elif CHAR_PER_VEC == 16
+#  define MASK_GPR(reg)	VGPR_SZ(reg, 16)
+# else
+#  define MASK_GPR(reg)	VGPR_SZ(reg, 8)
 # endif
 
-# define VEC_SIZE 32
-# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
-# define PAGE_SIZE 4096
+# define VMATCH	VMM(0)
+# define VMATCH_LO	VMM_lo(0)
 
-	.section SECTION(.text),"ax",@progbits
+# define PAGE_SIZE	4096
+
+
+	.section SECTION(.text), "ax", @progbits
 ENTRY_P2ALIGN (MEMCHR, 6)
-# ifndef USE_AS_RAWMEMCHR
 	/* Check for zero length.  */
 	test	%RDX_LP, %RDX_LP
-	jz	L(zero)
+	jz	L(zero_0)
 
-#  ifdef __ILP32__
+# ifdef __ILP32__
 	/* Clear the upper 32 bits.  */
 	movl	%edx, %edx
-#  endif
 # endif
-	/* Broadcast CHAR to YMMMATCH.  */
-	VPBROADCAST %esi, %YMMMATCH
+	VPBROADCAST %esi, %VMATCH
 	/* Check if we may cross page boundary with one vector load.  */
 	movl	%edi, %eax
 	andl	$(PAGE_SIZE - 1), %eax
 	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
-	ja	L(cross_page_boundary)
+	ja	L(page_cross)
+
+	VPCMPEQ	(%rdi), %VMATCH, %k0
+	KMOV	%k0, %VRAX
+# ifndef USE_AS_WMEMCHR
+	/* If rax is zero then tzcnt -> CHAR_PER_VEC.  NB: there is
+	   already a dependency between rax and rsi so no worries about
+	   a false dependency here.  */
+	tzcnt	%VRAX, %VRSI
+	/* If rdx <= rsi then either 1) rax was non-zero (there was a
+	   match) but it was out of bounds or 2) rax was zero and rdx
+	   was <= VEC_SIZE so we are done scanning.  */
+	cmpq	%rsi, %rdx
+	/* NB: Use branch to return zero/non-zero.  Common usage will
+	   branch on result of function (if return is null/non-null).
+	   This branch can be used to predict the ensuing one so there
+	   is no reason to extend the data-dependency with cmovcc.  */
+	jbe	L(zero_0)
+
+	/* If rax is zero then len must be > CHAR_PER_VEC (we did not
+	   take the branch above); otherwise, since we already tested
+	   len vs tzcnt(rax) (in rsi), we are good to return this
+	   match.  */
+	test	%VRAX, %VRAX
+	jz	L(more_1x_vec)
+	leaq	(%rdi, %rsi), %rax
+# else
 
-	/* Check the first VEC_SIZE bytes.  */
-	VPCMP	$0, (%rdi), %YMMMATCH, %k0
-	kmovd	%k0, %eax
-# ifndef USE_AS_RAWMEMCHR
-	/* If length < CHAR_PER_VEC handle special.  */
+	/* We can't use the `tzcnt` trick for wmemchr because CHAR_SIZE
+	   > 1, so if rax is zero, tzcnt(rax) != CHAR_PER_VEC.  */
 	cmpq	$CHAR_PER_VEC, %rdx
-	jbe	L(first_vec_x0)
-# endif
-	testl	%eax, %eax
-	jz	L(aligned_more)
-	tzcntl	%eax, %eax
-# ifdef USE_AS_WMEMCHR
-	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
+	ja	L(more_1x_vec)
+	tzcnt	%VRAX, %VRAX
+	cmpl	%eax, %edx
+	jbe	L(zero_0)
+L(first_vec_x0_ret):
 	leaq	(%rdi, %rax, CHAR_SIZE), %rax
-# else
-	addq	%rdi, %rax
 # endif
 	ret
 
-# ifndef USE_AS_RAWMEMCHR
-L(zero):
-	xorl	%eax, %eax
-	ret
-
-	.p2align 4
-L(first_vec_x0):
-	/* Check if first match was before length. NB: tzcnt has false data-
-	   dependency on destination. eax already had a data-dependency on esi
-	   so this should have no affect here.  */
-	tzcntl	%eax, %esi
-#  ifdef USE_AS_WMEMCHR
-	leaq	(%rdi, %rsi, CHAR_SIZE), %rdi
-#  else
-	addq	%rsi, %rdi
-#  endif
+	/* Only fits in first cache line for VEC_SIZE == 32.  */
+# if VEC_SIZE == 32
+	.p2align 4,, 2
+L(zero_0):
 	xorl	%eax, %eax
-	cmpl	%esi, %edx
-	cmovg	%rdi, %rax
 	ret
 # endif
 
-	.p2align 4
-L(cross_page_boundary):
-	/* Save pointer before aligning as its original value is
-	   necessary for computer return address if byte is found or
-	   adjusting length if it is not and this is memchr.  */
-	movq	%rdi, %rcx
-	/* Align data to VEC_SIZE. ALGN_PTR_REG is rcx for memchr and rdi
-	   for rawmemchr.  */
-	andq	$-VEC_SIZE, %ALGN_PTR_REG
-	VPCMP	$0, (%ALGN_PTR_REG), %YMMMATCH, %k0
-	kmovd	%k0, %r8d
+	.p2align 4,, 9
+L(more_1x_vec):
 # ifdef USE_AS_WMEMCHR
-	/* NB: Divide shift count by 4 since each bit in K0 represent 4
-	   bytes.  */
-	sarl	$2, %eax
-# endif
-# ifndef USE_AS_RAWMEMCHR
-	movl	$(PAGE_SIZE / CHAR_SIZE), %esi
-	subl	%eax, %esi
+	/* For wmemchr we still need to test if there was a match in the
+	   first VEC.  Use bsf to test here so we can reuse
+	   L(first_vec_x0_ret).  */
+	bsf	%VRAX, %VRAX
+	jnz	L(first_vec_x0_ret)
 # endif
+
+L(page_cross_continue):
 # ifdef USE_AS_WMEMCHR
-	andl	$(CHAR_PER_VEC - 1), %eax
-# endif
-	/* Remove the leading bytes.  */
-	sarxl	%eax, %r8d, %eax
-# ifndef USE_AS_RAWMEMCHR
-	/* Check the end of data.  */
-	cmpq	%rsi, %rdx
-	jbe	L(first_vec_x0)
+	/* We can't use the end of the buffer to re-calculate the length
+	   for wmemchr as len * CHAR_SIZE may overflow.  */
+	leaq	-(VEC_SIZE + CHAR_SIZE)(%rdi), %rax
+	andq	$(VEC_SIZE * -1), %rdi
+	subq	%rdi, %rax
+	sarq	$2, %rax
+	addq	%rdx, %rax
+# else
+	leaq	-(VEC_SIZE + 1)(%rdx, %rdi), %rax
+	andq	$(VEC_SIZE * -1), %rdi
+	subq	%rdi, %rax
 # endif
-	testl	%eax, %eax
-	jz	L(cross_page_continue)
-	tzcntl	%eax, %eax
+
+	/* rax contains remaining length - 1.  -1 so we can get imm8
+	   encoding in a few additional places, saving code size.  */
+
+	/* Needed regardless of remaining length.  */
+	VPCMPEQ	VEC_SIZE(%rdi), %VMATCH, %k0
+	KMOV	%k0, %VRDX
+
+	/* We cannot fold the above `sub %rdi, %rax` with the `cmp
+	   $(CHAR_PER_VEC * 2), %rax` because it's possible for a very
+	   large length to overflow and cause the subtract to carry
+	   despite length being above CHAR_PER_VEC * 2.  */
+	cmpq	$(CHAR_PER_VEC * 2 - 1), %rax
+	ja	L(more_2x_vec)
+L(last_2x_vec):
+
+	test	%VRDX, %VRDX
+	jnz	L(first_vec_x1_check)
+
+	/* Check the end of data.  NB: use 8-bit operations to save code
+	   size.  We no longer need the full width of eax and will
+	   perform a write-only operation over eax so there will be no
+	   partial-register stalls.  */
+	subb	$(CHAR_PER_VEC * 1 - 1), %al
+	jle	L(zero_0)
+
+	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %VMATCH, %k0
+	KMOV	%k0, %VRCX
 # ifdef USE_AS_WMEMCHR
-	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
-	leaq	(%RAW_PTR_REG, %rax, CHAR_SIZE), %rax
+	/* For wmemchr we can't take advantage of tzcnt(0) == VEC_SIZE
+	   as CHAR_PER_VEC != VEC_SIZE.  */
+	test	%VRCX, %VRCX
+	jz	L(zero_0)
+# endif
+	tzcnt	%VRCX, %VRCX
+	cmp	%cl, %al
+
+	/* Same CFG for VEC_SIZE == 64 and VEC_SIZE == 32.  We give
+	   fallthrough to L(zero_0) for VEC_SIZE == 64 here as there is
+	   not enough space before the next cache line to fit the `lea`
+	   for return.  */
+# if VEC_SIZE == 64
+	ja	L(first_vec_x2_ret)
+L(zero_0):
+	xorl	%eax, %eax
+	ret
 # else
-	addq	%RAW_PTR_REG, %rax
+	jbe	L(zero_0)
+	leaq	(VEC_SIZE * 2)(%rdi, %rcx, CHAR_SIZE), %rax
+	ret
 # endif
+
+	.p2align 4,, 5
+L(first_vec_x1_check):
+	bsf	%VRDX, %VRDX
+	cmpb	%dl, %al
+	jb	L(zero_4)
+	leaq	(VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %rax
 	ret
 
-	.p2align 4
-L(first_vec_x1):
-	tzcntl	%eax, %eax
-	leaq	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
+	/* Fits at the end of the cache line here for VEC_SIZE == 32.
+	 */
+# if VEC_SIZE == 32
+L(zero_4):
+	xorl	%eax, %eax
 	ret
+# endif
 
-	.p2align 4
+
+	.p2align 4,, 4
 L(first_vec_x2):
-	tzcntl	%eax, %eax
-	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+	bsf	%VRCX, %VRCX
+L(first_vec_x2_ret):
+	leaq	(VEC_SIZE * 2)(%rdi, %rcx, CHAR_SIZE), %rax
 	ret
 
-	.p2align 4
-L(first_vec_x3):
-	tzcntl	%eax, %eax
-	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+	/* Fits at the end of the cache line here for VEC_SIZE == 64.
+	 */
+# if VEC_SIZE == 64
+L(zero_4):
+	xorl	%eax, %eax
 	ret
+# endif
 
-	.p2align 4
-L(first_vec_x4):
-	tzcntl	%eax, %eax
-	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
+	.p2align 4,, 4
+L(first_vec_x1):
+	bsf	%VRDX, %VRDX
+	leaq	(VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %rax
 	ret
 
-	.p2align 5
-L(aligned_more):
-	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
-	   since data is only aligned to VEC_SIZE.  */
 
-# ifndef USE_AS_RAWMEMCHR
-	/* Align data to VEC_SIZE.  */
-L(cross_page_continue):
-	xorl	%ecx, %ecx
-	subl	%edi, %ecx
-	andq	$-VEC_SIZE, %rdi
-	/* esi is for adjusting length to see if near the end.  */
-	leal	(VEC_SIZE * 5)(%rdi, %rcx), %esi
-#  ifdef USE_AS_WMEMCHR
-	/* NB: Divide bytes by 4 to get the wchar_t count.  */
-	sarl	$2, %esi
-#  endif
-# else
-	andq	$-VEC_SIZE, %rdi
-L(cross_page_continue):
-# endif
-	/* Load first VEC regardless.  */
-	VPCMP	$0, (VEC_SIZE)(%rdi), %YMMMATCH, %k0
-	kmovd	%k0, %eax
-# ifndef USE_AS_RAWMEMCHR
-	/* Adjust length. If near end handle specially.  */
-	subq	%rsi, %rdx
-	jbe	L(last_4x_vec_or_less)
-# endif
-	testl	%eax, %eax
+	.p2align 4,, 5
+L(more_2x_vec):
+	/* Length > VEC_SIZE * 2 so check first 2x VEC before rechecking
+	   length.  */
+
+
+	/* Already computed matches for first VEC in rdx.  */
+	test	%VRDX, %VRDX
 	jnz	L(first_vec_x1)
 
-	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
-	kmovd	%k0, %eax
-	testl	%eax, %eax
+
+	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %VMATCH, %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
 	jnz	L(first_vec_x2)
 
-	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
-	kmovd	%k0, %eax
-	testl	%eax, %eax
+	/* Needed regardless of next length check.  */
+	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %VMATCH, %k0
+	KMOV	%k0, %VRCX
+
+	/* Check if we are near the end.  */
+	cmpq	$(CHAR_PER_VEC * 4 - 1), %rax
+	ja	L(more_4x_vec)
+
+	test	%VRCX, %VRCX
+	jnz	L(first_vec_x3_check)
+
+	/* Use 8-bit instructions to save code size.  We won't use full-
+	   width eax again and will perform a write-only operation to
+	   eax so no worries about partial-register stalls.  */
+	subb	$(CHAR_PER_VEC * 3), %al
+	jb	L(zero_2)
+L(last_vec_check):
+	VPCMPEQ	(VEC_SIZE * 4)(%rdi), %VMATCH, %k0
+	KMOV	%k0, %VRCX
+# ifdef USE_AS_WMEMCHR
+	/* For wmemchr, again, we can't take advantage of tzcnt(0) ==
+	   VEC_SIZE as CHAR_PER_VEC != VEC_SIZE.  */
+	test	%VRCX, %VRCX
+	jz	L(zero_2)
+# endif
+	tzcnt	%VRCX, %VRCX
+	cmp	%cl, %al
+	jae	L(first_vec_x4_ret)
+L(zero_2):
+	xorl	%eax, %eax
+	ret
+
+	/* Fits at the end of the cache line here for VEC_SIZE == 64.
+	   For VEC_SIZE == 32 we put the return label at the end of
+	   L(first_vec_x4).  */
+# if VEC_SIZE == 64
+L(first_vec_x4_ret):
+	leaq	(VEC_SIZE * 4)(%rdi, %rcx, CHAR_SIZE), %rax
+	ret
+# endif
+
+	.p2align 4,, 6
+L(first_vec_x4):
+	bsf	%VRCX, %VRCX
+# if VEC_SIZE == 32
+	/* Place L(first_vec_x4_ret) here as we can't fit it in the same
+	   cache line as where it is called from so we might as well
+	   save code size by reusing return of L(first_vec_x4).  */
+L(first_vec_x4_ret):
+# endif
+	leaq	(VEC_SIZE * 4)(%rdi, %rcx, CHAR_SIZE), %rax
+	ret
+
+	.p2align 4,, 6
+L(first_vec_x3_check):
+	/* Need to adjust remaining length before checking.  */
+	addb	$-(CHAR_PER_VEC * 2), %al
+	bsf	%VRCX, %VRCX
+	cmpb	%cl, %al
+	jb	L(zero_2)
+	leaq	(VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %rax
+	ret
+
+	.p2align 4,, 6
+L(first_vec_x3):
+	bsf	%VRCX, %VRCX
+	leaq	(VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %rax
+	ret
+
+	.p2align 4,, 3
+# if !USE_TERN_IN_LOOP
+	.p2align 4,, 10
+# endif
+L(more_4x_vec):
+	test	%VRCX, %VRCX
 	jnz	L(first_vec_x3)
 
-	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
-	kmovd	%k0, %eax
-	testl	%eax, %eax
+	VPCMPEQ	(VEC_SIZE * 4)(%rdi), %VMATCH, %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
 	jnz	L(first_vec_x4)
 
+	subq	$-(VEC_SIZE * 5), %rdi
+	subq	$(CHAR_PER_VEC * 8), %rax
+	jb	L(last_4x_vec)
 
-# ifndef USE_AS_RAWMEMCHR
-	/* Check if at last CHAR_PER_VEC * 4 length.  */
-	subq	$(CHAR_PER_VEC * 4), %rdx
-	jbe	L(last_4x_vec_or_less_cmpeq)
-	/* +VEC_SIZE if USE_IN_RTM otherwise +VEC_SIZE * 5.  */
-	addq	$(VEC_SIZE + (VEC_SIZE * 4 - BASE_OFFSET)), %rdi
-
-	/* Align data to VEC_SIZE * 4 for the loop and readjust length.
-	 */
-#  ifdef USE_AS_WMEMCHR
+# ifdef USE_AS_WMEMCHR
 	movl	%edi, %ecx
-	andq	$-(4 * VEC_SIZE), %rdi
+# else
+	addq	%rdi, %rax
+# endif
+
+
+# if VEC_SIZE == 64
+	/* Use xorb to do `andq $-(VEC_SIZE * 4), %rdi`.  No EVEX
+	   processor has partial register stalls (all have a merging
+	   uop).  If that changes, this can be removed.  */
+	xorb	%dil, %dil
+# else
+	andq	$-(VEC_SIZE * 4), %rdi
+# endif
+
+# ifdef USE_AS_WMEMCHR
 	subl	%edi, %ecx
-	/* NB: Divide bytes by 4 to get the wchar_t count.  */
 	sarl	$2, %ecx
-	addq	%rcx, %rdx
-#  else
-	addq	%rdi, %rdx
-	andq	$-(4 * VEC_SIZE), %rdi
-	subq	%rdi, %rdx
-#  endif
+	addq	%rcx, %rax
 # else
-	addq	$(VEC_SIZE + (VEC_SIZE * 4 - BASE_OFFSET)), %rdi
-	andq	$-(4 * VEC_SIZE), %rdi
+	subq	%rdi, %rax
 # endif
-# ifdef USE_IN_RTM
-	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
-# else
-	/* copy ymmmatch to ymm0 so we can use vpcmpeq which is not
-	   encodable with EVEX registers (ymm16-ymm31).  */
-	vmovdqa64 %YMMMATCH, %ymm0
+
+
+
+# if USE_TERN_IN_LOOP
+	/* Copy VMATCH to low ymm so we can use vpcmpeq which is not
+	   encodable with EVEX registers.  NB: this is VEC_SIZE == 32
+	   only as there is no way to encode vpcmpeq with zmm0-15.  */
+	vmovdqa64 %VMATCH, %VMATCH_LO
 # endif
 
-	/* Compare 4 * VEC at a time forward.  */
-	.p2align 4
+	.p2align 4,, 11
 L(loop_4x_vec):
-	/* Two versions of the loop. One that does not require
-	   vzeroupper by not using ymm0-ymm15 and another does that require
-	   vzeroupper because it uses ymm0-ymm15. The reason why ymm0-ymm15
-	   is used at all is because there is no EVEX encoding vpcmpeq and
-	   with vpcmpeq this loop can be performed more efficiently. The
-	   non-vzeroupper version is safe for RTM while the vzeroupper
-	   version should be prefered if RTM are not supported.  */
-# ifdef USE_IN_RTM
-	/* It would be possible to save some instructions using 4x VPCMP
-	   but bottleneck on port 5 makes it not woth it.  */
-	VPCMP	$4, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k1
-	/* xor will set bytes match esi to zero.  */
-	vpxorq	(VEC_SIZE * 5)(%rdi), %YMMMATCH, %YMM2
-	vpxorq	(VEC_SIZE * 6)(%rdi), %YMMMATCH, %YMM3
-	VPCMP	$0, (VEC_SIZE * 7)(%rdi), %YMMMATCH, %k3
-	/* Reduce VEC2 / VEC3 with min and VEC1 with zero mask.  */
-	VPMINU	%YMM2, %YMM3, %YMM3{%k1}{z}
-	VPCMP	$0, %YMM3, %YMMZERO, %k2
-# else
+	/* Two versions of the loop.  One that does not require
+	   vzeroupper by not using ymm0-15 and another that does
+	   require vzeroupper because it uses ymm0-15.  The reason why
+	   ymm0-15 is used at all is because there is no EVEX encoding
+	   of vpcmpeq and with vpcmpeq this loop can be performed more
+	   efficiently.  The non-vzeroupper version is safe for RTM
+	   while the vzeroupper version should be preferred if RTM is
+	   not supported.  Which loop version we use is determined by
+	   USE_TERN_IN_LOOP.  */
+
+# if USE_TERN_IN_LOOP
 	/* Since vptern can only take 3x vectors fastest to do 1 vec
 	   seperately with EVEX vpcmp.  */
 #  ifdef USE_AS_WMEMCHR
 	/* vptern can only accept masks for epi32/epi64 so can only save
-	   instruction using not equals mask on vptern with wmemchr.  */
-	VPCMP	$4, (%rdi), %YMMMATCH, %k1
+	   instruction using not equals mask on vptern with wmemchr.
+	 */
+	VPCMP	$4, (VEC_SIZE * 0)(%rdi), %VMATCH, %k1
 #  else
-	VPCMP	$0, (%rdi), %YMMMATCH, %k1
+	VPCMPEQ	(VEC_SIZE * 0)(%rdi), %VMATCH, %k1
 #  endif
 	/* Compare 3x with vpcmpeq and or them all together with vptern.
 	 */
-	VPCMPEQ	VEC_SIZE(%rdi), %ymm0, %ymm2
-	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %ymm0, %ymm3
-	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %ymm0, %ymm4
+	VPCMPEQ	(VEC_SIZE * 1)(%rdi), %VMATCH_LO, %VMM_lo(2)
+	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %VMATCH_LO, %VMM_lo(3)
+	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %VMATCH_LO, %VMM_lo(4)
 #  ifdef USE_AS_WMEMCHR
-	/* This takes the not of or between ymm2, ymm3, ymm4 as well as
-	   combines result from VEC0 with zero mask.  */
-	vpternlogd $1, %ymm2, %ymm3, %ymm4{%k1}{z}
-	vpmovmskb %ymm4, %ecx
+	/* This takes the NOT of the OR of VEC_lo(2), VEC_lo(3) and
+	   VEC_lo(4), as well as combining the result for VEC(0) via the
+	   zero mask.  */
+	vpternlogd $1, %VMM_lo(2), %VMM_lo(3), %VMM_lo(4){%k1}{z}
+	vpmovmskb %VMM_lo(4), %VRCX
 #  else
-	/* 254 is mask for oring ymm2, ymm3, ymm4 into ymm4.  */
-	vpternlogd $254, %ymm2, %ymm3, %ymm4
-	vpmovmskb %ymm4, %ecx
-	kmovd	%k1, %eax
+	/* 254 is the imm8 truth table for ORing VEC_lo(2), VEC_lo(3),
+	   VEC_lo(4) into VEC_lo(4).  */
+	vpternlogd $254, %VMM_lo(2), %VMM_lo(3), %VMM_lo(4)
+	vpmovmskb %VMM_lo(4), %VRCX
+	KMOV	%k1, %edx
 #  endif
-# endif
 
-# ifdef USE_AS_RAWMEMCHR
-	subq	$-(VEC_SIZE * 4), %rdi
-# endif
-# ifdef USE_IN_RTM
-	kortestd %k2, %k3
 # else
-#  ifdef USE_AS_WMEMCHR
-	/* ecx contains not of matches. All 1s means no matches. incl will
-	   overflow and set zeroflag if that is the case.  */
-	incl	%ecx
-#  else
-	/* If either VEC1 (eax) or VEC2-VEC4 (ecx) are not zero. Adding
-	   to ecx is not an issue because if eax is non-zero it will be
-	   used for returning the match. If it is zero the add does
-	   nothing.  */
-	addq	%rax, %rcx
-#  endif
+	/* Loop version that uses EVEX encoding.  */
+	VPCMP	$4, (VEC_SIZE * 0)(%rdi), %VMATCH, %k1
+	vpxorq	(VEC_SIZE * 1)(%rdi), %VMATCH, %VMM(2)
+	vpxorq	(VEC_SIZE * 2)(%rdi), %VMATCH, %VMM(3)
+	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %VMATCH, %k3
+	VPMINU	%VMM(2), %VMM(3), %VMM(3){%k1}{z}
+	VPTESTN	%VMM(3), %VMM(3), %k2
 # endif
-# ifdef USE_AS_RAWMEMCHR
-	jz	L(loop_4x_vec)
-# else
-	jnz	L(loop_4x_vec_end)
+
+
+	TEST_END ()
+	jnz	L(loop_vec_ret)
 
 	subq	$-(VEC_SIZE * 4), %rdi
 
-	subq	$(CHAR_PER_VEC * 4), %rdx
-	ja	L(loop_4x_vec)
+	subq	$(CHAR_PER_VEC * 4), %rax
+	jae	L(loop_4x_vec)
 
-	/* Fall through into less than 4 remaining vectors of length case.
+	/* COND_VZEROUPPER is vzeroupper if we use the VEX encoded loop.
 	 */
-	VPCMP	$0, BASE_OFFSET(%rdi), %YMMMATCH, %k0
-	addq	$(BASE_OFFSET - VEC_SIZE), %rdi
-	kmovd	%k0, %eax
-	VZEROUPPER
-
-L(last_4x_vec_or_less):
-	/* Check if first VEC contained match.  */
-	testl	%eax, %eax
-	jnz	L(first_vec_x1_check)
+	COND_VZEROUPPER
 
-	/* If remaining length > CHAR_PER_VEC * 2.  */
-	addl	$(CHAR_PER_VEC * 2), %edx
-	jg	L(last_4x_vec)
-
-L(last_2x_vec):
-	/* If remaining length < CHAR_PER_VEC.  */
-	addl	$CHAR_PER_VEC, %edx
-	jle	L(zero_end)
-
-	/* Check VEC2 and compare any match with remaining length.  */
-	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
-	kmovd	%k0, %eax
-	tzcntl	%eax, %eax
-	cmpl	%eax, %edx
-	jbe	L(set_zero_end)
-	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
-L(zero_end):
-	ret
+	.p2align 4,, 10
+L(last_4x_vec):
+	/* For CHAR_PER_VEC == 64 we don't need to mask as we use 8-bit
+	   instructions on eax from here on out.  */
+# if CHAR_PER_VEC != 64
+	andl	$(CHAR_PER_VEC * 4 - 1), %eax
+# endif
+	VPCMPEQ	(VEC_SIZE * 0)(%rdi), %VMATCH, %k0
+	subq	$(VEC_SIZE * 1), %rdi
+	KMOV	%k0, %VRDX
+	cmpb	$(CHAR_PER_VEC * 2 - 1), %al
+	jbe	L(last_2x_vec)
+	test	%VRDX, %VRDX
+	jnz	L(last_vec_x1_novzero)
+
+	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %VMATCH, %k0
+	KMOV	%k0, %VRDX
+	test	%VRDX, %VRDX
+	jnz	L(last_vec_x2_novzero)
+
+	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %VMATCH, %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(first_vec_x3_check)
+
+	subb	$(CHAR_PER_VEC * 3), %al
+	jae	L(last_vec_check)
 
-L(set_zero_end):
 	xorl	%eax, %eax
 	ret
 
-	.p2align 4
-L(first_vec_x1_check):
-	/* eax must be non-zero. Use bsfl to save code size.  */
-	bsfl	%eax, %eax
-	/* Adjust length.  */
-	subl	$-(CHAR_PER_VEC * 4), %edx
-	/* Check if match within remaining length.  */
-	cmpl	%eax, %edx
-	jbe	L(set_zero_end)
-	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
-	leaq	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
+# if defined USE_AS_WMEMCHR && USE_TERN_IN_LOOP
+L(last_vec_x2_novzero):
+	addq	$VEC_SIZE, %rdi
+L(last_vec_x1_novzero):
+	bsf	%VRDX, %VRDX
+	leaq	(VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %rax
 	ret
+# endif
 
-	.p2align 4
-L(loop_4x_vec_end):
+# if CHAR_PER_VEC == 64
+	/* Since we can't combine the last 2x VEC when CHAR_PER_VEC ==
+	   64 it needs a separate return label.  */
+	.p2align 4,, 4
+L(last_vec_x2):
+L(last_vec_x2_novzero):
+	bsf	%VRDX, %VRDX
+	leaq	(VEC_SIZE * 2)(%rdi, %rdx, TERN_CHAR_MULT), %rax
+	ret
 # endif
-	/* rawmemchr will fall through into this if match was found in
-	   loop.  */
 
-# if defined USE_IN_RTM || defined USE_AS_WMEMCHR
-	/* k1 has not of matches with VEC1.  */
-	kmovd	%k1, %eax
-#  ifdef USE_AS_WMEMCHR
-	subl	$((1 << CHAR_PER_VEC) - 1), %eax
-#  else
-	incl	%eax
-#  endif
+	.p2align 4,, 4
+L(loop_vec_ret):
+# if defined USE_AS_WMEMCHR || !USE_TERN_IN_LOOP
+	KMOV	%k1, %VRAX
+	inc	%MASK_GPR(rax)
 # else
-	/* eax already has matches for VEC1.  */
-	testl	%eax, %eax
+	test	%VRDX, %VRDX
 # endif
-	jnz	L(last_vec_x1_return)
+	jnz	L(last_vec_x0)
 
-# ifdef USE_IN_RTM
-	VPCMP	$0, %YMM2, %YMMZERO, %k0
-	kmovd	%k0, %eax
+
+# if USE_TERN_IN_LOOP
+	vpmovmskb %VMM_lo(2), %VRDX
 # else
-	vpmovmskb %ymm2, %eax
+	VPTESTN	%VMM(2), %VMM(2), %k1
+	KMOV	%k1, %VRDX
 # endif
-	testl	%eax, %eax
-	jnz	L(last_vec_x2_return)
+	test	%VRDX, %VRDX
+	jnz	L(last_vec_x1)
 
-# ifdef USE_IN_RTM
-	kmovd	%k2, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x3_return)
 
-	kmovd	%k3, %eax
-	tzcntl	%eax, %eax
-	leaq	(VEC_SIZE * 3 + RET_OFFSET)(%rdi, %rax, CHAR_SIZE), %rax
+# if USE_TERN_IN_LOOP
+	vpmovmskb %VMM_lo(3), %VRDX
 # else
-	vpmovmskb %ymm3, %eax
-	/* Combine matches in VEC3 (eax) with matches in VEC4 (ecx).  */
-	salq	$VEC_SIZE, %rcx
-	orq	%rcx, %rax
-	tzcntq	%rax, %rax
-	leaq	(VEC_SIZE * 2 + RET_OFFSET)(%rdi, %rax), %rax
-	VZEROUPPER
+	KMOV	%k2, %VRDX
 # endif
-	ret
 
-	.p2align 4,, 10
-L(last_vec_x1_return):
-	tzcntl	%eax, %eax
-# if defined USE_AS_WMEMCHR || RET_OFFSET != 0
-	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
-	leaq	RET_OFFSET(%rdi, %rax, CHAR_SIZE), %rax
+	/* No longer need any of the lo vecs (ymm0-15) so vzeroupper
+	   (only if we used the VEX-encoded loop).  */
+	COND_VZEROUPPER
+
+	/* Separate logic for CHAR_PER_VEC == 64 vs the rest.  For
+	   CHAR_PER_VEC == 64 we test the last 2x VEC separately; for
+	   CHAR_PER_VEC <= 32 we can combine the results from the 2x
+	   VEC in a single GPR.  */
+# if CHAR_PER_VEC == 64
+#  if USE_TERN_IN_LOOP
+#   error "Unsupported"
+#  endif
+
+
+	/* If CHAR_PER_VEC == 64 we can't combine the last two VEC.  */
+	test	%VRDX, %VRDX
+	jnz	L(last_vec_x2)
+	KMOV	%k3, %VRDX
 # else
-	addq	%rdi, %rax
+	/* CHAR_PER_VEC <= 32 so we can combine the results from the
+	   last 2x VEC.  */
+
+#  if !USE_TERN_IN_LOOP
+	KMOV	%k3, %VRCX
+#  endif
+	salq	$(VEC_SIZE / TERN_CHAR_MULT), %rcx
+	addq	%rcx, %rdx
+#  if !defined USE_AS_WMEMCHR || !USE_TERN_IN_LOOP
+L(last_vec_x2_novzero):
+#  endif
 # endif
-	VZEROUPPER
+	bsf	%rdx, %rdx
+	leaq	(LAST_VEC_OFFSET)(%rdi, %rdx, TERN_CHAR_MULT), %rax
 	ret
 
-	.p2align 4
-L(last_vec_x2_return):
-	tzcntl	%eax, %eax
-	/* NB: Multiply bytes by RET_SCALE to get the wchar_t count
-	   if relevant (RET_SCALE = CHAR_SIZE if USE_AS_WMEMCHAR and
-	   USE_IN_RTM are both defined. Otherwise RET_SCALE = 1.  */
-	leaq	(VEC_SIZE + RET_OFFSET)(%rdi, %rax, RET_SCALE), %rax
-	VZEROUPPER
+	.p2align 4,, 8
+L(last_vec_x1):
+	COND_VZEROUPPER
+# if !defined USE_AS_WMEMCHR || !USE_TERN_IN_LOOP
+L(last_vec_x1_novzero):
+# endif
+	bsf	%VRDX, %VRDX
+	leaq	(VEC_SIZE * 1)(%rdi, %rdx, TERN_CHAR_MULT), %rax
 	ret
 
-# ifdef USE_IN_RTM
-	.p2align 4
-L(last_vec_x3_return):
-	tzcntl	%eax, %eax
-	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
-	leaq	(VEC_SIZE * 2 + RET_OFFSET)(%rdi, %rax, CHAR_SIZE), %rax
+
+	.p2align 4,, 4
+L(last_vec_x0):
+	COND_VZEROUPPER
+	bsf	%VGPR(GPR_X0), %VGPR(GPR_X0)
+# if GPR_X0_IS_RET
+	addq	%rdi, %rax
+# else
+	leaq	(%rdi, %GPR_X0, CHAR_SIZE), %rax
+# endif
 	ret
+
+	.p2align 4,, 6
+L(page_cross):
+	/* Need to preserve eax to compute the in-bounds bytes we are
+	   checking.  */
+# ifdef USE_AS_WMEMCHR
+	movl	%eax, %ecx
+# else
+	xorl	%ecx, %ecx
+	subl	%eax, %ecx
 # endif
 
-# ifndef USE_AS_RAWMEMCHR
-	.p2align 4,, 5
-L(last_4x_vec_or_less_cmpeq):
-	VPCMP	$0, (VEC_SIZE * 5)(%rdi), %YMMMATCH, %k0
-	kmovd	%k0, %eax
-	subq	$-(VEC_SIZE * 4), %rdi
-	/* Check first VEC regardless.  */
-	testl	%eax, %eax
-	jnz	L(first_vec_x1_check)
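+	/* eax still has the low page-offset bits of rdi, so the xor
+	   below leaves the page-aligned base in rax for the load of
+	   the last VEC in the page.  */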
+	xorq	%rdi, %rax
+	VPCMPEQ	(PAGE_SIZE - VEC_SIZE)(%rax), %VMATCH, %k0
+	KMOV	%k0, %VRAX
 
-	/* If remaining length <= CHAR_PER_VEC * 2.  */
-	addl	$(CHAR_PER_VEC * 2), %edx
-	jle	L(last_2x_vec)
+# ifdef USE_AS_WMEMCHR
+	/* NB: Divide by CHAR_SIZE to shift out out-of-bounds bytes.  */
+	shrl	$2, %ecx
+	andl	$(CHAR_PER_VEC - 1), %ecx
+# endif
 
-	.p2align 4
-L(last_4x_vec):
-	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
-	kmovd	%k0, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x2)
 
+	shrx	%VGPR(PC_SHIFT_GPR), %VRAX, %VRAX
 
-	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
-	kmovd	%k0, %eax
-	/* Create mask for possible matches within remaining length.  */
-#  ifdef USE_AS_WMEMCHR
-	movl	$((1 << (CHAR_PER_VEC * 2)) - 1), %ecx
-	bzhil	%edx, %ecx, %ecx
-#  else
-	movq	$-1, %rcx
-	bzhiq	%rdx, %rcx, %rcx
-#  endif
-	/* Test matches in data against length match.  */
-	andl	%ecx, %eax
-	jnz	L(last_vec_x3)
+# ifdef USE_AS_WMEMCHR
+	negl	%ecx
+# endif
 
-	/* if remaining length <= CHAR_PER_VEC * 3 (Note this is after
-	   remaining length was found to be > CHAR_PER_VEC * 2.  */
-	subl	$CHAR_PER_VEC, %edx
-	jbe	L(zero_end2)
+	/* Mask the lower bits of ecx (negative eax) to get the number
+	   of chars until the next VEC.  */
+	andl	$(CHAR_PER_VEC - 1), %ecx
 
+	/* Check if the search is entirely contained in the remainder
+	   of the page.  */
+	cmpq	%rcx, %rdx
+	jbe	L(page_cross_ret)
 
-	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
-	kmovd	%k0, %eax
-	/* Shift remaining length mask for last VEC.  */
-#  ifdef USE_AS_WMEMCHR
-	shrl	$CHAR_PER_VEC, %ecx
-#  else
-	shrq	$CHAR_PER_VEC, %rcx
-#  endif
-	andl	%ecx, %eax
-	jz	L(zero_end2)
-	bsfl	%eax, %eax
-	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
-L(zero_end2):
-	ret
+	/* Length crosses the page so if rax is zero (no matches)
+	   continue.  */
+	test	%VRAX, %VRAX
+	jz	L(page_cross_continue)
 
-L(last_vec_x2):
-	tzcntl	%eax, %eax
-	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+	/* If rdx > rcx then any match here must be in [buf:buf + len].
+	 */
+	tzcnt	%VRAX, %VRAX
+# ifdef USE_AS_WMEMCHR
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+# else
+	addq	%rdi, %rax
+# endif
 	ret
 
-	.p2align 4
-L(last_vec_x3):
-	tzcntl	%eax, %eax
-	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+	.p2align 4,, 2
+L(page_cross_zero):
+	xorl	%eax, %eax
 	ret
+
+	.p2align 4,, 4
+L(page_cross_ret):
+	/* Search is entirely contained in the page-cross case.  */
+# ifdef USE_AS_WMEMCHR
+	test	%VRAX, %VRAX
+	jz	L(page_cross_zero)
+# endif
+	tzcnt	%VRAX, %VRAX
+	cmpl	%eax, %edx
+	jbe	L(page_cross_zero)
+# ifdef USE_AS_WMEMCHR
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+# else
+	addq	%rdi, %rax
 # endif
-	/* 7 bytes from next cache line.  */
+	ret
 END (MEMCHR)
 #endif
diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S b/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S
index deda1ca395..2073eaa620 100644
--- a/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S
+++ b/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S
@@ -1,3 +1,6 @@
-#define MEMCHR __rawmemchr_evex_rtm
-#define USE_AS_RAWMEMCHR 1
-#include "memchr-evex-rtm.S"
+#define RAWMEMCHR	__rawmemchr_evex_rtm
+
+#define USE_IN_RTM	1
+#define SECTION(p)	p##.evex.rtm
+
+#include "rawmemchr-evex.S"
diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex.S b/sysdeps/x86_64/multiarch/rawmemchr-evex.S
index dc1c450699..dad54def2b 100644
--- a/sysdeps/x86_64/multiarch/rawmemchr-evex.S
+++ b/sysdeps/x86_64/multiarch/rawmemchr-evex.S
@@ -1,7 +1,308 @@
-#ifndef RAWMEMCHR
-# define RAWMEMCHR	__rawmemchr_evex
-#endif
-#define USE_AS_RAWMEMCHR	1
-#define MEMCHR	RAWMEMCHR
+/* rawmemchr optimized with 256-bit EVEX instructions.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <isa-level.h>
+#include <sysdep.h>
+
+#if ISA_SHOULD_BUILD (4)
+
+# ifndef VEC_SIZE
+#  include "x86-evex256-vecs.h"
+# endif
+
+# ifndef RAWMEMCHR
+#  define RAWMEMCHR	__rawmemchr_evex
+# endif
+
+
+# define PC_SHIFT_GPR	rdi
+# define REG_WIDTH	VEC_SIZE
+# define VPTESTN	vptestnmb
+# define VPBROADCAST	vpbroadcastb
+# define VPMINU	vpminub
+# define VPCMP	vpcmpb
+# define VPCMPEQ	vpcmpeqb
+# define CHAR_SIZE	1
+
+# include "reg-macros.h"
+
+/* If not in RTM and VEC_SIZE != 64 (VEC_SIZE == 64 has no VEX
+   encoding), use VEX encoding in the loop so we can use vpcmpeqb +
+   vptern, which is more efficient than the EVEX alternative.  */
+# if defined USE_IN_RTM || VEC_SIZE == 64
+#  undef COND_VZEROUPPER
+#  undef VZEROUPPER_RETURN
+#  undef VZEROUPPER
+
+
+#  define COND_VZEROUPPER
+#  define VZEROUPPER_RETURN	ret
+#  define VZEROUPPER
+
+#  define USE_TERN_IN_LOOP	0
+# else
+#  define USE_TERN_IN_LOOP	1
+#  undef VZEROUPPER
+#  define VZEROUPPER	vzeroupper
+# endif
+
+# define CHAR_PER_VEC	VEC_SIZE
+
+# if CHAR_PER_VEC == 64
+
+#  define TAIL_RETURN_LBL	first_vec_x2
+#  define TAIL_RETURN_OFFSET	(CHAR_PER_VEC * 2)
+
+#  define FALLTHROUGH_RETURN_LBL	first_vec_x3
+#  define FALLTHROUGH_RETURN_OFFSET	(CHAR_PER_VEC * 3)
+
+# else	/* !(CHAR_PER_VEC == 64) */
+
+#  define TAIL_RETURN_LBL	first_vec_x3
+#  define TAIL_RETURN_OFFSET	(CHAR_PER_VEC * 3)
+
+#  define FALLTHROUGH_RETURN_LBL	first_vec_x2
+#  define FALLTHROUGH_RETURN_OFFSET	(CHAR_PER_VEC * 2)
+# endif	/* !(CHAR_PER_VEC == 64) */
+
+
+# define VMATCH	VMM(0)
+# define VMATCH_LO	VMM_lo(0)
+
+# define PAGE_SIZE	4096
+
+	.section SECTION(.text), "ax", @progbits
+ENTRY_P2ALIGN (RAWMEMCHR, 6)
+	VPBROADCAST %esi, %VMATCH
+	/* Check if we may cross page boundary with one vector load.  */
+	movl	%edi, %eax
+	andl	$(PAGE_SIZE - 1), %eax
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	ja	L(page_cross)
+
+	VPCMPEQ	(%rdi), %VMATCH, %k0
+	KMOV	%k0, %VRAX
+
+	test	%VRAX, %VRAX
+	jz	L(aligned_more)
+L(first_vec_x0):
+	bsf	%VRAX, %VRAX
+	addq	%rdi, %rax
+	ret
+
+	.p2align 4,, 4
+L(first_vec_x4):
+	bsf	%VRAX, %VRAX
+	leaq	(VEC_SIZE * 4)(%rdi, %rax), %rax
+	ret
 
-#include "memchr-evex.S"
+	/* For VEC_SIZE == 32 we can fit this in the alignment bytes so
+	   we might as well place it more locally.  For VEC_SIZE == 64
+	   we reuse the return code at the end of the loop's return.  */
+# if VEC_SIZE == 32
+	.p2align 4,, 4
+L(FALLTHROUGH_RETURN_LBL):
+	bsf	%VRAX, %VRAX
+	leaq	(FALLTHROUGH_RETURN_OFFSET)(%rdi, %rax), %rax
+	ret
+# endif
+
+	.p2align 4,, 6
+L(page_cross):
+	/* eax has lower page-offset bits of rdi so xor will zero them
+	   out.  */
+	xorq	%rdi, %rax
+	VPCMPEQ	(PAGE_SIZE - VEC_SIZE)(%rax), %VMATCH, %k0
+	KMOV	%k0, %VRAX
+
+	/* Shift out out-of-bounds matches.  */
+	shrx	%VRDI, %VRAX, %VRAX
+	test	%VRAX, %VRAX
+	jnz	L(first_vec_x0)
+
+	.p2align 4,, 10
+L(aligned_more):
+L(page_cross_continue):
+	/* Align pointer.  */
+	andq	$(VEC_SIZE * -1), %rdi
+
+	VPCMPEQ	VEC_SIZE(%rdi), %VMATCH, %k0
+	KMOV	%k0, %VRAX
+	test	%VRAX, %VRAX
+	jnz	L(first_vec_x1)
+
+	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %VMATCH, %k0
+	KMOV	%k0, %VRAX
+	test	%VRAX, %VRAX
+	jnz	L(first_vec_x2)
+
+	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %VMATCH, %k0
+	KMOV	%k0, %VRAX
+	test	%VRAX, %VRAX
+	jnz	L(first_vec_x3)
+
+	VPCMPEQ	(VEC_SIZE * 4)(%rdi), %VMATCH, %k0
+	KMOV	%k0, %VRAX
+	test	%VRAX, %VRAX
+	jnz	L(first_vec_x4)
+
+	subq	$-(VEC_SIZE * 1), %rdi
+# if VEC_SIZE == 64
+	/* Saves code size.  No evex512 processor has partial register
+	   stalls.  If that changes, this can be replaced with `andq
+	   $-(VEC_SIZE * 4), %rdi`.  */
+	xorb	%dil, %dil
+# else
+	andq	$-(VEC_SIZE * 4), %rdi
+# endif
+
+# if USE_TERN_IN_LOOP
+	/* Copy VMATCH to low ymm so we can use vpcmpeq which is not
+	   encodable with EVEX registers.  NB: this is VEC_SIZE == 32
+	   only as there is no way to encode vpcmpeq with zmm0-15.  */
+	vmovdqa64 %VMATCH, %VMATCH_LO
+# endif
+
+	.p2align 4
+L(loop_4x_vec):
+	/* Two versions of the loop.  One that does not require
+	   vzeroupper by not using ymm0-15 and another that does
+	   require vzeroupper because it uses ymm0-15.  The reason why
+	   ymm0-15 is used at all is because there is no EVEX encoding
+	   of vpcmpeq and with vpcmpeq this loop can be performed more
+	   efficiently.  The non-vzeroupper version is safe for RTM
+	   while the vzeroupper version should be preferred if RTM is
+	   not supported.  Which loop version we use is determined by
+	   USE_TERN_IN_LOOP.  */
+
+# if USE_TERN_IN_LOOP
+	/* Since vptern can only take 3x vectors, it is fastest to do 1
+	   vec separately with EVEX vpcmp.  */
+	VPCMPEQ	(VEC_SIZE * 4)(%rdi), %VMATCH, %k1
+	/* Compare 3x with vpcmpeq and or them all together with vptern.
+	 */
+
+	VPCMPEQ	(VEC_SIZE * 5)(%rdi), %VMATCH_LO, %VMM_lo(2)
+	subq	$(VEC_SIZE * -4), %rdi
+	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %VMATCH_LO, %VMM_lo(3)
+	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %VMATCH_LO, %VMM_lo(4)
+
+	/* 254 is the imm8 truth table for ORing VEC_lo(2), VEC_lo(3),
+	   VEC_lo(4) into VEC_lo(4).  */
+	vpternlogd $254, %VMM_lo(2), %VMM_lo(3), %VMM_lo(4)
+	vpmovmskb %VMM_lo(4), %VRCX
+
+	KMOV	%k1, %eax
+
+	/* NB: rax has matches from the first VEC and rcx has matches
+	   from VEC 2-4.  If rax is non-zero we will return that match.
+	   If rax is zero, adding won't disturb the bits in rcx.  */
+	add	%rax, %rcx
+# else
+	/* Loop version that uses EVEX encoding.  */
+	VPCMP	$4, (VEC_SIZE * 4)(%rdi), %VMATCH, %k1
+	vpxorq	(VEC_SIZE * 5)(%rdi), %VMATCH, %VMM(2)
+	vpxorq	(VEC_SIZE * 6)(%rdi), %VMATCH, %VMM(3)
+	VPCMPEQ	(VEC_SIZE * 7)(%rdi), %VMATCH, %k3
+	VPMINU	%VMM(2), %VMM(3), %VMM(3){%k1}{z}
+	VPTESTN	%VMM(3), %VMM(3), %k2
+	subq	$(VEC_SIZE * -4), %rdi
+	KORTEST %k2, %k3
+# endif
+	jz	L(loop_4x_vec)
+
+# if USE_TERN_IN_LOOP
+	test	%VRAX, %VRAX
+# else
+	KMOV	%k1, %VRAX
+	inc	%VRAX
+# endif
+	jnz	L(last_vec_x0)
+
+
+# if USE_TERN_IN_LOOP
+	vpmovmskb %VMM_lo(2), %VRAX
+# else
+	VPTESTN	%VMM(2), %VMM(2), %k1
+	KMOV	%k1, %VRAX
+# endif
+	test	%VRAX, %VRAX
+	jnz	L(last_vec_x1)
+
+
+# if USE_TERN_IN_LOOP
+	vpmovmskb %VMM_lo(3), %VRAX
+# else
+	KMOV	%k2, %VRAX
+# endif
+
+	/* No longer need any of the lo vecs (ymm0-15) so vzeroupper
+	   (only if we used the VEX-encoded loop).  */
+	COND_VZEROUPPER
+
+	/* Separate logic for VEC_SIZE == 64 and VEC_SIZE == 32 for
+	   returning the last 2x VEC.  For VEC_SIZE == 64 we test each
+	   VEC individually, for VEC_SIZE == 32 we combine them in a
+	   single 64-bit GPR.  */
+# if CHAR_PER_VEC == 64
+#  if USE_TERN_IN_LOOP
+#   error "Unsupported"
+#  endif
+
+
+	/* If CHAR_PER_VEC == 64 we can't combine the last two VEC.  */
+	test	%VRAX, %VRAX
+	jnz	L(first_vec_x2)
+	KMOV	%k3, %VRAX
+L(FALLTHROUGH_RETURN_LBL):
+# else
+	/* CHAR_PER_VEC <= 32 so we can combine the results from the
+	   last 2x VEC.  */
+#  if !USE_TERN_IN_LOOP
+	KMOV	%k3, %VRCX
+#  endif
+	salq	$CHAR_PER_VEC, %rcx
+	addq	%rcx, %rax
+# endif
+	bsf	%rax, %rax
+	leaq	(FALLTHROUGH_RETURN_OFFSET)(%rdi, %rax), %rax
+	ret
+
+	.p2align 4,, 8
+L(TAIL_RETURN_LBL):
+	bsf	%rax, %rax
+	leaq	(TAIL_RETURN_OFFSET)(%rdi, %rax), %rax
+	ret
+
+	.p2align 4,, 8
+L(last_vec_x1):
+	COND_VZEROUPPER
+L(first_vec_x1):
+	bsf	%VRAX, %VRAX
+	leaq	(VEC_SIZE * 1)(%rdi, %rax), %rax
+	ret
+
+	.p2align 4,, 8
+L(last_vec_x0):
+	COND_VZEROUPPER
+	bsf	%VRAX, %VRAX
+	addq	%rdi, %rax
+	ret
+END (RAWMEMCHR)
+#endif
-- 
2.34.1

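A rough intrinsics rendering of the VEX loop body above: three
vpcmpeqb results are folded with a single vpternlogd whose imm8 (254)
computes A | B | C.  This is a sketch assuming AVX-512VL is available;
the function name is illustrative:

#include <immintrin.h>
#include <stdint.h>

/* Nonzero iff byte c occurs in the 96 readable bytes at p.  */
static uint32_t
any_match_3x32 (const void *p, int c)
{
  const __m256i *v = (const __m256i *) p;
  __m256i match = _mm256_set1_epi8 ((char) c);
  __m256i m0 = _mm256_cmpeq_epi8 (_mm256_loadu_si256 (v + 0), match);
  __m256i m1 = _mm256_cmpeq_epi8 (_mm256_loadu_si256 (v + 1), match);
  __m256i m2 = _mm256_cmpeq_epi8 (_mm256_loadu_si256 (v + 2), match);
  /* vpternlogd, imm8 254: a result bit is set if it is set in any of
     m0 / m1 / m2.  */
  __m256i any = _mm256_ternarylogic_epi32 (m0, m1, m2, 254);
  return (uint32_t) _mm256_movemask_epi8 (any);
}

In the loop itself the fourth vector of the unroll is handled with an
EVEX vpcmp into a mask register, whose mask is then merged into the
same GPR test (the `add %rax, %rcx` above).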

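Likewise the entry page-cross guard both files share, in C: a
VEC_SIZE-byte load at p stays within p's page iff the page offset of p
is at most PAGE_SIZE - VEC_SIZE.  A minimal sketch (function name
illustrative):

#include <stdint.h>

#define PAGE_SIZE 4096

/* Nonzero if a vec_size-byte load at p may cross a page boundary;
   mirrors `andl $(PAGE_SIZE - 1), %eax; cmpl $(PAGE_SIZE - VEC_SIZE),
   %eax; ja L(page_cross)`.  */
static int
may_cross_page (const void *p, unsigned int vec_size)
{
  return ((uintptr_t) p & (PAGE_SIZE - 1)) > PAGE_SIZE - vec_size;
}
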
^ permalink raw reply	[flat|nested] 41+ messages in thread

* [PATCH v2 2/7] x86: Shrink / minorly optimize strchr-evex and implement with VMM headers
  2022-10-18 23:19 ` [PATCH v2 " Noah Goldstein
@ 2022-10-18 23:19   ` Noah Goldstein
  2022-10-18 23:19   ` [PATCH v2 3/7] x86: Optimize strnlen-evex.S " Noah Goldstein
                     ` (4 subsequent siblings)
  5 siblings, 0 replies; 41+ messages in thread
From: Noah Goldstein @ 2022-10-18 23:19 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

Size Optimizations:
1. Condense the hot path for better cache locality.
    - This has the most impact for strchrnul, where the logic for
      strings with len <= VEC_SIZE or with a match in the first VEC now
      fits entirely in the first cache line.
2. Reuse common targets in first 4x VEC and after the loop.
3. Don't align targets so aggressively if it doesn't change the number
   of fetch blocks it will require, and take more care to avoid cases
   where targets unnecessarily split cache lines.
4. Align the loop better for the DSB/LSD.
5. Use more code-size efficient instructions.
	- tzcnt ...     -> bsf ...
	- vpcmpb $0 ... -> vpcmpeq ...
6. Align labels less aggressively, especially if it doesn't save fetch
   blocks / causes the basic-block to span extra cache-lines.

Code Size Changes:
strchr-evex.S	: -63 bytes
strchrnul-evex.S: -48 bytes

Net perf changes:
Reported as the geometric mean of all improvements / regressions from
N=10 runs of the benchtests.  Value is New Time / Old Time, so < 1.0 is
an improvement and > 1.0 is a regression (see the sketch after the
diffstat below).

strchr-evex.S (Fixed)   : 0.971
strchr-evex.S (Rand)    : 0.932
strchrnul-evex.S        : 0.965

Full results attached in email.

Full check passes on x86-64.
---
 sysdeps/x86_64/multiarch/strchr-evex.S | 558 +++++++++++++++----------
 1 file changed, 340 insertions(+), 218 deletions(-)
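
The perf numbers above are computed as described: a geometric mean of
per-benchmark New Time / Old Time ratios, i.e. (a sketch, not
benchtest code):

#include <math.h>
#include <stddef.h>

/* Geometric mean of time ratios; < 1.0 means the new version is
   faster overall.  */
static double
geomean_ratio (const double *new_times, const double *old_times,
	       size_t n)
{
  double sum_logs = 0.0;
  for (size_t i = 0; i < n; i++)
    sum_logs += log (new_times[i] / old_times[i]);
  return exp (sum_logs / n);
}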

diff --git a/sysdeps/x86_64/multiarch/strchr-evex.S b/sysdeps/x86_64/multiarch/strchr-evex.S
index a1c15c4419..c2a0d112f7 100644
--- a/sysdeps/x86_64/multiarch/strchr-evex.S
+++ b/sysdeps/x86_64/multiarch/strchr-evex.S
@@ -26,48 +26,75 @@
 #  define STRCHR	__strchr_evex
 # endif
 
-# define VMOVU		vmovdqu64
-# define VMOVA		vmovdqa64
+# ifndef VEC_SIZE
+#  include "x86-evex256-vecs.h"
+# endif
 
 # ifdef USE_AS_WCSCHR
 #  define VPBROADCAST	vpbroadcastd
-#  define VPCMP		vpcmpd
+#  define VPCMP	vpcmpd
+#  define VPCMPEQ	vpcmpeqd
 #  define VPTESTN	vptestnmd
+#  define VPTEST	vptestmd
 #  define VPMINU	vpminud
 #  define CHAR_REG	esi
-#  define SHIFT_REG	ecx
+#  define SHIFT_REG	rcx
 #  define CHAR_SIZE	4
+
+#  define USE_WIDE_CHAR
 # else
 #  define VPBROADCAST	vpbroadcastb
-#  define VPCMP		vpcmpb
+#  define VPCMP	vpcmpb
+#  define VPCMPEQ	vpcmpeqb
 #  define VPTESTN	vptestnmb
+#  define VPTEST	vptestmb
 #  define VPMINU	vpminub
 #  define CHAR_REG	sil
-#  define SHIFT_REG	edx
+#  define SHIFT_REG	rdi
 #  define CHAR_SIZE	1
 # endif
 
-# define XMMZERO	xmm16
-
-# define YMMZERO	ymm16
-# define YMM0		ymm17
-# define YMM1		ymm18
-# define YMM2		ymm19
-# define YMM3		ymm20
-# define YMM4		ymm21
-# define YMM5		ymm22
-# define YMM6		ymm23
-# define YMM7		ymm24
-# define YMM8		ymm25
-
-# define VEC_SIZE 32
-# define PAGE_SIZE 4096
-# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
-
-	.section .text.evex,"ax",@progbits
-ENTRY_P2ALIGN (STRCHR, 5)
-	/* Broadcast CHAR to YMM0.	*/
-	VPBROADCAST	%esi, %YMM0
+# include "reg-macros.h"
+
+# if VEC_SIZE == 64
+#  define MASK_GPR	rcx
+#  define LOOP_REG	rax
+
+#  define COND_MASK(k_reg)	{%k_reg}
+# else
+#  define MASK_GPR	rax
+#  define LOOP_REG	rdi
+
+#  define COND_MASK(k_reg)
+# endif
+
+# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
+
+
+# if CHAR_PER_VEC == 64
+#  define LAST_VEC_OFFSET	(VEC_SIZE * 3)
+#  define TESTZ(reg)	incq %VGPR_SZ(reg, 64)
+# else
+
+#  if CHAR_PER_VEC == 32
+#   define TESTZ(reg)	incl %VGPR_SZ(reg, 32)
+#  elif CHAR_PER_VEC == 16
+#   define TESTZ(reg)	incw %VGPR_SZ(reg, 16)
+#  else
+#   define TESTZ(reg)	incb %VGPR_SZ(reg, 8)
+#  endif
+
+#  define LAST_VEC_OFFSET	(VEC_SIZE * 2)
+# endif
+
+# define VMATCH	VMM(0)
+
+# define PAGE_SIZE	4096
+
+	.section SECTION(.text), "ax", @progbits
+ENTRY_P2ALIGN (STRCHR, 6)
+	/* Broadcast CHAR to VEC_0.  */
+	VPBROADCAST %esi, %VMATCH
 	movl	%edi, %eax
 	andl	$(PAGE_SIZE - 1), %eax
 	/* Check if we cross page boundary with one vector load.
@@ -75,19 +102,27 @@ ENTRY_P2ALIGN (STRCHR, 5)
 	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
 	ja	L(cross_page_boundary)
 
+
 	/* Check the first VEC_SIZE bytes. Search for both CHAR and the
 	   null bytes.  */
-	VMOVU	(%rdi), %YMM1
-
+	VMOVU	(%rdi), %VMM(1)
 	/* Leaves only CHARS matching esi as 0.  */
-	vpxorq	%YMM1, %YMM0, %YMM2
-	VPMINU	%YMM2, %YMM1, %YMM2
-	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
-	VPTESTN	%YMM2, %YMM2, %k0
-	kmovd	%k0, %eax
-	testl	%eax, %eax
+	vpxorq	%VMM(1), %VMATCH, %VMM(2)
+	VPMINU	%VMM(2), %VMM(1), %VMM(2)
+	/* Each bit in K0 represents a CHAR or a null byte in VEC_1.  */
+	VPTESTN	%VMM(2), %VMM(2), %k0
+	KMOV	%k0, %VRAX
+# if VEC_SIZE == 64 && defined USE_AS_STRCHRNUL
+	/* If VEC_SIZE == 64 && STRCHRNUL use bsf to test the condition
+	   so that all logic for match/null in the first VEC fits in 1x
+	   cache line.  This has a slight cost for larger sizes.  */
+	bsf	%VRAX, %VRAX
+	jz	L(aligned_more)
+# else
+	test	%VRAX, %VRAX
 	jz	L(aligned_more)
-	tzcntl	%eax, %eax
+	bsf	%VRAX, %VRAX
+# endif
 # ifndef USE_AS_STRCHRNUL
 	/* Found CHAR or the null byte.  */
 	cmp	(%rdi, %rax, CHAR_SIZE), %CHAR_REG
@@ -109,287 +144,374 @@ ENTRY_P2ALIGN (STRCHR, 5)
 # endif
 	ret
 
-
-
-	.p2align 4,, 10
-L(first_vec_x4):
-# ifndef USE_AS_STRCHRNUL
-	/* Check to see if first match was CHAR (k0) or null (k1).  */
-	kmovd	%k0, %eax
-	tzcntl	%eax, %eax
-	kmovd	%k1, %ecx
-	/* bzhil will not be 0 if first match was null.  */
-	bzhil	%eax, %ecx, %ecx
-	jne	L(zero)
-# else
-	/* Combine CHAR and null matches.  */
-	kord	%k0, %k1, %k0
-	kmovd	%k0, %eax
-	tzcntl	%eax, %eax
-# endif
-	/* NB: Multiply sizeof char type (1 or 4) to get the number of
-	   bytes.  */
-	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
-	ret
-
 # ifndef USE_AS_STRCHRNUL
 L(zero):
 	xorl	%eax, %eax
 	ret
 # endif
 
-
-	.p2align 4
+	.p2align 4,, 2
+L(first_vec_x3):
+	subq	$-(VEC_SIZE * 2), %rdi
+# if VEC_SIZE == 32
+	/* Reuse L(first_vec_x3) for last VEC2 only for VEC_SIZE == 32.
+	   For VEC_SIZE == 64 the registers don't match.  */
+L(last_vec_x2):
+# endif
 L(first_vec_x1):
 	/* Use bsf here to save 1-byte keeping keeping the block in 1x
 	   fetch block. eax guranteed non-zero.  */
-	bsfl	%eax, %eax
+	bsf	%VRCX, %VRCX
 # ifndef USE_AS_STRCHRNUL
-	/* Found CHAR or the null byte.	 */
-	cmp	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
+	/* Found CHAR or the null byte.  */
+	cmp	(VEC_SIZE)(%rdi, %rcx, CHAR_SIZE), %CHAR_REG
 	jne	L(zero)
-
 # endif
 	/* NB: Multiply sizeof char type (1 or 4) to get the number of
 	   bytes.  */
-	leaq	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
+	leaq	(VEC_SIZE)(%rdi, %rcx, CHAR_SIZE), %rax
 	ret
 
-	.p2align 4,, 10
+	.p2align 4,, 2
+L(first_vec_x4):
+	subq	$-(VEC_SIZE * 2), %rdi
 L(first_vec_x2):
 # ifndef USE_AS_STRCHRNUL
 	/* Check to see if first match was CHAR (k0) or null (k1).  */
-	kmovd	%k0, %eax
-	tzcntl	%eax, %eax
-	kmovd	%k1, %ecx
+	KMOV	%k0, %VRAX
+	tzcnt	%VRAX, %VRAX
+	KMOV	%k1, %VRCX
 	/* bzhil will not be 0 if first match was null.  */
-	bzhil	%eax, %ecx, %ecx
+	bzhi	%VRAX, %VRCX, %VRCX
 	jne	L(zero)
 # else
 	/* Combine CHAR and null matches.  */
-	kord	%k0, %k1, %k0
-	kmovd	%k0, %eax
-	tzcntl	%eax, %eax
+	KOR	%k0, %k1, %k0
+	KMOV	%k0, %VRAX
+	bsf	%VRAX, %VRAX
 # endif
 	/* NB: Multiply sizeof char type (1 or 4) to get the number of
 	   bytes.  */
 	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
 	ret
 
-	.p2align 4,, 10
-L(first_vec_x3):
-	/* Use bsf here to save 1-byte keeping keeping the block in 1x
-	   fetch block. eax guranteed non-zero.  */
-	bsfl	%eax, %eax
-# ifndef USE_AS_STRCHRNUL
-	/* Found CHAR or the null byte.	 */
-	cmp	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
-	jne	L(zero)
+# ifdef USE_AS_STRCHRNUL
+	/* We use this as a hook to get imm8 encoding for the jump to
+	   L(cross_page_boundary).  This allows the hot case of a
+	   match/null-term in the first VEC to fit entirely in 1 cache
+	   line.  */
+L(cross_page_boundary):
+	jmp	L(cross_page_boundary_real)
 # endif
-	/* NB: Multiply sizeof char type (1 or 4) to get the number of
-	   bytes.  */
-	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
-	ret
 
 	.p2align 4
 L(aligned_more):
+L(cross_page_continue):
 	/* Align data to VEC_SIZE.  */
 	andq	$-VEC_SIZE, %rdi
-L(cross_page_continue):
-	/* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time since
-	   data is only aligned to VEC_SIZE. Use two alternating methods
-	   for checking VEC to balance latency and port contention.  */
 
-	/* This method has higher latency but has better port
-	   distribution.  */
-	VMOVA	(VEC_SIZE)(%rdi), %YMM1
+	/* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time
+	   since data is only aligned to VEC_SIZE. Use two alternating
+	   methods for checking VEC to balance latency and port
+	   contention.  */
+
+    /* Method(1) with 8c latency:
+	   For VEC_SIZE == 32:
+	   p0 * 1.83, p1 * 0.83, p5 * 1.33
+	   For VEC_SIZE == 64:
+	   p0 * 2.50, p1 * 0.00, p5 * 1.50  */
+	VMOVA	(VEC_SIZE)(%rdi), %VMM(1)
 	/* Leaves only CHARS matching esi as 0.  */
-	vpxorq	%YMM1, %YMM0, %YMM2
-	VPMINU	%YMM2, %YMM1, %YMM2
-	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
-	VPTESTN	%YMM2, %YMM2, %k0
-	kmovd	%k0, %eax
-	testl	%eax, %eax
+	vpxorq	%VMM(1), %VMATCH, %VMM(2)
+	VPMINU	%VMM(2), %VMM(1), %VMM(2)
+	/* Each bit in K0 represents a CHAR or a null byte in VEC_1.  */
+	VPTESTN	%VMM(2), %VMM(2), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
 	jnz	L(first_vec_x1)
 
-	/* This method has higher latency but has better port
-	   distribution.  */
-	VMOVA	(VEC_SIZE * 2)(%rdi), %YMM1
-	/* Each bit in K0 represents a CHAR in YMM1.  */
-	VPCMP	$0, %YMM1, %YMM0, %k0
-	/* Each bit in K1 represents a CHAR in YMM1.  */
-	VPTESTN	%YMM1, %YMM1, %k1
-	kortestd	%k0, %k1
+    /* Method(2) with 6c latency:
+	   For VEC_SIZE == 32:
+	   p0 * 1.00, p1 * 0.00, p5 * 2.00
+	   For VEC_SIZE == 64:
+	   p0 * 1.00, p1 * 0.00, p5 * 2.00  */
+	VMOVA	(VEC_SIZE * 2)(%rdi), %VMM(1)
+	/* Each bit in K0 represents a CHAR in VEC_1.  */
+	VPCMPEQ	%VMM(1), %VMATCH, %k0
+	/* Each bit in K1 represents a CHAR in VEC_1.  */
+	VPTESTN	%VMM(1), %VMM(1), %k1
+	KORTEST %k0, %k1
 	jnz	L(first_vec_x2)
 
-	VMOVA	(VEC_SIZE * 3)(%rdi), %YMM1
+	/* By swapping between Method 1/2 we get a fairer port
+	   distribution and better throughput.  */
+
+	VMOVA	(VEC_SIZE * 3)(%rdi), %VMM(1)
 	/* Leaves only CHARS matching esi as 0.  */
-	vpxorq	%YMM1, %YMM0, %YMM2
-	VPMINU	%YMM2, %YMM1, %YMM2
-	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
-	VPTESTN	%YMM2, %YMM2, %k0
-	kmovd	%k0, %eax
-	testl	%eax, %eax
+	vpxorq	%VMM(1), %VMATCH, %VMM(2)
+	VPMINU	%VMM(2), %VMM(1), %VMM(2)
+	/* Each bit in K0 represents a CHAR or a null byte in VEC_1.  */
+	VPTESTN	%VMM(2), %VMM(2), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
 	jnz	L(first_vec_x3)
 
-	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM1
-	/* Each bit in K0 represents a CHAR in YMM1.  */
-	VPCMP	$0, %YMM1, %YMM0, %k0
-	/* Each bit in K1 represents a CHAR in YMM1.  */
-	VPTESTN	%YMM1, %YMM1, %k1
-	kortestd	%k0, %k1
+	VMOVA	(VEC_SIZE * 4)(%rdi), %VMM(1)
+	/* Each bit in K0 represents a CHAR in VEC_1.  */
+	VPCMPEQ	%VMM(1), %VMATCH, %k0
+	/* Each bit in K1 represents a CHAR in VEC_1.  */
+	VPTESTN	%VMM(1), %VMM(1), %k1
+	KORTEST %k0, %k1
 	jnz	L(first_vec_x4)
 
 	/* Align data to VEC_SIZE * 4 for the loop.  */
+# if VEC_SIZE == 64
+	/* Use rax for the loop reg as it allows the loop to fit in
+	   exactly 2 cache lines (more efficient imm32 + gpr
+	   encoding).  */
+	leaq	(VEC_SIZE)(%rdi), %rax
+	/* No partial register stalls on evex512 processors.  */
+	xorb	%al, %al
+# else
+	/* For VEC_SIZE == 32 continue using rdi for loop reg so we can
+	   reuse more code and save space.  */
 	addq	$VEC_SIZE, %rdi
 	andq	$-(VEC_SIZE * 4), %rdi
-
+# endif
 	.p2align 4
 L(loop_4x_vec):
-	/* Check 4x VEC at a time. No penalty to imm32 offset with evex
-	   encoding.  */
-	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM1
-	VMOVA	(VEC_SIZE * 5)(%rdi), %YMM2
-	VMOVA	(VEC_SIZE * 6)(%rdi), %YMM3
-	VMOVA	(VEC_SIZE * 7)(%rdi), %YMM4
-
-	/* For YMM1 and YMM3 use xor to set the CHARs matching esi to
+	/* Check 4x VEC at a time. No penalty for imm32 offset with evex
+	   encoding (if offset % VEC_SIZE == 0).  */
+	VMOVA	(VEC_SIZE * 4)(%LOOP_REG), %VMM(1)
+	VMOVA	(VEC_SIZE * 5)(%LOOP_REG), %VMM(2)
+	VMOVA	(VEC_SIZE * 6)(%LOOP_REG), %VMM(3)
+	VMOVA	(VEC_SIZE * 7)(%LOOP_REG), %VMM(4)
+
+	/* Collect bits where VEC_1 does NOT match esi.  This is later
+	   used to mask off results (collecting non-matches allows us
+	   to save an instruction when combining).  */
+	VPCMP	$4, %VMATCH, %VMM(1), %k1
+
+	/* Two methods for the loop depending on VEC_SIZE.  This is
+	   because with zmm registers VPMINU can only run on p0 (as
+	   opposed to p0/p1 for ymm) so it is less preferred.  */
+# if VEC_SIZE == 32
+	/* For VEC_2 and VEC_3 use xor to set the CHARs matching esi to
 	   zero.  */
-	vpxorq	%YMM1, %YMM0, %YMM5
-	/* For YMM2 and YMM4 cmp not equals to CHAR and store result in
-	   k register. Its possible to save either 1 or 2 instructions
-	   using cmp no equals method for either YMM1 or YMM1 and YMM3
-	   respectively but bottleneck on p5 makes it not worth it.  */
-	VPCMP	$4, %YMM0, %YMM2, %k2
-	vpxorq	%YMM3, %YMM0, %YMM7
-	VPCMP	$4, %YMM0, %YMM4, %k4
-
-	/* Use min to select all zeros from either xor or end of string).
-	 */
-	VPMINU	%YMM1, %YMM5, %YMM1
-	VPMINU	%YMM3, %YMM7, %YMM3
+	vpxorq	%VMM(2), %VMATCH, %VMM(6)
+	vpxorq	%VMM(3), %VMATCH, %VMM(7)
 
-	/* Use min + zeromask to select for zeros. Since k2 and k4 will
-	   have 0 as positions that matched with CHAR which will set
-	   zero in the corresponding destination bytes in YMM2 / YMM4.
-	 */
-	VPMINU	%YMM1, %YMM2, %YMM2{%k2}{z}
-	VPMINU	%YMM3, %YMM4, %YMM4
-	VPMINU	%YMM2, %YMM4, %YMM4{%k4}{z}
-
-	VPTESTN	%YMM4, %YMM4, %k1
-	kmovd	%k1, %ecx
-	subq	$-(VEC_SIZE * 4), %rdi
-	testl	%ecx, %ecx
+	/* Find non-matches in VEC_4 while combining with non-matches
+	   from VEC_1.  NB: Try to use masked predicate execution on
+	   instructions that have a mask result, as it has no latency
+	   penalty.  */
+	VPCMP	$4, %VMATCH, %VMM(4), %k4{%k1}
+
+	/* Combined zeros from VEC_1 / VEC_2 (search for null term).  */
+	VPMINU	%VMM(1), %VMM(2), %VMM(2)
+
+	/* Use min to select all zeros from either the xor or the end
+	   of the string.  */
+	VPMINU	%VMM(3), %VMM(7), %VMM(3)
+	VPMINU	%VMM(2), %VMM(6), %VMM(2)
+
+	/* Combined zeros from VEC_3 / VEC_4 (search for null term).  */
+	VPMINU	%VMM(3), %VMM(4), %VMM(4)
+
+	/* Combined zeros from all 4x VEC (this has all null terms and
+	   the esi matches for VEC_2 / VEC_3).  */
+	VPMINU	%VMM(2), %VMM(4), %VMM(4)
+# else
+	/* Collect non-matches for VEC_2.  */
+	VPCMP	$4, %VMM(2), %VMATCH, %k2
+
+	/* Combined zeros from VEC_1 / VEC_2 (search for null term).  */
+	VPMINU	%VMM(1), %VMM(2), %VMM(2)
+
+	/* Find non-matches in VEC_3/VEC_4 while combining with non-
+	   matches from VEC_1/VEC_2 respectively.  */
+	VPCMP	$4, %VMM(3), %VMATCH, %k3{%k1}
+	VPCMP	$4, %VMM(4), %VMATCH, %k4{%k2}
+
+	/* Finish combining zeros in all VECs.  */
+	VPMINU	%VMM(3), %VMM(4), %VMM(4)
+
+	/* Combine in esi matches for VEC_3 (if there was a match with
+	   esi, the corresponding bit in %k3 is zero so the
+	   VPMINU_MASKZ will have a zero in the result).  NB: This
+	   makes the VPMINU 3c latency.  The only way to avoid it is to
+	   create a 12c dependency chain on all the `VPCMP $4, ...`,
+	   which has higher total latency.  */
+	VPMINU	%VMM(2), %VMM(4), %VMM(4){%k3}{z}
+# endif
+	VPTEST	%VMM(4), %VMM(4), %k0{%k4}
+	KMOV	%k0, %VRDX
+	subq	$-(VEC_SIZE * 4), %LOOP_REG
+
+	/* TESTZ is an `inc` using the proper register width for
+	   CHAR_PER_VEC.  An esi match or null-term match leaves a
+	   zero bit in rdx, so the inc cannot overflow to zero.  */
+	TESTZ	(rdx)
 	jz	L(loop_4x_vec)
 
-	VPTESTN	%YMM1, %YMM1, %k0
-	kmovd	%k0, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x1)
+	VPTEST	%VMM(1), %VMM(1), %k0{%k1}
+	KMOV	%k0, %VGPR(MASK_GPR)
+	TESTZ	(MASK_GPR)
+# if VEC_SIZE == 32
+	/* We can reuse the return code in page_cross logic for VEC_SIZE
+	   == 32.  */
+	jnz	L(last_vec_x1_vec_size32)
+# else
+	jnz	L(last_vec_x1_vec_size64)
+# endif
+
 
-	VPTESTN	%YMM2, %YMM2, %k0
-	kmovd	%k0, %eax
-	testl	%eax, %eax
+	/* COND_MASK integrates the esi matches for VEC_SIZE == 64.
+	   For VEC_SIZE == 32 they are already integrated.  */
+	VPTEST	%VMM(2), %VMM(2), %k0 COND_MASK(k2)
+	KMOV	%k0, %VRCX
+	TESTZ	(rcx)
 	jnz	L(last_vec_x2)
 
-	VPTESTN	%YMM3, %YMM3, %k0
-	kmovd	%k0, %eax
-	/* Combine YMM3 matches (eax) with YMM4 matches (ecx).  */
-# ifdef USE_AS_WCSCHR
-	sall	$8, %ecx
-	orl	%ecx, %eax
-	bsfl	%eax, %eax
+	VPTEST	%VMM(3), %VMM(3), %k0 COND_MASK(k3)
+	KMOV	%k0, %VRCX
+# if CHAR_PER_VEC == 64
+	TESTZ	(rcx)
+	jnz	L(last_vec_x3)
 # else
-	salq	$32, %rcx
-	orq	%rcx, %rax
-	bsfq	%rax, %rax
+	salq	$CHAR_PER_VEC, %rdx
+	TESTZ	(rcx)
+	orq	%rcx, %rdx
 # endif
+
+	bsfq	%rdx, %rdx
+
 # ifndef USE_AS_STRCHRNUL
 	/* Check if match was CHAR or null.  */
-	cmp	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
+	cmp	(LAST_VEC_OFFSET)(%LOOP_REG, %rdx, CHAR_SIZE), %CHAR_REG
 	jne	L(zero_end)
 # endif
 	/* NB: Multiply sizeof char type (1 or 4) to get the number of
 	   bytes.  */
-	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+	leaq	(LAST_VEC_OFFSET)(%LOOP_REG, %rdx, CHAR_SIZE), %rax
 	ret
 
-	.p2align 4,, 8
-L(last_vec_x1):
-	bsfl	%eax, %eax
-# ifdef USE_AS_WCSCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.
-	   */
-	leaq	(%rdi, %rax, CHAR_SIZE), %rax
-# else
-	addq	%rdi, %rax
+# ifndef USE_AS_STRCHRNUL
+L(zero_end):
+	xorl	%eax, %eax
+	ret
 # endif
 
-# ifndef USE_AS_STRCHRNUL
+
+	/* Separate return label for the last VEC1 because for
+	   VEC_SIZE == 32 we can reuse the return code in
+	   L(page_cross), but VEC_SIZE == 64 has mismatched registers.  */
+# if VEC_SIZE == 64
+	.p2align 4,, 8
+L(last_vec_x1_vec_size64):
+	bsf	%VRCX, %VRCX
+#  ifndef USE_AS_STRCHRNUL
 	/* Check if match was null.  */
-	cmp	(%rax), %CHAR_REG
+	cmp	(%rax, %rcx, CHAR_SIZE), %CHAR_REG
 	jne	L(zero_end)
-# endif
-
+#  endif
+#  ifdef USE_AS_WCSCHR
+	/* NB: Multiply wchar_t count by 4 to get the number of bytes.
+	 */
+	leaq	(%rax, %rcx, CHAR_SIZE), %rax
+#  else
+	addq	%rcx, %rax
+#  endif
 	ret
 
+	/* Since we can't combine the last 2x matches for CHAR_PER_VEC
+	   == 64 we need a return label for the last VEC3.  */
+#  if CHAR_PER_VEC == 64
 	.p2align 4,, 8
+L(last_vec_x3):
+	addq	$VEC_SIZE, %LOOP_REG
+#  endif
+
+	/* Duplicate L(last_vec_x2) for VEC_SIZE == 64 because we can't
+	   reuse L(first_vec_x3) due to register mismatch.  */
 L(last_vec_x2):
-	bsfl	%eax, %eax
-# ifndef USE_AS_STRCHRNUL
+	bsf	%VGPR(MASK_GPR), %VGPR(MASK_GPR)
+#  ifndef USE_AS_STRCHRNUL
 	/* Check if match was null.  */
-	cmp	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
+	cmp	(VEC_SIZE * 1)(%LOOP_REG, %MASK_GPR, CHAR_SIZE), %CHAR_REG
 	jne	L(zero_end)
-# endif
+#  endif
 	/* NB: Multiply sizeof char type (1 or 4) to get the number of
 	   bytes.  */
-	leaq	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
+	leaq	(VEC_SIZE * 1)(%LOOP_REG, %MASK_GPR, CHAR_SIZE), %rax
 	ret
+# endif
 
-	/* Cold case for crossing page with first load.	 */
-	.p2align 4,, 8
+	/* Cold case for crossing page with first load.  */
+	.p2align 4,, 10
+# ifndef USE_AS_STRCHRNUL
 L(cross_page_boundary):
-	movq	%rdi, %rdx
+# endif
+L(cross_page_boundary_real):
 	/* Align rdi.  */
-	andq	$-VEC_SIZE, %rdi
-	VMOVA	(%rdi), %YMM1
-	/* Leaves only CHARS matching esi as 0.  */
-	vpxorq	%YMM1, %YMM0, %YMM2
-	VPMINU	%YMM2, %YMM1, %YMM2
-	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
-	VPTESTN	%YMM2, %YMM2, %k0
-	kmovd	%k0, %eax
+	xorq	%rdi, %rax
+	VMOVA	(PAGE_SIZE - VEC_SIZE)(%rax), %VMM(1)
+	/* Use the higher-latency method of getting matches to save
+	   code size.  */
+
+	/* K1 has 1s where VEC(1) does NOT match esi.  */
+	VPCMP	$4, %VMM(1), %VMATCH, %k1
+	/* K0 has ones where K1 is 1 (no esi match) and the CHAR is
+	   non-zero (no null).  */
+	VPTEST	%VMM(1), %VMM(1), %k0{%k1}
+	KMOV	%k0, %VRAX
 	/* Remove the leading bits.  */
 # ifdef USE_AS_WCSCHR
-	movl	%edx, %SHIFT_REG
+	movl	%edi, %VGPR_SZ(SHIFT_REG, 32)
 	/* NB: Divide shift count by 4 since each bit in K1 represent 4
 	   bytes.  */
-	sarl	$2, %SHIFT_REG
-	andl	$(CHAR_PER_VEC - 1), %SHIFT_REG
+	sarl	$2, %VGPR_SZ(SHIFT_REG, 32)
+	andl	$(CHAR_PER_VEC - 1), %VGPR_SZ(SHIFT_REG, 32)
+
+	/* For wcschr we need to invert the matches as we can't rely on
+	   a signed shift to bring in ones.  There is no sarx for
+	   gpr8/16.  Also note we can't use inc here as the lower bits
+	   represent matches out of range, so we can't rely on
+	   overflow.  */
+	xorl	$((1 << CHAR_PER_VEC)- 1), %eax
+# endif
+	/* Use arithmetic shift so that leading 1s are filled in.  */
+	sarx	%VGPR(SHIFT_REG), %VRAX, %VRAX
+	/* If eax is all ones then no matches for esi or NULL.  */
+
+# ifdef USE_AS_WCSCHR
+	test	%VRAX, %VRAX
+# else
+	inc	%VRAX
 # endif
-	sarxl	%SHIFT_REG, %eax, %eax
-	/* If eax is zero continue.  */
-	testl	%eax, %eax
 	jz	L(cross_page_continue)
-	bsfl	%eax, %eax
 
+	.p2align 4,, 10
+L(last_vec_x1_vec_size32):
+	bsf	%VRAX, %VRAX
 # ifdef USE_AS_WCSCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of
-	   bytes.  */
-	leaq	(%rdx, %rax, CHAR_SIZE), %rax
+	/* NB: Multiply wchar_t count by 4 to get the number of bytes.
+	 */
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
 # else
-	addq	%rdx, %rax
+	addq	%rdi, %rax
 # endif
 # ifndef USE_AS_STRCHRNUL
 	/* Check to see if match was CHAR or null.  */
 	cmp	(%rax), %CHAR_REG
-	je	L(cross_page_ret)
-L(zero_end):
-	xorl	%eax, %eax
-L(cross_page_ret):
+	jne	L(zero_end_0)
 # endif
 	ret
+# ifndef USE_AS_STRCHRNUL
+L(zero_end_0):
+	xorl	%eax, %eax
+	ret
+# endif
 
 END (STRCHR)
 #endif
-- 
2.34.1
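
A rough C-intrinsics sketch of the xor + vpminu trick used in the loop
above (function name and the 256-bit width are illustrative, not from
the patch): bytes equal to the search CHAR become zero after the xor,
and the unsigned min with the original data preserves any null bytes,
so a single vptestnm yields one combined "CHAR match or null" mask:

    #include <immintrin.h>

    static __mmask32
    char_or_null_mask (__m256i data, __m256i match_broadcast)
    {
        __m256i x = _mm256_xor_si256 (data, match_broadcast); /* CHAR -> 0 */
        __m256i m = _mm256_min_epu8 (x, data);  /* keep null bytes zero */
        return _mm256_testn_epi8_mask (m, m);   /* one bit per zero byte */
    }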


^ permalink raw reply	[flat|nested] 41+ messages in thread

* [PATCH v2 3/7] x86: Optimize strnlen-evex.S and implement with VMM headers
  2022-10-18 23:19 ` [PATCH v2 " Noah Goldstein
  2022-10-18 23:19   ` [PATCH v2 2/7] x86: Shrink / minorly optimize strchr-evex " Noah Goldstein
@ 2022-10-18 23:19   ` Noah Goldstein
  2022-10-18 23:19   ` [PATCH v2 4/7] x86: Optimize memrchr-evex.S Noah Goldstein
                     ` (3 subsequent siblings)
  5 siblings, 0 replies; 41+ messages in thread
From: Noah Goldstein @ 2022-10-18 23:19 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

Optimizations are:
1. Use the fact that bsf(0) leaves the destination unchanged to save a
   branch in the short string case (see the sketch after this list).
2. Restructure code so that small strings are given the hot path.
        - This is a net-zero on the benchmark suite but in general makes
      sense as smaller sizes are far more common.
3. Use more code-size efficient instructions.
	- tzcnt ...     -> bsf ...
	- vpcmpb $0 ... -> vpcmpeq ...
4. Align labels less aggressively, especially if it doesn't save fetch
   blocks / causes the basic-block to span extra cache-lines.
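
A hedged GNU C sketch of point 1 (the name is illustrative; bsf leaving
its destination unchanged for a zero source is de facto behavior on
both Intel and AMD, which these implementations rely on):

    #include <stdint.h>

    /* Bit index of the first set bit in MASK, or DFLT when MASK is
       zero, with no branch.  */
    static inline uint64_t
    bsf_or_default (uint64_t mask, uint64_t dflt)
    {
        uint64_t pos = dflt;
        __asm__ ("bsfq %1, %0" : "+r" (pos) : "r" (mask) : "cc");
        return pos;
    }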

The optimizations (especially for point 2) make the strnlen and
strlen code essentially incompatible so split strnlen-evex
to a new file.

Code Size Changes:
strlen-evex.S       :  -23 bytes
strnlen-evex.S      : -167 bytes

Net perf changes:

Reported as geometric mean of all improvements / regressions from N=10
runs of the benchtests. Values are New Time / Old Time, so < 1.0 is an
improvement and > 1.0 is a regression.
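
For concreteness, the reported number can be computed as below (a
minimal sketch; the function name is illustrative and not part of the
benchtest harness):

    #include <math.h>
    #include <stddef.h>

    /* Geometric mean of new_time[i] / old_time[i]; < 1.0 means the
       new implementation is faster overall.  */
    static double
    geomean_ratio (const double *new_time, const double *old_time,
                   size_t n)
    {
        double log_sum = 0.0;
        for (size_t i = 0; i < n; i++)
            log_sum += log (new_time[i] / old_time[i]);
        return exp (log_sum / n);
    }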

strlen-evex.S       : 0.992 (No real change)
strnlen-evex.S      : 0.947

Full results attached in email.

Full check passes on x86-64.
---
 sysdeps/x86_64/multiarch/strlen-evex.S  | 544 +++++++-----------------
 sysdeps/x86_64/multiarch/strnlen-evex.S | 427 ++++++++++++++++++-
 sysdeps/x86_64/multiarch/wcsnlen-evex.S |   5 +-
 3 files changed, 572 insertions(+), 404 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/strlen-evex.S b/sysdeps/x86_64/multiarch/strlen-evex.S
index 2109ec2f7a..487846f098 100644
--- a/sysdeps/x86_64/multiarch/strlen-evex.S
+++ b/sysdeps/x86_64/multiarch/strlen-evex.S
@@ -26,466 +26,220 @@
 #  define STRLEN	__strlen_evex
 # endif
 
-# define VMOVA		vmovdqa64
+# ifndef VEC_SIZE
+#  include "x86-evex256-vecs.h"
+# endif
 
 # ifdef USE_AS_WCSLEN
-#  define VPCMP		vpcmpd
+#  define VPCMPEQ	vpcmpeqd
+#  define VPCMPNEQ	vpcmpneqd
+#  define VPTESTN	vptestnmd
+#  define VPTEST	vptestmd
 #  define VPMINU	vpminud
-#  define SHIFT_REG ecx
 #  define CHAR_SIZE	4
+#  define CHAR_SIZE_SHIFT_REG(reg)	sar $2, %reg
 # else
-#  define VPCMP		vpcmpb
+#  define VPCMPEQ	vpcmpeqb
+#  define VPCMPNEQ	vpcmpneqb
+#  define VPTESTN	vptestnmb
+#  define VPTEST	vptestmb
 #  define VPMINU	vpminub
-#  define SHIFT_REG edx
 #  define CHAR_SIZE	1
+#  define CHAR_SIZE_SHIFT_REG(reg)
+
+#  define REG_WIDTH	VEC_SIZE
 # endif
 
-# define XMMZERO	xmm16
-# define YMMZERO	ymm16
-# define YMM1		ymm17
-# define YMM2		ymm18
-# define YMM3		ymm19
-# define YMM4		ymm20
-# define YMM5		ymm21
-# define YMM6		ymm22
-
-# define VEC_SIZE 32
-# define PAGE_SIZE 4096
-# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
-
-	.section .text.evex,"ax",@progbits
-ENTRY (STRLEN)
-# ifdef USE_AS_STRNLEN
-	/* Check zero length.  */
-	test	%RSI_LP, %RSI_LP
-	jz	L(zero)
-#  ifdef __ILP32__
-	/* Clear the upper 32 bits.  */
-	movl	%esi, %esi
-#  endif
-	mov	%RSI_LP, %R8_LP
+# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
+
+# include "reg-macros.h"
+
+# if CHAR_PER_VEC == 64
+
+#  define TAIL_RETURN_LBL	first_vec_x2
+#  define TAIL_RETURN_OFFSET	(CHAR_PER_VEC * 2)
+
+#  define FALLTHROUGH_RETURN_LBL	first_vec_x3
+#  define FALLTHROUGH_RETURN_OFFSET	(CHAR_PER_VEC * 3)
+
+# else
+
+#  define TAIL_RETURN_LBL	first_vec_x3
+#  define TAIL_RETURN_OFFSET	(CHAR_PER_VEC * 3)
+
+#  define FALLTHROUGH_RETURN_LBL	first_vec_x2
+#  define FALLTHROUGH_RETURN_OFFSET	(CHAR_PER_VEC * 2)
 # endif
+
+# define XZERO	VMM_128(0)
+# define VZERO	VMM(0)
+# define PAGE_SIZE	4096
+
+	.section SECTION(.text), "ax", @progbits
+ENTRY_P2ALIGN (STRLEN, 6)
 	movl	%edi, %eax
-	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
-	/* Clear high bits from edi. Only keeping bits relevant to page
-	   cross check.  */
+	vpxorq	%XZERO, %XZERO, %XZERO
 	andl	$(PAGE_SIZE - 1), %eax
-	/* Check if we may cross page boundary with one vector load.  */
 	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
 	ja	L(cross_page_boundary)
 
 	/* Check the first VEC_SIZE bytes.  Each bit in K0 represents a
 	   null byte.  */
-	VPCMP	$0, (%rdi), %YMMZERO, %k0
-	kmovd	%k0, %eax
-# ifdef USE_AS_STRNLEN
-	/* If length < CHAR_PER_VEC handle special.  */
-	cmpq	$CHAR_PER_VEC, %rsi
-	jbe	L(first_vec_x0)
-# endif
-	testl	%eax, %eax
+	VPCMPEQ	(%rdi), %VZERO, %k0
+	KMOV	%k0, %VRAX
+	test	%VRAX, %VRAX
 	jz	L(aligned_more)
-	tzcntl	%eax, %eax
-	ret
-# ifdef USE_AS_STRNLEN
-L(zero):
-	xorl	%eax, %eax
-	ret
-
-	.p2align 4
-L(first_vec_x0):
-	/* Set bit for max len so that tzcnt will return min of max len
-	   and position of first match.  */
-	btsq	%rsi, %rax
-	tzcntl	%eax, %eax
-	ret
-# endif
-
-	.p2align 4
-L(first_vec_x1):
-	tzcntl	%eax, %eax
-	/* Safe to use 32 bit instructions as these are only called for
-	   size = [1, 159].  */
-# ifdef USE_AS_STRNLEN
-	/* Use ecx which was computed earlier to compute correct value.
-	 */
-	leal	-(CHAR_PER_VEC * 4 + 1)(%rcx, %rax), %eax
-# else
-	subl	%edx, %edi
-#  ifdef USE_AS_WCSLEN
-	/* NB: Divide bytes by 4 to get the wchar_t count.  */
-	sarl	$2, %edi
-#  endif
-	leal	CHAR_PER_VEC(%rdi, %rax), %eax
-# endif
-	ret
-
-	.p2align 4
-L(first_vec_x2):
-	tzcntl	%eax, %eax
-	/* Safe to use 32 bit instructions as these are only called for
-	   size = [1, 159].  */
-# ifdef USE_AS_STRNLEN
-	/* Use ecx which was computed earlier to compute correct value.
-	 */
-	leal	-(CHAR_PER_VEC * 3 + 1)(%rcx, %rax), %eax
-# else
-	subl	%edx, %edi
-#  ifdef USE_AS_WCSLEN
-	/* NB: Divide bytes by 4 to get the wchar_t count.  */
-	sarl	$2, %edi
-#  endif
-	leal	(CHAR_PER_VEC * 2)(%rdi, %rax), %eax
-# endif
+	bsf	%VRAX, %VRAX
 	ret
 
-	.p2align 4
-L(first_vec_x3):
-	tzcntl	%eax, %eax
-	/* Safe to use 32 bit instructions as these are only called for
-	   size = [1, 159].  */
-# ifdef USE_AS_STRNLEN
-	/* Use ecx which was computed earlier to compute correct value.
-	 */
-	leal	-(CHAR_PER_VEC * 2 + 1)(%rcx, %rax), %eax
-# else
-	subl	%edx, %edi
-#  ifdef USE_AS_WCSLEN
-	/* NB: Divide bytes by 4 to get the wchar_t count.  */
-	sarl	$2, %edi
-#  endif
-	leal	(CHAR_PER_VEC * 3)(%rdi, %rax), %eax
-# endif
-	ret
-
-	.p2align 4
+	.p2align 4,, 8
 L(first_vec_x4):
-	tzcntl	%eax, %eax
-	/* Safe to use 32 bit instructions as these are only called for
-	   size = [1, 159].  */
-# ifdef USE_AS_STRNLEN
-	/* Use ecx which was computed earlier to compute correct value.
-	 */
-	leal	-(CHAR_PER_VEC + 1)(%rcx, %rax), %eax
-# else
-	subl	%edx, %edi
-#  ifdef USE_AS_WCSLEN
-	/* NB: Divide bytes by 4 to get the wchar_t count.  */
-	sarl	$2, %edi
-#  endif
+	bsf	%VRAX, %VRAX
+	subl	%ecx, %edi
+	CHAR_SIZE_SHIFT_REG (edi)
 	leal	(CHAR_PER_VEC * 4)(%rdi, %rax), %eax
-# endif
 	ret
 
-	.p2align 5
+
+
+	/* L(aligned_more) for strnlen compares the remaining length
+	   against 2 * CHAR_PER_VEC, 4 * CHAR_PER_VEC, and 8 *
+	   CHAR_PER_VEC before going to the loop.  */
+	.p2align 4,, 10
 L(aligned_more):
-	movq	%rdi, %rdx
-	/* Align data to VEC_SIZE.  */
-	andq	$-(VEC_SIZE), %rdi
+	movq	%rdi, %rcx
+	andq	$(VEC_SIZE * -1), %rdi
 L(cross_page_continue):
-	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
-	   since data is only aligned to VEC_SIZE.  */
-# ifdef USE_AS_STRNLEN
-	/* + CHAR_SIZE because it simplies the logic in
-	   last_4x_vec_or_less.  */
-	leaq	(VEC_SIZE * 5 + CHAR_SIZE)(%rdi), %rcx
-	subq	%rdx, %rcx
-#  ifdef USE_AS_WCSLEN
-	/* NB: Divide bytes by 4 to get the wchar_t count.  */
-	sarl	$2, %ecx
-#  endif
-# endif
-	/* Load first VEC regardless.  */
-	VPCMP	$0, VEC_SIZE(%rdi), %YMMZERO, %k0
-# ifdef USE_AS_STRNLEN
-	/* Adjust length. If near end handle specially.  */
-	subq	%rcx, %rsi
-	jb	L(last_4x_vec_or_less)
-# endif
-	kmovd	%k0, %eax
-	testl	%eax, %eax
+	/* Remaining length >= 2 * CHAR_PER_VEC so do VEC0/VEC1 without
+	   rechecking bounds.  */
+	VPCMPEQ	(VEC_SIZE * 1)(%rdi), %VZERO, %k0
+	KMOV	%k0, %VRAX
+	test	%VRAX, %VRAX
 	jnz	L(first_vec_x1)
 
-	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
-	kmovd	%k0, %eax
-	test	%eax, %eax
+	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %VZERO, %k0
+	KMOV	%k0, %VRAX
+	test	%VRAX, %VRAX
 	jnz	L(first_vec_x2)
 
-	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
-	kmovd	%k0, %eax
-	testl	%eax, %eax
+	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %VZERO, %k0
+	KMOV	%k0, %VRAX
+	test	%VRAX, %VRAX
 	jnz	L(first_vec_x3)
 
-	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0
-	kmovd	%k0, %eax
-	testl	%eax, %eax
+	VPCMPEQ	(VEC_SIZE * 4)(%rdi), %VZERO, %k0
+	KMOV	%k0, %VRAX
+	test	%VRAX, %VRAX
 	jnz	L(first_vec_x4)
 
-	addq	$VEC_SIZE, %rdi
-# ifdef USE_AS_STRNLEN
-	/* Check if at last VEC_SIZE * 4 length.  */
-	cmpq	$(CHAR_PER_VEC * 4 - 1), %rsi
-	jbe	L(last_4x_vec_or_less_load)
-	movl	%edi, %ecx
-	andl	$(VEC_SIZE * 4 - 1), %ecx
-#  ifdef USE_AS_WCSLEN
-	/* NB: Divide bytes by 4 to get the wchar_t count.  */
-	sarl	$2, %ecx
-#  endif
-	/* Readjust length.  */
-	addq	%rcx, %rsi
-# endif
-	/* Align data to VEC_SIZE * 4.  */
+	subq	$(VEC_SIZE * -1), %rdi
+
+# if CHAR_PER_VEC == 64
+	/* No partial register stalls on the processors we use evex512
+	   on, and this saves code size.  */
+	xorb	%dil, %dil
+# else
 	andq	$-(VEC_SIZE * 4), %rdi
+# endif
+
+
 
 	/* Compare 4 * VEC at a time forward.  */
 	.p2align 4
 L(loop_4x_vec):
-	/* Load first VEC regardless.  */
-	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM1
-# ifdef USE_AS_STRNLEN
-	/* Break if at end of length.  */
-	subq	$(CHAR_PER_VEC * 4), %rsi
-	jb	L(last_4x_vec_or_less_cmpeq)
-# endif
-	/* Save some code size by microfusing VPMINU with the load. Since
-	   the matches in ymm2/ymm4 can only be returned if there where no
-	   matches in ymm1/ymm3 respectively there is no issue with overlap.
-	 */
-	VPMINU	(VEC_SIZE * 5)(%rdi), %YMM1, %YMM2
-	VMOVA	(VEC_SIZE * 6)(%rdi), %YMM3
-	VPMINU	(VEC_SIZE * 7)(%rdi), %YMM3, %YMM4
+	VMOVA	(VEC_SIZE * 4)(%rdi), %VMM(1)
+	VPMINU	(VEC_SIZE * 5)(%rdi), %VMM(1), %VMM(2)
+	VMOVA	(VEC_SIZE * 6)(%rdi), %VMM(3)
+	VPMINU	(VEC_SIZE * 7)(%rdi), %VMM(3), %VMM(4)
+	VPTESTN	%VMM(2), %VMM(2), %k0
+	VPTESTN	%VMM(4), %VMM(4), %k2
 
-	VPCMP	$0, %YMM2, %YMMZERO, %k0
-	VPCMP	$0, %YMM4, %YMMZERO, %k1
 	subq	$-(VEC_SIZE * 4), %rdi
-	kortestd	%k0, %k1
+	KORTEST %k0, %k2
 	jz	L(loop_4x_vec)
 
-	/* Check if end was in first half.  */
-	kmovd	%k0, %eax
-	subq	%rdx, %rdi
-# ifdef USE_AS_WCSLEN
-	shrq	$2, %rdi
-# endif
-	testl	%eax, %eax
-	jz	L(second_vec_return)
+	VPTESTN	%VMM(1), %VMM(1), %k1
+	KMOV	%k1, %VRAX
+	test	%VRAX, %VRAX
+	jnz	L(first_vec_x0)
 
-	VPCMP	$0, %YMM1, %YMMZERO, %k2
-	kmovd	%k2, %edx
-	/* Combine VEC1 matches (edx) with VEC2 matches (eax).  */
-# ifdef USE_AS_WCSLEN
-	sall	$CHAR_PER_VEC, %eax
-	orl	%edx, %eax
-	tzcntl	%eax, %eax
-# else
-	salq	$CHAR_PER_VEC, %rax
-	orq	%rdx, %rax
-	tzcntq	%rax, %rax
-# endif
-	addq	%rdi, %rax
-	ret
-
-
-# ifdef USE_AS_STRNLEN
-
-L(last_4x_vec_or_less_load):
-	/* Depending on entry adjust rdi / prepare first VEC in YMM1.  */
-	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM1
-L(last_4x_vec_or_less_cmpeq):
-	VPCMP	$0, %YMM1, %YMMZERO, %k0
-	addq	$(VEC_SIZE * 3), %rdi
-L(last_4x_vec_or_less):
-	kmovd	%k0, %eax
-	/* If remaining length > VEC_SIZE * 2. This works if esi is off by
-	   VEC_SIZE * 4.  */
-	testl	$(CHAR_PER_VEC * 2), %esi
-	jnz	L(last_4x_vec)
-
-	/* length may have been negative or positive by an offset of
-	   CHAR_PER_VEC * 4 depending on where this was called from. This
-	   fixes that.  */
-	andl	$(CHAR_PER_VEC * 4 - 1), %esi
-	testl	%eax, %eax
-	jnz	L(last_vec_x1_check)
+	KMOV	%k0, %VRAX
+	test	%VRAX, %VRAX
+	jnz	L(first_vec_x1)
 
-	/* Check the end of data.  */
-	subl	$CHAR_PER_VEC, %esi
-	jb	L(max)
+	VPTESTN	%VMM(3), %VMM(3), %k0
 
-	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
-	kmovd	%k0, %eax
-	tzcntl	%eax, %eax
-	/* Check the end of data.  */
-	cmpl	%eax, %esi
-	jb	L(max)
-
-	subq	%rdx, %rdi
-#  ifdef USE_AS_WCSLEN
-	/* NB: Divide bytes by 4 to get the wchar_t count.  */
-	sarq	$2, %rdi
-#  endif
-	leaq	(CHAR_PER_VEC * 2)(%rdi, %rax), %rax
-	ret
-L(max):
-	movq	%r8, %rax
-	ret
-# endif
-
-	/* Placed here in strnlen so that the jcc L(last_4x_vec_or_less)
-	   in the 4x VEC loop can use 2 byte encoding.  */
-	.p2align 4
-L(second_vec_return):
-	VPCMP	$0, %YMM3, %YMMZERO, %k0
-	/* Combine YMM3 matches (k0) with YMM4 matches (k1).  */
-# ifdef USE_AS_WCSLEN
-	kunpckbw	%k0, %k1, %k0
-	kmovd	%k0, %eax
-	tzcntl	%eax, %eax
+# if CHAR_PER_VEC == 64
+	KMOV	%k0, %VRAX
+	test	%VRAX, %VRAX
+	jnz	L(first_vec_x2)
+	KMOV	%k2, %VRAX
 # else
-	kunpckdq	%k0, %k1, %k0
-	kmovq	%k0, %rax
-	tzcntq	%rax, %rax
+	/* We can only combine last 2x VEC masks if CHAR_PER_VEC <= 32.
+	 */
+	kmovd	%k2, %edx
+	kmovd	%k0, %eax
+	salq	$CHAR_PER_VEC, %rdx
+	orq	%rdx, %rax
 # endif
-	leaq	(CHAR_PER_VEC * 2)(%rdi, %rax), %rax
-	ret
 
-
-# ifdef USE_AS_STRNLEN
-L(last_vec_x1_check):
-	tzcntl	%eax, %eax
-	/* Check the end of data.  */
-	cmpl	%eax, %esi
-	jb	L(max)
-	subq	%rdx, %rdi
-#  ifdef USE_AS_WCSLEN
-	/* NB: Divide bytes by 4 to get the wchar_t count.  */
-	sarq	$2, %rdi
-#  endif
-	leaq	(CHAR_PER_VEC)(%rdi, %rax), %rax
+	/* first_vec_x3 for strlen-ZMM and first_vec_x2 for strlen-YMM.
+	 */
+	.p2align 4,, 2
+L(FALLTHROUGH_RETURN_LBL):
+	bsfq	%rax, %rax
+	subq	%rcx, %rdi
+	CHAR_SIZE_SHIFT_REG (rdi)
+	leaq	(FALLTHROUGH_RETURN_OFFSET)(%rdi, %rax), %rax
 	ret
 
-	.p2align 4
-L(last_4x_vec):
-	/* Test first 2x VEC normally.  */
-	testl	%eax, %eax
-	jnz	L(last_vec_x1)
-
-	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
-	kmovd	%k0, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x2)
-
-	/* Normalize length.  */
-	andl	$(CHAR_PER_VEC * 4 - 1), %esi
-	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
-	kmovd	%k0, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x3)
-
-	/* Check the end of data.  */
-	subl	$(CHAR_PER_VEC * 3), %esi
-	jb	L(max)
-
-	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0
-	kmovd	%k0, %eax
-	tzcntl	%eax, %eax
-	/* Check the end of data.  */
-	cmpl	%eax, %esi
-	jb	L(max_end)
-
-	subq	%rdx, %rdi
-#  ifdef USE_AS_WCSLEN
-	/* NB: Divide bytes by 4 to get the wchar_t count.  */
-	sarq	$2, %rdi
-#  endif
-	leaq	(CHAR_PER_VEC * 4)(%rdi, %rax), %rax
+	.p2align 4,, 8
+L(first_vec_x0):
+	bsf	%VRAX, %VRAX
+	sub	%rcx, %rdi
+	CHAR_SIZE_SHIFT_REG (rdi)
+	addq	%rdi, %rax
 	ret
 
-	.p2align 4
-L(last_vec_x1):
-	tzcntl	%eax, %eax
-	subq	%rdx, %rdi
-#  ifdef USE_AS_WCSLEN
-	/* NB: Divide bytes by 4 to get the wchar_t count.  */
-	sarq	$2, %rdi
-#  endif
+	.p2align 4,, 10
+L(first_vec_x1):
+	bsf	%VRAX, %VRAX
+	sub	%rcx, %rdi
+	CHAR_SIZE_SHIFT_REG (rdi)
 	leaq	(CHAR_PER_VEC)(%rdi, %rax), %rax
 	ret
 
-	.p2align 4
-L(last_vec_x2):
-	tzcntl	%eax, %eax
-	subq	%rdx, %rdi
-#  ifdef USE_AS_WCSLEN
-	/* NB: Divide bytes by 4 to get the wchar_t count.  */
-	sarq	$2, %rdi
-#  endif
-	leaq	(CHAR_PER_VEC * 2)(%rdi, %rax), %rax
-	ret
-
-	.p2align 4
-L(last_vec_x3):
-	tzcntl	%eax, %eax
-	subl	$(CHAR_PER_VEC * 2), %esi
-	/* Check the end of data.  */
-	cmpl	%eax, %esi
-	jb	L(max_end)
-	subq	%rdx, %rdi
-#  ifdef USE_AS_WCSLEN
-	/* NB: Divide bytes by 4 to get the wchar_t count.  */
-	sarq	$2, %rdi
-#  endif
-	leaq	(CHAR_PER_VEC * 3)(%rdi, %rax), %rax
-	ret
-L(max_end):
-	movq	%r8, %rax
+	.p2align 4,, 10
+	/* first_vec_x2 for strlen-ZMM and first_vec_x3 for strlen-YMM.
+	 */
+L(TAIL_RETURN_LBL):
+	bsf	%VRAX, %VRAX
+	sub	%VRCX, %VRDI
+	CHAR_SIZE_SHIFT_REG (VRDI)
+	lea	(TAIL_RETURN_OFFSET)(%rdi, %rax), %VRAX
 	ret
-# endif
 
-	/* Cold case for crossing page with first load.	 */
-	.p2align 4
+	.p2align 4,, 8
 L(cross_page_boundary):
-	movq	%rdi, %rdx
+	movq	%rdi, %rcx
 	/* Align data to VEC_SIZE.  */
 	andq	$-VEC_SIZE, %rdi
-	VPCMP	$0, (%rdi), %YMMZERO, %k0
-	kmovd	%k0, %eax
-	/* Remove the leading bytes.  */
+
+	VPCMPEQ	(%rdi), %VZERO, %k0
+
+	KMOV	%k0, %VRAX
 # ifdef USE_AS_WCSLEN
-	/* NB: Divide shift count by 4 since each bit in K0 represent 4
-	   bytes.  */
-	movl	%edx, %ecx
-	shrl	$2, %ecx
-	andl	$(CHAR_PER_VEC - 1), %ecx
-# endif
-	/* SHIFT_REG is ecx for USE_AS_WCSLEN and edx otherwise.  */
-	sarxl	%SHIFT_REG, %eax, %eax
+	movl	%ecx, %edx
+	shrl	$2, %edx
+	andl	$(CHAR_PER_VEC - 1), %edx
+	shrx	%edx, %eax, %eax
 	testl	%eax, %eax
-# ifndef USE_AS_STRNLEN
-	jz	L(cross_page_continue)
-	tzcntl	%eax, %eax
-	ret
 # else
-	jnz	L(cross_page_less_vec)
-#  ifndef USE_AS_WCSLEN
-	movl	%edx, %ecx
-	andl	$(CHAR_PER_VEC - 1), %ecx
-#  endif
-	movl	$CHAR_PER_VEC, %eax
-	subl	%ecx, %eax
-	/* Check the end of data.  */
-	cmpq	%rax, %rsi
-	ja	L(cross_page_continue)
-	movl	%esi, %eax
-	ret
-L(cross_page_less_vec):
-	tzcntl	%eax, %eax
-	/* Select min of length and position of first null.  */
-	cmpq	%rax, %rsi
-	cmovb	%esi, %eax
-	ret
+	shr	%cl, %VRAX
 # endif
+	jz	L(cross_page_continue)
+	bsf	%VRAX, %VRAX
+	ret
 
 END (STRLEN)
 #endif
diff --git a/sysdeps/x86_64/multiarch/strnlen-evex.S b/sysdeps/x86_64/multiarch/strnlen-evex.S
index 64a9fc2606..443a32a749 100644
--- a/sysdeps/x86_64/multiarch/strnlen-evex.S
+++ b/sysdeps/x86_64/multiarch/strnlen-evex.S
@@ -1,8 +1,423 @@
-#ifndef STRNLEN
-# define STRNLEN __strnlen_evex
-#endif
+/* strnlen/wcsnlen optimized with 256-bit EVEX instructions.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <isa-level.h>
+#include <sysdep.h>
+
+#if ISA_SHOULD_BUILD (4)
+
+# ifndef VEC_SIZE
+#  include "x86-evex256-vecs.h"
+# endif
+
+
+# ifndef STRNLEN
+#  define STRNLEN	__strnlen_evex
+# endif
+
+# ifdef USE_AS_WCSLEN
+#  define VPCMPEQ	vpcmpeqd
+#  define VPCMPNEQ	vpcmpneqd
+#  define VPTESTN	vptestnmd
+#  define VPTEST	vptestmd
+#  define VPMINU	vpminud
+#  define CHAR_SIZE	4
+
+# else
+#  define VPCMPEQ	vpcmpeqb
+#  define VPCMPNEQ	vpcmpneqb
+#  define VPTESTN	vptestnmb
+#  define VPTEST	vptestmb
+#  define VPMINU	vpminub
+#  define CHAR_SIZE	1
+
+#  define REG_WIDTH	VEC_SIZE
+# endif
+
+# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
+
+# include "reg-macros.h"
+
+# if CHAR_PER_VEC == 32
+#  define SUB_SHORT(imm, reg)	subb $(imm), %VGPR_SZ(reg, 8)
+# else
+#  define SUB_SHORT(imm, reg)	subl $(imm), %VGPR_SZ(reg, 32)
+# endif
+
+
+
+# if CHAR_PER_VEC == 64
+#  define FALLTHROUGH_RETURN_OFFSET	(CHAR_PER_VEC * 3)
+# else
+#  define FALLTHROUGH_RETURN_OFFSET	(CHAR_PER_VEC * 2)
+# endif
+
+
+# define XZERO	VMM_128(0)
+# define VZERO	VMM(0)
+# define PAGE_SIZE	4096
+
+	.section SECTION(.text), "ax", @progbits
+ENTRY_P2ALIGN (STRNLEN, 6)
+	/* Check zero length.  */
+	test	%RSI_LP, %RSI_LP
+	jz	L(zero)
+# ifdef __ILP32__
+	/* Clear the upper 32 bits.  */
+	movl	%esi, %esi
+# endif
+
+	movl	%edi, %eax
+	vpxorq	%XZERO, %XZERO, %XZERO
+	andl	$(PAGE_SIZE - 1), %eax
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	ja	L(cross_page_boundary)
+
+	/* Check the first VEC_SIZE bytes.  Each bit in K0 represents a
+	   null byte.  */
+	VPCMPEQ	(%rdi), %VZERO, %k0
+
+	KMOV	%k0, %VRCX
+	movq	%rsi, %rax
+
+	/* If src (rcx) is zero, bsf does not change the result.  NB:
+	   Must use 64-bit bsf here so that upper bits of len are not
+	   cleared.  */
+	bsfq	%rcx, %rax
+	/* If rax > CHAR_PER_VEC then rcx must have been zero (no null
+	   CHAR) and rsi must be > CHAR_PER_VEC.  */
+	cmpq	$CHAR_PER_VEC, %rax
+	ja	L(more_1x_vec)
+	/* Check if first match in bounds.  */
+	cmpq	%rax, %rsi
+	cmovb	%esi, %eax
+	ret
+
+
+# if CHAR_PER_VEC != 32
+	.p2align 4,, 2
+L(zero):
+L(max_0):
+	movl	%esi, %eax
+	ret
+# endif
+
+	/* L(aligned_more) for strnlen compares the remaining length
+	   against 2 * CHAR_PER_VEC, 4 * CHAR_PER_VEC, and 8 *
+	   CHAR_PER_VEC before going to the loop.  */
+	.p2align 4,, 10
+L(more_1x_vec):
+L(cross_page_continue):
+	/* Compute number of words checked after aligning.  */
+# ifdef USE_AS_WCSLEN
+	/* Need to compute directly for wcslen as CHAR_SIZE * rsi can
+	   overflow.  */
+	movq	%rdi, %rax
+	andq	$(VEC_SIZE * -1), %rdi
+	subq	%rdi, %rax
+	sarq	$2, %rax
+	leaq	-(CHAR_PER_VEC * 1)(%rax, %rsi), %rax
+# else
+	leaq	(VEC_SIZE * -1)(%rsi, %rdi), %rax
+	andq	$(VEC_SIZE * -1), %rdi
+	subq	%rdi, %rax
+# endif
+
+
+	VPCMPEQ	VEC_SIZE(%rdi), %VZERO, %k0
+
+	cmpq	$(CHAR_PER_VEC * 2), %rax
+	ja	L(more_2x_vec)
+
+L(last_2x_vec_or_less):
+	KMOV	%k0, %VRDX
+	test	%VRDX, %VRDX
+	jnz	L(last_vec_check)
+
+	/* Check the end of data.  */
+	SUB_SHORT (CHAR_PER_VEC, rax)
+	jbe	L(max_0)
+	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %VZERO, %k0
+	KMOV	%k0, %VRDX
+	test	%VRDX, %VRDX
+	jz	L(max_0)
+	/* Best place for L(last_vec_check) if ZMM.  */
+	.p2align 4,, 8
+L(last_vec_check):
+	bsf	%VRDX, %VRDX
+	sub	%eax, %edx
+	lea	(%rsi, %rdx), %eax
+	cmovae	%esi, %eax
+	ret
+
+# if CHAR_PER_VEC == 32
+	.p2align 4,, 2
+L(zero):
+L(max_0):
+	movl	%esi, %eax
+	ret
+# endif
+
+	.p2align 4,, 8
+L(last_4x_vec_or_less):
+	addl	$(CHAR_PER_VEC * -4), %eax
+	VPCMPEQ	(VEC_SIZE * 5)(%rdi), %VZERO, %k0
+	subq	$(VEC_SIZE * -4), %rdi
+	cmpl	$(CHAR_PER_VEC * 2), %eax
+	jbe	L(last_2x_vec_or_less)
+
+	.p2align 4,, 6
+L(more_2x_vec):
+	/* Remaining length >= 2 * CHAR_PER_VEC so do VEC0/VEC1 without
+	   rechecking bounds.  */
 
-#define USE_AS_STRNLEN 1
-#define STRLEN	STRNLEN
+	KMOV	%k0, %VRDX
 
-#include "strlen-evex.S"
+	test	%VRDX, %VRDX
+	jnz	L(first_vec_x1)
+
+	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %VZERO, %k0
+	KMOV	%k0, %VRDX
+	test	%VRDX, %VRDX
+	jnz	L(first_vec_x2)
+
+	cmpq	$(CHAR_PER_VEC * 4), %rax
+	ja	L(more_4x_vec)
+
+
+	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %VZERO, %k0
+	KMOV	%k0, %VRDX
+	addl	$(CHAR_PER_VEC * -2), %eax
+	test	%VRDX, %VRDX
+	jnz	L(last_vec_check)
+
+	subl	$(CHAR_PER_VEC), %eax
+	jbe	L(max_1)
+
+	VPCMPEQ	(VEC_SIZE * 4)(%rdi), %VZERO, %k0
+	KMOV	%k0, %VRDX
+
+	test	%VRDX, %VRDX
+	jnz	L(last_vec_check)
+L(max_1):
+	movl	%esi, %eax
+	ret
+
+	.p2align 4,, 3
+L(first_vec_x2):
+# if VEC_SIZE == 64
+	/* If VEC_SIZE == 64 we can fit logic for full return label in
+	   spare bytes before next cache line.  */
+	bsf	%VRDX, %VRDX
+	sub	%eax, %esi
+	leal	(CHAR_PER_VEC * 1)(%rsi, %rdx), %eax
+	ret
+	.p2align 4,, 6
+# else
+	addl	$CHAR_PER_VEC, %esi
+# endif
+L(first_vec_x1):
+	bsf	%VRDX, %VRDX
+	sub	%eax, %esi
+	leal	(CHAR_PER_VEC * 0)(%rsi, %rdx), %eax
+	ret
+
+
+	.p2align 4,, 6
+L(first_vec_x4):
+# if VEC_SIZE == 64
+	/* If VEC_SIZE == 64 we can fit logic for full return label in
+	   spare bytes before next cache line.  */
+	bsf	%VRDX, %VRDX
+	sub	%eax, %esi
+	leal	(CHAR_PER_VEC * 3)(%rsi, %rdx), %eax
+	ret
+	.p2align 4,, 6
+# else
+	addl	$CHAR_PER_VEC, %esi
+# endif
+L(first_vec_x3):
+	bsf	%VRDX, %VRDX
+	sub	%eax, %esi
+	leal	(CHAR_PER_VEC * 2)(%rsi, %rdx), %eax
+	ret
+
+	.p2align 4,, 5
+L(more_4x_vec):
+	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %VZERO, %k0
+	KMOV	%k0, %VRDX
+	test	%VRDX, %VRDX
+	jnz	L(first_vec_x3)
+
+	VPCMPEQ	(VEC_SIZE * 4)(%rdi), %VZERO, %k0
+	KMOV	%k0, %VRDX
+	test	%VRDX, %VRDX
+	jnz	L(first_vec_x4)
+
+	/* Check if at last VEC_SIZE * 4 length before aligning for the
+	   loop.  */
+	cmpq	$(CHAR_PER_VEC * 8), %rax
+	jbe	L(last_4x_vec_or_less)
+
+
+	/* Compute number of words checked after aligning.  */
+# ifdef USE_AS_WCSLEN
+	/* Need to compute directly for wcslen as CHAR_SIZE * rsi can
+	   overflow.  */
+	leaq	(VEC_SIZE * -3)(%rdi), %rdx
+# else
+	leaq	(VEC_SIZE * -3)(%rdi, %rax), %rax
+# endif
+
+	subq	$(VEC_SIZE * -1), %rdi
+
+	/* Align data to VEC_SIZE * 4.  */
+# if VEC_SIZE == 64
+	/* Saves code size.  No evex512 processor has partial register
+	   stalls.  If that changes, this can be replaced with `andq
+	   $-(VEC_SIZE * 4), %rdi`.  */
+	xorb	%dil, %dil
+# else
+	andq	$-(VEC_SIZE * 4), %rdi
+# endif
+
+# ifdef USE_AS_WCSLEN
+	subq	%rdi, %rdx
+	sarq	$2, %rdx
+	addq	%rdx, %rax
+# else
+	subq	%rdi, %rax
+# endif
+	/* Compare 4 * VEC at a time forward.  */
+	.p2align 4,, 11
+L(loop_4x_vec):
+	VMOVA	(VEC_SIZE * 4)(%rdi), %VMM(1)
+	VPMINU	(VEC_SIZE * 5)(%rdi), %VMM(1), %VMM(2)
+	VMOVA	(VEC_SIZE * 6)(%rdi), %VMM(3)
+	VPMINU	(VEC_SIZE * 7)(%rdi), %VMM(3), %VMM(4)
+	VPTESTN	%VMM(2), %VMM(2), %k0
+	VPTESTN	%VMM(4), %VMM(4), %k2
+	subq	$-(VEC_SIZE * 4), %rdi
+	/* Break if at end of length.  */
+	subq	$(CHAR_PER_VEC * 4), %rax
+	jbe	L(loop_len_end)
+
+
+	KORTEST %k0, %k2
+	jz	L(loop_4x_vec)
+
+
+L(loop_last_4x_vec):
+	movq	%rsi, %rcx
+	subq	%rax, %rsi
+	VPTESTN	%VMM(1), %VMM(1), %k1
+	KMOV	%k1, %VRDX
+	test	%VRDX, %VRDX
+	jnz	L(last_vec_x0)
+
+	KMOV	%k0, %VRDX
+	test	%VRDX, %VRDX
+	jnz	L(last_vec_x1)
+
+	VPTESTN	%VMM(3), %VMM(3), %k0
+
+	/* Separate logic for VEC_SIZE == 64 and VEC_SIZE == 32 for
+	   returning the last 2x VEC.  For VEC_SIZE == 64 we test each
+	   VEC individually; for VEC_SIZE == 32 we combine them in a
+	   single 64-bit GPR.  */
+# if CHAR_PER_VEC == 64
+	KMOV	%k0, %VRDX
+	test	%VRDX, %VRDX
+	jnz	L(last_vec_x2)
+	KMOV	%k2, %VRDX
+# else
+	/* We can only combine last 2x VEC masks if CHAR_PER_VEC <= 32.
+	 */
+	kmovd	%k2, %edx
+	kmovd	%k0, %eax
+	salq	$CHAR_PER_VEC, %rdx
+	orq	%rax, %rdx
+# endif
+
+	/* first_vec_x3 for strlen-ZMM and first_vec_x2 for strlen-YMM.
+	 */
+	bsfq	%rdx, %rdx
+	leaq	(FALLTHROUGH_RETURN_OFFSET - CHAR_PER_VEC * 4)(%rsi, %rdx), %rax
+	cmpq	%rax, %rcx
+	cmovb	%rcx, %rax
+	ret
+
+	/* Handle last 4x VEC after loop. All VECs have been loaded.  */
+	.p2align 4,, 4
+L(loop_len_end):
+	KORTEST %k0, %k2
+	jnz	L(loop_last_4x_vec)
+	movq	%rsi, %rax
+	ret
+
+
+# if CHAR_PER_VEC == 64
+	/* Since we can't combine the last 2x VEC for VEC_SIZE == 64
+	   we need a return label for it.  */
+	.p2align 4,, 8
+L(last_vec_x2):
+	bsf	%VRDX, %VRDX
+	leaq	(CHAR_PER_VEC * -2)(%rsi, %rdx), %rax
+	cmpq	%rax, %rcx
+	cmovb	%rcx, %rax
+	ret
+# endif
+
+
+	.p2align 4,, 10
+L(last_vec_x1):
+	addq	$CHAR_PER_VEC, %rsi
+L(last_vec_x0):
+	bsf	%VRDX, %VRDX
+	leaq	(CHAR_PER_VEC * -4)(%rsi, %rdx), %rax
+	cmpq	%rax, %rcx
+	cmovb	%rcx, %rax
+	ret
+
+
+	.p2align 4,, 8
+L(cross_page_boundary):
+	/* Align data to VEC_SIZE.  */
+	movq	%rdi, %rcx
+	andq	$-VEC_SIZE, %rcx
+	VPCMPEQ	(%rcx), %VZERO, %k0
+
+	KMOV	%k0, %VRCX
+# ifdef USE_AS_WCSLEN
+	shrl	$2, %eax
+	andl	$(CHAR_PER_VEC - 1), %eax
+# endif
+	shrx	%VRAX, %VRCX, %VRCX
+
+	negl	%eax
+	andl	$(CHAR_PER_VEC - 1), %eax
+	movq	%rsi, %rdx
+	bsf	%VRCX, %VRDX
+	cmpq	%rax, %rdx
+	ja	L(cross_page_continue)
+	movl	%edx, %eax
+	cmpq	%rdx, %rsi
+	cmovb	%esi, %eax
+	ret
+END (STRNLEN)
+#endif
diff --git a/sysdeps/x86_64/multiarch/wcsnlen-evex.S b/sysdeps/x86_64/multiarch/wcsnlen-evex.S
index e2aad94c1e..57a7e93fbf 100644
--- a/sysdeps/x86_64/multiarch/wcsnlen-evex.S
+++ b/sysdeps/x86_64/multiarch/wcsnlen-evex.S
@@ -2,8 +2,7 @@
 # define WCSNLEN	__wcsnlen_evex
 #endif
 
-#define STRLEN	WCSNLEN
+#define STRNLEN	WCSNLEN
 #define USE_AS_WCSLEN 1
-#define USE_AS_STRNLEN 1
 
-#include "strlen-evex.S"
+#include "strnlen-evex.S"
-- 
2.34.1
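
The L(cross_page_boundary) paths above all share one pattern: round the
pointer down so the VEC-sized load cannot cross a page, then shift the
out-of-range low bits out of the result mask.  A rough C-intrinsics
sketch (illustrative only; the read before the true start is guaranteed
safe at the asm level, not by the C language):

    #include <immintrin.h>
    #include <stdint.h>

    /* Zero-byte mask for the first 32-byte VEC containing S, with the
       bits for bytes before S shifted out.  */
    static uint32_t
    first_vec_zero_mask (const char *s)
    {
        uintptr_t misalign = (uintptr_t) s % 32;
        const __m256i *aligned = (const void *) (s - misalign);
        __m256i v = _mm256_load_si256 (aligned);  /* cannot cross a page */
        uint32_t m = _mm256_cmpeq_epi8_mask (v, _mm256_setzero_si256 ());
        return m >> misalign;  /* drop matches before the real start */
    }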


^ permalink raw reply	[flat|nested] 41+ messages in thread

* [PATCH v2 4/7] x86: Optimize memrchr-evex.S
  2022-10-18 23:19 ` [PATCH v2 " Noah Goldstein
  2022-10-18 23:19   ` [PATCH v2 2/7] x86: Shrink / minorly optimize strchr-evex " Noah Goldstein
  2022-10-18 23:19   ` [PATCH v2 3/7] x86: Optimize strnlen-evex.S " Noah Goldstein
@ 2022-10-18 23:19   ` Noah Goldstein
  2022-10-18 23:19   ` [PATCH v2 5/7] x86: Optimize strrchr-evex.S and implement with VMM headers Noah Goldstein
                     ` (2 subsequent siblings)
  5 siblings, 0 replies; 41+ messages in thread
From: Noah Goldstein @ 2022-10-18 23:19 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

Optimizations are:
1. Use the fact that lzcnt(0) -> VEC_SIZE for memrchr to save a branch
   in the short string case (see the sketch after this list).
2. Save several instructions in len = [VEC_SIZE, 4 * VEC_SIZE] case.
3. Use more code-size efficient instructions.
	- tzcnt ...     -> bsf ...
	- vpcmpb $0 ... -> vpcmpeq ...
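
A hedged sketch of point 1 (names are illustrative): lzcnt is defined
to return the operand width for a zero source, so an empty match mask
and an out-of-bounds match fall through the same length comparison and
no separate "no match" branch is needed.  Here MASK is assumed to be
the vpcmpeqb result for the last 32 bytes of the buffer (bit 31 = last
byte):

    #include <immintrin.h>
    #include <stddef.h>
    #include <stdint.h>

    static char *
    last_match_sketch (char *buf, size_t len, uint32_t mask)
    {
        uint32_t back = _lzcnt_u32 (mask);  /* 32 (VEC_SIZE) if mask == 0 */
        if (len <= back)
            return NULL;  /* no match, or the match is before buf */
        return buf + len - 1 - back;
    }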

Code Size Changes:
memrchr-evex.S      :  -29 bytes

Net perf changes:

Reported as geometric mean of all improvements / regressions from N=10
runs of the benchtests. Values are New Time / Old Time, so < 1.0 is an
improvement and > 1.0 is a regression.

memrchr-evex.S      : 0.949 (Mostly from improvements in small strings)

Full results attached in email.

Full check passes on x86-64.
---
 sysdeps/x86_64/multiarch/memrchr-evex.S | 538 ++++++++++++++----------
 1 file changed, 324 insertions(+), 214 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/memrchr-evex.S b/sysdeps/x86_64/multiarch/memrchr-evex.S
index 550b328c5a..dbcf52808f 100644
--- a/sysdeps/x86_64/multiarch/memrchr-evex.S
+++ b/sysdeps/x86_64/multiarch/memrchr-evex.S
@@ -21,17 +21,19 @@
 #if ISA_SHOULD_BUILD (4)
 
 # include <sysdep.h>
-# include "x86-evex256-vecs.h"
-# if VEC_SIZE != 32
-#  error "VEC_SIZE != 32 unimplemented"
+
+# ifndef VEC_SIZE
+#  include "x86-evex256-vecs.h"
 # endif
 
+# include "reg-macros.h"
+
 # ifndef MEMRCHR
-#  define MEMRCHR				__memrchr_evex
+#  define MEMRCHR	__memrchr_evex
 # endif
 
-# define PAGE_SIZE			4096
-# define VMMMATCH			VMM(0)
+# define PAGE_SIZE	4096
+# define VMATCH	VMM(0)
 
 	.section SECTION(.text), "ax", @progbits
 ENTRY_P2ALIGN(MEMRCHR, 6)
@@ -43,294 +45,402 @@ ENTRY_P2ALIGN(MEMRCHR, 6)
 # endif
 	jz	L(zero_0)
 
-	/* Get end pointer. Minus one for two reasons. 1) It is necessary for a
-	   correct page cross check and 2) it correctly sets up end ptr to be
-	   subtract by lzcnt aligned.  */
+	/* Get end pointer.  Minus one for three reasons: 1) it is
+	   necessary for a correct page cross check, 2) it sets up the
+	   end ptr so that subtracting the lzcnt result yields the
+	   match, and 3) it is a necessary step in aligning the ptr.  */
 	leaq	-1(%rdi, %rdx), %rax
-	vpbroadcastb %esi, %VMMMATCH
+	vpbroadcastb %esi, %VMATCH
 
 	/* Check if we can load 1x VEC without cross a page.  */
 	testl	$(PAGE_SIZE - VEC_SIZE), %eax
 	jz	L(page_cross)
 
-	/* Don't use rax for pointer here because EVEX has better encoding with
-	   offset % VEC_SIZE == 0.  */
-	vpcmpb	$0, -(VEC_SIZE)(%rdi, %rdx), %VMMMATCH, %k0
-	kmovd	%k0, %ecx
-
-	/* Fall through for rdx (len) <= VEC_SIZE (expect small sizes).  */
-	cmpq	$VEC_SIZE, %rdx
-	ja	L(more_1x_vec)
-L(ret_vec_x0_test):
-
-	/* If ecx is zero (no matches) lzcnt will set it 32 (VEC_SIZE) which
-	   will guarantee edx (len) is less than it.  */
-	lzcntl	%ecx, %ecx
-	cmpl	%ecx, %edx
-	jle	L(zero_0)
-	subq	%rcx, %rax
+	/* Don't use rax for pointer here because EVEX has better
+	   encoding with offset % VEC_SIZE == 0.  */
+	vpcmpeqb (VEC_SIZE * -1)(%rdi, %rdx), %VMATCH, %k0
+	KMOV	%k0, %VRCX
+
+	/* If rcx is zero then lzcnt -> VEC_SIZE.  NB: there is
+	   already a dependency between rcx and rsi so no worries
+	   about a false-dep here.  */
+	lzcnt	%VRCX, %VRSI
+	/* If rdx <= rsi then either 1) rcx was non-zero (there was a
+	   match) but it was out of bounds or 2) rcx was zero and rdx
+	   was <= VEC_SIZE so we are done scanning.  */
+	cmpq	%rsi, %rdx
+	/* NB: Use branch to return zero/non-zero.  Common usage will
+	   branch on result of function (if return is null/non-null).
+	   This branch can be used to predict the ensuing one so there
+	   is no reason to extend the data-dependency with cmovcc.  */
+	jbe	L(zero_0)
+
+	/* If rcx is zero then len must be > VEC_SIZE; otherwise,
+	   since we already tested len vs lzcnt(rcx) (in rsi), we are
+	   good to return this match.  */
+	test	%VRCX, %VRCX
+	jz	L(more_1x_vec)
+	subq	%rsi, %rax
 	ret
 
-	/* Fits in aligning bytes of first cache line.  */
+	/* Fits in aligning bytes of first cache line for VEC_SIZE ==
+	   32.  */
+# if VEC_SIZE == 32
+	.p2align 4,, 2
 L(zero_0):
 	xorl	%eax, %eax
 	ret
-
-	.p2align 4,, 9
-L(ret_vec_x0_dec):
-	decq	%rax
-L(ret_vec_x0):
-	lzcntl	%ecx, %ecx
-	subq	%rcx, %rax
-	ret
+# endif
 
 	.p2align 4,, 10
 L(more_1x_vec):
-	testl	%ecx, %ecx
-	jnz	L(ret_vec_x0)
-
 	/* Align rax (pointer to string).  */
 	andq	$-VEC_SIZE, %rax
-
+L(page_cross_continue):
 	/* Recompute length after aligning.  */
-	movq	%rax, %rdx
+	subq	%rdi, %rax
 
-	/* Need no matter what.  */
-	vpcmpb	$0, -(VEC_SIZE)(%rax), %VMMMATCH, %k0
-	kmovd	%k0, %ecx
-
-	subq	%rdi, %rdx
-
-	cmpq	$(VEC_SIZE * 2), %rdx
+	cmpq	$(VEC_SIZE * 2), %rax
 	ja	L(more_2x_vec)
+
 L(last_2x_vec):
+	vpcmpeqb (VEC_SIZE * -1)(%rdi, %rax), %VMATCH, %k0
+	KMOV	%k0, %VRCX
 
-	/* Must dec rax because L(ret_vec_x0_test) expects it.  */
-	decq	%rax
-	cmpl	$VEC_SIZE, %edx
-	jbe	L(ret_vec_x0_test)
+	test	%VRCX, %VRCX
+	jnz	L(ret_vec_x0_test)
 
-	testl	%ecx, %ecx
-	jnz	L(ret_vec_x0)
+	/* If VEC_SIZE == 64 we need to subtract because lzcntq won't
+	   implicitly add VEC_SIZE to the match position.  */
+# if VEC_SIZE == 64
+	subl	$VEC_SIZE, %eax
+# else
+	cmpb	$VEC_SIZE, %al
+# endif
+	jle	L(zero_2)
 
-	/* Don't use rax for pointer here because EVEX has better encoding with
-	   offset % VEC_SIZE == 0.  */
-	vpcmpb	$0, -(VEC_SIZE * 2)(%rdi, %rdx), %VMMMATCH, %k0
-	kmovd	%k0, %ecx
-	/* NB: 64-bit lzcnt. This will naturally add 32 to position.  */
+	/* We adjusted rax (length) for VEC_SIZE == 64 so we need
+	   separate offsets.  */
+# if VEC_SIZE == 64
+	vpcmpeqb (VEC_SIZE * -1)(%rdi, %rax), %VMATCH, %k0
+# else
+	vpcmpeqb (VEC_SIZE * -2)(%rdi, %rax), %VMATCH, %k0
+# endif
+	KMOV	%k0, %VRCX
+	/* NB: 64-bit lzcnt. This will naturally add 32 to position for
+	   VEC_SIZE == 32.  */
 	lzcntq	%rcx, %rcx
-	cmpl	%ecx, %edx
-	jle	L(zero_0)
-	subq	%rcx, %rax
-	ret
-
-	/* Inexpensive place to put this regarding code size / target alignments
-	   / ICache NLP. Necessary for 2-byte encoding of jump to page cross
-	   case which in turn is necessary for hot path (len <= VEC_SIZE) to fit
-	   in first cache line.  */
-L(page_cross):
-	movq	%rax, %rsi
-	andq	$-VEC_SIZE, %rsi
-	vpcmpb	$0, (%rsi), %VMMMATCH, %k0
-	kmovd	%k0, %r8d
-	/* Shift out negative alignment (because we are starting from endptr and
-	   working backwards).  */
-	movl	%eax, %ecx
-	/* notl because eax already has endptr - 1.  (-x = ~(x - 1)).  */
-	notl	%ecx
-	shlxl	%ecx, %r8d, %ecx
-	cmpq	%rdi, %rsi
-	ja	L(more_1x_vec)
-	lzcntl	%ecx, %ecx
-	cmpl	%ecx, %edx
-	jle	L(zero_1)
-	subq	%rcx, %rax
+	subl	%ecx, %eax
+	ja	L(first_vec_x1_ret)
+	/* If VEC_SIZE == 64 put L(zero_0) here as it can't fit in the
+	   first cache line (this is the second cache line).  */
+# if VEC_SIZE == 64
+L(zero_0):
+# endif
+L(zero_2):
+	xorl	%eax, %eax
 	ret
 
-	/* Continue creating zero labels that fit in aligning bytes and get
-	   2-byte encoding / are in the same cache line as condition.  */
-L(zero_1):
-	xorl	%eax, %eax
+	/* NB: Fits in aligning bytes before next cache line for
+	   VEC_SIZE == 32.  For VEC_SIZE == 64 this is attached to
+	   L(ret_vec_x0_test).  */
+# if VEC_SIZE == 32
+L(first_vec_x1_ret):
+	leaq	-1(%rdi, %rax), %rax
 	ret
+# endif
 
-	.p2align 4,, 8
-L(ret_vec_x1):
-	/* This will naturally add 32 to position.  */
-	bsrl	%ecx, %ecx
-	leaq	-(VEC_SIZE * 2)(%rcx, %rax), %rax
+	.p2align 4,, 6
+L(ret_vec_x0_test):
+	lzcnt	%VRCX, %VRCX
+	subl	%ecx, %eax
+	jle	L(zero_2)
+# if VEC_SIZE == 64
+	/* Reuse code at the end of L(ret_vec_x0_test) as we can't fit
+	   L(first_vec_x1_ret) in the same cache line as its jmp base
+	   so we might as well save code size.  */
+L(first_vec_x1_ret):
+# endif
+	leaq	-1(%rdi, %rax), %rax
 	ret
 
-	.p2align 4,, 8
+	.p2align 4,, 6
+L(loop_last_4x_vec):
+	/* Compute remaining length.  */
+	subl	%edi, %eax
+L(last_4x_vec):
+	cmpl	$(VEC_SIZE * 2), %eax
+	jle	L(last_2x_vec)
+# if VEC_SIZE == 32
+	/* Only align for VEC_SIZE == 32.  For VEC_SIZE == 64 we need
+	   the spare bytes to align the loop properly.  */
+	.p2align 4,, 10
+# endif
 L(more_2x_vec):
-	testl	%ecx, %ecx
-	jnz	L(ret_vec_x0_dec)
 
-	vpcmpb	$0, -(VEC_SIZE * 2)(%rax), %VMMMATCH, %k0
-	kmovd	%k0, %ecx
-	testl	%ecx, %ecx
-	jnz	L(ret_vec_x1)
+	/* Length > VEC_SIZE * 2, so check the first 2x VEC for a
+	   match and return if either hits.  */
+	vpcmpeqb (VEC_SIZE * -1)(%rdi, %rax), %VMATCH, %k0
+	KMOV	%k0, %VRCX
+
+	test	%VRCX, %VRCX
+	jnz	L(first_vec_x0)
+
+	vpcmpeqb (VEC_SIZE * -2)(%rdi, %rax), %VMATCH, %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(first_vec_x1)
 
 	/* Need no matter what.  */
-	vpcmpb	$0, -(VEC_SIZE * 3)(%rax), %VMMMATCH, %k0
-	kmovd	%k0, %ecx
+	vpcmpeqb (VEC_SIZE * -3)(%rdi, %rax), %VMATCH, %k0
+	KMOV	%k0, %VRCX
 
-	subq	$(VEC_SIZE * 4), %rdx
+	/* Check if we are near the end.  */
+	subq	$(VEC_SIZE * 4), %rax
 	ja	L(more_4x_vec)
 
-	cmpl	$(VEC_SIZE * -1), %edx
-	jle	L(ret_vec_x2_test)
-L(last_vec):
-	testl	%ecx, %ecx
-	jnz	L(ret_vec_x2)
+	test	%VRCX, %VRCX
+	jnz	L(first_vec_x2_test)
 
+	/* Adjust length for final check and check if we are at the end.
+	 */
+	addl	$(VEC_SIZE * 1), %eax
+	jle	L(zero_1)
 
-	/* Need no matter what.  */
-	vpcmpb	$0, -(VEC_SIZE * 4)(%rax), %VMMMATCH, %k0
-	kmovd	%k0, %ecx
-	lzcntl	%ecx, %ecx
-	subq	$(VEC_SIZE * 3 + 1), %rax
-	subq	%rcx, %rax
-	cmpq	%rax, %rdi
-	ja	L(zero_1)
+	vpcmpeqb (VEC_SIZE * -1)(%rdi, %rax), %VMATCH, %k0
+	KMOV	%k0, %VRCX
+
+	lzcnt	%VRCX, %VRCX
+	subl	%ecx, %eax
+	ja	L(first_vec_x3_ret)
+L(zero_1):
+	xorl	%eax, %eax
+	ret
+L(first_vec_x3_ret):
+	leaq	-1(%rdi, %rax), %rax
 	ret
 
-	.p2align 4,, 8
-L(ret_vec_x2_test):
-	lzcntl	%ecx, %ecx
-	subq	$(VEC_SIZE * 2 + 1), %rax
-	subq	%rcx, %rax
-	cmpq	%rax, %rdi
-	ja	L(zero_1)
+	.p2align 4,, 6
+L(first_vec_x2_test):
+	/* Must adjust length before check.  */
+	subl	$-(VEC_SIZE * 2 - 1), %eax
+	lzcnt	%VRCX, %VRCX
+	subl	%ecx, %eax
+	jl	L(zero_4)
+	addq	%rdi, %rax
 	ret
 
-	.p2align 4,, 8
-L(ret_vec_x2):
-	bsrl	%ecx, %ecx
-	leaq	-(VEC_SIZE * 3)(%rcx, %rax), %rax
+
+	.p2align 4,, 10
+L(first_vec_x0):
+	bsr	%VRCX, %VRCX
+	leaq	(VEC_SIZE * -1)(%rdi, %rax), %rax
+	addq	%rcx, %rax
 	ret
 
-	.p2align 4,, 8
-L(ret_vec_x3):
-	bsrl	%ecx, %ecx
-	leaq	-(VEC_SIZE * 4)(%rcx, %rax), %rax
+	/* Fits unobtrusively here.  */
+L(zero_4):
+	xorl	%eax, %eax
+	ret
+
+	.p2align 4,, 10
+L(first_vec_x1):
+	bsr	%VRCX, %VRCX
+	leaq	(VEC_SIZE * -2)(%rdi, %rax), %rax
+	addq	%rcx, %rax
 	ret
 
 	.p2align 4,, 8
+L(first_vec_x3):
+	bsr	%VRCX, %VRCX
+	addq	%rdi, %rax
+	addq	%rcx, %rax
+	ret
+
+	.p2align 4,, 6
+L(first_vec_x2):
+	bsr	%VRCX, %VRCX
+	leaq	(VEC_SIZE * 1)(%rdi, %rax), %rax
+	addq	%rcx, %rax
+	ret
+
+	.p2align 4,, 2
 L(more_4x_vec):
-	testl	%ecx, %ecx
-	jnz	L(ret_vec_x2)
+	test	%VRCX, %VRCX
+	jnz	L(first_vec_x2)
 
-	vpcmpb	$0, -(VEC_SIZE * 4)(%rax), %VMMMATCH, %k0
-	kmovd	%k0, %ecx
+	vpcmpeqb (%rdi, %rax), %VMATCH, %k0
+	KMOV	%k0, %VRCX
 
-	testl	%ecx, %ecx
-	jnz	L(ret_vec_x3)
+	test	%VRCX, %VRCX
+	jnz	L(first_vec_x3)
 
 	/* Check if near end before re-aligning (otherwise might do an
 	   unnecessary loop iteration).  */
-	addq	$-(VEC_SIZE * 4), %rax
-	cmpq	$(VEC_SIZE * 4), %rdx
+	cmpq	$(VEC_SIZE * 4), %rax
 	jbe	L(last_4x_vec)
 
-	decq	%rax
-	andq	$-(VEC_SIZE * 4), %rax
-	movq	%rdi, %rdx
-	/* Get endptr for loop in rdx. NB: Can't just do while rax > rdi because
-	   lengths that overflow can be valid and break the comparison.  */
-	andq	$-(VEC_SIZE * 4), %rdx
+
+	/* NB: We set up the loop to NOT use an index addressing mode
+	   for the buffer.  This costs some instructions & code size,
+	   but avoids stalls from unlaminated micro-fused instructions
+	   (as used in the loop) being forced to issue in the same
+	   group (essentially narrowing the backend width).  */
+
+	/* Get endptr for loop in rdx. NB: Can't just do while rax > rdi
+	   because lengths that overflow can be valid and break the
+	   comparison.  */
+# if VEC_SIZE == 64
+	/* Use rdx as an intermediate to compute rax; this gets us
+	   imm8 encoding, which just allows the L(more_4x_vec) block
+	   to fit in 1 cache line.  */
+	leaq	(VEC_SIZE * 4)(%rdi), %rdx
+	leaq	(VEC_SIZE * -1)(%rdx, %rax), %rax
+
+	/* No evex machine has partial register stalls. This can be
+	   replaced with: `andq $(VEC_SIZE * -4), %rax/%rdx` if that
+	   changes.  */
+	xorb	%al, %al
+	xorb	%dl, %dl
+# else
+	leaq	(VEC_SIZE * 3)(%rdi, %rax), %rax
+	andq	$(VEC_SIZE * -4), %rax
+	leaq	(VEC_SIZE * 4)(%rdi), %rdx
+	andq	$(VEC_SIZE * -4), %rdx
+# endif
+
 
 	.p2align 4
 L(loop_4x_vec):
-	/* Store 1 were not-equals and 0 where equals in k1 (used to mask later
-	   on).  */
-	vpcmpb	$4, (VEC_SIZE * 3)(%rax), %VMMMATCH, %k1
+	/* NB: We could do the same optimization here as we do for
+	   memchr/rawmemchr by using VEX encoding in the loop for
+	   access to VEX vpcmpeqb + vpternlogd.  Since memrchr is not
+	   as hot as memchr it may not be worth the extra code size,
+	   but if the need arises it is an easy ~15% perf improvement
+	   to the loop.  */
+
+	cmpq	%rdx, %rax
+	je	L(loop_last_4x_vec)
+	/* Store 1 where not-equals and 0 where equals in k1 (used to
+	   mask later on).  */
+	vpcmpb	$4, (VEC_SIZE * -1)(%rax), %VMATCH, %k1
 
 	/* VEC(2/3) will have zero-byte where we found a CHAR.  */
-	vpxorq	(VEC_SIZE * 2)(%rax), %VMMMATCH, %VMM(2)
-	vpxorq	(VEC_SIZE * 1)(%rax), %VMMMATCH, %VMM(3)
-	vpcmpb	$0, (VEC_SIZE * 0)(%rax), %VMMMATCH, %k4
+	vpxorq	(VEC_SIZE * -2)(%rax), %VMATCH, %VMM(2)
+	vpxorq	(VEC_SIZE * -3)(%rax), %VMATCH, %VMM(3)
+	vpcmpeqb (VEC_SIZE * -4)(%rax), %VMATCH, %k4
 
-	/* Combine VEC(2/3) with min and maskz with k1 (k1 has zero bit where
-	   CHAR is found and VEC(2/3) have zero-byte where CHAR is found.  */
+	/* Combine VEC(2/3) with min and maskz with k1 (k1 has a zero
+	   bit where CHAR is found and VEC(2/3) have a zero byte where
+	   CHAR is found).  */
 	vpminub	%VMM(2), %VMM(3), %VMM(3){%k1}{z}
 	vptestnmb %VMM(3), %VMM(3), %k2
 
-	/* Any 1s and we found CHAR.  */
-	kortestd %k2, %k4
-	jnz	L(loop_end)
-
 	addq	$-(VEC_SIZE * 4), %rax
-	cmpq	%rdx, %rax
-	jne	L(loop_4x_vec)
 
-	/* Need to re-adjust rdx / rax for L(last_4x_vec).  */
-	subq	$-(VEC_SIZE * 4), %rdx
-	movq	%rdx, %rax
-	subl	%edi, %edx
-L(last_4x_vec):
+	/* Any 1s and we found CHAR.  */
+	KORTEST %k2, %k4
+	jz	L(loop_4x_vec)
+
 
-	/* Used no matter what.  */
-	vpcmpb	$0, (VEC_SIZE * -1)(%rax), %VMMMATCH, %k0
-	kmovd	%k0, %ecx
+	/* K1 has non-matches for the first VEC.  inc; jz will
+	   overflow rcx iff all bytes were non-matches.  */
+	KMOV	%k1, %VRCX
+	inc	%VRCX
+	jnz	L(first_vec_x0_end)
 
-	cmpl	$(VEC_SIZE * 2), %edx
-	jbe	L(last_2x_vec)
+	vptestnmb %VMM(2), %VMM(2), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(first_vec_x1_end)
+	KMOV	%k2, %VRCX
+
+	/* Separate logic for VEC_SIZE == 64 and VEC_SIZE == 32 for
+	   returning the last 2x VEC.  For VEC_SIZE == 64 we test each
+	   VEC individually; for VEC_SIZE == 32 we combine them in a
+	   single 64-bit GPR.  */
+# if VEC_SIZE == 64
+	test	%VRCX, %VRCX
+	jnz	L(first_vec_x2_end)
+	KMOV	%k4, %VRCX
+# else
+	/* Combine the last 2 VEC matches for VEC_SIZE == 32.  If rcx
+	   (from VEC(3)) is zero (no CHAR in VEC(3)) then it won't
+	   affect the result in rsi (from VEC(4)).  If rcx is non-zero
+	   then CHAR is in VEC(3) and bsrq will use that position.  */
+	KMOV	%k4, %VRSI
+	salq	$32, %rcx
+	orq	%rsi, %rcx
+# endif
+	bsrq	%rcx, %rcx
+	addq	%rcx, %rax
+	ret
 
-	testl	%ecx, %ecx
-	jnz	L(ret_vec_x0_dec)
+	.p2align 4,, 4
+L(first_vec_x0_end):
+	/* rcx has 1s at non-matches so we need to `not` it.  We used
+	   `inc` to test for zero, so use `neg` to complete the `not`
+	   so the last 1 bit represents a match.  NB: -(x + 1) == ~x.  */
+	neg	%VRCX
+	bsr	%VRCX, %VRCX
+	leaq	(VEC_SIZE * 3)(%rcx, %rax), %rax
+	ret
 
+	.p2align 4,, 10
+L(first_vec_x1_end):
+	bsr	%VRCX, %VRCX
+	leaq	(VEC_SIZE * 2)(%rcx, %rax), %rax
+	ret
 
-	vpcmpb	$0, (VEC_SIZE * -2)(%rax), %VMMMATCH, %k0
-	kmovd	%k0, %ecx
+# if VEC_SIZE == 64
+	/* Since we can't combine the last 2x VEC for VEC_SIZE == 64
+	   we need a return label for it.  */
+	.p2align 4,, 4
+L(first_vec_x2_end):
+	bsr	%VRCX, %VRCX
+	leaq	(VEC_SIZE * 1)(%rcx, %rax), %rax
+	ret
+# endif
 
-	testl	%ecx, %ecx
-	jnz	L(ret_vec_x1)
 
-	/* Used no matter what.  */
-	vpcmpb	$0, (VEC_SIZE * -3)(%rax), %VMMMATCH, %k0
-	kmovd	%k0, %ecx
+	.p2align 4,, 4
+L(page_cross):
+	/* Only the low eax[log2(VEC_SIZE):0] bits are set, so we can
+	   use movzbl to get the number of bytes we are checking
+	   here.  */
+	movzbl	%al, %ecx
+	andq	$-VEC_SIZE, %rax
+	vpcmpeqb (%rax), %VMATCH, %k0
+	KMOV	%k0, %VRSI
 
-	cmpl	$(VEC_SIZE * 3), %edx
-	ja	L(last_vec)
+	/* eax was computed as %rdi + %rdx - 1, so we need to add back
+	   1 here.  */
+	leal	1(%rcx), %r8d
 
-	lzcntl	%ecx, %ecx
-	subq	$(VEC_SIZE * 2 + 1), %rax
-	subq	%rcx, %rax
-	cmpq	%rax, %rdi
-	jbe	L(ret_1)
+	/* Invert ecx to get shift count for byte matches out of range.
+	 */
+	notl	%ecx
+	shlx	%VRCX, %VRSI, %VRSI
+
+	/* If rdx <= r8 then the entire [buf, buf + len] is handled in
+	   the page cross case.  NB: we can't use the trick here that
+	   we use in the non-page-cross case because we aren't
+	   checking a full VEC_SIZE.  */
+	cmpq	%r8, %rdx
+	ja	L(page_cross_check)
+	lzcnt	%VRSI, %VRSI
+	subl	%esi, %edx
+	ja	L(page_cross_ret)
 	xorl	%eax, %eax
-L(ret_1):
 	ret
 
-	.p2align 4,, 6
-L(loop_end):
-	kmovd	%k1, %ecx
-	notl	%ecx
-	testl	%ecx, %ecx
-	jnz	L(ret_vec_x0_end)
+L(page_cross_check):
+	test	%VRSI, %VRSI
+	jz	L(page_cross_continue)
 
-	vptestnmb %VMM(2), %VMM(2), %k0
-	kmovd	%k0, %ecx
-	testl	%ecx, %ecx
-	jnz	L(ret_vec_x1_end)
-
-	kmovd	%k2, %ecx
-	kmovd	%k4, %esi
-	/* Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3)
-	   then it won't affect the result in esi (VEC4). If ecx is non-zero
-	   then CHAR in VEC3 and bsrq will use that position.  */
-	salq	$32, %rcx
-	orq	%rsi, %rcx
-	bsrq	%rcx, %rcx
-	addq	%rcx, %rax
-	ret
-	.p2align 4,, 4
-L(ret_vec_x0_end):
-	addq	$(VEC_SIZE), %rax
-L(ret_vec_x1_end):
-	bsrl	%ecx, %ecx
-	leaq	(VEC_SIZE * 2)(%rax, %rcx), %rax
+	lzcnt	%VRSI, %VRSI
+	subl	%esi, %edx
+L(page_cross_ret):
+	leaq	-1(%rdi, %rdx), %rax
 	ret
-
 END(MEMRCHR)
 #endif
-- 
2.34.1
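
A small illustration of the `inc; jz` test used in the loop epilogue
above (a hedged sketch; the real code keeps the incremented value in
rcx for the later neg):

    #include <stdint.h>

    /* NONMATCH has a 1 for every byte that did NOT match; it is
       all-ones exactly when nothing matched, so adding 1 wraps it to
       zero and a single inc doubles as the "any match?" test.  */
    static int
    vec_has_match (uint32_t nonmatch)
    {
        return (uint32_t) (nonmatch + 1) != 0;
    }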


^ permalink raw reply	[flat|nested] 41+ messages in thread

* [PATCH v2 5/7] x86: Optimize strrchr-evex.S and implement with VMM headers
  2022-10-18 23:19 ` [PATCH v2 " Noah Goldstein
                     ` (2 preceding siblings ...)
  2022-10-18 23:19   ` [PATCH v2 4/7] x86: Optimize memrchr-evex.S Noah Goldstein
@ 2022-10-18 23:19   ` Noah Goldstein
  2022-10-18 23:19   ` [PATCH v2 6/7] x86: Add support for VEC_SIZE == 64 in strcmp-evex.S impl Noah Goldstein
  2022-10-18 23:19   ` [PATCH v2 7/7] Bench: Improve benchtests for memchr, strchr, strnlen, strrchr Noah Goldstein
  5 siblings, 0 replies; 41+ messages in thread
From: Noah Goldstein @ 2022-10-18 23:19 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

Optimization is:
1. Cache the latest result in the "fast path" loop with `vmovdqu`
   instead of `kunpckdq`.  This helps when there is more than one
   match (see the sketch below).
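
The idea in scalar form (a hedged sketch, not the vector code; it only
shows why overwriting a cached result beats merging state when matches
repeat):

    #include <stddef.h>

    static char *
    strrchr_sketch (const char *s, int c)
    {
        const char *last = NULL;
        for (;; s++)
          {
            if (*s == (char) c)
                last = s;  /* cheap overwrite, like the vmovdqu cache */
            if (*s == '\0')
                return (char *) last;
          }
    }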

Code Size Changes:
strrchr-evex.S       :  +30 bytes (Same number of cache lines)

Net perf changes:

Reported as geometric mean of all improvements / regressions from N=10
runs of the benchtests. Value is New Time / Old Time, so < 1.0 is an
improvement and > 1.0 is a regression.
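
For reference, a minimal C sketch of how such a ratio geomean can be
computed (illustrative only, not the benchtest harness itself):

	#include <math.h>
	#include <stddef.h>

	/* Geometric mean of new_time / old_time ratios (assumes
	   n > 0); a result < 1.0 means a net improvement.  */
	static double
	ratio_geomean (const double *ratios, size_t n)
	{
	  double log_sum = 0.0;
	  for (size_t i = 0; i < n; i++)
	    log_sum += log (ratios[i]);
	  return exp (log_sum / n);
	}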

strrchr-evex.S       : 0.932 (From cases with higher match frequency)

Full results attached in email.

Full check passes on x86-64.
---
 sysdeps/x86_64/multiarch/strrchr-evex.S | 371 +++++++++++++-----------
 1 file changed, 200 insertions(+), 171 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/strrchr-evex.S b/sysdeps/x86_64/multiarch/strrchr-evex.S
index 992b45fb47..45487dc87a 100644
--- a/sysdeps/x86_64/multiarch/strrchr-evex.S
+++ b/sysdeps/x86_64/multiarch/strrchr-evex.S
@@ -26,25 +26,30 @@
 #  define STRRCHR	__strrchr_evex
 # endif
 
-# define VMOVU	vmovdqu64
-# define VMOVA	vmovdqa64
+# include "x86-evex256-vecs.h"
 
 # ifdef USE_AS_WCSRCHR
-#  define SHIFT_REG	esi
-
-#  define kunpck	kunpckbw
+#  define RCX_M	cl
+#  define SHIFT_REG	rcx
+#  define VPCOMPRESS	vpcompressd
+#  define kunpck_2x	kunpckbw
 #  define kmov_2x	kmovd
 #  define maskz_2x	ecx
 #  define maskm_2x	eax
 #  define CHAR_SIZE	4
 #  define VPMIN	vpminud
 #  define VPTESTN	vptestnmd
+#  define VPTEST	vptestmd
 #  define VPBROADCAST	vpbroadcastd
+#  define VPCMPEQ	vpcmpeqd
 #  define VPCMP	vpcmpd
-# else
-#  define SHIFT_REG	edi
 
-#  define kunpck	kunpckdq
+#  define USE_WIDE_CHAR
+# else
+#  define RCX_M	ecx
+#  define SHIFT_REG	rdi
+#  define VPCOMPRESS	vpcompressb
+#  define kunpck_2x	kunpckdq
 #  define kmov_2x	kmovq
 #  define maskz_2x	rcx
 #  define maskm_2x	rax
@@ -52,58 +57,48 @@
 #  define CHAR_SIZE	1
 #  define VPMIN	vpminub
 #  define VPTESTN	vptestnmb
+#  define VPTEST	vptestmb
 #  define VPBROADCAST	vpbroadcastb
+#  define VPCMPEQ	vpcmpeqb
 #  define VPCMP	vpcmpb
 # endif
 
-# define XMMZERO	xmm16
-# define YMMZERO	ymm16
-# define YMMMATCH	ymm17
-# define YMMSAVE	ymm18
+# include "reg-macros.h"
 
-# define YMM1	ymm19
-# define YMM2	ymm20
-# define YMM3	ymm21
-# define YMM4	ymm22
-# define YMM5	ymm23
-# define YMM6	ymm24
-# define YMM7	ymm25
-# define YMM8	ymm26
-
-
-# define VEC_SIZE	32
+# define VMATCH	VMM(0)
+# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
 # define PAGE_SIZE	4096
-	.section .text.evex, "ax", @progbits
-ENTRY(STRRCHR)
+
+	.section SECTION(.text), "ax", @progbits
+ENTRY_P2ALIGN(STRRCHR, 6)
 	movl	%edi, %eax
-	/* Broadcast CHAR to YMMMATCH.  */
-	VPBROADCAST %esi, %YMMMATCH
+	/* Broadcast CHAR to VMATCH.  */
+	VPBROADCAST %esi, %VMATCH
 
 	andl	$(PAGE_SIZE - 1), %eax
 	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
 	jg	L(cross_page_boundary)
 
-L(page_cross_continue):
-	VMOVU	(%rdi), %YMM1
-	/* k0 has a 1 for each zero CHAR in YMM1.  */
-	VPTESTN	%YMM1, %YMM1, %k0
-	kmovd	%k0, %ecx
-	testl	%ecx, %ecx
+	VMOVU	(%rdi), %VMM(1)
+	/* k0 has a 1 for each zero CHAR in VEC(1).  */
+	VPTESTN	%VMM(1), %VMM(1), %k0
+	KMOV	%k0, %VRSI
+	test	%VRSI, %VRSI
 	jz	L(aligned_more)
 	/* fallthrough: zero CHAR in first VEC.  */
-
-	/* K1 has a 1 for each search CHAR match in YMM1.  */
-	VPCMP	$0, %YMMMATCH, %YMM1, %k1
-	kmovd	%k1, %eax
+L(page_cross_return):
+	/* K1 has a 1 for each search CHAR match in VEC(1).  */
+	VPCMPEQ	%VMATCH, %VMM(1), %k1
+	KMOV	%k1, %VRAX
 	/* Build mask up until first zero CHAR (used to mask of
 	   potential search CHAR matches past the end of the string).
 	 */
-	blsmskl	%ecx, %ecx
-	andl	%ecx, %eax
+	blsmsk	%VRSI, %VRSI
+	and	%VRSI, %VRAX
 	jz	L(ret0)
-	/* Get last match (the `andl` removed any out of bounds
-	   matches).  */
-	bsrl	%eax, %eax
+	/* Get last match (the `and` removed any out of bounds matches).
+	 */
+	bsr	%VRAX, %VRAX
 # ifdef USE_AS_WCSRCHR
 	leaq	(%rdi, %rax, CHAR_SIZE), %rax
 # else
@@ -116,22 +111,22 @@ L(ret0):
 	   search path for earlier matches.  */
 	.p2align 4,, 6
 L(first_vec_x1):
-	VPCMP	$0, %YMMMATCH, %YMM2, %k1
-	kmovd	%k1, %eax
-	blsmskl	%ecx, %ecx
+	VPCMPEQ	%VMATCH, %VMM(2), %k1
+	KMOV	%k1, %VRAX
+	blsmsk	%VRCX, %VRCX
 	/* eax non-zero if search CHAR in range.  */
-	andl	%ecx, %eax
+	and	%VRCX, %VRAX
 	jnz	L(first_vec_x1_return)
 
-	/* fallthrough: no match in YMM2 then need to check for earlier
-	   matches (in YMM1).  */
+	/* fallthrough: no match in VEC(2), so we need to check for
+	   earlier matches (in VEC(1)).  */
 	.p2align 4,, 4
 L(first_vec_x0_test):
-	VPCMP	$0, %YMMMATCH, %YMM1, %k1
-	kmovd	%k1, %eax
-	testl	%eax, %eax
+	VPCMPEQ	%VMATCH, %VMM(1), %k1
+	KMOV	%k1, %VRAX
+	test	%VRAX, %VRAX
 	jz	L(ret1)
-	bsrl	%eax, %eax
+	bsr	%VRAX, %VRAX
 # ifdef USE_AS_WCSRCHR
 	leaq	(%rsi, %rax, CHAR_SIZE), %rax
 # else
@@ -142,129 +137,144 @@ L(ret1):
 
 	.p2align 4,, 10
 L(first_vec_x1_or_x2):
-	VPCMP	$0, %YMM3, %YMMMATCH, %k3
-	VPCMP	$0, %YMM2, %YMMMATCH, %k2
+	VPCMPEQ	%VMM(3), %VMATCH, %k3
+	VPCMPEQ	%VMM(2), %VMATCH, %k2
 	/* K2 and K3 have 1 for any search CHAR match. Test if any
-	   matches between either of them. Otherwise check YMM1.  */
-	kortestd %k2, %k3
+	   matches between either of them. Otherwise check VEC(1).  */
+	KORTEST %k2, %k3
 	jz	L(first_vec_x0_test)
 
-	/* Guranteed that YMM2 and YMM3 are within range so merge the
-	   two bitmasks then get last result.  */
-	kunpck	%k2, %k3, %k3
-	kmovq	%k3, %rax
-	bsrq	%rax, %rax
-	leaq	(VEC_SIZE)(%r8, %rax, CHAR_SIZE), %rax
+	/* Guaranteed that VEC(2) and VEC(3) are within range so merge
+	   the two bitmasks then get last result.  */
+	kunpck_2x %k2, %k3, %k3
+	kmov_2x	%k3, %maskm_2x
+	bsr	%maskm_2x, %maskm_2x
+	leaq	(VEC_SIZE * 1)(%r8, %rax, CHAR_SIZE), %rax
 	ret
 
-	.p2align 4,, 6
+	.p2align 4,, 7
 L(first_vec_x3):
-	VPCMP	$0, %YMMMATCH, %YMM4, %k1
-	kmovd	%k1, %eax
-	blsmskl	%ecx, %ecx
-	/* If no search CHAR match in range check YMM1/YMM2/YMM3.  */
-	andl	%ecx, %eax
+	VPCMPEQ	%VMATCH, %VMM(4), %k1
+	KMOV	%k1, %VRAX
+	blsmsk	%VRCX, %VRCX
+	/* If no search CHAR match in range check VEC(1)/VEC(2)/VEC(3).
+	 */
+	and	%VRCX, %VRAX
 	jz	L(first_vec_x1_or_x2)
-	bsrl	%eax, %eax
+	bsr	%VRAX, %VRAX
 	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
 	ret
 
+
 	.p2align 4,, 6
 L(first_vec_x0_x1_test):
-	VPCMP	$0, %YMMMATCH, %YMM2, %k1
-	kmovd	%k1, %eax
-	/* Check YMM2 for last match first. If no match try YMM1.  */
-	testl	%eax, %eax
+	VPCMPEQ	%VMATCH, %VMM(2), %k1
+	KMOV	%k1, %VRAX
+	/* Check VEC(2) for last match first. If no match try VEC(1).
+	 */
+	test	%VRAX, %VRAX
 	jz	L(first_vec_x0_test)
 	.p2align 4,, 4
 L(first_vec_x1_return):
-	bsrl	%eax, %eax
+	bsr	%VRAX, %VRAX
 	leaq	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
 	ret
 
+
 	.p2align 4,, 10
 L(first_vec_x2):
-	VPCMP	$0, %YMMMATCH, %YMM3, %k1
-	kmovd	%k1, %eax
-	blsmskl	%ecx, %ecx
-	/* Check YMM3 for last match first. If no match try YMM2/YMM1.
-	 */
-	andl	%ecx, %eax
+	VPCMPEQ	%VMATCH, %VMM(3), %k1
+	KMOV	%k1, %VRAX
+	blsmsk	%VRCX, %VRCX
+	/* Check VEC(3) for last match first. If no match try
+	   VEC(2)/VEC(1).  */
+	and	%VRCX, %VRAX
 	jz	L(first_vec_x0_x1_test)
-	bsrl	%eax, %eax
+	bsr	%VRAX, %VRAX
 	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
 	ret
 
 
-	.p2align 4
+	.p2align 4,, 12
 L(aligned_more):
-	/* Need to keep original pointer incase YMM1 has last match.  */
+L(page_cross_continue):
+	/* Need to keep original pointer in case VEC(1) has the last
+	   match.  */
 	movq	%rdi, %rsi
 	andq	$-VEC_SIZE, %rdi
-	VMOVU	VEC_SIZE(%rdi), %YMM2
-	VPTESTN	%YMM2, %YMM2, %k0
-	kmovd	%k0, %ecx
-	testl	%ecx, %ecx
+
+	VMOVU	VEC_SIZE(%rdi), %VMM(2)
+	VPTESTN	%VMM(2), %VMM(2), %k0
+	KMOV	%k0, %VRCX
+
+	test	%VRCX, %VRCX
 	jnz	L(first_vec_x1)
 
-	VMOVU	(VEC_SIZE * 2)(%rdi), %YMM3
-	VPTESTN	%YMM3, %YMM3, %k0
-	kmovd	%k0, %ecx
-	testl	%ecx, %ecx
+	VMOVU	(VEC_SIZE * 2)(%rdi), %VMM(3)
+	VPTESTN	%VMM(3), %VMM(3), %k0
+	KMOV	%k0, %VRCX
+
+	test	%VRCX, %VRCX
 	jnz	L(first_vec_x2)
 
-	VMOVU	(VEC_SIZE * 3)(%rdi), %YMM4
-	VPTESTN	%YMM4, %YMM4, %k0
-	kmovd	%k0, %ecx
+	VMOVU	(VEC_SIZE * 3)(%rdi), %VMM(4)
+	VPTESTN	%VMM(4), %VMM(4), %k0
+	KMOV	%k0, %VRCX
 	movq	%rdi, %r8
-	testl	%ecx, %ecx
+	test	%VRCX, %VRCX
 	jnz	L(first_vec_x3)
 
 	andq	$-(VEC_SIZE * 2), %rdi
-	.p2align 4
+	.p2align 4,, 10
 L(first_aligned_loop):
-	/* Preserve YMM1, YMM2, YMM3, and YMM4 until we can gurantee
-	   they don't store a match.  */
-	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM5
-	VMOVA	(VEC_SIZE * 5)(%rdi), %YMM6
+	/* Preserve VEC(1), VEC(2), VEC(3), and VEC(4) until we can
+	   guarantee they don't store a match.  */
+	VMOVA	(VEC_SIZE * 4)(%rdi), %VMM(5)
+	VMOVA	(VEC_SIZE * 5)(%rdi), %VMM(6)
 
-	VPCMP	$0, %YMM5, %YMMMATCH, %k2
-	vpxord	%YMM6, %YMMMATCH, %YMM7
+	VPCMPEQ	%VMM(5), %VMATCH, %k2
+	vpxord	%VMM(6), %VMATCH, %VMM(7)
 
-	VPMIN	%YMM5, %YMM6, %YMM8
-	VPMIN	%YMM8, %YMM7, %YMM7
+	VPMIN	%VMM(5), %VMM(6), %VMM(8)
+	VPMIN	%VMM(8), %VMM(7), %VMM(7)
 
-	VPTESTN	%YMM7, %YMM7, %k1
+	VPTESTN	%VMM(7), %VMM(7), %k1
 	subq	$(VEC_SIZE * -2), %rdi
-	kortestd %k1, %k2
+	KORTEST %k1, %k2
 	jz	L(first_aligned_loop)
 
-	VPCMP	$0, %YMM6, %YMMMATCH, %k3
-	VPTESTN	%YMM8, %YMM8, %k1
-	ktestd	%k1, %k1
+	VPCMPEQ	%VMM(6), %VMATCH, %k3
+	VPTESTN	%VMM(8), %VMM(8), %k1
+
+	/* If k1 is zero, then we found a CHAR match but no null-term.
+	   We can now safely throw out VEC1-4.  */
+	KTEST	%k1, %k1
 	jz	L(second_aligned_loop_prep)
 
-	kortestd %k2, %k3
+	KORTEST %k2, %k3
 	jnz	L(return_first_aligned_loop)
 
+
 	.p2align 4,, 6
 L(first_vec_x1_or_x2_or_x3):
-	VPCMP	$0, %YMM4, %YMMMATCH, %k4
-	kmovd	%k4, %eax
-	testl	%eax, %eax
+	VPCMPEQ	%VMM(4), %VMATCH, %k4
+	KMOV	%k4, %VRAX
+	bsr	%VRAX, %VRAX
 	jz	L(first_vec_x1_or_x2)
-	bsrl	%eax, %eax
 	leaq	(VEC_SIZE * 3)(%r8, %rax, CHAR_SIZE), %rax
 	ret
 
+
 	.p2align 4,, 8
 L(return_first_aligned_loop):
-	VPTESTN	%YMM5, %YMM5, %k0
-	kunpck	%k0, %k1, %k0
+	VPTESTN	%VMM(5), %VMM(5), %k0
+
+	/* Combined results from VEC5/6.  */
+	kunpck_2x %k0, %k1, %k0
 	kmov_2x	%k0, %maskz_2x
 
 	blsmsk	%maskz_2x, %maskz_2x
-	kunpck	%k2, %k3, %k3
+	kunpck_2x %k2, %k3, %k3
 	kmov_2x	%k3, %maskm_2x
 	and	%maskz_2x, %maskm_2x
 	jz	L(first_vec_x1_or_x2_or_x3)
@@ -280,47 +290,62 @@ L(return_first_aligned_loop):
 L(second_aligned_loop_prep):
 L(second_aligned_loop_set_furthest_match):
 	movq	%rdi, %rsi
-	kunpck	%k2, %k3, %k4
-
+	/* Ideally we would save k2/k3 but `kmov/kunpck` take uops on
+	   port0 and have noticeable overhead in the loop.  */
+	VMOVA	%VMM(5), %VMM(7)
+	VMOVA	%VMM(6), %VMM(8)
 	.p2align 4
 L(second_aligned_loop):
-	VMOVU	(VEC_SIZE * 4)(%rdi), %YMM1
-	VMOVU	(VEC_SIZE * 5)(%rdi), %YMM2
-
-	VPCMP	$0, %YMM1, %YMMMATCH, %k2
-	vpxord	%YMM2, %YMMMATCH, %YMM3
+	VMOVU	(VEC_SIZE * 4)(%rdi), %VMM(5)
+	VMOVU	(VEC_SIZE * 5)(%rdi), %VMM(6)
+	VPCMPEQ	%VMM(5), %VMATCH, %k2
+	vpxord	%VMM(6), %VMATCH, %VMM(3)
 
-	VPMIN	%YMM1, %YMM2, %YMM4
-	VPMIN	%YMM3, %YMM4, %YMM3
+	VPMIN	%VMM(5), %VMM(6), %VMM(4)
+	VPMIN	%VMM(3), %VMM(4), %VMM(3)
 
-	VPTESTN	%YMM3, %YMM3, %k1
+	VPTESTN	%VMM(3), %VMM(3), %k1
 	subq	$(VEC_SIZE * -2), %rdi
-	kortestd %k1, %k2
+	KORTEST %k1, %k2
 	jz	L(second_aligned_loop)
-
-	VPCMP	$0, %YMM2, %YMMMATCH, %k3
-	VPTESTN	%YMM4, %YMM4, %k1
-	ktestd	%k1, %k1
+	VPCMPEQ	%VMM(6), %VMATCH, %k3
+	VPTESTN	%VMM(4), %VMM(4), %k1
+	KTEST	%k1, %k1
 	jz	L(second_aligned_loop_set_furthest_match)
 
-	kortestd %k2, %k3
-	/* branch here because there is a significant advantage interms
-	   of output dependency chance in using edx.  */
+	/* Branch here because we know we have a match in VEC7/8 but
+	   might not in VEC5/6, so the latter is expected to be less
+	   likely.  */
+	KORTEST %k2, %k3
 	jnz	L(return_new_match)
+
 L(return_old_match):
-	kmovq	%k4, %rax
-	bsrq	%rax, %rax
-	leaq	(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %rax
+	VPCMPEQ	%VMM(8), %VMATCH, %k0
+	KMOV	%k0, %VRCX
+	bsr	%VRCX, %VRCX
+	jnz	L(return_old_match_ret)
+
+	VPCMPEQ	%VMM(7), %VMATCH, %k0
+	KMOV	%k0, %VRCX
+	bsr	%VRCX, %VRCX
+	subq	$VEC_SIZE, %rsi
+L(return_old_match_ret):
+	leaq	(VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %rax
 	ret
 
+	.p2align 4,, 10
 L(return_new_match):
-	VPTESTN	%YMM1, %YMM1, %k0
-	kunpck	%k0, %k1, %k0
+	VPTESTN	%VMM(5), %VMM(5), %k0
+
+	/* Combined results from VEC5/6.  */
+	kunpck_2x %k0, %k1, %k0
 	kmov_2x	%k0, %maskz_2x
 
 	blsmsk	%maskz_2x, %maskz_2x
-	kunpck	%k2, %k3, %k3
+	kunpck_2x %k2, %k3, %k3
 	kmov_2x	%k3, %maskm_2x
+
+	/* Match at end was out-of-bounds so use last known match.  */
 	and	%maskz_2x, %maskm_2x
 	jz	L(return_old_match)
 
@@ -328,49 +353,53 @@ L(return_new_match):
 	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
 	ret
 
+	.p2align 4,, 4
 L(cross_page_boundary):
-	/* eax contains all the page offset bits of src (rdi). `xor rdi,
-	   rax` sets pointer will all page offset bits cleared so
-	   offset of (PAGE_SIZE - VEC_SIZE) will get last aligned VEC
-	   before page cross (guranteed to be safe to read). Doing this
-	   as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax` saves
-	   a bit of code size.  */
 	xorq	%rdi, %rax
-	VMOVU	(PAGE_SIZE - VEC_SIZE)(%rax), %YMM1
-	VPTESTN	%YMM1, %YMM1, %k0
-	kmovd	%k0, %ecx
+	mov	$-1, %VRDX
+	VMOVU	(PAGE_SIZE - VEC_SIZE)(%rax), %VMM(6)
+	VPTESTN	%VMM(6), %VMM(6), %k0
+	KMOV	%k0, %VRSI
+
+# ifdef USE_AS_WCSRCHR
+	movl	%edi, %ecx
+	and	$(VEC_SIZE - 1), %ecx
+	shrl	$2, %ecx
+# endif
+	shlx	%VGPR(SHIFT_REG), %VRDX, %VRDX
 
-	/* Shift out zero CHAR matches that are before the begining of
-	   src (rdi).  */
 # ifdef USE_AS_WCSRCHR
-	movl	%edi, %esi
-	andl	$(VEC_SIZE - 1), %esi
-	shrl	$2, %esi
+	kmovb	%edx, %k1
+# else
+	KMOV	%VRDX, %k1
 # endif
-	shrxl	%SHIFT_REG, %ecx, %ecx
 
-	testl	%ecx, %ecx
+	/* Need to adjust result to VEC(1) so it can be re-used by
+	   L(first_vec_x0_test).  The alternative is to collect VEC(1)
+	   with a page cross load which is far more expensive.  */
+	VPCOMPRESS %VMM(6), %VMM(1){%k1}{z}
+
+	/* We could technically just jmp back after the vpcompress but
+	   it doesn't save any 16-byte blocks.  */
+	shrx	%VGPR(SHIFT_REG), %VRSI, %VRSI
+	test	%VRSI, %VRSI
 	jz	L(page_cross_continue)
 
-	/* Found zero CHAR so need to test for search CHAR.  */
-	VPCMP	$0, %YMMMATCH, %YMM1, %k1
-	kmovd	%k1, %eax
-	/* Shift out search CHAR matches that are before the begining of
-	   src (rdi).  */
-	shrxl	%SHIFT_REG, %eax, %eax
-
-	/* Check if any search CHAR match in range.  */
-	blsmskl	%ecx, %ecx
-	andl	%ecx, %eax
-	jz	L(ret3)
-	bsrl	%eax, %eax
+	/* Duplicate of return logic from ENTRY. Doesn't cause spill to
+	   next cache line so might as well copy it here.  */
+	VPCMPEQ	%VMATCH, %VMM(1), %k1
+	KMOV	%k1, %VRAX
+	blsmsk	%VRSI, %VRSI
+	and	%VRSI, %VRAX
+	jz	L(ret_page_cross)
+	bsr	%VRAX, %VRAX
 # ifdef USE_AS_WCSRCHR
 	leaq	(%rdi, %rax, CHAR_SIZE), %rax
 # else
 	addq	%rdi, %rax
 # endif
-L(ret3):
+L(ret_page_cross):
 	ret
-
+	/* 1 byte till next cache line.  */
 END(STRRCHR)
 #endif
-- 
2.34.1


^ permalink raw reply	[flat|nested] 41+ messages in thread

* [PATCH v2 6/7] x86: Add support for VEC_SIZE == 64 in strcmp-evex.S impl
  2022-10-18 23:19 ` [PATCH v2 " Noah Goldstein
                     ` (3 preceding siblings ...)
  2022-10-18 23:19   ` [PATCH v2 5/7] x86: Optimize strrchr-evex.S and implement with VMM headers Noah Goldstein
@ 2022-10-18 23:19   ` Noah Goldstein
  2022-10-18 23:19   ` [PATCH v2 7/7] Bench: Improve benchtests for memchr, strchr, strnlen, strrchr Noah Goldstein
  5 siblings, 0 replies; 41+ messages in thread
From: Noah Goldstein @ 2022-10-18 23:19 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

Unused at the moment, but evex512 strcmp, strncmp, strcasecmp{l}, and
strncasecmp{l} functions can be added by including strcmp-evex.S with
"x86-evex512-vecs.h" defined.

In addition, save a bit of code size in a few places:

1. tzcnt ...         -> bsf ...
2. vpcmp{b|d} $0 ... -> vpcmpeq{b|d}

This saves a touch of code size but has minimal net effect.
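
Both substitutions drop one byte per instruction.  The encodings below
are for the register forms and are illustrative:

	tzcnt	%ecx, %ecx		/* f3 0f bc c9 (4 bytes) */
	bsf	%ecx, %ecx		/*    0f bc c9 (3 bytes) */

	vpcmpb	$0, %ymm1, %ymm2, %k1	/* EVEX + opcode + modrm + imm8 */
	vpcmpeqb %ymm1, %ymm2, %k1	/* EVEX + opcode + modrm */

Note bsf leaves its destination undefined when the source is zero, so
it is only a valid substitution where the source is known to be
non-zero.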

Full check passes on x86-64.
---
 sysdeps/x86_64/multiarch/strcmp-evex.S | 676 ++++++++++++++++---------
 1 file changed, 430 insertions(+), 246 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
index e482d0167f..756a3bb8d6 100644
--- a/sysdeps/x86_64/multiarch/strcmp-evex.S
+++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
@@ -20,6 +20,10 @@
 
 #if ISA_SHOULD_BUILD (4)
 
+# ifndef VEC_SIZE
+#  include "x86-evex256-vecs.h"
+# endif
+
 # define STRCMP_ISA	_evex
 # include "strcmp-naming.h"
 
@@ -35,41 +39,57 @@
 # define PAGE_SIZE	4096
 
 	/* VEC_SIZE = Number of bytes in a ymm register.  */
-# define VEC_SIZE	32
 # define CHAR_PER_VEC	(VEC_SIZE	/	SIZE_OF_CHAR)
 
-# define VMOVU	vmovdqu64
-# define VMOVA	vmovdqa64
-
 # ifdef USE_AS_WCSCMP
-#  define TESTEQ	subl $0xff,
 	/* Compare packed dwords.  */
 #  define VPCMP	vpcmpd
+#  define VPCMPEQ	vpcmpeqd
 #  define VPMINU	vpminud
 #  define VPTESTM	vptestmd
 #  define VPTESTNM	vptestnmd
 	/* 1 dword char == 4 bytes.  */
 #  define SIZE_OF_CHAR	4
+
+#  define TESTEQ	sub $((1 << CHAR_PER_VEC) - 1),
+
+#  define USE_WIDE_CHAR
 # else
-#  define TESTEQ	incl
 	/* Compare packed bytes.  */
 #  define VPCMP	vpcmpb
+#  define VPCMPEQ	vpcmpeqb
 #  define VPMINU	vpminub
 #  define VPTESTM	vptestmb
 #  define VPTESTNM	vptestnmb
 	/* 1 byte char == 1 byte.  */
 #  define SIZE_OF_CHAR	1
+
+#  define TESTEQ	inc
+# endif
+
+# include "reg-macros.h"
+
+# if VEC_SIZE == 64
+#  define RODATA_SECTION	rodata.cst64
+# else
+#  define RODATA_SECTION	rodata.cst32
+# endif
+
+# if CHAR_PER_VEC == 64
+#  define FALLTHROUGH_RETURN_OFFSET	(VEC_SIZE * 3)
+# else
+#  define FALLTHROUGH_RETURN_OFFSET	(VEC_SIZE * 2)
 # endif
 
 # ifdef USE_AS_STRNCMP
-#  define LOOP_REG	r9d
+#  define LOOP_REG	VR9
 #  define LOOP_REG64	r9
 
 #  define OFFSET_REG8	r9b
 #  define OFFSET_REG	r9d
 #  define OFFSET_REG64	r9
 # else
-#  define LOOP_REG	edx
+#  define LOOP_REG	VRDX
 #  define LOOP_REG64	rdx
 
 #  define OFFSET_REG8	dl
@@ -83,32 +103,6 @@
 #  define VEC_OFFSET	(-VEC_SIZE)
 # endif
 
-# define XMM0	xmm17
-# define XMM1	xmm18
-
-# define XMM10	xmm27
-# define XMM11	xmm28
-# define XMM12	xmm29
-# define XMM13	xmm30
-# define XMM14	xmm31
-
-
-# define YMM0	ymm17
-# define YMM1	ymm18
-# define YMM2	ymm19
-# define YMM3	ymm20
-# define YMM4	ymm21
-# define YMM5	ymm22
-# define YMM6	ymm23
-# define YMM7	ymm24
-# define YMM8	ymm25
-# define YMM9	ymm26
-# define YMM10	ymm27
-# define YMM11	ymm28
-# define YMM12	ymm29
-# define YMM13	ymm30
-# define YMM14	ymm31
-
 # ifdef USE_AS_STRCASECMP_L
 #  define BYTE_LOOP_REG	OFFSET_REG
 # else
@@ -125,61 +119,72 @@
 #  endif
 # endif
 
-# define LCASE_MIN_YMM	%YMM12
-# define LCASE_MAX_YMM	%YMM13
-# define CASE_ADD_YMM	%YMM14
+# define LCASE_MIN_V	VMM(12)
+# define LCASE_MAX_V	VMM(13)
+# define CASE_ADD_V	VMM(14)
 
-# define LCASE_MIN_XMM	%XMM12
-# define LCASE_MAX_XMM	%XMM13
-# define CASE_ADD_XMM	%XMM14
+# if VEC_SIZE == 64
+#  define LCASE_MIN_YMM	VMM_256(12)
+#  define LCASE_MAX_YMM	VMM_256(13)
+#  define CASE_ADD_YMM	VMM_256(14)
+# endif
+
+# define LCASE_MIN_XMM	VMM_128(12)
+# define LCASE_MAX_XMM	VMM_128(13)
+# define CASE_ADD_XMM	VMM_128(14)
 
 	/* NB: wcsncmp uses r11 but strcasecmp is never used in
 	   conjunction with wcscmp.  */
 # define TOLOWER_BASE	%r11
 
 # ifdef USE_AS_STRCASECMP_L
-#  define _REG(x, y) x ## y
-#  define REG(x, y) _REG(x, y)
-#  define TOLOWER(reg1, reg2, ext)										\
-	vpsubb	REG(LCASE_MIN_, ext), reg1, REG(%ext, 10);					\
-	vpsubb	REG(LCASE_MIN_, ext), reg2, REG(%ext, 11);					\
-	vpcmpub	$1, REG(LCASE_MAX_, ext), REG(%ext, 10), %k5;				\
-	vpcmpub	$1, REG(LCASE_MAX_, ext), REG(%ext, 11), %k6;				\
-	vpaddb	reg1, REG(CASE_ADD_, ext), reg1{%k5};						\
-	vpaddb	reg2, REG(CASE_ADD_, ext), reg2{%k6}
-
-#  define TOLOWER_gpr(src, dst) movl (TOLOWER_BASE, src, 4), dst
-#  define TOLOWER_YMM(...)	TOLOWER(__VA_ARGS__, YMM)
-#  define TOLOWER_XMM(...)	TOLOWER(__VA_ARGS__, XMM)
-
-#  define CMP_R1_R2(s1_reg, s2_reg, reg_out, ext)						\
-	TOLOWER	(s1_reg, s2_reg, ext);										\
-	VPCMP	$0, s1_reg, s2_reg, reg_out
-
-#  define CMP_R1_S2(s1_reg, s2_mem, s2_reg, reg_out, ext)				\
-	VMOVU	s2_mem, s2_reg;												\
-	CMP_R1_R2(s1_reg, s2_reg, reg_out, ext)
-
-#  define CMP_R1_R2_YMM(...) CMP_R1_R2(__VA_ARGS__, YMM)
-#  define CMP_R1_R2_XMM(...) CMP_R1_R2(__VA_ARGS__, XMM)
-
-#  define CMP_R1_S2_YMM(...) CMP_R1_S2(__VA_ARGS__, YMM)
-#  define CMP_R1_S2_XMM(...) CMP_R1_S2(__VA_ARGS__, XMM)
+#  define _REG(x, y)	x ## y
+#  define REG(x, y)	_REG(x, y)
+#  define TOLOWER(reg1, reg2, ext, vec_macro)	\
+	vpsubb	%REG(LCASE_MIN_, ext), reg1, %vec_macro(10);	\
+	vpsubb	%REG(LCASE_MIN_, ext), reg2, %vec_macro(11);	\
+	vpcmpub	$1, %REG(LCASE_MAX_, ext), %vec_macro(10), %k5;	\
+	vpcmpub	$1, %REG(LCASE_MAX_, ext), %vec_macro(11), %k6;	\
+	vpaddb	reg1, %REG(CASE_ADD_, ext), reg1{%k5};	\
+	vpaddb	reg2, %REG(CASE_ADD_, ext), reg2{%k6}
+
+#  define TOLOWER_gpr(src, dst)	movl (TOLOWER_BASE, src, 4), dst
+#  define TOLOWER_VMM(...)	TOLOWER(__VA_ARGS__, V, VMM)
+#  define TOLOWER_YMM(...)	TOLOWER(__VA_ARGS__, YMM, VMM_256)
+#  define TOLOWER_XMM(...)	TOLOWER(__VA_ARGS__, XMM, VMM_128)
+
+#  define CMP_R1_R2(s1_reg, s2_reg, reg_out, ext, vec_macro)	\
+	TOLOWER	(s1_reg, s2_reg, ext, vec_macro);	\
+	VPCMPEQ	s1_reg, s2_reg, reg_out
+
+#  define CMP_R1_S2(s1_reg, s2_mem, s2_reg, reg_out, ext, vec_macro)	\
+	VMOVU	s2_mem, s2_reg;	\
+	CMP_R1_R2 (s1_reg, s2_reg, reg_out, ext, vec_macro)
+
+#  define CMP_R1_R2_VMM(...)	CMP_R1_R2(__VA_ARGS__, V, VMM)
+#  define CMP_R1_R2_YMM(...)	CMP_R1_R2(__VA_ARGS__, YMM, VMM_256)
+#  define CMP_R1_R2_XMM(...)	CMP_R1_R2(__VA_ARGS__, XMM, VMM_128)
+
+#  define CMP_R1_S2_VMM(...)	CMP_R1_S2(__VA_ARGS__, V, VMM)
+#  define CMP_R1_S2_YMM(...)	CMP_R1_S2(__VA_ARGS__, YMM, VMM_256)
+#  define CMP_R1_S2_XMM(...)	CMP_R1_S2(__VA_ARGS__, XMM, VMM_128)
 
 # else
 #  define TOLOWER_gpr(...)
+#  define TOLOWER_VMM(...)
 #  define TOLOWER_YMM(...)
 #  define TOLOWER_XMM(...)
 
-#  define CMP_R1_R2_YMM(s1_reg, s2_reg, reg_out)						\
-	VPCMP	$0, s2_reg, s1_reg, reg_out
+#  define CMP_R1_R2_VMM(s1_reg, s2_reg, reg_out)	\
+	VPCMPEQ	s2_reg, s1_reg, reg_out
 
-#  define CMP_R1_R2_XMM(...) CMP_R1_R2_YMM(__VA_ARGS__)
+#  define CMP_R1_R2_YMM(...)	CMP_R1_R2_VMM(__VA_ARGS__)
+#  define CMP_R1_R2_XMM(...)	CMP_R1_R2_VMM(__VA_ARGS__)
 
-#  define CMP_R1_S2_YMM(s1_reg, s2_mem, unused, reg_out)				\
-	VPCMP	$0, s2_mem, s1_reg, reg_out
-
-#  define CMP_R1_S2_XMM(...) CMP_R1_S2_YMM(__VA_ARGS__)
+#  define CMP_R1_S2_VMM(s1_reg, s2_mem, unused, reg_out)	\
+	VPCMPEQ	s2_mem, s1_reg, reg_out
+#  define CMP_R1_S2_YMM(...)	CMP_R1_S2_VMM(__VA_ARGS__)
+#  define CMP_R1_S2_XMM(...)	CMP_R1_S2_VMM(__VA_ARGS__)
 # endif
 
 /* Warning!
@@ -203,7 +208,7 @@
    the maximum offset is reached before a difference is found, zero is
    returned.  */
 
-	.section .text.evex, "ax", @progbits
+	.section SECTION(.text), "ax", @progbits
 	.align	16
 	.type	STRCMP, @function
 	.globl	STRCMP
@@ -232,7 +237,7 @@ STRCMP:
 #  else
 	mov	(%LOCALE_REG), %RAX_LP
 #  endif
-	testl	$1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax)
+	testb	$1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax)
 	jne	STRCASECMP_L_NONASCII
 	leaq	_nl_C_LC_CTYPE_tolower + 128 * 4(%rip), TOLOWER_BASE
 # endif
@@ -254,28 +259,46 @@ STRCMP:
 # endif
 
 # if defined USE_AS_STRCASECMP_L
-	.section .rodata.cst32, "aM", @progbits, 32
-	.align	32
+	.section RODATA_SECTION, "aM", @progbits, VEC_SIZE
+	.align	VEC_SIZE
 L(lcase_min):
 	.quad	0x4141414141414141
 	.quad	0x4141414141414141
 	.quad	0x4141414141414141
 	.quad	0x4141414141414141
+#  if VEC_SIZE == 64
+	.quad	0x4141414141414141
+	.quad	0x4141414141414141
+	.quad	0x4141414141414141
+	.quad	0x4141414141414141
+#  endif
 L(lcase_max):
 	.quad	0x1a1a1a1a1a1a1a1a
 	.quad	0x1a1a1a1a1a1a1a1a
 	.quad	0x1a1a1a1a1a1a1a1a
 	.quad	0x1a1a1a1a1a1a1a1a
+#  if VEC_SIZE == 64
+	.quad	0x1a1a1a1a1a1a1a1a
+	.quad	0x1a1a1a1a1a1a1a1a
+	.quad	0x1a1a1a1a1a1a1a1a
+	.quad	0x1a1a1a1a1a1a1a1a
+#  endif
 L(case_add):
 	.quad	0x2020202020202020
 	.quad	0x2020202020202020
 	.quad	0x2020202020202020
 	.quad	0x2020202020202020
+#  if VEC_SIZE == 64
+	.quad	0x2020202020202020
+	.quad	0x2020202020202020
+	.quad	0x2020202020202020
+	.quad	0x2020202020202020
+#  endif
 	.previous
 
-	vmovdqa64 L(lcase_min)(%rip), LCASE_MIN_YMM
-	vmovdqa64 L(lcase_max)(%rip), LCASE_MAX_YMM
-	vmovdqa64 L(case_add)(%rip), CASE_ADD_YMM
+	VMOVA	L(lcase_min)(%rip), %LCASE_MIN_V
+	VMOVA	L(lcase_max)(%rip), %LCASE_MAX_V
+	VMOVA	L(case_add)(%rip), %CASE_ADD_V
 # endif
 
 	movl	%edi, %eax
@@ -288,12 +311,12 @@ L(case_add):
 
 L(no_page_cross):
 	/* Safe to compare 4x vectors.  */
-	VMOVU	(%rdi), %YMM0
-	VPTESTM	%YMM0, %YMM0, %k2
+	VMOVU	(%rdi), %VMM(0)
+	VPTESTM	%VMM(0), %VMM(0), %k2
 	/* Each bit cleared in K1 represents a mismatch or a null CHAR
 	   in YMM0 and 32 bytes at (%rsi).  */
-	CMP_R1_S2_YMM (%YMM0, (%rsi), %YMM1, %k1){%k2}
-	kmovd	%k1, %ecx
+	CMP_R1_S2_VMM (%VMM(0), (%rsi), %VMM(1), %k1){%k2}
+	KMOV	%k1, %VRCX
 # ifdef USE_AS_STRNCMP
 	cmpq	$CHAR_PER_VEC, %rdx
 	jbe	L(vec_0_test_len)
@@ -303,14 +326,14 @@ L(no_page_cross):
 	   wcscmp/wcsncmp.  */
 
 	/* All 1s represents all equals. TESTEQ will overflow to zero in
-	   all equals case. Otherwise 1s will carry until position of first
-	   mismatch.  */
-	TESTEQ	%ecx
+	   all equals case. Otherwise 1s will carry until position of
+	   first mismatch.  */
+	TESTEQ	%VRCX
 	jz	L(more_3x_vec)
 
 	.p2align 4,, 4
 L(return_vec_0):
-	tzcntl	%ecx, %ecx
+	bsf	%VRCX, %VRCX
 # ifdef USE_AS_WCSCMP
 	movl	(%rdi, %rcx, SIZE_OF_CHAR), %edx
 	xorl	%eax, %eax
@@ -321,7 +344,16 @@ L(return_vec_0):
 	orl	$1, %eax
 # else
 	movzbl	(%rdi, %rcx), %eax
+	/* For VEC_SIZE == 64 use movb instead of movzbl to save a byte
+	   and keep logic for len <= VEC_SIZE (common) in just the
+	   first cache line.  NB: No evex512 processor has partial-
+	   register stalls. If that changes this ifdef can be disabled
+	   without affecting correctness.  */
+#  if !defined USE_AS_STRNCMP && !defined USE_AS_STRCASECMP_L && VEC_SIZE == 64
+	movb	(%rsi, %rcx), %cl
+#  else
 	movzbl	(%rsi, %rcx), %ecx
+#  endif
 	TOLOWER_gpr (%rax, %eax)
 	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
@@ -332,8 +364,8 @@ L(ret0):
 # ifdef USE_AS_STRNCMP
 	.p2align 4,, 4
 L(vec_0_test_len):
-	notl	%ecx
-	bzhil	%edx, %ecx, %eax
+	not	%VRCX
+	bzhi	%VRDX, %VRCX, %VRAX
 	jnz	L(return_vec_0)
 	/* Align if will cross fetch block.  */
 	.p2align 4,, 2
@@ -372,7 +404,7 @@ L(ret1):
 
 	.p2align 4,, 10
 L(return_vec_1):
-	tzcntl	%ecx, %ecx
+	bsf	%VRCX, %VRCX
 # ifdef USE_AS_STRNCMP
 	/* rdx must be > CHAR_PER_VEC so its safe to subtract without
 	   worrying about underflow.  */
@@ -401,24 +433,41 @@ L(ret2):
 	.p2align 4,, 10
 # ifdef USE_AS_STRNCMP
 L(return_vec_3):
-#  if CHAR_PER_VEC <= 16
+#  if CHAR_PER_VEC <= 32
+	/* If CHAR_PER_VEC <= 32 reuse code from L(return_vec_2) without
+	   additional branches by adjusting the bit positions from
+	   VEC3.  We can't do this for CHAR_PER_VEC == 64.  */
+#   if CHAR_PER_VEC <= 16
 	sall	$CHAR_PER_VEC, %ecx
-#  else
+#   else
 	salq	$CHAR_PER_VEC, %rcx
+#   endif
+#  else
+	/* If CHAR_PER_VEC == 64 we can't shift the return GPR so just
+	   check it.  */
+	bsf	%VRCX, %VRCX
+	addl	$(CHAR_PER_VEC), %ecx
+	cmpq	%rcx, %rdx
+	ja	L(ret_vec_3_finish)
+	xorl	%eax, %eax
+	ret
 #  endif
 # endif
+
+	/* If CHAR_PER_VEC == 64 we can't combine matches from the last
+	   2x VEC so we need a separate return label.  */
 L(return_vec_2):
 # if (CHAR_PER_VEC <= 16) || !(defined USE_AS_STRNCMP)
-	tzcntl	%ecx, %ecx
+	bsf	%VRCX, %VRCX
 # else
-	tzcntq	%rcx, %rcx
+	bsfq	%rcx, %rcx
 # endif
-
 # ifdef USE_AS_STRNCMP
 	cmpq	%rcx, %rdx
 	jbe	L(ret_zero)
 # endif
 
+L(ret_vec_3_finish):
 # ifdef USE_AS_WCSCMP
 	movl	(VEC_SIZE * 2)(%rdi, %rcx, SIZE_OF_CHAR), %edx
 	xorl	%eax, %eax
@@ -440,7 +489,7 @@ L(ret3):
 # ifndef USE_AS_STRNCMP
 	.p2align 4,, 10
 L(return_vec_3):
-	tzcntl	%ecx, %ecx
+	bsf	%VRCX, %VRCX
 #  ifdef USE_AS_WCSCMP
 	movl	(VEC_SIZE * 3)(%rdi, %rcx, SIZE_OF_CHAR), %edx
 	xorl	%eax, %eax
@@ -465,11 +514,11 @@ L(ret4):
 	.p2align 5
 L(more_3x_vec):
 	/* Safe to compare 4x vectors.  */
-	VMOVU	(VEC_SIZE)(%rdi), %YMM0
-	VPTESTM	%YMM0, %YMM0, %k2
-	CMP_R1_S2_YMM (%YMM0, VEC_SIZE(%rsi), %YMM1, %k1){%k2}
-	kmovd	%k1, %ecx
-	TESTEQ	%ecx
+	VMOVU	(VEC_SIZE)(%rdi), %VMM(0)
+	VPTESTM	%VMM(0), %VMM(0), %k2
+	CMP_R1_S2_VMM (%VMM(0), VEC_SIZE(%rsi), %VMM(1), %k1){%k2}
+	KMOV	%k1, %VRCX
+	TESTEQ	%VRCX
 	jnz	L(return_vec_1)
 
 # ifdef USE_AS_STRNCMP
@@ -477,18 +526,18 @@ L(more_3x_vec):
 	jbe	L(ret_zero)
 # endif
 
-	VMOVU	(VEC_SIZE * 2)(%rdi), %YMM0
-	VPTESTM	%YMM0, %YMM0, %k2
-	CMP_R1_S2_YMM (%YMM0, (VEC_SIZE * 2)(%rsi), %YMM1, %k1){%k2}
-	kmovd	%k1, %ecx
-	TESTEQ	%ecx
+	VMOVU	(VEC_SIZE * 2)(%rdi), %VMM(0)
+	VPTESTM	%VMM(0), %VMM(0), %k2
+	CMP_R1_S2_VMM (%VMM(0), (VEC_SIZE * 2)(%rsi), %VMM(1), %k1){%k2}
+	KMOV	%k1, %VRCX
+	TESTEQ	%VRCX
 	jnz	L(return_vec_2)
 
-	VMOVU	(VEC_SIZE * 3)(%rdi), %YMM0
-	VPTESTM	%YMM0, %YMM0, %k2
-	CMP_R1_S2_YMM (%YMM0, (VEC_SIZE * 3)(%rsi), %YMM1, %k1){%k2}
-	kmovd	%k1, %ecx
-	TESTEQ	%ecx
+	VMOVU	(VEC_SIZE * 3)(%rdi), %VMM(0)
+	VPTESTM	%VMM(0), %VMM(0), %k2
+	CMP_R1_S2_VMM (%VMM(0), (VEC_SIZE * 3)(%rsi), %VMM(1), %k1){%k2}
+	KMOV	%k1, %VRCX
+	TESTEQ	%VRCX
 	jnz	L(return_vec_3)
 
 # ifdef USE_AS_STRNCMP
@@ -565,110 +614,123 @@ L(loop):
 
 	/* Loop entry after handling page cross during loop.  */
 L(loop_skip_page_cross_check):
-	VMOVA	(VEC_SIZE * 0)(%rdi), %YMM0
-	VMOVA	(VEC_SIZE * 1)(%rdi), %YMM2
-	VMOVA	(VEC_SIZE * 2)(%rdi), %YMM4
-	VMOVA	(VEC_SIZE * 3)(%rdi), %YMM6
+	VMOVA	(VEC_SIZE * 0)(%rdi), %VMM(0)
+	VMOVA	(VEC_SIZE * 1)(%rdi), %VMM(2)
+	VMOVA	(VEC_SIZE * 2)(%rdi), %VMM(4)
+	VMOVA	(VEC_SIZE * 3)(%rdi), %VMM(6)
 
-	VPMINU	%YMM0, %YMM2, %YMM8
-	VPMINU	%YMM4, %YMM6, %YMM9
+	VPMINU	%VMM(0), %VMM(2), %VMM(8)
+	VPMINU	%VMM(4), %VMM(6), %VMM(9)
 
 	/* A zero CHAR in YMM9 means that there is a null CHAR.  */
-	VPMINU	%YMM8, %YMM9, %YMM9
+	VPMINU	%VMM(8), %VMM(9), %VMM(9)
 
 	/* Each bit set in K1 represents a non-null CHAR in YMM9.  */
-	VPTESTM	%YMM9, %YMM9, %k1
+	VPTESTM	%VMM(9), %VMM(9), %k1
 # ifndef USE_AS_STRCASECMP_L
-	vpxorq	(VEC_SIZE * 0)(%rsi), %YMM0, %YMM1
-	vpxorq	(VEC_SIZE * 1)(%rsi), %YMM2, %YMM3
-	vpxorq	(VEC_SIZE * 2)(%rsi), %YMM4, %YMM5
+	vpxorq	(VEC_SIZE * 0)(%rsi), %VMM(0), %VMM(1)
+	vpxorq	(VEC_SIZE * 1)(%rsi), %VMM(2), %VMM(3)
+	vpxorq	(VEC_SIZE * 2)(%rsi), %VMM(4), %VMM(5)
 	/* Ternary logic to xor (VEC_SIZE * 3)(%rsi) with YMM6 while
 	   oring with YMM1. Result is stored in YMM6.  */
-	vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM1, %YMM6
+	vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %VMM(1), %VMM(6)
 # else
-	VMOVU	(VEC_SIZE * 0)(%rsi), %YMM1
-	TOLOWER_YMM (%YMM0, %YMM1)
-	VMOVU	(VEC_SIZE * 1)(%rsi), %YMM3
-	TOLOWER_YMM (%YMM2, %YMM3)
-	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM5
-	TOLOWER_YMM (%YMM4, %YMM5)
-	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM7
-	TOLOWER_YMM (%YMM6, %YMM7)
-	vpxorq	%YMM0, %YMM1, %YMM1
-	vpxorq	%YMM2, %YMM3, %YMM3
-	vpxorq	%YMM4, %YMM5, %YMM5
-	vpternlogd $0xde, %YMM7, %YMM1, %YMM6
+	VMOVU	(VEC_SIZE * 0)(%rsi), %VMM(1)
+	TOLOWER_VMM (%VMM(0), %VMM(1))
+	VMOVU	(VEC_SIZE * 1)(%rsi), %VMM(3)
+	TOLOWER_VMM (%VMM(2), %VMM(3))
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VMM(5)
+	TOLOWER_VMM (%VMM(4), %VMM(5))
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VMM(7)
+	TOLOWER_VMM (%VMM(6), %VMM(7))
+	vpxorq	%VMM(0), %VMM(1), %VMM(1)
+	vpxorq	%VMM(2), %VMM(3), %VMM(3)
+	vpxorq	%VMM(4), %VMM(5), %VMM(5)
+	vpternlogd $0xde, %VMM(7), %VMM(1), %VMM(6)
 # endif
 	/* Or together YMM3, YMM5, and YMM6.  */
-	vpternlogd $0xfe, %YMM3, %YMM5, %YMM6
+	vpternlogd $0xfe, %VMM(3), %VMM(5), %VMM(6)
 
 
 	/* A non-zero CHAR in YMM6 represents a mismatch.  */
-	VPTESTNM %YMM6, %YMM6, %k0{%k1}
-	kmovd	%k0, %LOOP_REG
+	VPTESTNM %VMM(6), %VMM(6), %k0{%k1}
+	KMOV	%k0, %LOOP_REG
 
 	TESTEQ	%LOOP_REG
 	jz	L(loop)
 
 
 	/* Find which VEC has the mismatch of end of string.  */
-	VPTESTM	%YMM0, %YMM0, %k1
-	VPTESTNM %YMM1, %YMM1, %k0{%k1}
-	kmovd	%k0, %ecx
-	TESTEQ	%ecx
+	VPTESTM	%VMM(0), %VMM(0), %k1
+	VPTESTNM %VMM(1), %VMM(1), %k0{%k1}
+	KMOV	%k0, %VRCX
+	TESTEQ	%VRCX
 	jnz	L(return_vec_0_end)
 
-	VPTESTM	%YMM2, %YMM2, %k1
-	VPTESTNM %YMM3, %YMM3, %k0{%k1}
-	kmovd	%k0, %ecx
-	TESTEQ	%ecx
+	VPTESTM	%VMM(2), %VMM(2), %k1
+	VPTESTNM %VMM(3), %VMM(3), %k0{%k1}
+	KMOV	%k0, %VRCX
+	TESTEQ	%VRCX
 	jnz	L(return_vec_1_end)
 
 
-	/* Handle VEC 2 and 3 without branches.  */
+	/* Handle VEC 2 and 3 without branches if CHAR_PER_VEC <= 32.
+	 */
 L(return_vec_2_3_end):
 # ifdef USE_AS_STRNCMP
 	subq	$(CHAR_PER_VEC * 2), %rdx
 	jbe	L(ret_zero_end)
 # endif
 
-	VPTESTM	%YMM4, %YMM4, %k1
-	VPTESTNM %YMM5, %YMM5, %k0{%k1}
-	kmovd	%k0, %ecx
-	TESTEQ	%ecx
+	VPTESTM	%VMM(4), %VMM(4), %k1
+	VPTESTNM %VMM(5), %VMM(5), %k0{%k1}
+	KMOV	%k0, %VRCX
+	TESTEQ	%VRCX
 # if CHAR_PER_VEC <= 16
 	sall	$CHAR_PER_VEC, %LOOP_REG
 	orl	%ecx, %LOOP_REG
-# else
+# elif CHAR_PER_VEC <= 32
 	salq	$CHAR_PER_VEC, %LOOP_REG64
 	orq	%rcx, %LOOP_REG64
+# else
+	/* We aren't combining the last 2x VEC so branch on the second
+	   to last.  */
+	jnz	L(return_vec_2_end)
 # endif
-L(return_vec_3_end):
+
 	/* LOOP_REG contains matches for null/mismatch from the loop. If
-	   VEC 0,1,and 2 all have no null and no mismatches then mismatch
-	   must entirely be from VEC 3 which is fully represented by
-	   LOOP_REG.  */
+	   VEC 0, 1, and 2 all have no null and no mismatches then the
+	   mismatch must entirely be from VEC 3 which is fully
+	   represented by LOOP_REG.  */
 # if CHAR_PER_VEC <= 16
-	tzcntl	%LOOP_REG, %LOOP_REG
+	bsf	%LOOP_REG, %LOOP_REG
 # else
-	tzcntq	%LOOP_REG64, %LOOP_REG64
+	bsfq	%LOOP_REG64, %LOOP_REG64
 # endif
 # ifdef USE_AS_STRNCMP
+
+	/* If CHAR_PER_VEC == 64 we can't combine the last 2x VEC so we
+	   need to adjust the length before the last comparison.  */
+#  if CHAR_PER_VEC == 64
+	subq	$CHAR_PER_VEC, %rdx
+	jbe	L(ret_zero_end)
+#  endif
+
 	cmpq	%LOOP_REG64, %rdx
 	jbe	L(ret_zero_end)
 # endif
 
 # ifdef USE_AS_WCSCMP
-	movl	(VEC_SIZE * 2)(%rdi, %LOOP_REG64, SIZE_OF_CHAR), %ecx
+	movl	(FALLTHROUGH_RETURN_OFFSET)(%rdi, %LOOP_REG64, SIZE_OF_CHAR), %ecx
 	xorl	%eax, %eax
-	cmpl	(VEC_SIZE * 2)(%rsi, %LOOP_REG64, SIZE_OF_CHAR), %ecx
+	cmpl	(FALLTHROUGH_RETURN_OFFSET)(%rsi, %LOOP_REG64, SIZE_OF_CHAR), %ecx
 	je	L(ret5)
 	setl	%al
 	negl	%eax
 	xorl	%r8d, %eax
 # else
-	movzbl	(VEC_SIZE * 2)(%rdi, %LOOP_REG64), %eax
-	movzbl	(VEC_SIZE * 2)(%rsi, %LOOP_REG64), %ecx
+	movzbl	(FALLTHROUGH_RETURN_OFFSET)(%rdi, %LOOP_REG64), %eax
+	movzbl	(FALLTHROUGH_RETURN_OFFSET)(%rsi, %LOOP_REG64), %ecx
 	TOLOWER_gpr (%rax, %eax)
 	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
@@ -686,23 +748,39 @@ L(ret_zero_end):
 # endif
 
 
+
 	/* The L(return_vec_N_end) differ from L(return_vec_N) in that
-	   they use the value of `r8` to negate the return value. This is
-	   because the page cross logic can swap `rdi` and `rsi`.  */
+	   they use the value of `r8` to negate the return value. This
+	   is because the page cross logic can swap `rdi` and `rsi`.
+	 */
 	.p2align 4,, 10
 # ifdef USE_AS_STRNCMP
 L(return_vec_1_end):
-#  if CHAR_PER_VEC <= 16
+#  if CHAR_PER_VEC <= 32
+	/* If CHAR_PER_VEC <= 32 reuse code from L(return_vec_0_end)
+	   without additional branches by adjusting the bit positions
+	   from VEC1.  We can't do this for CHAR_PER_VEC == 64.  */
+#   if CHAR_PER_VEC <= 16
 	sall	$CHAR_PER_VEC, %ecx
-#  else
+#   else
 	salq	$CHAR_PER_VEC, %rcx
+#   endif
+#  else
+	/* If CHAR_PER_VEC == 64 we can't shift the return GPR so just
+	   check it.  */
+	bsf	%VRCX, %VRCX
+	addl	$(CHAR_PER_VEC), %ecx
+	cmpq	%rcx, %rdx
+	ja	L(ret_vec_0_end_finish)
+	xorl	%eax, %eax
+	ret
 #  endif
 # endif
 L(return_vec_0_end):
 # if (CHAR_PER_VEC <= 16) || !(defined USE_AS_STRNCMP)
-	tzcntl	%ecx, %ecx
+	bsf	%VRCX, %VRCX
 # else
-	tzcntq	%rcx, %rcx
+	bsfq	%rcx, %rcx
 # endif
 
 # ifdef USE_AS_STRNCMP
@@ -710,6 +788,7 @@ L(return_vec_0_end):
 	jbe	L(ret_zero_end)
 # endif
 
+L(ret_vec_0_end_finish):
 # ifdef USE_AS_WCSCMP
 	movl	(%rdi, %rcx, SIZE_OF_CHAR), %edx
 	xorl	%eax, %eax
@@ -737,7 +816,7 @@ L(ret6):
 # ifndef USE_AS_STRNCMP
 	.p2align 4,, 10
 L(return_vec_1_end):
-	tzcntl	%ecx, %ecx
+	bsf	%VRCX, %VRCX
 #  ifdef USE_AS_WCSCMP
 	movl	VEC_SIZE(%rdi, %rcx, SIZE_OF_CHAR), %edx
 	xorl	%eax, %eax
@@ -760,6 +839,41 @@ L(ret7):
 # endif
 
 
+	/* If CHAR_PER_VEC == 64 we can't combine matches from the last
+	   2x VEC so we need a separate return label.  */
+# if CHAR_PER_VEC == 64
+L(return_vec_2_end):
+	bsf	%VRCX, %VRCX
+#  ifdef USE_AS_STRNCMP
+	cmpq	%rcx, %rdx
+	jbe	L(ret_zero_end)
+#  endif
+#  ifdef USE_AS_WCSCMP
+	movl	(VEC_SIZE * 2)(%rdi, %rcx, SIZE_OF_CHAR), %edx
+	xorl	%eax, %eax
+	cmpl	(VEC_SIZE * 2)(%rsi, %rcx, SIZE_OF_CHAR), %edx
+	je	L(ret31)
+	setl	%al
+	negl	%eax
+	/* This is the non-zero case for `eax` so just xorl with `r8d`
+	   to flip it if `rdi` and `rsi` were swapped.  */
+	xorl	%r8d, %eax
+#  else
+	movzbl	(VEC_SIZE * 2)(%rdi, %rcx), %eax
+	movzbl	(VEC_SIZE * 2)(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
+	subl	%ecx, %eax
+	/* Flip `eax` if `rdi` and `rsi` were swapped in the page cross
+	   logic. Subtract `r8d` after the xor for the zero case.  */
+	xorl	%r8d, %eax
+	subl	%r8d, %eax
+#  endif
+L(ret13):
+	ret
+# endif
+
+
 	/* Page cross in rsi in next 4x VEC.  */
 
 	/* TODO: Improve logic here.  */
@@ -778,11 +892,11 @@ L(page_cross_during_loop):
 	cmpl	$-(VEC_SIZE * 3), %eax
 	jle	L(less_1x_vec_till_page_cross)
 
-	VMOVA	(%rdi), %YMM0
-	VPTESTM	%YMM0, %YMM0, %k2
-	CMP_R1_S2_YMM (%YMM0, (%rsi), %YMM1, %k1){%k2}
-	kmovd	%k1, %ecx
-	TESTEQ	%ecx
+	VMOVA	(%rdi), %VMM(0)
+	VPTESTM	%VMM(0), %VMM(0), %k2
+	CMP_R1_S2_VMM (%VMM(0), (%rsi), %VMM(1), %k1){%k2}
+	KMOV	%k1, %VRCX
+	TESTEQ	%VRCX
 	jnz	L(return_vec_0_end)
 
 	/* if distance >= 2x VEC then eax > -(VEC_SIZE * 2).  */
@@ -799,9 +913,9 @@ L(less_1x_vec_till_page_cross):
 	   to read back -VEC_SIZE. If rdi is truly at the start of a page
 	   here, it means the previous page (rdi - VEC_SIZE) has already
 	   been loaded earlier so must be valid.  */
-	VMOVU	-VEC_SIZE(%rdi, %rax), %YMM0
-	VPTESTM	%YMM0, %YMM0, %k2
-	CMP_R1_S2_YMM (%YMM0, -VEC_SIZE(%rsi, %rax), %YMM1, %k1){%k2}
+	VMOVU	-VEC_SIZE(%rdi, %rax), %VMM(0)
+	VPTESTM	%VMM(0), %VMM(0), %k2
+	CMP_R1_S2_VMM (%VMM(0), -VEC_SIZE(%rsi, %rax), %VMM(1), %k1){%k2}
 	/* Mask of potentially valid bits. The lower bits can be out of
 	   range comparisons (but safe regarding page crosses).  */
 
@@ -813,12 +927,12 @@ L(less_1x_vec_till_page_cross):
 	shlxl	%ecx, %r10d, %ecx
 	movzbl	%cl, %r10d
 # else
-	movl	$-1, %ecx
-	shlxl	%esi, %ecx, %r10d
+	mov	$-1, %VRCX
+	shlx	%VRSI, %VRCX, %VR10
 # endif
 
-	kmovd	%k1, %ecx
-	notl	%ecx
+	KMOV	%k1, %VRCX
+	not	%VRCX
 
 
 # ifdef USE_AS_STRNCMP
@@ -838,12 +952,10 @@ L(less_1x_vec_till_page_cross):
 	/* Readjust eax before potentially returning to the loop.  */
 	addl	$(PAGE_SIZE - VEC_SIZE * 4), %eax
 
-	andl	%r10d, %ecx
+	and	%VR10, %VRCX
 	jz	L(loop_skip_page_cross_check)
 
-	.p2align 4,, 3
-L(return_page_cross_end):
-	tzcntl	%ecx, %ecx
+	bsf	%VRCX, %VRCX
 
 # if (defined USE_AS_STRNCMP) || (defined USE_AS_WCSCMP)
 	leal	-VEC_SIZE(%OFFSET_REG64, %rcx, SIZE_OF_CHAR), %ecx
@@ -874,8 +986,12 @@ L(ret8):
 # ifdef USE_AS_STRNCMP
 	.p2align 4,, 10
 L(return_page_cross_end_check):
-	andl	%r10d, %ecx
-	tzcntl	%ecx, %ecx
+	and	%VR10, %VRCX
+	/* Need to use tzcnt here as VRCX may be zero.  If VRCX is zero
+	   tzcnt(VRCX) will be CHAR_PER_VEC and the remaining length
+	   (edx) is guaranteed to be <= CHAR_PER_VEC so we will only use
+	   the return idx if VRCX was non-zero.  */
+	tzcnt	%VRCX, %VRCX
 	leal	-VEC_SIZE(%rax, %rcx, SIZE_OF_CHAR), %ecx
 #  ifdef USE_AS_WCSCMP
 	sall	$2, %edx
@@ -892,11 +1008,11 @@ L(more_2x_vec_till_page_cross):
 	/* If more 2x vec till cross we will complete a full loop
 	   iteration here.  */
 
-	VMOVA	VEC_SIZE(%rdi), %YMM0
-	VPTESTM	%YMM0, %YMM0, %k2
-	CMP_R1_S2_YMM (%YMM0, VEC_SIZE(%rsi), %YMM1, %k1){%k2}
-	kmovd	%k1, %ecx
-	TESTEQ	%ecx
+	VMOVA	VEC_SIZE(%rdi), %VMM(0)
+	VPTESTM	%VMM(0), %VMM(0), %k2
+	CMP_R1_S2_VMM (%VMM(0), VEC_SIZE(%rsi), %VMM(1), %k1){%k2}
+	KMOV	%k1, %VRCX
+	TESTEQ	%VRCX
 	jnz	L(return_vec_1_end)
 
 # ifdef USE_AS_STRNCMP
@@ -907,18 +1023,18 @@ L(more_2x_vec_till_page_cross):
 	subl	$-(VEC_SIZE * 4), %eax
 
 	/* Safe to include comparisons from lower bytes.  */
-	VMOVU	-(VEC_SIZE * 2)(%rdi, %rax), %YMM0
-	VPTESTM	%YMM0, %YMM0, %k2
-	CMP_R1_S2_YMM (%YMM0, -(VEC_SIZE * 2)(%rsi, %rax), %YMM1, %k1){%k2}
-	kmovd	%k1, %ecx
-	TESTEQ	%ecx
+	VMOVU	-(VEC_SIZE * 2)(%rdi, %rax), %VMM(0)
+	VPTESTM	%VMM(0), %VMM(0), %k2
+	CMP_R1_S2_VMM (%VMM(0), -(VEC_SIZE * 2)(%rsi, %rax), %VMM(1), %k1){%k2}
+	KMOV	%k1, %VRCX
+	TESTEQ	%VRCX
 	jnz	L(return_vec_page_cross_0)
 
-	VMOVU	-(VEC_SIZE * 1)(%rdi, %rax), %YMM0
-	VPTESTM	%YMM0, %YMM0, %k2
-	CMP_R1_S2_YMM (%YMM0, -(VEC_SIZE * 1)(%rsi, %rax), %YMM1, %k1){%k2}
-	kmovd	%k1, %ecx
-	TESTEQ	%ecx
+	VMOVU	-(VEC_SIZE * 1)(%rdi, %rax), %VMM(0)
+	VPTESTM	%VMM(0), %VMM(0), %k2
+	CMP_R1_S2_VMM (%VMM(0), -(VEC_SIZE * 1)(%rsi, %rax), %VMM(1), %k1){%k2}
+	KMOV	%k1, %VRCX
+	TESTEQ	%VRCX
 	jnz	L(return_vec_page_cross_1)
 
 # ifdef USE_AS_STRNCMP
@@ -937,30 +1053,30 @@ L(more_2x_vec_till_page_cross):
 # endif
 
 	/* Finish the loop.  */
-	VMOVA	(VEC_SIZE * 2)(%rdi), %YMM4
-	VMOVA	(VEC_SIZE * 3)(%rdi), %YMM6
-	VPMINU	%YMM4, %YMM6, %YMM9
-	VPTESTM	%YMM9, %YMM9, %k1
+	VMOVA	(VEC_SIZE * 2)(%rdi), %VMM(4)
+	VMOVA	(VEC_SIZE * 3)(%rdi), %VMM(6)
+	VPMINU	%VMM(4), %VMM(6), %VMM(9)
+	VPTESTM	%VMM(9), %VMM(9), %k1
 # ifndef USE_AS_STRCASECMP_L
-	vpxorq	(VEC_SIZE * 2)(%rsi), %YMM4, %YMM5
+	vpxorq	(VEC_SIZE * 2)(%rsi), %VMM(4), %VMM(5)
 	/* YMM6 = YMM5 | ((VEC_SIZE * 3)(%rsi) ^ YMM6).  */
-	vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM5, %YMM6
+	vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %VMM(5), %VMM(6)
 # else
-	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM5
-	TOLOWER_YMM (%YMM4, %YMM5)
-	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM7
-	TOLOWER_YMM (%YMM6, %YMM7)
-	vpxorq	%YMM4, %YMM5, %YMM5
-	vpternlogd $0xde, %YMM7, %YMM5, %YMM6
-# endif
-	VPTESTNM %YMM6, %YMM6, %k0{%k1}
-	kmovd	%k0, %LOOP_REG
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VMM(5)
+	TOLOWER_VMM (%VMM(4), %VMM(5))
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VMM(7)
+	TOLOWER_VMM (%VMM(6), %VMM(7))
+	vpxorq	%VMM(4), %VMM(5), %VMM(5)
+	vpternlogd $0xde, %VMM(7), %VMM(5), %VMM(6)
+# endif
+	VPTESTNM %VMM(6), %VMM(6), %k0{%k1}
+	KMOV	%k0, %LOOP_REG
 	TESTEQ	%LOOP_REG
 	jnz	L(return_vec_2_3_end)
 
 	/* Best for code size to include ucond-jmp here. Would be faster
-	   if this case is hot to duplicate the L(return_vec_2_3_end) code
-	   as fall-through and have jump back to loop on mismatch
+	   if this case is hot to duplicate the L(return_vec_2_3_end)
+	   code as fall-through and have jump back to loop on mismatch
 	   comparison.  */
 	subq	$-(VEC_SIZE * 4), %rdi
 	subq	$-(VEC_SIZE * 4), %rsi
@@ -980,7 +1096,7 @@ L(ret_zero_in_loop_page_cross):
 L(return_vec_page_cross_0):
 	addl	$-VEC_SIZE, %eax
 L(return_vec_page_cross_1):
-	tzcntl	%ecx, %ecx
+	bsf	%VRCX, %VRCX
 # if defined USE_AS_STRNCMP || defined USE_AS_WCSCMP
 	leal	-VEC_SIZE(%rax, %rcx, SIZE_OF_CHAR), %ecx
 #  ifdef USE_AS_STRNCMP
@@ -1023,8 +1139,8 @@ L(ret9):
 L(page_cross):
 # ifndef USE_AS_STRNCMP
 	/* If both are VEC aligned we don't need any special logic here.
-	   Only valid for strcmp where stop condition is guranteed to be
-	   reachable by just reading memory.  */
+	   Only valid for strcmp where the stop condition is guaranteed
+	   to be reachable by just reading memory.  */
 	testl	$((VEC_SIZE - 1) << 20), %eax
 	jz	L(no_page_cross)
 # endif
@@ -1065,11 +1181,11 @@ L(page_cross):
 	   loadable memory until within 1x VEC of page cross.  */
 	.p2align 4,, 8
 L(page_cross_loop):
-	VMOVU	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0
-	VPTESTM	%YMM0, %YMM0, %k2
-	CMP_R1_S2_YMM (%YMM0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM1, %k1){%k2}
-	kmovd	%k1, %ecx
-	TESTEQ	%ecx
+	VMOVU	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %VMM(0)
+	VPTESTM	%VMM(0), %VMM(0), %k2
+	CMP_R1_S2_VMM (%VMM(0), (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %VMM(1), %k1){%k2}
+	KMOV	%k1, %VRCX
+	TESTEQ	%VRCX
 	jnz	L(check_ret_vec_page_cross)
 	addl	$CHAR_PER_VEC, %OFFSET_REG
 # ifdef USE_AS_STRNCMP
@@ -1087,13 +1203,13 @@ L(page_cross_loop):
 	subl	%eax, %OFFSET_REG
 	/* OFFSET_REG has distance to page cross - VEC_SIZE. Guranteed
 	   to not cross page so is safe to load. Since we have already
-	   loaded at least 1 VEC from rsi it is also guranteed to be safe.
-	 */
-	VMOVU	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0
-	VPTESTM	%YMM0, %YMM0, %k2
-	CMP_R1_S2_YMM (%YMM0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM1, %k1){%k2}
+	   loaded at least 1 VEC from rsi it is also guaranteed to be
+	   safe.  */
+	VMOVU	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %VMM(0)
+	VPTESTM	%VMM(0), %VMM(0), %k2
+	CMP_R1_S2_VMM (%VMM(0), (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %VMM(1), %k1){%k2}
 
-	kmovd	%k1, %ecx
+	KMOV	%k1, %VRCX
 # ifdef USE_AS_STRNCMP
 	leal	CHAR_PER_VEC(%OFFSET_REG64), %eax
 	cmpq	%rax, %rdx
@@ -1104,7 +1220,7 @@ L(page_cross_loop):
 	addq	%rdi, %rdx
 #  endif
 # endif
-	TESTEQ	%ecx
+	TESTEQ	%VRCX
 	jz	L(prepare_loop_no_len)
 
 	.p2align 4,, 4
@@ -1112,7 +1228,7 @@ L(ret_vec_page_cross):
 # ifndef USE_AS_STRNCMP
 L(check_ret_vec_page_cross):
 # endif
-	tzcntl	%ecx, %ecx
+	tzcnt	%VRCX, %VRCX
 	addl	%OFFSET_REG, %ecx
 L(ret_vec_page_cross_cont):
 # ifdef USE_AS_WCSCMP
@@ -1139,9 +1255,9 @@ L(ret12):
 # ifdef USE_AS_STRNCMP
 	.p2align 4,, 10
 L(check_ret_vec_page_cross2):
-	TESTEQ	%ecx
+	TESTEQ	%VRCX
 L(check_ret_vec_page_cross):
-	tzcntl	%ecx, %ecx
+	tzcnt	%VRCX, %VRCX
 	addl	%OFFSET_REG, %ecx
 	cmpq	%rcx, %rdx
 	ja	L(ret_vec_page_cross_cont)
@@ -1180,8 +1296,71 @@ L(less_1x_vec_till_page):
 # ifdef USE_AS_WCSCMP
 	shrl	$2, %eax
 # endif
+
+	/* Find the largest load size we can use. For VEC_SIZE == 64
+	   only check if we can do a full ymm load.  */
+# if VEC_SIZE == 64
+
+	cmpl	$((VEC_SIZE - 32) / SIZE_OF_CHAR), %eax
+	ja	L(less_32_till_page)
+
+
+	/* Use 32 byte (ymm) comparison.  */
+	VMOVU	(%rdi), %VMM_256(0)
+	VPTESTM	%VMM_256(0), %VMM_256(0), %k2
+	CMP_R1_S2_YMM (%VMM_256(0), (%rsi), %VMM_256(1), %k1){%k2}
+	kmovd	%k1, %ecx
+#  ifdef USE_AS_WCSCMP
+	subl	$0xff, %ecx
+#  else
+	incl	%ecx
+#  endif
+	jnz	L(check_ret_vec_page_cross)
+	movl	$((VEC_SIZE - 32) / SIZE_OF_CHAR), %OFFSET_REG
+#  ifdef USE_AS_STRNCMP
+	cmpq	%OFFSET_REG64, %rdx
+	jbe	L(ret_zero_page_cross_slow_case64)
+	subl	%eax, %OFFSET_REG
+#  else
+	/* Explicit check for 32 byte alignment.  */
+	subl	%eax, %OFFSET_REG
+	jz	L(prepare_loop)
+#  endif
+	VMOVU	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %VMM_256(0)
+	VPTESTM	%VMM_256(0), %VMM_256(0), %k2
+	CMP_R1_S2_YMM (%VMM_256(0), (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %VMM_256(1), %k1){%k2}
+	kmovd	%k1, %ecx
+#  ifdef USE_AS_WCSCMP
+	subl	$0xff, %ecx
+#  else
+	incl	%ecx
+#  endif
+	jnz	L(check_ret_vec_page_cross)
+#  ifdef USE_AS_STRNCMP
+	addl	$(32 / SIZE_OF_CHAR), %OFFSET_REG
+	subq	%OFFSET_REG64, %rdx
+	jbe	L(ret_zero_page_cross_slow_case64)
+	subq	$-(CHAR_PER_VEC * 4), %rdx
+
+	leaq	-(VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
+	leaq	-(VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
+#  else
+	leaq	(32 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
+	leaq	(32 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
+#  endif
+	jmp	L(prepare_loop_aligned)
+
+#  ifdef USE_AS_STRNCMP
+	.p2align 4,, 2
+L(ret_zero_page_cross_slow_case64):
+	xorl	%eax, %eax
+	ret
+#  endif
+L(less_32_till_page):
+# endif
+
 	/* Find largest load size we can use.  */
-	cmpl	$(16 / SIZE_OF_CHAR), %eax
+	cmpl	$((VEC_SIZE - 16) / SIZE_OF_CHAR), %eax
 	ja	L(less_16_till_page)
 
 	/* Use 16 byte comparison.  */
@@ -1195,9 +1374,14 @@ L(less_1x_vec_till_page):
 	incw	%cx
 # endif
 	jnz	L(check_ret_vec_page_cross)
-	movl	$(16 / SIZE_OF_CHAR), %OFFSET_REG
+
+	movl	$((VEC_SIZE - 16) / SIZE_OF_CHAR), %OFFSET_REG
 # ifdef USE_AS_STRNCMP
+#  if VEC_SIZE == 32
 	cmpq	%OFFSET_REG64, %rdx
+#  else
+	cmpq	$(16 / SIZE_OF_CHAR), %rdx
+#  endif
 	jbe	L(ret_zero_page_cross_slow_case0)
 	subl	%eax, %OFFSET_REG
 # else
@@ -1239,7 +1423,7 @@ L(ret_zero_page_cross_slow_case0):
 
 	.p2align 4,, 10
 L(less_16_till_page):
-	cmpl	$(24 / SIZE_OF_CHAR), %eax
+	cmpl	$((VEC_SIZE - 8) / SIZE_OF_CHAR), %eax
 	ja	L(less_8_till_page)
 
 	/* Use 8 byte comparison.  */
@@ -1260,7 +1444,7 @@ L(less_16_till_page):
 	cmpq	$(8 / SIZE_OF_CHAR), %rdx
 	jbe	L(ret_zero_page_cross_slow_case0)
 # endif
-	movl	$(24 / SIZE_OF_CHAR), %OFFSET_REG
+	movl	$((VEC_SIZE - 8) / SIZE_OF_CHAR), %OFFSET_REG
 	subl	%eax, %OFFSET_REG
 
 	vmovq	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
@@ -1320,7 +1504,7 @@ L(ret_less_8_wcs):
 	ret
 
 # else
-	cmpl	$28, %eax
+	cmpl	$(VEC_SIZE - 4), %eax
 	ja	L(less_4_till_page)
 
 	vmovd	(%rdi), %xmm0
@@ -1335,7 +1519,7 @@ L(ret_less_8_wcs):
 	cmpq	$4, %rdx
 	jbe	L(ret_zero_page_cross_slow_case1)
 #  endif
-	movl	$(28 / SIZE_OF_CHAR), %OFFSET_REG
+	movl	$((VEC_SIZE - 4) / SIZE_OF_CHAR), %OFFSET_REG
 	subl	%eax, %OFFSET_REG
 
 	vmovd	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
@@ -1386,7 +1570,7 @@ L(less_4_loop):
 #  endif
 	incq	%rdi
 	/* end condition is reach page boundary (rdi is aligned).  */
-	testl	$31, %edi
+	testb	$(VEC_SIZE - 1), %dil
 	jnz	L(less_4_loop)
 	leaq	-(VEC_SIZE * 4)(%rdi, %rsi), %rsi
 	addq	$-(VEC_SIZE * 4), %rdi
-- 
2.34.1


^ permalink raw reply	[flat|nested] 41+ messages in thread

* [PATCH v2 7/7] Bench: Improve benchtests for memchr, strchr, strnlen, strrchr
  2022-10-18 23:19 ` [PATCH v2 " Noah Goldstein
                     ` (4 preceding siblings ...)
  2022-10-18 23:19   ` [PATCH v2 6/7] x86: Add support for VEC_SIZE == 64 in strcmp-evex.S impl Noah Goldstein
@ 2022-10-18 23:19   ` Noah Goldstein
  2022-10-19  0:01     ` H.J. Lu
  5 siblings, 1 reply; 41+ messages in thread
From: Noah Goldstein @ 2022-10-18 23:19 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

1. Add more complete coverage in the medium size range.
2. In strnlen remove the `1 << i`, which was UB (`i` could go beyond
   32/64); see the sketch after this list.
3. Add timer for total benchmark runtime (useful for deciding about
   tradeoff between coverage and runtime).
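
A minimal C sketch of the UB in point 2 (function names are
illustrative, not the actual benchtest code):

	#include <stddef.h>

	/* Shifting by a count >= the width of the (promoted) type is
	   undefined behavior in C, and the old loop bounds let `i`
	   grow past 32/64.  */
	size_t
	old_pos (size_t i)
	{
	  return 1 << i;	/* UB once i >= 32 (int is 32 bits).  */
	}

	/* The patch instead derives test sizes directly from the loop
	   counters.  */
	size_t
	new_pos (size_t i, size_t j)
	{
	  return i + j;
	}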
---
 benchtests/bench-memchr.c    | 77 +++++++++++++++++++++++++-----------
 benchtests/bench-rawmemchr.c | 30 ++++++++++++--
 benchtests/bench-strchr.c    | 35 +++++++++++-----
 benchtests/bench-strnlen.c   | 12 +++---
 benchtests/bench-strrchr.c   | 28 ++++++++++++-
 5 files changed, 137 insertions(+), 45 deletions(-)

diff --git a/benchtests/bench-memchr.c b/benchtests/bench-memchr.c
index 0facda2fa0..2ec9dd86d0 100644
--- a/benchtests/bench-memchr.c
+++ b/benchtests/bench-memchr.c
@@ -126,7 +126,7 @@ do_test (json_ctx_t *json_ctx, size_t align, size_t pos, size_t len,
 int
 test_main (void)
 {
-  size_t i;
+  size_t i, j, al, al_max;
   int repeats;
   json_ctx_t json_ctx;
   test_init ();
@@ -147,35 +147,46 @@ test_main (void)
 
   json_array_begin (&json_ctx, "results");
 
+  al_max = 0;
+#ifdef USE_AS_MEMRCHR
+  al_max = getpagesize () / 2;
+#endif
+
   for (repeats = 0; repeats < 2; ++repeats)
     {
-      for (i = 1; i < 8; ++i)
+      for (al = 0; al <= al_max; al += getpagesize () / 2)
 	{
-	  do_test (&json_ctx, 0, 16 << i, 2048, 23, repeats);
-	  do_test (&json_ctx, i, 64, 256, 23, repeats);
-	  do_test (&json_ctx, 0, 16 << i, 2048, 0, repeats);
-	  do_test (&json_ctx, i, 64, 256, 0, repeats);
-
-	  do_test (&json_ctx, getpagesize () - 15, 64, 256, 0, repeats);
+	  for (i = 1; i < 8; ++i)
+	    {
+	      do_test (&json_ctx, al, 16 << i, 2048, 23, repeats);
+	      do_test (&json_ctx, al + i, 64, 256, 23, repeats);
+	      do_test (&json_ctx, al, 16 << i, 2048, 0, repeats);
+	      do_test (&json_ctx, al + i, 64, 256, 0, repeats);
+
+	      do_test (&json_ctx, al + getpagesize () - 15, 64, 256, 0,
+		       repeats);
 #ifdef USE_AS_MEMRCHR
-	  /* Also test the position close to the beginning for memrchr.  */
-	  do_test (&json_ctx, 0, i, 256, 23, repeats);
-	  do_test (&json_ctx, 0, i, 256, 0, repeats);
-	  do_test (&json_ctx, i, i, 256, 23, repeats);
-	  do_test (&json_ctx, i, i, 256, 0, repeats);
+	      /* Also test the position close to the beginning for memrchr.  */
+	      do_test (&json_ctx, al, i, 256, 23, repeats);
+	      do_test (&json_ctx, al, i, 256, 0, repeats);
+	      do_test (&json_ctx, al + i, i, 256, 23, repeats);
+	      do_test (&json_ctx, al + i, i, 256, 0, repeats);
 #endif
+	    }
+	  for (i = 1; i < 8; ++i)
+	    {
+	      do_test (&json_ctx, al + i, i << 5, 192, 23, repeats);
+	      do_test (&json_ctx, al + i, i << 5, 192, 0, repeats);
+	      do_test (&json_ctx, al + i, i << 5, 256, 23, repeats);
+	      do_test (&json_ctx, al + i, i << 5, 256, 0, repeats);
+	      do_test (&json_ctx, al + i, i << 5, 512, 23, repeats);
+	      do_test (&json_ctx, al + i, i << 5, 512, 0, repeats);
+
+	      do_test (&json_ctx, al + getpagesize () - 15, i << 5, 256, 23,
+		       repeats);
+	    }
 	}
-      for (i = 1; i < 8; ++i)
-	{
-	  do_test (&json_ctx, i, i << 5, 192, 23, repeats);
-	  do_test (&json_ctx, i, i << 5, 192, 0, repeats);
-	  do_test (&json_ctx, i, i << 5, 256, 23, repeats);
-	  do_test (&json_ctx, i, i << 5, 256, 0, repeats);
-	  do_test (&json_ctx, i, i << 5, 512, 23, repeats);
-	  do_test (&json_ctx, i, i << 5, 512, 0, repeats);
-
-	  do_test (&json_ctx, getpagesize () - 15, i << 5, 256, 23, repeats);
-	}
+
       for (i = 1; i < 32; ++i)
 	{
 	  do_test (&json_ctx, 0, i, i + 1, 23, repeats);
@@ -207,6 +218,24 @@ test_main (void)
 	  do_test (&json_ctx, 0, 2, i + 1, 0, repeats);
 #endif
 	}
+      for (al = 0; al <= al_max; al += getpagesize () / 2)
+	{
+	  for (i = (16 / sizeof (CHAR)); i <= (8192 / sizeof (CHAR)); i += i)
+	    {
+	      for (j = 0; j <= (384 / sizeof (CHAR));
+		   j += (32 / sizeof (CHAR)))
+		{
+		  do_test (&json_ctx, al, i + j, i, 23, repeats);
+		  do_test (&json_ctx, al, i, i + j, 23, repeats);
+		  if (j < i)
+		    {
+		      do_test (&json_ctx, al, i - j, i, 23, repeats);
+		      do_test (&json_ctx, al, i, i - j, 23, repeats);
+		    }
+		}
+	    }
+	}
+
 #ifndef USE_AS_MEMRCHR
       break;
 #endif
diff --git a/benchtests/bench-rawmemchr.c b/benchtests/bench-rawmemchr.c
index b1803afc14..dab77f3858 100644
--- a/benchtests/bench-rawmemchr.c
+++ b/benchtests/bench-rawmemchr.c
@@ -70,7 +70,7 @@ do_test (json_ctx_t *json_ctx, size_t align, size_t pos, size_t len, int seek_ch
   size_t i;
   char *result;
 
-  align &= 7;
+  align &= getpagesize () - 1;
   if (align + len >= page_size)
     return;
 
@@ -106,7 +106,6 @@ test_main (void)
 {
   json_ctx_t json_ctx;
   size_t i;
-
   test_init ();
 
   json_init (&json_ctx, 0, stdout);
@@ -120,7 +119,7 @@ test_main (void)
 
   json_array_begin (&json_ctx, "ifuncs");
   FOR_EACH_IMPL (impl, 0)
-      json_element_string (&json_ctx, impl->name);
+    json_element_string (&json_ctx, impl->name);
   json_array_end (&json_ctx);
 
   json_array_begin (&json_ctx, "results");
@@ -137,6 +136,31 @@ test_main (void)
       do_test (&json_ctx, 0, i, i + 1, 23);
       do_test (&json_ctx, 0, i, i + 1, 0);
     }
+  for (; i < 256; i += 32)
+    {
+      do_test (&json_ctx, 0, i, i + 1, 23);
+      do_test (&json_ctx, 0, i - 1, i, 23);
+    }
+  for (; i < 512; i += 64)
+    {
+      do_test (&json_ctx, 0, i, i + 1, 23);
+      do_test (&json_ctx, 0, i - 1, i, 23);
+    }
+  for (; i < 1024; i += 128)
+    {
+      do_test (&json_ctx, 0, i, i + 1, 23);
+      do_test (&json_ctx, 0, i - 1, i, 23);
+    }
+  for (; i < 2048; i += 256)
+    {
+      do_test (&json_ctx, 0, i, i + 1, 23);
+      do_test (&json_ctx, 0, i - 1, i, 23);
+    }
+  for (; i < 4096; i += 512)
+    {
+      do_test (&json_ctx, 0, i, i + 1, 23);
+      do_test (&json_ctx, 0, i - 1, i, 23);
+    }
 
   json_array_end (&json_ctx);
   json_attr_object_end (&json_ctx);
diff --git a/benchtests/bench-strchr.c b/benchtests/bench-strchr.c
index 54640bde7e..aeb882d442 100644
--- a/benchtests/bench-strchr.c
+++ b/benchtests/bench-strchr.c
@@ -287,8 +287,8 @@ int
 test_main (void)
 {
   json_ctx_t json_ctx;
-  size_t i;
 
+  size_t i, j;
   test_init ();
 
   json_init (&json_ctx, 0, stdout);
@@ -367,15 +367,30 @@ test_main (void)
       do_test (&json_ctx, 0, i, i + 1, 0, BIG_CHAR);
     }
 
-  DO_RAND_TEST(&json_ctx, 0, 15, 16, 0.0);
-  DO_RAND_TEST(&json_ctx, 0, 15, 16, 0.1);
-  DO_RAND_TEST(&json_ctx, 0, 15, 16, 0.25);
-  DO_RAND_TEST(&json_ctx, 0, 15, 16, 0.33);
-  DO_RAND_TEST(&json_ctx, 0, 15, 16, 0.5);
-  DO_RAND_TEST(&json_ctx, 0, 15, 16, 0.66);
-  DO_RAND_TEST(&json_ctx, 0, 15, 16, 0.75);
-  DO_RAND_TEST(&json_ctx, 0, 15, 16, 0.9);
-  DO_RAND_TEST(&json_ctx, 0, 15, 16, 1.0);
+  for (i = 16 / sizeof (CHAR); i <= 8192 / sizeof (CHAR); i += i)
+    {
+      for (j = 32 / sizeof (CHAR); j <= 320 / sizeof (CHAR);
+	   j += 32 / sizeof (CHAR))
+	{
+	  do_test (&json_ctx, 0, i, i + j, 0, MIDDLE_CHAR);
+	  do_test (&json_ctx, 0, i + j, i, 0, MIDDLE_CHAR);
+	  if (i > j)
+	    {
+	      do_test (&json_ctx, 0, i, i - j, 0, MIDDLE_CHAR);
+	      do_test (&json_ctx, 0, i - j, i, 0, MIDDLE_CHAR);
+	    }
+	}
+    }
+
+  DO_RAND_TEST (&json_ctx, 0, 15, 16, 0.0);
+  DO_RAND_TEST (&json_ctx, 0, 15, 16, 0.1);
+  DO_RAND_TEST (&json_ctx, 0, 15, 16, 0.25);
+  DO_RAND_TEST (&json_ctx, 0, 15, 16, 0.33);
+  DO_RAND_TEST (&json_ctx, 0, 15, 16, 0.5);
+  DO_RAND_TEST (&json_ctx, 0, 15, 16, 0.66);
+  DO_RAND_TEST (&json_ctx, 0, 15, 16, 0.75);
+  DO_RAND_TEST (&json_ctx, 0, 15, 16, 0.9);
+  DO_RAND_TEST (&json_ctx, 0, 15, 16, 1.0);
 
   json_array_end (&json_ctx);
   json_attr_object_end (&json_ctx);
diff --git a/benchtests/bench-strnlen.c b/benchtests/bench-strnlen.c
index 13b46b3f57..82c02eb6ed 100644
--- a/benchtests/bench-strnlen.c
+++ b/benchtests/bench-strnlen.c
@@ -195,19 +195,19 @@ test_main (void)
     {
       for (j = 0; j <= (704 / sizeof (CHAR)); j += (32 / sizeof (CHAR)))
 	{
-	  do_test (&json_ctx, 0, 1 << i, (i + j), BIG_CHAR);
 	  do_test (&json_ctx, 0, i + j, i, BIG_CHAR);
-
-	  do_test (&json_ctx, 64, 1 << i, (i + j), BIG_CHAR);
 	  do_test (&json_ctx, 64, i + j, i, BIG_CHAR);
 
+	  do_test (&json_ctx, 0, i, i + j, BIG_CHAR);
+	  do_test (&json_ctx, 64, i, i + j, BIG_CHAR);
+
 	  if (j < i)
 	    {
-	      do_test (&json_ctx, 0, 1 << i, i - j, BIG_CHAR);
 	      do_test (&json_ctx, 0, i - j, i, BIG_CHAR);
-
-	      do_test (&json_ctx, 64, 1 << i, i - j, BIG_CHAR);
 	      do_test (&json_ctx, 64, i - j, i, BIG_CHAR);
+
+	      do_test (&json_ctx, 0, i, i - j, BIG_CHAR);
+	      do_test (&json_ctx, 64, i, i - j, BIG_CHAR);
 	    }
 	}
     }
diff --git a/benchtests/bench-strrchr.c b/benchtests/bench-strrchr.c
index 7cd2a15484..3fcf3f281d 100644
--- a/benchtests/bench-strrchr.c
+++ b/benchtests/bench-strrchr.c
@@ -151,7 +151,7 @@ int
 test_main (void)
 {
   json_ctx_t json_ctx;
-  size_t i, j;
+  size_t i, j, k;
   int seek;
 
   test_init ();
@@ -173,7 +173,7 @@ test_main (void)
 
   for (seek = 0; seek <= 23; seek += 23)
     {
-      for (j = 1; j < 32; j += j)
+      for (j = 1; j <= 256; j = (j * 4))
 	{
 	  for (i = 1; i < 9; ++i)
 	    {
@@ -197,6 +197,30 @@ test_main (void)
 	      do_test (&json_ctx, getpagesize () - i / 2 - 1, i, i + 1, seek,
 		       SMALL_CHAR, j);
 	    }
+
+	  for (i = (16 / sizeof (CHAR)); i <= (288 / sizeof (CHAR)); i += 32)
+	    {
+	      do_test (&json_ctx, 0, i - 16, i, seek, SMALL_CHAR, j);
+	      do_test (&json_ctx, 0, i, i + 16, seek, SMALL_CHAR, j);
+	    }
+
+	  for (i = (16 / sizeof (CHAR)); i <= (2048 / sizeof (CHAR)); i += i)
+	    {
+	      for (k = 0; k <= (288 / sizeof (CHAR));
+		   k += (48 / sizeof (CHAR)))
+		{
+		  do_test (&json_ctx, 0, k, i, seek, SMALL_CHAR, j);
+		  do_test (&json_ctx, 0, i, i + k, seek, SMALL_CHAR, j);
+
+		  if (k < i)
+		    {
+		      do_test (&json_ctx, 0, i - k, i, seek, SMALL_CHAR, j);
+		      do_test (&json_ctx, 0, k, i - k, seek, SMALL_CHAR, j);
+		      do_test (&json_ctx, 0, i, i - k, seek, SMALL_CHAR, j);
+		    }
+		}
+	    }
+
 	  if (seek == 0)
 	    {
 	      break;
-- 
2.34.1

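The medium-size coverage added above follows one pattern throughout:
a geometric stride (`i += i`) over the buffer size, with small linear
offsets `j` probed around each power of two.  A minimal standalone
sketch of that shape (the do_test stub below is hypothetical; the
real benchtests also pass an alignment, a seek char, and a json
context):

#include <stdio.h>

/* Hypothetical stand-in for the benchtests' do_test: just print the
   (pos, len) pairs the loops generate.  */
static void
do_test (size_t pos, size_t len)
{
  printf ("pos=%zu len=%zu\n", pos, len);
}

int
main (void)
{
  size_t i, j;
  /* Geometric stride with +/- j offsets, mirroring the new
     bench-memchr / bench-strchr loops: sizes cluster around each
     power of two from 16 to 8192.  */
  for (i = 16; i <= 8192; i += i)
    for (j = 0; j <= 384; j += 32)
      {
	do_test (i + j, i);
	do_test (i, i + j);
	if (j < i)
	  {
	    do_test (i - j, i);
	    do_test (i, i - j);
	  }
      }
  return 0;
}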

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH v2 7/7] Bench: Improve benchtests for memchr, strchr, strnlen, strrchr
  2022-10-18 23:19   ` [PATCH v2 7/7] Bench: Improve benchtests for memchr, strchr, strnlen, strrchr Noah Goldstein
@ 2022-10-19  0:01     ` H.J. Lu
  2022-10-19  0:44       ` Noah Goldstein
  0 siblings, 1 reply; 41+ messages in thread
From: H.J. Lu @ 2022-10-19  0:01 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: libc-alpha, carlos

On Tue, Oct 18, 2022 at 4:19 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> 1. Add more complete coverage in the medium size range.
> 2. In strnlen remove the `1 << i` which was UB (`i` could go beyond
>    32/64)
> 3. Add timer for total benchmark runtime (useful for deciding about
>    tradeoff between coverage and runtime).

Please drop #3.

> ---
>  benchtests/bench-memchr.c    | 77 +++++++++++++++++++++++++-----------
>  benchtests/bench-rawmemchr.c | 30 ++++++++++++--
>  benchtests/bench-strchr.c    | 35 +++++++++++-----
>  benchtests/bench-strnlen.c   | 12 +++---
>  benchtests/bench-strrchr.c   | 28 ++++++++++++-
>  5 files changed, 137 insertions(+), 45 deletions(-)
>
> diff --git a/benchtests/bench-memchr.c b/benchtests/bench-memchr.c
> index 0facda2fa0..2ec9dd86d0 100644
> --- a/benchtests/bench-memchr.c
> +++ b/benchtests/bench-memchr.c
> @@ -126,7 +126,7 @@ do_test (json_ctx_t *json_ctx, size_t align, size_t pos, size_t len,
>  int
>  test_main (void)
>  {
> -  size_t i;
> +  size_t i, j, al, al_max;
>    int repeats;
>    json_ctx_t json_ctx;
>    test_init ();
> @@ -147,35 +147,46 @@ test_main (void)
>
>    json_array_begin (&json_ctx, "results");
>
> +  al_max = 0;
> +#ifdef USE_AS_MEMRCHR
> +  al_max = getpagesize () / 2;
> +#endif
> +
>    for (repeats = 0; repeats < 2; ++repeats)
>      {
> -      for (i = 1; i < 8; ++i)
> +      for (al = 0; al <= al_max; al += getpagesize () / 2)
>         {
> -         do_test (&json_ctx, 0, 16 << i, 2048, 23, repeats);
> -         do_test (&json_ctx, i, 64, 256, 23, repeats);
> -         do_test (&json_ctx, 0, 16 << i, 2048, 0, repeats);
> -         do_test (&json_ctx, i, 64, 256, 0, repeats);
> -
> -         do_test (&json_ctx, getpagesize () - 15, 64, 256, 0, repeats);
> +         for (i = 1; i < 8; ++i)
> +           {
> +             do_test (&json_ctx, al, 16 << i, 2048, 23, repeats);
> +             do_test (&json_ctx, al + i, 64, 256, 23, repeats);
> +             do_test (&json_ctx, al, 16 << i, 2048, 0, repeats);
> +             do_test (&json_ctx, al + i, 64, 256, 0, repeats);
> +
> +             do_test (&json_ctx, al + getpagesize () - 15, 64, 256, 0,
> +                      repeats);
>  #ifdef USE_AS_MEMRCHR
> -         /* Also test the position close to the beginning for memrchr.  */
> -         do_test (&json_ctx, 0, i, 256, 23, repeats);
> -         do_test (&json_ctx, 0, i, 256, 0, repeats);
> -         do_test (&json_ctx, i, i, 256, 23, repeats);
> -         do_test (&json_ctx, i, i, 256, 0, repeats);
> +             /* Also test the position close to the beginning for memrchr.  */
> +             do_test (&json_ctx, al, i, 256, 23, repeats);
> +             do_test (&json_ctx, al, i, 256, 0, repeats);
> +             do_test (&json_ctx, al + i, i, 256, 23, repeats);
> +             do_test (&json_ctx, al + i, i, 256, 0, repeats);
>  #endif
> +           }
> +         for (i = 1; i < 8; ++i)
> +           {
> +             do_test (&json_ctx, al + i, i << 5, 192, 23, repeats);
> +             do_test (&json_ctx, al + i, i << 5, 192, 0, repeats);
> +             do_test (&json_ctx, al + i, i << 5, 256, 23, repeats);
> +             do_test (&json_ctx, al + i, i << 5, 256, 0, repeats);
> +             do_test (&json_ctx, al + i, i << 5, 512, 23, repeats);
> +             do_test (&json_ctx, al + i, i << 5, 512, 0, repeats);
> +
> +             do_test (&json_ctx, al + getpagesize () - 15, i << 5, 256, 23,
> +                      repeats);
> +           }
>         }
> -      for (i = 1; i < 8; ++i)
> -       {
> -         do_test (&json_ctx, i, i << 5, 192, 23, repeats);
> -         do_test (&json_ctx, i, i << 5, 192, 0, repeats);
> -         do_test (&json_ctx, i, i << 5, 256, 23, repeats);
> -         do_test (&json_ctx, i, i << 5, 256, 0, repeats);
> -         do_test (&json_ctx, i, i << 5, 512, 23, repeats);
> -         do_test (&json_ctx, i, i << 5, 512, 0, repeats);
> -
> -         do_test (&json_ctx, getpagesize () - 15, i << 5, 256, 23, repeats);
> -       }
> +
>        for (i = 1; i < 32; ++i)
>         {
>           do_test (&json_ctx, 0, i, i + 1, 23, repeats);
> @@ -207,6 +218,24 @@ test_main (void)
>           do_test (&json_ctx, 0, 2, i + 1, 0, repeats);
>  #endif
>         }
> +      for (al = 0; al <= al_max; al += getpagesize () / 2)
> +       {
> +         for (i = (16 / sizeof (CHAR)); i <= (8192 / sizeof (CHAR)); i += i)
> +           {
> +             for (j = 0; j <= (384 / sizeof (CHAR));
> +                  j += (32 / sizeof (CHAR)))
> +               {
> +                 do_test (&json_ctx, al, i + j, i, 23, repeats);
> +                 do_test (&json_ctx, al, i, i + j, 23, repeats);
> +                 if (j < i)
> +                   {
> +                     do_test (&json_ctx, al, i - j, i, 23, repeats);
> +                     do_test (&json_ctx, al, i, i - j, 23, repeats);
> +                   }
> +               }
> +           }
> +       }
> +
>  #ifndef USE_AS_MEMRCHR
>        break;
>  #endif
> diff --git a/benchtests/bench-rawmemchr.c b/benchtests/bench-rawmemchr.c
> index b1803afc14..dab77f3858 100644
> --- a/benchtests/bench-rawmemchr.c
> +++ b/benchtests/bench-rawmemchr.c
> @@ -70,7 +70,7 @@ do_test (json_ctx_t *json_ctx, size_t align, size_t pos, size_t len, int seek_ch
>    size_t i;
>    char *result;
>
> -  align &= 7;
> +  align &= getpagesize () - 1;
>    if (align + len >= page_size)
>      return;
>
> @@ -106,7 +106,6 @@ test_main (void)
>  {
>    json_ctx_t json_ctx;
>    size_t i;
> -
>    test_init ();
>
>    json_init (&json_ctx, 0, stdout);
> @@ -120,7 +119,7 @@ test_main (void)
>
>    json_array_begin (&json_ctx, "ifuncs");
>    FOR_EACH_IMPL (impl, 0)
> -      json_element_string (&json_ctx, impl->name);
> +    json_element_string (&json_ctx, impl->name);
>    json_array_end (&json_ctx);
>
>    json_array_begin (&json_ctx, "results");
> @@ -137,6 +136,31 @@ test_main (void)
>        do_test (&json_ctx, 0, i, i + 1, 23);
>        do_test (&json_ctx, 0, i, i + 1, 0);
>      }
> +  for (; i < 256; i += 32)
> +    {
> +      do_test (&json_ctx, 0, i, i + 1, 23);
> +      do_test (&json_ctx, 0, i - 1, i, 23);
> +    }
> +  for (; i < 512; i += 64)
> +    {
> +      do_test (&json_ctx, 0, i, i + 1, 23);
> +      do_test (&json_ctx, 0, i - 1, i, 23);
> +    }
> +  for (; i < 1024; i += 128)
> +    {
> +      do_test (&json_ctx, 0, i, i + 1, 23);
> +      do_test (&json_ctx, 0, i - 1, i, 23);
> +    }
> +  for (; i < 2048; i += 256)
> +    {
> +      do_test (&json_ctx, 0, i, i + 1, 23);
> +      do_test (&json_ctx, 0, i - 1, i, 23);
> +    }
> +  for (; i < 4096; i += 512)
> +    {
> +      do_test (&json_ctx, 0, i, i + 1, 23);
> +      do_test (&json_ctx, 0, i - 1, i, 23);
> +    }
>
>    json_array_end (&json_ctx);
>    json_attr_object_end (&json_ctx);
> diff --git a/benchtests/bench-strchr.c b/benchtests/bench-strchr.c
> index 54640bde7e..aeb882d442 100644
> --- a/benchtests/bench-strchr.c
> +++ b/benchtests/bench-strchr.c
> @@ -287,8 +287,8 @@ int
>  test_main (void)
>  {
>    json_ctx_t json_ctx;
> -  size_t i;
>
> +  size_t i, j;
>    test_init ();
>
>    json_init (&json_ctx, 0, stdout);
> @@ -367,15 +367,30 @@ test_main (void)
>        do_test (&json_ctx, 0, i, i + 1, 0, BIG_CHAR);
>      }
>
> -  DO_RAND_TEST(&json_ctx, 0, 15, 16, 0.0);
> -  DO_RAND_TEST(&json_ctx, 0, 15, 16, 0.1);
> -  DO_RAND_TEST(&json_ctx, 0, 15, 16, 0.25);
> -  DO_RAND_TEST(&json_ctx, 0, 15, 16, 0.33);
> -  DO_RAND_TEST(&json_ctx, 0, 15, 16, 0.5);
> -  DO_RAND_TEST(&json_ctx, 0, 15, 16, 0.66);
> -  DO_RAND_TEST(&json_ctx, 0, 15, 16, 0.75);
> -  DO_RAND_TEST(&json_ctx, 0, 15, 16, 0.9);
> -  DO_RAND_TEST(&json_ctx, 0, 15, 16, 1.0);
> +  for (i = 16 / sizeof (CHAR); i <= 8192 / sizeof (CHAR); i += i)
> +    {
> +      for (j = 32 / sizeof (CHAR); j <= 320 / sizeof (CHAR);
> +          j += 32 / sizeof (CHAR))
> +       {
> +         do_test (&json_ctx, 0, i, i + j, 0, MIDDLE_CHAR);
> +         do_test (&json_ctx, 0, i + j, i, 0, MIDDLE_CHAR);
> +         if (i > j)
> +           {
> +             do_test (&json_ctx, 0, i, i - j, 0, MIDDLE_CHAR);
> +             do_test (&json_ctx, 0, i - j, i, 0, MIDDLE_CHAR);
> +           }
> +       }
> +    }
> +
> +  DO_RAND_TEST (&json_ctx, 0, 15, 16, 0.0);
> +  DO_RAND_TEST (&json_ctx, 0, 15, 16, 0.1);
> +  DO_RAND_TEST (&json_ctx, 0, 15, 16, 0.25);
> +  DO_RAND_TEST (&json_ctx, 0, 15, 16, 0.33);
> +  DO_RAND_TEST (&json_ctx, 0, 15, 16, 0.5);
> +  DO_RAND_TEST (&json_ctx, 0, 15, 16, 0.66);
> +  DO_RAND_TEST (&json_ctx, 0, 15, 16, 0.75);
> +  DO_RAND_TEST (&json_ctx, 0, 15, 16, 0.9);
> +  DO_RAND_TEST (&json_ctx, 0, 15, 16, 1.0);
>
>    json_array_end (&json_ctx);
>    json_attr_object_end (&json_ctx);
> diff --git a/benchtests/bench-strnlen.c b/benchtests/bench-strnlen.c
> index 13b46b3f57..82c02eb6ed 100644
> --- a/benchtests/bench-strnlen.c
> +++ b/benchtests/bench-strnlen.c
> @@ -195,19 +195,19 @@ test_main (void)
>      {
>        for (j = 0; j <= (704 / sizeof (CHAR)); j += (32 / sizeof (CHAR)))
>         {
> -         do_test (&json_ctx, 0, 1 << i, (i + j), BIG_CHAR);
>           do_test (&json_ctx, 0, i + j, i, BIG_CHAR);
> -
> -         do_test (&json_ctx, 64, 1 << i, (i + j), BIG_CHAR);
>           do_test (&json_ctx, 64, i + j, i, BIG_CHAR);
>
> +         do_test (&json_ctx, 0, i, i + j, BIG_CHAR);
> +         do_test (&json_ctx, 64, i, i + j, BIG_CHAR);
> +
>           if (j < i)
>             {
> -             do_test (&json_ctx, 0, 1 << i, i - j, BIG_CHAR);
>               do_test (&json_ctx, 0, i - j, i, BIG_CHAR);
> -
> -             do_test (&json_ctx, 64, 1 << i, i - j, BIG_CHAR);
>               do_test (&json_ctx, 64, i - j, i, BIG_CHAR);
> +
> +             do_test (&json_ctx, 0, i, i - j, BIG_CHAR);
> +             do_test (&json_ctx, 64, i, i - j, BIG_CHAR);
>             }
>         }
>      }
> diff --git a/benchtests/bench-strrchr.c b/benchtests/bench-strrchr.c
> index 7cd2a15484..3fcf3f281d 100644
> --- a/benchtests/bench-strrchr.c
> +++ b/benchtests/bench-strrchr.c
> @@ -151,7 +151,7 @@ int
>  test_main (void)
>  {
>    json_ctx_t json_ctx;
> -  size_t i, j;
> +  size_t i, j, k;
>    int seek;
>
>    test_init ();
> @@ -173,7 +173,7 @@ test_main (void)
>
>    for (seek = 0; seek <= 23; seek += 23)
>      {
> -      for (j = 1; j < 32; j += j)
> +      for (j = 1; j <= 256; j = (j * 4))
>         {
>           for (i = 1; i < 9; ++i)
>             {
> @@ -197,6 +197,30 @@ test_main (void)
>               do_test (&json_ctx, getpagesize () - i / 2 - 1, i, i + 1, seek,
>                        SMALL_CHAR, j);
>             }
> +
> +         for (i = (16 / sizeof (CHAR)); i <= (288 / sizeof (CHAR)); i += 32)
> +           {
> +             do_test (&json_ctx, 0, i - 16, i, seek, SMALL_CHAR, j);
> +             do_test (&json_ctx, 0, i, i + 16, seek, SMALL_CHAR, j);
> +           }
> +
> +         for (i = (16 / sizeof (CHAR)); i <= (2048 / sizeof (CHAR)); i += i)
> +           {
> +             for (k = 0; k <= (288 / sizeof (CHAR));
> +                  k += (48 / sizeof (CHAR)))
> +               {
> +                 do_test (&json_ctx, 0, k, i, seek, SMALL_CHAR, j);
> +                 do_test (&json_ctx, 0, i, i + k, seek, SMALL_CHAR, j);
> +
> +                 if (k < i)
> +                   {
> +                     do_test (&json_ctx, 0, i - k, i, seek, SMALL_CHAR, j);
> +                     do_test (&json_ctx, 0, k, i - k, seek, SMALL_CHAR, j);
> +                     do_test (&json_ctx, 0, i, i - k, seek, SMALL_CHAR, j);
> +                   }
> +               }
> +           }
> +
>           if (seek == 0)
>             {
>               break;
> --
> 2.34.1
>


-- 
H.J.

^ permalink raw reply	[flat|nested] 41+ messages in thread

* [PATCH v3 1/7] x86: Optimize memchr-evex.S and implement with VMM headers
  2022-10-18  2:48 [PATCH v1 1/7] x86: Optimize memchr-evex.S and implement with VMM headers Noah Goldstein
                   ` (7 preceding siblings ...)
  2022-10-18 23:19 ` [PATCH v2 " Noah Goldstein
@ 2022-10-19  0:44 ` Noah Goldstein
  2022-10-19  0:44   ` [PATCH v3 2/7] x86: Shrink / minorly optimize strchr-evex " Noah Goldstein
                     ` (6 more replies)
  8 siblings, 7 replies; 41+ messages in thread
From: Noah Goldstein @ 2022-10-19  0:44 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

Optimizations are:

1. Use the fact that tzcnt(0) -> VEC_SIZE for memchr to save a branch
   in the short-string case (see the sketch after this list).
2. Restructure code so that small strings are given the hot path.
	- This is a net-zero on the benchmark suite but in general makes
      sense as smaller sizes are far more common.
3. Use more code-size efficient instructions.
	- tzcnt ...     -> bsf ...
	- vpcmpb $0 ... -> vpcmpeq ...
4. Align labels less aggressively, especially where doing so doesn't save
   fetch blocks or causes the basic block to span extra cache lines.

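A minimal C sketch of the trick in point 1, assuming a 32-byte vector
so the comparison mask is 32 bits wide (the helper name is
hypothetical, not part of the patch):

#include <immintrin.h>	/* _tzcnt_u32, requires BMI1.  */
#include <stddef.h>

/* tzcnt of an all-zero mask returns the operand width (32 here), so a
   single `len <= pos` test covers both "no match in this vector" and
   "first match lies past the length limit" -- no separate mask == 0
   branch is needed on the short-string path.  */
static const char *
first_match_or_null (const char *p, unsigned int mask, size_t len)
{
  unsigned int pos = _tzcnt_u32 (mask);	/* 32 when mask == 0.  */
  return len <= pos ? NULL : p + pos;
}
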
The optimizations (especially for point 2) make the memchr and
rawmemchr code essentially incompatible, so rawmemchr-evex is split
out into a new file.

Code Size Changes:
memchr-evex.S       : -107 bytes
rawmemchr-evex.S    :  -53 bytes

Net perf changes:

Reported as geometric mean of all improvements / regressions from N=10
runs of the benchtests. Values are New Time / Old Time, so < 1.0 is an
improvement and > 1.0 is a regression.

memchr-evex.S       : 0.928
rawmemchr-evex.S    : 0.986 (Fewer targets cross cache lines)

Full results attached in email.

Full check passes on x86-64.
---
 sysdeps/x86_64/multiarch/memchr-evex.S        | 939 ++++++++++--------
 sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S |   9 +-
 sysdeps/x86_64/multiarch/rawmemchr-evex.S     | 313 +++++-
 3 files changed, 851 insertions(+), 410 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S
index 0dd4f1dcce..23a1c0018e 100644
--- a/sysdeps/x86_64/multiarch/memchr-evex.S
+++ b/sysdeps/x86_64/multiarch/memchr-evex.S
@@ -21,17 +21,27 @@
 
 #if ISA_SHOULD_BUILD (4)
 
+# ifndef VEC_SIZE
+#  include "x86-evex256-vecs.h"
+# endif
+
 # ifndef MEMCHR
 #  define MEMCHR	__memchr_evex
 # endif
 
 # ifdef USE_AS_WMEMCHR
+#  define PC_SHIFT_GPR	rcx
+#  define VPTESTN	vptestnmd
 #  define VPBROADCAST	vpbroadcastd
 #  define VPMINU	vpminud
 #  define VPCMP	vpcmpd
 #  define VPCMPEQ	vpcmpeqd
 #  define CHAR_SIZE	4
+
+#  define USE_WIDE_CHAR
 # else
+#  define PC_SHIFT_GPR	rdi
+#  define VPTESTN	vptestnmb
 #  define VPBROADCAST	vpbroadcastb
 #  define VPMINU	vpminub
 #  define VPCMP	vpcmpb
@@ -39,534 +49,661 @@
 #  define CHAR_SIZE	1
 # endif
 
-	/* In the 4x loop the RTM and non-RTM versions have data pointer
-	   off by VEC_SIZE * 4 with RTM version being VEC_SIZE * 4 greater.
-	   This is represented by BASE_OFFSET. As well because the RTM
-	   version uses vpcmp which stores a bit per element compared where
-	   the non-RTM version uses vpcmpeq which stores a bit per byte
-	   compared RET_SCALE of CHAR_SIZE is only relevant for the RTM
-	   version.  */
-# ifdef USE_IN_RTM
+# include "reg-macros.h"
+
+
+/* If not using RTM and VEC_SIZE != 64 (there is no VEX
+   encoding for VEC_SIZE == 64), use VEX encoding in the loop
+   so we can use vpcmpeqb + vptern, which is more efficient
+   than the EVEX alternative.  */
+# if defined USE_IN_RTM || VEC_SIZE == 64
+#  undef COND_VZEROUPPER
+#  undef VZEROUPPER_RETURN
+#  undef VZEROUPPER
+
+#  define COND_VZEROUPPER
+#  define VZEROUPPER_RETURN	ret
 #  define VZEROUPPER
-#  define BASE_OFFSET	(VEC_SIZE * 4)
-#  define RET_SCALE	CHAR_SIZE
+
+#  define USE_TERN_IN_LOOP	0
 # else
+#  define USE_TERN_IN_LOOP	1
+#  undef VZEROUPPER
 #  define VZEROUPPER	vzeroupper
-#  define BASE_OFFSET	0
-#  define RET_SCALE	1
 # endif
 
-	/* In the return from 4x loop memchr and rawmemchr versions have
-	   data pointers off by VEC_SIZE * 4 with memchr version being
-	   VEC_SIZE * 4 greater.  */
-# ifdef USE_AS_RAWMEMCHR
-#  define RET_OFFSET	(BASE_OFFSET - (VEC_SIZE * 4))
-#  define RAW_PTR_REG	rcx
-#  define ALGN_PTR_REG	rdi
+# if USE_TERN_IN_LOOP
+	/* The resulting bitmask for vpmovmskb has 4 bits set for each
+	   wchar so we don't want to multiply the resulting index.  */
+#  define TERN_CHAR_MULT	1
+
+#  ifdef USE_AS_WMEMCHR
+#   define TEST_END()	inc %VRCX
+#  else
+#   define TEST_END()	add %rdx, %rcx
+#  endif
 # else
-#  define RET_OFFSET	BASE_OFFSET
-#  define RAW_PTR_REG	rdi
-#  define ALGN_PTR_REG	rcx
+#  define TERN_CHAR_MULT	CHAR_SIZE
+#  define TEST_END()	KORTEST %k2, %k3
 # endif
 
-# define XMMZERO	xmm23
-# define YMMZERO	ymm23
-# define XMMMATCH	xmm16
-# define YMMMATCH	ymm16
-# define YMM1		ymm17
-# define YMM2		ymm18
-# define YMM3		ymm19
-# define YMM4		ymm20
-# define YMM5		ymm21
-# define YMM6		ymm22
+# if defined USE_AS_WMEMCHR || !USE_TERN_IN_LOOP
+#  ifndef USE_AS_WMEMCHR
+#   define GPR_X0_IS_RET	1
+#  else
+#   define GPR_X0_IS_RET	0
+#  endif
+#  define GPR_X0	rax
+# else
+#  define GPR_X0_IS_RET	0
+#  define GPR_X0	rdx
+# endif
+
+# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
 
-# ifndef SECTION
-#  define SECTION(p)	p##.evex
+# if CHAR_PER_VEC == 64
+#  define LAST_VEC_OFFSET	(VEC_SIZE * 3)
+# else
+#  define LAST_VEC_OFFSET	(VEC_SIZE * 2)
+# endif
+# if CHAR_PER_VEC >= 32
+#  define MASK_GPR(...)	VGPR(__VA_ARGS__)
+# elif CHAR_PER_VEC == 16
+#  define MASK_GPR(reg)	VGPR_SZ(reg, 16)
+# else
+#  define MASK_GPR(reg)	VGPR_SZ(reg, 8)
 # endif
 
-# define VEC_SIZE 32
-# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
-# define PAGE_SIZE 4096
+# define VMATCH	VMM(0)
+# define VMATCH_LO	VMM_lo(0)
 
-	.section SECTION(.text),"ax",@progbits
+# define PAGE_SIZE	4096
+
+
+	.section SECTION(.text), "ax", @progbits
 ENTRY_P2ALIGN (MEMCHR, 6)
-# ifndef USE_AS_RAWMEMCHR
 	/* Check for zero length.  */
 	test	%RDX_LP, %RDX_LP
-	jz	L(zero)
+	jz	L(zero_0)
 
-#  ifdef __ILP32__
+# ifdef __ILP32__
 	/* Clear the upper 32 bits.  */
 	movl	%edx, %edx
-#  endif
 # endif
-	/* Broadcast CHAR to YMMMATCH.  */
-	VPBROADCAST %esi, %YMMMATCH
+	VPBROADCAST %esi, %VMATCH
 	/* Check if we may cross page boundary with one vector load.  */
 	movl	%edi, %eax
 	andl	$(PAGE_SIZE - 1), %eax
 	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
-	ja	L(cross_page_boundary)
+	ja	L(page_cross)
+
+	VPCMPEQ	(%rdi), %VMATCH, %k0
+	KMOV	%k0, %VRAX
+# ifndef USE_AS_WMEMCHR
+	/* If rax is zero then tzcnt -> CHAR_PER_VEC.  NB: there is
+	   already a dependency between rax and rsi so no worries about
+	   false-dep here.  */
+	tzcnt	%VRAX, %VRSI
+	/* If rdx <= rsi then either 1) rax was non-zero (there was a
+	   match) but it was out of bounds or 2) rax was zero and rdx
+	   was <= VEC_SIZE so we are done scanning.  */
+	cmpq	%rsi, %rdx
+	/* NB: Use branch to return zero/non-zero.  Common usage will
+	   branch on result of function (if return is null/non-null).
+	   This branch can be used to predict the ensuing one so there
+	   is no reason to extend the data-dependency with cmovcc.  */
+	jbe	L(zero_0)
+
+	/* If rax is zero then len must be > VEC_SIZE; otherwise, since we
+	   already tested len vs tzcnt(rax) (in rsi), we are good to
+	   return this match.  */
+	test	%VRAX, %VRAX
+	jz	L(more_1x_vec)
+	leaq	(%rdi, %rsi), %rax
+# else
 
-	/* Check the first VEC_SIZE bytes.  */
-	VPCMP	$0, (%rdi), %YMMMATCH, %k0
-	kmovd	%k0, %eax
-# ifndef USE_AS_RAWMEMCHR
-	/* If length < CHAR_PER_VEC handle special.  */
+	/* We can't use the `tzcnt` trick for wmemchr because CHAR_SIZE
+	   > 1, so tzcnt(0) == VEC_SIZE != CHAR_PER_VEC.  */
 	cmpq	$CHAR_PER_VEC, %rdx
-	jbe	L(first_vec_x0)
-# endif
-	testl	%eax, %eax
-	jz	L(aligned_more)
-	tzcntl	%eax, %eax
-# ifdef USE_AS_WMEMCHR
-	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
+	ja	L(more_1x_vec)
+	tzcnt	%VRAX, %VRAX
+	cmpl	%eax, %edx
+	jbe	L(zero_0)
+L(first_vec_x0_ret):
 	leaq	(%rdi, %rax, CHAR_SIZE), %rax
-# else
-	addq	%rdi, %rax
 # endif
 	ret
 
-# ifndef USE_AS_RAWMEMCHR
-L(zero):
-	xorl	%eax, %eax
-	ret
-
-	.p2align 4
-L(first_vec_x0):
-	/* Check if first match was before length. NB: tzcnt has false data-
-	   dependency on destination. eax already had a data-dependency on esi
-	   so this should have no affect here.  */
-	tzcntl	%eax, %esi
-#  ifdef USE_AS_WMEMCHR
-	leaq	(%rdi, %rsi, CHAR_SIZE), %rdi
-#  else
-	addq	%rsi, %rdi
-#  endif
+	/* Only fits in first cache line for VEC_SIZE == 32.  */
+# if VEC_SIZE == 32
+	.p2align 4,, 2
+L(zero_0):
 	xorl	%eax, %eax
-	cmpl	%esi, %edx
-	cmovg	%rdi, %rax
 	ret
 # endif
 
-	.p2align 4
-L(cross_page_boundary):
-	/* Save pointer before aligning as its original value is
-	   necessary for computer return address if byte is found or
-	   adjusting length if it is not and this is memchr.  */
-	movq	%rdi, %rcx
-	/* Align data to VEC_SIZE. ALGN_PTR_REG is rcx for memchr and rdi
-	   for rawmemchr.  */
-	andq	$-VEC_SIZE, %ALGN_PTR_REG
-	VPCMP	$0, (%ALGN_PTR_REG), %YMMMATCH, %k0
-	kmovd	%k0, %r8d
+	.p2align 4,, 9
+L(more_1x_vec):
 # ifdef USE_AS_WMEMCHR
-	/* NB: Divide shift count by 4 since each bit in K0 represent 4
-	   bytes.  */
-	sarl	$2, %eax
-# endif
-# ifndef USE_AS_RAWMEMCHR
-	movl	$(PAGE_SIZE / CHAR_SIZE), %esi
-	subl	%eax, %esi
+	/* For wmemchr we still need to test if there was a match in
+	   the first VEC.  Use bsf to test here so we can reuse
+	   L(first_vec_x0_ret).  */
+	bsf	%VRAX, %VRAX
+	jnz	L(first_vec_x0_ret)
 # endif
+
+L(page_cross_continue):
 # ifdef USE_AS_WMEMCHR
-	andl	$(CHAR_PER_VEC - 1), %eax
-# endif
-	/* Remove the leading bytes.  */
-	sarxl	%eax, %r8d, %eax
-# ifndef USE_AS_RAWMEMCHR
-	/* Check the end of data.  */
-	cmpq	%rsi, %rdx
-	jbe	L(first_vec_x0)
+	/* We can't use the end of the buffer to recalculate the length
+	   for wmemchr as len * CHAR_SIZE may overflow.  */
+	leaq	-(VEC_SIZE + CHAR_SIZE)(%rdi), %rax
+	andq	$(VEC_SIZE * -1), %rdi
+	subq	%rdi, %rax
+	sarq	$2, %rax
+	addq	%rdx, %rax
+# else
+	leaq	-(VEC_SIZE + 1)(%rdx, %rdi), %rax
+	andq	$(VEC_SIZE * -1), %rdi
+	subq	%rdi, %rax
 # endif
-	testl	%eax, %eax
-	jz	L(cross_page_continue)
-	tzcntl	%eax, %eax
+
+	/* rax contains remaining length - 1.  -1 so we can get imm8
+	   encoding in a few additional places saving code size.  */
+
+	/* Needed regardless of remaining length.  */
+	VPCMPEQ	VEC_SIZE(%rdi), %VMATCH, %k0
+	KMOV	%k0, %VRDX
+
+	/* We cannot fold the above `sub %rdi, %rax` with the `cmp
+	   $(CHAR_PER_VEC * 2), %rax` because it's possible for a very
+	   large length to overflow and cause the subtract to carry
+	   despite length being above CHAR_PER_VEC * 2.  */
+	cmpq	$(CHAR_PER_VEC * 2 - 1), %rax
+	ja	L(more_2x_vec)
+L(last_2x_vec):
+
+	test	%VRDX, %VRDX
+	jnz	L(first_vec_x1_check)
+
+	/* Check the end of data.  NB: use 8-bit operations to save code
+	   size.  We no longer need the full-width of eax and will
+	   perform a write-only operation over eax so there will be no
+	   partial-register stalls.  */
+	subb	$(CHAR_PER_VEC * 1 - 1), %al
+	jle	L(zero_0)
+
+	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %VMATCH, %k0
+	KMOV	%k0, %VRCX
 # ifdef USE_AS_WMEMCHR
-	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
-	leaq	(%RAW_PTR_REG, %rax, CHAR_SIZE), %rax
+	/* For wmemchr we can't take advantage of tzcnt(0) ==
+	   VEC_SIZE as CHAR_PER_VEC != VEC_SIZE.  */
+	test	%VRCX, %VRCX
+	jz	L(zero_0)
+# endif
+	tzcnt	%VRCX, %VRCX
+	cmp	%cl, %al
+
+	/* Same CFG for VEC_SIZE == 64 and VEC_SIZE == 32.  We give
+	   fallthrough to L(zero_0) for VEC_SIZE == 64 here as there is
+	   not enough space before the next cache line to fit the `lea`
+	   for return.  */
+# if VEC_SIZE == 64
+	ja	L(first_vec_x2_ret)
+L(zero_0):
+	xorl	%eax, %eax
+	ret
 # else
-	addq	%RAW_PTR_REG, %rax
+	jbe	L(zero_0)
+	leaq	(VEC_SIZE * 2)(%rdi, %rcx, CHAR_SIZE), %rax
+	ret
 # endif
+
+	.p2align 4,, 5
+L(first_vec_x1_check):
+	bsf	%VRDX, %VRDX
+	cmpb	%dl, %al
+	jb	L(zero_4)
+	leaq	(VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %rax
 	ret
 
-	.p2align 4
-L(first_vec_x1):
-	tzcntl	%eax, %eax
-	leaq	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
+	/* Fits at the end of the cache line here for VEC_SIZE == 32.
+	 */
+# if VEC_SIZE == 32
+L(zero_4):
+	xorl	%eax, %eax
 	ret
+# endif
 
-	.p2align 4
+
+	.p2align 4,, 4
 L(first_vec_x2):
-	tzcntl	%eax, %eax
-	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+	bsf	%VRCX, %VRCX
+L(first_vec_x2_ret):
+	leaq	(VEC_SIZE * 2)(%rdi, %rcx, CHAR_SIZE), %rax
 	ret
 
-	.p2align 4
-L(first_vec_x3):
-	tzcntl	%eax, %eax
-	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+	/* Fits at the end of the cache line here for VEC_SIZE == 64.
+	 */
+# if VEC_SIZE == 64
+L(zero_4):
+	xorl	%eax, %eax
 	ret
+# endif
 
-	.p2align 4
-L(first_vec_x4):
-	tzcntl	%eax, %eax
-	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
+	.p2align 4,, 4
+L(first_vec_x1):
+	bsf	%VRDX, %VRDX
+	leaq	(VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %rax
 	ret
 
-	.p2align 5
-L(aligned_more):
-	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
-	   since data is only aligned to VEC_SIZE.  */
 
-# ifndef USE_AS_RAWMEMCHR
-	/* Align data to VEC_SIZE.  */
-L(cross_page_continue):
-	xorl	%ecx, %ecx
-	subl	%edi, %ecx
-	andq	$-VEC_SIZE, %rdi
-	/* esi is for adjusting length to see if near the end.  */
-	leal	(VEC_SIZE * 5)(%rdi, %rcx), %esi
-#  ifdef USE_AS_WMEMCHR
-	/* NB: Divide bytes by 4 to get the wchar_t count.  */
-	sarl	$2, %esi
-#  endif
-# else
-	andq	$-VEC_SIZE, %rdi
-L(cross_page_continue):
-# endif
-	/* Load first VEC regardless.  */
-	VPCMP	$0, (VEC_SIZE)(%rdi), %YMMMATCH, %k0
-	kmovd	%k0, %eax
-# ifndef USE_AS_RAWMEMCHR
-	/* Adjust length. If near end handle specially.  */
-	subq	%rsi, %rdx
-	jbe	L(last_4x_vec_or_less)
-# endif
-	testl	%eax, %eax
+	.p2align 4,, 5
+L(more_2x_vec):
+	/* Length > VEC_SIZE * 2 so check first 2x VEC before rechecking
+	   length.  */
+
+
+	/* Already computed matches for first VEC in rdx.  */
+	test	%VRDX, %VRDX
 	jnz	L(first_vec_x1)
 
-	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
-	kmovd	%k0, %eax
-	testl	%eax, %eax
+
+	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %VMATCH, %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
 	jnz	L(first_vec_x2)
 
-	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
-	kmovd	%k0, %eax
-	testl	%eax, %eax
+	/* Needed regardless of next length check.  */
+	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %VMATCH, %k0
+	KMOV	%k0, %VRCX
+
+	/* Check if we are near the end.  */
+	cmpq	$(CHAR_PER_VEC * 4 - 1), %rax
+	ja	L(more_4x_vec)
+
+	test	%VRCX, %VRCX
+	jnz	L(first_vec_x3_check)
+
+	/* Use 8-bit instructions to save code size.  We won't use full-
+	   width eax again and will perform a write-only operation to
+	   eax so no worries about partial-register stalls.  */
+	subb	$(CHAR_PER_VEC * 3), %al
+	jb	L(zero_2)
+L(last_vec_check):
+	VPCMPEQ	(VEC_SIZE * 4)(%rdi), %VMATCH, %k0
+	KMOV	%k0, %VRCX
+# ifdef USE_AS_WMEMCHR
+	/* For wmemchr we can't take advantage of tzcnt(0) ==
+	   VEC_SIZE as CHAR_PER_VEC != VEC_SIZE.  */
+	test	%VRCX, %VRCX
+	jz	L(zero_2)
+# endif
+	tzcnt	%VRCX, %VRCX
+	cmp	%cl, %al
+	jae	L(first_vec_x4_ret)
+L(zero_2):
+	xorl	%eax, %eax
+	ret
+
+	/* Fits at the end of the cache line here for VEC_SIZE == 64.
+	   For VEC_SIZE == 32 we put the return label at the end of
+	   L(first_vec_x4).  */
+# if VEC_SIZE == 64
+L(first_vec_x4_ret):
+	leaq	(VEC_SIZE * 4)(%rdi, %rcx, CHAR_SIZE), %rax
+	ret
+# endif
+
+	.p2align 4,, 6
+L(first_vec_x4):
+	bsf	%VRCX, %VRCX
+# if VEC_SIZE == 32
+	/* Place L(first_vec_x4_ret) here as we can't fit it in the same
+	   cache line as where it is called from so we might as well
+	   save code size by reusing return of L(first_vec_x4).  */
+L(first_vec_x4_ret):
+# endif
+	leaq	(VEC_SIZE * 4)(%rdi, %rcx, CHAR_SIZE), %rax
+	ret
+
+	.p2align 4,, 6
+L(first_vec_x3_check):
+	/* Need to adjust remaining length before checking.  */
+	addb	$-(CHAR_PER_VEC * 2), %al
+	bsf	%VRCX, %VRCX
+	cmpb	%cl, %al
+	jb	L(zero_2)
+	leaq	(VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %rax
+	ret
+
+	.p2align 4,, 6
+L(first_vec_x3):
+	bsf	%VRCX, %VRCX
+	leaq	(VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %rax
+	ret
+
+	.p2align 4,, 3
+# if !USE_TERN_IN_LOOP
+	.p2align 4,, 10
+# endif
+L(more_4x_vec):
+	test	%VRCX, %VRCX
 	jnz	L(first_vec_x3)
 
-	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
-	kmovd	%k0, %eax
-	testl	%eax, %eax
+	VPCMPEQ	(VEC_SIZE * 4)(%rdi), %VMATCH, %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
 	jnz	L(first_vec_x4)
 
+	subq	$-(VEC_SIZE * 5), %rdi
+	subq	$(CHAR_PER_VEC * 8), %rax
+	jb	L(last_4x_vec)
 
-# ifndef USE_AS_RAWMEMCHR
-	/* Check if at last CHAR_PER_VEC * 4 length.  */
-	subq	$(CHAR_PER_VEC * 4), %rdx
-	jbe	L(last_4x_vec_or_less_cmpeq)
-	/* +VEC_SIZE if USE_IN_RTM otherwise +VEC_SIZE * 5.  */
-	addq	$(VEC_SIZE + (VEC_SIZE * 4 - BASE_OFFSET)), %rdi
-
-	/* Align data to VEC_SIZE * 4 for the loop and readjust length.
-	 */
-#  ifdef USE_AS_WMEMCHR
+# ifdef USE_AS_WMEMCHR
 	movl	%edi, %ecx
-	andq	$-(4 * VEC_SIZE), %rdi
+# else
+	addq	%rdi, %rax
+# endif
+
+
+# if VEC_SIZE == 64
+	/* Use xorb to do `andq $-(VEC_SIZE * 4), %rdi`.  No evex
+	   processor has partial register stalls (all have a merging
+	   uop).  If that changes this can be removed.  */
+	xorb	%dil, %dil
+# else
+	andq	$-(VEC_SIZE * 4), %rdi
+# endif
+
+# ifdef USE_AS_WMEMCHR
 	subl	%edi, %ecx
-	/* NB: Divide bytes by 4 to get the wchar_t count.  */
 	sarl	$2, %ecx
-	addq	%rcx, %rdx
-#  else
-	addq	%rdi, %rdx
-	andq	$-(4 * VEC_SIZE), %rdi
-	subq	%rdi, %rdx
-#  endif
+	addq	%rcx, %rax
 # else
-	addq	$(VEC_SIZE + (VEC_SIZE * 4 - BASE_OFFSET)), %rdi
-	andq	$-(4 * VEC_SIZE), %rdi
+	subq	%rdi, %rax
 # endif
-# ifdef USE_IN_RTM
-	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
-# else
-	/* copy ymmmatch to ymm0 so we can use vpcmpeq which is not
-	   encodable with EVEX registers (ymm16-ymm31).  */
-	vmovdqa64 %YMMMATCH, %ymm0
+
+
+
+# if USE_TERN_IN_LOOP
+	/* copy VMATCH to low ymm so we can use vpcmpeq which is not
+	   encodable with EVEX registers.  NB: this is VEC_SIZE == 32
+	   only as there is no way to encode vpcmpeq with zmm0-15.  */
+	vmovdqa64 %VMATCH, %VMATCH_LO
 # endif
 
-	/* Compare 4 * VEC at a time forward.  */
-	.p2align 4
+	.p2align 4,, 11
 L(loop_4x_vec):
-	/* Two versions of the loop. One that does not require
-	   vzeroupper by not using ymm0-ymm15 and another does that require
-	   vzeroupper because it uses ymm0-ymm15. The reason why ymm0-ymm15
-	   is used at all is because there is no EVEX encoding vpcmpeq and
-	   with vpcmpeq this loop can be performed more efficiently. The
-	   non-vzeroupper version is safe for RTM while the vzeroupper
-	   version should be prefered if RTM are not supported.  */
-# ifdef USE_IN_RTM
-	/* It would be possible to save some instructions using 4x VPCMP
-	   but bottleneck on port 5 makes it not woth it.  */
-	VPCMP	$4, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k1
-	/* xor will set bytes match esi to zero.  */
-	vpxorq	(VEC_SIZE * 5)(%rdi), %YMMMATCH, %YMM2
-	vpxorq	(VEC_SIZE * 6)(%rdi), %YMMMATCH, %YMM3
-	VPCMP	$0, (VEC_SIZE * 7)(%rdi), %YMMMATCH, %k3
-	/* Reduce VEC2 / VEC3 with min and VEC1 with zero mask.  */
-	VPMINU	%YMM2, %YMM3, %YMM3{%k1}{z}
-	VPCMP	$0, %YMM3, %YMMZERO, %k2
-# else
+	/* Two versions of the loop.  One that does not require
+	   vzeroupper by not using ymm0-15 and another that does
+	   require vzeroupper because it uses ymm0-15.  The reason why
+	   ymm0-15 is used at all is that there is no EVEX encoding of
+	   vpcmpeq, and with vpcmpeq this loop can be performed more
+	   efficiently.  The non-vzeroupper version is safe for RTM
+	   while the vzeroupper version should be preferred if RTM is
+	   not supported.  Which loop version we use is determined by
+	   USE_TERN_IN_LOOP.  */
+
+# if USE_TERN_IN_LOOP
+	/* Since vptern can only take 3x vectors, it is fastest to do 1
+	   vec separately with EVEX vpcmp.  */
 #  ifdef USE_AS_WMEMCHR
+	/* vptern can only accept masks for epi32/epi64 so we can only
-	   instruction using not equals mask on vptern with wmemchr.  */
-	VPCMP	$4, (%rdi), %YMMMATCH, %k1
+	   save an instruction by using a not-equals mask on vptern with
+	   wmemchr.  */
+	VPCMP	$4, (VEC_SIZE * 0)(%rdi), %VMATCH, %k1
 #  else
-	VPCMP	$0, (%rdi), %YMMMATCH, %k1
+	VPCMPEQ	(VEC_SIZE * 0)(%rdi), %VMATCH, %k1
 #  endif
 	/* Compare 3x with vpcmpeq and or them all together with vptern.
 	 */
-	VPCMPEQ	VEC_SIZE(%rdi), %ymm0, %ymm2
-	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %ymm0, %ymm3
-	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %ymm0, %ymm4
+	VPCMPEQ	(VEC_SIZE * 1)(%rdi), %VMATCH_LO, %VMM_lo(2)
+	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %VMATCH_LO, %VMM_lo(3)
+	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %VMATCH_LO, %VMM_lo(4)
 #  ifdef USE_AS_WMEMCHR
-	/* This takes the not of or between ymm2, ymm3, ymm4 as well as
-	   combines result from VEC0 with zero mask.  */
-	vpternlogd $1, %ymm2, %ymm3, %ymm4{%k1}{z}
-	vpmovmskb %ymm4, %ecx
+	/* This takes the NOT of the OR of VEC_lo(2), VEC_lo(3), and
+	   VEC_lo(4), as well as combining the result from VEC(0) with a
+	   zero mask.  */
+	vpternlogd $1, %VMM_lo(2), %VMM_lo(3), %VMM_lo(4){%k1}{z}
+	vpmovmskb %VMM_lo(4), %VRCX
 #  else
-	/* 254 is mask for oring ymm2, ymm3, ymm4 into ymm4.  */
-	vpternlogd $254, %ymm2, %ymm3, %ymm4
-	vpmovmskb %ymm4, %ecx
-	kmovd	%k1, %eax
+	/* 254 is mask for oring VEC_lo(2), VEC_lo(3), VEC_lo(4) into
+	   VEC_lo(4).  */
+	vpternlogd $254, %VMM_lo(2), %VMM_lo(3), %VMM_lo(4)
+	vpmovmskb %VMM_lo(4), %VRCX
+	KMOV	%k1, %edx
 #  endif
-# endif
 
-# ifdef USE_AS_RAWMEMCHR
-	subq	$-(VEC_SIZE * 4), %rdi
-# endif
-# ifdef USE_IN_RTM
-	kortestd %k2, %k3
 # else
-#  ifdef USE_AS_WMEMCHR
-	/* ecx contains not of matches. All 1s means no matches. incl will
-	   overflow and set zeroflag if that is the case.  */
-	incl	%ecx
-#  else
-	/* If either VEC1 (eax) or VEC2-VEC4 (ecx) are not zero. Adding
-	   to ecx is not an issue because if eax is non-zero it will be
-	   used for returning the match. If it is zero the add does
-	   nothing.  */
-	addq	%rax, %rcx
-#  endif
+	/* Loop version that uses EVEX encoding.  */
+	VPCMP	$4, (VEC_SIZE * 0)(%rdi), %VMATCH, %k1
+	vpxorq	(VEC_SIZE * 1)(%rdi), %VMATCH, %VMM(2)
+	vpxorq	(VEC_SIZE * 2)(%rdi), %VMATCH, %VMM(3)
+	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %VMATCH, %k3
+	VPMINU	%VMM(2), %VMM(3), %VMM(3){%k1}{z}
+	VPTESTN	%VMM(3), %VMM(3), %k2
 # endif
-# ifdef USE_AS_RAWMEMCHR
-	jz	L(loop_4x_vec)
-# else
-	jnz	L(loop_4x_vec_end)
+
+
+	TEST_END ()
+	jnz	L(loop_vec_ret)
 
 	subq	$-(VEC_SIZE * 4), %rdi
 
-	subq	$(CHAR_PER_VEC * 4), %rdx
-	ja	L(loop_4x_vec)
+	subq	$(CHAR_PER_VEC * 4), %rax
+	jae	L(loop_4x_vec)
 
-	/* Fall through into less than 4 remaining vectors of length case.
+	/* COND_VZEROUPPER is vzeroupper if we use the VEX encoded loop.
 	 */
-	VPCMP	$0, BASE_OFFSET(%rdi), %YMMMATCH, %k0
-	addq	$(BASE_OFFSET - VEC_SIZE), %rdi
-	kmovd	%k0, %eax
-	VZEROUPPER
-
-L(last_4x_vec_or_less):
-	/* Check if first VEC contained match.  */
-	testl	%eax, %eax
-	jnz	L(first_vec_x1_check)
+	COND_VZEROUPPER
 
-	/* If remaining length > CHAR_PER_VEC * 2.  */
-	addl	$(CHAR_PER_VEC * 2), %edx
-	jg	L(last_4x_vec)
-
-L(last_2x_vec):
-	/* If remaining length < CHAR_PER_VEC.  */
-	addl	$CHAR_PER_VEC, %edx
-	jle	L(zero_end)
-
-	/* Check VEC2 and compare any match with remaining length.  */
-	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
-	kmovd	%k0, %eax
-	tzcntl	%eax, %eax
-	cmpl	%eax, %edx
-	jbe	L(set_zero_end)
-	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
-L(zero_end):
-	ret
+	.p2align 4,, 10
+L(last_4x_vec):
+	/* For CHAR_PER_VEC == 64 we don't need to mask as we use 8-bit
+	   instructions on eax from here on out.  */
+# if CHAR_PER_VEC != 64
+	andl	$(CHAR_PER_VEC * 4 - 1), %eax
+# endif
+	VPCMPEQ	(VEC_SIZE * 0)(%rdi), %VMATCH, %k0
+	subq	$(VEC_SIZE * 1), %rdi
+	KMOV	%k0, %VRDX
+	cmpb	$(CHAR_PER_VEC * 2 - 1), %al
+	jbe	L(last_2x_vec)
+	test	%VRDX, %VRDX
+	jnz	L(last_vec_x1_novzero)
+
+	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %VMATCH, %k0
+	KMOV	%k0, %VRDX
+	test	%VRDX, %VRDX
+	jnz	L(last_vec_x2_novzero)
+
+	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %VMATCH, %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(first_vec_x3_check)
+
+	subb	$(CHAR_PER_VEC * 3), %al
+	jae	L(last_vec_check)
 
-L(set_zero_end):
 	xorl	%eax, %eax
 	ret
 
-	.p2align 4
-L(first_vec_x1_check):
-	/* eax must be non-zero. Use bsfl to save code size.  */
-	bsfl	%eax, %eax
-	/* Adjust length.  */
-	subl	$-(CHAR_PER_VEC * 4), %edx
-	/* Check if match within remaining length.  */
-	cmpl	%eax, %edx
-	jbe	L(set_zero_end)
-	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
-	leaq	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
+# if defined USE_AS_WMEMCHR && USE_TERN_IN_LOOP
+L(last_vec_x2_novzero):
+	addq	$VEC_SIZE, %rdi
+L(last_vec_x1_novzero):
+	bsf	%VRDX, %VRDX
+	leaq	(VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %rax
 	ret
+# endif
 
-	.p2align 4
-L(loop_4x_vec_end):
+# if CHAR_PER_VEC == 64
+	/* Since we can't combine the last 2x VEC when CHAR_PER_VEC ==
+	   64 it needs a separate return label.  */
+	.p2align 4,, 4
+L(last_vec_x2):
+L(last_vec_x2_novzero):
+	bsf	%VRDX, %VRDX
+	leaq	(VEC_SIZE * 2)(%rdi, %rdx, TERN_CHAR_MULT), %rax
+	ret
 # endif
-	/* rawmemchr will fall through into this if match was found in
-	   loop.  */
 
-# if defined USE_IN_RTM || defined USE_AS_WMEMCHR
-	/* k1 has not of matches with VEC1.  */
-	kmovd	%k1, %eax
-#  ifdef USE_AS_WMEMCHR
-	subl	$((1 << CHAR_PER_VEC) - 1), %eax
-#  else
-	incl	%eax
-#  endif
+	.p2align 4,, 4
+L(loop_vec_ret):
+# if defined USE_AS_WMEMCHR || !USE_TERN_IN_LOOP
+	KMOV	%k1, %VRAX
+	inc	%MASK_GPR(rax)
 # else
-	/* eax already has matches for VEC1.  */
-	testl	%eax, %eax
+	test	%VRDX, %VRDX
 # endif
-	jnz	L(last_vec_x1_return)
+	jnz	L(last_vec_x0)
 
-# ifdef USE_IN_RTM
-	VPCMP	$0, %YMM2, %YMMZERO, %k0
-	kmovd	%k0, %eax
+
+# if USE_TERN_IN_LOOP
+	vpmovmskb %VMM_lo(2), %VRDX
 # else
-	vpmovmskb %ymm2, %eax
+	VPTESTN	%VMM(2), %VMM(2), %k1
+	KMOV	%k1, %VRDX
 # endif
-	testl	%eax, %eax
-	jnz	L(last_vec_x2_return)
+	test	%VRDX, %VRDX
+	jnz	L(last_vec_x1)
 
-# ifdef USE_IN_RTM
-	kmovd	%k2, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x3_return)
 
-	kmovd	%k3, %eax
-	tzcntl	%eax, %eax
-	leaq	(VEC_SIZE * 3 + RET_OFFSET)(%rdi, %rax, CHAR_SIZE), %rax
+# if USE_TERN_IN_LOOP
+	vpmovmskb %VMM_lo(3), %VRDX
 # else
-	vpmovmskb %ymm3, %eax
-	/* Combine matches in VEC3 (eax) with matches in VEC4 (ecx).  */
-	salq	$VEC_SIZE, %rcx
-	orq	%rcx, %rax
-	tzcntq	%rax, %rax
-	leaq	(VEC_SIZE * 2 + RET_OFFSET)(%rdi, %rax), %rax
-	VZEROUPPER
+	KMOV	%k2, %VRDX
 # endif
-	ret
 
-	.p2align 4,, 10
-L(last_vec_x1_return):
-	tzcntl	%eax, %eax
-# if defined USE_AS_WMEMCHR || RET_OFFSET != 0
-	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
-	leaq	RET_OFFSET(%rdi, %rax, CHAR_SIZE), %rax
+	/* No longer need any of the lo vecs (ymm0-15) so vzeroupper
+	   (only if we used the VEX encoded loop).  */
+	COND_VZEROUPPER
+
+	/* Separate logic for CHAR_PER_VEC == 64 vs the rest.  For
+	   CHAR_PER_VEC == 64 we test the last 2x VEC separately, for
+	   CHAR_PER_VEC <= 32 we can combine the results from the 2x
+	   VEC in a single GPR.  */
+# if CHAR_PER_VEC == 64
+#  if USE_TERN_IN_LOOP
+#   error "Unsupported"
+#  endif
+
+
+	/* If CHAR_PER_VEC == 64 we can't combine the last two VEC.  */
+	test	%VRDX, %VRDX
+	jnz	L(last_vec_x2)
+	KMOV	%k3, %VRDX
 # else
-	addq	%rdi, %rax
+	/* CHAR_PER_VEC <= 32 so we can combine the results from the
+	   last 2x VEC.  */
+
+#  if !USE_TERN_IN_LOOP
+	KMOV	%k3, %VRCX
+#  endif
+	salq	$(VEC_SIZE / TERN_CHAR_MULT), %rcx
+	addq	%rcx, %rdx
+#  if !defined USE_AS_WMEMCHR || !USE_TERN_IN_LOOP
+L(last_vec_x2_novzero):
+#  endif
 # endif
-	VZEROUPPER
+	bsf	%rdx, %rdx
+	leaq	(LAST_VEC_OFFSET)(%rdi, %rdx, TERN_CHAR_MULT), %rax
 	ret
 
-	.p2align 4
-L(last_vec_x2_return):
-	tzcntl	%eax, %eax
-	/* NB: Multiply bytes by RET_SCALE to get the wchar_t count
-	   if relevant (RET_SCALE = CHAR_SIZE if USE_AS_WMEMCHAR and
-	   USE_IN_RTM are both defined. Otherwise RET_SCALE = 1.  */
-	leaq	(VEC_SIZE + RET_OFFSET)(%rdi, %rax, RET_SCALE), %rax
-	VZEROUPPER
+	.p2align 4,, 8
+L(last_vec_x1):
+	COND_VZEROUPPER
+# if !defined USE_AS_WMEMCHR || !USE_TERN_IN_LOOP
+L(last_vec_x1_novzero):
+# endif
+	bsf	%VRDX, %VRDX
+	leaq	(VEC_SIZE * 1)(%rdi, %rdx, TERN_CHAR_MULT), %rax
 	ret
 
-# ifdef USE_IN_RTM
-	.p2align 4
-L(last_vec_x3_return):
-	tzcntl	%eax, %eax
-	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
-	leaq	(VEC_SIZE * 2 + RET_OFFSET)(%rdi, %rax, CHAR_SIZE), %rax
+
+	.p2align 4,, 4
+L(last_vec_x0):
+	COND_VZEROUPPER
+	bsf	%VGPR(GPR_X0), %VGPR(GPR_X0)
+# if GPR_X0_IS_RET
+	addq	%rdi, %rax
+# else
+	leaq	(%rdi, %GPR_X0, CHAR_SIZE), %rax
+# endif
 	ret
+
+	.p2align 4,, 6
+L(page_cross):
+	/* Need to preserve eax to compute the in-bounds bytes we are
+	   checking.  */
+# ifdef USE_AS_WMEMCHR
+	movl	%eax, %ecx
+# else
+	xorl	%ecx, %ecx
+	subl	%eax, %ecx
 # endif
 
-# ifndef USE_AS_RAWMEMCHR
-	.p2align 4,, 5
-L(last_4x_vec_or_less_cmpeq):
-	VPCMP	$0, (VEC_SIZE * 5)(%rdi), %YMMMATCH, %k0
-	kmovd	%k0, %eax
-	subq	$-(VEC_SIZE * 4), %rdi
-	/* Check first VEC regardless.  */
-	testl	%eax, %eax
-	jnz	L(first_vec_x1_check)
+	xorq	%rdi, %rax
+	VPCMPEQ	(PAGE_SIZE - VEC_SIZE)(%rax), %VMATCH, %k0
+	KMOV	%k0, %VRAX
 
-	/* If remaining length <= CHAR_PER_VEC * 2.  */
-	addl	$(CHAR_PER_VEC * 2), %edx
-	jle	L(last_2x_vec)
+# ifdef USE_AS_WMEMCHR
+	/* NB: Divide by CHAR_SIZE to shift out out-of-bounds bytes.  */
+	shrl	$2, %ecx
+	andl	$(CHAR_PER_VEC - 1), %ecx
+# endif
 
-	.p2align 4
-L(last_4x_vec):
-	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
-	kmovd	%k0, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x2)
 
+	shrx	%VGPR(PC_SHIFT_GPR), %VRAX, %VRAX
 
-	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
-	kmovd	%k0, %eax
-	/* Create mask for possible matches within remaining length.  */
-#  ifdef USE_AS_WMEMCHR
-	movl	$((1 << (CHAR_PER_VEC * 2)) - 1), %ecx
-	bzhil	%edx, %ecx, %ecx
-#  else
-	movq	$-1, %rcx
-	bzhiq	%rdx, %rcx, %rcx
-#  endif
-	/* Test matches in data against length match.  */
-	andl	%ecx, %eax
-	jnz	L(last_vec_x3)
+# ifdef USE_AS_WMEMCHR
+	negl	%ecx
+# endif
 
-	/* if remaining length <= CHAR_PER_VEC * 3 (Note this is after
-	   remaining length was found to be > CHAR_PER_VEC * 2.  */
-	subl	$CHAR_PER_VEC, %edx
-	jbe	L(zero_end2)
+	/* Mask the lower bits of ecx (negative eax) to get the bytes
+	   until the next VEC.  */
+	andl	$(CHAR_PER_VEC - 1), %ecx
 
+	/* Check if VEC is entirely contained in the remainder of the
+	   page.  */
+	cmpq	%rcx, %rdx
+	jbe	L(page_cross_ret)
 
-	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
-	kmovd	%k0, %eax
-	/* Shift remaining length mask for last VEC.  */
-#  ifdef USE_AS_WMEMCHR
-	shrl	$CHAR_PER_VEC, %ecx
-#  else
-	shrq	$CHAR_PER_VEC, %rcx
-#  endif
-	andl	%ecx, %eax
-	jz	L(zero_end2)
-	bsfl	%eax, %eax
-	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
-L(zero_end2):
-	ret
+	/* Length crosses the page so if rax is zero (no matches)
+	   continue.  */
+	test	%VRAX, %VRAX
+	jz	L(page_cross_continue)
 
-L(last_vec_x2):
-	tzcntl	%eax, %eax
-	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+	/* If rdx > rcx then any match here must be in [buf:buf + len].
+	 */
+	tzcnt	%VRAX, %VRAX
+# ifdef USE_AS_WMEMCHR
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+# else
+	addq	%rdi, %rax
+# endif
 	ret
 
-	.p2align 4
-L(last_vec_x3):
-	tzcntl	%eax, %eax
-	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+	.p2align 4,, 2
+L(page_cross_zero):
+	xorl	%eax, %eax
 	ret
+
+	.p2align 4,, 4
+L(page_cross_ret):
+	/* Search is entirely contained in page cross case.  */
+# ifdef USE_AS_WMEMCHR
+	test	%VRAX, %VRAX
+	jz	L(page_cross_zero)
+# endif
+	tzcnt	%VRAX, %VRAX
+	cmpl	%eax, %edx
+	jbe	L(page_cross_zero)
+# ifdef USE_AS_WMEMCHR
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+# else
+	addq	%rdi, %rax
 # endif
-	/* 7 bytes from next cache line.  */
+	ret
 END (MEMCHR)
 #endif
diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S b/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S
index deda1ca395..2073eaa620 100644
--- a/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S
+++ b/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S
@@ -1,3 +1,6 @@
-#define MEMCHR __rawmemchr_evex_rtm
-#define USE_AS_RAWMEMCHR 1
-#include "memchr-evex-rtm.S"
+#define RAWMEMCHR	__rawmemchr_evex_rtm
+
+#define USE_IN_RTM	1
+#define SECTION(p)	p##.evex.rtm
+
+#include "rawmemchr-evex.S"
diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex.S b/sysdeps/x86_64/multiarch/rawmemchr-evex.S
index dc1c450699..dad54def2b 100644
--- a/sysdeps/x86_64/multiarch/rawmemchr-evex.S
+++ b/sysdeps/x86_64/multiarch/rawmemchr-evex.S
@@ -1,7 +1,308 @@
-#ifndef RAWMEMCHR
-# define RAWMEMCHR	__rawmemchr_evex
-#endif
-#define USE_AS_RAWMEMCHR	1
-#define MEMCHR	RAWMEMCHR
+/* rawmemchr optimized with 256-bit EVEX instructions.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <isa-level.h>
+#include <sysdep.h>
+
+#if ISA_SHOULD_BUILD (4)
+
+# ifndef VEC_SIZE
+#  include "x86-evex256-vecs.h"
+# endif
+
+# ifndef RAWMEMCHR
+#  define RAWMEMCHR	__rawmemchr_evex
+# endif
+
+
+# define PC_SHIFT_GPR	rdi
+# define REG_WIDTH	VEC_SIZE
+# define VPTESTN	vptestnmb
+# define VPBROADCAST	vpbroadcastb
+# define VPMINU	vpminub
+# define VPCMP	vpcmpb
+# define VPCMPEQ	vpcmpeqb
+# define CHAR_SIZE	1
+
+# include "reg-macros.h"
+
+/* If not using RTM and VEC_SIZE != 64 (there is no VEX
+   encoding for VEC_SIZE == 64), use VEX encoding in the loop
+   so we can use vpcmpeqb + vptern, which is more efficient
+   than the EVEX alternative.  */
+# if defined USE_IN_RTM || VEC_SIZE == 64
+#  undef COND_VZEROUPPER
+#  undef VZEROUPPER_RETURN
+#  undef VZEROUPPER
+
+
+#  define COND_VZEROUPPER
+#  define VZEROUPPER_RETURN	ret
+#  define VZEROUPPER
+
+#  define USE_TERN_IN_LOOP	0
+# else
+#  define USE_TERN_IN_LOOP	1
+#  undef VZEROUPPER
+#  define VZEROUPPER	vzeroupper
+# endif
+
+# define CHAR_PER_VEC	VEC_SIZE
+
+# if CHAR_PER_VEC == 64
+
+#  define TAIL_RETURN_LBL	first_vec_x2
+#  define TAIL_RETURN_OFFSET	(CHAR_PER_VEC * 2)
+
+#  define FALLTHROUGH_RETURN_LBL	first_vec_x3
+#  define FALLTHROUGH_RETURN_OFFSET	(CHAR_PER_VEC * 3)
+
+# else	/* !(CHAR_PER_VEC == 64) */
+
+#  define TAIL_RETURN_LBL	first_vec_x3
+#  define TAIL_RETURN_OFFSET	(CHAR_PER_VEC * 3)
+
+#  define FALLTHROUGH_RETURN_LBL	first_vec_x2
+#  define FALLTHROUGH_RETURN_OFFSET	(CHAR_PER_VEC * 2)
+# endif	/* !(CHAR_PER_VEC == 64) */
+
+
+# define VMATCH	VMM(0)
+# define VMATCH_LO	VMM_lo(0)
+
+# define PAGE_SIZE	4096
+
+	.section SECTION(.text), "ax", @progbits
+ENTRY_P2ALIGN (RAWMEMCHR, 6)
+	VPBROADCAST %esi, %VMATCH
+	/* Check if we may cross page boundary with one vector load.  */
+	movl	%edi, %eax
+	andl	$(PAGE_SIZE - 1), %eax
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	ja	L(page_cross)
+
+	VPCMPEQ	(%rdi), %VMATCH, %k0
+	KMOV	%k0, %VRAX
+
+	test	%VRAX, %VRAX
+	jz	L(aligned_more)
+L(first_vec_x0):
+	bsf	%VRAX, %VRAX
+	addq	%rdi, %rax
+	ret
+
+	.p2align 4,, 4
+L(first_vec_x4):
+	bsf	%VRAX, %VRAX
+	leaq	(VEC_SIZE * 4)(%rdi, %rax), %rax
+	ret
 
-#include "memchr-evex.S"
+	/* For VEC_SIZE == 32 we can fit this in the aligning bytes so we
+	   might as well place it more locally.  For VEC_SIZE == 64 we
+	   reuse the return code at the end of the loop's return.  */
+# if VEC_SIZE == 32
+	.p2align 4,, 4
+L(FALLTHROUGH_RETURN_LBL):
+	bsf	%VRAX, %VRAX
+	leaq	(FALLTHROUGH_RETURN_OFFSET)(%rdi, %rax), %rax
+	ret
+# endif
+
+	.p2align 4,, 6
+L(page_cross):
+	/* eax has lower page-offset bits of rdi so xor will zero them
+	   out.  */
+	xorq	%rdi, %rax
+	VPCMPEQ	(PAGE_SIZE - VEC_SIZE)(%rax), %VMATCH, %k0
+	KMOV	%k0, %VRAX
+
+	/* Shift out out-of-bounds matches.  */
+	shrx	%VRDI, %VRAX, %VRAX
+	test	%VRAX, %VRAX
+	jnz	L(first_vec_x0)
+
+	.p2align 4,, 10
+L(aligned_more):
+L(page_cross_continue):
+	/* Align pointer.  */
+	andq	$(VEC_SIZE * -1), %rdi
+
+	VPCMPEQ	VEC_SIZE(%rdi), %VMATCH, %k0
+	KMOV	%k0, %VRAX
+	test	%VRAX, %VRAX
+	jnz	L(first_vec_x1)
+
+	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %VMATCH, %k0
+	KMOV	%k0, %VRAX
+	test	%VRAX, %VRAX
+	jnz	L(first_vec_x2)
+
+	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %VMATCH, %k0
+	KMOV	%k0, %VRAX
+	test	%VRAX, %VRAX
+	jnz	L(first_vec_x3)
+
+	VPCMPEQ	(VEC_SIZE * 4)(%rdi), %VMATCH, %k0
+	KMOV	%k0, %VRAX
+	test	%VRAX, %VRAX
+	jnz	L(first_vec_x4)
+
+	subq	$-(VEC_SIZE * 1), %rdi
+# if VEC_SIZE == 64
+	/* Saves code size.  No evex512 processor has partial register
+	   stalls.  If that changes this can be replaced with `andq
+	   $-(VEC_SIZE * 4), %rdi`.  */
+	xorb	%dil, %dil
+# else
+	andq	$-(VEC_SIZE * 4), %rdi
+# endif
+
+# if USE_TERN_IN_LOOP
+	/* copy VMATCH to low ymm so we can use vpcmpeq which is not
+	   encodable with EVEX registers.  NB: this is VEC_SIZE == 32
+	   only as there is no way to encode vpcmpeq with zmm0-15.  */
+	vmovdqa64 %VMATCH, %VMATCH_LO
+# endif
+
+	.p2align 4
+L(loop_4x_vec):
+	/* Two versions of the loop.  One that does not require
+	   vzeroupper by not using ymm0-15 and another that does
+	   require vzeroupper because it uses ymm0-15.  The reason why
+	   ymm0-15 is used at all is that there is no EVEX encoding of
+	   vpcmpeq, and with vpcmpeq this loop can be performed more
+	   efficiently.  The non-vzeroupper version is safe for RTM
+	   while the vzeroupper version should be preferred if RTM is
+	   not supported.  Which loop version we use is determined by
+	   USE_TERN_IN_LOOP.  */
+
+# if USE_TERN_IN_LOOP
+	/* Since vptern can only take 3x vectors, it is fastest to do 1
+	   vec separately with EVEX vpcmp.  */
+	VPCMPEQ	(VEC_SIZE * 4)(%rdi), %VMATCH, %k1
+	/* Compare 3x with vpcmpeq and or them all together with vptern.
+	 */
+
+	VPCMPEQ	(VEC_SIZE * 5)(%rdi), %VMATCH_LO, %VMM_lo(2)
+	subq	$(VEC_SIZE * -4), %rdi
+	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %VMATCH_LO, %VMM_lo(3)
+	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %VMATCH_LO, %VMM_lo(4)
+
+	/* 254 is mask for oring VEC_lo(2), VEC_lo(3), VEC_lo(4) into
+	   VEC_lo(4).  */
+	vpternlogd $254, %VMM_lo(2), %VMM_lo(3), %VMM_lo(4)
+	vpmovmskb %VMM_lo(4), %VRCX
+
+	KMOV	%k1, %eax
+
+	/* NB:  rax has match from first VEC and rcx has matches from
+	   VEC 2-4.  If rax is non-zero we will return that match.  If
+	   rax is zero adding won't disturb the bits in rcx.  */
+	add	%rax, %rcx
+# else
+	/* Loop version that uses EVEX encoding.  */
+	VPCMP	$4, (VEC_SIZE * 4)(%rdi), %VMATCH, %k1
+	vpxorq	(VEC_SIZE * 5)(%rdi), %VMATCH, %VMM(2)
+	vpxorq	(VEC_SIZE * 6)(%rdi), %VMATCH, %VMM(3)
+	VPCMPEQ	(VEC_SIZE * 7)(%rdi), %VMATCH, %k3
+	VPMINU	%VMM(2), %VMM(3), %VMM(3){%k1}{z}
+	VPTESTN	%VMM(3), %VMM(3), %k2
+	subq	$(VEC_SIZE * -4), %rdi
+	KORTEST %k2, %k3
+# endif
+	jz	L(loop_4x_vec)
+
+# if USE_TERN_IN_LOOP
+	test	%VRAX, %VRAX
+# else
+	KMOV	%k1, %VRAX
+	inc	%VRAX
+# endif
+	jnz	L(last_vec_x0)
+
+
+# if USE_TERN_IN_LOOP
+	vpmovmskb %VMM_lo(2), %VRAX
+# else
+	VPTESTN	%VMM(2), %VMM(2), %k1
+	KMOV	%k1, %VRAX
+# endif
+	test	%VRAX, %VRAX
+	jnz	L(last_vec_x1)
+
+
+# if USE_TERN_IN_LOOP
+	vpmovmskb %VMM_lo(3), %VRAX
+# else
+	KMOV	%k2, %VRAX
+# endif
+
+	/* No longer need any of the lo vecs (ymm0-15) so vzeroupper
+	   (only if we used the VEX encoded loop).  */
+	COND_VZEROUPPER
+
+	/* Separate logic for VEC_SIZE == 64 and VEC_SIZE == 32 for
+	   returning last 2x VEC. For VEC_SIZE == 64 we test each VEC
+	   individually, for VEC_SIZE == 32 we combine them in a single
+	   64-bit GPR.  */
+# if CHAR_PER_VEC == 64
+#  if USE_TERN_IN_LOOP
+#   error "Unsupported"
+#  endif
+
+
+	/* If CHAR_PER_VEC == 64 we can't combine the last two VEC.  */
+	test	%VRAX, %VRAX
+	jnz	L(first_vec_x2)
+	KMOV	%k3, %VRAX
+L(FALLTHROUGH_RETURN_LBL):
+# else
+	/* CHAR_PER_VEC <= 32 so we can combine the results from the
+	   last 2x VEC.  */
+#  if !USE_TERN_IN_LOOP
+	KMOV	%k3, %VRCX
+#  endif
+	salq	$CHAR_PER_VEC, %rcx
+	addq	%rcx, %rax
+# endif
+	bsf	%rax, %rax
+	leaq	(FALLTHROUGH_RETURN_OFFSET)(%rdi, %rax), %rax
+	ret
+
+	.p2align 4,, 8
+L(TAIL_RETURN_LBL):
+	bsf	%rax, %rax
+	leaq	(TAIL_RETURN_OFFSET)(%rdi, %rax), %rax
+	ret
+
+	.p2align 4,, 8
+L(last_vec_x1):
+	COND_VZEROUPPER
+L(first_vec_x1):
+	bsf	%VRAX, %VRAX
+	leaq	(VEC_SIZE * 1)(%rdi, %rax), %rax
+	ret
+
+	.p2align 4,, 8
+L(last_vec_x0):
+	COND_VZEROUPPER
+	bsf	%VRAX, %VRAX
+	addq	%rdi, %rax
+	ret
+END (RAWMEMCHR)
+#endif
-- 
2.34.1

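For reference, the `vpternlogd` immediates used in the loops above are
truth tables over the three source bits: 254 (0b11111110) is 1 for
every input combination except all-zero, i.e. A | B | C, while 1
(0b00000001) is its complement, ~(A | B | C).  A small scalar C model
of the instruction (illustrative only, not part of the patch):

#include <stdint.h>
#include <stdio.h>

/* Scalar model of vpternlog: per bit position, the 8-bit immediate is
   indexed by (a << 2) | (b << 1) | c of the three input bits.  */
static uint32_t
ternlog (uint32_t a, uint32_t b, uint32_t c, uint8_t imm)
{
  uint32_t r = 0;
  int bit;
  for (bit = 0; bit < 32; bit++)
    {
      unsigned int idx = (((a >> bit) & 1) << 2)
	| (((b >> bit) & 1) << 1) | ((c >> bit) & 1);
      r |= ((uint32_t) ((imm >> idx) & 1)) << bit;
    }
  return r;
}

int
main (void)
{
  uint32_t a = 0x0f0f, b = 0x00ff, c = 0x3300;
  /* Both print 1: imm 254 is OR, imm 1 is NOT of OR.  */
  printf ("%d\n", ternlog (a, b, c, 254) == (a | b | c));
  printf ("%d\n", ternlog (a, b, c, 1) == ~(a | b | c));
  return 0;
}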

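In the same hedged spirit, the `xorb %dil, %dil` idiom in the
VEC_SIZE == 64 paths above: rdi is already 64-byte aligned at that
point, so its low six bits are zero and clearing the whole low byte
aligns it down to 4 * VEC_SIZE == 256 in fewer code bytes than
`andq $-(VEC_SIZE * 4), %rdi`.  A quick check of the equivalence (the
pointer value below is made up):

#include <stdint.h>
#include <stdio.h>

int
main (void)
{
  uintptr_t rdi = 0x7fffdeadb0c0;	/* hypothetical 64-byte aligned value */
  uintptr_t by_and = rdi & ~(uintptr_t) 0xff;	/* andq $-256, %rdi */
  uintptr_t by_xor = rdi ^ (rdi & 0xff);	/* xorb %dil, %dil */
  printf ("%d\n", by_and == by_xor);	/* prints 1 */
  return 0;
}
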
^ permalink raw reply	[flat|nested] 41+ messages in thread

* [PATCH v3 2/7] x86: Shrink / minorly optimize strchr-evex and implement with VMM headers
  2022-10-19  0:44 ` [PATCH v3 1/7] x86: Optimize memchr-evex.S and implement with VMM headers Noah Goldstein
@ 2022-10-19  0:44   ` Noah Goldstein
  2022-10-19 16:53     ` H.J. Lu
  2022-10-19  0:44   ` [PATCH v3 3/7] x86: Optimize strnlen-evex.S " Noah Goldstein
                     ` (5 subsequent siblings)
  6 siblings, 1 reply; 41+ messages in thread
From: Noah Goldstein @ 2022-10-19  0:44 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

Size Optimizations:
1. Condense the hot path for better cache locality.
    - This has the most impact for strchrnul, where the logic for
      strings with len <= VEC_SIZE or with a match in the first VEC
      now fits entirely in the first cache line.
2. Reuse common targets in first 4x VEC and after the loop.
3. Don't align targets as aggressively if it doesn't change the number
   of fetch blocks required, and take more care to avoid cases where
   targets unnecessarily split cache lines.
4. Align the loop better for the DSB/LSD.
5. Use more code-size efficient instructions (see the encoding sketch below).
	- tzcnt ...     -> bsf ...
	- vpcmpb $0 ... -> vpcmpeq ...
6. Align labels less aggressively, especially where doing so doesn't save
   fetch blocks or causes the basic block to span extra cache lines.

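On point 5: `tzcnt` is encoded as an F3-prefixed `bsf`, so swapping
it for `bsf` where the mask is known non-zero saves one byte per
instruction, and the two return the same count for any non-zero
input.  A small check, assuming BMI1 is available for `_tzcnt_u32`:

#include <immintrin.h>	/* _tzcnt_u32, requires BMI1.  */
#include <stdio.h>

int
main (void)
{
  unsigned int mask = 0x40;
  /* __builtin_ctz typically compiles to bsf or tzcnt; for non-zero
     inputs the results are identical.  Only for mask == 0 do they
     differ: the ISA leaves bsf's destination undefined, while tzcnt
     returns the operand width (32).  */
  printf ("%u %u\n", __builtin_ctz (mask), _tzcnt_u32 (mask)); /* 6 6 */
  return 0;
}
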
Code Size Changes:
strchr-evex.S	: -63 bytes
strchrnul-evex.S: -48 bytes

Net perf changes:
Reported as geometric mean of all improvements / regressions from N=10
runs of the benchtests. Values are New Time / Old Time, so < 1.0 is an
improvement and > 1.0 is a regression.

strchr-evex.S (Fixed)   : 0.971
strchr-evex.S (Rand)    : 0.932
strchrnul-evex.S        : 0.965
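
For reference, a minimal sketch (names are illustrative, not from the
patch) of how such a geometric-mean summary of per-benchmark ratios
can be computed:

    #include <math.h>
    #include <stddef.h>
    #include <stdio.h>

    /* Geometric mean of new/old time ratios: exp (mean (log (r_i))).
       Values < 1.0 are improvements, > 1.0 regressions.  */
    static double
    geomean (const double *ratios, size_t n)
    {
      double log_sum = 0.0;
      for (size_t i = 0; i < n; i++)
        log_sum += log (ratios[i]);
      return exp (log_sum / n);
    }

    int
    main (void)
    {
      double ratios[] = { 0.91, 1.02, 0.88 };	/* Hypothetical.  */
      printf ("%.3f\n", geomean (ratios, sizeof ratios / sizeof *ratios));
      return 0;
    }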

Full results attached in email.

Full check passes on x86-64.
---
 sysdeps/x86_64/multiarch/strchr-evex.S | 558 +++++++++++++++----------
 1 file changed, 340 insertions(+), 218 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/strchr-evex.S b/sysdeps/x86_64/multiarch/strchr-evex.S
index a1c15c4419..c2a0d112f7 100644
--- a/sysdeps/x86_64/multiarch/strchr-evex.S
+++ b/sysdeps/x86_64/multiarch/strchr-evex.S
@@ -26,48 +26,75 @@
 #  define STRCHR	__strchr_evex
 # endif
 
-# define VMOVU		vmovdqu64
-# define VMOVA		vmovdqa64
+# ifndef VEC_SIZE
+#  include "x86-evex256-vecs.h"
+# endif
 
 # ifdef USE_AS_WCSCHR
 #  define VPBROADCAST	vpbroadcastd
-#  define VPCMP		vpcmpd
+#  define VPCMP	vpcmpd
+#  define VPCMPEQ	vpcmpeqd
 #  define VPTESTN	vptestnmd
+#  define VPTEST	vptestmd
 #  define VPMINU	vpminud
 #  define CHAR_REG	esi
-#  define SHIFT_REG	ecx
+#  define SHIFT_REG	rcx
 #  define CHAR_SIZE	4
+
+#  define USE_WIDE_CHAR
 # else
 #  define VPBROADCAST	vpbroadcastb
-#  define VPCMP		vpcmpb
+#  define VPCMP	vpcmpb
+#  define VPCMPEQ	vpcmpeqb
 #  define VPTESTN	vptestnmb
+#  define VPTEST	vptestmb
 #  define VPMINU	vpminub
 #  define CHAR_REG	sil
-#  define SHIFT_REG	edx
+#  define SHIFT_REG	rdi
 #  define CHAR_SIZE	1
 # endif
 
-# define XMMZERO	xmm16
-
-# define YMMZERO	ymm16
-# define YMM0		ymm17
-# define YMM1		ymm18
-# define YMM2		ymm19
-# define YMM3		ymm20
-# define YMM4		ymm21
-# define YMM5		ymm22
-# define YMM6		ymm23
-# define YMM7		ymm24
-# define YMM8		ymm25
-
-# define VEC_SIZE 32
-# define PAGE_SIZE 4096
-# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
-
-	.section .text.evex,"ax",@progbits
-ENTRY_P2ALIGN (STRCHR, 5)
-	/* Broadcast CHAR to YMM0.	*/
-	VPBROADCAST	%esi, %YMM0
+# include "reg-macros.h"
+
+# if VEC_SIZE == 64
+#  define MASK_GPR	rcx
+#  define LOOP_REG	rax
+
+#  define COND_MASK(k_reg)	{%k_reg}
+# else
+#  define MASK_GPR	rax
+#  define LOOP_REG	rdi
+
+#  define COND_MASK(k_reg)
+# endif
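+
+/* COND_MASK(k_reg) expands to a {%k_reg} write-mask only for VEC_SIZE
+   == 64, where the esi matches still need to be merged into the null
+   test; for VEC_SIZE == 32 they are already folded into the vector
+   results.  */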
+
+# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
+
+
+# if CHAR_PER_VEC == 64
+#  define LAST_VEC_OFFSET	(VEC_SIZE * 3)
+#  define TESTZ(reg)	incq %VGPR_SZ(reg, 64)
+# else
+
+#  if CHAR_PER_VEC == 32
+#   define TESTZ(reg)	incl %VGPR_SZ(reg, 32)
+#  elif CHAR_PER_VEC == 16
+#   define TESTZ(reg)	incw %VGPR_SZ(reg, 16)
+#  else
+#   define TESTZ(reg)	incb %VGPR_SZ(reg, 8)
+#  endif
+
+#  define LAST_VEC_OFFSET	(VEC_SIZE * 2)
+# endif
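+
+/* With TESTZ the masks in use have a 1 for "no match and no null", so
+   a register of all ones increments to zero (setting ZF) while any
+   zero bit (a match or null) leaves the result non-zero.  */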
+
+# define VMATCH	VMM(0)
+
+# define PAGE_SIZE	4096
+
+	.section SECTION(.text), "ax", @progbits
+ENTRY_P2ALIGN (STRCHR, 6)
+	/* Broadcast CHAR to VEC_0.  */
+	VPBROADCAST %esi, %VMATCH
 	movl	%edi, %eax
 	andl	$(PAGE_SIZE - 1), %eax
 	/* Check if we cross page boundary with one vector load.
@@ -75,19 +102,27 @@ ENTRY_P2ALIGN (STRCHR, 5)
 	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
 	ja	L(cross_page_boundary)
 
+
 	/* Check the first VEC_SIZE bytes. Search for both CHAR and the
 	   null bytes.  */
-	VMOVU	(%rdi), %YMM1
-
+	VMOVU	(%rdi), %VMM(1)
 	/* Leaves only CHARS matching esi as 0.  */
-	vpxorq	%YMM1, %YMM0, %YMM2
-	VPMINU	%YMM2, %YMM1, %YMM2
-	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
-	VPTESTN	%YMM2, %YMM2, %k0
-	kmovd	%k0, %eax
-	testl	%eax, %eax
+	vpxorq	%VMM(1), %VMATCH, %VMM(2)
+	VPMINU	%VMM(2), %VMM(1), %VMM(2)
+	/* Each bit in K0 represents a CHAR or a null byte in VEC_1.  */
+	VPTESTN	%VMM(2), %VMM(2), %k0
+	KMOV	%k0, %VRAX
+# if VEC_SIZE == 64 && defined USE_AS_STRCHRNUL
+	/* If VEC_SIZE == 64 && STRCHRNUL use bsf to test the condition
+	   so that all logic for a match/null in the first VEC fits in
+	   1x cache line.  This has a slight cost for larger sizes.  */
+	bsf	%VRAX, %VRAX
+	jz	L(aligned_more)
+# else
+	test	%VRAX, %VRAX
 	jz	L(aligned_more)
-	tzcntl	%eax, %eax
+	bsf	%VRAX, %VRAX
+# endif
 # ifndef USE_AS_STRCHRNUL
 	/* Found CHAR or the null byte.  */
 	cmp	(%rdi, %rax, CHAR_SIZE), %CHAR_REG
@@ -109,287 +144,374 @@ ENTRY_P2ALIGN (STRCHR, 5)
 # endif
 	ret
 
-
-
-	.p2align 4,, 10
-L(first_vec_x4):
-# ifndef USE_AS_STRCHRNUL
-	/* Check to see if first match was CHAR (k0) or null (k1).  */
-	kmovd	%k0, %eax
-	tzcntl	%eax, %eax
-	kmovd	%k1, %ecx
-	/* bzhil will not be 0 if first match was null.  */
-	bzhil	%eax, %ecx, %ecx
-	jne	L(zero)
-# else
-	/* Combine CHAR and null matches.  */
-	kord	%k0, %k1, %k0
-	kmovd	%k0, %eax
-	tzcntl	%eax, %eax
-# endif
-	/* NB: Multiply sizeof char type (1 or 4) to get the number of
-	   bytes.  */
-	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
-	ret
-
 # ifndef USE_AS_STRCHRNUL
 L(zero):
 	xorl	%eax, %eax
 	ret
 # endif
 
-
-	.p2align 4
+	.p2align 4,, 2
+L(first_vec_x3):
+	subq	$-(VEC_SIZE * 2), %rdi
+# if VEC_SIZE == 32
+	/* Reuse L(first_vec_x3) for last VEC2 only for VEC_SIZE == 32.
+	   For VEC_SIZE == 64 the registers don't match.  */
+L(last_vec_x2):
+# endif
 L(first_vec_x1):
 	/* Use bsf here to save 1 byte, keeping the block in 1x fetch
 	   block.  rcx guaranteed non-zero.  */
-	bsfl	%eax, %eax
+	bsf	%VRCX, %VRCX
 # ifndef USE_AS_STRCHRNUL
-	/* Found CHAR or the null byte.	 */
-	cmp	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
+	/* Found CHAR or the null byte.  */
+	cmp	(VEC_SIZE)(%rdi, %rcx, CHAR_SIZE), %CHAR_REG
 	jne	L(zero)
-
 # endif
 	/* NB: Multiply sizeof char type (1 or 4) to get the number of
 	   bytes.  */
-	leaq	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
+	leaq	(VEC_SIZE)(%rdi, %rcx, CHAR_SIZE), %rax
 	ret
 
-	.p2align 4,, 10
+	.p2align 4,, 2
+L(first_vec_x4):
+	subq	$-(VEC_SIZE * 2), %rdi
 L(first_vec_x2):
 # ifndef USE_AS_STRCHRNUL
 	/* Check to see if first match was CHAR (k0) or null (k1).  */
-	kmovd	%k0, %eax
-	tzcntl	%eax, %eax
-	kmovd	%k1, %ecx
+	KMOV	%k0, %VRAX
+	tzcnt	%VRAX, %VRAX
+	KMOV	%k1, %VRCX
 	/* bzhil will not be 0 if first match was null.  */
-	bzhil	%eax, %ecx, %ecx
+	bzhi	%VRAX, %VRCX, %VRCX
 	jne	L(zero)
 # else
 	/* Combine CHAR and null matches.  */
-	kord	%k0, %k1, %k0
-	kmovd	%k0, %eax
-	tzcntl	%eax, %eax
+	KOR	%k0, %k1, %k0
+	KMOV	%k0, %VRAX
+	bsf	%VRAX, %VRAX
 # endif
 	/* NB: Multiply sizeof char type (1 or 4) to get the number of
 	   bytes.  */
 	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
 	ret
 
-	.p2align 4,, 10
-L(first_vec_x3):
-	/* Use bsf here to save 1-byte keeping keeping the block in 1x
-	   fetch block. eax guranteed non-zero.  */
-	bsfl	%eax, %eax
-# ifndef USE_AS_STRCHRNUL
-	/* Found CHAR or the null byte.	 */
-	cmp	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
-	jne	L(zero)
+# ifdef USE_AS_STRCHRNUL
+	/* We use this as a hook to get imm8 encoding for the jump to
+	   L(cross_page_boundary).  This allows the hot case of a
+	   match/null-term in the first VEC to fit entirely in 1 cache
+	   line.  */
+L(cross_page_boundary):
+	jmp	L(cross_page_boundary_real)
 # endif
-	/* NB: Multiply sizeof char type (1 or 4) to get the number of
-	   bytes.  */
-	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
-	ret
 
 	.p2align 4
 L(aligned_more):
+L(cross_page_continue):
 	/* Align data to VEC_SIZE.  */
 	andq	$-VEC_SIZE, %rdi
-L(cross_page_continue):
-	/* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time since
-	   data is only aligned to VEC_SIZE. Use two alternating methods
-	   for checking VEC to balance latency and port contention.  */
 
-	/* This method has higher latency but has better port
-	   distribution.  */
-	VMOVA	(VEC_SIZE)(%rdi), %YMM1
+	/* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time
+	   since data is only aligned to VEC_SIZE. Use two alternating
+	   methods for checking VEC to balance latency and port
+	   contention.  */
+
+	/* Method(1) with 8c latency:
+	   For VEC_SIZE == 32:
+	   p0 * 1.83, p1 * 0.83, p5 * 1.33
+	   For VEC_SIZE == 64:
+	   p0 * 2.50, p1 * 0.00, p5 * 1.50  */
+	VMOVA	(VEC_SIZE)(%rdi), %VMM(1)
 	/* Leaves only CHARS matching esi as 0.  */
-	vpxorq	%YMM1, %YMM0, %YMM2
-	VPMINU	%YMM2, %YMM1, %YMM2
-	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
-	VPTESTN	%YMM2, %YMM2, %k0
-	kmovd	%k0, %eax
-	testl	%eax, %eax
+	vpxorq	%VMM(1), %VMATCH, %VMM(2)
+	VPMINU	%VMM(2), %VMM(1), %VMM(2)
+	/* Each bit in K0 represents a CHAR or a null byte in VEC_1.  */
+	VPTESTN	%VMM(2), %VMM(2), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
 	jnz	L(first_vec_x1)
 
-	/* This method has higher latency but has better port
-	   distribution.  */
-	VMOVA	(VEC_SIZE * 2)(%rdi), %YMM1
-	/* Each bit in K0 represents a CHAR in YMM1.  */
-	VPCMP	$0, %YMM1, %YMM0, %k0
-	/* Each bit in K1 represents a CHAR in YMM1.  */
-	VPTESTN	%YMM1, %YMM1, %k1
-	kortestd	%k0, %k1
+	/* Method(2) with 6c latency:
+	   For VEC_SIZE == 32:
+	   p0 * 1.00, p1 * 0.00, p5 * 2.00
+	   For VEC_SIZE == 64:
+	   p0 * 1.00, p1 * 0.00, p5 * 2.00  */
+	VMOVA	(VEC_SIZE * 2)(%rdi), %VMM(1)
+	/* Each bit in K0 represents a CHAR in VEC_1.  */
+	VPCMPEQ	%VMM(1), %VMATCH, %k0
+	/* Each bit in K1 represents a CHAR in VEC_1.  */
+	VPTESTN	%VMM(1), %VMM(1), %k1
+	KORTEST %k0, %k1
 	jnz	L(first_vec_x2)
 
-	VMOVA	(VEC_SIZE * 3)(%rdi), %YMM1
+	/* By swapping between Method 1/2 we get a fairer port
+	   distribution and better throughput.  */
+
+	VMOVA	(VEC_SIZE * 3)(%rdi), %VMM(1)
 	/* Leaves only CHARS matching esi as 0.  */
-	vpxorq	%YMM1, %YMM0, %YMM2
-	VPMINU	%YMM2, %YMM1, %YMM2
-	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
-	VPTESTN	%YMM2, %YMM2, %k0
-	kmovd	%k0, %eax
-	testl	%eax, %eax
+	vpxorq	%VMM(1), %VMATCH, %VMM(2)
+	VPMINU	%VMM(2), %VMM(1), %VMM(2)
+	/* Each bit in K0 represents a CHAR or a null byte in VEC_1.  */
+	VPTESTN	%VMM(2), %VMM(2), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
 	jnz	L(first_vec_x3)
 
-	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM1
-	/* Each bit in K0 represents a CHAR in YMM1.  */
-	VPCMP	$0, %YMM1, %YMM0, %k0
-	/* Each bit in K1 represents a CHAR in YMM1.  */
-	VPTESTN	%YMM1, %YMM1, %k1
-	kortestd	%k0, %k1
+	VMOVA	(VEC_SIZE * 4)(%rdi), %VMM(1)
+	/* Each bit in K0 represents a CHAR in VEC_1.  */
+	VPCMPEQ	%VMM(1), %VMATCH, %k0
+	/* Each bit in K1 represents a CHAR in VEC_1.  */
+	VPTESTN	%VMM(1), %VMM(1), %k1
+	KORTEST %k0, %k1
 	jnz	L(first_vec_x4)
 
 	/* Align data to VEC_SIZE * 4 for the loop.  */
+# if VEC_SIZE == 64
+	/* Use rax for the loop reg as it allows the loop to fit in
+	   exactly 2 cache lines (more efficient imm32 + gpr
+	   encoding).  */
+	leaq	(VEC_SIZE)(%rdi), %rax
+	/* No partial register stalls on evex512 processors.  */
+	xorb	%al, %al
+# else
+	/* For VEC_SIZE == 32 continue using rdi for loop reg so we can
+	   reuse more code and save space.  */
 	addq	$VEC_SIZE, %rdi
 	andq	$-(VEC_SIZE * 4), %rdi
-
+# endif
 	.p2align 4
 L(loop_4x_vec):
-	/* Check 4x VEC at a time. No penalty to imm32 offset with evex
-	   encoding.  */
-	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM1
-	VMOVA	(VEC_SIZE * 5)(%rdi), %YMM2
-	VMOVA	(VEC_SIZE * 6)(%rdi), %YMM3
-	VMOVA	(VEC_SIZE * 7)(%rdi), %YMM4
-
-	/* For YMM1 and YMM3 use xor to set the CHARs matching esi to
+	/* Check 4x VEC at a time. No penalty for imm32 offset with evex
+	   encoding (if offset % VEC_SIZE == 0).  */
+	VMOVA	(VEC_SIZE * 4)(%LOOP_REG), %VMM(1)
+	VMOVA	(VEC_SIZE * 5)(%LOOP_REG), %VMM(2)
+	VMOVA	(VEC_SIZE * 6)(%LOOP_REG), %VMM(3)
+	VMOVA	(VEC_SIZE * 7)(%LOOP_REG), %VMM(4)
+
+	/* Collect bits where VEC_1 does NOT match esi.  This is later
+	   used to mask off results (collecting non-matches lets us
+	   save an instruction on combining).  */
+	VPCMP	$4, %VMATCH, %VMM(1), %k1
+
+	/* Two methods for the loop depending on VEC_SIZE.  This is
+	   because with zmm registers VPMINU can only run on p0 (as
+	   opposed to p0/p1 for ymm) so it is less preferred.  */
+# if VEC_SIZE == 32
+	/* For VEC_2 and VEC_3 use xor to set the CHARs matching esi to
 	   zero.  */
-	vpxorq	%YMM1, %YMM0, %YMM5
-	/* For YMM2 and YMM4 cmp not equals to CHAR and store result in
-	   k register. Its possible to save either 1 or 2 instructions
-	   using cmp no equals method for either YMM1 or YMM1 and YMM3
-	   respectively but bottleneck on p5 makes it not worth it.  */
-	VPCMP	$4, %YMM0, %YMM2, %k2
-	vpxorq	%YMM3, %YMM0, %YMM7
-	VPCMP	$4, %YMM0, %YMM4, %k4
-
-	/* Use min to select all zeros from either xor or end of string).
-	 */
-	VPMINU	%YMM1, %YMM5, %YMM1
-	VPMINU	%YMM3, %YMM7, %YMM3
+	vpxorq	%VMM(2), %VMATCH, %VMM(6)
+	vpxorq	%VMM(3), %VMATCH, %VMM(7)
 
-	/* Use min + zeromask to select for zeros. Since k2 and k4 will
-	   have 0 as positions that matched with CHAR which will set
-	   zero in the corresponding destination bytes in YMM2 / YMM4.
-	 */
-	VPMINU	%YMM1, %YMM2, %YMM2{%k2}{z}
-	VPMINU	%YMM3, %YMM4, %YMM4
-	VPMINU	%YMM2, %YMM4, %YMM4{%k4}{z}
-
-	VPTESTN	%YMM4, %YMM4, %k1
-	kmovd	%k1, %ecx
-	subq	$-(VEC_SIZE * 4), %rdi
-	testl	%ecx, %ecx
+	/* Find non-matches in VEC_4 while combining with non-matches
+	   from VEC_1.  NB: Try and use masked predicate execution on
+	   instructions that have mask result as it has no latency
+	   penalty.  */
+	VPCMP	$4, %VMATCH, %VMM(4), %k4{%k1}
+
+	/* Combined zeros from VEC_1 / VEC_2 (search for null term).  */
+	VPMINU	%VMM(1), %VMM(2), %VMM(2)
+
+	/* Use min to select all zeros from either xor or end of
+	   string.  */
+	VPMINU	%VMM(3), %VMM(7), %VMM(3)
+	VPMINU	%VMM(2), %VMM(6), %VMM(2)
+
+	/* Combined zeros from VEC_3 / VEC_4 (search for null term).  */
+	VPMINU	%VMM(3), %VMM(4), %VMM(4)
+
+	/* Combined zeros from VEC_2 / VEC_4 (this has all null term and
+	   esi matches for VEC_2 / VEC_3).  */
+	VPMINU	%VMM(2), %VMM(4), %VMM(4)
+# else
+	/* Collect non-matches for VEC_2.  */
+	VPCMP	$4, %VMM(2), %VMATCH, %k2
+
+	/* Combined zeros from VEC_1 / VEC_2 (search for null term).  */
+	VPMINU	%VMM(1), %VMM(2), %VMM(2)
+
+	/* Find non-matches in VEC_3/VEC_4 while combining with non-
+	   matches from VEC_1/VEC_2 respectively.  */
+	VPCMP	$4, %VMM(3), %VMATCH, %k3{%k1}
+	VPCMP	$4, %VMM(4), %VMATCH, %k4{%k2}
+
+	/* Finish combining zeros in all VECs.  */
+	VPMINU	%VMM(3), %VMM(4), %VMM(4)
+
+	/* Combine in esi matches for VEC_3 (if there was a match with
+	   esi, the corresponding bit in %k3 is zero so the
+	   VPMINU_MASKZ will have a zero in the result).  NB: This makes
+	   the VPMINU 3c latency.  The only way to avoid it is to
+	   create a 12c dependency chain on all the `VPCMP $4, ...`
+	   which has higher total latency.  */
+	VPMINU	%VMM(2), %VMM(4), %VMM(4){%k3}{z}
+# endif
+	VPTEST	%VMM(4), %VMM(4), %k0{%k4}
+	KMOV	%k0, %VRDX
+	subq	$-(VEC_SIZE * 4), %LOOP_REG
+
+	/* TESTZ is an inc using the proper register width depending on
+	   CHAR_PER_VEC.  An esi match or null-term match leaves a zero
+	   bit in rdx, so the inc won't wrap around to zero.  */
+	TESTZ	(rdx)
 	jz	L(loop_4x_vec)
 
-	VPTESTN	%YMM1, %YMM1, %k0
-	kmovd	%k0, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x1)
+	VPTEST	%VMM(1), %VMM(1), %k0{%k1}
+	KMOV	%k0, %VGPR(MASK_GPR)
+	TESTZ	(MASK_GPR)
+# if VEC_SIZE == 32
+	/* We can reuse the return code in page_cross logic for VEC_SIZE
+	   == 32.  */
+	jnz	L(last_vec_x1_vec_size32)
+# else
+	jnz	L(last_vec_x1_vec_size64)
+# endif
+
 
-	VPTESTN	%YMM2, %YMM2, %k0
-	kmovd	%k0, %eax
-	testl	%eax, %eax
+	/* COND_MASK integrates the esi matches for VEC_SIZE == 64.  For
+	   VEC_SIZE == 32 they are already integrated.  */
+	VPTEST	%VMM(2), %VMM(2), %k0 COND_MASK(k2)
+	KMOV	%k0, %VRCX
+	TESTZ	(rcx)
 	jnz	L(last_vec_x2)
 
-	VPTESTN	%YMM3, %YMM3, %k0
-	kmovd	%k0, %eax
-	/* Combine YMM3 matches (eax) with YMM4 matches (ecx).  */
-# ifdef USE_AS_WCSCHR
-	sall	$8, %ecx
-	orl	%ecx, %eax
-	bsfl	%eax, %eax
+	VPTEST	%VMM(3), %VMM(3), %k0 COND_MASK(k3)
+	KMOV	%k0, %VRCX
+# if CHAR_PER_VEC == 64
+	TESTZ	(rcx)
+	jnz	L(last_vec_x3)
 # else
-	salq	$32, %rcx
-	orq	%rcx, %rax
-	bsfq	%rax, %rax
+	salq	$CHAR_PER_VEC, %rdx
+	TESTZ	(rcx)
+	orq	%rcx, %rdx
 # endif
+
+	bsfq	%rdx, %rdx
+
 # ifndef USE_AS_STRCHRNUL
 	/* Check if match was CHAR or null.  */
-	cmp	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
+	cmp	(LAST_VEC_OFFSET)(%LOOP_REG, %rdx, CHAR_SIZE), %CHAR_REG
 	jne	L(zero_end)
 # endif
 	/* NB: Multiply sizeof char type (1 or 4) to get the number of
 	   bytes.  */
-	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+	leaq	(LAST_VEC_OFFSET)(%LOOP_REG, %rdx, CHAR_SIZE), %rax
 	ret
 
-	.p2align 4,, 8
-L(last_vec_x1):
-	bsfl	%eax, %eax
-# ifdef USE_AS_WCSCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.
-	   */
-	leaq	(%rdi, %rax, CHAR_SIZE), %rax
-# else
-	addq	%rdi, %rax
+# ifndef USE_AS_STRCHRNUL
+L(zero_end):
+	xorl	%eax, %eax
+	ret
 # endif
 
-# ifndef USE_AS_STRCHRNUL
+
+	/* Separate return label for last VEC1 because for VEC_SIZE ==
+	   32 we can reuse return code in L(page_cross) but VEC_SIZE ==
+	   64 has mismatched registers.  */
+# if VEC_SIZE == 64
+	.p2align 4,, 8
+L(last_vec_x1_vec_size64):
+	bsf	%VRCX, %VRCX
+#  ifndef USE_AS_STRCHRNUL
 	/* Check if match was null.  */
-	cmp	(%rax), %CHAR_REG
+	cmp	(%rax, %rcx, CHAR_SIZE), %CHAR_REG
 	jne	L(zero_end)
-# endif
-
+#  endif
+#  ifdef USE_AS_WCSCHR
+	/* NB: Multiply wchar_t count by 4 to get the number of bytes.
+	 */
+	leaq	(%rax, %rcx, CHAR_SIZE), %rax
+#  else
+	addq	%rcx, %rax
+#  endif
 	ret
 
+	/* Since we can't combine the last 2x matches for CHAR_PER_VEC
+	   == 64 we need a return label for last VEC3.  */
+#  if CHAR_PER_VEC == 64
 	.p2align 4,, 8
+L(last_vec_x3):
+	addq	$VEC_SIZE, %LOOP_REG
+#  endif
+
+	/* Duplicate L(last_vec_x2) for VEC_SIZE == 64 because we can't
+	   reuse L(first_vec_x3) due to register mismatch.  */
 L(last_vec_x2):
-	bsfl	%eax, %eax
-# ifndef USE_AS_STRCHRNUL
+	bsf	%VGPR(MASK_GPR), %VGPR(MASK_GPR)
+#  ifndef USE_AS_STRCHRNUL
 	/* Check if match was null.  */
-	cmp	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
+	cmp	(VEC_SIZE * 1)(%LOOP_REG, %MASK_GPR, CHAR_SIZE), %CHAR_REG
 	jne	L(zero_end)
-# endif
+#  endif
 	/* NB: Multiply sizeof char type (1 or 4) to get the number of
 	   bytes.  */
-	leaq	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
+	leaq	(VEC_SIZE * 1)(%LOOP_REG, %MASK_GPR, CHAR_SIZE), %rax
 	ret
+# endif
 
-	/* Cold case for crossing page with first load.	 */
-	.p2align 4,, 8
+	/* Cold case for crossing page with first load.  */
+	.p2align 4,, 10
+# ifndef USE_AS_STRCHRNUL
 L(cross_page_boundary):
-	movq	%rdi, %rdx
+# endif
+L(cross_page_boundary_real):
 	/* Align rdi.  */
-	andq	$-VEC_SIZE, %rdi
-	VMOVA	(%rdi), %YMM1
-	/* Leaves only CHARS matching esi as 0.  */
-	vpxorq	%YMM1, %YMM0, %YMM2
-	VPMINU	%YMM2, %YMM1, %YMM2
-	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
-	VPTESTN	%YMM2, %YMM2, %k0
-	kmovd	%k0, %eax
+	xorq	%rdi, %rax
+	VMOVA	(PAGE_SIZE - VEC_SIZE)(%rax), %VMM(1)
+	/* Use the higher-latency method of getting matches to save code
+	   size.  */
+
+	/* K1 has 1s where VEC(1) does NOT match esi.  */
+	VPCMP	$4, %VMM(1), %VMATCH, %k1
+	/* K0 has ones where K1 is 1 (no esi match) and the byte is
+	   non-zero (no null).  */
+	VPTEST	%VMM(1), %VMM(1), %k0{%k1}
+	KMOV	%k0, %VRAX
 	/* Remove the leading bits.  */
 # ifdef USE_AS_WCSCHR
-	movl	%edx, %SHIFT_REG
+	movl	%edi, %VGPR_SZ(SHIFT_REG, 32)
 	/* NB: Divide shift count by 4 since each bit in K1 represent 4
 	   bytes.  */
-	sarl	$2, %SHIFT_REG
-	andl	$(CHAR_PER_VEC - 1), %SHIFT_REG
+	sarl	$2, %VGPR_SZ(SHIFT_REG, 32)
+	andl	$(CHAR_PER_VEC - 1), %VGPR_SZ(SHIFT_REG, 32)
+
+	/* For wcschr we need to invert the matches as we can't rely on
+	   the arithmetic shift to bring in ones (there is no sarx for
+	   gpr8/16).  Also note we can't use inc here as the lower bits
+	   represent matches out of range, so we can't rely on
+	   overflow.  */
+	xorl	$((1 << CHAR_PER_VEC) - 1), %eax
+# endif
+	/* Use arithmetic shift so that leading 1s are filled in.  */
+	sarx	%VGPR(SHIFT_REG), %VRAX, %VRAX
+	/* If eax is all ones then no matches for esi or NULL.  */
+
+# ifdef USE_AS_WCSCHR
+	test	%VRAX, %VRAX
+# else
+	inc	%VRAX
 # endif
-	sarxl	%SHIFT_REG, %eax, %eax
-	/* If eax is zero continue.  */
-	testl	%eax, %eax
 	jz	L(cross_page_continue)
-	bsfl	%eax, %eax
 
+	.p2align 4,, 10
+L(last_vec_x1_vec_size32):
+	bsf	%VRAX, %VRAX
 # ifdef USE_AS_WCSCHR
-	/* NB: Multiply wchar_t count by 4 to get the number of
-	   bytes.  */
-	leaq	(%rdx, %rax, CHAR_SIZE), %rax
+	/* NB: Multiply wchar_t count by 4 to get the number of bytes.
+	 */
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
 # else
-	addq	%rdx, %rax
+	addq	%rdi, %rax
 # endif
 # ifndef USE_AS_STRCHRNUL
 	/* Check to see if match was CHAR or null.  */
 	cmp	(%rax), %CHAR_REG
-	je	L(cross_page_ret)
-L(zero_end):
-	xorl	%eax, %eax
-L(cross_page_ret):
+	jne	L(zero_end_0)
 # endif
 	ret
+# ifndef USE_AS_STRCHRNUL
+L(zero_end_0):
+	xorl	%eax, %eax
+	ret
+# endif
 
 END (STRCHR)
 #endif
-- 
2.34.1



* [PATCH v3 3/7] x86: Optimize strnlen-evex.S and implement with VMM headers
  2022-10-19  0:44 ` [PATCH v3 1/7] x86: Optimize memchr-evex.S and implement with VMM headers Noah Goldstein
  2022-10-19  0:44   ` [PATCH v3 2/7] x86: Shrink / minorly optimize strchr-evex " Noah Goldstein
@ 2022-10-19  0:44   ` Noah Goldstein
  2022-10-19 16:57     ` H.J. Lu
  2022-10-19  0:44   ` [PATCH v3 4/7] x86: Optimize memrchr-evex.S Noah Goldstein
                     ` (4 subsequent siblings)
  6 siblings, 1 reply; 41+ messages in thread
From: Noah Goldstein @ 2022-10-19  0:44 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

Optimizations are:
1. Use the fact that bsf(0) leaves the destination unchanged to save a
   branch in the short string case (see the sketch after this list).
2. Restructure code so that small strings are given the hot path.
	- This is a net-zero on the benchmark suite but in general makes
      sense as smaller sizes are far more common.
3. Use more code-size efficient instructions.
	- tzcnt ...     -> bsf ...
	- vpcmpb $0 ... -> vpcmpeq ...
4. Align labels less aggressively, especially if it doesn't save fetch
   blocks / causes the basic-block to span extra cache-lines.
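
As a rough illustration of point 1, a minimal sketch in C with GCC
inline asm (names are illustrative, not from the patch): preload the
destination with the length, then bsf the match mask.  This relies on
bsf leaving the destination unchanged for a zero source, the de facto
x86-64 behavior the code depends on:

    #include <stdint.h>

    /* Index of the first set bit in MASK, or LEN if MASK is zero,
       without a branch.  */
    static inline uint64_t
    first_match_or_len (uint64_t mask, uint64_t len)
    {
      uint64_t res = len;
      __asm__ ("bsfq %1, %0" : "+r" (res) : "r" (mask) : "cc");
      return res;
    }

The entry path then clamps the result against the length bound with
cmov instead of branching on whether a null was found.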

The optimizations (especially for point 2) make the strnlen and
strlen code essentially incompatible so split strnlen-evex
to a new file.

Code Size Changes:
strlen-evex.S       :  -23 bytes
strnlen-evex.S      : -167 bytes

Net perf changes:

Reported as geometric mean of all improvements / regressions from N=10
runs of the benchtests. Value is New Time / Old Time, so < 1.0 is an
improvement and > 1.0 is a regression.

strlen-evex.S       : 0.992 (No real change)
strnlen-evex.S      : 0.947

Full results attached in email.

Full check passes on x86-64.
---
 sysdeps/x86_64/multiarch/strlen-evex.S  | 544 +++++++-----------------
 sysdeps/x86_64/multiarch/strnlen-evex.S | 427 ++++++++++++++++++-
 sysdeps/x86_64/multiarch/wcsnlen-evex.S |   5 +-
 3 files changed, 572 insertions(+), 404 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/strlen-evex.S b/sysdeps/x86_64/multiarch/strlen-evex.S
index 2109ec2f7a..487846f098 100644
--- a/sysdeps/x86_64/multiarch/strlen-evex.S
+++ b/sysdeps/x86_64/multiarch/strlen-evex.S
@@ -26,466 +26,220 @@
 #  define STRLEN	__strlen_evex
 # endif
 
-# define VMOVA		vmovdqa64
+# ifndef VEC_SIZE
+#  include "x86-evex256-vecs.h"
+# endif
 
 # ifdef USE_AS_WCSLEN
-#  define VPCMP		vpcmpd
+#  define VPCMPEQ	vpcmpeqd
+#  define VPCMPNEQ	vpcmpneqd
+#  define VPTESTN	vptestnmd
+#  define VPTEST	vptestmd
 #  define VPMINU	vpminud
-#  define SHIFT_REG ecx
 #  define CHAR_SIZE	4
+#  define CHAR_SIZE_SHIFT_REG(reg)	sar $2, %reg
 # else
-#  define VPCMP		vpcmpb
+#  define VPCMPEQ	vpcmpeqb
+#  define VPCMPNEQ	vpcmpneqb
+#  define VPTESTN	vptestnmb
+#  define VPTEST	vptestmb
 #  define VPMINU	vpminub
-#  define SHIFT_REG edx
 #  define CHAR_SIZE	1
+#  define CHAR_SIZE_SHIFT_REG(reg)
+
+#  define REG_WIDTH	VEC_SIZE
 # endif
 
-# define XMMZERO	xmm16
-# define YMMZERO	ymm16
-# define YMM1		ymm17
-# define YMM2		ymm18
-# define YMM3		ymm19
-# define YMM4		ymm20
-# define YMM5		ymm21
-# define YMM6		ymm22
-
-# define VEC_SIZE 32
-# define PAGE_SIZE 4096
-# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
-
-	.section .text.evex,"ax",@progbits
-ENTRY (STRLEN)
-# ifdef USE_AS_STRNLEN
-	/* Check zero length.  */
-	test	%RSI_LP, %RSI_LP
-	jz	L(zero)
-#  ifdef __ILP32__
-	/* Clear the upper 32 bits.  */
-	movl	%esi, %esi
-#  endif
-	mov	%RSI_LP, %R8_LP
+# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
+
+# include "reg-macros.h"
+
+# if CHAR_PER_VEC == 64
+
+#  define TAIL_RETURN_LBL	first_vec_x2
+#  define TAIL_RETURN_OFFSET	(CHAR_PER_VEC * 2)
+
+#  define FALLTHROUGH_RETURN_LBL	first_vec_x3
+#  define FALLTHROUGH_RETURN_OFFSET	(CHAR_PER_VEC * 3)
+
+# else
+
+#  define TAIL_RETURN_LBL	first_vec_x3
+#  define TAIL_RETURN_OFFSET	(CHAR_PER_VEC * 3)
+
+#  define FALLTHROUGH_RETURN_LBL	first_vec_x2
+#  define FALLTHROUGH_RETURN_OFFSET	(CHAR_PER_VEC * 2)
 # endif
+
+# define XZERO	VMM_128(0)
+# define VZERO	VMM(0)
+# define PAGE_SIZE	4096
+
+	.section SECTION(.text), "ax", @progbits
+ENTRY_P2ALIGN (STRLEN, 6)
 	movl	%edi, %eax
-	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
-	/* Clear high bits from edi. Only keeping bits relevant to page
-	   cross check.  */
+	vpxorq	%XZERO, %XZERO, %XZERO
 	andl	$(PAGE_SIZE - 1), %eax
-	/* Check if we may cross page boundary with one vector load.  */
 	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
 	ja	L(cross_page_boundary)
 
 	/* Check the first VEC_SIZE bytes.  Each bit in K0 represents a
 	   null byte.  */
-	VPCMP	$0, (%rdi), %YMMZERO, %k0
-	kmovd	%k0, %eax
-# ifdef USE_AS_STRNLEN
-	/* If length < CHAR_PER_VEC handle special.  */
-	cmpq	$CHAR_PER_VEC, %rsi
-	jbe	L(first_vec_x0)
-# endif
-	testl	%eax, %eax
+	VPCMPEQ	(%rdi), %VZERO, %k0
+	KMOV	%k0, %VRAX
+	test	%VRAX, %VRAX
 	jz	L(aligned_more)
-	tzcntl	%eax, %eax
-	ret
-# ifdef USE_AS_STRNLEN
-L(zero):
-	xorl	%eax, %eax
-	ret
-
-	.p2align 4
-L(first_vec_x0):
-	/* Set bit for max len so that tzcnt will return min of max len
-	   and position of first match.  */
-	btsq	%rsi, %rax
-	tzcntl	%eax, %eax
-	ret
-# endif
-
-	.p2align 4
-L(first_vec_x1):
-	tzcntl	%eax, %eax
-	/* Safe to use 32 bit instructions as these are only called for
-	   size = [1, 159].  */
-# ifdef USE_AS_STRNLEN
-	/* Use ecx which was computed earlier to compute correct value.
-	 */
-	leal	-(CHAR_PER_VEC * 4 + 1)(%rcx, %rax), %eax
-# else
-	subl	%edx, %edi
-#  ifdef USE_AS_WCSLEN
-	/* NB: Divide bytes by 4 to get the wchar_t count.  */
-	sarl	$2, %edi
-#  endif
-	leal	CHAR_PER_VEC(%rdi, %rax), %eax
-# endif
-	ret
-
-	.p2align 4
-L(first_vec_x2):
-	tzcntl	%eax, %eax
-	/* Safe to use 32 bit instructions as these are only called for
-	   size = [1, 159].  */
-# ifdef USE_AS_STRNLEN
-	/* Use ecx which was computed earlier to compute correct value.
-	 */
-	leal	-(CHAR_PER_VEC * 3 + 1)(%rcx, %rax), %eax
-# else
-	subl	%edx, %edi
-#  ifdef USE_AS_WCSLEN
-	/* NB: Divide bytes by 4 to get the wchar_t count.  */
-	sarl	$2, %edi
-#  endif
-	leal	(CHAR_PER_VEC * 2)(%rdi, %rax), %eax
-# endif
+	bsf	%VRAX, %VRAX
 	ret
 
-	.p2align 4
-L(first_vec_x3):
-	tzcntl	%eax, %eax
-	/* Safe to use 32 bit instructions as these are only called for
-	   size = [1, 159].  */
-# ifdef USE_AS_STRNLEN
-	/* Use ecx which was computed earlier to compute correct value.
-	 */
-	leal	-(CHAR_PER_VEC * 2 + 1)(%rcx, %rax), %eax
-# else
-	subl	%edx, %edi
-#  ifdef USE_AS_WCSLEN
-	/* NB: Divide bytes by 4 to get the wchar_t count.  */
-	sarl	$2, %edi
-#  endif
-	leal	(CHAR_PER_VEC * 3)(%rdi, %rax), %eax
-# endif
-	ret
-
-	.p2align 4
+	.p2align 4,, 8
 L(first_vec_x4):
-	tzcntl	%eax, %eax
-	/* Safe to use 32 bit instructions as these are only called for
-	   size = [1, 159].  */
-# ifdef USE_AS_STRNLEN
-	/* Use ecx which was computed earlier to compute correct value.
-	 */
-	leal	-(CHAR_PER_VEC + 1)(%rcx, %rax), %eax
-# else
-	subl	%edx, %edi
-#  ifdef USE_AS_WCSLEN
-	/* NB: Divide bytes by 4 to get the wchar_t count.  */
-	sarl	$2, %edi
-#  endif
+	bsf	%VRAX, %VRAX
+	subl	%ecx, %edi
+	CHAR_SIZE_SHIFT_REG (edi)
 	leal	(CHAR_PER_VEC * 4)(%rdi, %rax), %eax
-# endif
 	ret
 
-	.p2align 5
+
+
+	/* The strnlen equivalent of aligned_more compares the remaining
+	   length vs 2 * CHAR_PER_VEC, 4 * CHAR_PER_VEC, and 8 *
+	   CHAR_PER_VEC before going to the loop.  */
+	.p2align 4,, 10
 L(aligned_more):
-	movq	%rdi, %rdx
-	/* Align data to VEC_SIZE.  */
-	andq	$-(VEC_SIZE), %rdi
+	movq	%rdi, %rcx
+	andq	$(VEC_SIZE * -1), %rdi
 L(cross_page_continue):
-	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
-	   since data is only aligned to VEC_SIZE.  */
-# ifdef USE_AS_STRNLEN
-	/* + CHAR_SIZE because it simplies the logic in
-	   last_4x_vec_or_less.  */
-	leaq	(VEC_SIZE * 5 + CHAR_SIZE)(%rdi), %rcx
-	subq	%rdx, %rcx
-#  ifdef USE_AS_WCSLEN
-	/* NB: Divide bytes by 4 to get the wchar_t count.  */
-	sarl	$2, %ecx
-#  endif
-# endif
-	/* Load first VEC regardless.  */
-	VPCMP	$0, VEC_SIZE(%rdi), %YMMZERO, %k0
-# ifdef USE_AS_STRNLEN
-	/* Adjust length. If near end handle specially.  */
-	subq	%rcx, %rsi
-	jb	L(last_4x_vec_or_less)
-# endif
-	kmovd	%k0, %eax
-	testl	%eax, %eax
+	/* Check the next 4x VEC, only one VEC_SIZE at a time since data
+	   is only aligned to VEC_SIZE.  */
+	VPCMPEQ	(VEC_SIZE * 1)(%rdi), %VZERO, %k0
+	KMOV	%k0, %VRAX
+	test	%VRAX, %VRAX
 	jnz	L(first_vec_x1)
 
-	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
-	kmovd	%k0, %eax
-	test	%eax, %eax
+	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %VZERO, %k0
+	KMOV	%k0, %VRAX
+	test	%VRAX, %VRAX
 	jnz	L(first_vec_x2)
 
-	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
-	kmovd	%k0, %eax
-	testl	%eax, %eax
+	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %VZERO, %k0
+	KMOV	%k0, %VRAX
+	test	%VRAX, %VRAX
 	jnz	L(first_vec_x3)
 
-	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0
-	kmovd	%k0, %eax
-	testl	%eax, %eax
+	VPCMPEQ	(VEC_SIZE * 4)(%rdi), %VZERO, %k0
+	KMOV	%k0, %VRAX
+	test	%VRAX, %VRAX
 	jnz	L(first_vec_x4)
 
-	addq	$VEC_SIZE, %rdi
-# ifdef USE_AS_STRNLEN
-	/* Check if at last VEC_SIZE * 4 length.  */
-	cmpq	$(CHAR_PER_VEC * 4 - 1), %rsi
-	jbe	L(last_4x_vec_or_less_load)
-	movl	%edi, %ecx
-	andl	$(VEC_SIZE * 4 - 1), %ecx
-#  ifdef USE_AS_WCSLEN
-	/* NB: Divide bytes by 4 to get the wchar_t count.  */
-	sarl	$2, %ecx
-#  endif
-	/* Readjust length.  */
-	addq	%rcx, %rsi
-# endif
-	/* Align data to VEC_SIZE * 4.  */
+	subq	$(VEC_SIZE * -1), %rdi
+
+# if CHAR_PER_VEC == 64
+	/* No partial register stalls on the processors we use evex512
+	   on, and this saves code size.  */
+	xorb	%dil, %dil
+# else
 	andq	$-(VEC_SIZE * 4), %rdi
+# endif
+
+
 
 	/* Compare 4 * VEC at a time forward.  */
 	.p2align 4
 L(loop_4x_vec):
-	/* Load first VEC regardless.  */
-	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM1
-# ifdef USE_AS_STRNLEN
-	/* Break if at end of length.  */
-	subq	$(CHAR_PER_VEC * 4), %rsi
-	jb	L(last_4x_vec_or_less_cmpeq)
-# endif
-	/* Save some code size by microfusing VPMINU with the load. Since
-	   the matches in ymm2/ymm4 can only be returned if there where no
-	   matches in ymm1/ymm3 respectively there is no issue with overlap.
-	 */
-	VPMINU	(VEC_SIZE * 5)(%rdi), %YMM1, %YMM2
-	VMOVA	(VEC_SIZE * 6)(%rdi), %YMM3
-	VPMINU	(VEC_SIZE * 7)(%rdi), %YMM3, %YMM4
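+	/* Micro-fuse the VPMINUs with their loads to save code size.
+	   Matches in VMM(2)/VMM(4) only matter if there were no matches
+	   in VMM(1)/VMM(3) respectively, so the overlap is benign.  */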
+	VMOVA	(VEC_SIZE * 4)(%rdi), %VMM(1)
+	VPMINU	(VEC_SIZE * 5)(%rdi), %VMM(1), %VMM(2)
+	VMOVA	(VEC_SIZE * 6)(%rdi), %VMM(3)
+	VPMINU	(VEC_SIZE * 7)(%rdi), %VMM(3), %VMM(4)
+	VPTESTN	%VMM(2), %VMM(2), %k0
+	VPTESTN	%VMM(4), %VMM(4), %k2
 
-	VPCMP	$0, %YMM2, %YMMZERO, %k0
-	VPCMP	$0, %YMM4, %YMMZERO, %k1
 	subq	$-(VEC_SIZE * 4), %rdi
-	kortestd	%k0, %k1
+	KORTEST %k0, %k2
 	jz	L(loop_4x_vec)
 
-	/* Check if end was in first half.  */
-	kmovd	%k0, %eax
-	subq	%rdx, %rdi
-# ifdef USE_AS_WCSLEN
-	shrq	$2, %rdi
-# endif
-	testl	%eax, %eax
-	jz	L(second_vec_return)
+	VPTESTN	%VMM(1), %VMM(1), %k1
+	KMOV	%k1, %VRAX
+	test	%VRAX, %VRAX
+	jnz	L(first_vec_x0)
 
-	VPCMP	$0, %YMM1, %YMMZERO, %k2
-	kmovd	%k2, %edx
-	/* Combine VEC1 matches (edx) with VEC2 matches (eax).  */
-# ifdef USE_AS_WCSLEN
-	sall	$CHAR_PER_VEC, %eax
-	orl	%edx, %eax
-	tzcntl	%eax, %eax
-# else
-	salq	$CHAR_PER_VEC, %rax
-	orq	%rdx, %rax
-	tzcntq	%rax, %rax
-# endif
-	addq	%rdi, %rax
-	ret
-
-
-# ifdef USE_AS_STRNLEN
-
-L(last_4x_vec_or_less_load):
-	/* Depending on entry adjust rdi / prepare first VEC in YMM1.  */
-	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM1
-L(last_4x_vec_or_less_cmpeq):
-	VPCMP	$0, %YMM1, %YMMZERO, %k0
-	addq	$(VEC_SIZE * 3), %rdi
-L(last_4x_vec_or_less):
-	kmovd	%k0, %eax
-	/* If remaining length > VEC_SIZE * 2. This works if esi is off by
-	   VEC_SIZE * 4.  */
-	testl	$(CHAR_PER_VEC * 2), %esi
-	jnz	L(last_4x_vec)
-
-	/* length may have been negative or positive by an offset of
-	   CHAR_PER_VEC * 4 depending on where this was called from. This
-	   fixes that.  */
-	andl	$(CHAR_PER_VEC * 4 - 1), %esi
-	testl	%eax, %eax
-	jnz	L(last_vec_x1_check)
+	KMOV	%k0, %VRAX
+	test	%VRAX, %VRAX
+	jnz	L(first_vec_x1)
 
-	/* Check the end of data.  */
-	subl	$CHAR_PER_VEC, %esi
-	jb	L(max)
+	VPTESTN	%VMM(3), %VMM(3), %k0
 
-	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
-	kmovd	%k0, %eax
-	tzcntl	%eax, %eax
-	/* Check the end of data.  */
-	cmpl	%eax, %esi
-	jb	L(max)
-
-	subq	%rdx, %rdi
-#  ifdef USE_AS_WCSLEN
-	/* NB: Divide bytes by 4 to get the wchar_t count.  */
-	sarq	$2, %rdi
-#  endif
-	leaq	(CHAR_PER_VEC * 2)(%rdi, %rax), %rax
-	ret
-L(max):
-	movq	%r8, %rax
-	ret
-# endif
-
-	/* Placed here in strnlen so that the jcc L(last_4x_vec_or_less)
-	   in the 4x VEC loop can use 2 byte encoding.  */
-	.p2align 4
-L(second_vec_return):
-	VPCMP	$0, %YMM3, %YMMZERO, %k0
-	/* Combine YMM3 matches (k0) with YMM4 matches (k1).  */
-# ifdef USE_AS_WCSLEN
-	kunpckbw	%k0, %k1, %k0
-	kmovd	%k0, %eax
-	tzcntl	%eax, %eax
+# if CHAR_PER_VEC == 64
+	KMOV	%k0, %VRAX
+	test	%VRAX, %VRAX
+	jnz	L(first_vec_x2)
+	KMOV	%k2, %VRAX
 # else
-	kunpckdq	%k0, %k1, %k0
-	kmovq	%k0, %rax
-	tzcntq	%rax, %rax
+	/* We can only combine last 2x VEC masks if CHAR_PER_VEC <= 32.
+	 */
+	kmovd	%k2, %edx
+	kmovd	%k0, %eax
+	salq	$CHAR_PER_VEC, %rdx
+	orq	%rdx, %rax
 # endif
-	leaq	(CHAR_PER_VEC * 2)(%rdi, %rax), %rax
-	ret
 
-
-# ifdef USE_AS_STRNLEN
-L(last_vec_x1_check):
-	tzcntl	%eax, %eax
-	/* Check the end of data.  */
-	cmpl	%eax, %esi
-	jb	L(max)
-	subq	%rdx, %rdi
-#  ifdef USE_AS_WCSLEN
-	/* NB: Divide bytes by 4 to get the wchar_t count.  */
-	sarq	$2, %rdi
-#  endif
-	leaq	(CHAR_PER_VEC)(%rdi, %rax), %rax
+	/* first_vec_x3 for strlen-ZMM and first_vec_x2 for strlen-YMM.
+	 */
+	.p2align 4,, 2
+L(FALLTHROUGH_RETURN_LBL):
+	bsfq	%rax, %rax
+	subq	%rcx, %rdi
+	CHAR_SIZE_SHIFT_REG (rdi)
+	leaq	(FALLTHROUGH_RETURN_OFFSET)(%rdi, %rax), %rax
 	ret
 
-	.p2align 4
-L(last_4x_vec):
-	/* Test first 2x VEC normally.  */
-	testl	%eax, %eax
-	jnz	L(last_vec_x1)
-
-	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
-	kmovd	%k0, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x2)
-
-	/* Normalize length.  */
-	andl	$(CHAR_PER_VEC * 4 - 1), %esi
-	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
-	kmovd	%k0, %eax
-	testl	%eax, %eax
-	jnz	L(last_vec_x3)
-
-	/* Check the end of data.  */
-	subl	$(CHAR_PER_VEC * 3), %esi
-	jb	L(max)
-
-	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0
-	kmovd	%k0, %eax
-	tzcntl	%eax, %eax
-	/* Check the end of data.  */
-	cmpl	%eax, %esi
-	jb	L(max_end)
-
-	subq	%rdx, %rdi
-#  ifdef USE_AS_WCSLEN
-	/* NB: Divide bytes by 4 to get the wchar_t count.  */
-	sarq	$2, %rdi
-#  endif
-	leaq	(CHAR_PER_VEC * 4)(%rdi, %rax), %rax
+	.p2align 4,, 8
+L(first_vec_x0):
+	bsf	%VRAX, %VRAX
+	sub	%rcx, %rdi
+	CHAR_SIZE_SHIFT_REG (rdi)
+	addq	%rdi, %rax
 	ret
 
-	.p2align 4
-L(last_vec_x1):
-	tzcntl	%eax, %eax
-	subq	%rdx, %rdi
-#  ifdef USE_AS_WCSLEN
-	/* NB: Divide bytes by 4 to get the wchar_t count.  */
-	sarq	$2, %rdi
-#  endif
+	.p2align 4,, 10
+L(first_vec_x1):
+	bsf	%VRAX, %VRAX
+	sub	%rcx, %rdi
+	CHAR_SIZE_SHIFT_REG (rdi)
 	leaq	(CHAR_PER_VEC)(%rdi, %rax), %rax
 	ret
 
-	.p2align 4
-L(last_vec_x2):
-	tzcntl	%eax, %eax
-	subq	%rdx, %rdi
-#  ifdef USE_AS_WCSLEN
-	/* NB: Divide bytes by 4 to get the wchar_t count.  */
-	sarq	$2, %rdi
-#  endif
-	leaq	(CHAR_PER_VEC * 2)(%rdi, %rax), %rax
-	ret
-
-	.p2align 4
-L(last_vec_x3):
-	tzcntl	%eax, %eax
-	subl	$(CHAR_PER_VEC * 2), %esi
-	/* Check the end of data.  */
-	cmpl	%eax, %esi
-	jb	L(max_end)
-	subq	%rdx, %rdi
-#  ifdef USE_AS_WCSLEN
-	/* NB: Divide bytes by 4 to get the wchar_t count.  */
-	sarq	$2, %rdi
-#  endif
-	leaq	(CHAR_PER_VEC * 3)(%rdi, %rax), %rax
-	ret
-L(max_end):
-	movq	%r8, %rax
+	.p2align 4,, 10
+	/* first_vec_x2 for strlen-ZMM and first_vec_x3 for strlen-YMM.
+	 */
+L(TAIL_RETURN_LBL):
+	bsf	%VRAX, %VRAX
+	sub	%VRCX, %VRDI
+	CHAR_SIZE_SHIFT_REG (VRDI)
+	lea	(TAIL_RETURN_OFFSET)(%rdi, %rax), %VRAX
 	ret
-# endif
 
-	/* Cold case for crossing page with first load.	 */
-	.p2align 4
+	.p2align 4,, 8
 L(cross_page_boundary):
-	movq	%rdi, %rdx
+	movq	%rdi, %rcx
 	/* Align data to VEC_SIZE.  */
 	andq	$-VEC_SIZE, %rdi
-	VPCMP	$0, (%rdi), %YMMZERO, %k0
-	kmovd	%k0, %eax
-	/* Remove the leading bytes.  */
+
+	VPCMPEQ	(%rdi), %VZERO, %k0
+
+	KMOV	%k0, %VRAX
 # ifdef USE_AS_WCSLEN
-	/* NB: Divide shift count by 4 since each bit in K0 represent 4
-	   bytes.  */
-	movl	%edx, %ecx
-	shrl	$2, %ecx
-	andl	$(CHAR_PER_VEC - 1), %ecx
-# endif
-	/* SHIFT_REG is ecx for USE_AS_WCSLEN and edx otherwise.  */
-	sarxl	%SHIFT_REG, %eax, %eax
+	movl	%ecx, %edx
+	shrl	$2, %edx
+	andl	$(CHAR_PER_VEC - 1), %edx
+	shrx	%edx, %eax, %eax
 	testl	%eax, %eax
-# ifndef USE_AS_STRNLEN
-	jz	L(cross_page_continue)
-	tzcntl	%eax, %eax
-	ret
 # else
-	jnz	L(cross_page_less_vec)
-#  ifndef USE_AS_WCSLEN
-	movl	%edx, %ecx
-	andl	$(CHAR_PER_VEC - 1), %ecx
-#  endif
-	movl	$CHAR_PER_VEC, %eax
-	subl	%ecx, %eax
-	/* Check the end of data.  */
-	cmpq	%rax, %rsi
-	ja	L(cross_page_continue)
-	movl	%esi, %eax
-	ret
-L(cross_page_less_vec):
-	tzcntl	%eax, %eax
-	/* Select min of length and position of first null.  */
-	cmpq	%rax, %rsi
-	cmovb	%esi, %eax
-	ret
+	shr	%cl, %VRAX
 # endif
+	jz	L(cross_page_continue)
+	bsf	%VRAX, %VRAX
+	ret
 
 END (STRLEN)
 #endif
diff --git a/sysdeps/x86_64/multiarch/strnlen-evex.S b/sysdeps/x86_64/multiarch/strnlen-evex.S
index 64a9fc2606..443a32a749 100644
--- a/sysdeps/x86_64/multiarch/strnlen-evex.S
+++ b/sysdeps/x86_64/multiarch/strnlen-evex.S
@@ -1,8 +1,423 @@
-#ifndef STRNLEN
-# define STRNLEN __strnlen_evex
-#endif
+/* strnlen/wcsnlen optimized with 256-bit EVEX instructions.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <isa-level.h>
+#include <sysdep.h>
+
+#if ISA_SHOULD_BUILD (4)
+
+# ifndef VEC_SIZE
+#  include "x86-evex256-vecs.h"
+# endif
+
+
+# ifndef STRNLEN
+#  define STRNLEN	__strnlen_evex
+# endif
+
+# ifdef USE_AS_WCSLEN
+#  define VPCMPEQ	vpcmpeqd
+#  define VPCMPNEQ	vpcmpneqd
+#  define VPTESTN	vptestnmd
+#  define VPTEST	vptestmd
+#  define VPMINU	vpminud
+#  define CHAR_SIZE	4
+
+# else
+#  define VPCMPEQ	vpcmpeqb
+#  define VPCMPNEQ	vpcmpneqb
+#  define VPTESTN	vptestnmb
+#  define VPTEST	vptestmb
+#  define VPMINU	vpminub
+#  define CHAR_SIZE	1
+
+#  define REG_WIDTH	VEC_SIZE
+# endif
+
+# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
+
+# include "reg-macros.h"
+
+# if CHAR_PER_VEC == 32
+#  define SUB_SHORT(imm, reg)	subb $(imm), %VGPR_SZ(reg, 8)
+# else
+#  define SUB_SHORT(imm, reg)	subl $(imm), %VGPR_SZ(reg, 32)
+# endif
+
+
+
+# if CHAR_PER_VEC == 64
+#  define FALLTHROUGH_RETURN_OFFSET	(CHAR_PER_VEC * 3)
+# else
+#  define FALLTHROUGH_RETURN_OFFSET	(CHAR_PER_VEC * 2)
+# endif
+
+
+# define XZERO	VMM_128(0)
+# define VZERO	VMM(0)
+# define PAGE_SIZE	4096
+
+	.section SECTION(.text), "ax", @progbits
+ENTRY_P2ALIGN (STRNLEN, 6)
+	/* Check zero length.  */
+	test	%RSI_LP, %RSI_LP
+	jz	L(zero)
+# ifdef __ILP32__
+	/* Clear the upper 32 bits.  */
+	movl	%esi, %esi
+# endif
+
+	movl	%edi, %eax
+	vpxorq	%XZERO, %XZERO, %XZERO
+	andl	$(PAGE_SIZE - 1), %eax
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	ja	L(cross_page_boundary)
+
+	/* Check the first VEC_SIZE bytes.  Each bit in K0 represents a
+	   null byte.  */
+	VPCMPEQ	(%rdi), %VZERO, %k0
+
+	KMOV	%k0, %VRCX
+	movq	%rsi, %rax
+
+	/* If src (rcx) is zero, bsf does not change the result.  NB:
+	   Must use 64-bit bsf here so that upper bits of len are not
+	   cleared.  */
+	bsfq	%rcx, %rax
+	/* If rax > CHAR_PER_VEC then rcx must have been zero (no null
+	   CHAR) and rsi must be > CHAR_PER_VEC.  */
+	cmpq	$CHAR_PER_VEC, %rax
+	ja	L(more_1x_vec)
+	/* Check if first match in bounds.  */
+	cmpq	%rax, %rsi
+	cmovb	%esi, %eax
+	ret
+
+
+# if CHAR_PER_VEC != 32
+	.p2align 4,, 2
+L(zero):
+L(max_0):
+	movl	%esi, %eax
+	ret
+# endif
+
+	/* After aligning, strnlen compares the remaining length vs 2 *
+	   CHAR_PER_VEC, 4 * CHAR_PER_VEC, and 8 * CHAR_PER_VEC before
+	   going to the loop.  */
+	.p2align 4,, 10
+L(more_1x_vec):
+L(cross_page_continue):
+	/* Compute number of words checked after aligning.  */
+# ifdef USE_AS_WCSLEN
+	/* Need to compute directly for wcslen as CHAR_SIZE * rsi can
+	   overflow.  */
+	movq	%rdi, %rax
+	andq	$(VEC_SIZE * -1), %rdi
+	subq	%rdi, %rax
+	sarq	$2, %rax
+	leaq	-(CHAR_PER_VEC * 1)(%rax, %rsi), %rax
+# else
+	leaq	(VEC_SIZE * -1)(%rsi, %rdi), %rax
+	andq	$(VEC_SIZE * -1), %rdi
+	subq	%rdi, %rax
+# endif
+
+
+	VPCMPEQ	VEC_SIZE(%rdi), %VZERO, %k0
+
+	cmpq	$(CHAR_PER_VEC * 2), %rax
+	ja	L(more_2x_vec)
+
+L(last_2x_vec_or_less):
+	KMOV	%k0, %VRDX
+	test	%VRDX, %VRDX
+	jnz	L(last_vec_check)
+
+	/* Check the end of data.  */
+	SUB_SHORT (CHAR_PER_VEC, rax)
+	jbe	L(max_0)
+	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %VZERO, %k0
+	KMOV	%k0, %VRDX
+	test	%VRDX, %VRDX
+	jz	L(max_0)
+	/* Best place for LAST_VEC_CHECK if ZMM.  */
+	.p2align 4,, 8
+L(last_vec_check):
+	bsf	%VRDX, %VRDX
+	sub	%eax, %edx
+	lea	(%rsi, %rdx), %eax
+	cmovae	%esi, %eax
+	ret
+
+# if CHAR_PER_VEC == 32
+	.p2align 4,, 2
+L(zero):
+L(max_0):
+	movl	%esi, %eax
+	ret
+# endif
+
+	.p2align 4,, 8
+L(last_4x_vec_or_less):
+	addl	$(CHAR_PER_VEC * -4), %eax
+	VPCMPEQ	(VEC_SIZE * 5)(%rdi), %VZERO, %k0
+	subq	$(VEC_SIZE * -4), %rdi
+	cmpl	$(CHAR_PER_VEC * 2), %eax
+	jbe	L(last_2x_vec_or_less)
+
+	.p2align 4,, 6
+L(more_2x_vec):
+	/* Remaining length >= 2 * CHAR_PER_VEC so do VEC0/VEC1 without
+	   rechecking bounds.  */
 
-#define USE_AS_STRNLEN 1
-#define STRLEN	STRNLEN
+	KMOV	%k0, %VRDX
 
-#include "strlen-evex.S"
+	test	%VRDX, %VRDX
+	jnz	L(first_vec_x1)
+
+	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %VZERO, %k0
+	KMOV	%k0, %VRDX
+	test	%VRDX, %VRDX
+	jnz	L(first_vec_x2)
+
+	cmpq	$(CHAR_PER_VEC * 4), %rax
+	ja	L(more_4x_vec)
+
+
+	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %VZERO, %k0
+	KMOV	%k0, %VRDX
+	addl	$(CHAR_PER_VEC * -2), %eax
+	test	%VRDX, %VRDX
+	jnz	L(last_vec_check)
+
+	subl	$(CHAR_PER_VEC), %eax
+	jbe	L(max_1)
+
+	VPCMPEQ	(VEC_SIZE * 4)(%rdi), %VZERO, %k0
+	KMOV	%k0, %VRDX
+
+	test	%VRDX, %VRDX
+	jnz	L(last_vec_check)
+L(max_1):
+	movl	%esi, %eax
+	ret
+
+	.p2align 4,, 3
+L(first_vec_x2):
+# if VEC_SIZE == 64
+	/* If VEC_SIZE == 64 we can fit logic for full return label in
+	   spare bytes before next cache line.  */
+	bsf	%VRDX, %VRDX
+	sub	%eax, %esi
+	leal	(CHAR_PER_VEC * 1)(%rsi, %rdx), %eax
+	ret
+	.p2align 4,, 6
+# else
+	addl	$CHAR_PER_VEC, %esi
+# endif
+L(first_vec_x1):
+	bsf	%VRDX, %VRDX
+	sub	%eax, %esi
+	leal	(CHAR_PER_VEC * 0)(%rsi, %rdx), %eax
+	ret
+
+
+	.p2align 4,, 6
+L(first_vec_x4):
+# if VEC_SIZE == 64
+	/* If VEC_SIZE == 64 we can fit logic for full return label in
+	   spare bytes before next cache line.  */
+	bsf	%VRDX, %VRDX
+	sub	%eax, %esi
+	leal	(CHAR_PER_VEC * 3)(%rsi, %rdx), %eax
+	ret
+	.p2align 4,, 6
+# else
+	addl	$CHAR_PER_VEC, %esi
+# endif
+L(first_vec_x3):
+	bsf	%VRDX, %VRDX
+	sub	%eax, %esi
+	leal	(CHAR_PER_VEC * 2)(%rsi, %rdx), %eax
+	ret
+
+	.p2align 4,, 5
+L(more_4x_vec):
+	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %VZERO, %k0
+	KMOV	%k0, %VRDX
+	test	%VRDX, %VRDX
+	jnz	L(first_vec_x3)
+
+	VPCMPEQ	(VEC_SIZE * 4)(%rdi), %VZERO, %k0
+	KMOV	%k0, %VRDX
+	test	%VRDX, %VRDX
+	jnz	L(first_vec_x4)
+
+	/* Check if at last VEC_SIZE * 4 length before aligning for the
+	   loop.  */
+	cmpq	$(CHAR_PER_VEC * 8), %rax
+	jbe	L(last_4x_vec_or_less)
+
+
+	/* Compute number of words checked after aligning.  */
+# ifdef USE_AS_WCSLEN
+	/* Need to compute directly for wcslen as CHAR_SIZE * rsi can
+	   overflow.  */
+	leaq	(VEC_SIZE * -3)(%rdi), %rdx
+# else
+	leaq	(VEC_SIZE * -3)(%rdi, %rax), %rax
+# endif
+
+	subq	$(VEC_SIZE * -1), %rdi
+
+	/* Align data to VEC_SIZE * 4.  */
+# if VEC_SIZE == 64
+	/* Saves code size.  No evex512 processor has partial register
+	   stalls.  If that changes, this can be replaced with `andq
+	   $-(VEC_SIZE * 4), %rdi`.  */
+	xorb	%dil, %dil
+# else
+	andq	$-(VEC_SIZE * 4), %rdi
+# endif
+
+# ifdef USE_AS_WCSLEN
+	subq	%rdi, %rdx
+	sarq	$2, %rdx
+	addq	%rdx, %rax
+# else
+	subq	%rdi, %rax
+# endif
+	/* Compare 4 * VEC at a time forward.  */
+	.p2align 4,, 11
+L(loop_4x_vec):
+	VMOVA	(VEC_SIZE * 4)(%rdi), %VMM(1)
+	VPMINU	(VEC_SIZE * 5)(%rdi), %VMM(1), %VMM(2)
+	VMOVA	(VEC_SIZE * 6)(%rdi), %VMM(3)
+	VPMINU	(VEC_SIZE * 7)(%rdi), %VMM(3), %VMM(4)
+	VPTESTN	%VMM(2), %VMM(2), %k0
+	VPTESTN	%VMM(4), %VMM(4), %k2
+	subq	$-(VEC_SIZE * 4), %rdi
+	/* Break if at end of length.  */
+	subq	$(CHAR_PER_VEC * 4), %rax
+	jbe	L(loop_len_end)
+
+
+	KORTEST %k0, %k2
+	jz	L(loop_4x_vec)
+
+
+L(loop_last_4x_vec):
+	movq	%rsi, %rcx
+	subq	%rax, %rsi
+	VPTESTN	%VMM(1), %VMM(1), %k1
+	KMOV	%k1, %VRDX
+	test	%VRDX, %VRDX
+	jnz	L(last_vec_x0)
+
+	KMOV	%k0, %VRDX
+	test	%VRDX, %VRDX
+	jnz	L(last_vec_x1)
+
+	VPTESTN	%VMM(3), %VMM(3), %k0
+
+	/* Separate logic for VEC_SIZE == 64 and VEC_SIZE == 32 for
+	   returning last 2x VEC. For VEC_SIZE == 64 we test each VEC
+	   individually, for VEC_SIZE == 32 we combine them in a single
+	   64-bit GPR.  */
+# if CHAR_PER_VEC == 64
+	KMOV	%k0, %VRDX
+	test	%VRDX, %VRDX
+	jnz	L(last_vec_x2)
+	KMOV	%k2, %VRDX
+# else
+	/* We can only combine last 2x VEC masks if CHAR_PER_VEC <= 32.
+	 */
+	kmovd	%k2, %edx
+	kmovd	%k0, %eax
+	salq	$CHAR_PER_VEC, %rdx
+	orq	%rax, %rdx
+# endif
+
+	/* first_vec_x3 for strlen-ZMM and first_vec_x2 for strlen-YMM.
+	 */
+	bsfq	%rdx, %rdx
+	leaq	(FALLTHROUGH_RETURN_OFFSET - CHAR_PER_VEC * 4)(%rsi, %rdx), %rax
+	cmpq	%rax, %rcx
+	cmovb	%rcx, %rax
+	ret
+
+	/* Handle last 4x VEC after loop. All VECs have been loaded.  */
+	.p2align 4,, 4
+L(loop_len_end):
+	KORTEST %k0, %k2
+	jnz	L(loop_last_4x_vec)
+	movq	%rsi, %rax
+	ret
+
+
+# if CHAR_PER_VEC == 64
+	/* Since we can't combine the last 2x VEC for VEC_SIZE == 64 we
+	   need a return label for it.  */
+	.p2align 4,, 8
+L(last_vec_x2):
+	bsf	%VRDX, %VRDX
+	leaq	(CHAR_PER_VEC * -2)(%rsi, %rdx), %rax
+	cmpq	%rax, %rcx
+	cmovb	%rcx, %rax
+	ret
+# endif
+
+
+	.p2align 4,, 10
+L(last_vec_x1):
+	addq	$CHAR_PER_VEC, %rsi
+L(last_vec_x0):
+	bsf	%VRDX, %VRDX
+	leaq	(CHAR_PER_VEC * -4)(%rsi, %rdx), %rax
+	cmpq	%rax, %rcx
+	cmovb	%rcx, %rax
+	ret
+
+
+	.p2align 4,, 8
+L(cross_page_boundary):
+	/* Align data to VEC_SIZE.  */
+	movq	%rdi, %rcx
+	andq	$-VEC_SIZE, %rcx
+	VPCMPEQ	(%rcx), %VZERO, %k0
+
+	KMOV	%k0, %VRCX
+# ifdef USE_AS_WCSLEN
+	shrl	$2, %eax
+	andl	$(CHAR_PER_VEC - 1), %eax
+# endif
+	shrx	%VRAX, %VRCX, %VRCX
+
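+	/* eax = CHAR_PER_VEC - (offset % CHAR_PER_VEC), the number of
+	   in-bounds chars in this first, unaligned VEC.  */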
+	negl	%eax
+	andl	$(CHAR_PER_VEC - 1), %eax
+	movq	%rsi, %rdx
+	bsf	%VRCX, %VRDX
+	cmpq	%rax, %rdx
+	ja	L(cross_page_continue)
+	movl	%edx, %eax
+	cmpq	%rdx, %rsi
+	cmovb	%esi, %eax
+	ret
+END (STRNLEN)
+#endif
diff --git a/sysdeps/x86_64/multiarch/wcsnlen-evex.S b/sysdeps/x86_64/multiarch/wcsnlen-evex.S
index e2aad94c1e..57a7e93fbf 100644
--- a/sysdeps/x86_64/multiarch/wcsnlen-evex.S
+++ b/sysdeps/x86_64/multiarch/wcsnlen-evex.S
@@ -2,8 +2,7 @@
 # define WCSNLEN	__wcsnlen_evex
 #endif
 
-#define STRLEN	WCSNLEN
+#define STRNLEN	WCSNLEN
 #define USE_AS_WCSLEN 1
-#define USE_AS_STRNLEN 1
 
-#include "strlen-evex.S"
+#include "strnlen-evex.S"
-- 
2.34.1



* [PATCH v3 4/7] x86: Optimize memrchr-evex.S
  2022-10-19  0:44 ` [PATCH v3 1/7] x86: Optimize memchr-evex.S and implement with VMM headers Noah Goldstein
  2022-10-19  0:44   ` [PATCH v3 2/7] x86: Shrink / minorly optimize strchr-evex " Noah Goldstein
  2022-10-19  0:44   ` [PATCH v3 3/7] x86: Optimize strnlen-evex.S " Noah Goldstein
@ 2022-10-19  0:44   ` Noah Goldstein
  2022-10-19 16:58     ` H.J. Lu
  2022-10-19  0:44   ` [PATCH v3 5/7] x86: Optimize strrchr-evex.S and implement with VMM headers Noah Goldstein
                     ` (3 subsequent siblings)
  6 siblings, 1 reply; 41+ messages in thread
From: Noah Goldstein @ 2022-10-19  0:44 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

Optimizations are:
1. Use the fact that lzcnt(0) -> VEC_SIZE for memrchr to save a branch
   in the short string case (see the sketch after this list).
2. Save several instructions in len = [VEC_SIZE, 4 * VEC_SIZE] case.
3. Use more code-size efficient instructions.
	- tzcnt ...     -> bsf ...
	- vpcmpb $0 ... -> vpcmpeq ...
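
A minimal C sketch of point 1 (names are illustrative, not from the
patch), using the architecturally defined lzcnt zero behavior
(_lzcnt_u32 (0) == 32): one length compare covers both "no match in
the vector" and "match out of bounds":

    #include <immintrin.h>	/* _lzcnt_u32; compile with -mlzcnt.  */
    #include <stddef.h>
    #include <stdint.h>

    /* Backwards scan of the last <= 32 bytes.  END points one past
       the buffer, LEN <= 32, and bit i of MASK is set if byte
       END[-32 + i] matched.  */
    static inline char *
    last_match (char *end, uint32_t mask, uint32_t len)
    {
      uint32_t dist = _lzcnt_u32 (mask);	/* 32 when mask == 0.  */
      if (len <= dist)	/* No match, or match out of bounds.  */
        return NULL;
      return end - 1 - dist;
    }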

Code Size Changes:
memrchr-evex.S      :  -29 bytes

Net perf changes:

Reported as geometric mean of all improvements / regressions from N=10
runs of the benchtests. Value is New Time / Old Time, so < 1.0 is an
improvement and > 1.0 is a regression.

memrchr-evex.S      : 0.949 (Mostly from improvements in small strings)

Full results attached in email.

Full check passes on x86-64.
---
 sysdeps/x86_64/multiarch/memrchr-evex.S | 538 ++++++++++++++----------
 1 file changed, 324 insertions(+), 214 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/memrchr-evex.S b/sysdeps/x86_64/multiarch/memrchr-evex.S
index 550b328c5a..dbcf52808f 100644
--- a/sysdeps/x86_64/multiarch/memrchr-evex.S
+++ b/sysdeps/x86_64/multiarch/memrchr-evex.S
@@ -21,17 +21,19 @@
 #if ISA_SHOULD_BUILD (4)
 
 # include <sysdep.h>
-# include "x86-evex256-vecs.h"
-# if VEC_SIZE != 32
-#  error "VEC_SIZE != 32 unimplemented"
+
+# ifndef VEC_SIZE
+#  include "x86-evex256-vecs.h"
 # endif
 
+# include "reg-macros.h"
+
 # ifndef MEMRCHR
-#  define MEMRCHR				__memrchr_evex
+#  define MEMRCHR	__memrchr_evex
 # endif
 
-# define PAGE_SIZE			4096
-# define VMMMATCH			VMM(0)
+# define PAGE_SIZE	4096
+# define VMATCH	VMM(0)
 
 	.section SECTION(.text), "ax", @progbits
 ENTRY_P2ALIGN(MEMRCHR, 6)
@@ -43,294 +45,402 @@ ENTRY_P2ALIGN(MEMRCHR, 6)
 # endif
 	jz	L(zero_0)
 
-	/* Get end pointer. Minus one for two reasons. 1) It is necessary for a
-	   correct page cross check and 2) it correctly sets up end ptr to be
-	   subtract by lzcnt aligned.  */
+	/* Get end pointer.  Minus one for three reasons: 1) it is
+	   necessary for a correct page cross check, 2) it correctly
+	   sets up the end ptr for the lzcnt subtraction, and 3) it is a
+	   necessary step in aligning the ptr.  */
 	leaq	-1(%rdi, %rdx), %rax
-	vpbroadcastb %esi, %VMMMATCH
+	vpbroadcastb %esi, %VMATCH
 
 	/* Check if we can load 1x VEC without crossing a page.  */
 	testl	$(PAGE_SIZE - VEC_SIZE), %eax
 	jz	L(page_cross)
 
-	/* Don't use rax for pointer here because EVEX has better encoding with
-	   offset % VEC_SIZE == 0.  */
-	vpcmpb	$0, -(VEC_SIZE)(%rdi, %rdx), %VMMMATCH, %k0
-	kmovd	%k0, %ecx
-
-	/* Fall through for rdx (len) <= VEC_SIZE (expect small sizes).  */
-	cmpq	$VEC_SIZE, %rdx
-	ja	L(more_1x_vec)
-L(ret_vec_x0_test):
-
-	/* If ecx is zero (no matches) lzcnt will set it 32 (VEC_SIZE) which
-	   will guarantee edx (len) is less than it.  */
-	lzcntl	%ecx, %ecx
-	cmpl	%ecx, %edx
-	jle	L(zero_0)
-	subq	%rcx, %rax
+	/* Don't use rax for pointer here because EVEX has better
+	   encoding with offset % VEC_SIZE == 0.  */
+	vpcmpeqb (VEC_SIZE * -1)(%rdi, %rdx), %VMATCH, %k0
+	KMOV	%k0, %VRCX
+
+	/* If rcx is zero then lzcnt -> VEC_SIZE.  NB: there is already
+	   a dependency between rcx and rsi so no worries about a
+	   false dependency here.  */
+	lzcnt	%VRCX, %VRSI
+	/* If rdx <= rsi then either 1) rcx was non-zero (there was a
+	   match) but it was out of bounds or 2) rcx was zero and rdx
+	   was <= VEC_SIZE so we are done scanning.  */
+	cmpq	%rsi, %rdx
+	/* NB: Use branch to return zero/non-zero.  Common usage will
+	   branch on result of function (if return is null/non-null).
+	   This branch can be used to predict the ensuing one so there
+	   is no reason to extend the data-dependency with cmovcc.  */
+	jbe	L(zero_0)
+
+	/* If rcx is zero then len must be > VEC_SIZE, otherwise since
+	   we already tested len vs lzcnt(rcx) (in rsi) we are good to
+	   return this match.  */
+	test	%VRCX, %VRCX
+	jz	L(more_1x_vec)
+	subq	%rsi, %rax
 	ret
 
-	/* Fits in aligning bytes of first cache line.  */
+	/* Fits in aligning bytes of first cache line for VEC_SIZE ==
+	   32.  */
+# if VEC_SIZE == 32
+	.p2align 4,, 2
 L(zero_0):
 	xorl	%eax, %eax
 	ret
-
-	.p2align 4,, 9
-L(ret_vec_x0_dec):
-	decq	%rax
-L(ret_vec_x0):
-	lzcntl	%ecx, %ecx
-	subq	%rcx, %rax
-	ret
+# endif
 
 	.p2align 4,, 10
 L(more_1x_vec):
-	testl	%ecx, %ecx
-	jnz	L(ret_vec_x0)
-
 	/* Align rax (pointer to string).  */
 	andq	$-VEC_SIZE, %rax
-
+L(page_cross_continue):
 	/* Recompute length after aligning.  */
-	movq	%rax, %rdx
+	subq	%rdi, %rax
 
-	/* Need no matter what.  */
-	vpcmpb	$0, -(VEC_SIZE)(%rax), %VMMMATCH, %k0
-	kmovd	%k0, %ecx
-
-	subq	%rdi, %rdx
-
-	cmpq	$(VEC_SIZE * 2), %rdx
+	cmpq	$(VEC_SIZE * 2), %rax
 	ja	L(more_2x_vec)
+
 L(last_2x_vec):
+	vpcmpeqb (VEC_SIZE * -1)(%rdi, %rax), %VMATCH, %k0
+	KMOV	%k0, %VRCX
 
-	/* Must dec rax because L(ret_vec_x0_test) expects it.  */
-	decq	%rax
-	cmpl	$VEC_SIZE, %edx
-	jbe	L(ret_vec_x0_test)
+	test	%VRCX, %VRCX
+	jnz	L(ret_vec_x0_test)
 
-	testl	%ecx, %ecx
-	jnz	L(ret_vec_x0)
+	/* If VEC_SIZE == 64 need to subtract because lzcntq won't
+	   implicitly add VEC_SIZE to match position.  */
+# if VEC_SIZE == 64
+	subl	$VEC_SIZE, %eax
+# else
+	cmpb	$VEC_SIZE, %al
+# endif
+	jle	L(zero_2)
 
-	/* Don't use rax for pointer here because EVEX has better encoding with
-	   offset % VEC_SIZE == 0.  */
-	vpcmpb	$0, -(VEC_SIZE * 2)(%rdi, %rdx), %VMMMATCH, %k0
-	kmovd	%k0, %ecx
-	/* NB: 64-bit lzcnt. This will naturally add 32 to position.  */
+	/* We adjusted rax (length) for VEC_SIZE == 64 so need separate
+	   offsets.  */
+# if VEC_SIZE == 64
+	vpcmpeqb (VEC_SIZE * -1)(%rdi, %rax), %VMATCH, %k0
+# else
+	vpcmpeqb (VEC_SIZE * -2)(%rdi, %rax), %VMATCH, %k0
+# endif
+	KMOV	%k0, %VRCX
+	/* NB: 64-bit lzcnt. This will naturally add 32 to position for
+	   VEC_SIZE == 32.  */
 	lzcntq	%rcx, %rcx
-	cmpl	%ecx, %edx
-	jle	L(zero_0)
-	subq	%rcx, %rax
-	ret
-
-	/* Inexpensive place to put this regarding code size / target alignments
-	   / ICache NLP. Necessary for 2-byte encoding of jump to page cross
-	   case which in turn is necessary for hot path (len <= VEC_SIZE) to fit
-	   in first cache line.  */
-L(page_cross):
-	movq	%rax, %rsi
-	andq	$-VEC_SIZE, %rsi
-	vpcmpb	$0, (%rsi), %VMMMATCH, %k0
-	kmovd	%k0, %r8d
-	/* Shift out negative alignment (because we are starting from endptr and
-	   working backwards).  */
-	movl	%eax, %ecx
-	/* notl because eax already has endptr - 1.  (-x = ~(x - 1)).  */
-	notl	%ecx
-	shlxl	%ecx, %r8d, %ecx
-	cmpq	%rdi, %rsi
-	ja	L(more_1x_vec)
-	lzcntl	%ecx, %ecx
-	cmpl	%ecx, %edx
-	jle	L(zero_1)
-	subq	%rcx, %rax
+	subl	%ecx, %eax
+	ja	L(first_vec_x1_ret)
+	/* If VEC_SIZE == 64 put L(zero_0) here as we can't fit in the
+	   first cache line (this is the second cache line).  */
+# if VEC_SIZE == 64
+L(zero_0):
+# endif
+L(zero_2):
+	xorl	%eax, %eax
 	ret
 
-	/* Continue creating zero labels that fit in aligning bytes and get
-	   2-byte encoding / are in the same cache line as condition.  */
-L(zero_1):
-	xorl	%eax, %eax
+	/* NB: Fits in aligning bytes before next cache line for
+	   VEC_SIZE == 32.  For VEC_SIZE == 64 this is attached to
+	   L(ret_vec_x0_test).  */
+# if VEC_SIZE == 32
+L(first_vec_x1_ret):
+	leaq	-1(%rdi, %rax), %rax
 	ret
+# endif
 
-	.p2align 4,, 8
-L(ret_vec_x1):
-	/* This will naturally add 32 to position.  */
-	bsrl	%ecx, %ecx
-	leaq	-(VEC_SIZE * 2)(%rcx, %rax), %rax
+	.p2align 4,, 6
+L(ret_vec_x0_test):
+	lzcnt	%VRCX, %VRCX
+	subl	%ecx, %eax
+	jle	L(zero_2)
+# if VEC_SIZE == 64
+	/* Reuse code at the end of L(ret_vec_x0_test) as we can't fit
+	   L(first_vec_x1_ret) in the same cache line as its jmp base
+	   so we might as well save code size.  */
+L(first_vec_x1_ret):
+# endif
+	leaq	-1(%rdi, %rax), %rax
 	ret
 
-	.p2align 4,, 8
+	.p2align 4,, 6
+L(loop_last_4x_vec):
+	/* Compute remaining length.  */
+	subl	%edi, %eax
+L(last_4x_vec):
+	cmpl	$(VEC_SIZE * 2), %eax
+	jle	L(last_2x_vec)
+# if VEC_SIZE == 32
+	/* Only align for VEC_SIZE == 32.  For VEC_SIZE == 64 we need
+	   the spare bytes to align the loop properly.  */
+	.p2align 4,, 10
+# endif
 L(more_2x_vec):
-	testl	%ecx, %ecx
-	jnz	L(ret_vec_x0_dec)
 
-	vpcmpb	$0, -(VEC_SIZE * 2)(%rax), %VMMMATCH, %k0
-	kmovd	%k0, %ecx
-	testl	%ecx, %ecx
-	jnz	L(ret_vec_x1)
+	/* Length > VEC_SIZE * 2 so check the first 2x VEC for match and
+	   return if either hit.  */
+	vpcmpeqb (VEC_SIZE * -1)(%rdi, %rax), %VMATCH, %k0
+	KMOV	%k0, %VRCX
+
+	test	%VRCX, %VRCX
+	jnz	L(first_vec_x0)
+
+	vpcmpeqb (VEC_SIZE * -2)(%rdi, %rax), %VMATCH, %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(first_vec_x1)
 
 	/* Need no matter what.  */
-	vpcmpb	$0, -(VEC_SIZE * 3)(%rax), %VMMMATCH, %k0
-	kmovd	%k0, %ecx
+	vpcmpeqb (VEC_SIZE * -3)(%rdi, %rax), %VMATCH, %k0
+	KMOV	%k0, %VRCX
 
-	subq	$(VEC_SIZE * 4), %rdx
+	/* Check if we are near the end.  */
+	subq	$(VEC_SIZE * 4), %rax
 	ja	L(more_4x_vec)
 
-	cmpl	$(VEC_SIZE * -1), %edx
-	jle	L(ret_vec_x2_test)
-L(last_vec):
-	testl	%ecx, %ecx
-	jnz	L(ret_vec_x2)
+	test	%VRCX, %VRCX
+	jnz	L(first_vec_x2_test)
 
+	/* Adjust length for final check and check if we are at the end.
+	 */
+	addl	$(VEC_SIZE * 1), %eax
+	jle	L(zero_1)
 
-	/* Need no matter what.  */
-	vpcmpb	$0, -(VEC_SIZE * 4)(%rax), %VMMMATCH, %k0
-	kmovd	%k0, %ecx
-	lzcntl	%ecx, %ecx
-	subq	$(VEC_SIZE * 3 + 1), %rax
-	subq	%rcx, %rax
-	cmpq	%rax, %rdi
-	ja	L(zero_1)
+	vpcmpeqb (VEC_SIZE * -1)(%rdi, %rax), %VMATCH, %k0
+	KMOV	%k0, %VRCX
+
+	lzcnt	%VRCX, %VRCX
+	subl	%ecx, %eax
+	ja	L(first_vec_x3_ret)
+L(zero_1):
+	xorl	%eax, %eax
+	ret
+L(first_vec_x3_ret):
+	leaq	-1(%rdi, %rax), %rax
 	ret
 
-	.p2align 4,, 8
-L(ret_vec_x2_test):
-	lzcntl	%ecx, %ecx
-	subq	$(VEC_SIZE * 2 + 1), %rax
-	subq	%rcx, %rax
-	cmpq	%rax, %rdi
-	ja	L(zero_1)
+	.p2align 4,, 6
+L(first_vec_x2_test):
+	/* Must adjust length before check.  */
+	subl	$-(VEC_SIZE * 2 - 1), %eax
+	lzcnt	%VRCX, %VRCX
+	subl	%ecx, %eax
+	jl	L(zero_4)
+	addq	%rdi, %rax
 	ret
 
-	.p2align 4,, 8
-L(ret_vec_x2):
-	bsrl	%ecx, %ecx
-	leaq	-(VEC_SIZE * 3)(%rcx, %rax), %rax
+
+	.p2align 4,, 10
+L(first_vec_x0):
+	bsr	%VRCX, %VRCX
+	leaq	(VEC_SIZE * -1)(%rdi, %rax), %rax
+	addq	%rcx, %rax
 	ret
 
-	.p2align 4,, 8
-L(ret_vec_x3):
-	bsrl	%ecx, %ecx
-	leaq	-(VEC_SIZE * 4)(%rcx, %rax), %rax
+	/* Fits unobtrusively here.  */
+L(zero_4):
+	xorl	%eax, %eax
+	ret
+
+	.p2align 4,, 10
+L(first_vec_x1):
+	bsr	%VRCX, %VRCX
+	leaq	(VEC_SIZE * -2)(%rdi, %rax), %rax
+	addq	%rcx, %rax
 	ret
 
 	.p2align 4,, 8
+L(first_vec_x3):
+	bsr	%VRCX, %VRCX
+	addq	%rdi, %rax
+	addq	%rcx, %rax
+	ret
+
+	.p2align 4,, 6
+L(first_vec_x2):
+	bsr	%VRCX, %VRCX
+	leaq	(VEC_SIZE * 1)(%rdi, %rax), %rax
+	addq	%rcx, %rax
+	ret
+
+	.p2align 4,, 2
 L(more_4x_vec):
-	testl	%ecx, %ecx
-	jnz	L(ret_vec_x2)
+	test	%VRCX, %VRCX
+	jnz	L(first_vec_x2)
 
-	vpcmpb	$0, -(VEC_SIZE * 4)(%rax), %VMMMATCH, %k0
-	kmovd	%k0, %ecx
+	vpcmpeqb (%rdi, %rax), %VMATCH, %k0
+	KMOV	%k0, %VRCX
 
-	testl	%ecx, %ecx
-	jnz	L(ret_vec_x3)
+	test	%VRCX, %VRCX
+	jnz	L(first_vec_x3)
 
 	/* Check if near end before re-aligning (otherwise might do an
 	   unnecessary loop iteration).  */
-	addq	$-(VEC_SIZE * 4), %rax
-	cmpq	$(VEC_SIZE * 4), %rdx
+	cmpq	$(VEC_SIZE * 4), %rax
 	jbe	L(last_4x_vec)
 
-	decq	%rax
-	andq	$-(VEC_SIZE * 4), %rax
-	movq	%rdi, %rdx
-	/* Get endptr for loop in rdx. NB: Can't just do while rax > rdi because
-	   lengths that overflow can be valid and break the comparison.  */
-	andq	$-(VEC_SIZE * 4), %rdx
+
+	/* NB: We set up the loop to NOT use index-address-mode for the
+	   buffer.  This costs some instructions & code size but keeps
+	   unlaminated micro-fused instructions (as used in the loop)
+	   from being forced to issue in the same group (essentially
+	   narrowing the backend width).  */
+
+	/* Get endptr for loop in rdx. NB: Can't just do while rax > rdi
+	   because lengths that overflow can be valid and break the
+	   comparison.  */
+# if VEC_SIZE == 64
+	/* Use rdx as an intermediate to compute rax; this gets us imm8
+	   encoding, which just allows the L(more_4x_vec) block to fit
+	   in 1 cache-line.  */
+	leaq	(VEC_SIZE * 4)(%rdi), %rdx
+	leaq	(VEC_SIZE * -1)(%rdx, %rax), %rax
+
+	/* No evex machine has partial register stalls. This can be
+	   replaced with: `andq $(VEC_SIZE * -4), %rax/%rdx` if that
+	   changes.  */
+	xorb	%al, %al
+	xorb	%dl, %dl
+# else
+	leaq	(VEC_SIZE * 3)(%rdi, %rax), %rax
+	andq	$(VEC_SIZE * -4), %rax
+	leaq	(VEC_SIZE * 4)(%rdi), %rdx
+	andq	$(VEC_SIZE * -4), %rdx
+# endif
+
 
 	.p2align 4
 L(loop_4x_vec):
-	/* Store 1 were not-equals and 0 where equals in k1 (used to mask later
-	   on).  */
-	vpcmpb	$4, (VEC_SIZE * 3)(%rax), %VMMMATCH, %k1
+	/* NB: We could do the same optimization here as we do for
+	   memchr/rawmemchr by using VEX encoding in the loop for access
+	   to VEX vpcmpeqb + vpternlogd.  Since memrchr is not as hot as
+	   memchr it may not be worth the extra code size, but if the
+	   need arises it is an easy ~15% perf improvement to the loop.  */
+
+	cmpq	%rdx, %rax
+	je	L(loop_last_4x_vec)
+	/* Store 1 where not-equal and 0 where equal in k1 (used to
+	   mask later on).  */
+	vpcmpb	$4, (VEC_SIZE * -1)(%rax), %VMATCH, %k1
 
 	/* VEC(2/3) will have zero-byte where we found a CHAR.  */
-	vpxorq	(VEC_SIZE * 2)(%rax), %VMMMATCH, %VMM(2)
-	vpxorq	(VEC_SIZE * 1)(%rax), %VMMMATCH, %VMM(3)
-	vpcmpb	$0, (VEC_SIZE * 0)(%rax), %VMMMATCH, %k4
+	vpxorq	(VEC_SIZE * -2)(%rax), %VMATCH, %VMM(2)
+	vpxorq	(VEC_SIZE * -3)(%rax), %VMATCH, %VMM(3)
+	vpcmpeqb (VEC_SIZE * -4)(%rax), %VMATCH, %k4
 
-	/* Combine VEC(2/3) with min and maskz with k1 (k1 has zero bit where
-	   CHAR is found and VEC(2/3) have zero-byte where CHAR is found.  */
+	/* Combine VEC(2/3) with min and maskz with k1 (k1 has zero bit
+	   where CHAR is found and VEC(2/3) have zero-byte where CHAR
+	   is found).  */
 	vpminub	%VMM(2), %VMM(3), %VMM(3){%k1}{z}
 	vptestnmb %VMM(3), %VMM(3), %k2
 
-	/* Any 1s and we found CHAR.  */
-	kortestd %k2, %k4
-	jnz	L(loop_end)
-
 	addq	$-(VEC_SIZE * 4), %rax
-	cmpq	%rdx, %rax
-	jne	L(loop_4x_vec)
 
-	/* Need to re-adjust rdx / rax for L(last_4x_vec).  */
-	subq	$-(VEC_SIZE * 4), %rdx
-	movq	%rdx, %rax
-	subl	%edi, %edx
-L(last_4x_vec):
+	/* Any 1s and we found CHAR.  */
+	KORTEST %k2, %k4
+	jz	L(loop_4x_vec)
+
 
-	/* Used no matter what.  */
-	vpcmpb	$0, (VEC_SIZE * -1)(%rax), %VMMMATCH, %k0
-	kmovd	%k0, %ecx
+	/* K1 has non-matches for first VEC. inc; jz will overflow rcx
+	   iff all bytes were non-matches.  */
+	KMOV	%k1, %VRCX
+	inc	%VRCX
+	jnz	L(first_vec_x0_end)
 
-	cmpl	$(VEC_SIZE * 2), %edx
-	jbe	L(last_2x_vec)
+	vptestnmb %VMM(2), %VMM(2), %k0
+	KMOV	%k0, %VRCX
+	test	%VRCX, %VRCX
+	jnz	L(first_vec_x1_end)
+	KMOV	%k2, %VRCX
+
+	/* Separate logic for VEC_SIZE == 64 and VEC_SIZE == 32 for
+	   returning last 2x VEC. For VEC_SIZE == 64 we test each VEC
+	   individually, for VEC_SIZE == 32 we combine them in a single
+	   64-bit GPR.  */
+# if VEC_SIZE == 64
+	test	%VRCX, %VRCX
+	jnz	L(first_vec_x2_end)
+	KMOV	%k4, %VRCX
+# else
+	/* Combine last 2 VEC matches for VEC_SIZE == 32. If rcx (from
+	   VEC(3)) is zero (no CHAR in VEC(3)) then it won't affect the
+	   result in rsi (from VEC(4)). If rcx is non-zero then CHAR is
+	   in VEC(3) and bsrq will use that position.  */
+	KMOV	%k4, %VRSI
+	salq	$32, %rcx
+	orq	%rsi, %rcx
+# endif
+	bsrq	%rcx, %rcx
+	addq	%rcx, %rax
+	ret
 
-	testl	%ecx, %ecx
-	jnz	L(ret_vec_x0_dec)
+	.p2align 4,, 4
+L(first_vec_x0_end):
+	/* rcx has 1s at non-matches so we need to `not` it. We used
+	   `inc` to test if zero so use `neg` to complete the `not` so
+	   the last 1 bit represents a match.  NB: (-(x + 1) == ~x).  */
+	neg	%VRCX
+	bsr	%VRCX, %VRCX
+	leaq	(VEC_SIZE * 3)(%rcx, %rax), %rax
+	ret
 
+	.p2align 4,, 10
+L(first_vec_x1_end):
+	bsr	%VRCX, %VRCX
+	leaq	(VEC_SIZE * 2)(%rcx, %rax), %rax
+	ret
 
-	vpcmpb	$0, (VEC_SIZE * -2)(%rax), %VMMMATCH, %k0
-	kmovd	%k0, %ecx
+# if VEC_SIZE == 64
+	/* Since we can't combine the last 2x VEC for VEC_SIZE == 64 we
+	   need a return label for it.  */
+	.p2align 4,, 4
+L(first_vec_x2_end):
+	bsr	%VRCX, %VRCX
+	leaq	(VEC_SIZE * 1)(%rcx, %rax), %rax
+	ret
+# endif
 
-	testl	%ecx, %ecx
-	jnz	L(ret_vec_x1)
 
-	/* Used no matter what.  */
-	vpcmpb	$0, (VEC_SIZE * -3)(%rax), %VMMMATCH, %k0
-	kmovd	%k0, %ecx
+	.p2align 4,, 4
+L(page_cross):
+	/* Only the lower bits of eax[log2(VEC_SIZE):0] are set so we
+	   can use movzbl to get the number of bytes we are checking
+	   here.  */
+	movzbl	%al, %ecx
+	andq	$-VEC_SIZE, %rax
+	vpcmpeqb (%rax), %VMATCH, %k0
+	KMOV	%k0, %VRSI
 
-	cmpl	$(VEC_SIZE * 3), %edx
-	ja	L(last_vec)
+	/* eax was computed as %rdi + %rdx - 1 so need to add back 1
+	   here.  */
+	leal	1(%rcx), %r8d
 
-	lzcntl	%ecx, %ecx
-	subq	$(VEC_SIZE * 2 + 1), %rax
-	subq	%rcx, %rax
-	cmpq	%rax, %rdi
-	jbe	L(ret_1)
+	/* Invert ecx to get shift count for byte matches out of range.
+	 */
+	notl	%ecx
+	shlx	%VRCX, %VRSI, %VRSI
+
+	/* If rdx <= r8 then the entire [buf, buf + len] range is
+	   handled in this page cross case.  NB: we can't use the trick
+	   here that we use in the non page-cross case because we
+	   aren't checking a full VEC_SIZE.  */
+	cmpq	%r8, %rdx
+	ja	L(page_cross_check)
+	lzcnt	%VRSI, %VRSI
+	subl	%esi, %edx
+	ja	L(page_cross_ret)
 	xorl	%eax, %eax
-L(ret_1):
 	ret
 
-	.p2align 4,, 6
-L(loop_end):
-	kmovd	%k1, %ecx
-	notl	%ecx
-	testl	%ecx, %ecx
-	jnz	L(ret_vec_x0_end)
+L(page_cross_check):
+	test	%VRSI, %VRSI
+	jz	L(page_cross_continue)
 
-	vptestnmb %VMM(2), %VMM(2), %k0
-	kmovd	%k0, %ecx
-	testl	%ecx, %ecx
-	jnz	L(ret_vec_x1_end)
-
-	kmovd	%k2, %ecx
-	kmovd	%k4, %esi
-	/* Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3)
-	   then it won't affect the result in esi (VEC4). If ecx is non-zero
-	   then CHAR in VEC3 and bsrq will use that position.  */
-	salq	$32, %rcx
-	orq	%rsi, %rcx
-	bsrq	%rcx, %rcx
-	addq	%rcx, %rax
-	ret
-	.p2align 4,, 4
-L(ret_vec_x0_end):
-	addq	$(VEC_SIZE), %rax
-L(ret_vec_x1_end):
-	bsrl	%ecx, %ecx
-	leaq	(VEC_SIZE * 2)(%rax, %rcx), %rax
+	lzcnt	%VRSI, %VRSI
+	subl	%esi, %edx
+L(page_cross_ret):
+	leaq	-1(%rdi, %rdx), %rax
 	ret
-
 END(MEMRCHR)
 #endif
-- 
2.34.1



* [PATCH v3 5/7] x86: Optimize strrchr-evex.S and implement with VMM headers
  2022-10-19  0:44 ` [PATCH v3 1/7] x86: Optimize memchr-evex.S and implement with VMM headers Noah Goldstein
                     ` (2 preceding siblings ...)
  2022-10-19  0:44   ` [PATCH v3 4/7] x86: Optimize memrchr-evex.S Noah Goldstein
@ 2022-10-19  0:44   ` Noah Goldstein
  2022-10-19 16:58     ` H.J. Lu
  2022-10-19  0:44   ` [PATCH v3 6/7] x86: Add support for VEC_SIZE == 64 in strcmp-evex.S impl Noah Goldstein
                     ` (2 subsequent siblings)
  6 siblings, 1 reply; 41+ messages in thread
From: Noah Goldstein @ 2022-10-19  0:44 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

Optimization is:
1. Cache latest result in "fast path" loop with `vmovdqu` instead of
  `kunpckdq` (see the sketch below).  This helps if there is more than
  one match.
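
A minimal sketch of the change (register names as in the diff below;
`kmov`/`kunpck` take port-0 uops in the hot loop, while the vector
copies (VMOVA, i.e. vmovdqa64/vmovdqu) go down the load/store ports
and the combined mask is only rebuilt on the cold return path):

	/* Old: pack the k2/k3 match masks to save the furthest match.  */
	kunpckdq %k2, %k3, %k4

	/* New: save the raw match vectors instead.  */
	VMOVA	%VMM(5), %VMM(7)
	VMOVA	%VMM(6), %VMM(8)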

Code Size Changes:
strrchr-evex.S       :  +30 bytes (Same number of cache lines)

Net perf changes:

Reported as geometric mean of all improvements / regressions from N=10
runs of the benchtests. Value as New Time / Old Time so < 1.0 is
improvement and > 1.0 is regression.

strrchr-evex.S       : 0.932 (From cases with higher match frequency)

Full results attached in email.

Full check passes on x86-64.
---
 sysdeps/x86_64/multiarch/strrchr-evex.S | 371 +++++++++++++-----------
 1 file changed, 200 insertions(+), 171 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/strrchr-evex.S b/sysdeps/x86_64/multiarch/strrchr-evex.S
index 992b45fb47..45487dc87a 100644
--- a/sysdeps/x86_64/multiarch/strrchr-evex.S
+++ b/sysdeps/x86_64/multiarch/strrchr-evex.S
@@ -26,25 +26,30 @@
 #  define STRRCHR	__strrchr_evex
 # endif
 
-# define VMOVU	vmovdqu64
-# define VMOVA	vmovdqa64
+# include "x86-evex256-vecs.h"
 
 # ifdef USE_AS_WCSRCHR
-#  define SHIFT_REG	esi
-
-#  define kunpck	kunpckbw
+#  define RCX_M	cl
+#  define SHIFT_REG	rcx
+#  define VPCOMPRESS	vpcompressd
+#  define kunpck_2x	kunpckbw
 #  define kmov_2x	kmovd
 #  define maskz_2x	ecx
 #  define maskm_2x	eax
 #  define CHAR_SIZE	4
 #  define VPMIN	vpminud
 #  define VPTESTN	vptestnmd
+#  define VPTEST	vptestmd
 #  define VPBROADCAST	vpbroadcastd
+#  define VPCMPEQ	vpcmpeqd
 #  define VPCMP	vpcmpd
-# else
-#  define SHIFT_REG	edi
 
-#  define kunpck	kunpckdq
+#  define USE_WIDE_CHAR
+# else
+#  define RCX_M	ecx
+#  define SHIFT_REG	rdi
+#  define VPCOMPRESS	vpcompressb
+#  define kunpck_2x	kunpckdq
 #  define kmov_2x	kmovq
 #  define maskz_2x	rcx
 #  define maskm_2x	rax
@@ -52,58 +57,48 @@
 #  define CHAR_SIZE	1
 #  define VPMIN	vpminub
 #  define VPTESTN	vptestnmb
+#  define VPTEST	vptestmb
 #  define VPBROADCAST	vpbroadcastb
+#  define VPCMPEQ	vpcmpeqb
 #  define VPCMP	vpcmpb
 # endif
 
-# define XMMZERO	xmm16
-# define YMMZERO	ymm16
-# define YMMMATCH	ymm17
-# define YMMSAVE	ymm18
+# include "reg-macros.h"
 
-# define YMM1	ymm19
-# define YMM2	ymm20
-# define YMM3	ymm21
-# define YMM4	ymm22
-# define YMM5	ymm23
-# define YMM6	ymm24
-# define YMM7	ymm25
-# define YMM8	ymm26
-
-
-# define VEC_SIZE	32
+# define VMATCH	VMM(0)
+# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
 # define PAGE_SIZE	4096
-	.section .text.evex, "ax", @progbits
-ENTRY(STRRCHR)
+
+	.section SECTION(.text), "ax", @progbits
+ENTRY_P2ALIGN(STRRCHR, 6)
 	movl	%edi, %eax
-	/* Broadcast CHAR to YMMMATCH.  */
-	VPBROADCAST %esi, %YMMMATCH
+	/* Broadcast CHAR to VMATCH.  */
+	VPBROADCAST %esi, %VMATCH
 
 	andl	$(PAGE_SIZE - 1), %eax
 	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
 	jg	L(cross_page_boundary)
 
-L(page_cross_continue):
-	VMOVU	(%rdi), %YMM1
-	/* k0 has a 1 for each zero CHAR in YMM1.  */
-	VPTESTN	%YMM1, %YMM1, %k0
-	kmovd	%k0, %ecx
-	testl	%ecx, %ecx
+	VMOVU	(%rdi), %VMM(1)
+	/* k0 has a 1 for each zero CHAR in VEC(1).  */
+	VPTESTN	%VMM(1), %VMM(1), %k0
+	KMOV	%k0, %VRSI
+	test	%VRSI, %VRSI
 	jz	L(aligned_more)
 	/* fallthrough: zero CHAR in first VEC.  */
-
-	/* K1 has a 1 for each search CHAR match in YMM1.  */
-	VPCMP	$0, %YMMMATCH, %YMM1, %k1
-	kmovd	%k1, %eax
+L(page_cross_return):
+	/* K1 has a 1 for each search CHAR match in VEC(1).  */
+	VPCMPEQ	%VMATCH, %VMM(1), %k1
+	KMOV	%k1, %VRAX
 	/* Build mask up until first zero CHAR (used to mask of
 	   potential search CHAR matches past the end of the string).
 	 */
-	blsmskl	%ecx, %ecx
-	andl	%ecx, %eax
+	blsmsk	%VRSI, %VRSI
+	and	%VRSI, %VRAX
 	jz	L(ret0)
-	/* Get last match (the `andl` removed any out of bounds
-	   matches).  */
-	bsrl	%eax, %eax
+	/* Get last match (the `and` removed any out of bounds matches).
+	 */
+	bsr	%VRAX, %VRAX
 # ifdef USE_AS_WCSRCHR
 	leaq	(%rdi, %rax, CHAR_SIZE), %rax
 # else
@@ -116,22 +111,22 @@ L(ret0):
 	   search path for earlier matches.  */
 	.p2align 4,, 6
 L(first_vec_x1):
-	VPCMP	$0, %YMMMATCH, %YMM2, %k1
-	kmovd	%k1, %eax
-	blsmskl	%ecx, %ecx
+	VPCMPEQ	%VMATCH, %VMM(2), %k1
+	KMOV	%k1, %VRAX
+	blsmsk	%VRCX, %VRCX
 	/* eax non-zero if search CHAR in range.  */
-	andl	%ecx, %eax
+	and	%VRCX, %VRAX
 	jnz	L(first_vec_x1_return)
 
-	/* fallthrough: no match in YMM2 then need to check for earlier
-	   matches (in YMM1).  */
+	/* fallthrough: no match in VEC(2) then need to check for
+	   earlier matches (in VEC(1)).  */
 	.p2align 4,, 4
 L(first_vec_x0_test):
-	VPCMP	$0, %YMMMATCH, %YMM1, %k1
-	kmovd	%k1, %eax
-	testl	%eax, %eax
+	VPCMPEQ	%VMATCH, %VMM(1), %k1
+	KMOV	%k1, %VRAX
+	test	%VRAX, %VRAX
 	jz	L(ret1)
-	bsrl	%eax, %eax
+	bsr	%VRAX, %VRAX
 # ifdef USE_AS_WCSRCHR
 	leaq	(%rsi, %rax, CHAR_SIZE), %rax
 # else
@@ -142,129 +137,144 @@ L(ret1):
 
 	.p2align 4,, 10
 L(first_vec_x1_or_x2):
-	VPCMP	$0, %YMM3, %YMMMATCH, %k3
-	VPCMP	$0, %YMM2, %YMMMATCH, %k2
+	VPCMPEQ	%VMM(3), %VMATCH, %k3
+	VPCMPEQ	%VMM(2), %VMATCH, %k2
 	/* K2 and K3 have 1 for any search CHAR match. Test if any
-	   matches between either of them. Otherwise check YMM1.  */
-	kortestd %k2, %k3
+	   matches between either of them. Otherwise check VEC(1).  */
+	KORTEST %k2, %k3
 	jz	L(first_vec_x0_test)
 
-	/* Guranteed that YMM2 and YMM3 are within range so merge the
-	   two bitmasks then get last result.  */
-	kunpck	%k2, %k3, %k3
-	kmovq	%k3, %rax
-	bsrq	%rax, %rax
-	leaq	(VEC_SIZE)(%r8, %rax, CHAR_SIZE), %rax
+	/* Guaranteed that VEC(2) and VEC(3) are within range so merge
+	   the two bitmasks then get last result.  */
+	kunpck_2x %k2, %k3, %k3
+	kmov_2x	%k3, %maskm_2x
+	bsr	%maskm_2x, %maskm_2x
+	leaq	(VEC_SIZE * 1)(%r8, %rax, CHAR_SIZE), %rax
 	ret
 
-	.p2align 4,, 6
+	.p2align 4,, 7
 L(first_vec_x3):
-	VPCMP	$0, %YMMMATCH, %YMM4, %k1
-	kmovd	%k1, %eax
-	blsmskl	%ecx, %ecx
-	/* If no search CHAR match in range check YMM1/YMM2/YMM3.  */
-	andl	%ecx, %eax
+	VPCMPEQ	%VMATCH, %VMM(4), %k1
+	KMOV	%k1, %VRAX
+	blsmsk	%VRCX, %VRCX
+	/* If no search CHAR match in range check VEC(1)/VEC(2)/VEC(3).
+	 */
+	and	%VRCX, %VRAX
 	jz	L(first_vec_x1_or_x2)
-	bsrl	%eax, %eax
+	bsr	%VRAX, %VRAX
 	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
 	ret
 
+
 	.p2align 4,, 6
 L(first_vec_x0_x1_test):
-	VPCMP	$0, %YMMMATCH, %YMM2, %k1
-	kmovd	%k1, %eax
-	/* Check YMM2 for last match first. If no match try YMM1.  */
-	testl	%eax, %eax
+	VPCMPEQ	%VMATCH, %VMM(2), %k1
+	KMOV	%k1, %VRAX
+	/* Check VEC(2) for last match first. If no match try VEC(1).
+	 */
+	test	%VRAX, %VRAX
 	jz	L(first_vec_x0_test)
 	.p2align 4,, 4
 L(first_vec_x1_return):
-	bsrl	%eax, %eax
+	bsr	%VRAX, %VRAX
 	leaq	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
 	ret
 
+
 	.p2align 4,, 10
 L(first_vec_x2):
-	VPCMP	$0, %YMMMATCH, %YMM3, %k1
-	kmovd	%k1, %eax
-	blsmskl	%ecx, %ecx
-	/* Check YMM3 for last match first. If no match try YMM2/YMM1.
-	 */
-	andl	%ecx, %eax
+	VPCMPEQ	%VMATCH, %VMM(3), %k1
+	KMOV	%k1, %VRAX
+	blsmsk	%VRCX, %VRCX
+	/* Check VEC(3) for last match first. If no match try
+	   VEC(2)/VEC(1).  */
+	and	%VRCX, %VRAX
 	jz	L(first_vec_x0_x1_test)
-	bsrl	%eax, %eax
+	bsr	%VRAX, %VRAX
 	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
 	ret
 
 
-	.p2align 4
+	.p2align 4,, 12
 L(aligned_more):
-	/* Need to keep original pointer incase YMM1 has last match.  */
+L(page_cross_continue):
+	/* Need to keep original pointer in case VEC(1) has last match.
+	 */
 	movq	%rdi, %rsi
 	andq	$-VEC_SIZE, %rdi
-	VMOVU	VEC_SIZE(%rdi), %YMM2
-	VPTESTN	%YMM2, %YMM2, %k0
-	kmovd	%k0, %ecx
-	testl	%ecx, %ecx
+
+	VMOVU	VEC_SIZE(%rdi), %VMM(2)
+	VPTESTN	%VMM(2), %VMM(2), %k0
+	KMOV	%k0, %VRCX
+
+	test	%VRCX, %VRCX
 	jnz	L(first_vec_x1)
 
-	VMOVU	(VEC_SIZE * 2)(%rdi), %YMM3
-	VPTESTN	%YMM3, %YMM3, %k0
-	kmovd	%k0, %ecx
-	testl	%ecx, %ecx
+	VMOVU	(VEC_SIZE * 2)(%rdi), %VMM(3)
+	VPTESTN	%VMM(3), %VMM(3), %k0
+	KMOV	%k0, %VRCX
+
+	test	%VRCX, %VRCX
 	jnz	L(first_vec_x2)
 
-	VMOVU	(VEC_SIZE * 3)(%rdi), %YMM4
-	VPTESTN	%YMM4, %YMM4, %k0
-	kmovd	%k0, %ecx
+	VMOVU	(VEC_SIZE * 3)(%rdi), %VMM(4)
+	VPTESTN	%VMM(4), %VMM(4), %k0
+	KMOV	%k0, %VRCX
 	movq	%rdi, %r8
-	testl	%ecx, %ecx
+	test	%VRCX, %VRCX
 	jnz	L(first_vec_x3)
 
 	andq	$-(VEC_SIZE * 2), %rdi
-	.p2align 4
+	.p2align 4,, 10
 L(first_aligned_loop):
-	/* Preserve YMM1, YMM2, YMM3, and YMM4 until we can gurantee
-	   they don't store a match.  */
-	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM5
-	VMOVA	(VEC_SIZE * 5)(%rdi), %YMM6
+	/* Preserve VEC(1), VEC(2), VEC(3), and VEC(4) until we can
+	   guarantee they don't store a match.  */
+	VMOVA	(VEC_SIZE * 4)(%rdi), %VMM(5)
+	VMOVA	(VEC_SIZE * 5)(%rdi), %VMM(6)
 
-	VPCMP	$0, %YMM5, %YMMMATCH, %k2
-	vpxord	%YMM6, %YMMMATCH, %YMM7
+	VPCMPEQ	%VMM(5), %VMATCH, %k2
+	vpxord	%VMM(6), %VMATCH, %VMM(7)
 
-	VPMIN	%YMM5, %YMM6, %YMM8
-	VPMIN	%YMM8, %YMM7, %YMM7
+	VPMIN	%VMM(5), %VMM(6), %VMM(8)
+	VPMIN	%VMM(8), %VMM(7), %VMM(7)
 
-	VPTESTN	%YMM7, %YMM7, %k1
+	VPTESTN	%VMM(7), %VMM(7), %k1
 	subq	$(VEC_SIZE * -2), %rdi
-	kortestd %k1, %k2
+	KORTEST %k1, %k2
 	jz	L(first_aligned_loop)
 
-	VPCMP	$0, %YMM6, %YMMMATCH, %k3
-	VPTESTN	%YMM8, %YMM8, %k1
-	ktestd	%k1, %k1
+	VPCMPEQ	%VMM(6), %VMATCH, %k3
+	VPTESTN	%VMM(8), %VMM(8), %k1
+
+	/* If k1 is zero, then we found a CHAR match but no null-term.
+	   We can now safely throw out VEC1-4.  */
+	KTEST	%k1, %k1
 	jz	L(second_aligned_loop_prep)
 
-	kortestd %k2, %k3
+	KORTEST %k2, %k3
 	jnz	L(return_first_aligned_loop)
 
+
 	.p2align 4,, 6
 L(first_vec_x1_or_x2_or_x3):
-	VPCMP	$0, %YMM4, %YMMMATCH, %k4
-	kmovd	%k4, %eax
-	testl	%eax, %eax
+	VPCMPEQ	%VMM(4), %VMATCH, %k4
+	KMOV	%k4, %VRAX
+	bsr	%VRAX, %VRAX
 	jz	L(first_vec_x1_or_x2)
-	bsrl	%eax, %eax
 	leaq	(VEC_SIZE * 3)(%r8, %rax, CHAR_SIZE), %rax
 	ret
 
+
 	.p2align 4,, 8
 L(return_first_aligned_loop):
-	VPTESTN	%YMM5, %YMM5, %k0
-	kunpck	%k0, %k1, %k0
+	VPTESTN	%VMM(5), %VMM(5), %k0
+
+	/* Combined results from VEC5/6.  */
+	kunpck_2x %k0, %k1, %k0
 	kmov_2x	%k0, %maskz_2x
 
 	blsmsk	%maskz_2x, %maskz_2x
-	kunpck	%k2, %k3, %k3
+	kunpck_2x %k2, %k3, %k3
 	kmov_2x	%k3, %maskm_2x
 	and	%maskz_2x, %maskm_2x
 	jz	L(first_vec_x1_or_x2_or_x3)
@@ -280,47 +290,62 @@ L(return_first_aligned_loop):
 L(second_aligned_loop_prep):
 L(second_aligned_loop_set_furthest_match):
 	movq	%rdi, %rsi
-	kunpck	%k2, %k3, %k4
-
+	/* Ideally we would save k2/k3 but `kmov/kunpck` take uops on
+	   port0 and have noticeable overhead in the loop.  */
+	VMOVA	%VMM(5), %VMM(7)
+	VMOVA	%VMM(6), %VMM(8)
 	.p2align 4
 L(second_aligned_loop):
-	VMOVU	(VEC_SIZE * 4)(%rdi), %YMM1
-	VMOVU	(VEC_SIZE * 5)(%rdi), %YMM2
-
-	VPCMP	$0, %YMM1, %YMMMATCH, %k2
-	vpxord	%YMM2, %YMMMATCH, %YMM3
+	VMOVU	(VEC_SIZE * 4)(%rdi), %VMM(5)
+	VMOVU	(VEC_SIZE * 5)(%rdi), %VMM(6)
+	VPCMPEQ	%VMM(5), %VMATCH, %k2
+	vpxord	%VMM(6), %VMATCH, %VMM(3)
 
-	VPMIN	%YMM1, %YMM2, %YMM4
-	VPMIN	%YMM3, %YMM4, %YMM3
+	VPMIN	%VMM(5), %VMM(6), %VMM(4)
+	VPMIN	%VMM(3), %VMM(4), %VMM(3)
 
-	VPTESTN	%YMM3, %YMM3, %k1
+	VPTESTN	%VMM(3), %VMM(3), %k1
 	subq	$(VEC_SIZE * -2), %rdi
-	kortestd %k1, %k2
+	KORTEST %k1, %k2
 	jz	L(second_aligned_loop)
-
-	VPCMP	$0, %YMM2, %YMMMATCH, %k3
-	VPTESTN	%YMM4, %YMM4, %k1
-	ktestd	%k1, %k1
+	VPCMPEQ	%VMM(6), %VMATCH, %k3
+	VPTESTN	%VMM(4), %VMM(4), %k1
+	KTEST	%k1, %k1
 	jz	L(second_aligned_loop_set_furthest_match)
 
-	kortestd %k2, %k3
-	/* branch here because there is a significant advantage interms
-	   of output dependency chance in using edx.  */
+	/* Branch here because we know we have a match in VEC7/8 but
+	   might not in VEC5/6, so the latter case is expected to be
+	   less likely.  */
+	KORTEST %k2, %k3
 	jnz	L(return_new_match)
+
 L(return_old_match):
-	kmovq	%k4, %rax
-	bsrq	%rax, %rax
-	leaq	(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %rax
+	VPCMPEQ	%VMM(8), %VMATCH, %k0
+	KMOV	%k0, %VRCX
+	bsr	%VRCX, %VRCX
+	jnz	L(return_old_match_ret)
+
+	VPCMPEQ	%VMM(7), %VMATCH, %k0
+	KMOV	%k0, %VRCX
+	bsr	%VRCX, %VRCX
+	subq	$VEC_SIZE, %rsi
+L(return_old_match_ret):
+	leaq	(VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %rax
 	ret
 
+	.p2align 4,, 10
 L(return_new_match):
-	VPTESTN	%YMM1, %YMM1, %k0
-	kunpck	%k0, %k1, %k0
+	VPTESTN	%VMM(5), %VMM(5), %k0
+
+	/* Combined results from VEC5/6.  */
+	kunpck_2x %k0, %k1, %k0
 	kmov_2x	%k0, %maskz_2x
 
 	blsmsk	%maskz_2x, %maskz_2x
-	kunpck	%k2, %k3, %k3
+	kunpck_2x %k2, %k3, %k3
 	kmov_2x	%k3, %maskm_2x
+
+	/* Match at end was out-of-bounds so use last known match.  */
 	and	%maskz_2x, %maskm_2x
 	jz	L(return_old_match)
 
@@ -328,49 +353,53 @@ L(return_new_match):
 	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
 	ret
 
+	.p2align 4,, 4
 L(cross_page_boundary):
-	/* eax contains all the page offset bits of src (rdi). `xor rdi,
-	   rax` sets pointer will all page offset bits cleared so
-	   offset of (PAGE_SIZE - VEC_SIZE) will get last aligned VEC
-	   before page cross (guranteed to be safe to read). Doing this
-	   as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax` saves
-	   a bit of code size.  */
 	xorq	%rdi, %rax
-	VMOVU	(PAGE_SIZE - VEC_SIZE)(%rax), %YMM1
-	VPTESTN	%YMM1, %YMM1, %k0
-	kmovd	%k0, %ecx
+	mov	$-1, %VRDX
+	VMOVU	(PAGE_SIZE - VEC_SIZE)(%rax), %VMM(6)
+	VPTESTN	%VMM(6), %VMM(6), %k0
+	KMOV	%k0, %VRSI
+
+# ifdef USE_AS_WCSRCHR
+	movl	%edi, %ecx
+	and	$(VEC_SIZE - 1), %ecx
+	shrl	$2, %ecx
+# endif
+	shlx	%VGPR(SHIFT_REG), %VRDX, %VRDX
 
-	/* Shift out zero CHAR matches that are before the begining of
-	   src (rdi).  */
 # ifdef USE_AS_WCSRCHR
-	movl	%edi, %esi
-	andl	$(VEC_SIZE - 1), %esi
-	shrl	$2, %esi
+	kmovb	%edx, %k1
+# else
+	KMOV	%VRDX, %k1
 # endif
-	shrxl	%SHIFT_REG, %ecx, %ecx
 
-	testl	%ecx, %ecx
+	/* Need to adjust result to VEC(1) so it can be re-used by
+	   L(return_vec_x0_test).  The alternative is to collect VEC(1)
+	   with a page cross load which is far more expensive.  */
+	VPCOMPRESS %VMM(6), %VMM(1){%k1}{z}
+
+	/* We could technically just jmp back after the vpcompress but
+	   it doesn't save any 16-byte blocks.  */
+	shrx	%VGPR(SHIFT_REG), %VRSI, %VRSI
+	test	%VRSI, %VRSI
 	jz	L(page_cross_continue)
 
-	/* Found zero CHAR so need to test for search CHAR.  */
-	VPCMP	$0, %YMMMATCH, %YMM1, %k1
-	kmovd	%k1, %eax
-	/* Shift out search CHAR matches that are before the begining of
-	   src (rdi).  */
-	shrxl	%SHIFT_REG, %eax, %eax
-
-	/* Check if any search CHAR match in range.  */
-	blsmskl	%ecx, %ecx
-	andl	%ecx, %eax
-	jz	L(ret3)
-	bsrl	%eax, %eax
+	/* Duplicate of return logic from ENTRY. Doesn't cause spill to
+	   next cache line so might as well copy it here.  */
+	VPCMPEQ	%VMATCH, %VMM(1), %k1
+	KMOV	%k1, %VRAX
+	blsmsk	%VRSI, %VRSI
+	and	%VRSI, %VRAX
+	jz	L(ret_page_cross)
+	bsr	%VRAX, %VRAX
 # ifdef USE_AS_WCSRCHR
 	leaq	(%rdi, %rax, CHAR_SIZE), %rax
 # else
 	addq	%rdi, %rax
 # endif
-L(ret3):
+L(ret_page_cross):
 	ret
-
+	/* 1 byte till next cache line.  */
 END(STRRCHR)
 #endif
-- 
2.34.1



* [PATCH v3 6/7] x86: Add support for VEC_SIZE == 64 in strcmp-evex.S impl
  2022-10-19  0:44 ` [PATCH v3 1/7] x86: Optimize memchr-evex.S and implement with VMM headers Noah Goldstein
                     ` (3 preceding siblings ...)
  2022-10-19  0:44   ` [PATCH v3 5/7] x86: Optimize strrchr-evex.S and implement with VMM headers Noah Goldstein
@ 2022-10-19  0:44   ` Noah Goldstein
  2022-10-19 16:59     ` H.J. Lu
  2022-10-19  0:44   ` [PATCH v3 7/7] Bench: Improve benchtests for memchr, strchr, strnlen, strrchr Noah Goldstein
  2022-10-19 16:52   ` [PATCH v3 1/7] x86: Optimize memchr-evex.S and implement with VMM headers H.J. Lu
  6 siblings, 1 reply; 41+ messages in thread
From: Noah Goldstein @ 2022-10-19  0:44 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

Unused at the moment, but evex512 strcmp, strncmp, strcasecmp{l}, and
strncasecmp{l} functions can be added by including strcmp-evex.S with
"x86-evex512-vecs.h" defined.

In addition, save a bit of code size in a few places.

1. tzcnt ...         -> bsf ...
2. vpcmp{b|d} $0 ... -> vpcmpeq{b|d}

This saves a touch of code size but has minimal net effect (see the
sketch below).
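
As an illustrative sketch of where the bytes come from (reg/reg
forms; exact sizes depend on the operands):

	tzcntl	%ecx, %ecx	/* f3 0f bc c9: 4 bytes.  */
	bsfl	%ecx, %ecx	/* 0f bc c9: 3 bytes (drops the f3
				   prefix).  Only safe because the
				   input is known non-zero: bsf leaves
				   the destination undefined for a
				   zero input, while tzcnt returns the
				   operand width.  */

	vpcmpb	$0, %ymm17, %ymm16, %k1	/* EVEX 0f3a map, carries an
					   imm8.  */
	vpcmpeqb %ymm17, %ymm16, %k1	/* EVEX 0f map, no imm8: one
					   byte shorter per compare.  */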

Full check passes on x86-64.
---
 sysdeps/x86_64/multiarch/strcmp-evex.S | 676 ++++++++++++++++---------
 1 file changed, 430 insertions(+), 246 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
index e482d0167f..756a3bb8d6 100644
--- a/sysdeps/x86_64/multiarch/strcmp-evex.S
+++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
@@ -20,6 +20,10 @@
 
 #if ISA_SHOULD_BUILD (4)
 
+# ifndef VEC_SIZE
+#  include "x86-evex256-vecs.h"
+# endif
+
 # define STRCMP_ISA	_evex
 # include "strcmp-naming.h"
 
@@ -35,41 +39,57 @@
 # define PAGE_SIZE	4096
 
 	/* VEC_SIZE = Number of bytes in a ymm register.  */
-# define VEC_SIZE	32
 # define CHAR_PER_VEC	(VEC_SIZE	/	SIZE_OF_CHAR)
 
-# define VMOVU	vmovdqu64
-# define VMOVA	vmovdqa64
-
 # ifdef USE_AS_WCSCMP
-#  define TESTEQ	subl $0xff,
 	/* Compare packed dwords.  */
 #  define VPCMP	vpcmpd
+#  define VPCMPEQ	vpcmpeqd
 #  define VPMINU	vpminud
 #  define VPTESTM	vptestmd
 #  define VPTESTNM	vptestnmd
 	/* 1 dword char == 4 bytes.  */
 #  define SIZE_OF_CHAR	4
+
+#  define TESTEQ	sub $((1 << CHAR_PER_VEC) - 1),
+
+#  define USE_WIDE_CHAR
 # else
-#  define TESTEQ	incl
 	/* Compare packed bytes.  */
 #  define VPCMP	vpcmpb
+#  define VPCMPEQ	vpcmpeqb
 #  define VPMINU	vpminub
 #  define VPTESTM	vptestmb
 #  define VPTESTNM	vptestnmb
 	/* 1 byte char == 1 byte.  */
 #  define SIZE_OF_CHAR	1
+
+#  define TESTEQ	inc
+# endif
+
+# include "reg-macros.h"
+
+# if VEC_SIZE == 64
+#  define RODATA_SECTION	rodata.cst64
+# else
+#  define RODATA_SECTION	rodata.cst32
+# endif
+
+# if CHAR_PER_VEC == 64
+#  define FALLTHROUGH_RETURN_OFFSET	(VEC_SIZE * 3)
+# else
+#  define FALLTHROUGH_RETURN_OFFSET	(VEC_SIZE * 2)
 # endif
 
 # ifdef USE_AS_STRNCMP
-#  define LOOP_REG	r9d
+#  define LOOP_REG	VR9
 #  define LOOP_REG64	r9
 
 #  define OFFSET_REG8	r9b
 #  define OFFSET_REG	r9d
 #  define OFFSET_REG64	r9
 # else
-#  define LOOP_REG	edx
+#  define LOOP_REG	VRDX
 #  define LOOP_REG64	rdx
 
 #  define OFFSET_REG8	dl
@@ -83,32 +103,6 @@
 #  define VEC_OFFSET	(-VEC_SIZE)
 # endif
 
-# define XMM0	xmm17
-# define XMM1	xmm18
-
-# define XMM10	xmm27
-# define XMM11	xmm28
-# define XMM12	xmm29
-# define XMM13	xmm30
-# define XMM14	xmm31
-
-
-# define YMM0	ymm17
-# define YMM1	ymm18
-# define YMM2	ymm19
-# define YMM3	ymm20
-# define YMM4	ymm21
-# define YMM5	ymm22
-# define YMM6	ymm23
-# define YMM7	ymm24
-# define YMM8	ymm25
-# define YMM9	ymm26
-# define YMM10	ymm27
-# define YMM11	ymm28
-# define YMM12	ymm29
-# define YMM13	ymm30
-# define YMM14	ymm31
-
 # ifdef USE_AS_STRCASECMP_L
 #  define BYTE_LOOP_REG	OFFSET_REG
 # else
@@ -125,61 +119,72 @@
 #  endif
 # endif
 
-# define LCASE_MIN_YMM	%YMM12
-# define LCASE_MAX_YMM	%YMM13
-# define CASE_ADD_YMM	%YMM14
+# define LCASE_MIN_V	VMM(12)
+# define LCASE_MAX_V	VMM(13)
+# define CASE_ADD_V	VMM(14)
 
-# define LCASE_MIN_XMM	%XMM12
-# define LCASE_MAX_XMM	%XMM13
-# define CASE_ADD_XMM	%XMM14
+# if VEC_SIZE == 64
+#  define LCASE_MIN_YMM	VMM_256(12)
+#  define LCASE_MAX_YMM	VMM_256(13)
+#  define CASE_ADD_YMM	VMM_256(14)
+# endif
+
+# define LCASE_MIN_XMM	VMM_128(12)
+# define LCASE_MAX_XMM	VMM_128(13)
+# define CASE_ADD_XMM	VMM_128(14)
 
 	/* NB: wcsncmp uses r11 but strcasecmp is never used in
 	   conjunction with wcscmp.  */
 # define TOLOWER_BASE	%r11
 
 # ifdef USE_AS_STRCASECMP_L
-#  define _REG(x, y) x ## y
-#  define REG(x, y) _REG(x, y)
-#  define TOLOWER(reg1, reg2, ext)										\
-	vpsubb	REG(LCASE_MIN_, ext), reg1, REG(%ext, 10);					\
-	vpsubb	REG(LCASE_MIN_, ext), reg2, REG(%ext, 11);					\
-	vpcmpub	$1, REG(LCASE_MAX_, ext), REG(%ext, 10), %k5;				\
-	vpcmpub	$1, REG(LCASE_MAX_, ext), REG(%ext, 11), %k6;				\
-	vpaddb	reg1, REG(CASE_ADD_, ext), reg1{%k5};						\
-	vpaddb	reg2, REG(CASE_ADD_, ext), reg2{%k6}
-
-#  define TOLOWER_gpr(src, dst) movl (TOLOWER_BASE, src, 4), dst
-#  define TOLOWER_YMM(...)	TOLOWER(__VA_ARGS__, YMM)
-#  define TOLOWER_XMM(...)	TOLOWER(__VA_ARGS__, XMM)
-
-#  define CMP_R1_R2(s1_reg, s2_reg, reg_out, ext)						\
-	TOLOWER	(s1_reg, s2_reg, ext);										\
-	VPCMP	$0, s1_reg, s2_reg, reg_out
-
-#  define CMP_R1_S2(s1_reg, s2_mem, s2_reg, reg_out, ext)				\
-	VMOVU	s2_mem, s2_reg;												\
-	CMP_R1_R2(s1_reg, s2_reg, reg_out, ext)
-
-#  define CMP_R1_R2_YMM(...) CMP_R1_R2(__VA_ARGS__, YMM)
-#  define CMP_R1_R2_XMM(...) CMP_R1_R2(__VA_ARGS__, XMM)
-
-#  define CMP_R1_S2_YMM(...) CMP_R1_S2(__VA_ARGS__, YMM)
-#  define CMP_R1_S2_XMM(...) CMP_R1_S2(__VA_ARGS__, XMM)
+#  define _REG(x, y)	x ## y
+#  define REG(x, y)	_REG(x, y)
+#  define TOLOWER(reg1, reg2, ext, vec_macro)	\
+	vpsubb	%REG(LCASE_MIN_, ext), reg1, %vec_macro(10);	\
+	vpsubb	%REG(LCASE_MIN_, ext), reg2, %vec_macro(11);	\
+	vpcmpub	$1, %REG(LCASE_MAX_, ext), %vec_macro(10), %k5;	\
+	vpcmpub	$1, %REG(LCASE_MAX_, ext), %vec_macro(11), %k6;	\
+	vpaddb	reg1, %REG(CASE_ADD_, ext), reg1{%k5};	\
+	vpaddb	reg2, %REG(CASE_ADD_, ext), reg2{%k6}
+
+#  define TOLOWER_gpr(src, dst)	movl (TOLOWER_BASE, src, 4), dst
+#  define TOLOWER_VMM(...)	TOLOWER(__VA_ARGS__, V, VMM)
+#  define TOLOWER_YMM(...)	TOLOWER(__VA_ARGS__, YMM, VMM_256)
+#  define TOLOWER_XMM(...)	TOLOWER(__VA_ARGS__, XMM, VMM_128)
+
+#  define CMP_R1_R2(s1_reg, s2_reg, reg_out, ext, vec_macro)	\
+	TOLOWER	(s1_reg, s2_reg, ext, vec_macro);	\
+	VPCMPEQ	s1_reg, s2_reg, reg_out
+
+#  define CMP_R1_S2(s1_reg, s2_mem, s2_reg, reg_out, ext, vec_macro)	\
+	VMOVU	s2_mem, s2_reg;	\
+	CMP_R1_R2 (s1_reg, s2_reg, reg_out, ext, vec_macro)
+
+#  define CMP_R1_R2_VMM(...)	CMP_R1_R2(__VA_ARGS__, V, VMM)
+#  define CMP_R1_R2_YMM(...)	CMP_R1_R2(__VA_ARGS__, YMM, VMM_256)
+#  define CMP_R1_R2_XMM(...)	CMP_R1_R2(__VA_ARGS__, XMM, VMM_128)
+
+#  define CMP_R1_S2_VMM(...)	CMP_R1_S2(__VA_ARGS__, V, VMM)
+#  define CMP_R1_S2_YMM(...)	CMP_R1_S2(__VA_ARGS__, YMM, VMM_256)
+#  define CMP_R1_S2_XMM(...)	CMP_R1_S2(__VA_ARGS__, XMM, VMM_128)
 
 # else
 #  define TOLOWER_gpr(...)
+#  define TOLOWER_VMM(...)
 #  define TOLOWER_YMM(...)
 #  define TOLOWER_XMM(...)
 
-#  define CMP_R1_R2_YMM(s1_reg, s2_reg, reg_out)						\
-	VPCMP	$0, s2_reg, s1_reg, reg_out
+#  define CMP_R1_R2_VMM(s1_reg, s2_reg, reg_out)	\
+	VPCMPEQ	s2_reg, s1_reg, reg_out
 
-#  define CMP_R1_R2_XMM(...) CMP_R1_R2_YMM(__VA_ARGS__)
+#  define CMP_R1_R2_YMM(...)	CMP_R1_R2_VMM(__VA_ARGS__)
+#  define CMP_R1_R2_XMM(...)	CMP_R1_R2_VMM(__VA_ARGS__)
 
-#  define CMP_R1_S2_YMM(s1_reg, s2_mem, unused, reg_out)				\
-	VPCMP	$0, s2_mem, s1_reg, reg_out
-
-#  define CMP_R1_S2_XMM(...) CMP_R1_S2_YMM(__VA_ARGS__)
+#  define CMP_R1_S2_VMM(s1_reg, s2_mem, unused, reg_out)	\
+	VPCMPEQ	s2_mem, s1_reg, reg_out
+#  define CMP_R1_S2_YMM(...)	CMP_R1_S2_VMM(__VA_ARGS__)
+#  define CMP_R1_S2_XMM(...)	CMP_R1_S2_VMM(__VA_ARGS__)
 # endif
 
 /* Warning!
@@ -203,7 +208,7 @@
    the maximum offset is reached before a difference is found, zero is
    returned.  */
 
-	.section .text.evex, "ax", @progbits
+	.section SECTION(.text), "ax", @progbits
 	.align	16
 	.type	STRCMP, @function
 	.globl	STRCMP
@@ -232,7 +237,7 @@ STRCMP:
 #  else
 	mov	(%LOCALE_REG), %RAX_LP
 #  endif
-	testl	$1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax)
+	testb	$1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax)
 	jne	STRCASECMP_L_NONASCII
 	leaq	_nl_C_LC_CTYPE_tolower + 128 * 4(%rip), TOLOWER_BASE
 # endif
@@ -254,28 +259,46 @@ STRCMP:
 # endif
 
 # if defined USE_AS_STRCASECMP_L
-	.section .rodata.cst32, "aM", @progbits, 32
-	.align	32
+	.section RODATA_SECTION, "aM", @progbits, VEC_SIZE
+	.align	VEC_SIZE
 L(lcase_min):
 	.quad	0x4141414141414141
 	.quad	0x4141414141414141
 	.quad	0x4141414141414141
 	.quad	0x4141414141414141
+#  if VEC_SIZE == 64
+	.quad	0x4141414141414141
+	.quad	0x4141414141414141
+	.quad	0x4141414141414141
+	.quad	0x4141414141414141
+#  endif
 L(lcase_max):
 	.quad	0x1a1a1a1a1a1a1a1a
 	.quad	0x1a1a1a1a1a1a1a1a
 	.quad	0x1a1a1a1a1a1a1a1a
 	.quad	0x1a1a1a1a1a1a1a1a
+#  if VEC_SIZE == 64
+	.quad	0x1a1a1a1a1a1a1a1a
+	.quad	0x1a1a1a1a1a1a1a1a
+	.quad	0x1a1a1a1a1a1a1a1a
+	.quad	0x1a1a1a1a1a1a1a1a
+#  endif
 L(case_add):
 	.quad	0x2020202020202020
 	.quad	0x2020202020202020
 	.quad	0x2020202020202020
 	.quad	0x2020202020202020
+#  if VEC_SIZE == 64
+	.quad	0x2020202020202020
+	.quad	0x2020202020202020
+	.quad	0x2020202020202020
+	.quad	0x2020202020202020
+#  endif
 	.previous
 
-	vmovdqa64 L(lcase_min)(%rip), LCASE_MIN_YMM
-	vmovdqa64 L(lcase_max)(%rip), LCASE_MAX_YMM
-	vmovdqa64 L(case_add)(%rip), CASE_ADD_YMM
+	VMOVA	L(lcase_min)(%rip), %LCASE_MIN_V
+	VMOVA	L(lcase_max)(%rip), %LCASE_MAX_V
+	VMOVA	L(case_add)(%rip), %CASE_ADD_V
 # endif
 
 	movl	%edi, %eax
@@ -288,12 +311,12 @@ L(case_add):
 
 L(no_page_cross):
 	/* Safe to compare 4x vectors.  */
-	VMOVU	(%rdi), %YMM0
-	VPTESTM	%YMM0, %YMM0, %k2
+	VMOVU	(%rdi), %VMM(0)
+	VPTESTM	%VMM(0), %VMM(0), %k2
 	/* Each bit cleared in K1 represents a mismatch or a null CHAR
 	   in YMM0 and 32 bytes at (%rsi).  */
-	CMP_R1_S2_YMM (%YMM0, (%rsi), %YMM1, %k1){%k2}
-	kmovd	%k1, %ecx
+	CMP_R1_S2_VMM (%VMM(0), (%rsi), %VMM(1), %k1){%k2}
+	KMOV	%k1, %VRCX
 # ifdef USE_AS_STRNCMP
 	cmpq	$CHAR_PER_VEC, %rdx
 	jbe	L(vec_0_test_len)
@@ -303,14 +326,14 @@ L(no_page_cross):
 	   wcscmp/wcsncmp.  */
 
 	/* All 1s represents all equals. TESTEQ will overflow to zero in
-	   all equals case. Otherwise 1s will carry until position of first
-	   mismatch.  */
-	TESTEQ	%ecx
+	   all equals case. Otherwise 1s will carry until position of
+	   first mismatch.  */
+	TESTEQ	%VRCX
 	jz	L(more_3x_vec)
 
 	.p2align 4,, 4
 L(return_vec_0):
-	tzcntl	%ecx, %ecx
+	bsf	%VRCX, %VRCX
 # ifdef USE_AS_WCSCMP
 	movl	(%rdi, %rcx, SIZE_OF_CHAR), %edx
 	xorl	%eax, %eax
@@ -321,7 +344,16 @@ L(return_vec_0):
 	orl	$1, %eax
 # else
 	movzbl	(%rdi, %rcx), %eax
+	/* For VEC_SIZE == 64 use movb instead of movzbl to save a byte
+	   and keep logic for len <= VEC_SIZE (common) in just the
+	   first cache line.  NB: No evex512 processor has partial-
+	   register stalls. If that changes this ifdef can be disabled
+	   without affecting correctness.  */
+#  if !defined USE_AS_STRNCMP && !defined USE_AS_STRCASECMP_L && VEC_SIZE == 64
+	movb	(%rsi, %rcx), %cl
+#  else
 	movzbl	(%rsi, %rcx), %ecx
+#  endif
 	TOLOWER_gpr (%rax, %eax)
 	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
@@ -332,8 +364,8 @@ L(ret0):
 # ifdef USE_AS_STRNCMP
 	.p2align 4,, 4
 L(vec_0_test_len):
-	notl	%ecx
-	bzhil	%edx, %ecx, %eax
+	not	%VRCX
+	bzhi	%VRDX, %VRCX, %VRAX
 	jnz	L(return_vec_0)
 	/* Align if will cross fetch block.  */
 	.p2align 4,, 2
@@ -372,7 +404,7 @@ L(ret1):
 
 	.p2align 4,, 10
 L(return_vec_1):
-	tzcntl	%ecx, %ecx
+	bsf	%VRCX, %VRCX
 # ifdef USE_AS_STRNCMP
 	/* rdx must be > CHAR_PER_VEC so its safe to subtract without
 	   worrying about underflow.  */
@@ -401,24 +433,41 @@ L(ret2):
 	.p2align 4,, 10
 # ifdef USE_AS_STRNCMP
 L(return_vec_3):
-#  if CHAR_PER_VEC <= 16
+#  if CHAR_PER_VEC <= 32
+	/* If CHAR_PER_VEC <= 32 reuse code from L(return_vec_2) without
+	   additional branches by adjusting the bit positions from
+	   VEC3.  We can't do this for CHAR_PER_VEC == 64.  */
+#   if CHAR_PER_VEC <= 16
 	sall	$CHAR_PER_VEC, %ecx
-#  else
+#   else
 	salq	$CHAR_PER_VEC, %rcx
+#   endif
+#  else
+	/* If CHAR_PER_VEC == 64 we can't shift the return GPR so just
+	   check it.  */
+	bsf	%VRCX, %VRCX
+	addl	$(CHAR_PER_VEC), %ecx
+	cmpq	%rcx, %rdx
+	ja	L(ret_vec_3_finish)
+	xorl	%eax, %eax
+	ret
 #  endif
 # endif
+
+	/* If CHAR_PER_VEC == 64 we can't combine matches from the last
+	   2x VEC so need separate return label.  */
 L(return_vec_2):
 # if (CHAR_PER_VEC <= 16) || !(defined USE_AS_STRNCMP)
-	tzcntl	%ecx, %ecx
+	bsf	%VRCX, %VRCX
 # else
-	tzcntq	%rcx, %rcx
+	bsfq	%rcx, %rcx
 # endif
-
 # ifdef USE_AS_STRNCMP
 	cmpq	%rcx, %rdx
 	jbe	L(ret_zero)
 # endif
 
+L(ret_vec_3_finish):
 # ifdef USE_AS_WCSCMP
 	movl	(VEC_SIZE * 2)(%rdi, %rcx, SIZE_OF_CHAR), %edx
 	xorl	%eax, %eax
@@ -440,7 +489,7 @@ L(ret3):
 # ifndef USE_AS_STRNCMP
 	.p2align 4,, 10
 L(return_vec_3):
-	tzcntl	%ecx, %ecx
+	bsf	%VRCX, %VRCX
 #  ifdef USE_AS_WCSCMP
 	movl	(VEC_SIZE * 3)(%rdi, %rcx, SIZE_OF_CHAR), %edx
 	xorl	%eax, %eax
@@ -465,11 +514,11 @@ L(ret4):
 	.p2align 5
 L(more_3x_vec):
 	/* Safe to compare 4x vectors.  */
-	VMOVU	(VEC_SIZE)(%rdi), %YMM0
-	VPTESTM	%YMM0, %YMM0, %k2
-	CMP_R1_S2_YMM (%YMM0, VEC_SIZE(%rsi), %YMM1, %k1){%k2}
-	kmovd	%k1, %ecx
-	TESTEQ	%ecx
+	VMOVU	(VEC_SIZE)(%rdi), %VMM(0)
+	VPTESTM	%VMM(0), %VMM(0), %k2
+	CMP_R1_S2_VMM (%VMM(0), VEC_SIZE(%rsi), %VMM(1), %k1){%k2}
+	KMOV	%k1, %VRCX
+	TESTEQ	%VRCX
 	jnz	L(return_vec_1)
 
 # ifdef USE_AS_STRNCMP
@@ -477,18 +526,18 @@ L(more_3x_vec):
 	jbe	L(ret_zero)
 # endif
 
-	VMOVU	(VEC_SIZE * 2)(%rdi), %YMM0
-	VPTESTM	%YMM0, %YMM0, %k2
-	CMP_R1_S2_YMM (%YMM0, (VEC_SIZE * 2)(%rsi), %YMM1, %k1){%k2}
-	kmovd	%k1, %ecx
-	TESTEQ	%ecx
+	VMOVU	(VEC_SIZE * 2)(%rdi), %VMM(0)
+	VPTESTM	%VMM(0), %VMM(0), %k2
+	CMP_R1_S2_VMM (%VMM(0), (VEC_SIZE * 2)(%rsi), %VMM(1), %k1){%k2}
+	KMOV	%k1, %VRCX
+	TESTEQ	%VRCX
 	jnz	L(return_vec_2)
 
-	VMOVU	(VEC_SIZE * 3)(%rdi), %YMM0
-	VPTESTM	%YMM0, %YMM0, %k2
-	CMP_R1_S2_YMM (%YMM0, (VEC_SIZE * 3)(%rsi), %YMM1, %k1){%k2}
-	kmovd	%k1, %ecx
-	TESTEQ	%ecx
+	VMOVU	(VEC_SIZE * 3)(%rdi), %VMM(0)
+	VPTESTM	%VMM(0), %VMM(0), %k2
+	CMP_R1_S2_VMM (%VMM(0), (VEC_SIZE * 3)(%rsi), %VMM(1), %k1){%k2}
+	KMOV	%k1, %VRCX
+	TESTEQ	%VRCX
 	jnz	L(return_vec_3)
 
 # ifdef USE_AS_STRNCMP
@@ -565,110 +614,123 @@ L(loop):
 
 	/* Loop entry after handling page cross during loop.  */
 L(loop_skip_page_cross_check):
-	VMOVA	(VEC_SIZE * 0)(%rdi), %YMM0
-	VMOVA	(VEC_SIZE * 1)(%rdi), %YMM2
-	VMOVA	(VEC_SIZE * 2)(%rdi), %YMM4
-	VMOVA	(VEC_SIZE * 3)(%rdi), %YMM6
+	VMOVA	(VEC_SIZE * 0)(%rdi), %VMM(0)
+	VMOVA	(VEC_SIZE * 1)(%rdi), %VMM(2)
+	VMOVA	(VEC_SIZE * 2)(%rdi), %VMM(4)
+	VMOVA	(VEC_SIZE * 3)(%rdi), %VMM(6)
 
-	VPMINU	%YMM0, %YMM2, %YMM8
-	VPMINU	%YMM4, %YMM6, %YMM9
+	VPMINU	%VMM(0), %VMM(2), %VMM(8)
+	VPMINU	%VMM(4), %VMM(6), %VMM(9)
 
 	/* A zero CHAR in YMM9 means that there is a null CHAR.  */
-	VPMINU	%YMM8, %YMM9, %YMM9
+	VPMINU	%VMM(8), %VMM(9), %VMM(9)
 
 	/* Each bit set in K1 represents a non-null CHAR in YMM9.  */
-	VPTESTM	%YMM9, %YMM9, %k1
+	VPTESTM	%VMM(9), %VMM(9), %k1
 # ifndef USE_AS_STRCASECMP_L
-	vpxorq	(VEC_SIZE * 0)(%rsi), %YMM0, %YMM1
-	vpxorq	(VEC_SIZE * 1)(%rsi), %YMM2, %YMM3
-	vpxorq	(VEC_SIZE * 2)(%rsi), %YMM4, %YMM5
+	vpxorq	(VEC_SIZE * 0)(%rsi), %VMM(0), %VMM(1)
+	vpxorq	(VEC_SIZE * 1)(%rsi), %VMM(2), %VMM(3)
+	vpxorq	(VEC_SIZE * 2)(%rsi), %VMM(4), %VMM(5)
 	/* Ternary logic to xor (VEC_SIZE * 3)(%rsi) with YMM6 while
 	   oring with YMM1. Result is stored in YMM6.  */
-	vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM1, %YMM6
+	vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %VMM(1), %VMM(6)
 # else
-	VMOVU	(VEC_SIZE * 0)(%rsi), %YMM1
-	TOLOWER_YMM (%YMM0, %YMM1)
-	VMOVU	(VEC_SIZE * 1)(%rsi), %YMM3
-	TOLOWER_YMM (%YMM2, %YMM3)
-	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM5
-	TOLOWER_YMM (%YMM4, %YMM5)
-	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM7
-	TOLOWER_YMM (%YMM6, %YMM7)
-	vpxorq	%YMM0, %YMM1, %YMM1
-	vpxorq	%YMM2, %YMM3, %YMM3
-	vpxorq	%YMM4, %YMM5, %YMM5
-	vpternlogd $0xde, %YMM7, %YMM1, %YMM6
+	VMOVU	(VEC_SIZE * 0)(%rsi), %VMM(1)
+	TOLOWER_VMM (%VMM(0), %VMM(1))
+	VMOVU	(VEC_SIZE * 1)(%rsi), %VMM(3)
+	TOLOWER_VMM (%VMM(2), %VMM(3))
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VMM(5)
+	TOLOWER_VMM (%VMM(4), %VMM(5))
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VMM(7)
+	TOLOWER_VMM (%VMM(6), %VMM(7))
+	vpxorq	%VMM(0), %VMM(1), %VMM(1)
+	vpxorq	%VMM(2), %VMM(3), %VMM(3)
+	vpxorq	%VMM(4), %VMM(5), %VMM(5)
+	vpternlogd $0xde, %VMM(7), %VMM(1), %VMM(6)
 # endif
 	/* Or together YMM3, YMM5, and YMM6.  */
-	vpternlogd $0xfe, %YMM3, %YMM5, %YMM6
+	vpternlogd $0xfe, %VMM(3), %VMM(5), %VMM(6)
 
 
 	/* A non-zero CHAR in YMM6 represents a mismatch.  */
-	VPTESTNM %YMM6, %YMM6, %k0{%k1}
-	kmovd	%k0, %LOOP_REG
+	VPTESTNM %VMM(6), %VMM(6), %k0{%k1}
+	KMOV	%k0, %LOOP_REG
 
 	TESTEQ	%LOOP_REG
 	jz	L(loop)
 
 
 	/* Find which VEC has the mismatch of end of string.  */
-	VPTESTM	%YMM0, %YMM0, %k1
-	VPTESTNM %YMM1, %YMM1, %k0{%k1}
-	kmovd	%k0, %ecx
-	TESTEQ	%ecx
+	VPTESTM	%VMM(0), %VMM(0), %k1
+	VPTESTNM %VMM(1), %VMM(1), %k0{%k1}
+	KMOV	%k0, %VRCX
+	TESTEQ	%VRCX
 	jnz	L(return_vec_0_end)
 
-	VPTESTM	%YMM2, %YMM2, %k1
-	VPTESTNM %YMM3, %YMM3, %k0{%k1}
-	kmovd	%k0, %ecx
-	TESTEQ	%ecx
+	VPTESTM	%VMM(2), %VMM(2), %k1
+	VPTESTNM %VMM(3), %VMM(3), %k0{%k1}
+	KMOV	%k0, %VRCX
+	TESTEQ	%VRCX
 	jnz	L(return_vec_1_end)
 
 
-	/* Handle VEC 2 and 3 without branches.  */
+	/* Handle VEC 2 and 3 without branches if CHAR_PER_VEC <= 32.
+	 */
 L(return_vec_2_3_end):
 # ifdef USE_AS_STRNCMP
 	subq	$(CHAR_PER_VEC * 2), %rdx
 	jbe	L(ret_zero_end)
 # endif
 
-	VPTESTM	%YMM4, %YMM4, %k1
-	VPTESTNM %YMM5, %YMM5, %k0{%k1}
-	kmovd	%k0, %ecx
-	TESTEQ	%ecx
+	VPTESTM	%VMM(4), %VMM(4), %k1
+	VPTESTNM %VMM(5), %VMM(5), %k0{%k1}
+	KMOV	%k0, %VRCX
+	TESTEQ	%VRCX
 # if CHAR_PER_VEC <= 16
 	sall	$CHAR_PER_VEC, %LOOP_REG
 	orl	%ecx, %LOOP_REG
-# else
+# elif CHAR_PER_VEC <= 32
 	salq	$CHAR_PER_VEC, %LOOP_REG64
 	orq	%rcx, %LOOP_REG64
+# else
+	/* We aren't combining last 2x VEC so branch on the second to
+	   last.  */
+	jnz	L(return_vec_2_end)
 # endif
-L(return_vec_3_end):
+
 	/* LOOP_REG contains matches for null/mismatch from the loop. If
-	   VEC 0,1,and 2 all have no null and no mismatches then mismatch
-	   must entirely be from VEC 3 which is fully represented by
-	   LOOP_REG.  */
+	   VEC 0,1,and 2 all have no null and no mismatches then
+	   mismatch must entirely be from VEC 3 which is fully
+	   represented by LOOP_REG.  */
 # if CHAR_PER_VEC <= 16
-	tzcntl	%LOOP_REG, %LOOP_REG
+	bsf	%LOOP_REG, %LOOP_REG
 # else
-	tzcntq	%LOOP_REG64, %LOOP_REG64
+	bsfq	%LOOP_REG64, %LOOP_REG64
 # endif
 # ifdef USE_AS_STRNCMP
+
+	/* If CHAR_PER_VEC == 64 we can't combine last 2x VEC so need to
+	   adjust length before last comparison.  */
+#  if CHAR_PER_VEC == 64
+	subq	$CHAR_PER_VEC, %rdx
+	jbe	L(ret_zero_end)
+#  endif
+
 	cmpq	%LOOP_REG64, %rdx
 	jbe	L(ret_zero_end)
 # endif
 
 # ifdef USE_AS_WCSCMP
-	movl	(VEC_SIZE * 2)(%rdi, %LOOP_REG64, SIZE_OF_CHAR), %ecx
+	movl	(FALLTHROUGH_RETURN_OFFSET)(%rdi, %LOOP_REG64, SIZE_OF_CHAR), %ecx
 	xorl	%eax, %eax
-	cmpl	(VEC_SIZE * 2)(%rsi, %LOOP_REG64, SIZE_OF_CHAR), %ecx
+	cmpl	(FALLTHROUGH_RETURN_OFFSET)(%rsi, %LOOP_REG64, SIZE_OF_CHAR), %ecx
 	je	L(ret5)
 	setl	%al
 	negl	%eax
 	xorl	%r8d, %eax
 # else
-	movzbl	(VEC_SIZE * 2)(%rdi, %LOOP_REG64), %eax
-	movzbl	(VEC_SIZE * 2)(%rsi, %LOOP_REG64), %ecx
+	movzbl	(FALLTHROUGH_RETURN_OFFSET)(%rdi, %LOOP_REG64), %eax
+	movzbl	(FALLTHROUGH_RETURN_OFFSET)(%rsi, %LOOP_REG64), %ecx
 	TOLOWER_gpr (%rax, %eax)
 	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
@@ -686,23 +748,39 @@ L(ret_zero_end):
 # endif
 
 
+
 	/* The L(return_vec_N_end) differ from L(return_vec_N) in that
-	   they use the value of `r8` to negate the return value. This is
-	   because the page cross logic can swap `rdi` and `rsi`.  */
+	   they use the value of `r8` to negate the return value. This
+	   is because the page cross logic can swap `rdi` and `rsi`.
+	 */
 	.p2align 4,, 10
 # ifdef USE_AS_STRNCMP
 L(return_vec_1_end):
-#  if CHAR_PER_VEC <= 16
+#  if CHAR_PER_VEC <= 32
+	/* If CHAR_PER_VEC <= 32 reuse code from L(return_vec_0_end)
+	   without additional branches by adjusting the bit positions
+	   from VEC1.  We can't do this for CHAR_PER_VEC == 64.  */
+#   if CHAR_PER_VEC <= 16
 	sall	$CHAR_PER_VEC, %ecx
-#  else
+#   else
 	salq	$CHAR_PER_VEC, %rcx
+#   endif
+#  else
+	/* If CHAR_PER_VEC == 64 we can't shift the return GPR so just
+	   check it.  */
+	bsf	%VRCX, %VRCX
+	addl	$(CHAR_PER_VEC), %ecx
+	cmpq	%rcx, %rdx
+	ja	L(ret_vec_0_end_finish)
+	xorl	%eax, %eax
+	ret
 #  endif
 # endif
 L(return_vec_0_end):
 # if (CHAR_PER_VEC <= 16) || !(defined USE_AS_STRNCMP)
-	tzcntl	%ecx, %ecx
+	bsf	%VRCX, %VRCX
 # else
-	tzcntq	%rcx, %rcx
+	bsfq	%rcx, %rcx
 # endif
 
 # ifdef USE_AS_STRNCMP
@@ -710,6 +788,7 @@ L(return_vec_0_end):
 	jbe	L(ret_zero_end)
 # endif
 
+L(ret_vec_0_end_finish):
 # ifdef USE_AS_WCSCMP
 	movl	(%rdi, %rcx, SIZE_OF_CHAR), %edx
 	xorl	%eax, %eax
@@ -737,7 +816,7 @@ L(ret6):
 # ifndef USE_AS_STRNCMP
 	.p2align 4,, 10
 L(return_vec_1_end):
-	tzcntl	%ecx, %ecx
+	bsf	%VRCX, %VRCX
 #  ifdef USE_AS_WCSCMP
 	movl	VEC_SIZE(%rdi, %rcx, SIZE_OF_CHAR), %edx
 	xorl	%eax, %eax
@@ -760,6 +839,41 @@ L(ret7):
 # endif
 
 
+	/* If CHAR_PER_VEC == 64 we can't combine matches from the last
+	   2x VEC so need separate return label.  */
+# if CHAR_PER_VEC == 64
+L(return_vec_2_end):
+	bsf	%VRCX, %VRCX
+#  ifdef USE_AS_STRNCMP
+	cmpq	%rcx, %rdx
+	jbe	L(ret_zero_end)
+#  endif
+#  ifdef USE_AS_WCSCMP
+	movl	(VEC_SIZE * 2)(%rdi, %rcx, SIZE_OF_CHAR), %edx
+	xorl	%eax, %eax
+	cmpl	(VEC_SIZE * 2)(%rsi, %rcx, SIZE_OF_CHAR), %edx
+	je	L(ret31)
+	setl	%al
+	negl	%eax
+	/* This is the non-zero case for `eax` so just xorl with `r8d`
+	   to flip the sign if `rdi` and `rsi` were swapped.  */
+	xorl	%r8d, %eax
+#  else
+	movzbl	(VEC_SIZE * 2)(%rdi, %rcx), %eax
+	movzbl	(VEC_SIZE * 2)(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
+	subl	%ecx, %eax
+	/* Flip `eax` if `rdi` and `rsi` were swapped in page cross
+	   logic. Subtract `r8d` after xor for zero case.  */
+	xorl	%r8d, %eax
+	subl	%r8d, %eax
+#  endif
+L(ret13):
+	ret
+# endif
+
+
 	/* Page cross in rsi in next 4x VEC.  */
 
 	/* TODO: Improve logic here.  */
@@ -778,11 +892,11 @@ L(page_cross_during_loop):
 	cmpl	$-(VEC_SIZE * 3), %eax
 	jle	L(less_1x_vec_till_page_cross)
 
-	VMOVA	(%rdi), %YMM0
-	VPTESTM	%YMM0, %YMM0, %k2
-	CMP_R1_S2_YMM (%YMM0, (%rsi), %YMM1, %k1){%k2}
-	kmovd	%k1, %ecx
-	TESTEQ	%ecx
+	VMOVA	(%rdi), %VMM(0)
+	VPTESTM	%VMM(0), %VMM(0), %k2
+	CMP_R1_S2_VMM (%VMM(0), (%rsi), %VMM(1), %k1){%k2}
+	KMOV	%k1, %VRCX
+	TESTEQ	%VRCX
 	jnz	L(return_vec_0_end)
 
 	/* if distance >= 2x VEC then eax > -(VEC_SIZE * 2).  */
@@ -799,9 +913,9 @@ L(less_1x_vec_till_page_cross):
 	   to read back -VEC_SIZE. If rdi is truly at the start of a page
 	   here, it means the previous page (rdi - VEC_SIZE) has already
 	   been loaded earlier so must be valid.  */
-	VMOVU	-VEC_SIZE(%rdi, %rax), %YMM0
-	VPTESTM	%YMM0, %YMM0, %k2
-	CMP_R1_S2_YMM (%YMM0, -VEC_SIZE(%rsi, %rax), %YMM1, %k1){%k2}
+	VMOVU	-VEC_SIZE(%rdi, %rax), %VMM(0)
+	VPTESTM	%VMM(0), %VMM(0), %k2
+	CMP_R1_S2_VMM (%VMM(0), -VEC_SIZE(%rsi, %rax), %VMM(1), %k1){%k2}
 	/* Mask of potentially valid bits. The lower bits can be out of
 	   range comparisons (but safe regarding page crosses).  */
 
@@ -813,12 +927,12 @@ L(less_1x_vec_till_page_cross):
 	shlxl	%ecx, %r10d, %ecx
 	movzbl	%cl, %r10d
 # else
-	movl	$-1, %ecx
-	shlxl	%esi, %ecx, %r10d
+	mov	$-1, %VRCX
+	shlx	%VRSI, %VRCX, %VR10
 # endif
 
-	kmovd	%k1, %ecx
-	notl	%ecx
+	KMOV	%k1, %VRCX
+	not	%VRCX
 
 
 # ifdef USE_AS_STRNCMP
@@ -838,12 +952,10 @@ L(less_1x_vec_till_page_cross):
 	/* Readjust eax before potentially returning to the loop.  */
 	addl	$(PAGE_SIZE - VEC_SIZE * 4), %eax
 
-	andl	%r10d, %ecx
+	and	%VR10, %VRCX
 	jz	L(loop_skip_page_cross_check)
 
-	.p2align 4,, 3
-L(return_page_cross_end):
-	tzcntl	%ecx, %ecx
+	bsf	%VRCX, %VRCX
 
 # if (defined USE_AS_STRNCMP) || (defined USE_AS_WCSCMP)
 	leal	-VEC_SIZE(%OFFSET_REG64, %rcx, SIZE_OF_CHAR), %ecx
@@ -874,8 +986,12 @@ L(ret8):
 # ifdef USE_AS_STRNCMP
 	.p2align 4,, 10
 L(return_page_cross_end_check):
-	andl	%r10d, %ecx
-	tzcntl	%ecx, %ecx
+	and	%VR10, %VRCX
+	/* Need to use tzcnt here as VRCX may be zero.  If VRCX is zero
+	   tzcnt(VRCX) will be CHAR_PER_VEC and remaining length (edx)
+	   is guaranteed to be <= CHAR_PER_VEC so we will only use the
+	   return idx if VRCX was non-zero.  */
+	tzcnt	%VRCX, %VRCX
 	leal	-VEC_SIZE(%rax, %rcx, SIZE_OF_CHAR), %ecx
 #  ifdef USE_AS_WCSCMP
 	sall	$2, %edx
@@ -892,11 +1008,11 @@ L(more_2x_vec_till_page_cross):
 	/* If more 2x vec till cross we will complete a full loop
 	   iteration here.  */
 
-	VMOVA	VEC_SIZE(%rdi), %YMM0
-	VPTESTM	%YMM0, %YMM0, %k2
-	CMP_R1_S2_YMM (%YMM0, VEC_SIZE(%rsi), %YMM1, %k1){%k2}
-	kmovd	%k1, %ecx
-	TESTEQ	%ecx
+	VMOVA	VEC_SIZE(%rdi), %VMM(0)
+	VPTESTM	%VMM(0), %VMM(0), %k2
+	CMP_R1_S2_VMM (%VMM(0), VEC_SIZE(%rsi), %VMM(1), %k1){%k2}
+	KMOV	%k1, %VRCX
+	TESTEQ	%VRCX
 	jnz	L(return_vec_1_end)
 
 # ifdef USE_AS_STRNCMP
@@ -907,18 +1023,18 @@ L(more_2x_vec_till_page_cross):
 	subl	$-(VEC_SIZE * 4), %eax
 
 	/* Safe to include comparisons from lower bytes.  */
-	VMOVU	-(VEC_SIZE * 2)(%rdi, %rax), %YMM0
-	VPTESTM	%YMM0, %YMM0, %k2
-	CMP_R1_S2_YMM (%YMM0, -(VEC_SIZE * 2)(%rsi, %rax), %YMM1, %k1){%k2}
-	kmovd	%k1, %ecx
-	TESTEQ	%ecx
+	VMOVU	-(VEC_SIZE * 2)(%rdi, %rax), %VMM(0)
+	VPTESTM	%VMM(0), %VMM(0), %k2
+	CMP_R1_S2_VMM (%VMM(0), -(VEC_SIZE * 2)(%rsi, %rax), %VMM(1), %k1){%k2}
+	KMOV	%k1, %VRCX
+	TESTEQ	%VRCX
 	jnz	L(return_vec_page_cross_0)
 
-	VMOVU	-(VEC_SIZE * 1)(%rdi, %rax), %YMM0
-	VPTESTM	%YMM0, %YMM0, %k2
-	CMP_R1_S2_YMM (%YMM0, -(VEC_SIZE * 1)(%rsi, %rax), %YMM1, %k1){%k2}
-	kmovd	%k1, %ecx
-	TESTEQ	%ecx
+	VMOVU	-(VEC_SIZE * 1)(%rdi, %rax), %VMM(0)
+	VPTESTM	%VMM(0), %VMM(0), %k2
+	CMP_R1_S2_VMM (%VMM(0), -(VEC_SIZE * 1)(%rsi, %rax), %VMM(1), %k1){%k2}
+	KMOV	%k1, %VRCX
+	TESTEQ	%VRCX
 	jnz	L(return_vec_page_cross_1)
 
 # ifdef USE_AS_STRNCMP
@@ -937,30 +1053,30 @@ L(more_2x_vec_till_page_cross):
 # endif
 
 	/* Finish the loop.  */
-	VMOVA	(VEC_SIZE * 2)(%rdi), %YMM4
-	VMOVA	(VEC_SIZE * 3)(%rdi), %YMM6
-	VPMINU	%YMM4, %YMM6, %YMM9
-	VPTESTM	%YMM9, %YMM9, %k1
+	VMOVA	(VEC_SIZE * 2)(%rdi), %VMM(4)
+	VMOVA	(VEC_SIZE * 3)(%rdi), %VMM(6)
+	VPMINU	%VMM(4), %VMM(6), %VMM(9)
+	VPTESTM	%VMM(9), %VMM(9), %k1
 # ifndef USE_AS_STRCASECMP_L
-	vpxorq	(VEC_SIZE * 2)(%rsi), %YMM4, %YMM5
+	vpxorq	(VEC_SIZE * 2)(%rsi), %VMM(4), %VMM(5)
 	/* YMM6 = YMM5 | ((VEC_SIZE * 3)(%rsi) ^ YMM6).  */
-	vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM5, %YMM6
+	vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %VMM(5), %VMM(6)
 # else
-	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM5
-	TOLOWER_YMM (%YMM4, %YMM5)
-	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM7
-	TOLOWER_YMM (%YMM6, %YMM7)
-	vpxorq	%YMM4, %YMM5, %YMM5
-	vpternlogd $0xde, %YMM7, %YMM5, %YMM6
-# endif
-	VPTESTNM %YMM6, %YMM6, %k0{%k1}
-	kmovd	%k0, %LOOP_REG
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VMM(5)
+	TOLOWER_VMM (%VMM(4), %VMM(5))
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VMM(7)
+	TOLOWER_VMM (%VMM(6), %VMM(7))
+	vpxorq	%VMM(4), %VMM(5), %VMM(5)
+	vpternlogd $0xde, %VMM(7), %VMM(5), %VMM(6)
+# endif
+	VPTESTNM %VMM(6), %VMM(6), %k0{%k1}
+	KMOV	%k0, %LOOP_REG
 	TESTEQ	%LOOP_REG
 	jnz	L(return_vec_2_3_end)
 
 	/* Best for code size to include ucond-jmp here. Would be faster
-	   if this case is hot to duplicate the L(return_vec_2_3_end) code
-	   as fall-through and have jump back to loop on mismatch
+	   if this case is hot to duplicate the L(return_vec_2_3_end)
+	   code as fall-through and have jump back to loop on mismatch
 	   comparison.  */
 	subq	$-(VEC_SIZE * 4), %rdi
 	subq	$-(VEC_SIZE * 4), %rsi
@@ -980,7 +1096,7 @@ L(ret_zero_in_loop_page_cross):
 L(return_vec_page_cross_0):
 	addl	$-VEC_SIZE, %eax
 L(return_vec_page_cross_1):
-	tzcntl	%ecx, %ecx
+	bsf	%VRCX, %VRCX
 # if defined USE_AS_STRNCMP || defined USE_AS_WCSCMP
 	leal	-VEC_SIZE(%rax, %rcx, SIZE_OF_CHAR), %ecx
 #  ifdef USE_AS_STRNCMP
@@ -1023,8 +1139,8 @@ L(ret9):
 L(page_cross):
 # ifndef USE_AS_STRNCMP
 	/* If both are VEC aligned we don't need any special logic here.
-	   Only valid for strcmp where stop condition is guranteed to be
-	   reachable by just reading memory.  */
+	   Only valid for strcmp where stop condition is guaranteed to
+	   be reachable by just reading memory.  */
 	testl	$((VEC_SIZE - 1) << 20), %eax
 	jz	L(no_page_cross)
 # endif
@@ -1065,11 +1181,11 @@ L(page_cross):
 	   loadable memory until within 1x VEC of page cross.  */
 	.p2align 4,, 8
 L(page_cross_loop):
-	VMOVU	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0
-	VPTESTM	%YMM0, %YMM0, %k2
-	CMP_R1_S2_YMM (%YMM0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM1, %k1){%k2}
-	kmovd	%k1, %ecx
-	TESTEQ	%ecx
+	VMOVU	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %VMM(0)
+	VPTESTM	%VMM(0), %VMM(0), %k2
+	CMP_R1_S2_VMM (%VMM(0), (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %VMM(1), %k1){%k2}
+	KMOV	%k1, %VRCX
+	TESTEQ	%VRCX
 	jnz	L(check_ret_vec_page_cross)
 	addl	$CHAR_PER_VEC, %OFFSET_REG
 # ifdef USE_AS_STRNCMP
@@ -1087,13 +1203,13 @@ L(page_cross_loop):
 	subl	%eax, %OFFSET_REG
 	/* OFFSET_REG has distance to page cross - VEC_SIZE. Guranteed
 	   to not cross page so is safe to load. Since we have already
-	   loaded at least 1 VEC from rsi it is also guranteed to be safe.
-	 */
-	VMOVU	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0
-	VPTESTM	%YMM0, %YMM0, %k2
-	CMP_R1_S2_YMM (%YMM0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM1, %k1){%k2}
+	   loaded at least 1 VEC from rsi it is also guaranteed to be
+	   safe.  */
+	VMOVU	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %VMM(0)
+	VPTESTM	%VMM(0), %VMM(0), %k2
+	CMP_R1_S2_VMM (%VMM(0), (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %VMM(1), %k1){%k2}
 
-	kmovd	%k1, %ecx
+	KMOV	%k1, %VRCX
 # ifdef USE_AS_STRNCMP
 	leal	CHAR_PER_VEC(%OFFSET_REG64), %eax
 	cmpq	%rax, %rdx
@@ -1104,7 +1220,7 @@ L(page_cross_loop):
 	addq	%rdi, %rdx
 #  endif
 # endif
-	TESTEQ	%ecx
+	TESTEQ	%VRCX
 	jz	L(prepare_loop_no_len)
 
 	.p2align 4,, 4
@@ -1112,7 +1228,7 @@ L(ret_vec_page_cross):
 # ifndef USE_AS_STRNCMP
 L(check_ret_vec_page_cross):
 # endif
-	tzcntl	%ecx, %ecx
+	tzcnt	%VRCX, %VRCX
 	addl	%OFFSET_REG, %ecx
 L(ret_vec_page_cross_cont):
 # ifdef USE_AS_WCSCMP
@@ -1139,9 +1255,9 @@ L(ret12):
 # ifdef USE_AS_STRNCMP
 	.p2align 4,, 10
 L(check_ret_vec_page_cross2):
-	TESTEQ	%ecx
+	TESTEQ	%VRCX
 L(check_ret_vec_page_cross):
-	tzcntl	%ecx, %ecx
+	tzcnt	%VRCX, %VRCX
 	addl	%OFFSET_REG, %ecx
 	cmpq	%rcx, %rdx
 	ja	L(ret_vec_page_cross_cont)
@@ -1180,8 +1296,71 @@ L(less_1x_vec_till_page):
 # ifdef USE_AS_WCSCMP
 	shrl	$2, %eax
 # endif
+
+	/* Find largest load size we can use.  For VEC_SIZE == 64 only
+	   check if we can do a full ymm load.  */
+# if VEC_SIZE == 64
+
+	cmpl	$((VEC_SIZE - 32) / SIZE_OF_CHAR), %eax
+	ja	L(less_32_till_page)
+
+
+	/* Use 32 byte (ymm) comparison.  */
+	VMOVU	(%rdi), %VMM_256(0)
+	VPTESTM	%VMM_256(0), %VMM_256(0), %k2
+	CMP_R1_S2_YMM (%VMM_256(0), (%rsi), %VMM_256(1), %k1){%k2}
+	kmovd	%k1, %ecx
+#  ifdef USE_AS_WCSCMP
+	subl	$0xff, %ecx
+#  else
+	incl	%ecx
+#  endif
+	jnz	L(check_ret_vec_page_cross)
+	movl	$((VEC_SIZE - 32) / SIZE_OF_CHAR), %OFFSET_REG
+#  ifdef USE_AS_STRNCMP
+	cmpq	%OFFSET_REG64, %rdx
+	jbe	L(ret_zero_page_cross_slow_case64)
+	subl	%eax, %OFFSET_REG
+#  else
+	/* Explicit check for 32 byte alignment.  */
+	subl	%eax, %OFFSET_REG
+	jz	L(prepare_loop)
+#  endif
+	VMOVU	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %VMM_256(0)
+	VPTESTM	%VMM_256(0), %VMM_256(0), %k2
+	CMP_R1_S2_YMM (%VMM_256(0), (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %VMM_256(1), %k1){%k2}
+	kmovd	%k1, %ecx
+#  ifdef USE_AS_WCSCMP
+	subl	$0xff, %ecx
+#  else
+	incl	%ecx
+#  endif
+	jnz	L(check_ret_vec_page_cross)
+#  ifdef USE_AS_STRNCMP
+	addl	$(32 / SIZE_OF_CHAR), %OFFSET_REG
+	subq	%OFFSET_REG64, %rdx
+	jbe	L(ret_zero_page_cross_slow_case64)
+	subq	$-(CHAR_PER_VEC * 4), %rdx
+
+	leaq	-(VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
+	leaq	-(VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
+#  else
+	leaq	(32 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
+	leaq	(32 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
+#  endif
+	jmp	L(prepare_loop_aligned)
+
+#  ifdef USE_AS_STRNCMP
+	.p2align 4,, 2
+L(ret_zero_page_cross_slow_case64):
+	xorl	%eax, %eax
+	ret
+#  endif
+L(less_32_till_page):
+# endif
+
 	/* Find largest load size we can use.  */
-	cmpl	$(16 / SIZE_OF_CHAR), %eax
+	cmpl	$((VEC_SIZE - 16) / SIZE_OF_CHAR), %eax
 	ja	L(less_16_till_page)
 
 	/* Use 16 byte comparison.  */
@@ -1195,9 +1374,14 @@ L(less_1x_vec_till_page):
 	incw	%cx
 # endif
 	jnz	L(check_ret_vec_page_cross)
-	movl	$(16 / SIZE_OF_CHAR), %OFFSET_REG
+
+	movl	$((VEC_SIZE - 16) / SIZE_OF_CHAR), %OFFSET_REG
 # ifdef USE_AS_STRNCMP
+#  if VEC_SIZE == 32
 	cmpq	%OFFSET_REG64, %rdx
+#  else
+	cmpq	$(16 / SIZE_OF_CHAR), %rdx
+#  endif
 	jbe	L(ret_zero_page_cross_slow_case0)
 	subl	%eax, %OFFSET_REG
 # else
@@ -1239,7 +1423,7 @@ L(ret_zero_page_cross_slow_case0):
 
 	.p2align 4,, 10
 L(less_16_till_page):
-	cmpl	$(24 / SIZE_OF_CHAR), %eax
+	cmpl	$((VEC_SIZE - 8) / SIZE_OF_CHAR), %eax
 	ja	L(less_8_till_page)
 
 	/* Use 8 byte comparison.  */
@@ -1260,7 +1444,7 @@ L(less_16_till_page):
 	cmpq	$(8 / SIZE_OF_CHAR), %rdx
 	jbe	L(ret_zero_page_cross_slow_case0)
 # endif
-	movl	$(24 / SIZE_OF_CHAR), %OFFSET_REG
+	movl	$((VEC_SIZE - 8) / SIZE_OF_CHAR), %OFFSET_REG
 	subl	%eax, %OFFSET_REG
 
 	vmovq	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
@@ -1320,7 +1504,7 @@ L(ret_less_8_wcs):
 	ret
 
 # else
-	cmpl	$28, %eax
+	cmpl	$(VEC_SIZE - 4), %eax
 	ja	L(less_4_till_page)
 
 	vmovd	(%rdi), %xmm0
@@ -1335,7 +1519,7 @@ L(ret_less_8_wcs):
 	cmpq	$4, %rdx
 	jbe	L(ret_zero_page_cross_slow_case1)
 #  endif
-	movl	$(28 / SIZE_OF_CHAR), %OFFSET_REG
+	movl	$((VEC_SIZE - 4) / SIZE_OF_CHAR), %OFFSET_REG
 	subl	%eax, %OFFSET_REG
 
 	vmovd	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
@@ -1386,7 +1570,7 @@ L(less_4_loop):
 #  endif
 	incq	%rdi
 	/* end condition is reach page boundary (rdi is aligned).  */
-	testl	$31, %edi
+	testb	$(VEC_SIZE - 1), %dil
 	jnz	L(less_4_loop)
 	leaq	-(VEC_SIZE * 4)(%rdi, %rsi), %rsi
 	addq	$-(VEC_SIZE * 4), %rdi
-- 
2.34.1
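
The page-cross hunks above all share one idea: when the next full VEC
load would cross a page, drop down to the largest load (32, 16, 8,
then 4 bytes) that still fits before the boundary.  A rough C sketch
of that selection, assuming a 4096-byte page (illustrative only; the
assembly encodes the same checks as a chain of
`cmpl $((VEC_SIZE - N) / SIZE_OF_CHAR), %eax` branches):

    #include <stdint.h>
    #include <stdio.h>

    #define PAGE_SIZE 4096

    /* Largest load size (max_load, max_load / 2, ...) that can read
       at `p` without touching the next page.  */
    static unsigned int
    largest_safe_load (const void *p, unsigned int max_load)
    {
      unsigned int to_page
        = PAGE_SIZE - ((uintptr_t) p & (PAGE_SIZE - 1));
      unsigned int sz = max_load;
      while (sz > to_page)
        sz >>= 1;		/* 64 -> 32 -> 16 -> 8 -> 4 ...  */
      return sz;
    }

    int
    main (void)
    {
      char buf[64];
      printf ("%u\n", largest_safe_load (buf, 32));
      return 0;
    }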


^ permalink raw reply	[flat|nested] 41+ messages in thread

* [PATCH v3 7/7] Bench: Improve benchtests for memchr, strchr, strnlen, strrchr
  2022-10-19  0:44 ` [PATCH v3 1/7] x86: Optimize memchr-evex.S and implement with VMM headers Noah Goldstein
                     ` (4 preceding siblings ...)
  2022-10-19  0:44   ` [PATCH v3 6/7] x86: Add support for VEC_SIZE == 64 in strcmp-evex.S impl Noah Goldstein
@ 2022-10-19  0:44   ` Noah Goldstein
  2022-10-19 17:00     ` H.J. Lu
  2022-10-19 16:52   ` [PATCH v3 1/7] x86: Optimize memchr-evex.S and implement with VMM headers H.J. Lu
  6 siblings, 1 reply; 41+ messages in thread
From: Noah Goldstein @ 2022-10-19  0:44 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

1. Add more complete coverage in the medium size range (a condensed
   sketch of the added pattern follows the patch).
2. In strnlen remove the `1 << i` shift, which was undefined
   behavior (`i` could go beyond 32/64); see the sketch below.
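
A minimal standalone illustration of the removed UB (hypothetical
helper, not benchtest code):

    #include <assert.h>
    #include <stdio.h>

    /* Shift counts must be strictly less than the width of the
       (promoted) left operand (C11 6.5.7p3), so `1 << i` (an int
       shift) is UB for i >= 32, and i == 31 already overflows a
       signed int; even `1ULL << i` breaks once i >= 64.  */
    static unsigned long long
    pow2 (unsigned int i)
    {
      assert (i < 64);	/* The old bench loops let i grow past this.  */
      return 1ULL << i;
    }

    int
    main (void)
    {
      printf ("%llu\n", pow2 (10));	/* 1024 */
      return 0;
    }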
---
 benchtests/bench-memchr.c    | 77 +++++++++++++++++++++++++-----------
 benchtests/bench-rawmemchr.c | 30 ++++++++++++--
 benchtests/bench-strchr.c    | 35 +++++++++++-----
 benchtests/bench-strnlen.c   | 12 +++---
 benchtests/bench-strrchr.c   | 28 ++++++++++++-
 5 files changed, 137 insertions(+), 45 deletions(-)

diff --git a/benchtests/bench-memchr.c b/benchtests/bench-memchr.c
index 0facda2fa0..2ec9dd86d0 100644
--- a/benchtests/bench-memchr.c
+++ b/benchtests/bench-memchr.c
@@ -126,7 +126,7 @@ do_test (json_ctx_t *json_ctx, size_t align, size_t pos, size_t len,
 int
 test_main (void)
 {
-  size_t i;
+  size_t i, j, al, al_max;
   int repeats;
   json_ctx_t json_ctx;
   test_init ();
@@ -147,35 +147,46 @@ test_main (void)
 
   json_array_begin (&json_ctx, "results");
 
+  al_max = 0;
+#ifdef USE_AS_MEMRCHR
+  al_max = getpagesize () / 2;
+#endif
+
   for (repeats = 0; repeats < 2; ++repeats)
     {
-      for (i = 1; i < 8; ++i)
+      for (al = 0; al <= al_max; al += getpagesize () / 2)
 	{
-	  do_test (&json_ctx, 0, 16 << i, 2048, 23, repeats);
-	  do_test (&json_ctx, i, 64, 256, 23, repeats);
-	  do_test (&json_ctx, 0, 16 << i, 2048, 0, repeats);
-	  do_test (&json_ctx, i, 64, 256, 0, repeats);
-
-	  do_test (&json_ctx, getpagesize () - 15, 64, 256, 0, repeats);
+	  for (i = 1; i < 8; ++i)
+	    {
+	      do_test (&json_ctx, al, 16 << i, 2048, 23, repeats);
+	      do_test (&json_ctx, al + i, 64, 256, 23, repeats);
+	      do_test (&json_ctx, al, 16 << i, 2048, 0, repeats);
+	      do_test (&json_ctx, al + i, 64, 256, 0, repeats);
+
+	      do_test (&json_ctx, al + getpagesize () - 15, 64, 256, 0,
+		       repeats);
 #ifdef USE_AS_MEMRCHR
-	  /* Also test the position close to the beginning for memrchr.  */
-	  do_test (&json_ctx, 0, i, 256, 23, repeats);
-	  do_test (&json_ctx, 0, i, 256, 0, repeats);
-	  do_test (&json_ctx, i, i, 256, 23, repeats);
-	  do_test (&json_ctx, i, i, 256, 0, repeats);
+	      /* Also test the position close to the beginning for memrchr.  */
+	      do_test (&json_ctx, al, i, 256, 23, repeats);
+	      do_test (&json_ctx, al, i, 256, 0, repeats);
+	      do_test (&json_ctx, al + i, i, 256, 23, repeats);
+	      do_test (&json_ctx, al + i, i, 256, 0, repeats);
 #endif
+	    }
+	  for (i = 1; i < 8; ++i)
+	    {
+	      do_test (&json_ctx, al + i, i << 5, 192, 23, repeats);
+	      do_test (&json_ctx, al + i, i << 5, 192, 0, repeats);
+	      do_test (&json_ctx, al + i, i << 5, 256, 23, repeats);
+	      do_test (&json_ctx, al + i, i << 5, 256, 0, repeats);
+	      do_test (&json_ctx, al + i, i << 5, 512, 23, repeats);
+	      do_test (&json_ctx, al + i, i << 5, 512, 0, repeats);
+
+	      do_test (&json_ctx, al + getpagesize () - 15, i << 5, 256, 23,
+		       repeats);
+	    }
 	}
-      for (i = 1; i < 8; ++i)
-	{
-	  do_test (&json_ctx, i, i << 5, 192, 23, repeats);
-	  do_test (&json_ctx, i, i << 5, 192, 0, repeats);
-	  do_test (&json_ctx, i, i << 5, 256, 23, repeats);
-	  do_test (&json_ctx, i, i << 5, 256, 0, repeats);
-	  do_test (&json_ctx, i, i << 5, 512, 23, repeats);
-	  do_test (&json_ctx, i, i << 5, 512, 0, repeats);
-
-	  do_test (&json_ctx, getpagesize () - 15, i << 5, 256, 23, repeats);
-	}
+
       for (i = 1; i < 32; ++i)
 	{
 	  do_test (&json_ctx, 0, i, i + 1, 23, repeats);
@@ -207,6 +218,24 @@ test_main (void)
 	  do_test (&json_ctx, 0, 2, i + 1, 0, repeats);
 #endif
 	}
+      for (al = 0; al <= al_max; al += getpagesize () / 2)
+	{
+	  for (i = (16 / sizeof (CHAR)); i <= (8192 / sizeof (CHAR)); i += i)
+	    {
+	      for (j = 0; j <= (384 / sizeof (CHAR));
+		   j += (32 / sizeof (CHAR)))
+		{
+		  do_test (&json_ctx, al, i + j, i, 23, repeats);
+		  do_test (&json_ctx, al, i, i + j, 23, repeats);
+		  if (j < i)
+		    {
+		      do_test (&json_ctx, al, i - j, i, 23, repeats);
+		      do_test (&json_ctx, al, i, i - j, 23, repeats);
+		    }
+		}
+	    }
+	}
+
 #ifndef USE_AS_MEMRCHR
       break;
 #endif
diff --git a/benchtests/bench-rawmemchr.c b/benchtests/bench-rawmemchr.c
index b1803afc14..dab77f3858 100644
--- a/benchtests/bench-rawmemchr.c
+++ b/benchtests/bench-rawmemchr.c
@@ -70,7 +70,7 @@ do_test (json_ctx_t *json_ctx, size_t align, size_t pos, size_t len, int seek_ch
   size_t i;
   char *result;
 
-  align &= 7;
+  align &= getpagesize () - 1;
   if (align + len >= page_size)
     return;
 
@@ -106,7 +106,6 @@ test_main (void)
 {
   json_ctx_t json_ctx;
   size_t i;
-
   test_init ();
 
   json_init (&json_ctx, 0, stdout);
@@ -120,7 +119,7 @@ test_main (void)
 
   json_array_begin (&json_ctx, "ifuncs");
   FOR_EACH_IMPL (impl, 0)
-      json_element_string (&json_ctx, impl->name);
+    json_element_string (&json_ctx, impl->name);
   json_array_end (&json_ctx);
 
   json_array_begin (&json_ctx, "results");
@@ -137,6 +136,31 @@ test_main (void)
       do_test (&json_ctx, 0, i, i + 1, 23);
       do_test (&json_ctx, 0, i, i + 1, 0);
     }
+  for (; i < 256; i += 32)
+    {
+      do_test (&json_ctx, 0, i, i + 1, 23);
+      do_test (&json_ctx, 0, i - 1, i, 23);
+    }
+  for (; i < 512; i += 64)
+    {
+      do_test (&json_ctx, 0, i, i + 1, 23);
+      do_test (&json_ctx, 0, i - 1, i, 23);
+    }
+  for (; i < 1024; i += 128)
+    {
+      do_test (&json_ctx, 0, i, i + 1, 23);
+      do_test (&json_ctx, 0, i - 1, i, 23);
+    }
+  for (; i < 2048; i += 256)
+    {
+      do_test (&json_ctx, 0, i, i + 1, 23);
+      do_test (&json_ctx, 0, i - 1, i, 23);
+    }
+  for (; i < 4096; i += 512)
+    {
+      do_test (&json_ctx, 0, i, i + 1, 23);
+      do_test (&json_ctx, 0, i - 1, i, 23);
+    }
 
   json_array_end (&json_ctx);
   json_attr_object_end (&json_ctx);
diff --git a/benchtests/bench-strchr.c b/benchtests/bench-strchr.c
index 54640bde7e..aeb882d442 100644
--- a/benchtests/bench-strchr.c
+++ b/benchtests/bench-strchr.c
@@ -287,8 +287,8 @@ int
 test_main (void)
 {
   json_ctx_t json_ctx;
-  size_t i;
 
+  size_t i, j;
   test_init ();
 
   json_init (&json_ctx, 0, stdout);
@@ -367,15 +367,30 @@ test_main (void)
       do_test (&json_ctx, 0, i, i + 1, 0, BIG_CHAR);
     }
 
-  DO_RAND_TEST(&json_ctx, 0, 15, 16, 0.0);
-  DO_RAND_TEST(&json_ctx, 0, 15, 16, 0.1);
-  DO_RAND_TEST(&json_ctx, 0, 15, 16, 0.25);
-  DO_RAND_TEST(&json_ctx, 0, 15, 16, 0.33);
-  DO_RAND_TEST(&json_ctx, 0, 15, 16, 0.5);
-  DO_RAND_TEST(&json_ctx, 0, 15, 16, 0.66);
-  DO_RAND_TEST(&json_ctx, 0, 15, 16, 0.75);
-  DO_RAND_TEST(&json_ctx, 0, 15, 16, 0.9);
-  DO_RAND_TEST(&json_ctx, 0, 15, 16, 1.0);
+  for (i = 16 / sizeof (CHAR); i <= 8192 / sizeof (CHAR); i += i)
+    {
+      for (j = 32 / sizeof (CHAR); j <= 320 / sizeof (CHAR);
+	   j += 32 / sizeof (CHAR))
+	{
+	  do_test (&json_ctx, 0, i, i + j, 0, MIDDLE_CHAR);
+	  do_test (&json_ctx, 0, i + j, i, 0, MIDDLE_CHAR);
+	  if (i > j)
+	    {
+	      do_test (&json_ctx, 0, i, i - j, 0, MIDDLE_CHAR);
+	      do_test (&json_ctx, 0, i - j, i, 0, MIDDLE_CHAR);
+	    }
+	}
+    }
+
+  DO_RAND_TEST (&json_ctx, 0, 15, 16, 0.0);
+  DO_RAND_TEST (&json_ctx, 0, 15, 16, 0.1);
+  DO_RAND_TEST (&json_ctx, 0, 15, 16, 0.25);
+  DO_RAND_TEST (&json_ctx, 0, 15, 16, 0.33);
+  DO_RAND_TEST (&json_ctx, 0, 15, 16, 0.5);
+  DO_RAND_TEST (&json_ctx, 0, 15, 16, 0.66);
+  DO_RAND_TEST (&json_ctx, 0, 15, 16, 0.75);
+  DO_RAND_TEST (&json_ctx, 0, 15, 16, 0.9);
+  DO_RAND_TEST (&json_ctx, 0, 15, 16, 1.0);
 
   json_array_end (&json_ctx);
   json_attr_object_end (&json_ctx);
diff --git a/benchtests/bench-strnlen.c b/benchtests/bench-strnlen.c
index 13b46b3f57..82c02eb6ed 100644
--- a/benchtests/bench-strnlen.c
+++ b/benchtests/bench-strnlen.c
@@ -195,19 +195,19 @@ test_main (void)
     {
       for (j = 0; j <= (704 / sizeof (CHAR)); j += (32 / sizeof (CHAR)))
 	{
-	  do_test (&json_ctx, 0, 1 << i, (i + j), BIG_CHAR);
 	  do_test (&json_ctx, 0, i + j, i, BIG_CHAR);
-
-	  do_test (&json_ctx, 64, 1 << i, (i + j), BIG_CHAR);
 	  do_test (&json_ctx, 64, i + j, i, BIG_CHAR);
 
+	  do_test (&json_ctx, 0, i, i + j, BIG_CHAR);
+	  do_test (&json_ctx, 64, i, i + j, BIG_CHAR);
+
 	  if (j < i)
 	    {
-	      do_test (&json_ctx, 0, 1 << i, i - j, BIG_CHAR);
 	      do_test (&json_ctx, 0, i - j, i, BIG_CHAR);
-
-	      do_test (&json_ctx, 64, 1 << i, i - j, BIG_CHAR);
 	      do_test (&json_ctx, 64, i - j, i, BIG_CHAR);
+
+	      do_test (&json_ctx, 0, i, i - j, BIG_CHAR);
+	      do_test (&json_ctx, 64, i, i - j, BIG_CHAR);
 	    }
 	}
     }
diff --git a/benchtests/bench-strrchr.c b/benchtests/bench-strrchr.c
index 7cd2a15484..3fcf3f281d 100644
--- a/benchtests/bench-strrchr.c
+++ b/benchtests/bench-strrchr.c
@@ -151,7 +151,7 @@ int
 test_main (void)
 {
   json_ctx_t json_ctx;
-  size_t i, j;
+  size_t i, j, k;
   int seek;
 
   test_init ();
@@ -173,7 +173,7 @@ test_main (void)
 
   for (seek = 0; seek <= 23; seek += 23)
     {
-      for (j = 1; j < 32; j += j)
+      for (j = 1; j <= 256; j = (j * 4))
 	{
 	  for (i = 1; i < 9; ++i)
 	    {
@@ -197,6 +197,30 @@ test_main (void)
 	      do_test (&json_ctx, getpagesize () - i / 2 - 1, i, i + 1, seek,
 		       SMALL_CHAR, j);
 	    }
+
+	  for (i = (16 / sizeof (CHAR)); i <= (288 / sizeof (CHAR)); i += 32)
+	    {
+	      do_test (&json_ctx, 0, i - 16, i, seek, SMALL_CHAR, j);
+	      do_test (&json_ctx, 0, i, i + 16, seek, SMALL_CHAR, j);
+	    }
+
+	  for (i = (16 / sizeof (CHAR)); i <= (2048 / sizeof (CHAR)); i += i)
+	    {
+	      for (k = 0; k <= (288 / sizeof (CHAR));
+		   k += (48 / sizeof (CHAR)))
+		{
+		  do_test (&json_ctx, 0, k, i, seek, SMALL_CHAR, j);
+		  do_test (&json_ctx, 0, i, i + k, seek, SMALL_CHAR, j);
+
+		  if (k < i)
+		    {
+		      do_test (&json_ctx, 0, i - k, i, seek, SMALL_CHAR, j);
+		      do_test (&json_ctx, 0, k, i - k, seek, SMALL_CHAR, j);
+		      do_test (&json_ctx, 0, i, i - k, seek, SMALL_CHAR, j);
+		    }
+		}
+	    }
+
 	  if (seek == 0)
 	    {
 	      break;
-- 
2.34.1
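
The medium-size coverage added across all five benchtests has the same
shape: power-of-two sizes with small +/- offsets applied to either the
position or the length, so both sides of each vector-width boundary
get sampled.  A condensed sketch (`print_case` and `run_case` are
stand-ins for the various do_test wrappers, not benchtest code):

    #include <stddef.h>
    #include <stdio.h>

    static void
    print_case (size_t pos, size_t len)
    {
      printf ("pos=%zu len=%zu\n", pos, len);
    }

    /* Condensed form of the added loops: sizes 16, 32, ..., 8192
       with offsets 0, 32, ..., 384 on either argument.  */
    static void
    sweep_medium (void (*run_case) (size_t, size_t))
    {
      for (size_t i = 16; i <= 8192; i += i)
        for (size_t j = 0; j <= 384; j += 32)
          {
            run_case (i + j, i);
            run_case (i, i + j);
            if (j < i)
              {
                run_case (i - j, i);
                run_case (i, i - j);
              }
          }
    }

    int
    main (void)
    {
      sweep_medium (print_case);
      return 0;
    }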


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH v2 7/7] Bench: Improve benchtests for memchr, strchr, strnlen, strrchr
  2022-10-19  0:01     ` H.J. Lu
@ 2022-10-19  0:44       ` Noah Goldstein
  0 siblings, 0 replies; 41+ messages in thread
From: Noah Goldstein @ 2022-10-19  0:44 UTC (permalink / raw)
  To: H.J. Lu; +Cc: libc-alpha, carlos

On Tue, Oct 18, 2022 at 5:01 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Tue, Oct 18, 2022 at 4:19 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > 1. Add more complete coverage in the medium size range.
> > 2. In strnlen remove the `1 << i` which was UB (`i` could go beyond
> >    32/64)
> > 3. Add timer for total benchmark runtime (useful for deciding about
> >    tradeoff between coverage and runtime).
>
> Please drop #3.

Fixed.
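
(For reference, the dropped #3 was just a wall-clock total around the
whole run; a sketch of that kind of timer, not the exact removed code:)

    #include <stdio.h>
    #include <time.h>

    int
    main (void)
    {
      struct timespec t0, t1;
      clock_gettime (CLOCK_MONOTONIC, &t0);
      /* ... run every benchmark case here ...  */
      clock_gettime (CLOCK_MONOTONIC, &t1);
      fprintf (stderr, "total benchmark runtime: %.3f s\n",
               (t1.tv_sec - t0.tv_sec)
               + (t1.tv_nsec - t0.tv_nsec) * 1e-9);
      return 0;
    }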
> --
> H.J.

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH v3 1/7] x86: Optimize memchr-evex.S and implement with VMM headers
  2022-10-19  0:44 ` [PATCH v3 1/7] x86: Optimize memchr-evex.S and implement with VMM headers Noah Goldstein
                     ` (5 preceding siblings ...)
  2022-10-19  0:44   ` [PATCH v3 7/7] Bench: Improve benchtests for memchr, strchr, strnlen, strrchr Noah Goldstein
@ 2022-10-19 16:52   ` H.J. Lu
  6 siblings, 0 replies; 41+ messages in thread
From: H.J. Lu @ 2022-10-19 16:52 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: libc-alpha, carlos

On Tue, Oct 18, 2022 at 5:44 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Optimizations are:
>
> 1. Use the fact that tzcnt(0) -> VEC_SIZE for memchr to save a branch
>    in short string case.
> 2. Restructure code so that small strings are given the hot path.
>         - This is a net-zero on the benchmark suite but in general makes
>       sense as smaller sizes are far more common.
> 3. Use more code-size efficient instructions.
>         - tzcnt ...     -> bsf ...
>         - vpcmpb $0 ... -> vpcmpeq ...
> 4. Align labels less aggressively, especially if it doesn't save fetch
>    blocks / causes the basic-block to span extra cache-lines.
>
> The optimizations (especially for point 2) make the memchr and
> rawmemchr code essentially incompatible so split rawmemchr-evex
> to a new file.
>
> Code Size Changes:
> memchr-evex.S       : -107 bytes
> rawmemchr-evex.S    :  -53 bytes
>
> Net perf changes:
>
> Reported as geometric mean of all improvements / regressions from N=10
> runs of the benchtests. Value as New Time / Old Time so < 1.0 is
> improvement and 1.0 is regression.
>
> memchr-evex.S       : 0.928
> rawmemchr-evex.S    : 0.986 (Less targets cross cache lines)
>
> Full results attached in email.
>
> Full check passes on x86-64.
> ---
>  sysdeps/x86_64/multiarch/memchr-evex.S        | 939 ++++++++++--------
>  sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S |   9 +-
>  sysdeps/x86_64/multiarch/rawmemchr-evex.S     | 313 +++++-
>  3 files changed, 851 insertions(+), 410 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S
> index 0dd4f1dcce..23a1c0018e 100644
> --- a/sysdeps/x86_64/multiarch/memchr-evex.S
> +++ b/sysdeps/x86_64/multiarch/memchr-evex.S
> @@ -21,17 +21,27 @@
>
>  #if ISA_SHOULD_BUILD (4)
>
> +# ifndef VEC_SIZE
> +#  include "x86-evex256-vecs.h"
> +# endif
> +
>  # ifndef MEMCHR
>  #  define MEMCHR       __memchr_evex
>  # endif
>
>  # ifdef USE_AS_WMEMCHR
> +#  define PC_SHIFT_GPR rcx
> +#  define VPTESTN      vptestnmd
>  #  define VPBROADCAST  vpbroadcastd
>  #  define VPMINU       vpminud
>  #  define VPCMP        vpcmpd
>  #  define VPCMPEQ      vpcmpeqd
>  #  define CHAR_SIZE    4
> +
> +#  define USE_WIDE_CHAR
>  # else
> +#  define PC_SHIFT_GPR rdi
> +#  define VPTESTN      vptestnmb
>  #  define VPBROADCAST  vpbroadcastb
>  #  define VPMINU       vpminub
>  #  define VPCMP        vpcmpb
> @@ -39,534 +49,661 @@
>  #  define CHAR_SIZE    1
>  # endif
>
> -       /* In the 4x loop the RTM and non-RTM versions have data pointer
> -          off by VEC_SIZE * 4 with RTM version being VEC_SIZE * 4 greater.
> -          This is represented by BASE_OFFSET. As well because the RTM
> -          version uses vpcmp which stores a bit per element compared where
> -          the non-RTM version uses vpcmpeq which stores a bit per byte
> -          compared RET_SCALE of CHAR_SIZE is only relevant for the RTM
> -          version.  */
> -# ifdef USE_IN_RTM
> +# include "reg-macros.h"
> +
> +
> +/* If not in an RTM and VEC_SIZE != 64 (the VEC_SIZE = 64
> +   doesn't have VEX encoding), use VEX encoding in loop so we
> +   can use vpcmpeqb + vptern which is more efficient than the
> +   EVEX alternative.  */
> +# if defined USE_IN_RTM || VEC_SIZE == 64
> +#  undef COND_VZEROUPPER
> +#  undef VZEROUPPER_RETURN
> +#  undef VZEROUPPER
> +
> +#  define COND_VZEROUPPER
> +#  define VZEROUPPER_RETURN    ret
>  #  define VZEROUPPER
> -#  define BASE_OFFSET  (VEC_SIZE * 4)
> -#  define RET_SCALE    CHAR_SIZE
> +
> +#  define USE_TERN_IN_LOOP     0
>  # else
> +#  define USE_TERN_IN_LOOP     1
> +#  undef VZEROUPPER
>  #  define VZEROUPPER   vzeroupper
> -#  define BASE_OFFSET  0
> -#  define RET_SCALE    1
>  # endif
>
> -       /* In the return from 4x loop memchr and rawmemchr versions have
> -          data pointers off by VEC_SIZE * 4 with memchr version being
> -          VEC_SIZE * 4 greater.  */
> -# ifdef USE_AS_RAWMEMCHR
> -#  define RET_OFFSET   (BASE_OFFSET - (VEC_SIZE * 4))
> -#  define RAW_PTR_REG  rcx
> -#  define ALGN_PTR_REG rdi
> +# if USE_TERN_IN_LOOP
> +       /* Resulting bitmask for vpmovmskb has 4-bits set for each wchar
> +          so we don't want to multiply resulting index.  */
> +#  define TERN_CHAR_MULT       1
> +
> +#  ifdef USE_AS_WMEMCHR
> +#   define TEST_END()  inc %VRCX
> +#  else
> +#   define TEST_END()  add %rdx, %rcx
> +#  endif
>  # else
> -#  define RET_OFFSET   BASE_OFFSET
> -#  define RAW_PTR_REG  rdi
> -#  define ALGN_PTR_REG rcx
> +#  define TERN_CHAR_MULT       CHAR_SIZE
> +#  define TEST_END()   KORTEST %k2, %k3
>  # endif
>
> -# define XMMZERO       xmm23
> -# define YMMZERO       ymm23
> -# define XMMMATCH      xmm16
> -# define YMMMATCH      ymm16
> -# define YMM1          ymm17
> -# define YMM2          ymm18
> -# define YMM3          ymm19
> -# define YMM4          ymm20
> -# define YMM5          ymm21
> -# define YMM6          ymm22
> +# if defined USE_AS_WMEMCHR || !USE_TERN_IN_LOOP
> +#  ifndef USE_AS_WMEMCHR
> +#   define GPR_X0_IS_RET       1
> +#  else
> +#   define GPR_X0_IS_RET       0
> +#  endif
> +#  define GPR_X0       rax
> +# else
> +#  define GPR_X0_IS_RET        0
> +#  define GPR_X0       rdx
> +# endif
> +
> +# define CHAR_PER_VEC  (VEC_SIZE / CHAR_SIZE)
>
> -# ifndef SECTION
> -#  define SECTION(p)   p##.evex
> +# if CHAR_PER_VEC == 64
> +#  define LAST_VEC_OFFSET      (VEC_SIZE * 3)
> +# else
> +#  define LAST_VEC_OFFSET      (VEC_SIZE * 2)
> +# endif
> +# if CHAR_PER_VEC >= 32
> +#  define MASK_GPR(...)        VGPR(__VA_ARGS__)
> +# elif CHAR_PER_VEC == 16
> +#  define MASK_GPR(reg)        VGPR_SZ(reg, 16)
> +# else
> +#  define MASK_GPR(reg)        VGPR_SZ(reg, 8)
>  # endif
>
> -# define VEC_SIZE 32
> -# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
> -# define PAGE_SIZE 4096
> +# define VMATCH        VMM(0)
> +# define VMATCH_LO     VMM_lo(0)
>
> -       .section SECTION(.text),"ax",@progbits
> +# define PAGE_SIZE     4096
> +
> +
> +       .section SECTION(.text), "ax", @progbits
>  ENTRY_P2ALIGN (MEMCHR, 6)
> -# ifndef USE_AS_RAWMEMCHR
>         /* Check for zero length.  */
>         test    %RDX_LP, %RDX_LP
> -       jz      L(zero)
> +       jz      L(zero_0)
>
> -#  ifdef __ILP32__
> +# ifdef __ILP32__
>         /* Clear the upper 32 bits.  */
>         movl    %edx, %edx
> -#  endif
>  # endif
> -       /* Broadcast CHAR to YMMMATCH.  */
> -       VPBROADCAST %esi, %YMMMATCH
> +       VPBROADCAST %esi, %VMATCH
>         /* Check if we may cross page boundary with one vector load.  */
>         movl    %edi, %eax
>         andl    $(PAGE_SIZE - 1), %eax
>         cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
> -       ja      L(cross_page_boundary)
> +       ja      L(page_cross)
> +
> +       VPCMPEQ (%rdi), %VMATCH, %k0
> +       KMOV    %k0, %VRAX
> +# ifndef USE_AS_WMEMCHR
> +       /* If rax is zero then tzcnt -> CHAR_PER_VEC.  NB: there is
> +          already a dependency between rax and rsi so no worries about
> +          false-dep here.  */
> +       tzcnt   %VRAX, %VRSI
> +       /* If rdx <= rsi then either 1) rax was non-zero (there was
> +          a match) but it was out of bounds or 2) rax was zero and rdx
> +          was <= VEC_SIZE so we are done scanning.  */
> +       cmpq    %rsi, %rdx
> +       /* NB: Use branch to return zero/non-zero.  Common usage will
> +          branch on result of function (if return is null/non-null).
> +          This branch can be used to predict the ensuing one so there
> +          is no reason to extend the data-dependency with cmovcc.  */
> +       jbe     L(zero_0)
> +
> +       /* If rax is zero then len must be > CHAR_PER_VEC, otherwise
> +          since we already tested len vs tzcnt(rax) (in rsi) we are
> +          good to return this match.  */
> +       test    %VRAX, %VRAX
> +       jz      L(more_1x_vec)
> +       leaq    (%rdi, %rsi), %rax
> +# else
>
> -       /* Check the first VEC_SIZE bytes.  */
> -       VPCMP   $0, (%rdi), %YMMMATCH, %k0
> -       kmovd   %k0, %eax
> -# ifndef USE_AS_RAWMEMCHR
> -       /* If length < CHAR_PER_VEC handle special.  */
> +       /* We can't use the `tzcnt` trick for wmemchr because
> +          CHAR_SIZE > 1, so if the mask is zero tzcnt != CHAR_PER_VEC.  */
>         cmpq    $CHAR_PER_VEC, %rdx
> -       jbe     L(first_vec_x0)
> -# endif
> -       testl   %eax, %eax
> -       jz      L(aligned_more)
> -       tzcntl  %eax, %eax
> -# ifdef USE_AS_WMEMCHR
> -       /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
> +       ja      L(more_1x_vec)
> +       tzcnt   %VRAX, %VRAX
> +       cmpl    %eax, %edx
> +       jbe     L(zero_0)
> +L(first_vec_x0_ret):
>         leaq    (%rdi, %rax, CHAR_SIZE), %rax
> -# else
> -       addq    %rdi, %rax
>  # endif
>         ret
>
> -# ifndef USE_AS_RAWMEMCHR
> -L(zero):
> -       xorl    %eax, %eax
> -       ret
> -
> -       .p2align 4
> -L(first_vec_x0):
> -       /* Check if first match was before length. NB: tzcnt has false data-
> -          dependency on destination. eax already had a data-dependency on esi
> -          so this should have no affect here.  */
> -       tzcntl  %eax, %esi
> -#  ifdef USE_AS_WMEMCHR
> -       leaq    (%rdi, %rsi, CHAR_SIZE), %rdi
> -#  else
> -       addq    %rsi, %rdi
> -#  endif
> +       /* Only fits in first cache line for VEC_SIZE == 32.  */
> +# if VEC_SIZE == 32
> +       .p2align 4,, 2
> +L(zero_0):
>         xorl    %eax, %eax
> -       cmpl    %esi, %edx
> -       cmovg   %rdi, %rax
>         ret
>  # endif
>
> -       .p2align 4
> -L(cross_page_boundary):
> -       /* Save pointer before aligning as its original value is
> -          necessary for computer return address if byte is found or
> -          adjusting length if it is not and this is memchr.  */
> -       movq    %rdi, %rcx
> -       /* Align data to VEC_SIZE. ALGN_PTR_REG is rcx for memchr and rdi
> -          for rawmemchr.  */
> -       andq    $-VEC_SIZE, %ALGN_PTR_REG
> -       VPCMP   $0, (%ALGN_PTR_REG), %YMMMATCH, %k0
> -       kmovd   %k0, %r8d
> +       .p2align 4,, 9
> +L(more_1x_vec):
>  # ifdef USE_AS_WMEMCHR
> -       /* NB: Divide shift count by 4 since each bit in K0 represent 4
> -          bytes.  */
> -       sarl    $2, %eax
> -# endif
> -# ifndef USE_AS_RAWMEMCHR
> -       movl    $(PAGE_SIZE / CHAR_SIZE), %esi
> -       subl    %eax, %esi
> +       /* For wmemchr we still need to test if there was a match in
> +          the first VEC.  Use bsf to test here so we can reuse
> +          L(first_vec_x0_ret).  */
> +       bsf     %VRAX, %VRAX
> +       jnz     L(first_vec_x0_ret)
>  # endif
> +
> +L(page_cross_continue):
>  # ifdef USE_AS_WMEMCHR
> -       andl    $(CHAR_PER_VEC - 1), %eax
> -# endif
> -       /* Remove the leading bytes.  */
> -       sarxl   %eax, %r8d, %eax
> -# ifndef USE_AS_RAWMEMCHR
> -       /* Check the end of data.  */
> -       cmpq    %rsi, %rdx
> -       jbe     L(first_vec_x0)
> +       /* We can't use the end of the buffer to re-calculate length for
> +          wmemchr as len * CHAR_SIZE may overflow.  */
> +       leaq    -(VEC_SIZE + CHAR_SIZE)(%rdi), %rax
> +       andq    $(VEC_SIZE * -1), %rdi
> +       subq    %rdi, %rax
> +       sarq    $2, %rax
> +       addq    %rdx, %rax
> +# else
> +       leaq    -(VEC_SIZE + 1)(%rdx, %rdi), %rax
> +       andq    $(VEC_SIZE * -1), %rdi
> +       subq    %rdi, %rax
>  # endif
> -       testl   %eax, %eax
> -       jz      L(cross_page_continue)
> -       tzcntl  %eax, %eax
> +
> +       /* rax contains remaining length - 1.  -1 so we can get imm8
> +          encoding in a few additional places saving code size.  */
> +
> +       /* Needed regardless of remaining length.  */
> +       VPCMPEQ VEC_SIZE(%rdi), %VMATCH, %k0
> +       KMOV    %k0, %VRDX
> +
> +       /* We cannot fold the above `sub %rdi, %rax` with the `cmp
> +          $(CHAR_PER_VEC * 2), %rax` because it's possible for a very
> +          large length to overflow and cause the subtract to carry
> +          despite length being above CHAR_PER_VEC * 2.  */
> +       cmpq    $(CHAR_PER_VEC * 2 - 1), %rax
> +       ja      L(more_2x_vec)
> +L(last_2x_vec):
> +
> +       test    %VRDX, %VRDX
> +       jnz     L(first_vec_x1_check)
> +
> +       /* Check the end of data.  NB: use 8-bit operations to save code
> +          size.  We no longer need the full-width of eax and will
> +          perform a write-only operation over eax so there will be no
> +          partial-register stalls.  */
> +       subb    $(CHAR_PER_VEC * 1 - 1), %al
> +       jle     L(zero_0)
> +
> +       VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMATCH, %k0
> +       KMOV    %k0, %VRCX
>  # ifdef USE_AS_WMEMCHR
> -       /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
> -       leaq    (%RAW_PTR_REG, %rax, CHAR_SIZE), %rax
> +       /* For wmemchr we can't take advantage of tzcnt(0) ==
> +          VEC_SIZE as CHAR_PER_VEC != VEC_SIZE.  */
> +       test    %VRCX, %VRCX
> +       jz      L(zero_0)
> +# endif
> +       tzcnt   %VRCX, %VRCX
> +       cmp     %cl, %al
> +
> +       /* Same CFG for VEC_SIZE == 64 and VEC_SIZE == 32.  We give
> +          fallthrough to L(zero_0) for VEC_SIZE == 64 here as there is
> +          not enough space before the next cache line to fit the `lea`
> +          for return.  */
> +# if VEC_SIZE == 64
> +       ja      L(first_vec_x2_ret)
> +L(zero_0):
> +       xorl    %eax, %eax
> +       ret
>  # else
> -       addq    %RAW_PTR_REG, %rax
> +       jbe     L(zero_0)
> +       leaq    (VEC_SIZE * 2)(%rdi, %rcx, CHAR_SIZE), %rax
> +       ret
>  # endif
> +
> +       .p2align 4,, 5
> +L(first_vec_x1_check):
> +       bsf     %VRDX, %VRDX
> +       cmpb    %dl, %al
> +       jb      L(zero_4)
> +       leaq    (VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %rax
>         ret
>
> -       .p2align 4
> -L(first_vec_x1):
> -       tzcntl  %eax, %eax
> -       leaq    VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
> +       /* Fits at the end of the cache line here for VEC_SIZE == 32.
> +        */
> +# if VEC_SIZE == 32
> +L(zero_4):
> +       xorl    %eax, %eax
>         ret
> +# endif
>
> -       .p2align 4
> +
> +       .p2align 4,, 4
>  L(first_vec_x2):
> -       tzcntl  %eax, %eax
> -       leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> +       bsf     %VRCX, %VRCX
> +L(first_vec_x2_ret):
> +       leaq    (VEC_SIZE * 2)(%rdi, %rcx, CHAR_SIZE), %rax
>         ret
>
> -       .p2align 4
> -L(first_vec_x3):
> -       tzcntl  %eax, %eax
> -       leaq    (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
> +       /* Fits at the end of the cache line here for VEC_SIZE == 64.
> +        */
> +# if VEC_SIZE == 64
> +L(zero_4):
> +       xorl    %eax, %eax
>         ret
> +# endif
>
> -       .p2align 4
> -L(first_vec_x4):
> -       tzcntl  %eax, %eax
> -       leaq    (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
> +       .p2align 4,, 4
> +L(first_vec_x1):
> +       bsf     %VRDX, %VRDX
> +       leaq    (VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %rax
>         ret
>
> -       .p2align 5
> -L(aligned_more):
> -       /* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
> -          since data is only aligned to VEC_SIZE.  */
>
> -# ifndef USE_AS_RAWMEMCHR
> -       /* Align data to VEC_SIZE.  */
> -L(cross_page_continue):
> -       xorl    %ecx, %ecx
> -       subl    %edi, %ecx
> -       andq    $-VEC_SIZE, %rdi
> -       /* esi is for adjusting length to see if near the end.  */
> -       leal    (VEC_SIZE * 5)(%rdi, %rcx), %esi
> -#  ifdef USE_AS_WMEMCHR
> -       /* NB: Divide bytes by 4 to get the wchar_t count.  */
> -       sarl    $2, %esi
> -#  endif
> -# else
> -       andq    $-VEC_SIZE, %rdi
> -L(cross_page_continue):
> -# endif
> -       /* Load first VEC regardless.  */
> -       VPCMP   $0, (VEC_SIZE)(%rdi), %YMMMATCH, %k0
> -       kmovd   %k0, %eax
> -# ifndef USE_AS_RAWMEMCHR
> -       /* Adjust length. If near end handle specially.  */
> -       subq    %rsi, %rdx
> -       jbe     L(last_4x_vec_or_less)
> -# endif
> -       testl   %eax, %eax
> +       .p2align 4,, 5
> +L(more_2x_vec):
> +       /* Length > VEC_SIZE * 2 so check first 2x VEC before rechecking
> +          length.  */
> +
> +
> +       /* Already computed matches for first VEC in rdx.  */
> +       test    %VRDX, %VRDX
>         jnz     L(first_vec_x1)
>
> -       VPCMP   $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
> -       kmovd   %k0, %eax
> -       testl   %eax, %eax
> +
> +       VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMATCH, %k0
> +       KMOV    %k0, %VRCX
> +       test    %VRCX, %VRCX
>         jnz     L(first_vec_x2)
>
> -       VPCMP   $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
> -       kmovd   %k0, %eax
> -       testl   %eax, %eax
> +       /* Needed regardless of next length check.  */
> +       VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMATCH, %k0
> +       KMOV    %k0, %VRCX
> +
> +       /* Check if we are near the end.  */
> +       cmpq    $(CHAR_PER_VEC * 4 - 1), %rax
> +       ja      L(more_4x_vec)
> +
> +       test    %VRCX, %VRCX
> +       jnz     L(first_vec_x3_check)
> +
> +       /* Use 8-bit instructions to save code size.  We won't use full-
> +          width eax again and will perform a write-only operation to
> +          eax so no worries about partial-register stalls.  */
> +       subb    $(CHAR_PER_VEC * 3), %al
> +       jb      L(zero_2)
> +L(last_vec_check):
> +       VPCMPEQ (VEC_SIZE * 4)(%rdi), %VMATCH, %k0
> +       KMOV    %k0, %VRCX
> +# ifdef USE_AS_WMEMCHR
> +       /* For wmemchr we can't take advantage of tzcnt(0) ==
> +          VEC_SIZE as CHAR_PER_VEC != VEC_SIZE.  */
> +       test    %VRCX, %VRCX
> +       jz      L(zero_2)
> +# endif
> +       tzcnt   %VRCX, %VRCX
> +       cmp     %cl, %al
> +       jae     L(first_vec_x4_ret)
> +L(zero_2):
> +       xorl    %eax, %eax
> +       ret
> +
> +       /* Fits at the end of the cache line here for VEC_SIZE == 64.
> +          For VEC_SIZE == 32 we put the return label at the end of
> +          L(first_vec_x4).  */
> +# if VEC_SIZE == 64
> +L(first_vec_x4_ret):
> +       leaq    (VEC_SIZE * 4)(%rdi, %rcx, CHAR_SIZE), %rax
> +       ret
> +# endif
> +
> +       .p2align 4,, 6
> +L(first_vec_x4):
> +       bsf     %VRCX, %VRCX
> +# if VEC_SIZE == 32
> +       /* Place L(first_vec_x4_ret) here as we can't fit it in the same
> +          cache line as where it is called from so we might as well
> +          save code size by reusing return of L(first_vec_x4).  */
> +L(first_vec_x4_ret):
> +# endif
> +       leaq    (VEC_SIZE * 4)(%rdi, %rcx, CHAR_SIZE), %rax
> +       ret
> +
> +       .p2align 4,, 6
> +L(first_vec_x3_check):
> +       /* Need to adjust remaining length before checking.  */
> +       addb    $-(CHAR_PER_VEC * 2), %al
> +       bsf     %VRCX, %VRCX
> +       cmpb    %cl, %al
> +       jb      L(zero_2)
> +       leaq    (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %rax
> +       ret
> +
> +       .p2align 4,, 6
> +L(first_vec_x3):
> +       bsf     %VRCX, %VRCX
> +       leaq    (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %rax
> +       ret
> +
> +       .p2align 4,, 3
> +# if !USE_TERN_IN_LOOP
> +       .p2align 4,, 10
> +# endif
> +L(more_4x_vec):
> +       test    %VRCX, %VRCX
>         jnz     L(first_vec_x3)
>
> -       VPCMP   $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
> -       kmovd   %k0, %eax
> -       testl   %eax, %eax
> +       VPCMPEQ (VEC_SIZE * 4)(%rdi), %VMATCH, %k0
> +       KMOV    %k0, %VRCX
> +       test    %VRCX, %VRCX
>         jnz     L(first_vec_x4)
>
> +       subq    $-(VEC_SIZE * 5), %rdi
> +       subq    $(CHAR_PER_VEC * 8), %rax
> +       jb      L(last_4x_vec)
>
> -# ifndef USE_AS_RAWMEMCHR
> -       /* Check if at last CHAR_PER_VEC * 4 length.  */
> -       subq    $(CHAR_PER_VEC * 4), %rdx
> -       jbe     L(last_4x_vec_or_less_cmpeq)
> -       /* +VEC_SIZE if USE_IN_RTM otherwise +VEC_SIZE * 5.  */
> -       addq    $(VEC_SIZE + (VEC_SIZE * 4 - BASE_OFFSET)), %rdi
> -
> -       /* Align data to VEC_SIZE * 4 for the loop and readjust length.
> -        */
> -#  ifdef USE_AS_WMEMCHR
> +# ifdef USE_AS_WMEMCHR
>         movl    %edi, %ecx
> -       andq    $-(4 * VEC_SIZE), %rdi
> +# else
> +       addq    %rdi, %rax
> +# endif
> +
> +
> +# if VEC_SIZE == 64
> +       /* Use xorb to do `andq $-(VEC_SIZE * 4), %rdi`.  No EVEX
> +          processor has partial register stalls (all have merging
> +          uop). If that changes this can be removed.  */
> +       xorb    %dil, %dil
> +# else
> +       andq    $-(VEC_SIZE * 4), %rdi
> +# endif
> +
> +# ifdef USE_AS_WMEMCHR
>         subl    %edi, %ecx
> -       /* NB: Divide bytes by 4 to get the wchar_t count.  */
>         sarl    $2, %ecx
> -       addq    %rcx, %rdx
> -#  else
> -       addq    %rdi, %rdx
> -       andq    $-(4 * VEC_SIZE), %rdi
> -       subq    %rdi, %rdx
> -#  endif
> +       addq    %rcx, %rax
>  # else
> -       addq    $(VEC_SIZE + (VEC_SIZE * 4 - BASE_OFFSET)), %rdi
> -       andq    $-(4 * VEC_SIZE), %rdi
> +       subq    %rdi, %rax
>  # endif
> -# ifdef USE_IN_RTM
> -       vpxorq  %XMMZERO, %XMMZERO, %XMMZERO
> -# else
> -       /* copy ymmmatch to ymm0 so we can use vpcmpeq which is not
> -          encodable with EVEX registers (ymm16-ymm31).  */
> -       vmovdqa64 %YMMMATCH, %ymm0
> +
> +
> +
> +# if USE_TERN_IN_LOOP
> +       /* copy VMATCH to low ymm so we can use vpcmpeq which is not
> +          encodable with EVEX registers.  NB: this is VEC_SIZE == 32
> +          only as there is no way to encode vpcmpeq with zmm0-15.  */
> +       vmovdqa64 %VMATCH, %VMATCH_LO
>  # endif
>
> -       /* Compare 4 * VEC at a time forward.  */
> -       .p2align 4
> +       .p2align 4,, 11
>  L(loop_4x_vec):
> -       /* Two versions of the loop. One that does not require
> -          vzeroupper by not using ymm0-ymm15 and another does that require
> -          vzeroupper because it uses ymm0-ymm15. The reason why ymm0-ymm15
> -          is used at all is because there is no EVEX encoding vpcmpeq and
> -          with vpcmpeq this loop can be performed more efficiently. The
> -          non-vzeroupper version is safe for RTM while the vzeroupper
> -          version should be prefered if RTM are not supported.  */
> -# ifdef USE_IN_RTM
> -       /* It would be possible to save some instructions using 4x VPCMP
> -          but bottleneck on port 5 makes it not woth it.  */
> -       VPCMP   $4, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k1
> -       /* xor will set bytes match esi to zero.  */
> -       vpxorq  (VEC_SIZE * 5)(%rdi), %YMMMATCH, %YMM2
> -       vpxorq  (VEC_SIZE * 6)(%rdi), %YMMMATCH, %YMM3
> -       VPCMP   $0, (VEC_SIZE * 7)(%rdi), %YMMMATCH, %k3
> -       /* Reduce VEC2 / VEC3 with min and VEC1 with zero mask.  */
> -       VPMINU  %YMM2, %YMM3, %YMM3{%k1}{z}
> -       VPCMP   $0, %YMM3, %YMMZERO, %k2
> -# else
> +       /* Two versions of the loop.  One that does not require
> +          vzeroupper by not using ymm0-15 and another that does
> +          require vzeroupper because it uses ymm0-15.  The reason
> +          ymm0-15 is used at all is that there is no EVEX encoding
> +          of vpcmpeq, and with vpcmpeq this loop can be performed
> +          more efficiently.  The non-vzeroupper version is safe for
> +          RTM while the vzeroupper version should be preferred if
> +          RTM is not supported.  Which loop version we use is
> +          determined by USE_TERN_IN_LOOP.  */
> +
> +# if USE_TERN_IN_LOOP
>         /* Since vptern can only take 3x vectors fastest to do 1 vec
>            seperately with EVEX vpcmp.  */
>  #  ifdef USE_AS_WMEMCHR
>         /* vptern can only accept masks for epi32/epi64 so can only save
> -          instruction using not equals mask on vptern with wmemchr.  */
> -       VPCMP   $4, (%rdi), %YMMMATCH, %k1
> +          instruction using not equals mask on vptern with wmemchr.
> +        */
> +       VPCMP   $4, (VEC_SIZE * 0)(%rdi), %VMATCH, %k1
>  #  else
> -       VPCMP   $0, (%rdi), %YMMMATCH, %k1
> +       VPCMPEQ (VEC_SIZE * 0)(%rdi), %VMATCH, %k1
>  #  endif
>         /* Compare 3x with vpcmpeq and or them all together with vptern.
>          */
> -       VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm2
> -       VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm3
> -       VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm4
> +       VPCMPEQ (VEC_SIZE * 1)(%rdi), %VMATCH_LO, %VMM_lo(2)
> +       VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMATCH_LO, %VMM_lo(3)
> +       VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMATCH_LO, %VMM_lo(4)
>  #  ifdef USE_AS_WMEMCHR
> -       /* This takes the not of or between ymm2, ymm3, ymm4 as well as
> -          combines result from VEC0 with zero mask.  */
> -       vpternlogd $1, %ymm2, %ymm3, %ymm4{%k1}{z}
> -       vpmovmskb %ymm4, %ecx
> +       /* This takes the not of or between VEC_lo(2), VEC_lo(3),
> +          VEC_lo(4) as well as combines result from VEC(0) with zero
> +          mask.  */
> +       vpternlogd $1, %VMM_lo(2), %VMM_lo(3), %VMM_lo(4){%k1}{z}
> +       vpmovmskb %VMM_lo(4), %VRCX
>  #  else
> -       /* 254 is mask for oring ymm2, ymm3, ymm4 into ymm4.  */
> -       vpternlogd $254, %ymm2, %ymm3, %ymm4
> -       vpmovmskb %ymm4, %ecx
> -       kmovd   %k1, %eax
> +       /* 254 is mask for oring VEC_lo(2), VEC_lo(3), VEC_lo(4) into
> +          VEC_lo(4).  */
> +       vpternlogd $254, %VMM_lo(2), %VMM_lo(3), %VMM_lo(4)
> +       vpmovmskb %VMM_lo(4), %VRCX
> +       KMOV    %k1, %edx
>  #  endif
> -# endif
>
> -# ifdef USE_AS_RAWMEMCHR
> -       subq    $-(VEC_SIZE * 4), %rdi
> -# endif
> -# ifdef USE_IN_RTM
> -       kortestd %k2, %k3
>  # else
> -#  ifdef USE_AS_WMEMCHR
> -       /* ecx contains not of matches. All 1s means no matches. incl will
> -          overflow and set zeroflag if that is the case.  */
> -       incl    %ecx
> -#  else
> -       /* If either VEC1 (eax) or VEC2-VEC4 (ecx) are not zero. Adding
> -          to ecx is not an issue because if eax is non-zero it will be
> -          used for returning the match. If it is zero the add does
> -          nothing.  */
> -       addq    %rax, %rcx
> -#  endif
> +       /* Loop version that uses EVEX encoding.  */
> +       VPCMP   $4, (VEC_SIZE * 0)(%rdi), %VMATCH, %k1
> +       vpxorq  (VEC_SIZE * 1)(%rdi), %VMATCH, %VMM(2)
> +       vpxorq  (VEC_SIZE * 2)(%rdi), %VMATCH, %VMM(3)
> +       VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMATCH, %k3
> +       VPMINU  %VMM(2), %VMM(3), %VMM(3){%k1}{z}
> +       VPTESTN %VMM(3), %VMM(3), %k2
>  # endif
> -# ifdef USE_AS_RAWMEMCHR
> -       jz      L(loop_4x_vec)
> -# else
> -       jnz     L(loop_4x_vec_end)
> +
> +
> +       TEST_END ()
> +       jnz     L(loop_vec_ret)
>
>         subq    $-(VEC_SIZE * 4), %rdi
>
> -       subq    $(CHAR_PER_VEC * 4), %rdx
> -       ja      L(loop_4x_vec)
> +       subq    $(CHAR_PER_VEC * 4), %rax
> +       jae     L(loop_4x_vec)
>
> -       /* Fall through into less than 4 remaining vectors of length case.
> +       /* COND_VZEROUPPER is vzeroupper if we use the VEX encoded loop.
>          */
> -       VPCMP   $0, BASE_OFFSET(%rdi), %YMMMATCH, %k0
> -       addq    $(BASE_OFFSET - VEC_SIZE), %rdi
> -       kmovd   %k0, %eax
> -       VZEROUPPER
> -
> -L(last_4x_vec_or_less):
> -       /* Check if first VEC contained match.  */
> -       testl   %eax, %eax
> -       jnz     L(first_vec_x1_check)
> +       COND_VZEROUPPER
>
> -       /* If remaining length > CHAR_PER_VEC * 2.  */
> -       addl    $(CHAR_PER_VEC * 2), %edx
> -       jg      L(last_4x_vec)
> -
> -L(last_2x_vec):
> -       /* If remaining length < CHAR_PER_VEC.  */
> -       addl    $CHAR_PER_VEC, %edx
> -       jle     L(zero_end)
> -
> -       /* Check VEC2 and compare any match with remaining length.  */
> -       VPCMP   $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
> -       kmovd   %k0, %eax
> -       tzcntl  %eax, %eax
> -       cmpl    %eax, %edx
> -       jbe     L(set_zero_end)
> -       leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> -L(zero_end):
> -       ret
> +       .p2align 4,, 10
> +L(last_4x_vec):
> +       /* For CHAR_PER_VEC == 64 we don't need to mask as we use 8-bit
> +          instructions on eax from here on out.  */
> +# if CHAR_PER_VEC != 64
> +       andl    $(CHAR_PER_VEC * 4 - 1), %eax
> +# endif
> +       VPCMPEQ (VEC_SIZE * 0)(%rdi), %VMATCH, %k0
> +       subq    $(VEC_SIZE * 1), %rdi
> +       KMOV    %k0, %VRDX
> +       cmpb    $(CHAR_PER_VEC * 2 - 1), %al
> +       jbe     L(last_2x_vec)
> +       test    %VRDX, %VRDX
> +       jnz     L(last_vec_x1_novzero)
> +
> +       VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMATCH, %k0
> +       KMOV    %k0, %VRDX
> +       test    %VRDX, %VRDX
> +       jnz     L(last_vec_x2_novzero)
> +
> +       VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMATCH, %k0
> +       KMOV    %k0, %VRCX
> +       test    %VRCX, %VRCX
> +       jnz     L(first_vec_x3_check)
> +
> +       subb    $(CHAR_PER_VEC * 3), %al
> +       jae     L(last_vec_check)
>
> -L(set_zero_end):
>         xorl    %eax, %eax
>         ret
>
> -       .p2align 4
> -L(first_vec_x1_check):
> -       /* eax must be non-zero. Use bsfl to save code size.  */
> -       bsfl    %eax, %eax
> -       /* Adjust length.  */
> -       subl    $-(CHAR_PER_VEC * 4), %edx
> -       /* Check if match within remaining length.  */
> -       cmpl    %eax, %edx
> -       jbe     L(set_zero_end)
> -       /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
> -       leaq    VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
> +# if defined USE_AS_WMEMCHR && USE_TERN_IN_LOOP
> +L(last_vec_x2_novzero):
> +       addq    $VEC_SIZE, %rdi
> +L(last_vec_x1_novzero):
> +       bsf     %VRDX, %VRDX
> +       leaq    (VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %rax
>         ret
> +# endif
>
> -       .p2align 4
> -L(loop_4x_vec_end):
> +# if CHAR_PER_VEC == 64
> +       /* Since we can't combine the last 2x VEC when CHAR_PER_VEC ==
> +          64 it needs a separate return label.  */
> +       .p2align 4,, 4
> +L(last_vec_x2):
> +L(last_vec_x2_novzero):
> +       bsf     %VRDX, %VRDX
> +       leaq    (VEC_SIZE * 2)(%rdi, %rdx, TERN_CHAR_MULT), %rax
> +       ret
>  # endif
> -       /* rawmemchr will fall through into this if match was found in
> -          loop.  */
>
> -# if defined USE_IN_RTM || defined USE_AS_WMEMCHR
> -       /* k1 has not of matches with VEC1.  */
> -       kmovd   %k1, %eax
> -#  ifdef USE_AS_WMEMCHR
> -       subl    $((1 << CHAR_PER_VEC) - 1), %eax
> -#  else
> -       incl    %eax
> -#  endif
> +       .p2align 4,, 4
> +L(loop_vec_ret):
> +# if defined USE_AS_WMEMCHR || !USE_TERN_IN_LOOP
> +       KMOV    %k1, %VRAX
> +       inc     %MASK_GPR(rax)
>  # else
> -       /* eax already has matches for VEC1.  */
> -       testl   %eax, %eax
> +       test    %VRDX, %VRDX
>  # endif
> -       jnz     L(last_vec_x1_return)
> +       jnz     L(last_vec_x0)
>
> -# ifdef USE_IN_RTM
> -       VPCMP   $0, %YMM2, %YMMZERO, %k0
> -       kmovd   %k0, %eax
> +
> +# if USE_TERN_IN_LOOP
> +       vpmovmskb %VMM_lo(2), %VRDX
>  # else
> -       vpmovmskb %ymm2, %eax
> +       VPTESTN %VMM(2), %VMM(2), %k1
> +       KMOV    %k1, %VRDX
>  # endif
> -       testl   %eax, %eax
> -       jnz     L(last_vec_x2_return)
> +       test    %VRDX, %VRDX
> +       jnz     L(last_vec_x1)
>
> -# ifdef USE_IN_RTM
> -       kmovd   %k2, %eax
> -       testl   %eax, %eax
> -       jnz     L(last_vec_x3_return)
>
> -       kmovd   %k3, %eax
> -       tzcntl  %eax, %eax
> -       leaq    (VEC_SIZE * 3 + RET_OFFSET)(%rdi, %rax, CHAR_SIZE), %rax
> +# if USE_TERN_IN_LOOP
> +       vpmovmskb %VMM_lo(3), %VRDX
>  # else
> -       vpmovmskb %ymm3, %eax
> -       /* Combine matches in VEC3 (eax) with matches in VEC4 (ecx).  */
> -       salq    $VEC_SIZE, %rcx
> -       orq     %rcx, %rax
> -       tzcntq  %rax, %rax
> -       leaq    (VEC_SIZE * 2 + RET_OFFSET)(%rdi, %rax), %rax
> -       VZEROUPPER
> +       KMOV    %k2, %VRDX
>  # endif
> -       ret
>
> -       .p2align 4,, 10
> -L(last_vec_x1_return):
> -       tzcntl  %eax, %eax
> -# if defined USE_AS_WMEMCHR || RET_OFFSET != 0
> -       /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
> -       leaq    RET_OFFSET(%rdi, %rax, CHAR_SIZE), %rax
> +       /* No longer need any of the lo vecs (ymm0-15) so vzeroupper
> +          (only if we used the VEX encoded loop).  */
> +       COND_VZEROUPPER
> +
> +       /* Separate logic for CHAR_PER_VEC == 64 vs the rest.  For
> +          CHAR_PER_VEC == 64 we test the last 2x VEC separately, for
> +          CHAR_PER_VEC <= 32 we can combine the results from the 2x
> +          VEC in a single GPR.  */
> +# if CHAR_PER_VEC == 64
> +#  if USE_TERN_IN_LOOP
> +#   error "Unsupported"
> +#  endif
> +
> +
> +       /* If CHAR_PER_VEC == 64 we can't combine the last two VEC.  */
> +       test    %VRDX, %VRDX
> +       jnz     L(last_vec_x2)
> +       KMOV    %k3, %VRDX
>  # else
> -       addq    %rdi, %rax
> +       /* CHAR_PER_VEC <= 32 so we can combine the results from the
> +          last 2x VEC.  */
> +
> +#  if !USE_TERN_IN_LOOP
> +       KMOV    %k3, %VRCX
> +#  endif
> +       salq    $(VEC_SIZE / TERN_CHAR_MULT), %rcx
> +       addq    %rcx, %rdx
> +#  if !defined USE_AS_WMEMCHR || !USE_TERN_IN_LOOP
> +L(last_vec_x2_novzero):
> +#  endif
>  # endif
> -       VZEROUPPER
> +       bsf     %rdx, %rdx
> +       leaq    (LAST_VEC_OFFSET)(%rdi, %rdx, TERN_CHAR_MULT), %rax
>         ret
>
> -       .p2align 4
> -L(last_vec_x2_return):
> -       tzcntl  %eax, %eax
> -       /* NB: Multiply bytes by RET_SCALE to get the wchar_t count
> -          if relevant (RET_SCALE = CHAR_SIZE if USE_AS_WMEMCHAR and
> -          USE_IN_RTM are both defined. Otherwise RET_SCALE = 1.  */
> -       leaq    (VEC_SIZE + RET_OFFSET)(%rdi, %rax, RET_SCALE), %rax
> -       VZEROUPPER
> +       .p2align 4,, 8
> +L(last_vec_x1):
> +       COND_VZEROUPPER
> +# if !defined USE_AS_WMEMCHR || !USE_TERN_IN_LOOP
> +L(last_vec_x1_novzero):
> +# endif
> +       bsf     %VRDX, %VRDX
> +       leaq    (VEC_SIZE * 1)(%rdi, %rdx, TERN_CHAR_MULT), %rax
>         ret
>
> -# ifdef USE_IN_RTM
> -       .p2align 4
> -L(last_vec_x3_return):
> -       tzcntl  %eax, %eax
> -       /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
> -       leaq    (VEC_SIZE * 2 + RET_OFFSET)(%rdi, %rax, CHAR_SIZE), %rax
> +
> +       .p2align 4,, 4
> +L(last_vec_x0):
> +       COND_VZEROUPPER
> +       bsf     %VGPR(GPR_X0), %VGPR(GPR_X0)
> +# if GPR_X0_IS_RET
> +       addq    %rdi, %rax
> +# else
> +       leaq    (%rdi, %GPR_X0, CHAR_SIZE), %rax
> +# endif
>         ret
> +
> +       .p2align 4,, 6
> +L(page_cross):
> +       /* Need to preserve eax to compute inbound bytes we are
> +          checking.  */
> +# ifdef USE_AS_WMEMCHR
> +       movl    %eax, %ecx
> +# else
> +       xorl    %ecx, %ecx
> +       subl    %eax, %ecx
>  # endif
>
> -# ifndef USE_AS_RAWMEMCHR
> -       .p2align 4,, 5
> -L(last_4x_vec_or_less_cmpeq):
> -       VPCMP   $0, (VEC_SIZE * 5)(%rdi), %YMMMATCH, %k0
> -       kmovd   %k0, %eax
> -       subq    $-(VEC_SIZE * 4), %rdi
> -       /* Check first VEC regardless.  */
> -       testl   %eax, %eax
> -       jnz     L(first_vec_x1_check)
> +       xorq    %rdi, %rax
> +       VPCMPEQ (PAGE_SIZE - VEC_SIZE)(%rax), %VMATCH, %k0
> +       KMOV    %k0, %VRAX
>
> -       /* If remaining length <= CHAR_PER_VEC * 2.  */
> -       addl    $(CHAR_PER_VEC * 2), %edx
> -       jle     L(last_2x_vec)
> +# ifdef USE_AS_WMEMCHR
> +       /* NB: Divide by CHAR_SIZE to shift out the out-of-bounds bytes.  */
> +       shrl    $2, %ecx
> +       andl    $(CHAR_PER_VEC - 1), %ecx
> +# endif
>
> -       .p2align 4
> -L(last_4x_vec):
> -       VPCMP   $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
> -       kmovd   %k0, %eax
> -       testl   %eax, %eax
> -       jnz     L(last_vec_x2)
>
> +       shrx    %VGPR(PC_SHIFT_GPR), %VRAX, %VRAX
>
> -       VPCMP   $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
> -       kmovd   %k0, %eax
> -       /* Create mask for possible matches within remaining length.  */
> -#  ifdef USE_AS_WMEMCHR
> -       movl    $((1 << (CHAR_PER_VEC * 2)) - 1), %ecx
> -       bzhil   %edx, %ecx, %ecx
> -#  else
> -       movq    $-1, %rcx
> -       bzhiq   %rdx, %rcx, %rcx
> -#  endif
> -       /* Test matches in data against length match.  */
> -       andl    %ecx, %eax
> -       jnz     L(last_vec_x3)
> +# ifdef USE_AS_WMEMCHR
> +       negl    %ecx
> +# endif
>
> -       /* if remaining length <= CHAR_PER_VEC * 3 (Note this is after
> -          remaining length was found to be > CHAR_PER_VEC * 2.  */
> -       subl    $CHAR_PER_VEC, %edx
> -       jbe     L(zero_end2)
> +       /* Mask the lower bits of ecx (negative eax) to get the bytes
> +          until the next VEC.  */
> +       andl    $(CHAR_PER_VEC - 1), %ecx
>
> +       /* Check if VEC is entirely contained in the remainder of the
> +          page.  */
> +       cmpq    %rcx, %rdx
> +       jbe     L(page_cross_ret)
>
> -       VPCMP   $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
> -       kmovd   %k0, %eax
> -       /* Shift remaining length mask for last VEC.  */
> -#  ifdef USE_AS_WMEMCHR
> -       shrl    $CHAR_PER_VEC, %ecx
> -#  else
> -       shrq    $CHAR_PER_VEC, %rcx
> -#  endif
> -       andl    %ecx, %eax
> -       jz      L(zero_end2)
> -       bsfl    %eax, %eax
> -       leaq    (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
> -L(zero_end2):
> -       ret
> +       /* Length crosses the page so if rax is zero (no matches)
> +          continue.  */
> +       test    %VRAX, %VRAX
> +       jz      L(page_cross_continue)
>
> -L(last_vec_x2):
> -       tzcntl  %eax, %eax
> -       leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> +       /* If rdx > rcx then any match here must be in [buf:buf + len].
> +        */
> +       tzcnt   %VRAX, %VRAX
> +# ifdef USE_AS_WMEMCHR
> +       leaq    (%rdi, %rax, CHAR_SIZE), %rax
> +# else
> +       addq    %rdi, %rax
> +# endif
>         ret
>
> -       .p2align 4
> -L(last_vec_x3):
> -       tzcntl  %eax, %eax
> -       leaq    (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
> +       .p2align 4,, 2
> +L(page_cross_zero):
> +       xorl    %eax, %eax
>         ret
> +
> +       .p2align 4,, 4
> +L(page_cross_ret):
> +       /* Search is entirely contained in the page cross case.  */
> +# ifdef USE_AS_WMEMCHR
> +       test    %VRAX, %VRAX
> +       jz      L(page_cross_zero)
> +# endif
> +       tzcnt   %VRAX, %VRAX
> +       cmpl    %eax, %edx
> +       jbe     L(page_cross_zero)
> +# ifdef USE_AS_WMEMCHR
> +       leaq    (%rdi, %rax, CHAR_SIZE), %rax
> +# else
> +       addq    %rdi, %rax
>  # endif
> -       /* 7 bytes from next cache line.  */
> +       ret
>  END (MEMCHR)
>  #endif
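
The L(page_cross) path above is the usual safe-load trick: back the
load up to the last aligned vector of the page (which cannot fault)
and shift the match bits for bytes below the start pointer out of the
mask.  A minimal C sketch of the idea, assuming a 32-byte vector and a
32-bit mask; check32() is a hypothetical stand-in for the
VPCMPEQ + KMOV pair:

    #include <stddef.h>
    #include <stdint.h>

    #define PAGE_SIZE 4096
    #define VEC_SIZE  32

    /* Hypothetical helper: compare the 32 bytes at P against the match
       byte, returning one bit per byte.  */
    extern uint32_t check32 (const char *p);

    /* Only called when the VEC_SIZE load at P would cross a page, i.e.
       P is within the last VEC_SIZE - 1 bytes of its page.  */
    static const char *
    first_match_page_safe (const char *p)
    {
      /* Clear the page-offset bits (the xorq %rdi, %rax above).  */
      const char *page = (const char *) ((uintptr_t) p
                                         & -(uintptr_t) PAGE_SIZE);
      uint32_t mask = check32 (page + PAGE_SIZE - VEC_SIZE);
      /* Discard match bits below P.  shrx masks the shift count to the
         operand width, which is what the % does here.  */
      mask >>= (uintptr_t) p % VEC_SIZE;
      if (mask != 0)
        return p + __builtin_ctz (mask);   /* bsf / tzcnt.  */
      return NULL;   /* Nothing before the page end; continue aligned.  */
    }

The memchr version additionally compares the count of in-bounds
characters (the rcx computed above) against the remaining length in
rdx before trusting the result, which is the cmpq %rcx, %rdx /
jbe L(page_cross_ret) pair.
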
> diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S b/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S
> index deda1ca395..2073eaa620 100644
> --- a/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S
> +++ b/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S
> @@ -1,3 +1,6 @@
> -#define MEMCHR __rawmemchr_evex_rtm
> -#define USE_AS_RAWMEMCHR 1
> -#include "memchr-evex-rtm.S"
> +#define RAWMEMCHR      __rawmemchr_evex_rtm
> +
> +#define USE_IN_RTM     1
> +#define SECTION(p)     p##.evex.rtm
> +
> +#include "rawmemchr-evex.S"
> diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex.S b/sysdeps/x86_64/multiarch/rawmemchr-evex.S
> index dc1c450699..dad54def2b 100644
> --- a/sysdeps/x86_64/multiarch/rawmemchr-evex.S
> +++ b/sysdeps/x86_64/multiarch/rawmemchr-evex.S
> @@ -1,7 +1,308 @@
> -#ifndef RAWMEMCHR
> -# define RAWMEMCHR     __rawmemchr_evex
> -#endif
> -#define USE_AS_RAWMEMCHR       1
> -#define MEMCHR RAWMEMCHR
> +/* rawmemchr optimized with 256-bit EVEX instructions.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <isa-level.h>
> +#include <sysdep.h>
> +
> +#if ISA_SHOULD_BUILD (4)
> +
> +# ifndef VEC_SIZE
> +#  include "x86-evex256-vecs.h"
> +# endif
> +
> +# ifndef RAWMEMCHR
> +#  define RAWMEMCHR    __rawmemchr_evex
> +# endif
> +
> +
> +# define PC_SHIFT_GPR  rdi
> +# define REG_WIDTH     VEC_SIZE
> +# define VPTESTN       vptestnmb
> +# define VPBROADCAST   vpbroadcastb
> +# define VPMINU        vpminub
> +# define VPCMP vpcmpb
> +# define VPCMPEQ       vpcmpeqb
> +# define CHAR_SIZE     1
> +
> +# include "reg-macros.h"
> +
> +/* If not using RTM and VEC_SIZE != 64 (VEC_SIZE == 64 has no VEX
> +   encoding), use VEX encoding in the loop so we can use vpcmpeqb +
> +   vptern, which is more efficient than the EVEX alternative.  */
> +# if defined USE_IN_RTM || VEC_SIZE == 64
> +#  undef COND_VZEROUPPER
> +#  undef VZEROUPPER_RETURN
> +#  undef VZEROUPPER
> +
> +
> +#  define COND_VZEROUPPER
> +#  define VZEROUPPER_RETURN    ret
> +#  define VZEROUPPER
> +
> +#  define USE_TERN_IN_LOOP     0
> +# else
> +#  define USE_TERN_IN_LOOP     1
> +#  undef VZEROUPPER
> +#  define VZEROUPPER   vzeroupper
> +# endif
> +
> +# define CHAR_PER_VEC  VEC_SIZE
> +
> +# if CHAR_PER_VEC == 64
> +
> +#  define TAIL_RETURN_LBL      first_vec_x2
> +#  define TAIL_RETURN_OFFSET   (CHAR_PER_VEC * 2)
> +
> +#  define FALLTHROUGH_RETURN_LBL       first_vec_x3
> +#  define FALLTHROUGH_RETURN_OFFSET    (CHAR_PER_VEC * 3)
> +
> +# else /* !(CHAR_PER_VEC == 64) */
> +
> +#  define TAIL_RETURN_LBL      first_vec_x3
> +#  define TAIL_RETURN_OFFSET   (CHAR_PER_VEC * 3)
> +
> +#  define FALLTHROUGH_RETURN_LBL       first_vec_x2
> +#  define FALLTHROUGH_RETURN_OFFSET    (CHAR_PER_VEC * 2)
> +# endif        /* !(CHAR_PER_VEC == 64) */
> +
> +
> +# define VMATCH        VMM(0)
> +# define VMATCH_LO     VMM_lo(0)
> +
> +# define PAGE_SIZE     4096
> +
> +       .section SECTION(.text), "ax", @progbits
> +ENTRY_P2ALIGN (RAWMEMCHR, 6)
> +       VPBROADCAST %esi, %VMATCH
> +       /* Check if we may cross page boundary with one vector load.  */
> +       movl    %edi, %eax
> +       andl    $(PAGE_SIZE - 1), %eax
> +       cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
> +       ja      L(page_cross)
> +
> +       VPCMPEQ (%rdi), %VMATCH, %k0
> +       KMOV    %k0, %VRAX
> +
> +       test    %VRAX, %VRAX
> +       jz      L(aligned_more)
> +L(first_vec_x0):
> +       bsf     %VRAX, %VRAX
> +       addq    %rdi, %rax
> +       ret
> +
> +       .p2align 4,, 4
> +L(first_vec_x4):
> +       bsf     %VRAX, %VRAX
> +       leaq    (VEC_SIZE * 4)(%rdi, %rax), %rax
> +       ret
>
> -#include "memchr-evex.S"
> +       /* For VEC_SIZE == 32 we can fit this in the aligning bytes so
> +          might as well place it more locally.  For VEC_SIZE == 64 we
> +          reuse the return code at the end of the loop.  */
> +# if VEC_SIZE == 32
> +       .p2align 4,, 4
> +L(FALLTHROUGH_RETURN_LBL):
> +       bsf     %VRAX, %VRAX
> +       leaq    (FALLTHROUGH_RETURN_OFFSET)(%rdi, %rax), %rax
> +       ret
> +# endif
> +
> +       .p2align 4,, 6
> +L(page_cross):
> +       /* eax has lower page-offset bits of rdi so xor will zero them
> +          out.  */
> +       xorq    %rdi, %rax
> +       VPCMPEQ (PAGE_SIZE - VEC_SIZE)(%rax), %VMATCH, %k0
> +       KMOV    %k0, %VRAX
> +
> +       /* Shift out out-of-bounds matches.  */
> +       shrx    %VRDI, %VRAX, %VRAX
> +       test    %VRAX, %VRAX
> +       jnz     L(first_vec_x0)
> +
> +       .p2align 4,, 10
> +L(aligned_more):
> +L(page_cross_continue):
> +       /* Align pointer.  */
> +       andq    $(VEC_SIZE * -1), %rdi
> +
> +       VPCMPEQ VEC_SIZE(%rdi), %VMATCH, %k0
> +       KMOV    %k0, %VRAX
> +       test    %VRAX, %VRAX
> +       jnz     L(first_vec_x1)
> +
> +       VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMATCH, %k0
> +       KMOV    %k0, %VRAX
> +       test    %VRAX, %VRAX
> +       jnz     L(first_vec_x2)
> +
> +       VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMATCH, %k0
> +       KMOV    %k0, %VRAX
> +       test    %VRAX, %VRAX
> +       jnz     L(first_vec_x3)
> +
> +       VPCMPEQ (VEC_SIZE * 4)(%rdi), %VMATCH, %k0
> +       KMOV    %k0, %VRAX
> +       test    %VRAX, %VRAX
> +       jnz     L(first_vec_x4)
> +
> +       subq    $-(VEC_SIZE * 1), %rdi
> +# if VEC_SIZE == 64
> +       /* Saves code size.  No evex512 processor has partial register
> +          stalls.  If that changes, this can be replaced with `andq
> +          $-(VEC_SIZE * 4), %rdi`.  */
> +       xorb    %dil, %dil
> +# else
> +       andq    $-(VEC_SIZE * 4), %rdi
> +# endif
> +
> +# if USE_TERN_IN_LOOP
> +       /* Copy VMATCH to a low ymm so we can use vpcmpeq, which is
> +          not encodable with the EVEX-only registers.  NB: this is
> +          VEC_SIZE == 32 only as there is no way to encode vpcmpeq
> +          with zmm0-15.  */
> +       vmovdqa64 %VMATCH, %VMATCH_LO
> +# endif
> +
> +       .p2align 4
> +L(loop_4x_vec):
> +       /* Two versions of the loop.  One that does not require
> +          vzeroupper by not using ymm0-15 and another that does
> +          require vzeroupper because it uses ymm0-15.  The reason
> +          ymm0-15 is used at all is that there is no EVEX encoding
> +          of vpcmpeq, and with vpcmpeq this loop can be performed
> +          more efficiently.  The non-vzeroupper version is safe for
> +          RTM while the vzeroupper version should be preferred if
> +          RTM is not supported.  Which loop version we use is
> +          determined by USE_TERN_IN_LOOP.  */
> +
> +# if USE_TERN_IN_LOOP
> +       /* Since vptern can only take 3x vectors, it is fastest to do
> +          1 vec separately with EVEX vpcmp.  */
> +       VPCMPEQ (VEC_SIZE * 4)(%rdi), %VMATCH, %k1
> +       /* Compare 3x with vpcmpeq and or them all together with vptern.
> +        */
> +
> +       VPCMPEQ (VEC_SIZE * 5)(%rdi), %VMATCH_LO, %VMM_lo(2)
> +       subq    $(VEC_SIZE * -4), %rdi
> +       VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMATCH_LO, %VMM_lo(3)
> +       VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMATCH_LO, %VMM_lo(4)
> +
> +       /* 254 is the mask for ORing VEC_lo(2), VEC_lo(3), VEC_lo(4)
> +          into VEC_lo(4).  */
> +       vpternlogd $254, %VMM_lo(2), %VMM_lo(3), %VMM_lo(4)
> +       vpmovmskb %VMM_lo(4), %VRCX
> +
> +       KMOV    %k1, %eax
> +
> +       /* NB:  rax has match from first VEC and rcx has matches from
> +          VEC 2-4.  If rax is non-zero we will return that match.  If
> +          rax is zero adding won't disturb the bits in rcx.  */
> +       add     %rax, %rcx
> +# else
> +       /* Loop version that uses EVEX encoding.  */
> +       VPCMP   $4, (VEC_SIZE * 4)(%rdi), %VMATCH, %k1
> +       vpxorq  (VEC_SIZE * 5)(%rdi), %VMATCH, %VMM(2)
> +       vpxorq  (VEC_SIZE * 6)(%rdi), %VMATCH, %VMM(3)
> +       VPCMPEQ (VEC_SIZE * 7)(%rdi), %VMATCH, %k3
> +       VPMINU  %VMM(2), %VMM(3), %VMM(3){%k1}{z}
> +       VPTESTN %VMM(3), %VMM(3), %k2
> +       subq    $(VEC_SIZE * -4), %rdi
> +       KORTEST %k2, %k3
> +# endif
> +       jz      L(loop_4x_vec)
> +
> +# if USE_TERN_IN_LOOP
> +       test    %VRAX, %VRAX
> +# else
> +       KMOV    %k1, %VRAX
> +       inc     %VRAX
> +# endif
> +       jnz     L(last_vec_x0)
> +
> +
> +# if USE_TERN_IN_LOOP
> +       vpmovmskb %VMM_lo(2), %VRAX
> +# else
> +       VPTESTN %VMM(2), %VMM(2), %k1
> +       KMOV    %k1, %VRAX
> +# endif
> +       test    %VRAX, %VRAX
> +       jnz     L(last_vec_x1)
> +
> +
> +# if USE_TERN_IN_LOOP
> +       vpmovmskb %VMM_lo(3), %VRAX
> +# else
> +       KMOV    %k2, %VRAX
> +# endif
> +
> +       /* No longer need any of the lo vecs (ymm0-15) so vzeroupper
> +          (only if we used the VEX encoded loop).  */
> +       COND_VZEROUPPER
> +
> +       /* Separate logic for VEC_SIZE == 64 and VEC_SIZE == 32 for
> +          returning last 2x VEC. For VEC_SIZE == 64 we test each VEC
> +          individually, for VEC_SIZE == 32 we combine them in a single
> +          64-bit GPR.  */
> +# if CHAR_PER_VEC == 64
> +#  if USE_TERN_IN_LOOP
> +#   error "Unsupported"
> +#  endif
> +
> +
> +       /* If CHAR_PER_VEC == 64 we can't combine the last two VEC.  */
> +       test    %VRAX, %VRAX
> +       jnz     L(first_vec_x2)
> +       KMOV    %k3, %VRAX
> +L(FALLTHROUGH_RETURN_LBL):
> +# else
> +       /* CHAR_PER_VEC <= 32 so we can combine the results from the
> +          last 2x VEC.  */
> +#  if !USE_TERN_IN_LOOP
> +       KMOV    %k3, %VRCX
> +#  endif
> +       salq    $CHAR_PER_VEC, %rcx
> +       addq    %rcx, %rax
> +# endif
> +       bsf     %rax, %rax
> +       leaq    (FALLTHROUGH_RETURN_OFFSET)(%rdi, %rax), %rax
> +       ret
> +
> +       .p2align 4,, 8
> +L(TAIL_RETURN_LBL):
> +       bsf     %rax, %rax
> +       leaq    (TAIL_RETURN_OFFSET)(%rdi, %rax), %rax
> +       ret
> +
> +       .p2align 4,, 8
> +L(last_vec_x1):
> +       COND_VZEROUPPER
> +L(first_vec_x1):
> +       bsf     %VRAX, %VRAX
> +       leaq    (VEC_SIZE * 1)(%rdi, %rax), %rax
> +       ret
> +
> +       .p2align 4,, 8
> +L(last_vec_x0):
> +       COND_VZEROUPPER
> +       bsf     %VRAX, %VRAX
> +       addq    %rdi, %rax
> +       ret
> +END (RAWMEMCHR)
> +#endif
> --
> 2.34.1
>
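
One note for readers on the vpternlogd $254 in the new loop: the
immediate is simply the truth table of the desired three-input boolean
function, with bit i of the immediate giving the output for input bits
(a, b, c) taken from the bits of i.  A small illustrative C sketch
(not from the patch) of how 254 falls out for a 3-way OR:

    /* Build the vpternlog immediate for f(a,b,c) = a | b | c.  Every
       index except 0 (all inputs zero) produces 1, so the result is
       0xfe == 254.  */
    static unsigned char
    ternlog_imm_or3 (void)
    {
      unsigned char imm = 0;
      for (int i = 0; i < 8; i++)
        {
          int a = (i >> 2) & 1, b = (i >> 1) & 1, c = i & 1;
          if (a | b | c)
            imm |= (unsigned char) (1u << i);
        }
      return imm;
    }

This is why ORing the three vpcmpeq results into VEC_lo(4) costs a
single instruction.
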

LGTM.

Thanks.

-- 
H.J.

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH v3 2/7] x86: Shrink / minorly optimize strchr-evex and implement with VMM headers
  2022-10-19  0:44   ` [PATCH v3 2/7] x86: Shrink / minorly optimize strchr-evex " Noah Goldstein
@ 2022-10-19 16:53     ` H.J. Lu
  0 siblings, 0 replies; 41+ messages in thread
From: H.J. Lu @ 2022-10-19 16:53 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: libc-alpha, carlos

On Tue, Oct 18, 2022 at 5:44 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Size Optimizations:
> 1. Condense hot path for better cache-locality.
>     - This has the most impact for strchrnul, where the logic for
>       strings with len <= VEC_SIZE or with a match in the first VEC
>       now fits entirely in the first cache line.
> 2. Reuse common targets in first 4x VEC and after the loop.
> 3. Don't align targets so aggressively if it doesn't change the number
>    of fetch blocks it will require and put more care in avoiding the
>    case where targets unnecessarily split cache lines.
> 4. Align the loop better for DSB/LSD
> 5. Use more code-size efficient instructions.
>         - tzcnt ...     -> bsf ...
>         - vpcmpb $0 ... -> vpcmpeq ...
> 6. Align labels less aggressively, especially if it doesn't save fetch
>    blocks / causes the basic-block to span extra cache-lines.
>
> Code Size Changes:
> strchr-evex.S   : -63 bytes
> strchrnul-evex.S: -48 bytes
>
> Net perf changes:
> Reported as geometric mean of all improvements / regressions from N=10
> runs of the benchtests.  Value is New Time / Old Time, so < 1.0 is an
> improvement and above 1.0 a regression.
>
> strchr-evex.S (Fixed)   : 0.971
> strchr-evex.S (Rand)    : 0.932
> strchrnul-evex.S        : 0.965
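
For reference, the reported number is just the geometric mean of the
per-benchmark New/Old time ratios.  A minimal sketch of the
computation (hypothetical arrays, not the actual benchtest harness):

    #include <math.h>
    #include <stddef.h>

    /* Geometric mean of per-benchmark New/Old time ratios; < 1.0 is a
       net improvement.  Link with -lm.  */
    static double
    geomean_ratio (const double *new_t, const double *old_t, size_t n)
    {
      double log_sum = 0.0;
      for (size_t i = 0; i < n; i++)
        log_sum += log (new_t[i] / old_t[i]);
      return exp (log_sum / (double) n);
    }
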
>
> Full results attached in email.
>
> Full check passes on x86-64.
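
On point 5, the win is pure encoding size.  Working it out from the
general encoding rules (reg-reg forms; byte counts are from the ISA
encoding, not measured from this patch):

    vpcmpeqb (EVEX):  4 prefix + 1 opcode (map 0F)   + 1 modrm          = 6 bytes
    vpcmpb (EVEX):    4 prefix + 1 opcode (map 0F3A) + 1 modrm + 1 imm8 = 7 bytes
    tzcnt r32, r32:   F3 0F BC /r                                       = 4 bytes
    bsf r32, r32:     0F BC /r                                          = 3 bytes

so each converted instruction saves one byte of code.
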
> ---
>  sysdeps/x86_64/multiarch/strchr-evex.S | 558 +++++++++++++++----------
>  1 file changed, 340 insertions(+), 218 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/strchr-evex.S b/sysdeps/x86_64/multiarch/strchr-evex.S
> index a1c15c4419..c2a0d112f7 100644
> --- a/sysdeps/x86_64/multiarch/strchr-evex.S
> +++ b/sysdeps/x86_64/multiarch/strchr-evex.S
> @@ -26,48 +26,75 @@
>  #  define STRCHR       __strchr_evex
>  # endif
>
> -# define VMOVU         vmovdqu64
> -# define VMOVA         vmovdqa64
> +# ifndef VEC_SIZE
> +#  include "x86-evex256-vecs.h"
> +# endif
>
>  # ifdef USE_AS_WCSCHR
>  #  define VPBROADCAST  vpbroadcastd
> -#  define VPCMP                vpcmpd
> +#  define VPCMP        vpcmpd
> +#  define VPCMPEQ      vpcmpeqd
>  #  define VPTESTN      vptestnmd
> +#  define VPTEST       vptestmd
>  #  define VPMINU       vpminud
>  #  define CHAR_REG     esi
> -#  define SHIFT_REG    ecx
> +#  define SHIFT_REG    rcx
>  #  define CHAR_SIZE    4
> +
> +#  define USE_WIDE_CHAR
>  # else
>  #  define VPBROADCAST  vpbroadcastb
> -#  define VPCMP                vpcmpb
> +#  define VPCMP        vpcmpb
> +#  define VPCMPEQ      vpcmpeqb
>  #  define VPTESTN      vptestnmb
> +#  define VPTEST       vptestmb
>  #  define VPMINU       vpminub
>  #  define CHAR_REG     sil
> -#  define SHIFT_REG    edx
> +#  define SHIFT_REG    rdi
>  #  define CHAR_SIZE    1
>  # endif
>
> -# define XMMZERO       xmm16
> -
> -# define YMMZERO       ymm16
> -# define YMM0          ymm17
> -# define YMM1          ymm18
> -# define YMM2          ymm19
> -# define YMM3          ymm20
> -# define YMM4          ymm21
> -# define YMM5          ymm22
> -# define YMM6          ymm23
> -# define YMM7          ymm24
> -# define YMM8          ymm25
> -
> -# define VEC_SIZE 32
> -# define PAGE_SIZE 4096
> -# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
> -
> -       .section .text.evex,"ax",@progbits
> -ENTRY_P2ALIGN (STRCHR, 5)
> -       /* Broadcast CHAR to YMM0.      */
> -       VPBROADCAST     %esi, %YMM0
> +# include "reg-macros.h"
> +
> +# if VEC_SIZE == 64
> +#  define MASK_GPR     rcx
> +#  define LOOP_REG     rax
> +
> +#  define COND_MASK(k_reg)     {%k_reg}
> +# else
> +#  define MASK_GPR     rax
> +#  define LOOP_REG     rdi
> +
> +#  define COND_MASK(k_reg)
> +# endif
> +
> +# define CHAR_PER_VEC  (VEC_SIZE / CHAR_SIZE)
> +
> +
> +# if CHAR_PER_VEC == 64
> +#  define LAST_VEC_OFFSET      (VEC_SIZE * 3)
> +#  define TESTZ(reg)   incq %VGPR_SZ(reg, 64)
> +# else
> +
> +#  if CHAR_PER_VEC == 32
> +#   define TESTZ(reg)  incl %VGPR_SZ(reg, 32)
> +#  elif CHAR_PER_VEC == 16
> +#   define TESTZ(reg)  incw %VGPR_SZ(reg, 16)
> +#  else
> +#   define TESTZ(reg)  incb %VGPR_SZ(reg, 8)
> +#  endif
> +
> +#  define LAST_VEC_OFFSET      (VEC_SIZE * 2)
> +# endif
> +
> +# define VMATCH        VMM(0)
> +
> +# define PAGE_SIZE     4096
> +
> +       .section SECTION(.text), "ax", @progbits
> +ENTRY_P2ALIGN (STRCHR, 6)
> +       /* Broadcast CHAR to VEC_0.  */
> +       VPBROADCAST %esi, %VMATCH
>         movl    %edi, %eax
>         andl    $(PAGE_SIZE - 1), %eax
>         /* Check if we cross page boundary with one vector load.
> @@ -75,19 +102,27 @@ ENTRY_P2ALIGN (STRCHR, 5)
>         cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
>         ja      L(cross_page_boundary)
>
> +
>         /* Check the first VEC_SIZE bytes. Search for both CHAR and the
>            null bytes.  */
> -       VMOVU   (%rdi), %YMM1
> -
> +       VMOVU   (%rdi), %VMM(1)
>         /* Leaves only CHARS matching esi as 0.  */
> -       vpxorq  %YMM1, %YMM0, %YMM2
> -       VPMINU  %YMM2, %YMM1, %YMM2
> -       /* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
> -       VPTESTN %YMM2, %YMM2, %k0
> -       kmovd   %k0, %eax
> -       testl   %eax, %eax
> +       vpxorq  %VMM(1), %VMATCH, %VMM(2)
> +       VPMINU  %VMM(2), %VMM(1), %VMM(2)
> +       /* Each bit in K0 represents a CHAR or a null byte in VEC_1.  */
> +       VPTESTN %VMM(2), %VMM(2), %k0
> +       KMOV    %k0, %VRAX
> +# if VEC_SIZE == 64 && defined USE_AS_STRCHRNUL
> +       /* If VEC_SIZE == 64 && STRCHRNUL use bsf to test the
> +          condition so that all logic for match/null in the first VEC
> +          fits in 1x cache lines.  This has a slight cost for larger
> +          sizes.  */
> +       bsf     %VRAX, %VRAX
> +       jz      L(aligned_more)
> +# else
> +       test    %VRAX, %VRAX
>         jz      L(aligned_more)
> -       tzcntl  %eax, %eax
> +       bsf     %VRAX, %VRAX
> +# endif
>  # ifndef USE_AS_STRCHRNUL
>         /* Found CHAR or the null byte.  */
>         cmp     (%rdi, %rax, CHAR_SIZE), %CHAR_REG
> @@ -109,287 +144,374 @@ ENTRY_P2ALIGN (STRCHR, 5)
>  # endif
>         ret
>
> -
> -
> -       .p2align 4,, 10
> -L(first_vec_x4):
> -# ifndef USE_AS_STRCHRNUL
> -       /* Check to see if first match was CHAR (k0) or null (k1).  */
> -       kmovd   %k0, %eax
> -       tzcntl  %eax, %eax
> -       kmovd   %k1, %ecx
> -       /* bzhil will not be 0 if first match was null.  */
> -       bzhil   %eax, %ecx, %ecx
> -       jne     L(zero)
> -# else
> -       /* Combine CHAR and null matches.  */
> -       kord    %k0, %k1, %k0
> -       kmovd   %k0, %eax
> -       tzcntl  %eax, %eax
> -# endif
> -       /* NB: Multiply sizeof char type (1 or 4) to get the number of
> -          bytes.  */
> -       leaq    (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
> -       ret
> -
>  # ifndef USE_AS_STRCHRNUL
>  L(zero):
>         xorl    %eax, %eax
>         ret
>  # endif
>
> -
> -       .p2align 4
> +       .p2align 4,, 2
> +L(first_vec_x3):
> +       subq    $-(VEC_SIZE * 2), %rdi
> +# if VEC_SIZE == 32
> +       /* Reuse L(first_vec_x3) for last VEC2 only for VEC_SIZE == 32.
> +          For VEC_SIZE == 64 the registers don't match.  */
> +L(last_vec_x2):
> +# endif
>  L(first_vec_x1):
>         /* Use bsf here to save 1-byte, keeping the block in 1x
>            fetch block.  eax guaranteed non-zero.  */
> -       bsfl    %eax, %eax
> +       bsf     %VRCX, %VRCX
>  # ifndef USE_AS_STRCHRNUL
> -       /* Found CHAR or the null byte.  */
> -       cmp     (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
> +       /* Found CHAR or the null byte.  */
> +       cmp     (VEC_SIZE)(%rdi, %rcx, CHAR_SIZE), %CHAR_REG
>         jne     L(zero)
> -
>  # endif
>         /* NB: Multiply sizeof char type (1 or 4) to get the number of
>            bytes.  */
> -       leaq    (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
> +       leaq    (VEC_SIZE)(%rdi, %rcx, CHAR_SIZE), %rax
>         ret
>
> -       .p2align 4,, 10
> +       .p2align 4,, 2
> +L(first_vec_x4):
> +       subq    $-(VEC_SIZE * 2), %rdi
>  L(first_vec_x2):
>  # ifndef USE_AS_STRCHRNUL
>         /* Check to see if first match was CHAR (k0) or null (k1).  */
> -       kmovd   %k0, %eax
> -       tzcntl  %eax, %eax
> -       kmovd   %k1, %ecx
> +       KMOV    %k0, %VRAX
> +       tzcnt   %VRAX, %VRAX
> +       KMOV    %k1, %VRCX
>         /* bzhil will not be 0 if first match was null.  */
> -       bzhil   %eax, %ecx, %ecx
> +       bzhi    %VRAX, %VRCX, %VRCX
>         jne     L(zero)
>  # else
>         /* Combine CHAR and null matches.  */
> -       kord    %k0, %k1, %k0
> -       kmovd   %k0, %eax
> -       tzcntl  %eax, %eax
> +       KOR     %k0, %k1, %k0
> +       KMOV    %k0, %VRAX
> +       bsf     %VRAX, %VRAX
>  # endif
>         /* NB: Multiply sizeof char type (1 or 4) to get the number of
>            bytes.  */
>         leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
>         ret
>
> -       .p2align 4,, 10
> -L(first_vec_x3):
> -       /* Use bsf here to save 1-byte keeping keeping the block in 1x
> -          fetch block. eax guranteed non-zero.  */
> -       bsfl    %eax, %eax
> -# ifndef USE_AS_STRCHRNUL
> -       /* Found CHAR or the null byte.  */
> -       cmp     (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
> -       jne     L(zero)
> +# ifdef USE_AS_STRCHRNUL
> +       /* We use this as a hook to get imm8 encoding for the jmp to
> +       /* We use this as a hook to get imm8 encoding for the jmp to
> +          L(cross_page_boundary).  This allows the hot case of a
> +          line.  */
> +L(cross_page_boundary):
> +       jmp     L(cross_page_boundary_real)
>  # endif
> -       /* NB: Multiply sizeof char type (1 or 4) to get the number of
> -          bytes.  */
> -       leaq    (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
> -       ret
>
>         .p2align 4
>  L(aligned_more):
> +L(cross_page_continue):
>         /* Align data to VEC_SIZE.  */
>         andq    $-VEC_SIZE, %rdi
> -L(cross_page_continue):
> -       /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time since
> -          data is only aligned to VEC_SIZE. Use two alternating methods
> -          for checking VEC to balance latency and port contention.  */
>
> -       /* This method has higher latency but has better port
> -          distribution.  */
> -       VMOVA   (VEC_SIZE)(%rdi), %YMM1
> +       /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time
> +          since data is only aligned to VEC_SIZE. Use two alternating
> +          methods for checking VEC to balance latency and port
> +          contention.  */
> +
> +       /* Method(1) with 8c latency:
> +          For VEC_SIZE == 32:
> +          p0 * 1.83, p1 * 0.83, p5 * 1.33
> +          For VEC_SIZE == 64:
> +          p0 * 2.50, p1 * 0.00, p5 * 1.50  */
> +       VMOVA   (VEC_SIZE)(%rdi), %VMM(1)
>         /* Leaves only CHARS matching esi as 0.  */
> -       vpxorq  %YMM1, %YMM0, %YMM2
> -       VPMINU  %YMM2, %YMM1, %YMM2
> -       /* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
> -       VPTESTN %YMM2, %YMM2, %k0
> -       kmovd   %k0, %eax
> -       testl   %eax, %eax
> +       vpxorq  %VMM(1), %VMATCH, %VMM(2)
> +       VPMINU  %VMM(2), %VMM(1), %VMM(2)
> +       /* Each bit in K0 represents a CHAR or a null byte in VEC_1.  */
> +       VPTESTN %VMM(2), %VMM(2), %k0
> +       KMOV    %k0, %VRCX
> +       test    %VRCX, %VRCX
>         jnz     L(first_vec_x1)
>
> -       /* This method has higher latency but has better port
> -          distribution.  */
> -       VMOVA   (VEC_SIZE * 2)(%rdi), %YMM1
> -       /* Each bit in K0 represents a CHAR in YMM1.  */
> -       VPCMP   $0, %YMM1, %YMM0, %k0
> -       /* Each bit in K1 represents a CHAR in YMM1.  */
> -       VPTESTN %YMM1, %YMM1, %k1
> -       kortestd        %k0, %k1
> +       /* Method(2) with 6c latency:
> +          For VEC_SIZE == 32:
> +          p0 * 1.00, p1 * 0.00, p5 * 2.00
> +          For VEC_SIZE == 64:
> +          p0 * 1.00, p1 * 0.00, p5 * 2.00  */
> +       VMOVA   (VEC_SIZE * 2)(%rdi), %VMM(1)
> +       /* Each bit in K0 represents a CHAR in VEC_1.  */
> +       VPCMPEQ %VMM(1), %VMATCH, %k0
> +       /* Each bit in K1 represents a CHAR in VEC_1.  */
> +       VPTESTN %VMM(1), %VMM(1), %k1
> +       KORTEST %k0, %k1
>         jnz     L(first_vec_x2)
>
> -       VMOVA   (VEC_SIZE * 3)(%rdi), %YMM1
> +       /* By swapping between Method 1/2 we get a fairer port
> +          distribution and better throughput.  */
> +
> +       VMOVA   (VEC_SIZE * 3)(%rdi), %VMM(1)
>         /* Leaves only CHARS matching esi as 0.  */
> -       vpxorq  %YMM1, %YMM0, %YMM2
> -       VPMINU  %YMM2, %YMM1, %YMM2
> -       /* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
> -       VPTESTN %YMM2, %YMM2, %k0
> -       kmovd   %k0, %eax
> -       testl   %eax, %eax
> +       vpxorq  %VMM(1), %VMATCH, %VMM(2)
> +       VPMINU  %VMM(2), %VMM(1), %VMM(2)
> +       /* Each bit in K0 represents a CHAR or a null byte in VEC_1.  */
> +       VPTESTN %VMM(2), %VMM(2), %k0
> +       KMOV    %k0, %VRCX
> +       test    %VRCX, %VRCX
>         jnz     L(first_vec_x3)
>
> -       VMOVA   (VEC_SIZE * 4)(%rdi), %YMM1
> -       /* Each bit in K0 represents a CHAR in YMM1.  */
> -       VPCMP   $0, %YMM1, %YMM0, %k0
> -       /* Each bit in K1 represents a CHAR in YMM1.  */
> -       VPTESTN %YMM1, %YMM1, %k1
> -       kortestd        %k0, %k1
> +       VMOVA   (VEC_SIZE * 4)(%rdi), %VMM(1)
> +       /* Each bit in K0 represents a CHAR in VEC_1.  */
> +       VPCMPEQ %VMM(1), %VMATCH, %k0
> +       /* Each bit in K1 represents a CHAR in VEC_1.  */
> +       VPTESTN %VMM(1), %VMM(1), %k1
> +       KORTEST %k0, %k1
>         jnz     L(first_vec_x4)
>
>         /* Align data to VEC_SIZE * 4 for the loop.  */
> +# if VEC_SIZE == 64
> +       /* Use rax for the loop reg as it allows the loop to fit in
> +          exactly 2 cache lines (more efficient imm32 + gpr
> +          encoding).  */
> +       leaq    (VEC_SIZE)(%rdi), %rax
> +       /* No partial register stalls on evex512 processors.  */
> +       xorb    %al, %al
> +# else
> +       /* For VEC_SIZE == 32 continue using rdi for loop reg so we can
> +          reuse more code and save space.  */
>         addq    $VEC_SIZE, %rdi
>         andq    $-(VEC_SIZE * 4), %rdi
> -
> +# endif
>         .p2align 4
>  L(loop_4x_vec):
> -       /* Check 4x VEC at a time. No penalty to imm32 offset with evex
> -          encoding.  */
> -       VMOVA   (VEC_SIZE * 4)(%rdi), %YMM1
> -       VMOVA   (VEC_SIZE * 5)(%rdi), %YMM2
> -       VMOVA   (VEC_SIZE * 6)(%rdi), %YMM3
> -       VMOVA   (VEC_SIZE * 7)(%rdi), %YMM4
> -
> -       /* For YMM1 and YMM3 use xor to set the CHARs matching esi to
> +       /* Check 4x VEC at a time. No penalty for imm32 offset with evex
> +          encoding (if offset % VEC_SIZE == 0).  */
> +       VMOVA   (VEC_SIZE * 4)(%LOOP_REG), %VMM(1)
> +       VMOVA   (VEC_SIZE * 5)(%LOOP_REG), %VMM(2)
> +       VMOVA   (VEC_SIZE * 6)(%LOOP_REG), %VMM(3)
> +       VMOVA   (VEC_SIZE * 7)(%LOOP_REG), %VMM(4)
> +
> +       /* Collect bits where VEC_1 does NOT match esi.  This is later
> +          used to mask off results (getting non-matches allows us to
> +          save an instruction on combining).  */
> +       VPCMP   $4, %VMATCH, %VMM(1), %k1
> +
> +       /* Two methods for the loop depending on VEC_SIZE.  This is
> +          because with zmm registers VPMINU can only run on p0 (as
> +          opposed to p0/p1 for ymm) so it is less preferred.  */
> +# if VEC_SIZE == 32
> +       /* For VEC_2 and VEC_3 use xor to set the CHARs matching esi to
>            zero.  */
> -       vpxorq  %YMM1, %YMM0, %YMM5
> -       /* For YMM2 and YMM4 cmp not equals to CHAR and store result in
> -          k register. Its possible to save either 1 or 2 instructions
> -          using cmp no equals method for either YMM1 or YMM1 and YMM3
> -          respectively but bottleneck on p5 makes it not worth it.  */
> -       VPCMP   $4, %YMM0, %YMM2, %k2
> -       vpxorq  %YMM3, %YMM0, %YMM7
> -       VPCMP   $4, %YMM0, %YMM4, %k4
> -
> -       /* Use min to select all zeros from either xor or end of string).
> -        */
> -       VPMINU  %YMM1, %YMM5, %YMM1
> -       VPMINU  %YMM3, %YMM7, %YMM3
> +       vpxorq  %VMM(2), %VMATCH, %VMM(6)
> +       vpxorq  %VMM(3), %VMATCH, %VMM(7)
>
> -       /* Use min + zeromask to select for zeros. Since k2 and k4 will
> -          have 0 as positions that matched with CHAR which will set
> -          zero in the corresponding destination bytes in YMM2 / YMM4.
> -        */
> -       VPMINU  %YMM1, %YMM2, %YMM2{%k2}{z}
> -       VPMINU  %YMM3, %YMM4, %YMM4
> -       VPMINU  %YMM2, %YMM4, %YMM4{%k4}{z}
> -
> -       VPTESTN %YMM4, %YMM4, %k1
> -       kmovd   %k1, %ecx
> -       subq    $-(VEC_SIZE * 4), %rdi
> -       testl   %ecx, %ecx
> +       /* Find non-matches in VEC_4 while combining with non-matches
> +          from VEC_1.  NB: Try to use masked predicate execution on
> +          instructions that have a mask result as it has no latency
> +          penalty.  */
> +       VPCMP   $4, %VMATCH, %VMM(4), %k4{%k1}
> +
> +       /* Combined zeros from VEC_1 / VEC_2 (search for null term).  */
> +       VPMINU  %VMM(1), %VMM(2), %VMM(2)
> +
> +       /* Use min to select all zeros from either xor or end of
> +          string).  */
> +       VPMINU  %VMM(3), %VMM(7), %VMM(3)
> +       VPMINU  %VMM(2), %VMM(6), %VMM(2)
> +
> +       /* Combined zeros from VEC_2 / VEC_3 (search for null term).  */
> +       VPMINU  %VMM(3), %VMM(4), %VMM(4)
> +
> +       /* Combined zeros from VEC_2 / VEC_4 (this has all null term and
> +          esi matches for VEC_2 / VEC_3).  */
> +       VPMINU  %VMM(2), %VMM(4), %VMM(4)
> +# else
> +       /* Collect non-matches for VEC_2.  */
> +       VPCMP   $4, %VMM(2), %VMATCH, %k2
> +
> +       /* Combined zeros from VEC_1 / VEC_2 (search for null term).  */
> +       VPMINU  %VMM(1), %VMM(2), %VMM(2)
> +
> +       /* Find non-matches in VEC_3/VEC_4 while combining with non-
> +          matches from VEC_1/VEC_2 respectively.  */
> +       VPCMP   $4, %VMM(3), %VMATCH, %k3{%k1}
> +       VPCMP   $4, %VMM(4), %VMATCH, %k4{%k2}
> +
> +       /* Finish combining zeros in all VECs.  */
> +       VPMINU  %VMM(3), %VMM(4), %VMM(4)
> +
> +       /* Combine in esi matches for VEC_3 (if there was a match with
> +          esi, the corresponding bit in %k3 is zero so the
> +          VPMINU_MASKZ will have a zero in the result).  NB: This
> +          makes the VPMINU 3c latency.  The only way to avoid it is
> +          to create a 12c dependency chain on all the `VPCMP $4, ...`
> +          which has higher total latency.  */
> +       VPMINU  %VMM(2), %VMM(4), %VMM(4){%k3}{z}
> +# endif
> +       VPTEST  %VMM(4), %VMM(4), %k0{%k4}
> +       KMOV    %k0, %VRDX
> +       subq    $-(VEC_SIZE * 4), %LOOP_REG
> +
> +       /* TESTZ is an inc using the proper register width depending
> +          on CHAR_PER_VEC.  An esi match or null-term match leaves a
> +          zero bit in rdx so the inc won't overflow and won't be
> +          zero.  */
> +       TESTZ   (rdx)
>         jz      L(loop_4x_vec)
>
> -       VPTESTN %YMM1, %YMM1, %k0
> -       kmovd   %k0, %eax
> -       testl   %eax, %eax
> -       jnz     L(last_vec_x1)
> +       VPTEST  %VMM(1), %VMM(1), %k0{%k1}
> +       KMOV    %k0, %VGPR(MASK_GPR)
> +       TESTZ   (MASK_GPR)
> +# if VEC_SIZE == 32
> +       /* We can reuse the return code in page_cross logic for VEC_SIZE
> +          == 32.  */
> +       jnz     L(last_vec_x1_vec_size32)
> +# else
> +       jnz     L(last_vec_x1_vec_size64)
> +# endif
> +
>
> -       VPTESTN %YMM2, %YMM2, %k0
> -       kmovd   %k0, %eax
> -       testl   %eax, %eax
> +       /* COND_MASK integrates the esi matches for VEC_SIZE == 64. For
> +          VEC_SIZE == 32 they are already integrated.  */
> +       VPTEST  %VMM(2), %VMM(2), %k0 COND_MASK(k2)
> +       KMOV    %k0, %VRCX
> +       TESTZ   (rcx)
>         jnz     L(last_vec_x2)
>
> -       VPTESTN %YMM3, %YMM3, %k0
> -       kmovd   %k0, %eax
> -       /* Combine YMM3 matches (eax) with YMM4 matches (ecx).  */
> -# ifdef USE_AS_WCSCHR
> -       sall    $8, %ecx
> -       orl     %ecx, %eax
> -       bsfl    %eax, %eax
> +       VPTEST  %VMM(3), %VMM(3), %k0 COND_MASK(k3)
> +       KMOV    %k0, %VRCX
> +# if CHAR_PER_VEC == 64
> +       TESTZ   (rcx)
> +       jnz     L(last_vec_x3)
>  # else
> -       salq    $32, %rcx
> -       orq     %rcx, %rax
> -       bsfq    %rax, %rax
> +       salq    $CHAR_PER_VEC, %rdx
> +       TESTZ   (rcx)
> +       orq     %rcx, %rdx
>  # endif
> +
> +       bsfq    %rdx, %rdx
> +
>  # ifndef USE_AS_STRCHRNUL
>         /* Check if match was CHAR or null.  */
> -       cmp     (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
> +       cmp     (LAST_VEC_OFFSET)(%LOOP_REG, %rdx, CHAR_SIZE), %CHAR_REG
>         jne     L(zero_end)
>  # endif
>         /* NB: Multiply sizeof char type (1 or 4) to get the number of
>            bytes.  */
> -       leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
> +       leaq    (LAST_VEC_OFFSET)(%LOOP_REG, %rdx, CHAR_SIZE), %rax
>         ret
>
> -       .p2align 4,, 8
> -L(last_vec_x1):
> -       bsfl    %eax, %eax
> -# ifdef USE_AS_WCSCHR
> -       /* NB: Multiply wchar_t count by 4 to get the number of bytes.
> -          */
> -       leaq    (%rdi, %rax, CHAR_SIZE), %rax
> -# else
> -       addq    %rdi, %rax
> +# ifndef USE_AS_STRCHRNUL
> +L(zero_end):
> +       xorl    %eax, %eax
> +       ret
>  # endif
>
> -# ifndef USE_AS_STRCHRNUL
> +
> +       /* Separate return label for last VEC1 because for VEC_SIZE ==
> +          32 we can reuse return code in L(page_cross) but VEC_SIZE ==
> +          64 has mismatched registers.  */
> +# if VEC_SIZE == 64
> +       .p2align 4,, 8
> +L(last_vec_x1_vec_size64):
> +       bsf     %VRCX, %VRCX
> +#  ifndef USE_AS_STRCHRNUL
>         /* Check if match was null.  */
> -       cmp     (%rax), %CHAR_REG
> +       cmp     (%rax, %rcx, CHAR_SIZE), %CHAR_REG
>         jne     L(zero_end)
> -# endif
> -
> +#  endif
> +#  ifdef USE_AS_WCSCHR
> +       /* NB: Multiply wchar_t count by 4 to get the number of bytes.
> +        */
> +       leaq    (%rax, %rcx, CHAR_SIZE), %rax
> +#  else
> +       addq    %rcx, %rax
> +#  endif
>         ret
>
> +       /* Since we can't combine the last 2x matches for CHAR_PER_VEC
> +          == 64 we need return label for last VEC3.  */
> +#  if CHAR_PER_VEC == 64
>         .p2align 4,, 8
> +L(last_vec_x3):
> +       addq    $VEC_SIZE, %LOOP_REG
> +#  endif
> +
> +       /* Duplicate L(last_vec_x2) for VEC_SIZE == 64 because we can't
> +          reuse L(first_vec_x3) due to register mismatch.  */
>  L(last_vec_x2):
> -       bsfl    %eax, %eax
> -# ifndef USE_AS_STRCHRNUL
> +       bsf     %VGPR(MASK_GPR), %VGPR(MASK_GPR)
> +#  ifndef USE_AS_STRCHRNUL
>         /* Check if match was null.  */
> -       cmp     (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
> +       cmp     (VEC_SIZE * 1)(%LOOP_REG, %MASK_GPR, CHAR_SIZE), %CHAR_REG
>         jne     L(zero_end)
> -# endif
> +#  endif
>         /* NB: Multiply sizeof char type (1 or 4) to get the number of
>            bytes.  */
> -       leaq    (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
> +       leaq    (VEC_SIZE * 1)(%LOOP_REG, %MASK_GPR, CHAR_SIZE), %rax
>         ret
> +# endif
>
> -       /* Cold case for crossing page with first load.  */
> -       .p2align 4,, 8
> +       /* Cold case for crossing page with first load.  */
> +       .p2align 4,, 10
> +# ifndef USE_AS_STRCHRNUL
>  L(cross_page_boundary):
> -       movq    %rdi, %rdx
> +# endif
> +L(cross_page_boundary_real):
>         /* Align rdi.  */
> -       andq    $-VEC_SIZE, %rdi
> -       VMOVA   (%rdi), %YMM1
> -       /* Leaves only CHARS matching esi as 0.  */
> -       vpxorq  %YMM1, %YMM0, %YMM2
> -       VPMINU  %YMM2, %YMM1, %YMM2
> -       /* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
> -       VPTESTN %YMM2, %YMM2, %k0
> -       kmovd   %k0, %eax
> +       xorq    %rdi, %rax
> +       VMOVA   (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(1)
> +       /* Use high latency method of getting matches to save code size.
> +        */
> +
> +       /* K1 has 1s where VEC(1) does NOT match esi.  */
> +       VPCMP   $4, %VMM(1), %VMATCH, %k1
> +       /* K0 has ones where K1 is 1 (non-match with esi), and non-zero
> +          (null).  */
> +       VPTEST  %VMM(1), %VMM(1), %k0{%k1}
> +       KMOV    %k0, %VRAX
>         /* Remove the leading bits.  */
>  # ifdef USE_AS_WCSCHR
> -       movl    %edx, %SHIFT_REG
> +       movl    %edi, %VGPR_SZ(SHIFT_REG, 32)
>         /* NB: Divide shift count by 4 since each bit in K1 represent 4
>            bytes.  */
> -       sarl    $2, %SHIFT_REG
> -       andl    $(CHAR_PER_VEC - 1), %SHIFT_REG
> +       sarl    $2, %VGPR_SZ(SHIFT_REG, 32)
> +       andl    $(CHAR_PER_VEC - 1), %VGPR_SZ(SHIFT_REG, 32)
> +
> +       /* If wcschr we need to invert the matches as we can't rely on
> +          the signed shift to bring in ones.  There is no sarx for
> +          gpr8/16.  Also note we can't use inc here as the lower bits
> +          represent matches out of range so we can't rely on
> +          overflow.  */
> +       xorl    $((1 << CHAR_PER_VEC)- 1), %eax
> +# endif
> +       /* Use an arithmetic shift so that leading 1s are filled in.  */
> +       sarx    %VGPR(SHIFT_REG), %VRAX, %VRAX
> +       /* If eax is all ones then no matches for esi or NULL.  */
> +
> +# ifdef USE_AS_WCSCHR
> +       test    %VRAX, %VRAX
> +# else
> +       inc     %VRAX
>  # endif
> -       sarxl   %SHIFT_REG, %eax, %eax
> -       /* If eax is zero continue.  */
> -       testl   %eax, %eax
>         jz      L(cross_page_continue)
> -       bsfl    %eax, %eax
>
> +       .p2align 4,, 10
> +L(last_vec_x1_vec_size32):
> +       bsf     %VRAX, %VRAX
>  # ifdef USE_AS_WCSCHR
> -       /* NB: Multiply wchar_t count by 4 to get the number of
> -          bytes.  */
> -       leaq    (%rdx, %rax, CHAR_SIZE), %rax
> +       /* NB: Multiply wchar_t count by 4 to get the number of bytes.
> +        */
> +       leaq    (%rdi, %rax, CHAR_SIZE), %rax
>  # else
> -       addq    %rdx, %rax
> +       addq    %rdi, %rax
>  # endif
>  # ifndef USE_AS_STRCHRNUL
>         /* Check to see if match was CHAR or null.  */
>         cmp     (%rax), %CHAR_REG
> -       je      L(cross_page_ret)
> -L(zero_end):
> -       xorl    %eax, %eax
> -L(cross_page_ret):
> +       jne     L(zero_end_0)
>  # endif
>         ret
> +# ifndef USE_AS_STRCHRNUL
> +L(zero_end_0):
> +       xorl    %eax, %eax
> +       ret
> +# endif
>
>  END (STRCHR)
>  #endif
> --
> 2.34.1
>
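
The Method(1)/Method(2) alternation is easiest to see in intrinsics
form.  Roughly, for the 32-byte case (an illustrative sketch, not the
patch's code; the 8c/6c figures come from the patch's comments, and
the asm fuses Method 2's final combine into KORTEST + branch; compile
with -mavx512bw -mavx512vl):

    #include <immintrin.h>

    /* Method 1 (8c latency): zeros in min(v ^ match, v) mark both CHAR
       and the null terminator.  */
    static inline __mmask32
    check_method1 (__m256i v, __m256i match)
    {
      __m256i x = _mm256_xor_si256 (v, match);  /* 0 where v == CHAR.  */
      __m256i m = _mm256_min_epu8 (x, v);       /* 0 at CHAR or null.  */
      return _mm256_testn_epi8_mask (m, m);
    }

    /* Method 2 (6c latency): separate CHAR and null masks, OR-combined
       (KORTEST in the asm).  */
    static inline __mmask32
    check_method2 (__m256i v, __m256i match)
    {
      __mmask32 kc = _mm256_cmpeq_epi8_mask (v, match);
      __mmask32 kn = _mm256_testn_epi8_mask (v, v);
      return kc | kn;
    }

Alternating the two spreads the work across p0/p1/p5 as the port
counts in the comments indicate.
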

LGTM.

Thanks.

-- 
H.J.

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH v3 3/7] x86: Optimize strnlen-evex.S and implement with VMM headers
  2022-10-19  0:44   ` [PATCH v3 3/7] x86: Optimize strnlen-evex.S " Noah Goldstein
@ 2022-10-19 16:57     ` H.J. Lu
  0 siblings, 0 replies; 41+ messages in thread
From: H.J. Lu @ 2022-10-19 16:57 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: libc-alpha, carlos

On Tue, Oct 18, 2022 at 5:44 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Optimizations are:
> 1. Use the fact that bsf(0) leaves the destination unchanged to save a
>    branch in the short string case.
> 2. Restructure code so that small strings are given the hot path.
>         - This is a net-zero on the benchmark suite but in general
>           makes sense as smaller sizes are far more common.
> 3. Use more code-size efficient instructions.
>         - tzcnt ...     -> bsf ...
>         - vpcmpb $0 ... -> vpcmpeq ...
> 4. Align labels less aggressively, especially if it doesn't save fetch
>    blocks / causes the basic-block to span extra cache-lines.
>
> The optimizations (especially for point 2) make the strnlen and
> strlen code essentially incompatible so split strnlen-evex
> to a new file.
>
> Code Size Changes:
> strlen-evex.S       :  -23 bytes
> strnlen-evex.S      : -167 bytes
>
> Net perf changes:
>
> Reported as geometric mean of all improvements / regressions from N=10
> runs of the benchtests.  Value is New Time / Old Time, so < 1.0 is an
> improvement and above 1.0 a regression.
>
> strlen-evex.S       : 0.992 (No real change)
> strnlen-evex.S      : 0.947
>
> Full results attached in email.
>
> Full check passes on x86-64.
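
On point 1: bsf also encodes one byte shorter than tzcnt (it lacks the
F3 prefix), and when the source is zero current x86-64 implementations
leave the destination unchanged (architecturally it is undefined),
which is what lets the short-string path preload the max length and
skip a branch.  A hypothetical C sketch with inline asm, illustrating
the idiom rather than the patch's exact code:

    /* Branchless min(maxlen, index of first set bit): preload the
       destination with maxlen; if MASK is zero, bsf leaves it there.
       Relies on the de-facto zero-source behavior described above.  */
    static inline unsigned long
    len_or_maxlen (unsigned long mask, unsigned long maxlen)
    {
      unsigned long idx = maxlen;
      __asm__ ("bsf %1, %0" : "+r" (idx) : "r" (mask) : "cc");
      return idx;
    }
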
> ---
>  sysdeps/x86_64/multiarch/strlen-evex.S  | 544 +++++++-----------------
>  sysdeps/x86_64/multiarch/strnlen-evex.S | 427 ++++++++++++++++++-
>  sysdeps/x86_64/multiarch/wcsnlen-evex.S |   5 +-
>  3 files changed, 572 insertions(+), 404 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/strlen-evex.S b/sysdeps/x86_64/multiarch/strlen-evex.S
> index 2109ec2f7a..487846f098 100644
> --- a/sysdeps/x86_64/multiarch/strlen-evex.S
> +++ b/sysdeps/x86_64/multiarch/strlen-evex.S
> @@ -26,466 +26,220 @@
>  #  define STRLEN       __strlen_evex
>  # endif
>
> -# define VMOVA         vmovdqa64
> +# ifndef VEC_SIZE
> +#  include "x86-evex256-vecs.h"
> +# endif
>
>  # ifdef USE_AS_WCSLEN
> -#  define VPCMP                vpcmpd
> +#  define VPCMPEQ      vpcmpeqd
> +#  define VPCMPNEQ     vpcmpneqd
> +#  define VPTESTN      vptestnmd
> +#  define VPTEST       vptestmd
>  #  define VPMINU       vpminud
> -#  define SHIFT_REG ecx
>  #  define CHAR_SIZE    4
> +#  define CHAR_SIZE_SHIFT_REG(reg)     sar $2, %reg
>  # else
> -#  define VPCMP                vpcmpb
> +#  define VPCMPEQ      vpcmpeqb
> +#  define VPCMPNEQ     vpcmpneqb
> +#  define VPTESTN      vptestnmb
> +#  define VPTEST       vptestmb
>  #  define VPMINU       vpminub
> -#  define SHIFT_REG edx
>  #  define CHAR_SIZE    1
> +#  define CHAR_SIZE_SHIFT_REG(reg)
> +
> +#  define REG_WIDTH    VEC_SIZE
>  # endif
>
> -# define XMMZERO       xmm16
> -# define YMMZERO       ymm16
> -# define YMM1          ymm17
> -# define YMM2          ymm18
> -# define YMM3          ymm19
> -# define YMM4          ymm20
> -# define YMM5          ymm21
> -# define YMM6          ymm22
> -
> -# define VEC_SIZE 32
> -# define PAGE_SIZE 4096
> -# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
> -
> -       .section .text.evex,"ax",@progbits
> -ENTRY (STRLEN)
> -# ifdef USE_AS_STRNLEN
> -       /* Check zero length.  */
> -       test    %RSI_LP, %RSI_LP
> -       jz      L(zero)
> -#  ifdef __ILP32__
> -       /* Clear the upper 32 bits.  */
> -       movl    %esi, %esi
> -#  endif
> -       mov     %RSI_LP, %R8_LP
> +# define CHAR_PER_VEC  (VEC_SIZE / CHAR_SIZE)
> +
> +# include "reg-macros.h"
> +
> +# if CHAR_PER_VEC == 64
> +
> +#  define TAIL_RETURN_LBL      first_vec_x2
> +#  define TAIL_RETURN_OFFSET   (CHAR_PER_VEC * 2)
> +
> +#  define FALLTHROUGH_RETURN_LBL       first_vec_x3
> +#  define FALLTHROUGH_RETURN_OFFSET    (CHAR_PER_VEC * 3)
> +
> +# else
> +
> +#  define TAIL_RETURN_LBL      first_vec_x3
> +#  define TAIL_RETURN_OFFSET   (CHAR_PER_VEC * 3)
> +
> +#  define FALLTHROUGH_RETURN_LBL       first_vec_x2
> +#  define FALLTHROUGH_RETURN_OFFSET    (CHAR_PER_VEC * 2)
>  # endif
> +
> +# define XZERO VMM_128(0)
> +# define VZERO VMM(0)
> +# define PAGE_SIZE     4096
> +
> +       .section SECTION(.text), "ax", @progbits
> +ENTRY_P2ALIGN (STRLEN, 6)
>         movl    %edi, %eax
> -       vpxorq  %XMMZERO, %XMMZERO, %XMMZERO
> -       /* Clear high bits from edi. Only keeping bits relevant to page
> -          cross check.  */
> +       vpxorq  %XZERO, %XZERO, %XZERO
>         andl    $(PAGE_SIZE - 1), %eax
> -       /* Check if we may cross page boundary with one vector load.  */
>         cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
>         ja      L(cross_page_boundary)
>
>         /* Check the first VEC_SIZE bytes.  Each bit in K0 represents a
>            null byte.  */
> -       VPCMP   $0, (%rdi), %YMMZERO, %k0
> -       kmovd   %k0, %eax
> -# ifdef USE_AS_STRNLEN
> -       /* If length < CHAR_PER_VEC handle special.  */
> -       cmpq    $CHAR_PER_VEC, %rsi
> -       jbe     L(first_vec_x0)
> -# endif
> -       testl   %eax, %eax
> +       VPCMPEQ (%rdi), %VZERO, %k0
> +       KMOV    %k0, %VRAX
> +       test    %VRAX, %VRAX
>         jz      L(aligned_more)
> -       tzcntl  %eax, %eax
> -       ret
> -# ifdef USE_AS_STRNLEN
> -L(zero):
> -       xorl    %eax, %eax
> -       ret
> -
> -       .p2align 4
> -L(first_vec_x0):
> -       /* Set bit for max len so that tzcnt will return min of max len
> -          and position of first match.  */
> -       btsq    %rsi, %rax
> -       tzcntl  %eax, %eax
> -       ret
> -# endif
> -
> -       .p2align 4
> -L(first_vec_x1):
> -       tzcntl  %eax, %eax
> -       /* Safe to use 32 bit instructions as these are only called for
> -          size = [1, 159].  */
> -# ifdef USE_AS_STRNLEN
> -       /* Use ecx which was computed earlier to compute correct value.
> -        */
> -       leal    -(CHAR_PER_VEC * 4 + 1)(%rcx, %rax), %eax
> -# else
> -       subl    %edx, %edi
> -#  ifdef USE_AS_WCSLEN
> -       /* NB: Divide bytes by 4 to get the wchar_t count.  */
> -       sarl    $2, %edi
> -#  endif
> -       leal    CHAR_PER_VEC(%rdi, %rax), %eax
> -# endif
> -       ret
> -
> -       .p2align 4
> -L(first_vec_x2):
> -       tzcntl  %eax, %eax
> -       /* Safe to use 32 bit instructions as these are only called for
> -          size = [1, 159].  */
> -# ifdef USE_AS_STRNLEN
> -       /* Use ecx which was computed earlier to compute correct value.
> -        */
> -       leal    -(CHAR_PER_VEC * 3 + 1)(%rcx, %rax), %eax
> -# else
> -       subl    %edx, %edi
> -#  ifdef USE_AS_WCSLEN
> -       /* NB: Divide bytes by 4 to get the wchar_t count.  */
> -       sarl    $2, %edi
> -#  endif
> -       leal    (CHAR_PER_VEC * 2)(%rdi, %rax), %eax
> -# endif
> +       bsf     %VRAX, %VRAX
>         ret
>
> -       .p2align 4
> -L(first_vec_x3):
> -       tzcntl  %eax, %eax
> -       /* Safe to use 32 bit instructions as these are only called for
> -          size = [1, 159].  */
> -# ifdef USE_AS_STRNLEN
> -       /* Use ecx which was computed earlier to compute correct value.
> -        */
> -       leal    -(CHAR_PER_VEC * 2 + 1)(%rcx, %rax), %eax
> -# else
> -       subl    %edx, %edi
> -#  ifdef USE_AS_WCSLEN
> -       /* NB: Divide bytes by 4 to get the wchar_t count.  */
> -       sarl    $2, %edi
> -#  endif
> -       leal    (CHAR_PER_VEC * 3)(%rdi, %rax), %eax
> -# endif
> -       ret
> -
> -       .p2align 4
> +       .p2align 4,, 8
>  L(first_vec_x4):
> -       tzcntl  %eax, %eax
> -       /* Safe to use 32 bit instructions as these are only called for
> -          size = [1, 159].  */
> -# ifdef USE_AS_STRNLEN
> -       /* Use ecx which was computed earlier to compute correct value.
> -        */
> -       leal    -(CHAR_PER_VEC + 1)(%rcx, %rax), %eax
> -# else
> -       subl    %edx, %edi
> -#  ifdef USE_AS_WCSLEN
> -       /* NB: Divide bytes by 4 to get the wchar_t count.  */
> -       sarl    $2, %edi
> -#  endif
> +       bsf     %VRAX, %VRAX
> +       subl    %ecx, %edi
> +       CHAR_SIZE_SHIFT_REG (edi)
>         leal    (CHAR_PER_VEC * 4)(%rdi, %rax), %eax
> -# endif
>         ret
>
> -       .p2align 5
> +
> +
> +       /* Aligned more.  For strnlen this compares the remaining
> +          length vs 2 * CHAR_PER_VEC, 4 * CHAR_PER_VEC, and 8 *
> +          CHAR_PER_VEC before going to the loop.  */
> +       .p2align 4,, 10
>  L(aligned_more):
> -       movq    %rdi, %rdx
> -       /* Align data to VEC_SIZE.  */
> -       andq    $-(VEC_SIZE), %rdi
> +       movq    %rdi, %rcx
> +       andq    $(VEC_SIZE * -1), %rdi
>  L(cross_page_continue):
> -       /* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
> -          since data is only aligned to VEC_SIZE.  */
> -# ifdef USE_AS_STRNLEN
> -       /* + CHAR_SIZE because it simplies the logic in
> -          last_4x_vec_or_less.  */
> -       leaq    (VEC_SIZE * 5 + CHAR_SIZE)(%rdi), %rcx
> -       subq    %rdx, %rcx
> -#  ifdef USE_AS_WCSLEN
> -       /* NB: Divide bytes by 4 to get the wchar_t count.  */
> -       sarl    $2, %ecx
> -#  endif
> -# endif
> -       /* Load first VEC regardless.  */
> -       VPCMP   $0, VEC_SIZE(%rdi), %YMMZERO, %k0
> -# ifdef USE_AS_STRNLEN
> -       /* Adjust length. If near end handle specially.  */
> -       subq    %rcx, %rsi
> -       jb      L(last_4x_vec_or_less)
> -# endif
> -       kmovd   %k0, %eax
> -       testl   %eax, %eax
> +       /* Remaining length >= 2 * CHAR_PER_VEC so do VEC0/VEC1 without
> +          rechecking bounds.  */
> +       VPCMPEQ (VEC_SIZE * 1)(%rdi), %VZERO, %k0
> +       KMOV    %k0, %VRAX
> +       test    %VRAX, %VRAX
>         jnz     L(first_vec_x1)
>
> -       VPCMP   $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
> -       kmovd   %k0, %eax
> -       test    %eax, %eax
> +       VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %k0
> +       KMOV    %k0, %VRAX
> +       test    %VRAX, %VRAX
>         jnz     L(first_vec_x2)
>
> -       VPCMP   $0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
> -       kmovd   %k0, %eax
> -       testl   %eax, %eax
> +       VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %k0
> +       KMOV    %k0, %VRAX
> +       test    %VRAX, %VRAX
>         jnz     L(first_vec_x3)
>
> -       VPCMP   $0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0
> -       kmovd   %k0, %eax
> -       testl   %eax, %eax
> +       VPCMPEQ (VEC_SIZE * 4)(%rdi), %VZERO, %k0
> +       KMOV    %k0, %VRAX
> +       test    %VRAX, %VRAX
>         jnz     L(first_vec_x4)
>
> -       addq    $VEC_SIZE, %rdi
> -# ifdef USE_AS_STRNLEN
> -       /* Check if at last VEC_SIZE * 4 length.  */
> -       cmpq    $(CHAR_PER_VEC * 4 - 1), %rsi
> -       jbe     L(last_4x_vec_or_less_load)
> -       movl    %edi, %ecx
> -       andl    $(VEC_SIZE * 4 - 1), %ecx
> -#  ifdef USE_AS_WCSLEN
> -       /* NB: Divide bytes by 4 to get the wchar_t count.  */
> -       sarl    $2, %ecx
> -#  endif
> -       /* Readjust length.  */
> -       addq    %rcx, %rsi
> -# endif
> -       /* Align data to VEC_SIZE * 4.  */
> +       subq    $(VEC_SIZE * -1), %rdi
> +
> +# if CHAR_PER_VEC == 64
> +       /* No partial register stalls on the processors we use evex512
> +          on, and this saves code size.  */
> +       xorb    %dil, %dil
> +# else
>         andq    $-(VEC_SIZE * 4), %rdi
> +# endif
> +
> +
>
>         /* Compare 4 * VEC at a time forward.  */
>         .p2align 4
>  L(loop_4x_vec):
> -       /* Load first VEC regardless.  */
> -       VMOVA   (VEC_SIZE * 4)(%rdi), %YMM1
> -# ifdef USE_AS_STRNLEN
> -       /* Break if at end of length.  */
> -       subq    $(CHAR_PER_VEC * 4), %rsi
> -       jb      L(last_4x_vec_or_less_cmpeq)
> -# endif
> -       /* Save some code size by microfusing VPMINU with the load. Since
> -          the matches in ymm2/ymm4 can only be returned if there where no
> -          matches in ymm1/ymm3 respectively there is no issue with overlap.
> -        */
> -       VPMINU  (VEC_SIZE * 5)(%rdi), %YMM1, %YMM2
> -       VMOVA   (VEC_SIZE * 6)(%rdi), %YMM3
> -       VPMINU  (VEC_SIZE * 7)(%rdi), %YMM3, %YMM4
> +       VMOVA   (VEC_SIZE * 4)(%rdi), %VMM(1)
> +       VPMINU  (VEC_SIZE * 5)(%rdi), %VMM(1), %VMM(2)
> +       VMOVA   (VEC_SIZE * 6)(%rdi), %VMM(3)
> +       VPMINU  (VEC_SIZE * 7)(%rdi), %VMM(3), %VMM(4)
> +       VPTESTN %VMM(2), %VMM(2), %k0
> +       VPTESTN %VMM(4), %VMM(4), %k2
>
> -       VPCMP   $0, %YMM2, %YMMZERO, %k0
> -       VPCMP   $0, %YMM4, %YMMZERO, %k1
>         subq    $-(VEC_SIZE * 4), %rdi
> -       kortestd        %k0, %k1
> +       KORTEST %k0, %k2
>         jz      L(loop_4x_vec)
>
> -       /* Check if end was in first half.  */
> -       kmovd   %k0, %eax
> -       subq    %rdx, %rdi
> -# ifdef USE_AS_WCSLEN
> -       shrq    $2, %rdi
> -# endif
> -       testl   %eax, %eax
> -       jz      L(second_vec_return)
> +       VPTESTN %VMM(1), %VMM(1), %k1
> +       KMOV    %k1, %VRAX
> +       test    %VRAX, %VRAX
> +       jnz     L(first_vec_x0)
>
> -       VPCMP   $0, %YMM1, %YMMZERO, %k2
> -       kmovd   %k2, %edx
> -       /* Combine VEC1 matches (edx) with VEC2 matches (eax).  */
> -# ifdef USE_AS_WCSLEN
> -       sall    $CHAR_PER_VEC, %eax
> -       orl     %edx, %eax
> -       tzcntl  %eax, %eax
> -# else
> -       salq    $CHAR_PER_VEC, %rax
> -       orq     %rdx, %rax
> -       tzcntq  %rax, %rax
> -# endif
> -       addq    %rdi, %rax
> -       ret
> -
> -
> -# ifdef USE_AS_STRNLEN
> -
> -L(last_4x_vec_or_less_load):
> -       /* Depending on entry adjust rdi / prepare first VEC in YMM1.  */
> -       VMOVA   (VEC_SIZE * 4)(%rdi), %YMM1
> -L(last_4x_vec_or_less_cmpeq):
> -       VPCMP   $0, %YMM1, %YMMZERO, %k0
> -       addq    $(VEC_SIZE * 3), %rdi
> -L(last_4x_vec_or_less):
> -       kmovd   %k0, %eax
> -       /* If remaining length > VEC_SIZE * 2. This works if esi is off by
> -          VEC_SIZE * 4.  */
> -       testl   $(CHAR_PER_VEC * 2), %esi
> -       jnz     L(last_4x_vec)
> -
> -       /* length may have been negative or positive by an offset of
> -          CHAR_PER_VEC * 4 depending on where this was called from. This
> -          fixes that.  */
> -       andl    $(CHAR_PER_VEC * 4 - 1), %esi
> -       testl   %eax, %eax
> -       jnz     L(last_vec_x1_check)
> +       KMOV    %k0, %VRAX
> +       test    %VRAX, %VRAX
> +       jnz     L(first_vec_x1)
>
> -       /* Check the end of data.  */
> -       subl    $CHAR_PER_VEC, %esi
> -       jb      L(max)
> +       VPTESTN %VMM(3), %VMM(3), %k0
>
> -       VPCMP   $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
> -       kmovd   %k0, %eax
> -       tzcntl  %eax, %eax
> -       /* Check the end of data.  */
> -       cmpl    %eax, %esi
> -       jb      L(max)
> -
> -       subq    %rdx, %rdi
> -#  ifdef USE_AS_WCSLEN
> -       /* NB: Divide bytes by 4 to get the wchar_t count.  */
> -       sarq    $2, %rdi
> -#  endif
> -       leaq    (CHAR_PER_VEC * 2)(%rdi, %rax), %rax
> -       ret
> -L(max):
> -       movq    %r8, %rax
> -       ret
> -# endif
> -
> -       /* Placed here in strnlen so that the jcc L(last_4x_vec_or_less)
> -          in the 4x VEC loop can use 2 byte encoding.  */
> -       .p2align 4
> -L(second_vec_return):
> -       VPCMP   $0, %YMM3, %YMMZERO, %k0
> -       /* Combine YMM3 matches (k0) with YMM4 matches (k1).  */
> -# ifdef USE_AS_WCSLEN
> -       kunpckbw        %k0, %k1, %k0
> -       kmovd   %k0, %eax
> -       tzcntl  %eax, %eax
> +# if CHAR_PER_VEC == 64
> +       KMOV    %k0, %VRAX
> +       test    %VRAX, %VRAX
> +       jnz     L(first_vec_x2)
> +       KMOV    %k2, %VRAX
>  # else
> -       kunpckdq        %k0, %k1, %k0
> -       kmovq   %k0, %rax
> -       tzcntq  %rax, %rax
> +       /* We can only combine last 2x VEC masks if CHAR_PER_VEC <= 32.
> +        */
> +       kmovd   %k2, %edx
> +       kmovd   %k0, %eax
> +       salq    $CHAR_PER_VEC, %rdx
> +       orq     %rdx, %rax
>  # endif
> -       leaq    (CHAR_PER_VEC * 2)(%rdi, %rax), %rax
> -       ret
>
> -
> -# ifdef USE_AS_STRNLEN
> -L(last_vec_x1_check):
> -       tzcntl  %eax, %eax
> -       /* Check the end of data.  */
> -       cmpl    %eax, %esi
> -       jb      L(max)
> -       subq    %rdx, %rdi
> -#  ifdef USE_AS_WCSLEN
> -       /* NB: Divide bytes by 4 to get the wchar_t count.  */
> -       sarq    $2, %rdi
> -#  endif
> -       leaq    (CHAR_PER_VEC)(%rdi, %rax), %rax
> +       /* first_vec_x3 for strlen-ZMM and first_vec_x2 for strlen-YMM.
> +        */
> +       .p2align 4,, 2
> +L(FALLTHROUGH_RETURN_LBL):
> +       bsfq    %rax, %rax
> +       subq    %rcx, %rdi
> +       CHAR_SIZE_SHIFT_REG (rdi)
> +       leaq    (FALLTHROUGH_RETURN_OFFSET)(%rdi, %rax), %rax
>         ret
>
> -       .p2align 4
> -L(last_4x_vec):
> -       /* Test first 2x VEC normally.  */
> -       testl   %eax, %eax
> -       jnz     L(last_vec_x1)
> -
> -       VPCMP   $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
> -       kmovd   %k0, %eax
> -       testl   %eax, %eax
> -       jnz     L(last_vec_x2)
> -
> -       /* Normalize length.  */
> -       andl    $(CHAR_PER_VEC * 4 - 1), %esi
> -       VPCMP   $0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
> -       kmovd   %k0, %eax
> -       testl   %eax, %eax
> -       jnz     L(last_vec_x3)
> -
> -       /* Check the end of data.  */
> -       subl    $(CHAR_PER_VEC * 3), %esi
> -       jb      L(max)
> -
> -       VPCMP   $0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0
> -       kmovd   %k0, %eax
> -       tzcntl  %eax, %eax
> -       /* Check the end of data.  */
> -       cmpl    %eax, %esi
> -       jb      L(max_end)
> -
> -       subq    %rdx, %rdi
> -#  ifdef USE_AS_WCSLEN
> -       /* NB: Divide bytes by 4 to get the wchar_t count.  */
> -       sarq    $2, %rdi
> -#  endif
> -       leaq    (CHAR_PER_VEC * 4)(%rdi, %rax), %rax
> +       .p2align 4,, 8
> +L(first_vec_x0):
> +       bsf     %VRAX, %VRAX
> +       sub     %rcx, %rdi
> +       CHAR_SIZE_SHIFT_REG (rdi)
> +       addq    %rdi, %rax
>         ret
>
> -       .p2align 4
> -L(last_vec_x1):
> -       tzcntl  %eax, %eax
> -       subq    %rdx, %rdi
> -#  ifdef USE_AS_WCSLEN
> -       /* NB: Divide bytes by 4 to get the wchar_t count.  */
> -       sarq    $2, %rdi
> -#  endif
> +       .p2align 4,, 10
> +L(first_vec_x1):
> +       bsf     %VRAX, %VRAX
> +       sub     %rcx, %rdi
> +       CHAR_SIZE_SHIFT_REG (rdi)
>         leaq    (CHAR_PER_VEC)(%rdi, %rax), %rax
>         ret
>
> -       .p2align 4
> -L(last_vec_x2):
> -       tzcntl  %eax, %eax
> -       subq    %rdx, %rdi
> -#  ifdef USE_AS_WCSLEN
> -       /* NB: Divide bytes by 4 to get the wchar_t count.  */
> -       sarq    $2, %rdi
> -#  endif
> -       leaq    (CHAR_PER_VEC * 2)(%rdi, %rax), %rax
> -       ret
> -
> -       .p2align 4
> -L(last_vec_x3):
> -       tzcntl  %eax, %eax
> -       subl    $(CHAR_PER_VEC * 2), %esi
> -       /* Check the end of data.  */
> -       cmpl    %eax, %esi
> -       jb      L(max_end)
> -       subq    %rdx, %rdi
> -#  ifdef USE_AS_WCSLEN
> -       /* NB: Divide bytes by 4 to get the wchar_t count.  */
> -       sarq    $2, %rdi
> -#  endif
> -       leaq    (CHAR_PER_VEC * 3)(%rdi, %rax), %rax
> -       ret
> -L(max_end):
> -       movq    %r8, %rax
> +       .p2align 4,, 10
> +       /* first_vec_x2 for strlen-ZMM and first_vec_x3 for strlen-YMM.
> +        */
> +L(TAIL_RETURN_LBL):
> +       bsf     %VRAX, %VRAX
> +       sub     %VRCX, %VRDI
> +       CHAR_SIZE_SHIFT_REG (VRDI)
> +       lea     (TAIL_RETURN_OFFSET)(%rdi, %rax), %VRAX
>         ret
> -# endif
>
> -       /* Cold case for crossing page with first load.  */
> -       .p2align 4
> +       .p2align 4,, 8
>  L(cross_page_boundary):
> -       movq    %rdi, %rdx
> +       movq    %rdi, %rcx
>         /* Align data to VEC_SIZE.  */
>         andq    $-VEC_SIZE, %rdi
> -       VPCMP   $0, (%rdi), %YMMZERO, %k0
> -       kmovd   %k0, %eax
> -       /* Remove the leading bytes.  */
> +
> +       VPCMPEQ (%rdi), %VZERO, %k0
> +
> +       KMOV    %k0, %VRAX
>  # ifdef USE_AS_WCSLEN
> -       /* NB: Divide shift count by 4 since each bit in K0 represent 4
> -          bytes.  */
> -       movl    %edx, %ecx
> -       shrl    $2, %ecx
> -       andl    $(CHAR_PER_VEC - 1), %ecx
> -# endif
> -       /* SHIFT_REG is ecx for USE_AS_WCSLEN and edx otherwise.  */
> -       sarxl   %SHIFT_REG, %eax, %eax
> +       movl    %ecx, %edx
> +       shrl    $2, %edx
> +       andl    $(CHAR_PER_VEC - 1), %edx
> +       shrx    %edx, %eax, %eax
>         testl   %eax, %eax
> -# ifndef USE_AS_STRNLEN
> -       jz      L(cross_page_continue)
> -       tzcntl  %eax, %eax
> -       ret
>  # else
> -       jnz     L(cross_page_less_vec)
> -#  ifndef USE_AS_WCSLEN
> -       movl    %edx, %ecx
> -       andl    $(CHAR_PER_VEC - 1), %ecx
> -#  endif
> -       movl    $CHAR_PER_VEC, %eax
> -       subl    %ecx, %eax
> -       /* Check the end of data.  */
> -       cmpq    %rax, %rsi
> -       ja      L(cross_page_continue)
> -       movl    %esi, %eax
> -       ret
> -L(cross_page_less_vec):
> -       tzcntl  %eax, %eax
> -       /* Select min of length and position of first null.  */
> -       cmpq    %rax, %rsi
> -       cmovb   %esi, %eax
> -       ret
> +       shr     %cl, %VRAX
>  # endif
> +       jz      L(cross_page_continue)
> +       bsf     %VRAX, %VRAX
> +       ret
>
>  END (STRLEN)
>  #endif
> diff --git a/sysdeps/x86_64/multiarch/strnlen-evex.S b/sysdeps/x86_64/multiarch/strnlen-evex.S
> index 64a9fc2606..443a32a749 100644
> --- a/sysdeps/x86_64/multiarch/strnlen-evex.S
> +++ b/sysdeps/x86_64/multiarch/strnlen-evex.S
> @@ -1,8 +1,423 @@
> -#ifndef STRNLEN
> -# define STRNLEN __strnlen_evex
> -#endif
> +/* strnlen/wcsnlen optimized with 256-bit EVEX instructions.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <isa-level.h>
> +#include <sysdep.h>
> +
> +#if ISA_SHOULD_BUILD (4)
> +
> +# ifndef VEC_SIZE
> +#  include "x86-evex256-vecs.h"
> +# endif
> +
> +
> +# ifndef STRNLEN
> +#  define STRNLEN      __strnlen_evex
> +# endif
> +
> +# ifdef USE_AS_WCSLEN
> +#  define VPCMPEQ      vpcmpeqd
> +#  define VPCMPNEQ     vpcmpneqd
> +#  define VPTESTN      vptestnmd
> +#  define VPTEST       vptestmd
> +#  define VPMINU       vpminud
> +#  define CHAR_SIZE    4
> +
> +# else
> +#  define VPCMPEQ      vpcmpeqb
> +#  define VPCMPNEQ     vpcmpneqb
> +#  define VPTESTN      vptestnmb
> +#  define VPTEST       vptestmb
> +#  define VPMINU       vpminub
> +#  define CHAR_SIZE    1
> +
> +#  define REG_WIDTH    VEC_SIZE
> +# endif
> +
> +# define CHAR_PER_VEC  (VEC_SIZE / CHAR_SIZE)
> +
> +# include "reg-macros.h"
> +
> +# if CHAR_PER_VEC == 32
> +#  define SUB_SHORT(imm, reg)  subb $(imm), %VGPR_SZ(reg, 8)
> +# else
> +#  define SUB_SHORT(imm, reg)  subl $(imm), %VGPR_SZ(reg, 32)
> +# endif
> +
> +
> +
> +# if CHAR_PER_VEC == 64
> +#  define FALLTHROUGH_RETURN_OFFSET    (CHAR_PER_VEC * 3)
> +# else
> +#  define FALLTHROUGH_RETURN_OFFSET    (CHAR_PER_VEC * 2)
> +# endif
> +
> +
> +# define XZERO VMM_128(0)
> +# define VZERO VMM(0)
> +# define PAGE_SIZE     4096
> +
> +       .section SECTION(.text), "ax", @progbits
> +ENTRY_P2ALIGN (STRNLEN, 6)
> +       /* Check zero length.  */
> +       test    %RSI_LP, %RSI_LP
> +       jz      L(zero)
> +# ifdef __ILP32__
> +       /* Clear the upper 32 bits.  */
> +       movl    %esi, %esi
> +# endif
> +
> +       movl    %edi, %eax
> +       vpxorq  %XZERO, %XZERO, %XZERO
> +       andl    $(PAGE_SIZE - 1), %eax
> +       cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
> +       ja      L(cross_page_boundary)
> +
> +       /* Check the first VEC_SIZE bytes.  Each bit in K0 represents a
> +          null byte.  */
> +       VPCMPEQ (%rdi), %VZERO, %k0
> +
> +       KMOV    %k0, %VRCX
> +       movq    %rsi, %rax
> +
> +       /* If src (rcx) is zero, bsf does not change the result.  NB:
> +          Must use 64-bit bsf here so that upper bits of len are not
> +          cleared.  */
> +       bsfq    %rcx, %rax
> +       /* If rax > CHAR_PER_VEC then rcx must have been zero (no null
> +          CHAR) and rsi must be > CHAR_PER_VEC.  */
> +       cmpq    $CHAR_PER_VEC, %rax
> +       ja      L(more_1x_vec)
> +       /* Check if first match in bounds.  */
> +       cmpq    %rax, %rsi
> +       cmovb   %esi, %eax
> +       ret
> +
> +
> +# if CHAR_PER_VEC != 32
> +       .p2align 4,, 2
> +L(zero):
> +L(max_0):
> +       movl    %esi, %eax
> +       ret
> +# endif
> +
> +       /* Aligned more for strnlen, which compares remaining length
> +          vs 2 * CHAR_PER_VEC, 4 * CHAR_PER_VEC, and 8 * CHAR_PER_VEC
> +          before going to the loop.  */
> +       .p2align 4,, 10
> +L(more_1x_vec):
> +L(cross_page_continue):
> +       /* Compute number of words checked after aligning.  */
> +# ifdef USE_AS_WCSLEN
> +       /* Need to compute directly for wcslen as CHAR_SIZE * rsi can
> +          overflow.  */
> +       movq    %rdi, %rax
> +       andq    $(VEC_SIZE * -1), %rdi
> +       subq    %rdi, %rax
> +       sarq    $2, %rax
> +       leaq    -(CHAR_PER_VEC * 1)(%rax, %rsi), %rax
> +# else
> +       leaq    (VEC_SIZE * -1)(%rsi, %rdi), %rax
> +       andq    $(VEC_SIZE * -1), %rdi
> +       subq    %rdi, %rax
> +# endif
> +
> +
> +       VPCMPEQ VEC_SIZE(%rdi), %VZERO, %k0
> +
> +       cmpq    $(CHAR_PER_VEC * 2), %rax
> +       ja      L(more_2x_vec)
> +
> +L(last_2x_vec_or_less):
> +       KMOV    %k0, %VRDX
> +       test    %VRDX, %VRDX
> +       jnz     L(last_vec_check)
> +
> +       /* Check the end of data.  */
> +       SUB_SHORT (CHAR_PER_VEC, rax)
> +       jbe     L(max_0)
> +       VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %k0
> +       KMOV    %k0, %VRDX
> +       test    %VRDX, %VRDX
> +       jz      L(max_0)
> +       /* Best place for LAST_VEC_CHECK if ZMM.  */
> +       .p2align 4,, 8
> +L(last_vec_check):
> +       bsf     %VRDX, %VRDX
> +       sub     %eax, %edx
> +       lea     (%rsi, %rdx), %eax
> +       cmovae  %esi, %eax
> +       ret
> +
> +# if CHAR_PER_VEC == 32
> +       .p2align 4,, 2
> +L(zero):
> +L(max_0):
> +       movl    %esi, %eax
> +       ret
> +# endif
> +
> +       .p2align 4,, 8
> +L(last_4x_vec_or_less):
> +       addl    $(CHAR_PER_VEC * -4), %eax
> +       VPCMPEQ (VEC_SIZE * 5)(%rdi), %VZERO, %k0
> +       subq    $(VEC_SIZE * -4), %rdi
> +       cmpl    $(CHAR_PER_VEC * 2), %eax
> +       jbe     L(last_2x_vec_or_less)
> +
> +       .p2align 4,, 6
> +L(more_2x_vec):
> +       /* Remaining length >= 2 * CHAR_PER_VEC so do VEC0/VEC1 without
> +          rechecking bounds.  */
>
> -#define USE_AS_STRNLEN 1
> -#define STRLEN STRNLEN
> +       KMOV    %k0, %VRDX
>
> -#include "strlen-evex.S"
> +       test    %VRDX, %VRDX
> +       jnz     L(first_vec_x1)
> +
> +       VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %k0
> +       KMOV    %k0, %VRDX
> +       test    %VRDX, %VRDX
> +       jnz     L(first_vec_x2)
> +
> +       cmpq    $(CHAR_PER_VEC * 4), %rax
> +       ja      L(more_4x_vec)
> +
> +
> +       VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %k0
> +       KMOV    %k0, %VRDX
> +       addl    $(CHAR_PER_VEC * -2), %eax
> +       test    %VRDX, %VRDX
> +       jnz     L(last_vec_check)
> +
> +       subl    $(CHAR_PER_VEC), %eax
> +       jbe     L(max_1)
> +
> +       VPCMPEQ (VEC_SIZE * 4)(%rdi), %VZERO, %k0
> +       KMOV    %k0, %VRDX
> +
> +       test    %VRDX, %VRDX
> +       jnz     L(last_vec_check)
> +L(max_1):
> +       movl    %esi, %eax
> +       ret
> +
> +       .p2align 4,, 3
> +L(first_vec_x2):
> +# if VEC_SIZE == 64
> +       /* If VEC_SIZE == 64 we can fit logic for full return label in
> +          spare bytes before next cache line.  */
> +       bsf     %VRDX, %VRDX
> +       sub     %eax, %esi
> +       leal    (CHAR_PER_VEC * 1)(%rsi, %rdx), %eax
> +       ret
> +       .p2align 4,, 6
> +# else
> +       addl    $CHAR_PER_VEC, %esi
> +# endif
> +L(first_vec_x1):
> +       bsf     %VRDX, %VRDX
> +       sub     %eax, %esi
> +       leal    (CHAR_PER_VEC * 0)(%rsi, %rdx), %eax
> +       ret
> +
> +
> +       .p2align 4,, 6
> +L(first_vec_x4):
> +# if VEC_SIZE == 64
> +       /* If VEC_SIZE == 64 we can fit logic for full return label in
> +          spare bytes before next cache line.  */
> +       bsf     %VRDX, %VRDX
> +       sub     %eax, %esi
> +       leal    (CHAR_PER_VEC * 3)(%rsi, %rdx), %eax
> +       ret
> +       .p2align 4,, 6
> +# else
> +       addl    $CHAR_PER_VEC, %esi
> +# endif
> +L(first_vec_x3):
> +       bsf     %VRDX, %VRDX
> +       sub     %eax, %esi
> +       leal    (CHAR_PER_VEC * 2)(%rsi, %rdx), %eax
> +       ret
> +
> +       .p2align 4,, 5
> +L(more_4x_vec):
> +       VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %k0
> +       KMOV    %k0, %VRDX
> +       test    %VRDX, %VRDX
> +       jnz     L(first_vec_x3)
> +
> +       VPCMPEQ (VEC_SIZE * 4)(%rdi), %VZERO, %k0
> +       KMOV    %k0, %VRDX
> +       test    %VRDX, %VRDX
> +       jnz     L(first_vec_x4)
> +
> +       /* Check if at last VEC_SIZE * 4 length before aligning for the
> +          loop.  */
> +       cmpq    $(CHAR_PER_VEC * 8), %rax
> +       jbe     L(last_4x_vec_or_less)
> +
> +
> +       /* Compute number of words checked after aligning.  */
> +# ifdef USE_AS_WCSLEN
> +       /* Need to compute directly for wcslen as CHAR_SIZE * rsi can
> +          overflow.  */
> +       leaq    (VEC_SIZE * -3)(%rdi), %rdx
> +# else
> +       leaq    (VEC_SIZE * -3)(%rdi, %rax), %rax
> +# endif
> +
> +       subq    $(VEC_SIZE * -1), %rdi
> +
> +       /* Align data to VEC_SIZE * 4.  */
> +# if VEC_SIZE == 64
> +       /* Saves code size.  No evex512 processor has partial register
> +          stalls.  If that changes, this can be replaced with `andq
> +          $-(VEC_SIZE * 4), %rdi`.  */
> +       xorb    %dil, %dil
> +# else
> +       andq    $-(VEC_SIZE * 4), %rdi
> +# endif
> +
> +# ifdef USE_AS_WCSLEN
> +       subq    %rdi, %rdx
> +       sarq    $2, %rdx
> +       addq    %rdx, %rax
> +# else
> +       subq    %rdi, %rax
> +# endif
> +       /* Compare 4 * VEC at a time forward.  */
> +       .p2align 4,, 11
> +L(loop_4x_vec):
> +       VMOVA   (VEC_SIZE * 4)(%rdi), %VMM(1)
> +       VPMINU  (VEC_SIZE * 5)(%rdi), %VMM(1), %VMM(2)
> +       VMOVA   (VEC_SIZE * 6)(%rdi), %VMM(3)
> +       VPMINU  (VEC_SIZE * 7)(%rdi), %VMM(3), %VMM(4)
> +       VPTESTN %VMM(2), %VMM(2), %k0
> +       VPTESTN %VMM(4), %VMM(4), %k2
> +       subq    $-(VEC_SIZE * 4), %rdi
> +       /* Break if at end of length.  */
> +       subq    $(CHAR_PER_VEC * 4), %rax
> +       jbe     L(loop_len_end)
> +
> +
> +       KORTEST %k0, %k2
> +       jz      L(loop_4x_vec)
> +
> +
> +L(loop_last_4x_vec):
> +       movq    %rsi, %rcx
> +       subq    %rax, %rsi
> +       VPTESTN %VMM(1), %VMM(1), %k1
> +       KMOV    %k1, %VRDX
> +       test    %VRDX, %VRDX
> +       jnz     L(last_vec_x0)
> +
> +       KMOV    %k0, %VRDX
> +       test    %VRDX, %VRDX
> +       jnz     L(last_vec_x1)
> +
> +       VPTESTN %VMM(3), %VMM(3), %k0
> +
> +       /* Separate logic for VEC_SIZE == 64 and VEC_SIZE == 32 for
> +          returning last 2x VEC. For VEC_SIZE == 64 we test each VEC
> +          individually, for VEC_SIZE == 32 we combine them in a single
> +          64-bit GPR.  */
> +# if CHAR_PER_VEC == 64
> +       KMOV    %k0, %VRDX
> +       test    %VRDX, %VRDX
> +       jnz     L(last_vec_x2)
> +       KMOV    %k2, %VRDX
> +# else
> +       /* We can only combine last 2x VEC masks if CHAR_PER_VEC <= 32.
> +        */
> +       kmovd   %k2, %edx
> +       kmovd   %k0, %eax
> +       salq    $CHAR_PER_VEC, %rdx
> +       orq     %rax, %rdx
> +# endif
> +
> +       /* first_vec_x3 for strlen-ZMM and first_vec_x2 for strlen-YMM.
> +        */
> +       bsfq    %rdx, %rdx
> +       leaq    (FALLTHROUGH_RETURN_OFFSET - CHAR_PER_VEC * 4)(%rsi, %rdx), %rax
> +       cmpq    %rax, %rcx
> +       cmovb   %rcx, %rax
> +       ret
> +
> +       /* Handle last 4x VEC after loop. All VECs have been loaded.  */
> +       .p2align 4,, 4
> +L(loop_len_end):
> +       KORTEST %k0, %k2
> +       jnz     L(loop_last_4x_vec)
> +       movq    %rsi, %rax
> +       ret
> +
> +
> +# if CHAR_PER_VEC == 64
> +       /* Since we can't combine the last 2x VEC for VEC_SIZE == 64
> +          we need a return label for it.  */
> +       .p2align 4,, 8
> +L(last_vec_x2):
> +       bsf     %VRDX, %VRDX
> +       leaq    (CHAR_PER_VEC * -2)(%rsi, %rdx), %rax
> +       cmpq    %rax, %rcx
> +       cmovb   %rcx, %rax
> +       ret
> +# endif
> +
> +
> +       .p2align 4,, 10
> +L(last_vec_x1):
> +       addq    $CHAR_PER_VEC, %rsi
> +L(last_vec_x0):
> +       bsf     %VRDX, %VRDX
> +       leaq    (CHAR_PER_VEC * -4)(%rsi, %rdx), %rax
> +       cmpq    %rax, %rcx
> +       cmovb   %rcx, %rax
> +       ret
> +
> +
> +       .p2align 4,, 8
> +L(cross_page_boundary):
> +       /* Align data to VEC_SIZE.  */
> +       movq    %rdi, %rcx
> +       andq    $-VEC_SIZE, %rcx
> +       VPCMPEQ (%rcx), %VZERO, %k0
> +
> +       KMOV    %k0, %VRCX
> +# ifdef USE_AS_WCSLEN
> +       shrl    $2, %eax
> +       andl    $(CHAR_PER_VEC - 1), %eax
> +# endif
> +       shrx    %VRAX, %VRCX, %VRCX
> +
> +       negl    %eax
> +       andl    $(CHAR_PER_VEC - 1), %eax
> +       movq    %rsi, %rdx
> +       bsf     %VRCX, %VRDX
> +       cmpq    %rax, %rdx
> +       ja      L(cross_page_continue)
> +       movl    %edx, %eax
> +       cmpq    %rdx, %rsi
> +       cmovb   %esi, %eax
> +       ret
> +END (STRNLEN)
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/wcsnlen-evex.S b/sysdeps/x86_64/multiarch/wcsnlen-evex.S
> index e2aad94c1e..57a7e93fbf 100644
> --- a/sysdeps/x86_64/multiarch/wcsnlen-evex.S
> +++ b/sysdeps/x86_64/multiarch/wcsnlen-evex.S
> @@ -2,8 +2,7 @@
>  # define WCSNLEN       __wcsnlen_evex
>  #endif
>
> -#define STRLEN WCSNLEN
> +#define STRNLEN        WCSNLEN
>  #define USE_AS_WCSLEN 1
> -#define USE_AS_STRNLEN 1
>
> -#include "strlen-evex.S"
> +#include "strnlen-evex.S"
> --
> 2.34.1
>
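
A note on the entry sequence of the new strnlen above (`movq %rsi,
%rax; bsfq %rcx, %rax`): it relies on `bsf` leaving its destination
unchanged when the source is zero.  A minimal C sketch of the same
idea, with illustrative names and an inline-asm stand-in that are not
part of the patch:

    #include <stddef.h>

    /* bsf leaves its destination unchanged when the source is zero,
       so pre-loading the destination with len folds "no null CHAR in
       this VEC" and "null CHAR found" into a single code path.  */
    size_t
    first_null_or_len (unsigned long long mask, size_t len)
    {
      size_t pos = len;
      __asm__ ("bsfq %1, %0" : "+r" (pos) : "r" (mask) : "cc");
      return pos < len ? pos : len;
    }

With mask == 0 the destination still holds len, which is why the code
can run the same bounds check whether or not a null CHAR was found.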

LGTM.

Thanks.

-- 
H.J.

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH v3 4/7] x86: Optimize memrchr-evex.S
  2022-10-19  0:44   ` [PATCH v3 4/7] x86: Optimize memrchr-evex.S Noah Goldstein
@ 2022-10-19 16:58     ` H.J. Lu
  0 siblings, 0 replies; 41+ messages in thread
From: H.J. Lu @ 2022-10-19 16:58 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: libc-alpha, carlos

On Tue, Oct 18, 2022 at 5:44 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Optimizations are:
> 1. Use the fact that lzcnt(0) -> VEC_SIZE for memrchr to save a branch
>    in the short string case (see the sketch after this list).
> 2. Save several instructions in len = [VEC_SIZE, 4 * VEC_SIZE] case.
> 3. Use more code-size efficient instructions.
>         - tzcnt ...     -> bsf ...
>         - vpcmpb $0 ... -> vpcmpeq ...
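
For point 1, a minimal C sketch of the branch being saved
(illustrative names, not the patch's code; `_lzcnt_u32` is defined
for a zero input, unlike `__builtin_clz`, and needs -mlzcnt or a
recent -march):

    #include <immintrin.h>
    #include <stddef.h>

    /* _lzcnt_u32 returns 32 (== VEC_SIZE) when mask == 0, so a
       single compare covers both "no match in this VEC" and "match
       past the end of the buffer"; no separate test/jz is needed.  */
    const char *
    last_match_or_null (const char *last_byte, unsigned int mask,
                        size_t len)
    {
      unsigned int pos = _lzcnt_u32 (mask);
      return len <= pos ? NULL : last_byte - pos;
    }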
>
> Code Size Changes:
> memrchr-evex.S      :  -29 bytes
>
> Net perf changes:
>
> Reported as geometric mean of all improvements / regressions from N=10
> runs of the benchtests. Value as New Time / Old Time, so < 1.0 is an
> improvement and > 1.0 is a regression.
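
For reference, the reported value can be computed as below (a sketch
of the statistic only, not the benchmark harness):

    #include <math.h>

    /* Geometric mean of n ratios (New Time / Old Time); < 1.0 means
       the new version is faster overall.  */
    double
    geomean (const double *ratios, int n)
    {
      double log_sum = 0.0;
      for (int i = 0; i < n; i++)
        log_sum += log (ratios[i]);
      return exp (log_sum / n);
    }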
>
> memrchr-evex.S      : 0.949 (Mostly from improvements in small strings)
>
> Full results attached in email.
>
> Full check passes on x86-64.
> ---
>  sysdeps/x86_64/multiarch/memrchr-evex.S | 538 ++++++++++++++----------
>  1 file changed, 324 insertions(+), 214 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/memrchr-evex.S b/sysdeps/x86_64/multiarch/memrchr-evex.S
> index 550b328c5a..dbcf52808f 100644
> --- a/sysdeps/x86_64/multiarch/memrchr-evex.S
> +++ b/sysdeps/x86_64/multiarch/memrchr-evex.S
> @@ -21,17 +21,19 @@
>  #if ISA_SHOULD_BUILD (4)
>
>  # include <sysdep.h>
> -# include "x86-evex256-vecs.h"
> -# if VEC_SIZE != 32
> -#  error "VEC_SIZE != 32 unimplemented"
> +
> +# ifndef VEC_SIZE
> +#  include "x86-evex256-vecs.h"
>  # endif
>
> +# include "reg-macros.h"
> +
>  # ifndef MEMRCHR
> -#  define MEMRCHR                              __memrchr_evex
> +#  define MEMRCHR      __memrchr_evex
>  # endif
>
> -# define PAGE_SIZE                     4096
> -# define VMMMATCH                      VMM(0)
> +# define PAGE_SIZE     4096
> +# define VMATCH        VMM(0)
>
>         .section SECTION(.text), "ax", @progbits
>  ENTRY_P2ALIGN(MEMRCHR, 6)
> @@ -43,294 +45,402 @@ ENTRY_P2ALIGN(MEMRCHR, 6)
>  # endif
>         jz      L(zero_0)
>
> -       /* Get end pointer. Minus one for two reasons. 1) It is necessary for a
> -          correct page cross check and 2) it correctly sets up end ptr to be
> -          subtract by lzcnt aligned.  */
> +       /* Get end pointer. Minus one for three reasons. 1) It is
> +          necessary for a correct page cross check, 2) it correctly
> +          sets up the end ptr to have lzcnt subtracted from it, and
> +          3) it is a necessary step in aligning the ptr.  */
>         leaq    -1(%rdi, %rdx), %rax
> -       vpbroadcastb %esi, %VMMMATCH
> +       vpbroadcastb %esi, %VMATCH
>
>         /* Check if we can load 1x VEC without cross a page.  */
>         testl   $(PAGE_SIZE - VEC_SIZE), %eax
>         jz      L(page_cross)
>
> -       /* Don't use rax for pointer here because EVEX has better encoding with
> -          offset % VEC_SIZE == 0.  */
> -       vpcmpb  $0, -(VEC_SIZE)(%rdi, %rdx), %VMMMATCH, %k0
> -       kmovd   %k0, %ecx
> -
> -       /* Fall through for rdx (len) <= VEC_SIZE (expect small sizes).  */
> -       cmpq    $VEC_SIZE, %rdx
> -       ja      L(more_1x_vec)
> -L(ret_vec_x0_test):
> -
> -       /* If ecx is zero (no matches) lzcnt will set it 32 (VEC_SIZE) which
> -          will guarantee edx (len) is less than it.  */
> -       lzcntl  %ecx, %ecx
> -       cmpl    %ecx, %edx
> -       jle     L(zero_0)
> -       subq    %rcx, %rax
> +       /* Don't use rax for pointer here because EVEX has better
> +          encoding with offset % VEC_SIZE == 0.  */
> +       vpcmpeqb (VEC_SIZE * -1)(%rdi, %rdx), %VMATCH, %k0
> +       KMOV    %k0, %VRCX
> +
> +       /* If rcx is zero then lzcnt -> VEC_SIZE.  NB: there is
> +          already a dependency between rcx and rsi so no worries about
> +          a false-dep here.  */
> +       lzcnt   %VRCX, %VRSI
> +       /* If rdx <= rsi then either 1) rcx was non-zero (there was a
> +          match) but it was out of bounds or 2) rcx was zero and rdx
> +          was <= VEC_SIZE so we are done scanning.  */
> +       cmpq    %rsi, %rdx
> +       /* NB: Use branch to return zero/non-zero.  Common usage will
> +          branch on result of function (if return is null/non-null).
> +          This branch can be used to predict the ensuing one so there
> +          is no reason to extend the data-dependency with cmovcc.  */
> +       jbe     L(zero_0)
> +
> +       /* If rcx is zero then len must be > VEC_SIZE, otherwise since
> +          we already tested len vs lzcnt(rcx) (in rsi) we are good to
> +          return this match.  */
> +       test    %VRCX, %VRCX
> +       jz      L(more_1x_vec)
> +       subq    %rsi, %rax
>         ret
>
> -       /* Fits in aligning bytes of first cache line.  */
> +       /* Fits in aligning bytes of first cache line for VEC_SIZE ==
> +          32.  */
> +# if VEC_SIZE == 32
> +       .p2align 4,, 2
>  L(zero_0):
>         xorl    %eax, %eax
>         ret
> -
> -       .p2align 4,, 9
> -L(ret_vec_x0_dec):
> -       decq    %rax
> -L(ret_vec_x0):
> -       lzcntl  %ecx, %ecx
> -       subq    %rcx, %rax
> -       ret
> +# endif
>
>         .p2align 4,, 10
>  L(more_1x_vec):
> -       testl   %ecx, %ecx
> -       jnz     L(ret_vec_x0)
> -
>         /* Align rax (pointer to string).  */
>         andq    $-VEC_SIZE, %rax
> -
> +L(page_cross_continue):
>         /* Recompute length after aligning.  */
> -       movq    %rax, %rdx
> +       subq    %rdi, %rax
>
> -       /* Need no matter what.  */
> -       vpcmpb  $0, -(VEC_SIZE)(%rax), %VMMMATCH, %k0
> -       kmovd   %k0, %ecx
> -
> -       subq    %rdi, %rdx
> -
> -       cmpq    $(VEC_SIZE * 2), %rdx
> +       cmpq    $(VEC_SIZE * 2), %rax
>         ja      L(more_2x_vec)
> +
>  L(last_2x_vec):
> +       vpcmpeqb (VEC_SIZE * -1)(%rdi, %rax), %VMATCH, %k0
> +       KMOV    %k0, %VRCX
>
> -       /* Must dec rax because L(ret_vec_x0_test) expects it.  */
> -       decq    %rax
> -       cmpl    $VEC_SIZE, %edx
> -       jbe     L(ret_vec_x0_test)
> +       test    %VRCX, %VRCX
> +       jnz     L(ret_vec_x0_test)
>
> -       testl   %ecx, %ecx
> -       jnz     L(ret_vec_x0)
> +       /* If VEC_SIZE == 64 need to subtract because lzcntq won't
> +          implicitly add VEC_SIZE to match position.  */
> +# if VEC_SIZE == 64
> +       subl    $VEC_SIZE, %eax
> +# else
> +       cmpb    $VEC_SIZE, %al
> +# endif
> +       jle     L(zero_2)
>
> -       /* Don't use rax for pointer here because EVEX has better encoding with
> -          offset % VEC_SIZE == 0.  */
> -       vpcmpb  $0, -(VEC_SIZE * 2)(%rdi, %rdx), %VMMMATCH, %k0
> -       kmovd   %k0, %ecx
> -       /* NB: 64-bit lzcnt. This will naturally add 32 to position.  */
> +       /* We adjusted rax (length) for VEC_SIZE == 64 so we need
> +          separate offsets.  */
> +# if VEC_SIZE == 64
> +       vpcmpeqb (VEC_SIZE * -1)(%rdi, %rax), %VMATCH, %k0
> +# else
> +       vpcmpeqb (VEC_SIZE * -2)(%rdi, %rax), %VMATCH, %k0
> +# endif
> +       KMOV    %k0, %VRCX
> +       /* NB: 64-bit lzcnt. This will naturally add 32 to position for
> +          VEC_SIZE == 32.  */
>         lzcntq  %rcx, %rcx
> -       cmpl    %ecx, %edx
> -       jle     L(zero_0)
> -       subq    %rcx, %rax
> -       ret
> -
> -       /* Inexpensive place to put this regarding code size / target alignments
> -          / ICache NLP. Necessary for 2-byte encoding of jump to page cross
> -          case which in turn is necessary for hot path (len <= VEC_SIZE) to fit
> -          in first cache line.  */
> -L(page_cross):
> -       movq    %rax, %rsi
> -       andq    $-VEC_SIZE, %rsi
> -       vpcmpb  $0, (%rsi), %VMMMATCH, %k0
> -       kmovd   %k0, %r8d
> -       /* Shift out negative alignment (because we are starting from endptr and
> -          working backwards).  */
> -       movl    %eax, %ecx
> -       /* notl because eax already has endptr - 1.  (-x = ~(x - 1)).  */
> -       notl    %ecx
> -       shlxl   %ecx, %r8d, %ecx
> -       cmpq    %rdi, %rsi
> -       ja      L(more_1x_vec)
> -       lzcntl  %ecx, %ecx
> -       cmpl    %ecx, %edx
> -       jle     L(zero_1)
> -       subq    %rcx, %rax
> +       subl    %ecx, %eax
> +       ja      L(first_vec_x1_ret)
> +       /* If VEC_SIZE == 64 put L(zero_0) here as it can't fit in the
> +          first cache line (this is the second cache line).  */
> +# if VEC_SIZE == 64
> +L(zero_0):
> +# endif
> +L(zero_2):
> +       xorl    %eax, %eax
>         ret
>
> -       /* Continue creating zero labels that fit in aligning bytes and get
> -          2-byte encoding / are in the same cache line as condition.  */
> -L(zero_1):
> -       xorl    %eax, %eax
> +       /* NB: Fits in aligning bytes before next cache line for
> +          VEC_SIZE == 32.  For VEC_SIZE == 64 this is attached to
> +          L(first_vec_x0_test).  */
> +# if VEC_SIZE == 32
> +L(first_vec_x1_ret):
> +       leaq    -1(%rdi, %rax), %rax
>         ret
> +# endif
>
> -       .p2align 4,, 8
> -L(ret_vec_x1):
> -       /* This will naturally add 32 to position.  */
> -       bsrl    %ecx, %ecx
> -       leaq    -(VEC_SIZE * 2)(%rcx, %rax), %rax
> +       .p2align 4,, 6
> +L(ret_vec_x0_test):
> +       lzcnt   %VRCX, %VRCX
> +       subl    %ecx, %eax
> +       jle     L(zero_2)
> +# if VEC_SIZE == 64
> +       /* Reuse code at the end of L(ret_vec_x0_test) as we can't fit
> +          L(first_vec_x1_ret) in the same cache line as its jmp base
> +          so we might as well save code size.  */
> +L(first_vec_x1_ret):
> +# endif
> +       leaq    -1(%rdi, %rax), %rax
>         ret
>
> -       .p2align 4,, 8
> +       .p2align 4,, 6
> +L(loop_last_4x_vec):
> +       /* Compute remaining length.  */
> +       subl    %edi, %eax
> +L(last_4x_vec):
> +       cmpl    $(VEC_SIZE * 2), %eax
> +       jle     L(last_2x_vec)
> +# if VEC_SIZE == 32
> +       /* Only align for VEC_SIZE == 32.  For VEC_SIZE == 64 we need
> +          the spare bytes to align the loop properly.  */
> +       .p2align 4,, 10
> +# endif
>  L(more_2x_vec):
> -       testl   %ecx, %ecx
> -       jnz     L(ret_vec_x0_dec)
>
> -       vpcmpb  $0, -(VEC_SIZE * 2)(%rax), %VMMMATCH, %k0
> -       kmovd   %k0, %ecx
> -       testl   %ecx, %ecx
> -       jnz     L(ret_vec_x1)
> +       /* Length > VEC_SIZE * 2 so check the first 2x VEC for match and
> +          return if either hit.  */
> +       vpcmpeqb (VEC_SIZE * -1)(%rdi, %rax), %VMATCH, %k0
> +       KMOV    %k0, %VRCX
> +
> +       test    %VRCX, %VRCX
> +       jnz     L(first_vec_x0)
> +
> +       vpcmpeqb (VEC_SIZE * -2)(%rdi, %rax), %VMATCH, %k0
> +       KMOV    %k0, %VRCX
> +       test    %VRCX, %VRCX
> +       jnz     L(first_vec_x1)
>
>         /* Need no matter what.  */
> -       vpcmpb  $0, -(VEC_SIZE * 3)(%rax), %VMMMATCH, %k0
> -       kmovd   %k0, %ecx
> +       vpcmpeqb (VEC_SIZE * -3)(%rdi, %rax), %VMATCH, %k0
> +       KMOV    %k0, %VRCX
>
> -       subq    $(VEC_SIZE * 4), %rdx
> +       /* Check if we are near the end.  */
> +       subq    $(VEC_SIZE * 4), %rax
>         ja      L(more_4x_vec)
>
> -       cmpl    $(VEC_SIZE * -1), %edx
> -       jle     L(ret_vec_x2_test)
> -L(last_vec):
> -       testl   %ecx, %ecx
> -       jnz     L(ret_vec_x2)
> +       test    %VRCX, %VRCX
> +       jnz     L(first_vec_x2_test)
>
> +       /* Adjust length for final check and check if we are at the end.
> +        */
> +       addl    $(VEC_SIZE * 1), %eax
> +       jle     L(zero_1)
>
> -       /* Need no matter what.  */
> -       vpcmpb  $0, -(VEC_SIZE * 4)(%rax), %VMMMATCH, %k0
> -       kmovd   %k0, %ecx
> -       lzcntl  %ecx, %ecx
> -       subq    $(VEC_SIZE * 3 + 1), %rax
> -       subq    %rcx, %rax
> -       cmpq    %rax, %rdi
> -       ja      L(zero_1)
> +       vpcmpeqb (VEC_SIZE * -1)(%rdi, %rax), %VMATCH, %k0
> +       KMOV    %k0, %VRCX
> +
> +       lzcnt   %VRCX, %VRCX
> +       subl    %ecx, %eax
> +       ja      L(first_vec_x3_ret)
> +L(zero_1):
> +       xorl    %eax, %eax
> +       ret
> +L(first_vec_x3_ret):
> +       leaq    -1(%rdi, %rax), %rax
>         ret
>
> -       .p2align 4,, 8
> -L(ret_vec_x2_test):
> -       lzcntl  %ecx, %ecx
> -       subq    $(VEC_SIZE * 2 + 1), %rax
> -       subq    %rcx, %rax
> -       cmpq    %rax, %rdi
> -       ja      L(zero_1)
> +       .p2align 4,, 6
> +L(first_vec_x2_test):
> +       /* Must adjust length before check.  */
> +       subl    $-(VEC_SIZE * 2 - 1), %eax
> +       lzcnt   %VRCX, %VRCX
> +       subl    %ecx, %eax
> +       jl      L(zero_4)
> +       addq    %rdi, %rax
>         ret
>
> -       .p2align 4,, 8
> -L(ret_vec_x2):
> -       bsrl    %ecx, %ecx
> -       leaq    -(VEC_SIZE * 3)(%rcx, %rax), %rax
> +
> +       .p2align 4,, 10
> +L(first_vec_x0):
> +       bsr     %VRCX, %VRCX
> +       leaq    (VEC_SIZE * -1)(%rdi, %rax), %rax
> +       addq    %rcx, %rax
>         ret
>
> -       .p2align 4,, 8
> -L(ret_vec_x3):
> -       bsrl    %ecx, %ecx
> -       leaq    -(VEC_SIZE * 4)(%rcx, %rax), %rax
> +       /* Fits unobtrusively here.  */
> +L(zero_4):
> +       xorl    %eax, %eax
> +       ret
> +
> +       .p2align 4,, 10
> +L(first_vec_x1):
> +       bsr     %VRCX, %VRCX
> +       leaq    (VEC_SIZE * -2)(%rdi, %rax), %rax
> +       addq    %rcx, %rax
>         ret
>
>         .p2align 4,, 8
> +L(first_vec_x3):
> +       bsr     %VRCX, %VRCX
> +       addq    %rdi, %rax
> +       addq    %rcx, %rax
> +       ret
> +
> +       .p2align 4,, 6
> +L(first_vec_x2):
> +       bsr     %VRCX, %VRCX
> +       leaq    (VEC_SIZE * 1)(%rdi, %rax), %rax
> +       addq    %rcx, %rax
> +       ret
> +
> +       .p2align 4,, 2
>  L(more_4x_vec):
> -       testl   %ecx, %ecx
> -       jnz     L(ret_vec_x2)
> +       test    %VRCX, %VRCX
> +       jnz     L(first_vec_x2)
>
> -       vpcmpb  $0, -(VEC_SIZE * 4)(%rax), %VMMMATCH, %k0
> -       kmovd   %k0, %ecx
> +       vpcmpeqb (%rdi, %rax), %VMATCH, %k0
> +       KMOV    %k0, %VRCX
>
> -       testl   %ecx, %ecx
> -       jnz     L(ret_vec_x3)
> +       test    %VRCX, %VRCX
> +       jnz     L(first_vec_x3)
>
>         /* Check if near end before re-aligning (otherwise might do an
>            unnecessary loop iteration).  */
> -       addq    $-(VEC_SIZE * 4), %rax
> -       cmpq    $(VEC_SIZE * 4), %rdx
> +       cmpq    $(VEC_SIZE * 4), %rax
>         jbe     L(last_4x_vec)
>
> -       decq    %rax
> -       andq    $-(VEC_SIZE * 4), %rax
> -       movq    %rdi, %rdx
> -       /* Get endptr for loop in rdx. NB: Can't just do while rax > rdi because
> -          lengths that overflow can be valid and break the comparison.  */
> -       andq    $-(VEC_SIZE * 4), %rdx
> +
> +       /* NB: We set up the loop to NOT use index-address-mode for
> +          the buffer.  This costs some instructions & code size but
> +          avoids stalls caused by unlaminated micro-fused instructions
> +          (as used in the loop) being forced to issue in the same
> +          group (essentially narrowing the backend width).  */
> +
> +       /* Get endptr for loop in rdx. NB: Can't just do while rax > rdi
> +          because lengths that overflow can be valid and break the
> +          comparison.  */
> +# if VEC_SIZE == 64
> +       /* Use rdx as an intermediate to compute rax; this gets us
> +          imm8 encoding, which just allows the L(more_4x_vec) block
> +          to fit in 1 cache-line.  */
> +       leaq    (VEC_SIZE * 4)(%rdi), %rdx
> +       leaq    (VEC_SIZE * -1)(%rdx, %rax), %rax
> +
> +       /* No evex machine has partial register stalls. This can be
> +          replaced with: `andq $(VEC_SIZE * -4), %rax/%rdx` if that
> +          changes.  */
> +       xorb    %al, %al
> +       xorb    %dl, %dl
> +# else
> +       leaq    (VEC_SIZE * 3)(%rdi, %rax), %rax
> +       andq    $(VEC_SIZE * -4), %rax
> +       leaq    (VEC_SIZE * 4)(%rdi), %rdx
> +       andq    $(VEC_SIZE * -4), %rdx
> +# endif
> +
>
>         .p2align 4
>  L(loop_4x_vec):
> -       /* Store 1 were not-equals and 0 where equals in k1 (used to mask later
> -          on).  */
> -       vpcmpb  $4, (VEC_SIZE * 3)(%rax), %VMMMATCH, %k1
> +       /* NB: We could do the same optimization here as we do for
> +          memchr/rawmemchr by using VEX encoding in the loop for access
> +          to VEX vpcmpeqb + vpternlogd.  Since memrchr is not as hot as
> +          memchr it may not be worth the extra code size, but if the
> +          need arises it is an easy ~15% perf improvement to the loop.  */
> +
> +       cmpq    %rdx, %rax
> +       je      L(loop_last_4x_vec)
> +       /* Store 1 where not-equals and 0 where equals in k1 (used to
> +          mask later on).  */
> +       vpcmpb  $4, (VEC_SIZE * -1)(%rax), %VMATCH, %k1
>
>         /* VEC(2/3) will have zero-byte where we found a CHAR.  */
> -       vpxorq  (VEC_SIZE * 2)(%rax), %VMMMATCH, %VMM(2)
> -       vpxorq  (VEC_SIZE * 1)(%rax), %VMMMATCH, %VMM(3)
> -       vpcmpb  $0, (VEC_SIZE * 0)(%rax), %VMMMATCH, %k4
> +       vpxorq  (VEC_SIZE * -2)(%rax), %VMATCH, %VMM(2)
> +       vpxorq  (VEC_SIZE * -3)(%rax), %VMATCH, %VMM(3)
> +       vpcmpeqb (VEC_SIZE * -4)(%rax), %VMATCH, %k4
>
> -       /* Combine VEC(2/3) with min and maskz with k1 (k1 has zero bit where
> -          CHAR is found and VEC(2/3) have zero-byte where CHAR is found.  */
> +       /* Combine VEC(2/3) with min and maskz with k1 (k1 has zero bit
> +          where CHAR is found and VEC(2/3) have zero-byte where CHAR
> +          is found).  */
>         vpminub %VMM(2), %VMM(3), %VMM(3){%k1}{z}
>         vptestnmb %VMM(3), %VMM(3), %k2
>
> -       /* Any 1s and we found CHAR.  */
> -       kortestd %k2, %k4
> -       jnz     L(loop_end)
> -
>         addq    $-(VEC_SIZE * 4), %rax
> -       cmpq    %rdx, %rax
> -       jne     L(loop_4x_vec)
>
> -       /* Need to re-adjust rdx / rax for L(last_4x_vec).  */
> -       subq    $-(VEC_SIZE * 4), %rdx
> -       movq    %rdx, %rax
> -       subl    %edi, %edx
> -L(last_4x_vec):
> +       /* Any 1s and we found CHAR.  */
> +       KORTEST %k2, %k4
> +       jz      L(loop_4x_vec)
> +
>
> -       /* Used no matter what.  */
> -       vpcmpb  $0, (VEC_SIZE * -1)(%rax), %VMMMATCH, %k0
> -       kmovd   %k0, %ecx
> +       /* K1 has non-matches for first VEC. inc; jz will overflow rcx
> +          iff all bytes were non-matches.  */
> +       KMOV    %k1, %VRCX
> +       inc     %VRCX
> +       jnz     L(first_vec_x0_end)
>
> -       cmpl    $(VEC_SIZE * 2), %edx
> -       jbe     L(last_2x_vec)
> +       vptestnmb %VMM(2), %VMM(2), %k0
> +       KMOV    %k0, %VRCX
> +       test    %VRCX, %VRCX
> +       jnz     L(first_vec_x1_end)
> +       KMOV    %k2, %VRCX
> +
> +       /* Separate logic for VEC_SIZE == 64 and VEC_SIZE == 32 for
> +          returning last 2x VEC. For VEC_SIZE == 64 we test each VEC
> +          individually, for VEC_SIZE == 32 we combine them in a single
> +          64-bit GPR.  */
> +# if VEC_SIZE == 64
> +       test    %VRCX, %VRCX
> +       jnz     L(first_vec_x2_end)
> +       KMOV    %k4, %VRCX
> +# else
> +       /* Combine last 2 VEC matches for VEC_SIZE == 32. If rcx (from
> +          VEC(3)) is zero (no CHAR in VEC(3)) then it won't affect the
> +          result in rsi (from VEC(4)). If rcx is non-zero then CHAR in
> +          VEC(3) and bsrq will use that position.  */
> +       KMOV    %k4, %VRSI
> +       salq    $32, %rcx
> +       orq     %rsi, %rcx
> +# endif
> +       bsrq    %rcx, %rcx
> +       addq    %rcx, %rax
> +       ret
>
> -       testl   %ecx, %ecx
> -       jnz     L(ret_vec_x0_dec)
> +       .p2align 4,, 4
> +L(first_vec_x0_end):
> +       /* rcx has 1s at non-matches so we need to `not` it. We used
> +          `inc` to test for zero, so use `neg` to complete the `not`
> +          so the last 1 bit represents a match.  NB: (-(x + 1) == ~x).  */
> +       neg     %VRCX
> +       bsr     %VRCX, %VRCX
> +       leaq    (VEC_SIZE * 3)(%rcx, %rax), %rax
> +       ret
>
> +       .p2align 4,, 10
> +L(first_vec_x1_end):
> +       bsr     %VRCX, %VRCX
> +       leaq    (VEC_SIZE * 2)(%rcx, %rax), %rax
> +       ret
>
> -       vpcmpb  $0, (VEC_SIZE * -2)(%rax), %VMMMATCH, %k0
> -       kmovd   %k0, %ecx
> +# if VEC_SIZE == 64
> +       /* Since we can't combine the last 2x VEC for VEC_SIZE == 64
> +          we need a return label for it.  */
> +       .p2align 4,, 4
> +L(first_vec_x2_end):
> +       bsr     %VRCX, %VRCX
> +       leaq    (VEC_SIZE * 1)(%rcx, %rax), %rax
> +       ret
> +# endif
>
> -       testl   %ecx, %ecx
> -       jnz     L(ret_vec_x1)
>
> -       /* Used no matter what.  */
> -       vpcmpb  $0, (VEC_SIZE * -3)(%rax), %VMMMATCH, %k0
> -       kmovd   %k0, %ecx
> +       .p2align 4,, 4
> +L(page_cross):
> +       /* Only the lower eax[log2(VEC_SIZE):0] bits are set, so we
> +          can use movzbl to get the number of bytes we are checking
> +          here.  */
> +       movzbl  %al, %ecx
> +       andq    $-VEC_SIZE, %rax
> +       vpcmpeqb (%rax), %VMATCH, %k0
> +       KMOV    %k0, %VRSI
>
> -       cmpl    $(VEC_SIZE * 3), %edx
> -       ja      L(last_vec)
> +       /* eax was computed as %rdi + %rdx - 1 so we need to add back
> +          1 here.  */
> +       leal    1(%rcx), %r8d
>
> -       lzcntl  %ecx, %ecx
> -       subq    $(VEC_SIZE * 2 + 1), %rax
> -       subq    %rcx, %rax
> -       cmpq    %rax, %rdi
> -       jbe     L(ret_1)
> +       /* Invert ecx to get shift count for byte matches out of range.
> +        */
> +       notl    %ecx
> +       shlx    %VRCX, %VRSI, %VRSI
> +
> +       /* If rdx <= r8 then the entire [buf, buf + len] range is
> +          handled in this page cross case.  NB: we can't use the trick
> +          here that we use in the non page-cross case because we
> +          aren't checking a full VEC_SIZE.  */
> +       cmpq    %r8, %rdx
> +       ja      L(page_cross_check)
> +       lzcnt   %VRSI, %VRSI
> +       subl    %esi, %edx
> +       ja      L(page_cross_ret)
>         xorl    %eax, %eax
> -L(ret_1):
>         ret
>
> -       .p2align 4,, 6
> -L(loop_end):
> -       kmovd   %k1, %ecx
> -       notl    %ecx
> -       testl   %ecx, %ecx
> -       jnz     L(ret_vec_x0_end)
> +L(page_cross_check):
> +       test    %VRSI, %VRSI
> +       jz      L(page_cross_continue)
>
> -       vptestnmb %VMM(2), %VMM(2), %k0
> -       kmovd   %k0, %ecx
> -       testl   %ecx, %ecx
> -       jnz     L(ret_vec_x1_end)
> -
> -       kmovd   %k2, %ecx
> -       kmovd   %k4, %esi
> -       /* Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3)
> -          then it won't affect the result in esi (VEC4). If ecx is non-zero
> -          then CHAR in VEC3 and bsrq will use that position.  */
> -       salq    $32, %rcx
> -       orq     %rsi, %rcx
> -       bsrq    %rcx, %rcx
> -       addq    %rcx, %rax
> -       ret
> -       .p2align 4,, 4
> -L(ret_vec_x0_end):
> -       addq    $(VEC_SIZE), %rax
> -L(ret_vec_x1_end):
> -       bsrl    %ecx, %ecx
> -       leaq    (VEC_SIZE * 2)(%rax, %rcx), %rax
> +       lzcnt   %VRSI, %VRSI
> +       subl    %esi, %edx
> +L(page_cross_ret):
> +       leaq    -1(%rdi, %rdx), %rax
>         ret
> -
>  END(MEMRCHR)
>  #endif
> --
> 2.34.1
>
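
One detail from the loop setup above: when VEC_SIZE == 64, `xorb
%al, %al` and `xorb %dl, %dl` stand in for `andq $(VEC_SIZE * -4),
%rax/%rdx`.  A sketch of why the two forms are equivalent
(illustrative code, not from the patch):

    #include <assert.h>
    #include <stdint.h>

    int
    main (void)
    {
      /* VEC_SIZE * 4 == 256 when VEC_SIZE == 64, so aligning down
         to a 256-byte boundary is exactly clearing the low 8 bits,
         which `xorb %al, %al` does in a 2-byte instruction.  */
      uint64_t p = 0x7fffdeadbea5;
      assert ((p & ~(uint64_t) 0xff) == (p & -(uint64_t) (64 * 4)));
      return 0;
    }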

LGTM.

Thanks.

-- 
H.J.

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH v3 5/7] x86: Optimize strrchr-evex.S and implement with VMM headers
  2022-10-19  0:44   ` [PATCH v3 5/7] x86: Optimize strrchr-evex.S and implement with VMM headers Noah Goldstein
@ 2022-10-19 16:58     ` H.J. Lu
  0 siblings, 0 replies; 41+ messages in thread
From: H.J. Lu @ 2022-10-19 16:58 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: libc-alpha, carlos

On Tue, Oct 18, 2022 at 5:44 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Optimization is:
> 1. Cache the latest result in the "fast path" loop with `vmovdqu`
>    instead of `kunpckdq`.  This helps if there is more than one match
>    (see the sketch below).
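
A scalar sketch of the caching idea (hypothetical stand-in code; the
patch caches a whole VEC with `vmovdqu` and resolves the exact
position once at the end, instead of paying a `kunpckdq` mask merge
on every hit):

    #include <stddef.h>

    /* "Chunks" here are single chars for brevity (the real code
       works on whole VECs); the c == '\0' special case is ignored.  */
    char *
    strrchr_sketch (const char *s, int c)
    {
      const char *last = NULL;      /* analogous to the cached hit VEC */
      for (; *s != '\0'; s++)
        if (*s == (char) c)
          last = s;                 /* cheap remember, no per-hit merge */
      return (char *) last;
    }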
>
> Code Size Changes:
> strrchr-evex.S       :  +30 bytes (Same number of cache lines)
>
> Net perf changes:
>
> Reported as geometric mean of all improvements / regressions from N=10
> runs of the benchtests. Value as New Time / Old Time, so < 1.0 is an
> improvement and > 1.0 is a regression.
>
> strrchr-evex.S       : 0.932 (From cases with higher match frequency)
>
> Full results attached in email.
>
> Full check passes on x86-64.
> ---
>  sysdeps/x86_64/multiarch/strrchr-evex.S | 371 +++++++++++++-----------
>  1 file changed, 200 insertions(+), 171 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/strrchr-evex.S b/sysdeps/x86_64/multiarch/strrchr-evex.S
> index 992b45fb47..45487dc87a 100644
> --- a/sysdeps/x86_64/multiarch/strrchr-evex.S
> +++ b/sysdeps/x86_64/multiarch/strrchr-evex.S
> @@ -26,25 +26,30 @@
>  #  define STRRCHR      __strrchr_evex
>  # endif
>
> -# define VMOVU vmovdqu64
> -# define VMOVA vmovdqa64
> +# include "x86-evex256-vecs.h"
>
>  # ifdef USE_AS_WCSRCHR
> -#  define SHIFT_REG    esi
> -
> -#  define kunpck       kunpckbw
> +#  define RCX_M        cl
> +#  define SHIFT_REG    rcx
> +#  define VPCOMPRESS   vpcompressd
> +#  define kunpck_2x    kunpckbw
>  #  define kmov_2x      kmovd
>  #  define maskz_2x     ecx
>  #  define maskm_2x     eax
>  #  define CHAR_SIZE    4
>  #  define VPMIN        vpminud
>  #  define VPTESTN      vptestnmd
> +#  define VPTEST       vptestmd
>  #  define VPBROADCAST  vpbroadcastd
> +#  define VPCMPEQ      vpcmpeqd
>  #  define VPCMP        vpcmpd
> -# else
> -#  define SHIFT_REG    edi
>
> -#  define kunpck       kunpckdq
> +#  define USE_WIDE_CHAR
> +# else
> +#  define RCX_M        ecx
> +#  define SHIFT_REG    rdi
> +#  define VPCOMPRESS   vpcompressb
> +#  define kunpck_2x    kunpckdq
>  #  define kmov_2x      kmovq
>  #  define maskz_2x     rcx
>  #  define maskm_2x     rax
> @@ -52,58 +57,48 @@
>  #  define CHAR_SIZE    1
>  #  define VPMIN        vpminub
>  #  define VPTESTN      vptestnmb
> +#  define VPTEST       vptestmb
>  #  define VPBROADCAST  vpbroadcastb
> +#  define VPCMPEQ      vpcmpeqb
>  #  define VPCMP        vpcmpb
>  # endif
>
> -# define XMMZERO       xmm16
> -# define YMMZERO       ymm16
> -# define YMMMATCH      ymm17
> -# define YMMSAVE       ymm18
> +# include "reg-macros.h"
>
> -# define YMM1  ymm19
> -# define YMM2  ymm20
> -# define YMM3  ymm21
> -# define YMM4  ymm22
> -# define YMM5  ymm23
> -# define YMM6  ymm24
> -# define YMM7  ymm25
> -# define YMM8  ymm26
> -
> -
> -# define VEC_SIZE      32
> +# define VMATCH        VMM(0)
> +# define CHAR_PER_VEC  (VEC_SIZE / CHAR_SIZE)
>  # define PAGE_SIZE     4096
> -       .section .text.evex, "ax", @progbits
> -ENTRY(STRRCHR)
> +
> +       .section SECTION(.text), "ax", @progbits
> +ENTRY_P2ALIGN(STRRCHR, 6)
>         movl    %edi, %eax
> -       /* Broadcast CHAR to YMMMATCH.  */
> -       VPBROADCAST %esi, %YMMMATCH
> +       /* Broadcast CHAR to VMATCH.  */
> +       VPBROADCAST %esi, %VMATCH
>
>         andl    $(PAGE_SIZE - 1), %eax
>         cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
>         jg      L(cross_page_boundary)
>
> -L(page_cross_continue):
> -       VMOVU   (%rdi), %YMM1
> -       /* k0 has a 1 for each zero CHAR in YMM1.  */
> -       VPTESTN %YMM1, %YMM1, %k0
> -       kmovd   %k0, %ecx
> -       testl   %ecx, %ecx
> +       VMOVU   (%rdi), %VMM(1)
> +       /* k0 has a 1 for each zero CHAR in VEC(1).  */
> +       VPTESTN %VMM(1), %VMM(1), %k0
> +       KMOV    %k0, %VRSI
> +       test    %VRSI, %VRSI
>         jz      L(aligned_more)
>         /* fallthrough: zero CHAR in first VEC.  */
> -
> -       /* K1 has a 1 for each search CHAR match in YMM1.  */
> -       VPCMP   $0, %YMMMATCH, %YMM1, %k1
> -       kmovd   %k1, %eax
> +L(page_cross_return):
> +       /* K1 has a 1 for each search CHAR match in VEC(1).  */
> +       VPCMPEQ %VMATCH, %VMM(1), %k1
> +       KMOV    %k1, %VRAX
>         /* Build mask up until first zero CHAR (used to mask off
>            potential search CHAR matches past the end of the string).
>          */
> -       blsmskl %ecx, %ecx
> -       andl    %ecx, %eax
> +       blsmsk  %VRSI, %VRSI
> +       and     %VRSI, %VRAX
>         jz      L(ret0)
> -       /* Get last match (the `andl` removed any out of bounds
> -          matches).  */
> -       bsrl    %eax, %eax
> +       /* Get last match (the `and` removed any out of bounds matches).
> +        */
> +       bsr     %VRAX, %VRAX
>  # ifdef USE_AS_WCSRCHR
>         leaq    (%rdi, %rax, CHAR_SIZE), %rax
>  # else
> @@ -116,22 +111,22 @@ L(ret0):
>            search path for earlier matches.  */
>         .p2align 4,, 6
>  L(first_vec_x1):
> -       VPCMP   $0, %YMMMATCH, %YMM2, %k1
> -       kmovd   %k1, %eax
> -       blsmskl %ecx, %ecx
> +       VPCMPEQ %VMATCH, %VMM(2), %k1
> +       KMOV    %k1, %VRAX
> +       blsmsk  %VRCX, %VRCX
>         /* eax non-zero if search CHAR in range.  */
> -       andl    %ecx, %eax
> +       and     %VRCX, %VRAX
>         jnz     L(first_vec_x1_return)
>
> -       /* fallthrough: no match in YMM2 then need to check for earlier
> -          matches (in YMM1).  */
> +       /* fallthrough: no match in VEC(2) then need to check for
> +          earlier matches (in VEC(1)).  */
>         .p2align 4,, 4
>  L(first_vec_x0_test):
> -       VPCMP   $0, %YMMMATCH, %YMM1, %k1
> -       kmovd   %k1, %eax
> -       testl   %eax, %eax
> +       VPCMPEQ %VMATCH, %VMM(1), %k1
> +       KMOV    %k1, %VRAX
> +       test    %VRAX, %VRAX
>         jz      L(ret1)
> -       bsrl    %eax, %eax
> +       bsr     %VRAX, %VRAX
>  # ifdef USE_AS_WCSRCHR
>         leaq    (%rsi, %rax, CHAR_SIZE), %rax
>  # else
> @@ -142,129 +137,144 @@ L(ret1):
>
>         .p2align 4,, 10
>  L(first_vec_x1_or_x2):
> -       VPCMP   $0, %YMM3, %YMMMATCH, %k3
> -       VPCMP   $0, %YMM2, %YMMMATCH, %k2
> +       VPCMPEQ %VMM(3), %VMATCH, %k3
> +       VPCMPEQ %VMM(2), %VMATCH, %k2
>         /* K2 and K3 have 1 for any search CHAR match. Test if any
> -          matches between either of them. Otherwise check YMM1.  */
> -       kortestd %k2, %k3
> +          matches between either of them. Otherwise check VEC(1).  */
> +       KORTEST %k2, %k3
>         jz      L(first_vec_x0_test)
>
> -       /* Guranteed that YMM2 and YMM3 are within range so merge the
> -          two bitmasks then get last result.  */
> -       kunpck  %k2, %k3, %k3
> -       kmovq   %k3, %rax
> -       bsrq    %rax, %rax
> -       leaq    (VEC_SIZE)(%r8, %rax, CHAR_SIZE), %rax
> +       /* Guaranteed that VEC(2) and VEC(3) are within range so merge
> +          the two bitmasks then get last result.  */
> +       kunpck_2x %k2, %k3, %k3
> +       kmov_2x %k3, %maskm_2x
> +       bsr     %maskm_2x, %maskm_2x
> +       leaq    (VEC_SIZE * 1)(%r8, %rax, CHAR_SIZE), %rax
>         ret
>
> -       .p2align 4,, 6
> +       .p2align 4,, 7
>  L(first_vec_x3):
> -       VPCMP   $0, %YMMMATCH, %YMM4, %k1
> -       kmovd   %k1, %eax
> -       blsmskl %ecx, %ecx
> -       /* If no search CHAR match in range check YMM1/YMM2/YMM3.  */
> -       andl    %ecx, %eax
> +       VPCMPEQ %VMATCH, %VMM(4), %k1
> +       KMOV    %k1, %VRAX
> +       blsmsk  %VRCX, %VRCX
> +       /* If no search CHAR match in range, check VEC(1)/VEC(2)/VEC(3).
> +        */
> +       and     %VRCX, %VRAX
>         jz      L(first_vec_x1_or_x2)
> -       bsrl    %eax, %eax
> +       bsr     %VRAX, %VRAX
>         leaq    (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
>         ret
>
> +
>         .p2align 4,, 6
>  L(first_vec_x0_x1_test):
> -       VPCMP   $0, %YMMMATCH, %YMM2, %k1
> -       kmovd   %k1, %eax
> -       /* Check YMM2 for last match first. If no match try YMM1.  */
> -       testl   %eax, %eax
> +       VPCMPEQ %VMATCH, %VMM(2), %k1
> +       KMOV    %k1, %VRAX
> +       /* Check VEC(2) for last match first. If no match try VEC(1).
> +        */
> +       test    %VRAX, %VRAX
>         jz      L(first_vec_x0_test)
>         .p2align 4,, 4
>  L(first_vec_x1_return):
> -       bsrl    %eax, %eax
> +       bsr     %VRAX, %VRAX
>         leaq    (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
>         ret
>
> +
>         .p2align 4,, 10
>  L(first_vec_x2):
> -       VPCMP   $0, %YMMMATCH, %YMM3, %k1
> -       kmovd   %k1, %eax
> -       blsmskl %ecx, %ecx
> -       /* Check YMM3 for last match first. If no match try YMM2/YMM1.
> -        */
> -       andl    %ecx, %eax
> +       VPCMPEQ %VMATCH, %VMM(3), %k1
> +       KMOV    %k1, %VRAX
> +       blsmsk  %VRCX, %VRCX
> +       /* Check VEC(3) for last match first. If no match try
> +          VEC(2)/VEC(1).  */
> +       and     %VRCX, %VRAX
>         jz      L(first_vec_x0_x1_test)
> -       bsrl    %eax, %eax
> +       bsr     %VRAX, %VRAX
>         leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
>         ret
>
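
The blsmsk/and/bsr pattern in these return paths is compact but
subtle, so a minimal C model may help (sketch only; 32-CHAR vector and
illustrative names assumed):

    #include <stdint.h>

    /* `match' has one bit per CHAR equal to the search CHAR, `null'
       one bit per zero CHAR.  blsmsk(x) == x ^ (x - 1) sets every
       bit up to and including the lowest set bit, so the AND keeps
       only matches at or before the null terminator.  */
    static inline int
    last_match_before_null (uint32_t match, uint32_t null)
    {
      uint32_t bound = null ^ (null - 1);	/* blsmsk %VRCX, %VRCX */
      match &= bound;				/* and    %VRCX, %VRAX */
      if (match == 0)
        return -1;				/* jz: try earlier VECs */
      return 31 - __builtin_clz (match);	/* bsr    %VRAX, %VRAX */
    }
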
>
> -       .p2align 4
> +       .p2align 4,, 12
>  L(aligned_more):
> -       /* Need to keep original pointer incase YMM1 has last match.  */
> +L(page_cross_continue):
> +       /* Need to keep original pointer in case VEC(1) has last match.
> +        */
>         movq    %rdi, %rsi
>         andq    $-VEC_SIZE, %rdi
> -       VMOVU   VEC_SIZE(%rdi), %YMM2
> -       VPTESTN %YMM2, %YMM2, %k0
> -       kmovd   %k0, %ecx
> -       testl   %ecx, %ecx
> +
> +       VMOVU   VEC_SIZE(%rdi), %VMM(2)
> +       VPTESTN %VMM(2), %VMM(2), %k0
> +       KMOV    %k0, %VRCX
> +
> +       test    %VRCX, %VRCX
>         jnz     L(first_vec_x1)
>
> -       VMOVU   (VEC_SIZE * 2)(%rdi), %YMM3
> -       VPTESTN %YMM3, %YMM3, %k0
> -       kmovd   %k0, %ecx
> -       testl   %ecx, %ecx
> +       VMOVU   (VEC_SIZE * 2)(%rdi), %VMM(3)
> +       VPTESTN %VMM(3), %VMM(3), %k0
> +       KMOV    %k0, %VRCX
> +
> +       test    %VRCX, %VRCX
>         jnz     L(first_vec_x2)
>
> -       VMOVU   (VEC_SIZE * 3)(%rdi), %YMM4
> -       VPTESTN %YMM4, %YMM4, %k0
> -       kmovd   %k0, %ecx
> +       VMOVU   (VEC_SIZE * 3)(%rdi), %VMM(4)
> +       VPTESTN %VMM(4), %VMM(4), %k0
> +       KMOV    %k0, %VRCX
>         movq    %rdi, %r8
> -       testl   %ecx, %ecx
> +       test    %VRCX, %VRCX
>         jnz     L(first_vec_x3)
>
>         andq    $-(VEC_SIZE * 2), %rdi
> -       .p2align 4
> +       .p2align 4,, 10
>  L(first_aligned_loop):
> -       /* Preserve YMM1, YMM2, YMM3, and YMM4 until we can gurantee
> -          they don't store a match.  */
> -       VMOVA   (VEC_SIZE * 4)(%rdi), %YMM5
> -       VMOVA   (VEC_SIZE * 5)(%rdi), %YMM6
> +       /* Preserve VEC(1), VEC(2), VEC(3), and VEC(4) until we can
> +          guarantee they don't store a match.  */
> +       VMOVA   (VEC_SIZE * 4)(%rdi), %VMM(5)
> +       VMOVA   (VEC_SIZE * 5)(%rdi), %VMM(6)
>
> -       VPCMP   $0, %YMM5, %YMMMATCH, %k2
> -       vpxord  %YMM6, %YMMMATCH, %YMM7
> +       VPCMPEQ %VMM(5), %VMATCH, %k2
> +       vpxord  %VMM(6), %VMATCH, %VMM(7)
>
> -       VPMIN   %YMM5, %YMM6, %YMM8
> -       VPMIN   %YMM8, %YMM7, %YMM7
> +       VPMIN   %VMM(5), %VMM(6), %VMM(8)
> +       VPMIN   %VMM(8), %VMM(7), %VMM(7)
>
> -       VPTESTN %YMM7, %YMM7, %k1
> +       VPTESTN %VMM(7), %VMM(7), %k1
>         subq    $(VEC_SIZE * -2), %rdi
> -       kortestd %k1, %k2
> +       KORTEST %k1, %k2
>         jz      L(first_aligned_loop)
>
> -       VPCMP   $0, %YMM6, %YMMMATCH, %k3
> -       VPTESTN %YMM8, %YMM8, %k1
> -       ktestd  %k1, %k1
> +       VPCMPEQ %VMM(6), %VMATCH, %k3
> +       VPTESTN %VMM(8), %VMM(8), %k1
> +
> +       /* If k1 is zero, then we found a CHAR match but no null-term.
> +          We can now safely throw out VEC1-4.  */
> +       KTEST   %k1, %k1
>         jz      L(second_aligned_loop_prep)
>
> -       kortestd %k2, %k3
> +       KORTEST %k2, %k3
>         jnz     L(return_first_aligned_loop)
>
> +
>         .p2align 4,, 6
>  L(first_vec_x1_or_x2_or_x3):
> -       VPCMP   $0, %YMM4, %YMMMATCH, %k4
> -       kmovd   %k4, %eax
> -       testl   %eax, %eax
> +       VPCMPEQ %VMM(4), %VMATCH, %k4
> +       KMOV    %k4, %VRAX
> +       bsr     %VRAX, %VRAX
>         jz      L(first_vec_x1_or_x2)
> -       bsrl    %eax, %eax
>         leaq    (VEC_SIZE * 3)(%r8, %rax, CHAR_SIZE), %rax
>         ret
>
> +
>         .p2align 4,, 8
>  L(return_first_aligned_loop):
> -       VPTESTN %YMM5, %YMM5, %k0
> -       kunpck  %k0, %k1, %k0
> +       VPTESTN %VMM(5), %VMM(5), %k0
> +
> +       /* Combined results from VEC5/6.  */
> +       kunpck_2x %k0, %k1, %k0
>         kmov_2x %k0, %maskz_2x
>
>         blsmsk  %maskz_2x, %maskz_2x
> -       kunpck  %k2, %k3, %k3
> +       kunpck_2x %k2, %k3, %k3
>         kmov_2x %k3, %maskm_2x
>         and     %maskz_2x, %maskm_2x
>         jz      L(first_vec_x1_or_x2_or_x3)
> @@ -280,47 +290,62 @@ L(return_first_aligned_loop):
>  L(second_aligned_loop_prep):
>  L(second_aligned_loop_set_furthest_match):
>         movq    %rdi, %rsi
> -       kunpck  %k2, %k3, %k4
> -
> +       /* Ideally we would save k2/k3 but `kmov/kunpck` take uops on
> +          port0 and have noticeable overhead in the loop.  */
> +       VMOVA   %VMM(5), %VMM(7)
> +       VMOVA   %VMM(6), %VMM(8)
>         .p2align 4
>  L(second_aligned_loop):
> -       VMOVU   (VEC_SIZE * 4)(%rdi), %YMM1
> -       VMOVU   (VEC_SIZE * 5)(%rdi), %YMM2
> -
> -       VPCMP   $0, %YMM1, %YMMMATCH, %k2
> -       vpxord  %YMM2, %YMMMATCH, %YMM3
> +       VMOVU   (VEC_SIZE * 4)(%rdi), %VMM(5)
> +       VMOVU   (VEC_SIZE * 5)(%rdi), %VMM(6)
> +       VPCMPEQ %VMM(5), %VMATCH, %k2
> +       vpxord  %VMM(6), %VMATCH, %VMM(3)
>
> -       VPMIN   %YMM1, %YMM2, %YMM4
> -       VPMIN   %YMM3, %YMM4, %YMM3
> +       VPMIN   %VMM(5), %VMM(6), %VMM(4)
> +       VPMIN   %VMM(3), %VMM(4), %VMM(3)
>
> -       VPTESTN %YMM3, %YMM3, %k1
> +       VPTESTN %VMM(3), %VMM(3), %k1
>         subq    $(VEC_SIZE * -2), %rdi
> -       kortestd %k1, %k2
> +       KORTEST %k1, %k2
>         jz      L(second_aligned_loop)
> -
> -       VPCMP   $0, %YMM2, %YMMMATCH, %k3
> -       VPTESTN %YMM4, %YMM4, %k1
> -       ktestd  %k1, %k1
> +       VPCMPEQ %VMM(6), %VMATCH, %k3
> +       VPTESTN %VMM(4), %VMM(4), %k1
> +       KTEST   %k1, %k1
>         jz      L(second_aligned_loop_set_furthest_match)
>
> -       kortestd %k2, %k3
> -       /* branch here because there is a significant advantage interms
> -          of output dependency chance in using edx.  */
> +       /* branch here because we know we have a match in VEC7/8 but
> +          might not in VEC5/6 so the latter is expected to be less
> +          likely.  */
> +       KORTEST %k2, %k3
>         jnz     L(return_new_match)
> +
>  L(return_old_match):
> -       kmovq   %k4, %rax
> -       bsrq    %rax, %rax
> -       leaq    (VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %rax
> +       VPCMPEQ %VMM(8), %VMATCH, %k0
> +       KMOV    %k0, %VRCX
> +       bsr     %VRCX, %VRCX
> +       jnz     L(return_old_match_ret)
> +
> +       VPCMPEQ %VMM(7), %VMATCH, %k0
> +       KMOV    %k0, %VRCX
> +       bsr     %VRCX, %VRCX
> +       subq    $VEC_SIZE, %rsi
> +L(return_old_match_ret):
> +       leaq    (VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %rax
>         ret
>
> +       .p2align 4,, 10
>  L(return_new_match):
> -       VPTESTN %YMM1, %YMM1, %k0
> -       kunpck  %k0, %k1, %k0
> +       VPTESTN %VMM(5), %VMM(5), %k0
> +
> +       /* Combined results from VEC5/6.  */
> +       kunpck_2x %k0, %k1, %k0
>         kmov_2x %k0, %maskz_2x
>
>         blsmsk  %maskz_2x, %maskz_2x
> -       kunpck  %k2, %k3, %k3
> +       kunpck_2x %k2, %k3, %k3
>         kmov_2x %k3, %maskm_2x
> +
> +       /* Match at end was out-of-bounds so use last known match.  */
>         and     %maskz_2x, %maskm_2x
>         jz      L(return_old_match)
>
> @@ -328,49 +353,53 @@ L(return_new_match):
>         leaq    (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
>         ret
>
> +       .p2align 4,, 4
>  L(cross_page_boundary):
> -       /* eax contains all the page offset bits of src (rdi). `xor rdi,
> -          rax` sets pointer will all page offset bits cleared so
> -          offset of (PAGE_SIZE - VEC_SIZE) will get last aligned VEC
> -          before page cross (guranteed to be safe to read). Doing this
> -          as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax` saves
> -          a bit of code size.  */
>         xorq    %rdi, %rax
> -       VMOVU   (PAGE_SIZE - VEC_SIZE)(%rax), %YMM1
> -       VPTESTN %YMM1, %YMM1, %k0
> -       kmovd   %k0, %ecx
> +       mov     $-1, %VRDX
> +       VMOVU   (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(6)
> +       VPTESTN %VMM(6), %VMM(6), %k0
> +       KMOV    %k0, %VRSI
> +
> +# ifdef USE_AS_WCSRCHR
> +       movl    %edi, %ecx
> +       and     $(VEC_SIZE - 1), %ecx
> +       shrl    $2, %ecx
> +# endif
> +       shlx    %VGPR(SHIFT_REG), %VRDX, %VRDX
>
> -       /* Shift out zero CHAR matches that are before the begining of
> -          src (rdi).  */
>  # ifdef USE_AS_WCSRCHR
> -       movl    %edi, %esi
> -       andl    $(VEC_SIZE - 1), %esi
> -       shrl    $2, %esi
> +       kmovb   %edx, %k1
> +# else
> +       KMOV    %VRDX, %k1
>  # endif
> -       shrxl   %SHIFT_REG, %ecx, %ecx
>
> -       testl   %ecx, %ecx
> +       /* Need to adjust result to VEC(1) so it can be re-used by
> +          L(return_vec_x0_test).  The alternative is to collect VEC(1)
> +          with a page cross load which is far more expensive.  */
> +       VPCOMPRESS %VMM(6), %VMM(1){%k1}{z}
> +
> +       /* We could technically just jmp back after the vpcompress but
> +          it doesn't save any 16-byte blocks.  */
> +       shrx    %VGPR(SHIFT_REG), %VRSI, %VRSI
> +       test    %VRSI, %VRSI
>         jz      L(page_cross_continue)
>
> -       /* Found zero CHAR so need to test for search CHAR.  */
> -       VPCMP   $0, %YMMMATCH, %YMM1, %k1
> -       kmovd   %k1, %eax
> -       /* Shift out search CHAR matches that are before the begining of
> -          src (rdi).  */
> -       shrxl   %SHIFT_REG, %eax, %eax
> -
> -       /* Check if any search CHAR match in range.  */
> -       blsmskl %ecx, %ecx
> -       andl    %ecx, %eax
> -       jz      L(ret3)
> -       bsrl    %eax, %eax
> +       /* Duplicate of return logic from ENTRY. Doesn't cause spill to
> +          next cache line so might as well copy it here.  */
> +       VPCMPEQ %VMATCH, %VMM(1), %k1
> +       KMOV    %k1, %VRAX
> +       blsmsk  %VRSI, %VRSI
> +       and     %VRSI, %VRAX
> +       jz      L(ret_page_cross)
> +       bsr     %VRAX, %VRAX
>  # ifdef USE_AS_WCSRCHR
>         leaq    (%rdi, %rax, CHAR_SIZE), %rax
>  # else
>         addq    %rdi, %rax
>  # endif
> -L(ret3):
> +L(ret_page_cross):
>         ret
> -
> +       /* 1 byte till next cache line.  */
>  END(STRRCHR)
>  #endif
> --
> 2.34.1
>
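One more note for future readers: the L(cross_page_boundary) sequence
reads the last aligned VEC before the page boundary (always safe) and
then shifts out the lanes that sit before the true start of the
string.  A scalar sketch of just the masking step (hypothetical
32-byte VEC with byte CHARs; the wide-char path additionally divides
the shift count by 4):

    #include <stdint.h>

    /* `raw' is the per-lane mask from comparing the aligned load at
       (page_end - 32).  Lanes below the string start are garbage and
       get shifted out, so bit 0 afterwards is the first real CHAR.  */
    static inline uint32_t
    shift_out_before_start (uint32_t raw, uintptr_t src)
    {
      unsigned disp = (unsigned) (src & 31);	/* offset into the VEC */
      return raw >> disp;			/* shrx */
    }
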

LGTM.

Thanks.

-- 
H.J.

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH v3 6/7] x86: Add support for VEC_SIZE == 64 in strcmp-evex.S impl
  2022-10-19  0:44   ` [PATCH v3 6/7] x86: Add support for VEC_SIZE == 64 in strcmp-evex.S impl Noah Goldstein
@ 2022-10-19 16:59     ` H.J. Lu
  0 siblings, 0 replies; 41+ messages in thread
From: H.J. Lu @ 2022-10-19 16:59 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: libc-alpha, carlos

On Tue, Oct 18, 2022 at 5:44 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Unused at the moment, but evex512 strcmp, strncmp, strcasecmp{l}, and
> strncasecmp{l} functions can be added by including strcmp-evex.S with
> "x86-evex512-vecs.h" defined.
>
> In addition save a bit of code size in a few places.
>
> 1. tzcnt ...         -> bsf ...
> 2. vpcmp{b|d} $0 ... -> vpcmpeq{b|d}
>
> This saves a touch of code size but has minimal net effect.
>
> Full check passes on x86-64.
> ---
>  sysdeps/x86_64/multiarch/strcmp-evex.S | 676 ++++++++++++++++---------
>  1 file changed, 430 insertions(+), 246 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
> index e482d0167f..756a3bb8d6 100644
> --- a/sysdeps/x86_64/multiarch/strcmp-evex.S
> +++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
> @@ -20,6 +20,10 @@
>
>  #if ISA_SHOULD_BUILD (4)
>
> +# ifndef VEC_SIZE
> +#  include "x86-evex256-vecs.h"
> +# endif
> +
>  # define STRCMP_ISA    _evex
>  # include "strcmp-naming.h"
>
> @@ -35,41 +39,57 @@
>  # define PAGE_SIZE     4096
>
>         /* VEC_SIZE = Number of bytes in a ymm register.  */
> -# define VEC_SIZE      32
>  # define CHAR_PER_VEC  (VEC_SIZE       /       SIZE_OF_CHAR)
>
> -# define VMOVU vmovdqu64
> -# define VMOVA vmovdqa64
> -
>  # ifdef USE_AS_WCSCMP
> -#  define TESTEQ       subl $0xff,
>         /* Compare packed dwords.  */
>  #  define VPCMP        vpcmpd
> +#  define VPCMPEQ      vpcmpeqd
>  #  define VPMINU       vpminud
>  #  define VPTESTM      vptestmd
>  #  define VPTESTNM     vptestnmd
>         /* 1 dword char == 4 bytes.  */
>  #  define SIZE_OF_CHAR 4
> +
> +#  define TESTEQ       sub $((1 << CHAR_PER_VEC) - 1),
> +
> +#  define USE_WIDE_CHAR
>  # else
> -#  define TESTEQ       incl
>         /* Compare packed bytes.  */
>  #  define VPCMP        vpcmpb
> +#  define VPCMPEQ      vpcmpeqb
>  #  define VPMINU       vpminub
>  #  define VPTESTM      vptestmb
>  #  define VPTESTNM     vptestnmb
>         /* 1 byte char == 1 byte.  */
>  #  define SIZE_OF_CHAR 1
> +
> +#  define TESTEQ       inc
> +# endif
> +
> +# include "reg-macros.h"
> +
> +# if VEC_SIZE == 64
> +#  define RODATA_SECTION       rodata.cst64
> +# else
> +#  define RODATA_SECTION       rodata.cst32
> +# endif
> +
> +# if CHAR_PER_VEC == 64
> +#  define FALLTHROUGH_RETURN_OFFSET    (VEC_SIZE * 3)
> +# else
> +#  define FALLTHROUGH_RETURN_OFFSET    (VEC_SIZE * 2)
>  # endif
>
>  # ifdef USE_AS_STRNCMP
> -#  define LOOP_REG     r9d
> +#  define LOOP_REG     VR9
>  #  define LOOP_REG64   r9
>
>  #  define OFFSET_REG8  r9b
>  #  define OFFSET_REG   r9d
>  #  define OFFSET_REG64 r9
>  # else
> -#  define LOOP_REG     edx
> +#  define LOOP_REG     VRDX
>  #  define LOOP_REG64   rdx
>
>  #  define OFFSET_REG8  dl
> @@ -83,32 +103,6 @@
>  #  define VEC_OFFSET   (-VEC_SIZE)
>  # endif
>
> -# define XMM0  xmm17
> -# define XMM1  xmm18
> -
> -# define XMM10 xmm27
> -# define XMM11 xmm28
> -# define XMM12 xmm29
> -# define XMM13 xmm30
> -# define XMM14 xmm31
> -
> -
> -# define YMM0  ymm17
> -# define YMM1  ymm18
> -# define YMM2  ymm19
> -# define YMM3  ymm20
> -# define YMM4  ymm21
> -# define YMM5  ymm22
> -# define YMM6  ymm23
> -# define YMM7  ymm24
> -# define YMM8  ymm25
> -# define YMM9  ymm26
> -# define YMM10 ymm27
> -# define YMM11 ymm28
> -# define YMM12 ymm29
> -# define YMM13 ymm30
> -# define YMM14 ymm31
> -
>  # ifdef USE_AS_STRCASECMP_L
>  #  define BYTE_LOOP_REG        OFFSET_REG
>  # else
> @@ -125,61 +119,72 @@
>  #  endif
>  # endif
>
> -# define LCASE_MIN_YMM %YMM12
> -# define LCASE_MAX_YMM %YMM13
> -# define CASE_ADD_YMM  %YMM14
> +# define LCASE_MIN_V   VMM(12)
> +# define LCASE_MAX_V   VMM(13)
> +# define CASE_ADD_V    VMM(14)
>
> -# define LCASE_MIN_XMM %XMM12
> -# define LCASE_MAX_XMM %XMM13
> -# define CASE_ADD_XMM  %XMM14
> +# if VEC_SIZE == 64
> +#  define LCASE_MIN_YMM        VMM_256(12)
> +#  define LCASE_MAX_YMM        VMM_256(13)
> +#  define CASE_ADD_YMM VMM_256(14)
> +# endif
> +
> +# define LCASE_MIN_XMM VMM_128(12)
> +# define LCASE_MAX_XMM VMM_128(13)
> +# define CASE_ADD_XMM  VMM_128(14)
>
>         /* NB: wcsncmp uses r11 but strcasecmp is never used in
>            conjunction with wcscmp.  */
>  # define TOLOWER_BASE  %r11
>
>  # ifdef USE_AS_STRCASECMP_L
> -#  define _REG(x, y) x ## y
> -#  define REG(x, y) _REG(x, y)
> -#  define TOLOWER(reg1, reg2, ext)                                                                             \
> -       vpsubb  REG(LCASE_MIN_, ext), reg1, REG(%ext, 10);                                      \
> -       vpsubb  REG(LCASE_MIN_, ext), reg2, REG(%ext, 11);                                      \
> -       vpcmpub $1, REG(LCASE_MAX_, ext), REG(%ext, 10), %k5;                           \
> -       vpcmpub $1, REG(LCASE_MAX_, ext), REG(%ext, 11), %k6;                           \
> -       vpaddb  reg1, REG(CASE_ADD_, ext), reg1{%k5};                                           \
> -       vpaddb  reg2, REG(CASE_ADD_, ext), reg2{%k6}
> -
> -#  define TOLOWER_gpr(src, dst) movl (TOLOWER_BASE, src, 4), dst
> -#  define TOLOWER_YMM(...)     TOLOWER(__VA_ARGS__, YMM)
> -#  define TOLOWER_XMM(...)     TOLOWER(__VA_ARGS__, XMM)
> -
> -#  define CMP_R1_R2(s1_reg, s2_reg, reg_out, ext)                                              \
> -       TOLOWER (s1_reg, s2_reg, ext);                                                                          \
> -       VPCMP   $0, s1_reg, s2_reg, reg_out
> -
> -#  define CMP_R1_S2(s1_reg, s2_mem, s2_reg, reg_out, ext)                              \
> -       VMOVU   s2_mem, s2_reg;                                                                                         \
> -       CMP_R1_R2(s1_reg, s2_reg, reg_out, ext)
> -
> -#  define CMP_R1_R2_YMM(...) CMP_R1_R2(__VA_ARGS__, YMM)
> -#  define CMP_R1_R2_XMM(...) CMP_R1_R2(__VA_ARGS__, XMM)
> -
> -#  define CMP_R1_S2_YMM(...) CMP_R1_S2(__VA_ARGS__, YMM)
> -#  define CMP_R1_S2_XMM(...) CMP_R1_S2(__VA_ARGS__, XMM)
> +#  define _REG(x, y)   x ## y
> +#  define REG(x, y)    _REG(x, y)
> +#  define TOLOWER(reg1, reg2, ext, vec_macro)  \
> +       vpsubb  %REG(LCASE_MIN_, ext), reg1, %vec_macro(10);    \
> +       vpsubb  %REG(LCASE_MIN_, ext), reg2, %vec_macro(11);    \
> +       vpcmpub $1, %REG(LCASE_MAX_, ext), %vec_macro(10), %k5; \
> +       vpcmpub $1, %REG(LCASE_MAX_, ext), %vec_macro(11), %k6; \
> +       vpaddb  reg1, %REG(CASE_ADD_, ext), reg1{%k5};  \
> +       vpaddb  reg2, %REG(CASE_ADD_, ext), reg2{%k6}
> +
> +#  define TOLOWER_gpr(src, dst)        movl (TOLOWER_BASE, src, 4), dst
> +#  define TOLOWER_VMM(...)     TOLOWER(__VA_ARGS__, V, VMM)
> +#  define TOLOWER_YMM(...)     TOLOWER(__VA_ARGS__, YMM, VMM_256)
> +#  define TOLOWER_XMM(...)     TOLOWER(__VA_ARGS__, XMM, VMM_128)
> +
> +#  define CMP_R1_R2(s1_reg, s2_reg, reg_out, ext, vec_macro)   \
> +       TOLOWER (s1_reg, s2_reg, ext, vec_macro);       \
> +       VPCMPEQ s1_reg, s2_reg, reg_out
> +
> +#  define CMP_R1_S2(s1_reg, s2_mem, s2_reg, reg_out, ext, vec_macro)   \
> +       VMOVU   s2_mem, s2_reg; \
> +       CMP_R1_R2 (s1_reg, s2_reg, reg_out, ext, vec_macro)
> +
> +#  define CMP_R1_R2_VMM(...)   CMP_R1_R2(__VA_ARGS__, V, VMM)
> +#  define CMP_R1_R2_YMM(...)   CMP_R1_R2(__VA_ARGS__, YMM, VMM_256)
> +#  define CMP_R1_R2_XMM(...)   CMP_R1_R2(__VA_ARGS__, XMM, VMM_128)
> +
> +#  define CMP_R1_S2_VMM(...)   CMP_R1_S2(__VA_ARGS__, V, VMM)
> +#  define CMP_R1_S2_YMM(...)   CMP_R1_S2(__VA_ARGS__, YMM, VMM_256)
> +#  define CMP_R1_S2_XMM(...)   CMP_R1_S2(__VA_ARGS__, XMM, VMM_128)
>
>  # else
>  #  define TOLOWER_gpr(...)
> +#  define TOLOWER_VMM(...)
>  #  define TOLOWER_YMM(...)
>  #  define TOLOWER_XMM(...)
>
> -#  define CMP_R1_R2_YMM(s1_reg, s2_reg, reg_out)                                               \
> -       VPCMP   $0, s2_reg, s1_reg, reg_out
> +#  define CMP_R1_R2_VMM(s1_reg, s2_reg, reg_out)       \
> +       VPCMPEQ s2_reg, s1_reg, reg_out
>
> -#  define CMP_R1_R2_XMM(...) CMP_R1_R2_YMM(__VA_ARGS__)
> +#  define CMP_R1_R2_YMM(...)   CMP_R1_R2_VMM(__VA_ARGS__)
> +#  define CMP_R1_R2_XMM(...)   CMP_R1_R2_VMM(__VA_ARGS__)
>
> -#  define CMP_R1_S2_YMM(s1_reg, s2_mem, unused, reg_out)                               \
> -       VPCMP   $0, s2_mem, s1_reg, reg_out
> -
> -#  define CMP_R1_S2_XMM(...) CMP_R1_S2_YMM(__VA_ARGS__)
> +#  define CMP_R1_S2_VMM(s1_reg, s2_mem, unused, reg_out)       \
> +       VPCMPEQ s2_mem, s1_reg, reg_out
> +#  define CMP_R1_S2_YMM(...)   CMP_R1_S2_VMM(__VA_ARGS__)
> +#  define CMP_R1_S2_XMM(...)   CMP_R1_S2_VMM(__VA_ARGS__)
>  # endif
>
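
For anyone decoding the TOLOWER macro: per byte it is the classic
branchless ASCII lower-casing, built from the 0x41/0x1a/0x20 tables
defined further down.  A scalar model (sketch only):

    /* vpsubb biases by 'A' (0x41); vpcmpub $1 is unsigned `<', so
       the mask selects exactly 'A'..'Z'; the masked vpaddb then adds
       0x20 only in those lanes.  */
    static inline unsigned char
    tolower_ascii (unsigned char c)
    {
      if ((unsigned char) (c - 0x41) < 0x1a)	/* vpsubb + vpcmpub $1 */
        c += 0x20;				/* vpaddb ... {%k5} */
      return c;
    }

The unsigned compare after the bias is what lets one comparison cover
both ends of the 'A'..'Z' range.
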
>  /* Warning!
> @@ -203,7 +208,7 @@
>     the maximum offset is reached before a difference is found, zero is
>     returned.  */
>
> -       .section .text.evex, "ax", @progbits
> +       .section SECTION(.text), "ax", @progbits
>         .align  16
>         .type   STRCMP, @function
>         .globl  STRCMP
> @@ -232,7 +237,7 @@ STRCMP:
>  #  else
>         mov     (%LOCALE_REG), %RAX_LP
>  #  endif
> -       testl   $1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax)
> +       testb   $1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax)
>         jne     STRCASECMP_L_NONASCII
>         leaq    _nl_C_LC_CTYPE_tolower + 128 * 4(%rip), TOLOWER_BASE
>  # endif
> @@ -254,28 +259,46 @@ STRCMP:
>  # endif
>
>  # if defined USE_AS_STRCASECMP_L
> -       .section .rodata.cst32, "aM", @progbits, 32
> -       .align  32
> +       .section RODATA_SECTION, "aM", @progbits, VEC_SIZE
> +       .align  VEC_SIZE
>  L(lcase_min):
>         .quad   0x4141414141414141
>         .quad   0x4141414141414141
>         .quad   0x4141414141414141
>         .quad   0x4141414141414141
> +#  if VEC_SIZE == 64
> +       .quad   0x4141414141414141
> +       .quad   0x4141414141414141
> +       .quad   0x4141414141414141
> +       .quad   0x4141414141414141
> +#  endif
>  L(lcase_max):
>         .quad   0x1a1a1a1a1a1a1a1a
>         .quad   0x1a1a1a1a1a1a1a1a
>         .quad   0x1a1a1a1a1a1a1a1a
>         .quad   0x1a1a1a1a1a1a1a1a
> +#  if VEC_SIZE == 64
> +       .quad   0x1a1a1a1a1a1a1a1a
> +       .quad   0x1a1a1a1a1a1a1a1a
> +       .quad   0x1a1a1a1a1a1a1a1a
> +       .quad   0x1a1a1a1a1a1a1a1a
> +#  endif
>  L(case_add):
>         .quad   0x2020202020202020
>         .quad   0x2020202020202020
>         .quad   0x2020202020202020
>         .quad   0x2020202020202020
> +#  if VEC_SIZE == 64
> +       .quad   0x2020202020202020
> +       .quad   0x2020202020202020
> +       .quad   0x2020202020202020
> +       .quad   0x2020202020202020
> +#  endif
>         .previous
>
> -       vmovdqa64 L(lcase_min)(%rip), LCASE_MIN_YMM
> -       vmovdqa64 L(lcase_max)(%rip), LCASE_MAX_YMM
> -       vmovdqa64 L(case_add)(%rip), CASE_ADD_YMM
> +       VMOVA   L(lcase_min)(%rip), %LCASE_MIN_V
> +       VMOVA   L(lcase_max)(%rip), %LCASE_MAX_V
> +       VMOVA   L(case_add)(%rip), %CASE_ADD_V
>  # endif
>
>         movl    %edi, %eax
> @@ -288,12 +311,12 @@ L(case_add):
>
>  L(no_page_cross):
>         /* Safe to compare 4x vectors.  */
> -       VMOVU   (%rdi), %YMM0
> -       VPTESTM %YMM0, %YMM0, %k2
> +       VMOVU   (%rdi), %VMM(0)
> +       VPTESTM %VMM(0), %VMM(0), %k2
>         /* Each bit cleared in K1 represents a mismatch or a null CHAR
>            in YMM0 and 32 bytes at (%rsi).  */
> -       CMP_R1_S2_YMM (%YMM0, (%rsi), %YMM1, %k1){%k2}
> -       kmovd   %k1, %ecx
> +       CMP_R1_S2_VMM (%VMM(0), (%rsi), %VMM(1), %k1){%k2}
> +       KMOV    %k1, %VRCX
>  # ifdef USE_AS_STRNCMP
>         cmpq    $CHAR_PER_VEC, %rdx
>         jbe     L(vec_0_test_len)
> @@ -303,14 +326,14 @@ L(no_page_cross):
>            wcscmp/wcsncmp.  */
>
>         /* All 1s represents all equals. TESTEQ will overflow to zero in
> -          all equals case. Otherwise 1s will carry until position of first
> -          mismatch.  */
> -       TESTEQ  %ecx
> +          all equals case. Otherwise 1s will carry until position of
> +          first mismatch.  */
> +       TESTEQ  %VRCX
>         jz      L(more_3x_vec)
>
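
The TESTEQ comment above is worth a concrete illustration (byte
variant, 32 CHARs per vector; a sketch, not the real code):

    #include <stdint.h>

    /* The k-mask has a 1 for every CHAR that compared equal, so
       all-equal is all-ones.  TESTEQ (`inc') overflows that to zero;
       otherwise the carry stops at the first 0 bit, so bsf on the
       result is exactly the index of the first mismatch.  */
    static inline int
    first_mismatch (uint32_t eq_mask)
    {
      eq_mask += 1;			/* TESTEQ %VRCX */
      if (eq_mask == 0)
        return -1;			/* jz L(more_3x_vec) */
      return __builtin_ctz (eq_mask);	/* bsf in L(return_vec_0) */
    }

This is also why the wide-char variant uses `sub $((1 << CHAR_PER_VEC)
- 1)' instead: its all-equal mask is only CHAR_PER_VEC bits wide, so a
plain inc would not wrap to zero.
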
>         .p2align 4,, 4
>  L(return_vec_0):
> -       tzcntl  %ecx, %ecx
> +       bsf     %VRCX, %VRCX
>  # ifdef USE_AS_WCSCMP
>         movl    (%rdi, %rcx, SIZE_OF_CHAR), %edx
>         xorl    %eax, %eax
> @@ -321,7 +344,16 @@ L(return_vec_0):
>         orl     $1, %eax
>  # else
>         movzbl  (%rdi, %rcx), %eax
> +       /* For VEC_SIZE == 64 use movb instead of movzbl to save a byte
> +          and keep logic for len <= VEC_SIZE (common) in just the
> +          first cache line.  NB: No evex512 processor has partial-
> +          register stalls. If that changes this ifdef can be disabled
> +          without affecting correctness.  */
> +#  if !defined USE_AS_STRNCMP && !defined USE_AS_STRCASECMP_L && VEC_SIZE == 64
> +       movb    (%rsi, %rcx), %cl
> +#  else
>         movzbl  (%rsi, %rcx), %ecx
> +#  endif
>         TOLOWER_gpr (%rax, %eax)
>         TOLOWER_gpr (%rcx, %ecx)
>         subl    %ecx, %eax
> @@ -332,8 +364,8 @@ L(ret0):
>  # ifdef USE_AS_STRNCMP
>         .p2align 4,, 4
>  L(vec_0_test_len):
> -       notl    %ecx
> -       bzhil   %edx, %ecx, %eax
> +       not     %VRCX
> +       bzhi    %VRDX, %VRCX, %VRAX
>         jnz     L(return_vec_0)
>         /* Align if will cross fetch block.  */
>         .p2align 4,, 2
> @@ -372,7 +404,7 @@ L(ret1):
>
>         .p2align 4,, 10
>  L(return_vec_1):
> -       tzcntl  %ecx, %ecx
> +       bsf     %VRCX, %VRCX
>  # ifdef USE_AS_STRNCMP
>         /* rdx must be > CHAR_PER_VEC so its safe to subtract without
>            worrying about underflow.  */
> @@ -401,24 +433,41 @@ L(ret2):
>         .p2align 4,, 10
>  # ifdef USE_AS_STRNCMP
>  L(return_vec_3):
> -#  if CHAR_PER_VEC <= 16
> +#  if CHAR_PER_VEC <= 32
> +       /* If CHAR_PER_VEC <= 32 reuse code from L(return_vec_2) without
> +          additional branches by adjusting the bit positions from
> +          VEC3.  We can't do this for CHAR_PER_VEC == 64.  */
> +#   if CHAR_PER_VEC <= 16
>         sall    $CHAR_PER_VEC, %ecx
> -#  else
> +#   else
>         salq    $CHAR_PER_VEC, %rcx
> +#   endif
> +#  else
> +       /* If CHAR_PER_VEC == 64 we can't shift the return GPR so just
> +          check it.  */
> +       bsf     %VRCX, %VRCX
> +       addl    $(CHAR_PER_VEC), %ecx
> +       cmpq    %rcx, %rdx
> +       ja      L(ret_vec_3_finish)
> +       xorl    %eax, %eax
> +       ret
>  #  endif
>  # endif
> +
> +       /* If CHAR_PER_VEC == 64 we can't combine matches from the last
> +          2x VEC so need separate return label.  */
>  L(return_vec_2):
>  # if (CHAR_PER_VEC <= 16) || !(defined USE_AS_STRNCMP)
> -       tzcntl  %ecx, %ecx
> +       bsf     %VRCX, %VRCX
>  # else
> -       tzcntq  %rcx, %rcx
> +       bsfq    %rcx, %rcx
>  # endif
> -
>  # ifdef USE_AS_STRNCMP
>         cmpq    %rcx, %rdx
>         jbe     L(ret_zero)
>  # endif
>
> +L(ret_vec_3_finish):
>  # ifdef USE_AS_WCSCMP
>         movl    (VEC_SIZE * 2)(%rdi, %rcx, SIZE_OF_CHAR), %edx
>         xorl    %eax, %eax
> @@ -440,7 +489,7 @@ L(ret3):
>  # ifndef USE_AS_STRNCMP
>         .p2align 4,, 10
>  L(return_vec_3):
> -       tzcntl  %ecx, %ecx
> +       bsf     %VRCX, %VRCX
>  #  ifdef USE_AS_WCSCMP
>         movl    (VEC_SIZE * 3)(%rdi, %rcx, SIZE_OF_CHAR), %edx
>         xorl    %eax, %eax
> @@ -465,11 +514,11 @@ L(ret4):
>         .p2align 5
>  L(more_3x_vec):
>         /* Safe to compare 4x vectors.  */
> -       VMOVU   (VEC_SIZE)(%rdi), %YMM0
> -       VPTESTM %YMM0, %YMM0, %k2
> -       CMP_R1_S2_YMM (%YMM0, VEC_SIZE(%rsi), %YMM1, %k1){%k2}
> -       kmovd   %k1, %ecx
> -       TESTEQ  %ecx
> +       VMOVU   (VEC_SIZE)(%rdi), %VMM(0)
> +       VPTESTM %VMM(0), %VMM(0), %k2
> +       CMP_R1_S2_VMM (%VMM(0), VEC_SIZE(%rsi), %VMM(1), %k1){%k2}
> +       KMOV    %k1, %VRCX
> +       TESTEQ  %VRCX
>         jnz     L(return_vec_1)
>
>  # ifdef USE_AS_STRNCMP
> @@ -477,18 +526,18 @@ L(more_3x_vec):
>         jbe     L(ret_zero)
>  # endif
>
> -       VMOVU   (VEC_SIZE * 2)(%rdi), %YMM0
> -       VPTESTM %YMM0, %YMM0, %k2
> -       CMP_R1_S2_YMM (%YMM0, (VEC_SIZE * 2)(%rsi), %YMM1, %k1){%k2}
> -       kmovd   %k1, %ecx
> -       TESTEQ  %ecx
> +       VMOVU   (VEC_SIZE * 2)(%rdi), %VMM(0)
> +       VPTESTM %VMM(0), %VMM(0), %k2
> +       CMP_R1_S2_VMM (%VMM(0), (VEC_SIZE * 2)(%rsi), %VMM(1), %k1){%k2}
> +       KMOV    %k1, %VRCX
> +       TESTEQ  %VRCX
>         jnz     L(return_vec_2)
>
> -       VMOVU   (VEC_SIZE * 3)(%rdi), %YMM0
> -       VPTESTM %YMM0, %YMM0, %k2
> -       CMP_R1_S2_YMM (%YMM0, (VEC_SIZE * 3)(%rsi), %YMM1, %k1){%k2}
> -       kmovd   %k1, %ecx
> -       TESTEQ  %ecx
> +       VMOVU   (VEC_SIZE * 3)(%rdi), %VMM(0)
> +       VPTESTM %VMM(0), %VMM(0), %k2
> +       CMP_R1_S2_VMM (%VMM(0), (VEC_SIZE * 3)(%rsi), %VMM(1), %k1){%k2}
> +       KMOV    %k1, %VRCX
> +       TESTEQ  %VRCX
>         jnz     L(return_vec_3)
>
>  # ifdef USE_AS_STRNCMP
> @@ -565,110 +614,123 @@ L(loop):
>
>         /* Loop entry after handling page cross during loop.  */
>  L(loop_skip_page_cross_check):
> -       VMOVA   (VEC_SIZE * 0)(%rdi), %YMM0
> -       VMOVA   (VEC_SIZE * 1)(%rdi), %YMM2
> -       VMOVA   (VEC_SIZE * 2)(%rdi), %YMM4
> -       VMOVA   (VEC_SIZE * 3)(%rdi), %YMM6
> +       VMOVA   (VEC_SIZE * 0)(%rdi), %VMM(0)
> +       VMOVA   (VEC_SIZE * 1)(%rdi), %VMM(2)
> +       VMOVA   (VEC_SIZE * 2)(%rdi), %VMM(4)
> +       VMOVA   (VEC_SIZE * 3)(%rdi), %VMM(6)
>
> -       VPMINU  %YMM0, %YMM2, %YMM8
> -       VPMINU  %YMM4, %YMM6, %YMM9
> +       VPMINU  %VMM(0), %VMM(2), %VMM(8)
> +       VPMINU  %VMM(4), %VMM(6), %VMM(9)
>
>         /* A zero CHAR in YMM9 means that there is a null CHAR.  */
> -       VPMINU  %YMM8, %YMM9, %YMM9
> +       VPMINU  %VMM(8), %VMM(9), %VMM(9)
>
>         /* Each bit set in K1 represents a non-null CHAR in YMM9.  */
> -       VPTESTM %YMM9, %YMM9, %k1
> +       VPTESTM %VMM(9), %VMM(9), %k1
>  # ifndef USE_AS_STRCASECMP_L
> -       vpxorq  (VEC_SIZE * 0)(%rsi), %YMM0, %YMM1
> -       vpxorq  (VEC_SIZE * 1)(%rsi), %YMM2, %YMM3
> -       vpxorq  (VEC_SIZE * 2)(%rsi), %YMM4, %YMM5
> +       vpxorq  (VEC_SIZE * 0)(%rsi), %VMM(0), %VMM(1)
> +       vpxorq  (VEC_SIZE * 1)(%rsi), %VMM(2), %VMM(3)
> +       vpxorq  (VEC_SIZE * 2)(%rsi), %VMM(4), %VMM(5)
>         /* Ternary logic to xor (VEC_SIZE * 3)(%rsi) with YMM6 while
>            oring with YMM1. Result is stored in YMM6.  */
> -       vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM1, %YMM6
> +       vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %VMM(1), %VMM(6)
>  # else
> -       VMOVU   (VEC_SIZE * 0)(%rsi), %YMM1
> -       TOLOWER_YMM (%YMM0, %YMM1)
> -       VMOVU   (VEC_SIZE * 1)(%rsi), %YMM3
> -       TOLOWER_YMM (%YMM2, %YMM3)
> -       VMOVU   (VEC_SIZE * 2)(%rsi), %YMM5
> -       TOLOWER_YMM (%YMM4, %YMM5)
> -       VMOVU   (VEC_SIZE * 3)(%rsi), %YMM7
> -       TOLOWER_YMM (%YMM6, %YMM7)
> -       vpxorq  %YMM0, %YMM1, %YMM1
> -       vpxorq  %YMM2, %YMM3, %YMM3
> -       vpxorq  %YMM4, %YMM5, %YMM5
> -       vpternlogd $0xde, %YMM7, %YMM1, %YMM6
> +       VMOVU   (VEC_SIZE * 0)(%rsi), %VMM(1)
> +       TOLOWER_VMM (%VMM(0), %VMM(1))
> +       VMOVU   (VEC_SIZE * 1)(%rsi), %VMM(3)
> +       TOLOWER_VMM (%VMM(2), %VMM(3))
> +       VMOVU   (VEC_SIZE * 2)(%rsi), %VMM(5)
> +       TOLOWER_VMM (%VMM(4), %VMM(5))
> +       VMOVU   (VEC_SIZE * 3)(%rsi), %VMM(7)
> +       TOLOWER_VMM (%VMM(6), %VMM(7))
> +       vpxorq  %VMM(0), %VMM(1), %VMM(1)
> +       vpxorq  %VMM(2), %VMM(3), %VMM(3)
> +       vpxorq  %VMM(4), %VMM(5), %VMM(5)
> +       vpternlogd $0xde, %VMM(7), %VMM(1), %VMM(6)
>  # endif
>         /* Or together YMM3, YMM5, and YMM6.  */
> -       vpternlogd $0xfe, %YMM3, %YMM5, %YMM6
> +       vpternlogd $0xfe, %VMM(3), %VMM(5), %VMM(6)
>
>
>         /* A non-zero CHAR in YMM6 represents a mismatch.  */
> -       VPTESTNM %YMM6, %YMM6, %k0{%k1}
> -       kmovd   %k0, %LOOP_REG
> +       VPTESTNM %VMM(6), %VMM(6), %k0{%k1}
> +       KMOV    %k0, %LOOP_REG
>
>         TESTEQ  %LOOP_REG
>         jz      L(loop)
>
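
Since the vpternlogd immediates (0xde, 0xfe) are easy to misread, here
is a small self-checking C sketch of the truth-table encoding (bit i
of the immediate is f(a,b,c) for i = a*4 + b*2 + c, with `a' the
destination operand in Intel order):

    #include <assert.h>
    #include <stdint.h>

    /* Bit-parallel model of vpternlogd: apply the 3-input truth
       table selected by `imm' to each bit position.  */
    static uint32_t
    ternlog (uint8_t imm, uint32_t a, uint32_t b, uint32_t c)
    {
      uint32_t r = 0;
      for (int bit = 0; bit < 32; bit++)
        {
          unsigned idx = ((a >> bit & 1) << 2)
                         | ((b >> bit & 1) << 1)
                         | (c >> bit & 1);
          r |= (uint32_t) (imm >> idx & 1) << bit;
        }
      return r;
    }

    int
    main (void)
    {
      uint32_t a = 0x12345678, b = 0x9abcdef0, c = 0x0f0f0f0f;
      /* 0xde: dst = (dst ^ src3) | src2, the xor-while-or above.  */
      assert (ternlog (0xde, a, b, c) == ((a ^ c) | b));
      /* 0xfe: dst = dst | src2 | src3, the 3-way OR.  */
      assert (ternlog (0xfe, a, b, c) == (a | b | c));
      return 0;
    }

Folding the xor and or into one vpternlogd saves an instruction per
loop iteration compared with a separate vpxorq + vporq.
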
>
>         /* Find which VEC has the mismatch of end of string.  */
> -       VPTESTM %YMM0, %YMM0, %k1
> -       VPTESTNM %YMM1, %YMM1, %k0{%k1}
> -       kmovd   %k0, %ecx
> -       TESTEQ  %ecx
> +       VPTESTM %VMM(0), %VMM(0), %k1
> +       VPTESTNM %VMM(1), %VMM(1), %k0{%k1}
> +       KMOV    %k0, %VRCX
> +       TESTEQ  %VRCX
>         jnz     L(return_vec_0_end)
>
> -       VPTESTM %YMM2, %YMM2, %k1
> -       VPTESTNM %YMM3, %YMM3, %k0{%k1}
> -       kmovd   %k0, %ecx
> -       TESTEQ  %ecx
> +       VPTESTM %VMM(2), %VMM(2), %k1
> +       VPTESTNM %VMM(3), %VMM(3), %k0{%k1}
> +       KMOV    %k0, %VRCX
> +       TESTEQ  %VRCX
>         jnz     L(return_vec_1_end)
>
>
> -       /* Handle VEC 2 and 3 without branches.  */
> +       /* Handle VEC 2 and 3 without branches if CHAR_PER_VEC <= 32.
> +        */
>  L(return_vec_2_3_end):
>  # ifdef USE_AS_STRNCMP
>         subq    $(CHAR_PER_VEC * 2), %rdx
>         jbe     L(ret_zero_end)
>  # endif
>
> -       VPTESTM %YMM4, %YMM4, %k1
> -       VPTESTNM %YMM5, %YMM5, %k0{%k1}
> -       kmovd   %k0, %ecx
> -       TESTEQ  %ecx
> +       VPTESTM %VMM(4), %VMM(4), %k1
> +       VPTESTNM %VMM(5), %VMM(5), %k0{%k1}
> +       KMOV    %k0, %VRCX
> +       TESTEQ  %VRCX
>  # if CHAR_PER_VEC <= 16
>         sall    $CHAR_PER_VEC, %LOOP_REG
>         orl     %ecx, %LOOP_REG
> -# else
> +# elif CHAR_PER_VEC <= 32
>         salq    $CHAR_PER_VEC, %LOOP_REG64
>         orq     %rcx, %LOOP_REG64
> +# else
> +       /* We aren't combining last 2x VEC so branch on the second to
> +          last.  */
> +       jnz     L(return_vec_2_end)
>  # endif
> -L(return_vec_3_end):
> +
>         /* LOOP_REG contains matches for null/mismatch from the loop. If
> -          VEC 0,1,and 2 all have no null and no mismatches then mismatch
> -          must entirely be from VEC 3 which is fully represented by
> -          LOOP_REG.  */
> +          VEC 0,1,and 2 all have no null and no mismatches then
> +          mismatch must entirely be from VEC 3 which is fully
> +          represented by LOOP_REG.  */
>  # if CHAR_PER_VEC <= 16
> -       tzcntl  %LOOP_REG, %LOOP_REG
> +       bsf     %LOOP_REG, %LOOP_REG
>  # else
> -       tzcntq  %LOOP_REG64, %LOOP_REG64
> +       bsfq    %LOOP_REG64, %LOOP_REG64
>  # endif
>  # ifdef USE_AS_STRNCMP
> +
> +       /* If CHAR_PER_VEC == 64 we can't combine last 2x VEC so need to
> +          adjust length before last comparison.  */
> +#  if CHAR_PER_VEC == 64
> +       subq    $CHAR_PER_VEC, %rdx
> +       jbe     L(ret_zero_end)
> +#  endif
> +
>         cmpq    %LOOP_REG64, %rdx
>         jbe     L(ret_zero_end)
>  # endif
>
>  # ifdef USE_AS_WCSCMP
> -       movl    (VEC_SIZE * 2)(%rdi, %LOOP_REG64, SIZE_OF_CHAR), %ecx
> +       movl    (FALLTHROUGH_RETURN_OFFSET)(%rdi, %LOOP_REG64, SIZE_OF_CHAR), %ecx
>         xorl    %eax, %eax
> -       cmpl    (VEC_SIZE * 2)(%rsi, %LOOP_REG64, SIZE_OF_CHAR), %ecx
> +       cmpl    (FALLTHROUGH_RETURN_OFFSET)(%rsi, %LOOP_REG64, SIZE_OF_CHAR), %ecx
>         je      L(ret5)
>         setl    %al
>         negl    %eax
>         xorl    %r8d, %eax
>  # else
> -       movzbl  (VEC_SIZE * 2)(%rdi, %LOOP_REG64), %eax
> -       movzbl  (VEC_SIZE * 2)(%rsi, %LOOP_REG64), %ecx
> +       movzbl  (FALLTHROUGH_RETURN_OFFSET)(%rdi, %LOOP_REG64), %eax
> +       movzbl  (FALLTHROUGH_RETURN_OFFSET)(%rsi, %LOOP_REG64), %ecx
>         TOLOWER_gpr (%rax, %eax)
>         TOLOWER_gpr (%rcx, %ecx)
>         subl    %ecx, %eax
> @@ -686,23 +748,39 @@ L(ret_zero_end):
>  # endif
>
>
> +
>         /* The L(return_vec_N_end) differ from L(return_vec_N) in that
> -          they use the value of `r8` to negate the return value. This is
> -          because the page cross logic can swap `rdi` and `rsi`.  */
> +          they use the value of `r8` to negate the return value. This
> +          is because the page cross logic can swap `rdi` and `rsi`.
> +        */
>         .p2align 4,, 10
>  # ifdef USE_AS_STRNCMP
>  L(return_vec_1_end):
> -#  if CHAR_PER_VEC <= 16
> +#  if CHAR_PER_VEC <= 32
> +       /* If CHAR_PER_VEC <= 32 reuse code from L(return_vec_0_end)
> +          without additional branches by adjusting the bit positions
> +          from VEC1.  We can't do this for CHAR_PER_VEC == 64.  */
> +#   if CHAR_PER_VEC <= 16
>         sall    $CHAR_PER_VEC, %ecx
> -#  else
> +#   else
>         salq    $CHAR_PER_VEC, %rcx
> +#   endif
> +#  else
> +       /* If CHAR_PER_VEC == 64 we can't shift the return GPR so just
> +          check it.  */
> +       bsf     %VRCX, %VRCX
> +       addl    $(CHAR_PER_VEC), %ecx
> +       cmpq    %rcx, %rdx
> +       ja      L(ret_vec_0_end_finish)
> +       xorl    %eax, %eax
> +       ret
>  #  endif
>  # endif
>  L(return_vec_0_end):
>  # if (CHAR_PER_VEC <= 16) || !(defined USE_AS_STRNCMP)
> -       tzcntl  %ecx, %ecx
> +       bsf     %VRCX, %VRCX
>  # else
> -       tzcntq  %rcx, %rcx
> +       bsfq    %rcx, %rcx
>  # endif
>
>  # ifdef USE_AS_STRNCMP
> @@ -710,6 +788,7 @@ L(return_vec_0_end):
>         jbe     L(ret_zero_end)
>  # endif
>
> +L(ret_vec_0_end_finish):
>  # ifdef USE_AS_WCSCMP
>         movl    (%rdi, %rcx, SIZE_OF_CHAR), %edx
>         xorl    %eax, %eax
> @@ -737,7 +816,7 @@ L(ret6):
>  # ifndef USE_AS_STRNCMP
>         .p2align 4,, 10
>  L(return_vec_1_end):
> -       tzcntl  %ecx, %ecx
> +       bsf     %VRCX, %VRCX
>  #  ifdef USE_AS_WCSCMP
>         movl    VEC_SIZE(%rdi, %rcx, SIZE_OF_CHAR), %edx
>         xorl    %eax, %eax
> @@ -760,6 +839,41 @@ L(ret7):
>  # endif
>
>
> +       /* If CHAR_PER_VEC == 64 we can't combine matches from the last
> +          2x VEC so need separate return label.  */
> +# if CHAR_PER_VEC == 64
> +L(return_vec_2_end):
> +       bsf     %VRCX, %VRCX
> +#  ifdef USE_AS_STRNCMP
> +       cmpq    %rcx, %rdx
> +       jbe     L(ret_zero_end)
> +#  endif
> +#  ifdef USE_AS_WCSCMP
> +       movl    (VEC_SIZE * 2)(%rdi, %rcx, SIZE_OF_CHAR), %edx
> +       xorl    %eax, %eax
> +       cmpl    (VEC_SIZE * 2)(%rsi, %rcx, SIZE_OF_CHAR), %edx
> +       je      L(ret31)
> +       setl    %al
> +       negl    %eax
> +       /* This is the non-zero case for `eax` so just xorl with `r8d`
> +          to flip it if `rdi` and `rsi` were swapped.  */
> +       xorl    %r8d, %eax
> +#  else
> +       movzbl  (VEC_SIZE * 2)(%rdi, %rcx), %eax
> +       movzbl  (VEC_SIZE * 2)(%rsi, %rcx), %ecx
> +       TOLOWER_gpr (%rax, %eax)
> +       TOLOWER_gpr (%rcx, %ecx)
> +       subl    %ecx, %eax
> +       /* Flip `eax` if `rdi` and `rsi` were swapped in page cross
> +          logic. Subtract `r8d` after xor for zero case.  */
> +       xorl    %r8d, %eax
> +       subl    %r8d, %eax
> +#  endif
> +L(ret13):
> +       ret
> +# endif
> +
> +
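
A note on the xorl/subl pair with %r8d here and in the other *_end
returns: %r8d is 0 when the original argument order was kept and -1
when the page-cross logic swapped rdi/rsi, making this a branchless
conditional negate.  In C (sketch):

    /* (x ^ 0) - 0 == x, while (x ^ -1) - (-1) == ~x + 1 == -x, so
       the pair negates the byte difference exactly when the strings
       were compared in swapped order.  */
    static inline int
    apply_swap_sign (int diff, int swap)	/* swap is 0 or -1 */
    {
      return (diff ^ swap) - swap;
    }
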
>         /* Page cross in rsi in next 4x VEC.  */
>
>         /* TODO: Improve logic here.  */
> @@ -778,11 +892,11 @@ L(page_cross_during_loop):
>         cmpl    $-(VEC_SIZE * 3), %eax
>         jle     L(less_1x_vec_till_page_cross)
>
> -       VMOVA   (%rdi), %YMM0
> -       VPTESTM %YMM0, %YMM0, %k2
> -       CMP_R1_S2_YMM (%YMM0, (%rsi), %YMM1, %k1){%k2}
> -       kmovd   %k1, %ecx
> -       TESTEQ  %ecx
> +       VMOVA   (%rdi), %VMM(0)
> +       VPTESTM %VMM(0), %VMM(0), %k2
> +       CMP_R1_S2_VMM (%VMM(0), (%rsi), %VMM(1), %k1){%k2}
> +       KMOV    %k1, %VRCX
> +       TESTEQ  %VRCX
>         jnz     L(return_vec_0_end)
>
>         /* if distance >= 2x VEC then eax > -(VEC_SIZE * 2).  */
> @@ -799,9 +913,9 @@ L(less_1x_vec_till_page_cross):
>            to read back -VEC_SIZE. If rdi is truly at the start of a page
>            here, it means the previous page (rdi - VEC_SIZE) has already
>            been loaded earlier so must be valid.  */
> -       VMOVU   -VEC_SIZE(%rdi, %rax), %YMM0
> -       VPTESTM %YMM0, %YMM0, %k2
> -       CMP_R1_S2_YMM (%YMM0, -VEC_SIZE(%rsi, %rax), %YMM1, %k1){%k2}
> +       VMOVU   -VEC_SIZE(%rdi, %rax), %VMM(0)
> +       VPTESTM %VMM(0), %VMM(0), %k2
> +       CMP_R1_S2_VMM (%VMM(0), -VEC_SIZE(%rsi, %rax), %VMM(1), %k1){%k2}
>         /* Mask of potentially valid bits. The lower bits can be out of
>            range comparisons (but safe regarding page crosses).  */
>
> @@ -813,12 +927,12 @@ L(less_1x_vec_till_page_cross):
>         shlxl   %ecx, %r10d, %ecx
>         movzbl  %cl, %r10d
>  # else
> -       movl    $-1, %ecx
> -       shlxl   %esi, %ecx, %r10d
> +       mov     $-1, %VRCX
> +       shlx    %VRSI, %VRCX, %VR10
>  # endif
>
> -       kmovd   %k1, %ecx
> -       notl    %ecx
> +       KMOV    %k1, %VRCX
> +       not     %VRCX
>
>
>  # ifdef USE_AS_STRNCMP
> @@ -838,12 +952,10 @@ L(less_1x_vec_till_page_cross):
>         /* Readjust eax before potentially returning to the loop.  */
>         addl    $(PAGE_SIZE - VEC_SIZE * 4), %eax
>
> -       andl    %r10d, %ecx
> +       and     %VR10, %VRCX
>         jz      L(loop_skip_page_cross_check)
>
> -       .p2align 4,, 3
> -L(return_page_cross_end):
> -       tzcntl  %ecx, %ecx
> +       bsf     %VRCX, %VRCX
>
>  # if (defined USE_AS_STRNCMP) || (defined USE_AS_WCSCMP)
>         leal    -VEC_SIZE(%OFFSET_REG64, %rcx, SIZE_OF_CHAR), %ecx
> @@ -874,8 +986,12 @@ L(ret8):
>  # ifdef USE_AS_STRNCMP
>         .p2align 4,, 10
>  L(return_page_cross_end_check):
> -       andl    %r10d, %ecx
> -       tzcntl  %ecx, %ecx
> +       and     %VR10, %VRCX
> +       /* Need to use tzcnt here as VRCX may be zero.  If VRCX is zero
> +          tzcnt(VRCX) will be CHAR_PER_VEC and remaining length (edx) is
> +          guaranteed to be <= CHAR_PER_VEC so we will only use the return
> +          idx if VRCX was non-zero.  */
> +       tzcnt   %VRCX, %VRCX
>         leal    -VEC_SIZE(%rax, %rcx, SIZE_OF_CHAR), %ecx
>  #  ifdef USE_AS_WCSCMP
>         sall    $2, %edx
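
On the tzcnt-vs-bsf distinction the comment calls out: tzcnt is
architecturally defined to return the operand width on a zero input,
while bsf leaves its destination undefined, and that defined
out-of-range value is what lets the zero case fall through to the
remaining-length check without an extra branch.  Modeled in C
(sketch):

    #include <stdint.h>

    /* tzcnt semantics: operand width on zero, trailing-zero count
       otherwise.  With a remaining length <= CHAR_PER_VEC, the
       out-of-range result simply fails the length check.  */
    static inline unsigned
    tzcnt32 (uint32_t x)
    {
      return x == 0 ? 32u : (unsigned) __builtin_ctz (x);
    }
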
> @@ -892,11 +1008,11 @@ L(more_2x_vec_till_page_cross):
>         /* If more 2x vec till cross we will complete a full loop
>            iteration here.  */
>
> -       VMOVA   VEC_SIZE(%rdi), %YMM0
> -       VPTESTM %YMM0, %YMM0, %k2
> -       CMP_R1_S2_YMM (%YMM0, VEC_SIZE(%rsi), %YMM1, %k1){%k2}
> -       kmovd   %k1, %ecx
> -       TESTEQ  %ecx
> +       VMOVA   VEC_SIZE(%rdi), %VMM(0)
> +       VPTESTM %VMM(0), %VMM(0), %k2
> +       CMP_R1_S2_VMM (%VMM(0), VEC_SIZE(%rsi), %VMM(1), %k1){%k2}
> +       KMOV    %k1, %VRCX
> +       TESTEQ  %VRCX
>         jnz     L(return_vec_1_end)
>
>  # ifdef USE_AS_STRNCMP
> @@ -907,18 +1023,18 @@ L(more_2x_vec_till_page_cross):
>         subl    $-(VEC_SIZE * 4), %eax
>
>         /* Safe to include comparisons from lower bytes.  */
> -       VMOVU   -(VEC_SIZE * 2)(%rdi, %rax), %YMM0
> -       VPTESTM %YMM0, %YMM0, %k2
> -       CMP_R1_S2_YMM (%YMM0, -(VEC_SIZE * 2)(%rsi, %rax), %YMM1, %k1){%k2}
> -       kmovd   %k1, %ecx
> -       TESTEQ  %ecx
> +       VMOVU   -(VEC_SIZE * 2)(%rdi, %rax), %VMM(0)
> +       VPTESTM %VMM(0), %VMM(0), %k2
> +       CMP_R1_S2_VMM (%VMM(0), -(VEC_SIZE * 2)(%rsi, %rax), %VMM(1), %k1){%k2}
> +       KMOV    %k1, %VRCX
> +       TESTEQ  %VRCX
>         jnz     L(return_vec_page_cross_0)
>
> -       VMOVU   -(VEC_SIZE * 1)(%rdi, %rax), %YMM0
> -       VPTESTM %YMM0, %YMM0, %k2
> -       CMP_R1_S2_YMM (%YMM0, -(VEC_SIZE * 1)(%rsi, %rax), %YMM1, %k1){%k2}
> -       kmovd   %k1, %ecx
> -       TESTEQ  %ecx
> +       VMOVU   -(VEC_SIZE * 1)(%rdi, %rax), %VMM(0)
> +       VPTESTM %VMM(0), %VMM(0), %k2
> +       CMP_R1_S2_VMM (%VMM(0), -(VEC_SIZE * 1)(%rsi, %rax), %VMM(1), %k1){%k2}
> +       KMOV    %k1, %VRCX
> +       TESTEQ  %VRCX
>         jnz     L(return_vec_page_cross_1)
>
>  # ifdef USE_AS_STRNCMP
> @@ -937,30 +1053,30 @@ L(more_2x_vec_till_page_cross):
>  # endif
>
>         /* Finish the loop.  */
> -       VMOVA   (VEC_SIZE * 2)(%rdi), %YMM4
> -       VMOVA   (VEC_SIZE * 3)(%rdi), %YMM6
> -       VPMINU  %YMM4, %YMM6, %YMM9
> -       VPTESTM %YMM9, %YMM9, %k1
> +       VMOVA   (VEC_SIZE * 2)(%rdi), %VMM(4)
> +       VMOVA   (VEC_SIZE * 3)(%rdi), %VMM(6)
> +       VPMINU  %VMM(4), %VMM(6), %VMM(9)
> +       VPTESTM %VMM(9), %VMM(9), %k1
>  # ifndef USE_AS_STRCASECMP_L
> -       vpxorq  (VEC_SIZE * 2)(%rsi), %YMM4, %YMM5
> +       vpxorq  (VEC_SIZE * 2)(%rsi), %VMM(4), %VMM(5)
>         /* YMM6 = YMM5 | ((VEC_SIZE * 3)(%rsi) ^ YMM6).  */
> -       vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM5, %YMM6
> +       vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %VMM(5), %VMM(6)
>  # else
> -       VMOVU   (VEC_SIZE * 2)(%rsi), %YMM5
> -       TOLOWER_YMM (%YMM4, %YMM5)
> -       VMOVU   (VEC_SIZE * 3)(%rsi), %YMM7
> -       TOLOWER_YMM (%YMM6, %YMM7)
> -       vpxorq  %YMM4, %YMM5, %YMM5
> -       vpternlogd $0xde, %YMM7, %YMM5, %YMM6
> -# endif
> -       VPTESTNM %YMM6, %YMM6, %k0{%k1}
> -       kmovd   %k0, %LOOP_REG
> +       VMOVU   (VEC_SIZE * 2)(%rsi), %VMM(5)
> +       TOLOWER_VMM (%VMM(4), %VMM(5))
> +       VMOVU   (VEC_SIZE * 3)(%rsi), %VMM(7)
> +       TOLOWER_VMM (%VMM(6), %VMM(7))
> +       vpxorq  %VMM(4), %VMM(5), %VMM(5)
> +       vpternlogd $0xde, %VMM(7), %VMM(5), %VMM(6)
> +# endif
> +       VPTESTNM %VMM(6), %VMM(6), %k0{%k1}
> +       KMOV    %k0, %LOOP_REG
>         TESTEQ  %LOOP_REG
>         jnz     L(return_vec_2_3_end)
>
>         /* Best for code size to include ucond-jmp here. Would be faster
> -          if this case is hot to duplicate the L(return_vec_2_3_end) code
> -          as fall-through and have jump back to loop on mismatch
> +          if this case is hot to duplicate the L(return_vec_2_3_end)
> +          code as fall-through and have jump back to loop on mismatch
>            comparison.  */
>         subq    $-(VEC_SIZE * 4), %rdi
>         subq    $-(VEC_SIZE * 4), %rsi
> @@ -980,7 +1096,7 @@ L(ret_zero_in_loop_page_cross):
>  L(return_vec_page_cross_0):
>         addl    $-VEC_SIZE, %eax
>  L(return_vec_page_cross_1):
> -       tzcntl  %ecx, %ecx
> +       bsf     %VRCX, %VRCX
>  # if defined USE_AS_STRNCMP || defined USE_AS_WCSCMP
>         leal    -VEC_SIZE(%rax, %rcx, SIZE_OF_CHAR), %ecx
>  #  ifdef USE_AS_STRNCMP
> @@ -1023,8 +1139,8 @@ L(ret9):
>  L(page_cross):
>  # ifndef USE_AS_STRNCMP
>         /* If both are VEC aligned we don't need any special logic here.
> -          Only valid for strcmp where stop condition is guranteed to be
> -          reachable by just reading memory.  */
> +          Only valid for strcmp where stop condition is guaranteed to
> +          be reachable by just reading memory.  */
>         testl   $((VEC_SIZE - 1) << 20), %eax
>         jz      L(no_page_cross)
>  # endif
> @@ -1065,11 +1181,11 @@ L(page_cross):
>            loadable memory until within 1x VEC of page cross.  */
>         .p2align 4,, 8
>  L(page_cross_loop):
> -       VMOVU   (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0
> -       VPTESTM %YMM0, %YMM0, %k2
> -       CMP_R1_S2_YMM (%YMM0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM1, %k1){%k2}
> -       kmovd   %k1, %ecx
> -       TESTEQ  %ecx
> +       VMOVU   (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %VMM(0)
> +       VPTESTM %VMM(0), %VMM(0), %k2
> +       CMP_R1_S2_VMM (%VMM(0), (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %VMM(1), %k1){%k2}
> +       KMOV    %k1, %VRCX
> +       TESTEQ  %VRCX
>         jnz     L(check_ret_vec_page_cross)
>         addl    $CHAR_PER_VEC, %OFFSET_REG
>  # ifdef USE_AS_STRNCMP
> @@ -1087,13 +1203,13 @@ L(page_cross_loop):
>         subl    %eax, %OFFSET_REG
>         /* OFFSET_REG has distance to page cross - VEC_SIZE. Guranteed
>            to not cross page so is safe to load. Since we have already
> -          loaded at least 1 VEC from rsi it is also guranteed to be safe.
> -        */
> -       VMOVU   (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0
> -       VPTESTM %YMM0, %YMM0, %k2
> -       CMP_R1_S2_YMM (%YMM0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM1, %k1){%k2}
> +          loaded at least 1 VEC from rsi it is also guaranteed to be
> +          safe.  */
> +       VMOVU   (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %VMM(0)
> +       VPTESTM %VMM(0), %VMM(0), %k2
> +       CMP_R1_S2_VMM (%VMM(0), (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %VMM(1), %k1){%k2}
>
> -       kmovd   %k1, %ecx
> +       KMOV    %k1, %VRCX
>  # ifdef USE_AS_STRNCMP
>         leal    CHAR_PER_VEC(%OFFSET_REG64), %eax
>         cmpq    %rax, %rdx
> @@ -1104,7 +1220,7 @@ L(page_cross_loop):
>         addq    %rdi, %rdx
>  #  endif
>  # endif
> -       TESTEQ  %ecx
> +       TESTEQ  %VRCX
>         jz      L(prepare_loop_no_len)
>
>         .p2align 4,, 4
> @@ -1112,7 +1228,7 @@ L(ret_vec_page_cross):
>  # ifndef USE_AS_STRNCMP
>  L(check_ret_vec_page_cross):
>  # endif
> -       tzcntl  %ecx, %ecx
> +       tzcnt   %VRCX, %VRCX
>         addl    %OFFSET_REG, %ecx
>  L(ret_vec_page_cross_cont):
>  # ifdef USE_AS_WCSCMP
> @@ -1139,9 +1255,9 @@ L(ret12):
>  # ifdef USE_AS_STRNCMP
>         .p2align 4,, 10
>  L(check_ret_vec_page_cross2):
> -       TESTEQ  %ecx
> +       TESTEQ  %VRCX
>  L(check_ret_vec_page_cross):
> -       tzcntl  %ecx, %ecx
> +       tzcnt   %VRCX, %VRCX
>         addl    %OFFSET_REG, %ecx
>         cmpq    %rcx, %rdx
>         ja      L(ret_vec_page_cross_cont)
> @@ -1180,8 +1296,71 @@ L(less_1x_vec_till_page):
>  # ifdef USE_AS_WCSCMP
>         shrl    $2, %eax
>  # endif
> +
> +       /* Find largest load size we can use. VEC_SIZE == 64 only check
> +          if we can do a full ymm load.  */
> +# if VEC_SIZE == 64
> +
> +       cmpl    $((VEC_SIZE - 32) / SIZE_OF_CHAR), %eax
> +       ja      L(less_32_till_page)
> +
> +
> +       /* Use 32 byte comparison.  */
> +       VMOVU   (%rdi), %VMM_256(0)
> +       VPTESTM %VMM_256(0), %VMM_256(0), %k2
> +       CMP_R1_S2_YMM (%VMM_256(0), (%rsi), %VMM_256(1), %k1){%k2}
> +       kmovd   %k1, %ecx
> +#  ifdef USE_AS_WCSCMP
> +       subl    $0xff, %ecx
> +#  else
> +       incl    %ecx
> +#  endif
> +       jnz     L(check_ret_vec_page_cross)
> +       movl    $((VEC_SIZE - 32) / SIZE_OF_CHAR), %OFFSET_REG
> +#  ifdef USE_AS_STRNCMP
> +       cmpq    %OFFSET_REG64, %rdx
> +       jbe     L(ret_zero_page_cross_slow_case64)
> +       subl    %eax, %OFFSET_REG
> +#  else
> +       /* Explicit check for 32 byte alignment.  */
> +       subl    %eax, %OFFSET_REG
> +       jz      L(prepare_loop)
> +#  endif
> +       VMOVU   (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %VMM_256(0)
> +       VPTESTM %VMM_256(0), %VMM_256(0), %k2
> +       CMP_R1_S2_YMM (%VMM_256(0), (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %VMM_256(1), %k1){%k2}
> +       kmovd   %k1, %ecx
> +#  ifdef USE_AS_WCSCMP
> +       subl    $0xff, %ecx
> +#  else
> +       incl    %ecx
> +#  endif
> +       jnz     L(check_ret_vec_page_cross)
> +#  ifdef USE_AS_STRNCMP
> +       addl    $(32 / SIZE_OF_CHAR), %OFFSET_REG
> +       subq    %OFFSET_REG64, %rdx
> +       jbe     L(ret_zero_page_cross_slow_case64)
> +       subq    $-(CHAR_PER_VEC * 4), %rdx
> +
> +       leaq    -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
> +       leaq    -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
> +#  else
> +       leaq    (32 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
> +       leaq    (32 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
> +#  endif
> +       jmp     L(prepare_loop_aligned)
> +
> +#  ifdef USE_AS_STRNCMP
> +       .p2align 4,, 2
> +L(ret_zero_page_cross_slow_case64):
> +       xorl    %eax, %eax
> +       ret
> +#  endif
> +L(less_32_till_page):
> +# endif
> +
>         /* Find largest load size we can use.  */
> -       cmpl    $(16 / SIZE_OF_CHAR), %eax
> +       cmpl    $((VEC_SIZE - 16) / SIZE_OF_CHAR), %eax
>         ja      L(less_16_till_page)
>
>         /* Use 16 byte comparison.  */
> @@ -1195,9 +1374,14 @@ L(less_1x_vec_till_page):
>         incw    %cx
>  # endif
>         jnz     L(check_ret_vec_page_cross)
> -       movl    $(16 / SIZE_OF_CHAR), %OFFSET_REG
> +
> +       movl    $((VEC_SIZE - 16) / SIZE_OF_CHAR), %OFFSET_REG
>  # ifdef USE_AS_STRNCMP
> +#  if VEC_SIZE == 32
>         cmpq    %OFFSET_REG64, %rdx
> +#  else
> +       cmpq    $(16 / SIZE_OF_CHAR), %rdx
> +#  endif
>         jbe     L(ret_zero_page_cross_slow_case0)
>         subl    %eax, %OFFSET_REG
>  # else
> @@ -1239,7 +1423,7 @@ L(ret_zero_page_cross_slow_case0):
>
>         .p2align 4,, 10
>  L(less_16_till_page):
> -       cmpl    $(24 / SIZE_OF_CHAR), %eax
> +       cmpl    $((VEC_SIZE - 8) / SIZE_OF_CHAR), %eax
>         ja      L(less_8_till_page)
>
>         /* Use 8 byte comparison.  */
> @@ -1260,7 +1444,7 @@ L(less_16_till_page):
>         cmpq    $(8 / SIZE_OF_CHAR), %rdx
>         jbe     L(ret_zero_page_cross_slow_case0)
>  # endif
> -       movl    $(24 / SIZE_OF_CHAR), %OFFSET_REG
> +       movl    $((VEC_SIZE - 8) / SIZE_OF_CHAR), %OFFSET_REG
>         subl    %eax, %OFFSET_REG
>
>         vmovq   (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
> @@ -1320,7 +1504,7 @@ L(ret_less_8_wcs):
>         ret
>
>  # else
> -       cmpl    $28, %eax
> +       cmpl    $(VEC_SIZE - 4), %eax
>         ja      L(less_4_till_page)
>
>         vmovd   (%rdi), %xmm0
> @@ -1335,7 +1519,7 @@ L(ret_less_8_wcs):
>         cmpq    $4, %rdx
>         jbe     L(ret_zero_page_cross_slow_case1)
>  #  endif
> -       movl    $(28 / SIZE_OF_CHAR), %OFFSET_REG
> +       movl    $((VEC_SIZE - 4) / SIZE_OF_CHAR), %OFFSET_REG
>         subl    %eax, %OFFSET_REG
>
>         vmovd   (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
> @@ -1386,7 +1570,7 @@ L(less_4_loop):
>  #  endif
>         incq    %rdi
>         /* end condition is reach page boundary (rdi is aligned).  */
> -       testl   $31, %edi
> +       testb   $(VEC_SIZE - 1), %dil
>         jnz     L(less_4_loop)
>         leaq    -(VEC_SIZE * 4)(%rdi, %rsi), %rsi
>         addq    $-(VEC_SIZE * 4), %rdi
> --
> 2.34.1
>

LGTM.

Thanks.

-- 
H.J.

^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH v3 7/7] Bench: Improve benchtests for memchr, strchr, strnlen, strrchr
  2022-10-19  0:44   ` [PATCH v3 7/7] Bench: Improve benchtests for memchr, strchr, strnlen, strrchr Noah Goldstein
@ 2022-10-19 17:00     ` H.J. Lu
  0 siblings, 0 replies; 41+ messages in thread
From: H.J. Lu @ 2022-10-19 17:00 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: libc-alpha, carlos

On Tue, Oct 18, 2022 at 5:44 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> 1. Add more complete coverage in the medium size range.
> 2. In strnlen remove the `1 << i` which was UB (`i` could go beyond
>    32/64)
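
For reference, the UB in point 2 is the usual shift-width overflow; a
minimal illustration of the hazard and a defined replacement (not the
benchmark code itself):

    #include <stdint.h>

    /* `1 << i' is undefined once i reaches the bit width of int, and
       `1ULL << i' once i reaches 64, so the shift count has to be
       widened and clamped, e.g.:  */
    static inline uint64_t
    safe_bit (unsigned i)
    {
      return i < 64 ? (uint64_t) 1 << i : 0;
    }
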
> ---
>  benchtests/bench-memchr.c    | 77 +++++++++++++++++++++++++-----------
>  benchtests/bench-rawmemchr.c | 30 ++++++++++++--
>  benchtests/bench-strchr.c    | 35 +++++++++++-----
>  benchtests/bench-strnlen.c   | 12 +++---
>  benchtests/bench-strrchr.c   | 28 ++++++++++++-
>  5 files changed, 137 insertions(+), 45 deletions(-)
>
> diff --git a/benchtests/bench-memchr.c b/benchtests/bench-memchr.c
> index 0facda2fa0..2ec9dd86d0 100644
> --- a/benchtests/bench-memchr.c
> +++ b/benchtests/bench-memchr.c
> @@ -126,7 +126,7 @@ do_test (json_ctx_t *json_ctx, size_t align, size_t pos, size_t len,
>  int
>  test_main (void)
>  {
> -  size_t i;
> +  size_t i, j, al, al_max;
>    int repeats;
>    json_ctx_t json_ctx;
>    test_init ();
> @@ -147,35 +147,46 @@ test_main (void)
>
>    json_array_begin (&json_ctx, "results");
>
> +  al_max = 0;
> +#ifdef USE_AS_MEMRCHR
> +  al_max = getpagesize () / 2;
> +#endif
> +
>    for (repeats = 0; repeats < 2; ++repeats)
>      {
> -      for (i = 1; i < 8; ++i)
> +      for (al = 0; al <= al_max; al += getpagesize () / 2)
>         {
> -         do_test (&json_ctx, 0, 16 << i, 2048, 23, repeats);
> -         do_test (&json_ctx, i, 64, 256, 23, repeats);
> -         do_test (&json_ctx, 0, 16 << i, 2048, 0, repeats);
> -         do_test (&json_ctx, i, 64, 256, 0, repeats);
> -
> -         do_test (&json_ctx, getpagesize () - 15, 64, 256, 0, repeats);
> +         for (i = 1; i < 8; ++i)
> +           {
> +             do_test (&json_ctx, al, 16 << i, 2048, 23, repeats);
> +             do_test (&json_ctx, al + i, 64, 256, 23, repeats);
> +             do_test (&json_ctx, al, 16 << i, 2048, 0, repeats);
> +             do_test (&json_ctx, al + i, 64, 256, 0, repeats);
> +
> +             do_test (&json_ctx, al + getpagesize () - 15, 64, 256, 0,
> +                      repeats);
>  #ifdef USE_AS_MEMRCHR
> -         /* Also test the position close to the beginning for memrchr.  */
> -         do_test (&json_ctx, 0, i, 256, 23, repeats);
> -         do_test (&json_ctx, 0, i, 256, 0, repeats);
> -         do_test (&json_ctx, i, i, 256, 23, repeats);
> -         do_test (&json_ctx, i, i, 256, 0, repeats);
> +             /* Also test the position close to the beginning for memrchr.  */
> +             do_test (&json_ctx, al, i, 256, 23, repeats);
> +             do_test (&json_ctx, al, i, 256, 0, repeats);
> +             do_test (&json_ctx, al + i, i, 256, 23, repeats);
> +             do_test (&json_ctx, al + i, i, 256, 0, repeats);
>  #endif
> +           }
> +         for (i = 1; i < 8; ++i)
> +           {
> +             do_test (&json_ctx, al + i, i << 5, 192, 23, repeats);
> +             do_test (&json_ctx, al + i, i << 5, 192, 0, repeats);
> +             do_test (&json_ctx, al + i, i << 5, 256, 23, repeats);
> +             do_test (&json_ctx, al + i, i << 5, 256, 0, repeats);
> +             do_test (&json_ctx, al + i, i << 5, 512, 23, repeats);
> +             do_test (&json_ctx, al + i, i << 5, 512, 0, repeats);
> +
> +             do_test (&json_ctx, al + getpagesize () - 15, i << 5, 256, 23,
> +                      repeats);
> +           }
>         }
> -      for (i = 1; i < 8; ++i)
> -       {
> -         do_test (&json_ctx, i, i << 5, 192, 23, repeats);
> -         do_test (&json_ctx, i, i << 5, 192, 0, repeats);
> -         do_test (&json_ctx, i, i << 5, 256, 23, repeats);
> -         do_test (&json_ctx, i, i << 5, 256, 0, repeats);
> -         do_test (&json_ctx, i, i << 5, 512, 23, repeats);
> -         do_test (&json_ctx, i, i << 5, 512, 0, repeats);
> -
> -         do_test (&json_ctx, getpagesize () - 15, i << 5, 256, 23, repeats);
> -       }
> +
>        for (i = 1; i < 32; ++i)
>         {
>           do_test (&json_ctx, 0, i, i + 1, 23, repeats);
> @@ -207,6 +218,24 @@ test_main (void)
>           do_test (&json_ctx, 0, 2, i + 1, 0, repeats);
>  #endif
>         }
> +      for (al = 0; al <= al_max; al += getpagesize () / 2)
> +       {
> +         for (i = (16 / sizeof (CHAR)); i <= (8192 / sizeof (CHAR)); i += i)
> +           {
> +             for (j = 0; j <= (384 / sizeof (CHAR));
> +                  j += (32 / sizeof (CHAR)))
> +               {
> +                 do_test (&json_ctx, al, i + j, i, 23, repeats);
> +                 do_test (&json_ctx, al, i, i + j, 23, repeats);
> +                 if (j < i)
> +                   {
> +                     do_test (&json_ctx, al, i - j, i, 23, repeats);
> +                     do_test (&json_ctx, al, i, i - j, 23, repeats);
> +                   }
> +               }
> +           }
> +       }
> +
>  #ifndef USE_AS_MEMRCHR
>        break;
>  #endif
> diff --git a/benchtests/bench-rawmemchr.c b/benchtests/bench-rawmemchr.c
> index b1803afc14..dab77f3858 100644
> --- a/benchtests/bench-rawmemchr.c
> +++ b/benchtests/bench-rawmemchr.c
> @@ -70,7 +70,7 @@ do_test (json_ctx_t *json_ctx, size_t align, size_t pos, size_t len, int seek_ch
>    size_t i;
>    char *result;
>
> -  align &= 7;
> +  align &= getpagesize () - 1;
>    if (align + len >= page_size)
>      return;
>
> @@ -106,7 +106,6 @@ test_main (void)
>  {
>    json_ctx_t json_ctx;
>    size_t i;
> -
>    test_init ();
>
>    json_init (&json_ctx, 0, stdout);
> @@ -120,7 +119,7 @@ test_main (void)
>
>    json_array_begin (&json_ctx, "ifuncs");
>    FOR_EACH_IMPL (impl, 0)
> -      json_element_string (&json_ctx, impl->name);
> +    json_element_string (&json_ctx, impl->name);
>    json_array_end (&json_ctx);
>
>    json_array_begin (&json_ctx, "results");
> @@ -137,6 +136,31 @@ test_main (void)
>        do_test (&json_ctx, 0, i, i + 1, 23);
>        do_test (&json_ctx, 0, i, i + 1, 0);
>      }
> +  for (; i < 256; i += 32)
> +    {
> +      do_test (&json_ctx, 0, i, i + 1, 23);
> +      do_test (&json_ctx, 0, i - 1, i, 23);
> +    }
> +  for (; i < 512; i += 64)
> +    {
> +      do_test (&json_ctx, 0, i, i + 1, 23);
> +      do_test (&json_ctx, 0, i - 1, i, 23);
> +    }
> +  for (; i < 1024; i += 128)
> +    {
> +      do_test (&json_ctx, 0, i, i + 1, 23);
> +      do_test (&json_ctx, 0, i - 1, i, 23);
> +    }
> +  for (; i < 2048; i += 256)
> +    {
> +      do_test (&json_ctx, 0, i, i + 1, 23);
> +      do_test (&json_ctx, 0, i - 1, i, 23);
> +    }
> +  for (; i < 4096; i += 512)
> +    {
> +      do_test (&json_ctx, 0, i, i + 1, 23);
> +      do_test (&json_ctx, 0, i - 1, i, 23);
> +    }
>
>    json_array_end (&json_ctx);
>    json_attr_object_end (&json_ctx);
> diff --git a/benchtests/bench-strchr.c b/benchtests/bench-strchr.c
> index 54640bde7e..aeb882d442 100644
> --- a/benchtests/bench-strchr.c
> +++ b/benchtests/bench-strchr.c
> @@ -287,8 +287,8 @@ int
>  test_main (void)
>  {
>    json_ctx_t json_ctx;
> -  size_t i;
>
> +  size_t i, j;
>    test_init ();
>
>    json_init (&json_ctx, 0, stdout);
> @@ -367,15 +367,30 @@ test_main (void)
>        do_test (&json_ctx, 0, i, i + 1, 0, BIG_CHAR);
>      }
>
> -  DO_RAND_TEST(&json_ctx, 0, 15, 16, 0.0);
> -  DO_RAND_TEST(&json_ctx, 0, 15, 16, 0.1);
> -  DO_RAND_TEST(&json_ctx, 0, 15, 16, 0.25);
> -  DO_RAND_TEST(&json_ctx, 0, 15, 16, 0.33);
> -  DO_RAND_TEST(&json_ctx, 0, 15, 16, 0.5);
> -  DO_RAND_TEST(&json_ctx, 0, 15, 16, 0.66);
> -  DO_RAND_TEST(&json_ctx, 0, 15, 16, 0.75);
> -  DO_RAND_TEST(&json_ctx, 0, 15, 16, 0.9);
> -  DO_RAND_TEST(&json_ctx, 0, 15, 16, 1.0);
> +  for (i = 16 / sizeof (CHAR); i <= 8192 / sizeof (CHAR); i += i)
> +    {
> +      for (j = 32 / sizeof (CHAR); j <= 320 / sizeof (CHAR);
> +          j += 32 / sizeof (CHAR))
> +       {
> +         do_test (&json_ctx, 0, i, i + j, 0, MIDDLE_CHAR);
> +         do_test (&json_ctx, 0, i + j, i, 0, MIDDLE_CHAR);
> +         if (i > j)
> +           {
> +             do_test (&json_ctx, 0, i, i - j, 0, MIDDLE_CHAR);
> +             do_test (&json_ctx, 0, i - j, i, 0, MIDDLE_CHAR);
> +           }
> +       }
> +    }
> +
> +  DO_RAND_TEST (&json_ctx, 0, 15, 16, 0.0);
> +  DO_RAND_TEST (&json_ctx, 0, 15, 16, 0.1);
> +  DO_RAND_TEST (&json_ctx, 0, 15, 16, 0.25);
> +  DO_RAND_TEST (&json_ctx, 0, 15, 16, 0.33);
> +  DO_RAND_TEST (&json_ctx, 0, 15, 16, 0.5);
> +  DO_RAND_TEST (&json_ctx, 0, 15, 16, 0.66);
> +  DO_RAND_TEST (&json_ctx, 0, 15, 16, 0.75);
> +  DO_RAND_TEST (&json_ctx, 0, 15, 16, 0.9);
> +  DO_RAND_TEST (&json_ctx, 0, 15, 16, 1.0);
>
>    json_array_end (&json_ctx);
>    json_attr_object_end (&json_ctx);
> diff --git a/benchtests/bench-strnlen.c b/benchtests/bench-strnlen.c
> index 13b46b3f57..82c02eb6ed 100644
> --- a/benchtests/bench-strnlen.c
> +++ b/benchtests/bench-strnlen.c
> @@ -195,19 +195,19 @@ test_main (void)
>      {
>        for (j = 0; j <= (704 / sizeof (CHAR)); j += (32 / sizeof (CHAR)))
>         {
> -         do_test (&json_ctx, 0, 1 << i, (i + j), BIG_CHAR);
>           do_test (&json_ctx, 0, i + j, i, BIG_CHAR);
> -
> -         do_test (&json_ctx, 64, 1 << i, (i + j), BIG_CHAR);
>           do_test (&json_ctx, 64, i + j, i, BIG_CHAR);
>
> +         do_test (&json_ctx, 0, i, i + j, BIG_CHAR);
> +         do_test (&json_ctx, 64, i, i + j, BIG_CHAR);
> +
>           if (j < i)
>             {
> -             do_test (&json_ctx, 0, 1 << i, i - j, BIG_CHAR);
>               do_test (&json_ctx, 0, i - j, i, BIG_CHAR);
> -
> -             do_test (&json_ctx, 64, 1 << i, i - j, BIG_CHAR);
>               do_test (&json_ctx, 64, i - j, i, BIG_CHAR);
> +
> +             do_test (&json_ctx, 0, i, i - j, BIG_CHAR);
> +             do_test (&json_ctx, 64, i, i - j, BIG_CHAR);
>             }
>         }
>      }
> diff --git a/benchtests/bench-strrchr.c b/benchtests/bench-strrchr.c
> index 7cd2a15484..3fcf3f281d 100644
> --- a/benchtests/bench-strrchr.c
> +++ b/benchtests/bench-strrchr.c
> @@ -151,7 +151,7 @@ int
>  test_main (void)
>  {
>    json_ctx_t json_ctx;
> -  size_t i, j;
> +  size_t i, j, k;
>    int seek;
>
>    test_init ();
> @@ -173,7 +173,7 @@ test_main (void)
>
>    for (seek = 0; seek <= 23; seek += 23)
>      {
> -      for (j = 1; j < 32; j += j)
> +      for (j = 1; j <= 256; j = (j * 4))
>         {
>           for (i = 1; i < 9; ++i)
>             {
> @@ -197,6 +197,30 @@ test_main (void)
>               do_test (&json_ctx, getpagesize () - i / 2 - 1, i, i + 1, seek,
>                        SMALL_CHAR, j);
>             }
> +
> +         for (i = (16 / sizeof (CHAR)); i <= (288 / sizeof (CHAR)); i += 32)
> +           {
> +             do_test (&json_ctx, 0, i - 16, i, seek, SMALL_CHAR, j);
> +             do_test (&json_ctx, 0, i, i + 16, seek, SMALL_CHAR, j);
> +           }
> +
> +         for (i = (16 / sizeof (CHAR)); i <= (2048 / sizeof (CHAR)); i += i)
> +           {
> +             for (k = 0; k <= (288 / sizeof (CHAR));
> +                  k += (48 / sizeof (CHAR)))
> +               {
> +                 do_test (&json_ctx, 0, k, i, seek, SMALL_CHAR, j);
> +                 do_test (&json_ctx, 0, i, i + k, seek, SMALL_CHAR, j);
> +
> +                 if (k < i)
> +                   {
> +                     do_test (&json_ctx, 0, i - k, i, seek, SMALL_CHAR, j);
> +                     do_test (&json_ctx, 0, k, i - k, seek, SMALL_CHAR, j);
> +                     do_test (&json_ctx, 0, i, i - k, seek, SMALL_CHAR, j);
> +                   }
> +               }
> +           }
> +
>           if (seek == 0)
>             {
>               break;
> --
> 2.34.1
>

LGTM.

Thanks.

-- 
H.J.

^ permalink raw reply	[flat|nested] 41+ messages in thread

* [PATCH v4] x86: Add support for VEC_SIZE == 64 in strcmp-evex.S impl
  2022-10-18  2:49 ` [PATCH v1 6/7] x86: Add support for VEC_SIZE == 64 in strcmp-evex.S impl Noah Goldstein
@ 2022-10-20  2:15   ` Noah Goldstein
  2022-10-20  3:46     ` H.J. Lu
  0 siblings, 1 reply; 41+ messages in thread
From: Noah Goldstein @ 2022-10-20  2:15 UTC (permalink / raw)
  To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos

Unused at the moment, but evex512 strcmp, strncmp, strcasecmp{l}, and
strncasecmp{l} functions can be added by including strcmp-evex.S with
"x86-evex512-vecs.h" defined.

In addition, save a bit of code size in a few places.

1. tzcnt ...         -> bsf ...
2. vpcmp{b|d} $0 ... -> vpcmpeq{b|d}

This saves a touch of code size but has minimal net effect.
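
For illustration (a sketch, not part of the patch), both substitutions
trade an instruction for an equivalent, shorter encoding.  bsf is only
used where the source is known non-zero, since its destination is
architecturally undefined for a zero input, whereas tzcnt writes the
operand width:

	/* tzcnt is encoded as bsf with an extra F3 prefix, so dropping
	   the prefix saves one byte per site.  */
	tzcntl	%ecx, %ecx	/* f3 0f bc c9  (4 bytes).  */
	bsfl	%ecx, %ecx	/* 0f bc c9  (3 bytes).  */

	/* vpcmpeqb produces the same mask as vpcmpb with the
	   equal-predicate immediate, minus the imm8 byte.  */
	vpcmpb	$0, (%rsi), %ymm16, %k1	/* 7 bytes.  */
	vpcmpeqb	(%rsi), %ymm16, %k1	/* 6 bytes.  */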

Full check passes on x86-64.
---
 sysdeps/x86_64/multiarch/strcmp-evex.S | 684 ++++++++++++++++---------
 1 file changed, 438 insertions(+), 246 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
index e482d0167f..e47aa8ef99 100644
--- a/sysdeps/x86_64/multiarch/strcmp-evex.S
+++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
@@ -20,6 +20,10 @@
 
 #if ISA_SHOULD_BUILD (4)
 
+# ifndef VEC_SIZE
+#  include "x86-evex256-vecs.h"
+# endif
+
 # define STRCMP_ISA	_evex
 # include "strcmp-naming.h"
 
@@ -35,41 +39,57 @@
 # define PAGE_SIZE	4096
 
 	/* VEC_SIZE = Number of bytes in a ymm register.  */
-# define VEC_SIZE	32
 # define CHAR_PER_VEC	(VEC_SIZE	/	SIZE_OF_CHAR)
 
-# define VMOVU	vmovdqu64
-# define VMOVA	vmovdqa64
-
 # ifdef USE_AS_WCSCMP
-#  define TESTEQ	subl $0xff,
 	/* Compare packed dwords.  */
 #  define VPCMP	vpcmpd
+#  define VPCMPEQ	vpcmpeqd
 #  define VPMINU	vpminud
 #  define VPTESTM	vptestmd
 #  define VPTESTNM	vptestnmd
 	/* 1 dword char == 4 bytes.  */
 #  define SIZE_OF_CHAR	4
+
+#  define TESTEQ	sub $((1 << CHAR_PER_VEC) - 1),
+
+#  define USE_WIDE_CHAR
 # else
-#  define TESTEQ	incl
 	/* Compare packed bytes.  */
 #  define VPCMP	vpcmpb
+#  define VPCMPEQ	vpcmpeqb
 #  define VPMINU	vpminub
 #  define VPTESTM	vptestmb
 #  define VPTESTNM	vptestnmb
 	/* 1 byte char == 1 byte.  */
 #  define SIZE_OF_CHAR	1
+
+#  define TESTEQ	inc
+# endif
+
+# include "reg-macros.h"
+
+# if VEC_SIZE == 64
+#  define RODATA_SECTION	rodata.cst64
+# else
+#  define RODATA_SECTION	rodata.cst32
+# endif
+
+# if CHAR_PER_VEC == 64
+#  define FALLTHROUGH_RETURN_OFFSET	(VEC_SIZE * 3)
+# else
+#  define FALLTHROUGH_RETURN_OFFSET	(VEC_SIZE * 2)
 # endif
 
 # ifdef USE_AS_STRNCMP
-#  define LOOP_REG	r9d
+#  define LOOP_REG	VR9
 #  define LOOP_REG64	r9
 
 #  define OFFSET_REG8	r9b
 #  define OFFSET_REG	r9d
 #  define OFFSET_REG64	r9
 # else
-#  define LOOP_REG	edx
+#  define LOOP_REG	VRDX
 #  define LOOP_REG64	rdx
 
 #  define OFFSET_REG8	dl
@@ -83,32 +103,6 @@
 #  define VEC_OFFSET	(-VEC_SIZE)
 # endif
 
-# define XMM0	xmm17
-# define XMM1	xmm18
-
-# define XMM10	xmm27
-# define XMM11	xmm28
-# define XMM12	xmm29
-# define XMM13	xmm30
-# define XMM14	xmm31
-
-
-# define YMM0	ymm17
-# define YMM1	ymm18
-# define YMM2	ymm19
-# define YMM3	ymm20
-# define YMM4	ymm21
-# define YMM5	ymm22
-# define YMM6	ymm23
-# define YMM7	ymm24
-# define YMM8	ymm25
-# define YMM9	ymm26
-# define YMM10	ymm27
-# define YMM11	ymm28
-# define YMM12	ymm29
-# define YMM13	ymm30
-# define YMM14	ymm31
-
 # ifdef USE_AS_STRCASECMP_L
 #  define BYTE_LOOP_REG	OFFSET_REG
 # else
@@ -125,61 +119,72 @@
 #  endif
 # endif
 
-# define LCASE_MIN_YMM	%YMM12
-# define LCASE_MAX_YMM	%YMM13
-# define CASE_ADD_YMM	%YMM14
+# define LCASE_MIN_V	VMM(12)
+# define LCASE_MAX_V	VMM(13)
+# define CASE_ADD_V	VMM(14)
 
-# define LCASE_MIN_XMM	%XMM12
-# define LCASE_MAX_XMM	%XMM13
-# define CASE_ADD_XMM	%XMM14
+# if VEC_SIZE == 64
+#  define LCASE_MIN_YMM	VMM_256(12)
+#  define LCASE_MAX_YMM	VMM_256(13)
+#  define CASE_ADD_YMM	VMM_256(14)
+# endif
+
+# define LCASE_MIN_XMM	VMM_128(12)
+# define LCASE_MAX_XMM	VMM_128(13)
+# define CASE_ADD_XMM	VMM_128(14)
 
 	/* NB: wcsncmp uses r11 but strcasecmp is never used in
 	   conjunction with wcscmp.  */
 # define TOLOWER_BASE	%r11
 
 # ifdef USE_AS_STRCASECMP_L
-#  define _REG(x, y) x ## y
-#  define REG(x, y) _REG(x, y)
-#  define TOLOWER(reg1, reg2, ext)										\
-	vpsubb	REG(LCASE_MIN_, ext), reg1, REG(%ext, 10);					\
-	vpsubb	REG(LCASE_MIN_, ext), reg2, REG(%ext, 11);					\
-	vpcmpub	$1, REG(LCASE_MAX_, ext), REG(%ext, 10), %k5;				\
-	vpcmpub	$1, REG(LCASE_MAX_, ext), REG(%ext, 11), %k6;				\
-	vpaddb	reg1, REG(CASE_ADD_, ext), reg1{%k5};						\
-	vpaddb	reg2, REG(CASE_ADD_, ext), reg2{%k6}
-
-#  define TOLOWER_gpr(src, dst) movl (TOLOWER_BASE, src, 4), dst
-#  define TOLOWER_YMM(...)	TOLOWER(__VA_ARGS__, YMM)
-#  define TOLOWER_XMM(...)	TOLOWER(__VA_ARGS__, XMM)
-
-#  define CMP_R1_R2(s1_reg, s2_reg, reg_out, ext)						\
-	TOLOWER	(s1_reg, s2_reg, ext);										\
-	VPCMP	$0, s1_reg, s2_reg, reg_out
-
-#  define CMP_R1_S2(s1_reg, s2_mem, s2_reg, reg_out, ext)				\
-	VMOVU	s2_mem, s2_reg;												\
-	CMP_R1_R2(s1_reg, s2_reg, reg_out, ext)
-
-#  define CMP_R1_R2_YMM(...) CMP_R1_R2(__VA_ARGS__, YMM)
-#  define CMP_R1_R2_XMM(...) CMP_R1_R2(__VA_ARGS__, XMM)
-
-#  define CMP_R1_S2_YMM(...) CMP_R1_S2(__VA_ARGS__, YMM)
-#  define CMP_R1_S2_XMM(...) CMP_R1_S2(__VA_ARGS__, XMM)
+#  define _REG(x, y)	x ## y
+#  define REG(x, y)	_REG(x, y)
+#  define TOLOWER(reg1, reg2, ext, vec_macro)	\
+	vpsubb	%REG(LCASE_MIN_, ext), reg1, %vec_macro(10);	\
+	vpsubb	%REG(LCASE_MIN_, ext), reg2, %vec_macro(11);	\
+	vpcmpub	$1, %REG(LCASE_MAX_, ext), %vec_macro(10), %k5;	\
+	vpcmpub	$1, %REG(LCASE_MAX_, ext), %vec_macro(11), %k6;	\
+	vpaddb	reg1, %REG(CASE_ADD_, ext), reg1{%k5};	\
+	vpaddb	reg2, %REG(CASE_ADD_, ext), reg2{%k6}
+
+#  define TOLOWER_gpr(src, dst)	movl (TOLOWER_BASE, src, 4), dst
+#  define TOLOWER_VMM(...)	TOLOWER(__VA_ARGS__, V, VMM)
+#  define TOLOWER_YMM(...)	TOLOWER(__VA_ARGS__, YMM, VMM_256)
+#  define TOLOWER_XMM(...)	TOLOWER(__VA_ARGS__, XMM, VMM_128)
+
+#  define CMP_R1_R2(s1_reg, s2_reg, reg_out, ext, vec_macro)	\
+	TOLOWER	(s1_reg, s2_reg, ext, vec_macro);	\
+	VPCMPEQ	s1_reg, s2_reg, reg_out
+
+#  define CMP_R1_S2(s1_reg, s2_mem, s2_reg, reg_out, ext, vec_macro)	\
+	VMOVU	s2_mem, s2_reg;	\
+	CMP_R1_R2 (s1_reg, s2_reg, reg_out, ext, vec_macro)
+
+#  define CMP_R1_R2_VMM(...)	CMP_R1_R2(__VA_ARGS__, V, VMM)
+#  define CMP_R1_R2_YMM(...)	CMP_R1_R2(__VA_ARGS__, YMM, VMM_256)
+#  define CMP_R1_R2_XMM(...)	CMP_R1_R2(__VA_ARGS__, XMM, VMM_128)
+
+#  define CMP_R1_S2_VMM(...)	CMP_R1_S2(__VA_ARGS__, V, VMM)
+#  define CMP_R1_S2_YMM(...)	CMP_R1_S2(__VA_ARGS__, YMM, VMM_256)
+#  define CMP_R1_S2_XMM(...)	CMP_R1_S2(__VA_ARGS__, XMM, VMM_128)
 
 # else
 #  define TOLOWER_gpr(...)
+#  define TOLOWER_VMM(...)
 #  define TOLOWER_YMM(...)
 #  define TOLOWER_XMM(...)
 
-#  define CMP_R1_R2_YMM(s1_reg, s2_reg, reg_out)						\
-	VPCMP	$0, s2_reg, s1_reg, reg_out
-
-#  define CMP_R1_R2_XMM(...) CMP_R1_R2_YMM(__VA_ARGS__)
+#  define CMP_R1_R2_VMM(s1_reg, s2_reg, reg_out)	\
+	VPCMPEQ	s2_reg, s1_reg, reg_out
 
-#  define CMP_R1_S2_YMM(s1_reg, s2_mem, unused, reg_out)				\
-	VPCMP	$0, s2_mem, s1_reg, reg_out
+#  define CMP_R1_R2_YMM(...)	CMP_R1_R2_VMM(__VA_ARGS__)
+#  define CMP_R1_R2_XMM(...)	CMP_R1_R2_VMM(__VA_ARGS__)
 
-#  define CMP_R1_S2_XMM(...) CMP_R1_S2_YMM(__VA_ARGS__)
+#  define CMP_R1_S2_VMM(s1_reg, s2_mem, unused, reg_out)	\
+	VPCMPEQ	s2_mem, s1_reg, reg_out
+#  define CMP_R1_S2_YMM(...)	CMP_R1_S2_VMM(__VA_ARGS__)
+#  define CMP_R1_S2_XMM(...)	CMP_R1_S2_VMM(__VA_ARGS__)
 # endif
 
 /* Warning!
@@ -203,7 +208,7 @@
    the maximum offset is reached before a difference is found, zero is
    returned.  */
 
-	.section .text.evex, "ax", @progbits
+	.section SECTION(.text), "ax", @progbits
 	.align	16
 	.type	STRCMP, @function
 	.globl	STRCMP
@@ -232,7 +237,7 @@ STRCMP:
 #  else
 	mov	(%LOCALE_REG), %RAX_LP
 #  endif
-	testl	$1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax)
+	testb	$1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax)
 	jne	STRCASECMP_L_NONASCII
 	leaq	_nl_C_LC_CTYPE_tolower + 128 * 4(%rip), TOLOWER_BASE
 # endif
@@ -254,28 +259,46 @@ STRCMP:
 # endif
 
 # if defined USE_AS_STRCASECMP_L
-	.section .rodata.cst32, "aM", @progbits, 32
-	.align	32
+	.section RODATA_SECTION, "aM", @progbits, VEC_SIZE
+	.align	VEC_SIZE
 L(lcase_min):
 	.quad	0x4141414141414141
 	.quad	0x4141414141414141
 	.quad	0x4141414141414141
 	.quad	0x4141414141414141
+#  if VEC_SIZE == 64
+	.quad	0x4141414141414141
+	.quad	0x4141414141414141
+	.quad	0x4141414141414141
+	.quad	0x4141414141414141
+#  endif
 L(lcase_max):
 	.quad	0x1a1a1a1a1a1a1a1a
 	.quad	0x1a1a1a1a1a1a1a1a
 	.quad	0x1a1a1a1a1a1a1a1a
 	.quad	0x1a1a1a1a1a1a1a1a
+#  if VEC_SIZE == 64
+	.quad	0x1a1a1a1a1a1a1a1a
+	.quad	0x1a1a1a1a1a1a1a1a
+	.quad	0x1a1a1a1a1a1a1a1a
+	.quad	0x1a1a1a1a1a1a1a1a
+#  endif
 L(case_add):
 	.quad	0x2020202020202020
 	.quad	0x2020202020202020
 	.quad	0x2020202020202020
 	.quad	0x2020202020202020
+#  if VEC_SIZE == 64
+	.quad	0x2020202020202020
+	.quad	0x2020202020202020
+	.quad	0x2020202020202020
+	.quad	0x2020202020202020
+#  endif
 	.previous
 
-	vmovdqa64 L(lcase_min)(%rip), LCASE_MIN_YMM
-	vmovdqa64 L(lcase_max)(%rip), LCASE_MAX_YMM
-	vmovdqa64 L(case_add)(%rip), CASE_ADD_YMM
+	VMOVA	L(lcase_min)(%rip), %LCASE_MIN_V
+	VMOVA	L(lcase_max)(%rip), %LCASE_MAX_V
+	VMOVA	L(case_add)(%rip), %CASE_ADD_V
 # endif
 
 	movl	%edi, %eax
@@ -288,12 +311,12 @@ L(case_add):
 
 L(no_page_cross):
 	/* Safe to compare 4x vectors.  */
-	VMOVU	(%rdi), %YMM0
-	VPTESTM	%YMM0, %YMM0, %k2
+	VMOVU	(%rdi), %VMM(0)
+	VPTESTM	%VMM(0), %VMM(0), %k2
 	/* Each bit cleared in K1 represents a mismatch or a null CHAR
 	   in YMM0 and 32 bytes at (%rsi).  */
-	CMP_R1_S2_YMM (%YMM0, (%rsi), %YMM1, %k1){%k2}
-	kmovd	%k1, %ecx
+	CMP_R1_S2_VMM (%VMM(0), (%rsi), %VMM(1), %k1){%k2}
+	KMOV	%k1, %VRCX
 # ifdef USE_AS_STRNCMP
 	cmpq	$CHAR_PER_VEC, %rdx
 	jbe	L(vec_0_test_len)
@@ -303,14 +326,14 @@ L(no_page_cross):
 	   wcscmp/wcsncmp.  */
 
 	/* All 1s represents all equals. TESTEQ will overflow to zero in
-	   all equals case. Otherwise 1s will carry until position of first
-	   mismatch.  */
-	TESTEQ	%ecx
+	   all equals case. Otherwise 1s will carry until position of
+	   first mismatch.  */
+	TESTEQ	%VRCX
 	jz	L(more_3x_vec)
 
 	.p2align 4,, 4
 L(return_vec_0):
-	tzcntl	%ecx, %ecx
+	bsf	%VRCX, %VRCX
 # ifdef USE_AS_WCSCMP
 	movl	(%rdi, %rcx, SIZE_OF_CHAR), %edx
 	xorl	%eax, %eax
@@ -321,7 +344,16 @@ L(return_vec_0):
 	orl	$1, %eax
 # else
 	movzbl	(%rdi, %rcx), %eax
+	/* For VEC_SIZE == 64 use movb instead of movzbl to save a byte
+	   and keep logic for len <= VEC_SIZE (common) in just the
+	   first cache line.  NB: No evex512 processor has partial-
+	   register stalls.  If that changes, this ifdef can be disabled
+	   without affecting correctness.  */
+#  if !defined USE_AS_STRNCMP && !defined USE_AS_STRCASECMP_L && VEC_SIZE == 64
+	movb	(%rsi, %rcx), %cl
+#  else
 	movzbl	(%rsi, %rcx), %ecx
+#  endif
 	TOLOWER_gpr (%rax, %eax)
 	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
@@ -332,8 +364,8 @@ L(ret0):
 # ifdef USE_AS_STRNCMP
 	.p2align 4,, 4
 L(vec_0_test_len):
-	notl	%ecx
-	bzhil	%edx, %ecx, %eax
+	not	%VRCX
+	bzhi	%VRDX, %VRCX, %VRAX
 	jnz	L(return_vec_0)
 	/* Align if will cross fetch block.  */
 	.p2align 4,, 2
@@ -372,7 +404,7 @@ L(ret1):
 
 	.p2align 4,, 10
 L(return_vec_1):
-	tzcntl	%ecx, %ecx
+	bsf	%VRCX, %VRCX
 # ifdef USE_AS_STRNCMP
 	/* rdx must be > CHAR_PER_VEC so its safe to subtract without
 	   worrying about underflow.  */
@@ -401,24 +433,41 @@ L(ret2):
 	.p2align 4,, 10
 # ifdef USE_AS_STRNCMP
 L(return_vec_3):
-#  if CHAR_PER_VEC <= 16
+#  if CHAR_PER_VEC <= 32
+	/* If CHAR_PER_VEC <= 32 reuse code from L(return_vec_2) without
+	   additional branches by adjusting the bit positions from
+	   VEC3.  We can't do this for CHAR_PER_VEC == 64.  */
+#   if CHAR_PER_VEC <= 16
 	sall	$CHAR_PER_VEC, %ecx
-#  else
+#   else
 	salq	$CHAR_PER_VEC, %rcx
+#   endif
+#  else
+	/* If CHAR_PER_VEC == 64 we can't shift the return GPR so just
+	   check it.  */
+	bsf	%VRCX, %VRCX
+	addl	$(CHAR_PER_VEC), %ecx
+	cmpq	%rcx, %rdx
+	ja	L(ret_vec_3_finish)
+	xorl	%eax, %eax
+	ret
 #  endif
 # endif
+
+	/* If CHAR_PER_VEC == 64 we can't combine matches from the last
+	   2x VEC so we need a separate return label.  */
 L(return_vec_2):
 # if (CHAR_PER_VEC <= 16) || !(defined USE_AS_STRNCMP)
-	tzcntl	%ecx, %ecx
+	bsf	%VRCX, %VRCX
 # else
-	tzcntq	%rcx, %rcx
+	bsfq	%rcx, %rcx
 # endif
-
 # ifdef USE_AS_STRNCMP
 	cmpq	%rcx, %rdx
 	jbe	L(ret_zero)
 # endif
 
+L(ret_vec_3_finish):
 # ifdef USE_AS_WCSCMP
 	movl	(VEC_SIZE * 2)(%rdi, %rcx, SIZE_OF_CHAR), %edx
 	xorl	%eax, %eax
@@ -440,7 +489,7 @@ L(ret3):
 # ifndef USE_AS_STRNCMP
 	.p2align 4,, 10
 L(return_vec_3):
-	tzcntl	%ecx, %ecx
+	bsf	%VRCX, %VRCX
 #  ifdef USE_AS_WCSCMP
 	movl	(VEC_SIZE * 3)(%rdi, %rcx, SIZE_OF_CHAR), %edx
 	xorl	%eax, %eax
@@ -465,11 +514,11 @@ L(ret4):
 	.p2align 5
 L(more_3x_vec):
 	/* Safe to compare 4x vectors.  */
-	VMOVU	(VEC_SIZE)(%rdi), %YMM0
-	VPTESTM	%YMM0, %YMM0, %k2
-	CMP_R1_S2_YMM (%YMM0, VEC_SIZE(%rsi), %YMM1, %k1){%k2}
-	kmovd	%k1, %ecx
-	TESTEQ	%ecx
+	VMOVU	(VEC_SIZE)(%rdi), %VMM(0)
+	VPTESTM	%VMM(0), %VMM(0), %k2
+	CMP_R1_S2_VMM (%VMM(0), VEC_SIZE(%rsi), %VMM(1), %k1){%k2}
+	KMOV	%k1, %VRCX
+	TESTEQ	%VRCX
 	jnz	L(return_vec_1)
 
 # ifdef USE_AS_STRNCMP
@@ -477,18 +526,18 @@ L(more_3x_vec):
 	jbe	L(ret_zero)
 # endif
 
-	VMOVU	(VEC_SIZE * 2)(%rdi), %YMM0
-	VPTESTM	%YMM0, %YMM0, %k2
-	CMP_R1_S2_YMM (%YMM0, (VEC_SIZE * 2)(%rsi), %YMM1, %k1){%k2}
-	kmovd	%k1, %ecx
-	TESTEQ	%ecx
+	VMOVU	(VEC_SIZE * 2)(%rdi), %VMM(0)
+	VPTESTM	%VMM(0), %VMM(0), %k2
+	CMP_R1_S2_VMM (%VMM(0), (VEC_SIZE * 2)(%rsi), %VMM(1), %k1){%k2}
+	KMOV	%k1, %VRCX
+	TESTEQ	%VRCX
 	jnz	L(return_vec_2)
 
-	VMOVU	(VEC_SIZE * 3)(%rdi), %YMM0
-	VPTESTM	%YMM0, %YMM0, %k2
-	CMP_R1_S2_YMM (%YMM0, (VEC_SIZE * 3)(%rsi), %YMM1, %k1){%k2}
-	kmovd	%k1, %ecx
-	TESTEQ	%ecx
+	VMOVU	(VEC_SIZE * 3)(%rdi), %VMM(0)
+	VPTESTM	%VMM(0), %VMM(0), %k2
+	CMP_R1_S2_VMM (%VMM(0), (VEC_SIZE * 3)(%rsi), %VMM(1), %k1){%k2}
+	KMOV	%k1, %VRCX
+	TESTEQ	%VRCX
 	jnz	L(return_vec_3)
 
 # ifdef USE_AS_STRNCMP
@@ -565,110 +614,123 @@ L(loop):
 
 	/* Loop entry after handling page cross during loop.  */
 L(loop_skip_page_cross_check):
-	VMOVA	(VEC_SIZE * 0)(%rdi), %YMM0
-	VMOVA	(VEC_SIZE * 1)(%rdi), %YMM2
-	VMOVA	(VEC_SIZE * 2)(%rdi), %YMM4
-	VMOVA	(VEC_SIZE * 3)(%rdi), %YMM6
+	VMOVA	(VEC_SIZE * 0)(%rdi), %VMM(0)
+	VMOVA	(VEC_SIZE * 1)(%rdi), %VMM(2)
+	VMOVA	(VEC_SIZE * 2)(%rdi), %VMM(4)
+	VMOVA	(VEC_SIZE * 3)(%rdi), %VMM(6)
 
-	VPMINU	%YMM0, %YMM2, %YMM8
-	VPMINU	%YMM4, %YMM6, %YMM9
+	VPMINU	%VMM(0), %VMM(2), %VMM(8)
+	VPMINU	%VMM(4), %VMM(6), %VMM(9)
 
 	/* A zero CHAR in YMM9 means that there is a null CHAR.  */
-	VPMINU	%YMM8, %YMM9, %YMM9
+	VPMINU	%VMM(8), %VMM(9), %VMM(9)
 
 	/* Each bit set in K1 represents a non-null CHAR in YMM9.  */
-	VPTESTM	%YMM9, %YMM9, %k1
+	VPTESTM	%VMM(9), %VMM(9), %k1
 # ifndef USE_AS_STRCASECMP_L
-	vpxorq	(VEC_SIZE * 0)(%rsi), %YMM0, %YMM1
-	vpxorq	(VEC_SIZE * 1)(%rsi), %YMM2, %YMM3
-	vpxorq	(VEC_SIZE * 2)(%rsi), %YMM4, %YMM5
+	vpxorq	(VEC_SIZE * 0)(%rsi), %VMM(0), %VMM(1)
+	vpxorq	(VEC_SIZE * 1)(%rsi), %VMM(2), %VMM(3)
+	vpxorq	(VEC_SIZE * 2)(%rsi), %VMM(4), %VMM(5)
 	/* Ternary logic to xor (VEC_SIZE * 3)(%rsi) with YMM6 while
 	   oring with YMM1. Result is stored in YMM6.  */
-	vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM1, %YMM6
+	vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %VMM(1), %VMM(6)
 # else
-	VMOVU	(VEC_SIZE * 0)(%rsi), %YMM1
-	TOLOWER_YMM (%YMM0, %YMM1)
-	VMOVU	(VEC_SIZE * 1)(%rsi), %YMM3
-	TOLOWER_YMM (%YMM2, %YMM3)
-	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM5
-	TOLOWER_YMM (%YMM4, %YMM5)
-	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM7
-	TOLOWER_YMM (%YMM6, %YMM7)
-	vpxorq	%YMM0, %YMM1, %YMM1
-	vpxorq	%YMM2, %YMM3, %YMM3
-	vpxorq	%YMM4, %YMM5, %YMM5
-	vpternlogd $0xde, %YMM7, %YMM1, %YMM6
+	VMOVU	(VEC_SIZE * 0)(%rsi), %VMM(1)
+	TOLOWER_VMM (%VMM(0), %VMM(1))
+	VMOVU	(VEC_SIZE * 1)(%rsi), %VMM(3)
+	TOLOWER_VMM (%VMM(2), %VMM(3))
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VMM(5)
+	TOLOWER_VMM (%VMM(4), %VMM(5))
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VMM(7)
+	TOLOWER_VMM (%VMM(6), %VMM(7))
+	vpxorq	%VMM(0), %VMM(1), %VMM(1)
+	vpxorq	%VMM(2), %VMM(3), %VMM(3)
+	vpxorq	%VMM(4), %VMM(5), %VMM(5)
+	vpternlogd $0xde, %VMM(7), %VMM(1), %VMM(6)
 # endif
 	/* Or together YMM3, YMM5, and YMM6.  */
-	vpternlogd $0xfe, %YMM3, %YMM5, %YMM6
+	vpternlogd $0xfe, %VMM(3), %VMM(5), %VMM(6)
 
 
 	/* A non-zero CHAR in YMM6 represents a mismatch.  */
-	VPTESTNM %YMM6, %YMM6, %k0{%k1}
-	kmovd	%k0, %LOOP_REG
+	VPTESTNM %VMM(6), %VMM(6), %k0{%k1}
+	KMOV	%k0, %LOOP_REG
 
 	TESTEQ	%LOOP_REG
 	jz	L(loop)
 
 
 	/* Find which VEC has the mismatch of end of string.  */
-	VPTESTM	%YMM0, %YMM0, %k1
-	VPTESTNM %YMM1, %YMM1, %k0{%k1}
-	kmovd	%k0, %ecx
-	TESTEQ	%ecx
+	VPTESTM	%VMM(0), %VMM(0), %k1
+	VPTESTNM %VMM(1), %VMM(1), %k0{%k1}
+	KMOV	%k0, %VRCX
+	TESTEQ	%VRCX
 	jnz	L(return_vec_0_end)
 
-	VPTESTM	%YMM2, %YMM2, %k1
-	VPTESTNM %YMM3, %YMM3, %k0{%k1}
-	kmovd	%k0, %ecx
-	TESTEQ	%ecx
+	VPTESTM	%VMM(2), %VMM(2), %k1
+	VPTESTNM %VMM(3), %VMM(3), %k0{%k1}
+	KMOV	%k0, %VRCX
+	TESTEQ	%VRCX
 	jnz	L(return_vec_1_end)
 
 
-	/* Handle VEC 2 and 3 without branches.  */
+	/* Handle VEC 2 and 3 without branches if CHAR_PER_VEC <= 32.
+	 */
 L(return_vec_2_3_end):
 # ifdef USE_AS_STRNCMP
 	subq	$(CHAR_PER_VEC * 2), %rdx
 	jbe	L(ret_zero_end)
 # endif
 
-	VPTESTM	%YMM4, %YMM4, %k1
-	VPTESTNM %YMM5, %YMM5, %k0{%k1}
-	kmovd	%k0, %ecx
-	TESTEQ	%ecx
+	VPTESTM	%VMM(4), %VMM(4), %k1
+	VPTESTNM %VMM(5), %VMM(5), %k0{%k1}
+	KMOV	%k0, %VRCX
+	TESTEQ	%VRCX
 # if CHAR_PER_VEC <= 16
 	sall	$CHAR_PER_VEC, %LOOP_REG
 	orl	%ecx, %LOOP_REG
-# else
+# elif CHAR_PER_VEC <= 32
 	salq	$CHAR_PER_VEC, %LOOP_REG64
 	orq	%rcx, %LOOP_REG64
+# else
+	/* We aren't combining the last 2x VEC so branch on the second
+	   to last.  */
+	jnz	L(return_vec_2_end)
 # endif
-L(return_vec_3_end):
+
 	/* LOOP_REG contains matches for null/mismatch from the loop. If
-	   VEC 0,1,and 2 all have no null and no mismatches then mismatch
-	   must entirely be from VEC 3 which is fully represented by
-	   LOOP_REG.  */
+	   VEC 0,1,and 2 all have no null and no mismatches then
+	   mismatch must entirely be from VEC 3 which is fully
+	   represented by LOOP_REG.  */
 # if CHAR_PER_VEC <= 16
-	tzcntl	%LOOP_REG, %LOOP_REG
+	bsf	%LOOP_REG, %LOOP_REG
 # else
-	tzcntq	%LOOP_REG64, %LOOP_REG64
+	bsfq	%LOOP_REG64, %LOOP_REG64
 # endif
 # ifdef USE_AS_STRNCMP
+
+	/* If CHAR_PER_VEC == 64 we can't combine last 2x VEC so need to
+	   adj length before last comparison.  */
+#  if CHAR_PER_VEC == 64
+	subq	$CHAR_PER_VEC, %rdx
+	jbe	L(ret_zero_end)
+#  endif
+
 	cmpq	%LOOP_REG64, %rdx
 	jbe	L(ret_zero_end)
 # endif
 
 # ifdef USE_AS_WCSCMP
-	movl	(VEC_SIZE * 2)(%rdi, %LOOP_REG64, SIZE_OF_CHAR), %ecx
+	movl	(FALLTHROUGH_RETURN_OFFSET)(%rdi, %LOOP_REG64, SIZE_OF_CHAR), %ecx
 	xorl	%eax, %eax
-	cmpl	(VEC_SIZE * 2)(%rsi, %LOOP_REG64, SIZE_OF_CHAR), %ecx
+	cmpl	(FALLTHROUGH_RETURN_OFFSET)(%rsi, %LOOP_REG64, SIZE_OF_CHAR), %ecx
 	je	L(ret5)
 	setl	%al
 	negl	%eax
 	xorl	%r8d, %eax
 # else
-	movzbl	(VEC_SIZE * 2)(%rdi, %LOOP_REG64), %eax
-	movzbl	(VEC_SIZE * 2)(%rsi, %LOOP_REG64), %ecx
+	movzbl	(FALLTHROUGH_RETURN_OFFSET)(%rdi, %LOOP_REG64), %eax
+	movzbl	(FALLTHROUGH_RETURN_OFFSET)(%rsi, %LOOP_REG64), %ecx
 	TOLOWER_gpr (%rax, %eax)
 	TOLOWER_gpr (%rcx, %ecx)
 	subl	%ecx, %eax
@@ -686,23 +748,39 @@ L(ret_zero_end):
 # endif
 
 
+
 	/* The L(return_vec_N_end) differ from L(return_vec_N) in that
-	   they use the value of `r8` to negate the return value. This is
-	   because the page cross logic can swap `rdi` and `rsi`.  */
+	   they use the value of `r8` to negate the return value. This
+	   is because the page cross logic can swap `rdi` and `rsi`.
+	 */
 	.p2align 4,, 10
 # ifdef USE_AS_STRNCMP
 L(return_vec_1_end):
-#  if CHAR_PER_VEC <= 16
+#  if CHAR_PER_VEC <= 32
+	/* If CHAR_PER_VEC <= 32 reuse code from L(return_vec_0_end)
+	   without additional branches by adjusting the bit positions
+	   from VEC1.  We can't do this for CHAR_PER_VEC == 64.  */
+#   if CHAR_PER_VEC <= 16
 	sall	$CHAR_PER_VEC, %ecx
-#  else
+#   else
 	salq	$CHAR_PER_VEC, %rcx
+#   endif
+#  else
+	/* If CHAR_PER_VEC == 64 we can't shift the return GPR so just
+	   check it.  */
+	bsf	%VRCX, %VRCX
+	addl	$(CHAR_PER_VEC), %ecx
+	cmpq	%rcx, %rdx
+	ja	L(ret_vec_0_end_finish)
+	xorl	%eax, %eax
+	ret
 #  endif
 # endif
 L(return_vec_0_end):
 # if (CHAR_PER_VEC <= 16) || !(defined USE_AS_STRNCMP)
-	tzcntl	%ecx, %ecx
+	bsf	%VRCX, %VRCX
 # else
-	tzcntq	%rcx, %rcx
+	bsfq	%rcx, %rcx
 # endif
 
 # ifdef USE_AS_STRNCMP
@@ -710,6 +788,7 @@ L(return_vec_0_end):
 	jbe	L(ret_zero_end)
 # endif
 
+L(ret_vec_0_end_finish):
 # ifdef USE_AS_WCSCMP
 	movl	(%rdi, %rcx, SIZE_OF_CHAR), %edx
 	xorl	%eax, %eax
@@ -737,7 +816,7 @@ L(ret6):
 # ifndef USE_AS_STRNCMP
 	.p2align 4,, 10
 L(return_vec_1_end):
-	tzcntl	%ecx, %ecx
+	bsf	%VRCX, %VRCX
 #  ifdef USE_AS_WCSCMP
 	movl	VEC_SIZE(%rdi, %rcx, SIZE_OF_CHAR), %edx
 	xorl	%eax, %eax
@@ -760,6 +839,41 @@ L(ret7):
 # endif
 
 
+	/* If CHAR_PER_VEC == 64 we can't combine matches from the last
+	   2x VEC so we need a separate return label.  */
+# if CHAR_PER_VEC == 64
+L(return_vec_2_end):
+	bsf	%VRCX, %VRCX
+#  ifdef USE_AS_STRNCMP
+	cmpq	%rcx, %rdx
+	jbe	L(ret_zero_end)
+#  endif
+#  ifdef USE_AS_WCSCMP
+	movl	(VEC_SIZE * 2)(%rdi, %rcx, SIZE_OF_CHAR), %edx
+	xorl	%eax, %eax
+	cmpl	(VEC_SIZE * 2)(%rsi, %rcx, SIZE_OF_CHAR), %edx
+	je	L(ret31)
+	setl	%al
+	negl	%eax
+	/* This is the non-zero case for `eax` so just xorl with `r8d`
+	   to flip it if `rdi` and `rsi` were swapped.  */
+	xorl	%r8d, %eax
+#  else
+	movzbl	(VEC_SIZE * 2)(%rdi, %rcx), %eax
+	movzbl	(VEC_SIZE * 2)(%rsi, %rcx), %ecx
+	TOLOWER_gpr (%rax, %eax)
+	TOLOWER_gpr (%rcx, %ecx)
+	subl	%ecx, %eax
+	/* Flip `eax` if `rdi` and `rsi` were swapped in page cross
+	   logic. Subtract `r8d` after xor for zero case.  */
+	xorl	%r8d, %eax
+	subl	%r8d, %eax
+#  endif
+L(ret13):
+	ret
+# endif
+
+
 	/* Page cross in rsi in next 4x VEC.  */
 
 	/* TODO: Improve logic here.  */
@@ -778,11 +892,11 @@ L(page_cross_during_loop):
 	cmpl	$-(VEC_SIZE * 3), %eax
 	jle	L(less_1x_vec_till_page_cross)
 
-	VMOVA	(%rdi), %YMM0
-	VPTESTM	%YMM0, %YMM0, %k2
-	CMP_R1_S2_YMM (%YMM0, (%rsi), %YMM1, %k1){%k2}
-	kmovd	%k1, %ecx
-	TESTEQ	%ecx
+	VMOVA	(%rdi), %VMM(0)
+	VPTESTM	%VMM(0), %VMM(0), %k2
+	CMP_R1_S2_VMM (%VMM(0), (%rsi), %VMM(1), %k1){%k2}
+	KMOV	%k1, %VRCX
+	TESTEQ	%VRCX
 	jnz	L(return_vec_0_end)
 
 	/* if distance >= 2x VEC then eax > -(VEC_SIZE * 2).  */
@@ -799,9 +913,9 @@ L(less_1x_vec_till_page_cross):
 	   to read back -VEC_SIZE. If rdi is truly at the start of a page
 	   here, it means the previous page (rdi - VEC_SIZE) has already
 	   been loaded earlier so must be valid.  */
-	VMOVU	-VEC_SIZE(%rdi, %rax), %YMM0
-	VPTESTM	%YMM0, %YMM0, %k2
-	CMP_R1_S2_YMM (%YMM0, -VEC_SIZE(%rsi, %rax), %YMM1, %k1){%k2}
+	VMOVU	-VEC_SIZE(%rdi, %rax), %VMM(0)
+	VPTESTM	%VMM(0), %VMM(0), %k2
+	CMP_R1_S2_VMM (%VMM(0), -VEC_SIZE(%rsi, %rax), %VMM(1), %k1){%k2}
 	/* Mask of potentially valid bits. The lower bits can be out of
 	   range comparisons (but safe regarding page crosses).  */
 
@@ -811,14 +925,22 @@ L(less_1x_vec_till_page_cross):
 	andl	$(VEC_SIZE - 1), %ecx
 	shrl	$2, %ecx
 	shlxl	%ecx, %r10d, %ecx
+	/* Depending on CHAR_PER_VEC extract mask for possible in-bound
+	   matches.  */
+#  if CHAR_PER_VEC == 16
+	movzwl	%cx, %r10d
+#  elif CHAR_PER_VEC == 8
 	movzbl	%cl, %r10d
+#  else
+#   error "Invalid CHAR_SIZE or VEC_SIZE"
+#  endif
 # else
-	movl	$-1, %ecx
-	shlxl	%esi, %ecx, %r10d
+	mov	$-1, %VRCX
+	shlx	%VRSI, %VRCX, %VR10
 # endif
 
-	kmovd	%k1, %ecx
-	notl	%ecx
+	KMOV	%k1, %VRCX
+	not	%VRCX
 
 
 # ifdef USE_AS_STRNCMP
@@ -838,12 +960,10 @@ L(less_1x_vec_till_page_cross):
 	/* Readjust eax before potentially returning to the loop.  */
 	addl	$(PAGE_SIZE - VEC_SIZE * 4), %eax
 
-	andl	%r10d, %ecx
+	and	%VR10, %VRCX
 	jz	L(loop_skip_page_cross_check)
 
-	.p2align 4,, 3
-L(return_page_cross_end):
-	tzcntl	%ecx, %ecx
+	bsf	%VRCX, %VRCX
 
 # if (defined USE_AS_STRNCMP) || (defined USE_AS_WCSCMP)
 	leal	-VEC_SIZE(%OFFSET_REG64, %rcx, SIZE_OF_CHAR), %ecx
@@ -874,8 +994,12 @@ L(ret8):
 # ifdef USE_AS_STRNCMP
 	.p2align 4,, 10
 L(return_page_cross_end_check):
-	andl	%r10d, %ecx
-	tzcntl	%ecx, %ecx
+	and	%VR10, %VRCX
+	/* Need to use tzcnt here as VRCX may be zero.  If VRCX is zero
+	   tzcnt(VRCX) will be CHAR_PER_VEC and the remaining length (edx)
+	   is guaranteed to be <= CHAR_PER_VEC so we will only use the return
+	   idx if VRCX was non-zero.  */
+	tzcnt	%VRCX, %VRCX
 	leal	-VEC_SIZE(%rax, %rcx, SIZE_OF_CHAR), %ecx
 #  ifdef USE_AS_WCSCMP
 	sall	$2, %edx
@@ -892,11 +1016,11 @@ L(more_2x_vec_till_page_cross):
 	/* If more 2x vec till cross we will complete a full loop
 	   iteration here.  */
 
-	VMOVA	VEC_SIZE(%rdi), %YMM0
-	VPTESTM	%YMM0, %YMM0, %k2
-	CMP_R1_S2_YMM (%YMM0, VEC_SIZE(%rsi), %YMM1, %k1){%k2}
-	kmovd	%k1, %ecx
-	TESTEQ	%ecx
+	VMOVA	VEC_SIZE(%rdi), %VMM(0)
+	VPTESTM	%VMM(0), %VMM(0), %k2
+	CMP_R1_S2_VMM (%VMM(0), VEC_SIZE(%rsi), %VMM(1), %k1){%k2}
+	KMOV	%k1, %VRCX
+	TESTEQ	%VRCX
 	jnz	L(return_vec_1_end)
 
 # ifdef USE_AS_STRNCMP
@@ -907,18 +1031,18 @@ L(more_2x_vec_till_page_cross):
 	subl	$-(VEC_SIZE * 4), %eax
 
 	/* Safe to include comparisons from lower bytes.  */
-	VMOVU	-(VEC_SIZE * 2)(%rdi, %rax), %YMM0
-	VPTESTM	%YMM0, %YMM0, %k2
-	CMP_R1_S2_YMM (%YMM0, -(VEC_SIZE * 2)(%rsi, %rax), %YMM1, %k1){%k2}
-	kmovd	%k1, %ecx
-	TESTEQ	%ecx
+	VMOVU	-(VEC_SIZE * 2)(%rdi, %rax), %VMM(0)
+	VPTESTM	%VMM(0), %VMM(0), %k2
+	CMP_R1_S2_VMM (%VMM(0), -(VEC_SIZE * 2)(%rsi, %rax), %VMM(1), %k1){%k2}
+	KMOV	%k1, %VRCX
+	TESTEQ	%VRCX
 	jnz	L(return_vec_page_cross_0)
 
-	VMOVU	-(VEC_SIZE * 1)(%rdi, %rax), %YMM0
-	VPTESTM	%YMM0, %YMM0, %k2
-	CMP_R1_S2_YMM (%YMM0, -(VEC_SIZE * 1)(%rsi, %rax), %YMM1, %k1){%k2}
-	kmovd	%k1, %ecx
-	TESTEQ	%ecx
+	VMOVU	-(VEC_SIZE * 1)(%rdi, %rax), %VMM(0)
+	VPTESTM	%VMM(0), %VMM(0), %k2
+	CMP_R1_S2_VMM (%VMM(0), -(VEC_SIZE * 1)(%rsi, %rax), %VMM(1), %k1){%k2}
+	KMOV	%k1, %VRCX
+	TESTEQ	%VRCX
 	jnz	L(return_vec_page_cross_1)
 
 # ifdef USE_AS_STRNCMP
@@ -937,30 +1061,30 @@ L(more_2x_vec_till_page_cross):
 # endif
 
 	/* Finish the loop.  */
-	VMOVA	(VEC_SIZE * 2)(%rdi), %YMM4
-	VMOVA	(VEC_SIZE * 3)(%rdi), %YMM6
-	VPMINU	%YMM4, %YMM6, %YMM9
-	VPTESTM	%YMM9, %YMM9, %k1
+	VMOVA	(VEC_SIZE * 2)(%rdi), %VMM(4)
+	VMOVA	(VEC_SIZE * 3)(%rdi), %VMM(6)
+	VPMINU	%VMM(4), %VMM(6), %VMM(9)
+	VPTESTM	%VMM(9), %VMM(9), %k1
 # ifndef USE_AS_STRCASECMP_L
-	vpxorq	(VEC_SIZE * 2)(%rsi), %YMM4, %YMM5
+	vpxorq	(VEC_SIZE * 2)(%rsi), %VMM(4), %VMM(5)
 	/* YMM6 = YMM5 | ((VEC_SIZE * 3)(%rsi) ^ YMM6).  */
-	vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM5, %YMM6
+	vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %VMM(5), %VMM(6)
 # else
-	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM5
-	TOLOWER_YMM (%YMM4, %YMM5)
-	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM7
-	TOLOWER_YMM (%YMM6, %YMM7)
-	vpxorq	%YMM4, %YMM5, %YMM5
-	vpternlogd $0xde, %YMM7, %YMM5, %YMM6
-# endif
-	VPTESTNM %YMM6, %YMM6, %k0{%k1}
-	kmovd	%k0, %LOOP_REG
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VMM(5)
+	TOLOWER_VMM (%VMM(4), %VMM(5))
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VMM(7)
+	TOLOWER_VMM (%VMM(6), %VMM(7))
+	vpxorq	%VMM(4), %VMM(5), %VMM(5)
+	vpternlogd $0xde, %VMM(7), %VMM(5), %VMM(6)
+# endif
+	VPTESTNM %VMM(6), %VMM(6), %k0{%k1}
+	KMOV	%k0, %LOOP_REG
 	TESTEQ	%LOOP_REG
 	jnz	L(return_vec_2_3_end)
 
 	/* Best for code size to include ucond-jmp here. Would be faster
-	   if this case is hot to duplicate the L(return_vec_2_3_end) code
-	   as fall-through and have jump back to loop on mismatch
+	   if this case is hot to duplicate the L(return_vec_2_3_end)
+	   code as fall-through and have jump back to loop on mismatch
 	   comparison.  */
 	subq	$-(VEC_SIZE * 4), %rdi
 	subq	$-(VEC_SIZE * 4), %rsi
@@ -980,7 +1104,7 @@ L(ret_zero_in_loop_page_cross):
 L(return_vec_page_cross_0):
 	addl	$-VEC_SIZE, %eax
 L(return_vec_page_cross_1):
-	tzcntl	%ecx, %ecx
+	bsf	%VRCX, %VRCX
 # if defined USE_AS_STRNCMP || defined USE_AS_WCSCMP
 	leal	-VEC_SIZE(%rax, %rcx, SIZE_OF_CHAR), %ecx
 #  ifdef USE_AS_STRNCMP
@@ -1023,8 +1147,8 @@ L(ret9):
 L(page_cross):
 # ifndef USE_AS_STRNCMP
 	/* If both are VEC aligned we don't need any special logic here.
-	   Only valid for strcmp where stop condition is guranteed to be
-	   reachable by just reading memory.  */
+	   Only valid for strcmp where the stop condition is guaranteed
+	   to be reachable by just reading memory.  */
 	testl	$((VEC_SIZE - 1) << 20), %eax
 	jz	L(no_page_cross)
 # endif
@@ -1065,11 +1189,11 @@ L(page_cross):
 	   loadable memory until within 1x VEC of page cross.  */
 	.p2align 4,, 8
 L(page_cross_loop):
-	VMOVU	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0
-	VPTESTM	%YMM0, %YMM0, %k2
-	CMP_R1_S2_YMM (%YMM0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM1, %k1){%k2}
-	kmovd	%k1, %ecx
-	TESTEQ	%ecx
+	VMOVU	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %VMM(0)
+	VPTESTM	%VMM(0), %VMM(0), %k2
+	CMP_R1_S2_VMM (%VMM(0), (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %VMM(1), %k1){%k2}
+	KMOV	%k1, %VRCX
+	TESTEQ	%VRCX
 	jnz	L(check_ret_vec_page_cross)
 	addl	$CHAR_PER_VEC, %OFFSET_REG
 # ifdef USE_AS_STRNCMP
@@ -1087,13 +1211,13 @@ L(page_cross_loop):
 	subl	%eax, %OFFSET_REG
 	/* OFFSET_REG has distance to page cross - VEC_SIZE. Guranteed
 	   to not cross page so is safe to load. Since we have already
-	   loaded at least 1 VEC from rsi it is also guranteed to be safe.
-	 */
-	VMOVU	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0
-	VPTESTM	%YMM0, %YMM0, %k2
-	CMP_R1_S2_YMM (%YMM0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM1, %k1){%k2}
+	   loaded at least 1 VEC from rsi it is also guaranteed to be
+	   safe.  */
+	VMOVU	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %VMM(0)
+	VPTESTM	%VMM(0), %VMM(0), %k2
+	CMP_R1_S2_VMM (%VMM(0), (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %VMM(1), %k1){%k2}
 
-	kmovd	%k1, %ecx
+	KMOV	%k1, %VRCX
 # ifdef USE_AS_STRNCMP
 	leal	CHAR_PER_VEC(%OFFSET_REG64), %eax
 	cmpq	%rax, %rdx
@@ -1104,7 +1228,7 @@ L(page_cross_loop):
 	addq	%rdi, %rdx
 #  endif
 # endif
-	TESTEQ	%ecx
+	TESTEQ	%VRCX
 	jz	L(prepare_loop_no_len)
 
 	.p2align 4,, 4
@@ -1112,7 +1236,7 @@ L(ret_vec_page_cross):
 # ifndef USE_AS_STRNCMP
 L(check_ret_vec_page_cross):
 # endif
-	tzcntl	%ecx, %ecx
+	tzcnt	%VRCX, %VRCX
 	addl	%OFFSET_REG, %ecx
 L(ret_vec_page_cross_cont):
 # ifdef USE_AS_WCSCMP
@@ -1139,9 +1263,9 @@ L(ret12):
 # ifdef USE_AS_STRNCMP
 	.p2align 4,, 10
 L(check_ret_vec_page_cross2):
-	TESTEQ	%ecx
+	TESTEQ	%VRCX
 L(check_ret_vec_page_cross):
-	tzcntl	%ecx, %ecx
+	tzcnt	%VRCX, %VRCX
 	addl	%OFFSET_REG, %ecx
 	cmpq	%rcx, %rdx
 	ja	L(ret_vec_page_cross_cont)
@@ -1180,8 +1304,71 @@ L(less_1x_vec_till_page):
 # ifdef USE_AS_WCSCMP
 	shrl	$2, %eax
 # endif
+
+	/* Find the largest load size we can use.  For VEC_SIZE == 64,
+	   only check whether we can do a full ymm load.  */
+# if VEC_SIZE == 64
+
+	cmpl	$((VEC_SIZE - 32) / SIZE_OF_CHAR), %eax
+	ja	L(less_32_till_page)
+
+
+	/* Use a 32 byte (ymm) comparison.  */
+	VMOVU	(%rdi), %VMM_256(0)
+	VPTESTM	%VMM_256(0), %VMM_256(0), %k2
+	CMP_R1_S2_YMM (%VMM_256(0), (%rsi), %VMM_256(1), %k1){%k2}
+	kmovd	%k1, %ecx
+#  ifdef USE_AS_WCSCMP
+	subl	$0xff, %ecx
+#  else
+	incl	%ecx
+#  endif
+	jnz	L(check_ret_vec_page_cross)
+	movl	$((VEC_SIZE - 32) / SIZE_OF_CHAR), %OFFSET_REG
+#  ifdef USE_AS_STRNCMP
+	cmpq	%OFFSET_REG64, %rdx
+	jbe	L(ret_zero_page_cross_slow_case64)
+	subl	%eax, %OFFSET_REG
+#  else
+	/* Explicit check for 32 byte alignment.  */
+	subl	%eax, %OFFSET_REG
+	jz	L(prepare_loop)
+#  endif
+	VMOVU	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %VMM_256(0)
+	VPTESTM	%VMM_256(0), %VMM_256(0), %k2
+	CMP_R1_S2_YMM (%VMM_256(0), (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %VMM_256(1), %k1){%k2}
+	kmovd	%k1, %ecx
+#  ifdef USE_AS_WCSCMP
+	subl	$0xff, %ecx
+#  else
+	incl	%ecx
+#  endif
+	jnz	L(check_ret_vec_page_cross)
+#  ifdef USE_AS_STRNCMP
+	addl	$(32 / SIZE_OF_CHAR), %OFFSET_REG
+	subq	%OFFSET_REG64, %rdx
+	jbe	L(ret_zero_page_cross_slow_case64)
+	subq	$-(CHAR_PER_VEC * 4), %rdx
+
+	leaq	-(VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
+	leaq	-(VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
+#  else
+	leaq	(32 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
+	leaq	(32 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
+#  endif
+	jmp	L(prepare_loop_aligned)
+
+#  ifdef USE_AS_STRNCMP
+	.p2align 4,, 2
+L(ret_zero_page_cross_slow_case64):
+	xorl	%eax, %eax
+	ret
+#  endif
+L(less_32_till_page):
+# endif
+
 	/* Find largest load size we can use.  */
-	cmpl	$(16 / SIZE_OF_CHAR), %eax
+	cmpl	$((VEC_SIZE - 16) / SIZE_OF_CHAR), %eax
 	ja	L(less_16_till_page)
 
 	/* Use 16 byte comparison.  */
@@ -1195,9 +1382,14 @@ L(less_1x_vec_till_page):
 	incw	%cx
 # endif
 	jnz	L(check_ret_vec_page_cross)
-	movl	$(16 / SIZE_OF_CHAR), %OFFSET_REG
+
+	movl	$((VEC_SIZE - 16) / SIZE_OF_CHAR), %OFFSET_REG
 # ifdef USE_AS_STRNCMP
+#  if VEC_SIZE == 32
 	cmpq	%OFFSET_REG64, %rdx
+#  else
+	cmpq	$(16 / SIZE_OF_CHAR), %rdx
+#  endif
 	jbe	L(ret_zero_page_cross_slow_case0)
 	subl	%eax, %OFFSET_REG
 # else
@@ -1239,7 +1431,7 @@ L(ret_zero_page_cross_slow_case0):
 
 	.p2align 4,, 10
 L(less_16_till_page):
-	cmpl	$(24 / SIZE_OF_CHAR), %eax
+	cmpl	$((VEC_SIZE - 8) / SIZE_OF_CHAR), %eax
 	ja	L(less_8_till_page)
 
 	/* Use 8 byte comparison.  */
@@ -1260,7 +1452,7 @@ L(less_16_till_page):
 	cmpq	$(8 / SIZE_OF_CHAR), %rdx
 	jbe	L(ret_zero_page_cross_slow_case0)
 # endif
-	movl	$(24 / SIZE_OF_CHAR), %OFFSET_REG
+	movl	$((VEC_SIZE - 8) / SIZE_OF_CHAR), %OFFSET_REG
 	subl	%eax, %OFFSET_REG
 
 	vmovq	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
@@ -1320,7 +1512,7 @@ L(ret_less_8_wcs):
 	ret
 
 # else
-	cmpl	$28, %eax
+	cmpl	$(VEC_SIZE - 4), %eax
 	ja	L(less_4_till_page)
 
 	vmovd	(%rdi), %xmm0
@@ -1335,7 +1527,7 @@ L(ret_less_8_wcs):
 	cmpq	$4, %rdx
 	jbe	L(ret_zero_page_cross_slow_case1)
 #  endif
-	movl	$(28 / SIZE_OF_CHAR), %OFFSET_REG
+	movl	$((VEC_SIZE - 4) / SIZE_OF_CHAR), %OFFSET_REG
 	subl	%eax, %OFFSET_REG
 
 	vmovd	(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
@@ -1386,7 +1578,7 @@ L(less_4_loop):
 #  endif
 	incq	%rdi
 	/* end condition is reach page boundary (rdi is aligned).  */
-	testl	$31, %edi
+	testb	$(VEC_SIZE - 1), %dil
 	jnz	L(less_4_loop)
 	leaq	-(VEC_SIZE * 4)(%rdi, %rsi), %rsi
 	addq	$-(VEC_SIZE * 4), %rdi
-- 
2.34.1


^ permalink raw reply	[flat|nested] 41+ messages in thread

* Re: [PATCH v4] x86: Add support for VEC_SIZE == 64 in strcmp-evex.S impl
  2022-10-20  2:15   ` [PATCH v4] " Noah Goldstein
@ 2022-10-20  3:46     ` H.J. Lu
  0 siblings, 0 replies; 41+ messages in thread
From: H.J. Lu @ 2022-10-20  3:46 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: libc-alpha, carlos

On Wed, Oct 19, 2022 at 7:16 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Unused at the moment, but evex512 strcmp, strncmp, strcasecmp{l}, and
> strncasecmp{l} functions can be added by including strcmp-evex.S with
> "x86-evex512-vecs.h" defined.
>
> In addition, save a bit of code size in a few places.
>
> 1. tzcnt ...         -> bsf ...
> 2. vpcmp{b|d} $0 ... -> vpcmpeq{b|d}
>
> This saves a touch of code size but has minimal net effect.
>
> Full check passes on x86-64.
> ---
>  sysdeps/x86_64/multiarch/strcmp-evex.S | 684 ++++++++++++++++---------
>  1 file changed, 438 insertions(+), 246 deletions(-)
>
> -       TOLOWER_YMM (%YMM2, %YMM3)
> -       VMOVU   (VEC_SIZE * 2)(%rsi), %YMM5
> -       TOLOWER_YMM (%YMM4, %YMM5)
> -       VMOVU   (VEC_SIZE * 3)(%rsi), %YMM7
> -       TOLOWER_YMM (%YMM6, %YMM7)
> -       vpxorq  %YMM0, %YMM1, %YMM1
> -       vpxorq  %YMM2, %YMM3, %YMM3
> -       vpxorq  %YMM4, %YMM5, %YMM5
> -       vpternlogd $0xde, %YMM7, %YMM1, %YMM6
> +       VMOVU   (VEC_SIZE * 0)(%rsi), %VMM(1)
> +       TOLOWER_VMM (%VMM(0), %VMM(1))
> +       VMOVU   (VEC_SIZE * 1)(%rsi), %VMM(3)
> +       TOLOWER_VMM (%VMM(2), %VMM(3))
> +       VMOVU   (VEC_SIZE * 2)(%rsi), %VMM(5)
> +       TOLOWER_VMM (%VMM(4), %VMM(5))
> +       VMOVU   (VEC_SIZE * 3)(%rsi), %VMM(7)
> +       TOLOWER_VMM (%VMM(6), %VMM(7))
> +       vpxorq  %VMM(0), %VMM(1), %VMM(1)
> +       vpxorq  %VMM(2), %VMM(3), %VMM(3)
> +       vpxorq  %VMM(4), %VMM(5), %VMM(5)
> +       vpternlogd $0xde, %VMM(7), %VMM(1), %VMM(6)
>  # endif
>         /* Or together YMM3, YMM5, and YMM6.  */
> -       vpternlogd $0xfe, %YMM3, %YMM5, %YMM6
> +       vpternlogd $0xfe, %VMM(3), %VMM(5), %VMM(6)
>
>
>         /* A non-zero CHAR in YMM6 represents a mismatch.  */
> -       VPTESTNM %YMM6, %YMM6, %k0{%k1}
> -       kmovd   %k0, %LOOP_REG
> +       VPTESTNM %VMM(6), %VMM(6), %k0{%k1}
> +       KMOV    %k0, %LOOP_REG
>
>         TESTEQ  %LOOP_REG
>         jz      L(loop)
>
>
>         /* Find which VEC has the mismatch of end of string.  */
> -       VPTESTM %YMM0, %YMM0, %k1
> -       VPTESTNM %YMM1, %YMM1, %k0{%k1}
> -       kmovd   %k0, %ecx
> -       TESTEQ  %ecx
> +       VPTESTM %VMM(0), %VMM(0), %k1
> +       VPTESTNM %VMM(1), %VMM(1), %k0{%k1}
> +       KMOV    %k0, %VRCX
> +       TESTEQ  %VRCX
>         jnz     L(return_vec_0_end)
>
> -       VPTESTM %YMM2, %YMM2, %k1
> -       VPTESTNM %YMM3, %YMM3, %k0{%k1}
> -       kmovd   %k0, %ecx
> -       TESTEQ  %ecx
> +       VPTESTM %VMM(2), %VMM(2), %k1
> +       VPTESTNM %VMM(3), %VMM(3), %k0{%k1}
> +       KMOV    %k0, %VRCX
> +       TESTEQ  %VRCX
>         jnz     L(return_vec_1_end)
>
>
> -       /* Handle VEC 2 and 3 without branches.  */
> +       /* Handle VEC 2 and 3 without branches if CHAR_PER_VEC <= 32.
> +        */
>  L(return_vec_2_3_end):
>  # ifdef USE_AS_STRNCMP
>         subq    $(CHAR_PER_VEC * 2), %rdx
>         jbe     L(ret_zero_end)
>  # endif
>
> -       VPTESTM %YMM4, %YMM4, %k1
> -       VPTESTNM %YMM5, %YMM5, %k0{%k1}
> -       kmovd   %k0, %ecx
> -       TESTEQ  %ecx
> +       VPTESTM %VMM(4), %VMM(4), %k1
> +       VPTESTNM %VMM(5), %VMM(5), %k0{%k1}
> +       KMOV    %k0, %VRCX
> +       TESTEQ  %VRCX
>  # if CHAR_PER_VEC <= 16
>         sall    $CHAR_PER_VEC, %LOOP_REG
>         orl     %ecx, %LOOP_REG
> -# else
> +# elif CHAR_PER_VEC <= 32
>         salq    $CHAR_PER_VEC, %LOOP_REG64
>         orq     %rcx, %LOOP_REG64
> +# else
> +       /* We aren't combining the last 2x VEC so branch on the second
> +          to last.  */
> +       jnz     L(return_vec_2_end)
>  # endif
> -L(return_vec_3_end):
> +
>         /* LOOP_REG contains matches for null/mismatch from the loop. If
> -          VEC 0,1,and 2 all have no null and no mismatches then mismatch
> -          must entirely be from VEC 3 which is fully represented by
> -          LOOP_REG.  */
> +          VEC 0, 1, and 2 all have no null and no mismatches then the
> +          mismatch must entirely be from VEC 3 which is fully
> +          represented by LOOP_REG.  */
>  # if CHAR_PER_VEC <= 16
> -       tzcntl  %LOOP_REG, %LOOP_REG
> +       bsf     %LOOP_REG, %LOOP_REG
>  # else
> -       tzcntq  %LOOP_REG64, %LOOP_REG64
> +       bsfq    %LOOP_REG64, %LOOP_REG64
>  # endif
>  # ifdef USE_AS_STRNCMP
> +
> +       /* If CHAR_PER_VEC == 64 we can't combine the last 2x VEC so we
> +          need to adjust the length before the last comparison.  */
> +#  if CHAR_PER_VEC == 64
> +       subq    $CHAR_PER_VEC, %rdx
> +       jbe     L(ret_zero_end)
> +#  endif
> +
>         cmpq    %LOOP_REG64, %rdx
>         jbe     L(ret_zero_end)
>  # endif
>
>  # ifdef USE_AS_WCSCMP
> -       movl    (VEC_SIZE * 2)(%rdi, %LOOP_REG64, SIZE_OF_CHAR), %ecx
> +       movl    (FALLTHROUGH_RETURN_OFFSET)(%rdi, %LOOP_REG64, SIZE_OF_CHAR), %ecx
>         xorl    %eax, %eax
> -       cmpl    (VEC_SIZE * 2)(%rsi, %LOOP_REG64, SIZE_OF_CHAR), %ecx
> +       cmpl    (FALLTHROUGH_RETURN_OFFSET)(%rsi, %LOOP_REG64, SIZE_OF_CHAR), %ecx
>         je      L(ret5)
>         setl    %al
>         negl    %eax
>         xorl    %r8d, %eax
>  # else
> -       movzbl  (VEC_SIZE * 2)(%rdi, %LOOP_REG64), %eax
> -       movzbl  (VEC_SIZE * 2)(%rsi, %LOOP_REG64), %ecx
> +       movzbl  (FALLTHROUGH_RETURN_OFFSET)(%rdi, %LOOP_REG64), %eax
> +       movzbl  (FALLTHROUGH_RETURN_OFFSET)(%rsi, %LOOP_REG64), %ecx
>         TOLOWER_gpr (%rax, %eax)
>         TOLOWER_gpr (%rcx, %ecx)
>         subl    %ecx, %eax
> @@ -686,23 +748,39 @@ L(ret_zero_end):
>  # endif
>
>
> +
>         /* The L(return_vec_N_end) differ from L(return_vec_N) in that
> -          they use the value of `r8` to negate the return value. This is
> -          because the page cross logic can swap `rdi` and `rsi`.  */
> +          they use the value of `r8` to negate the return value. This
> +          is because the page cross logic can swap `rdi` and `rsi`.
> +        */
>         .p2align 4,, 10
>  # ifdef USE_AS_STRNCMP
>  L(return_vec_1_end):
> -#  if CHAR_PER_VEC <= 16
> +#  if CHAR_PER_VEC <= 32
> +       /* If CHAR_PER_VEC <= 32 reuse code from L(return_vec_0_end)
> +          without additional branches by adjusting the bit positions
> +          from VEC1.  We can't do this for CHAR_PER_VEC == 64.  */
> +#   if CHAR_PER_VEC <= 16
>         sall    $CHAR_PER_VEC, %ecx
> -#  else
> +#   else
>         salq    $CHAR_PER_VEC, %rcx
> +#   endif
> +#  else
> +       /* If CHAR_PER_VEC == 64 we can't shift the return GPR so just
> +          check it.  */
> +       bsf     %VRCX, %VRCX
> +       addl    $(CHAR_PER_VEC), %ecx
> +       cmpq    %rcx, %rdx
> +       ja      L(ret_vec_0_end_finish)
> +       xorl    %eax, %eax
> +       ret
>  #  endif
>  # endif
>  L(return_vec_0_end):
>  # if (CHAR_PER_VEC <= 16) || !(defined USE_AS_STRNCMP)
> -       tzcntl  %ecx, %ecx
> +       bsf     %VRCX, %VRCX
>  # else
> -       tzcntq  %rcx, %rcx
> +       bsfq    %rcx, %rcx
>  # endif
>
>  # ifdef USE_AS_STRNCMP
> @@ -710,6 +788,7 @@ L(return_vec_0_end):
>         jbe     L(ret_zero_end)
>  # endif
>
> +L(ret_vec_0_end_finish):
>  # ifdef USE_AS_WCSCMP
>         movl    (%rdi, %rcx, SIZE_OF_CHAR), %edx
>         xorl    %eax, %eax
> @@ -737,7 +816,7 @@ L(ret6):
>  # ifndef USE_AS_STRNCMP
>         .p2align 4,, 10
>  L(return_vec_1_end):
> -       tzcntl  %ecx, %ecx
> +       bsf     %VRCX, %VRCX
>  #  ifdef USE_AS_WCSCMP
>         movl    VEC_SIZE(%rdi, %rcx, SIZE_OF_CHAR), %edx
>         xorl    %eax, %eax
> @@ -760,6 +839,41 @@ L(ret7):
>  # endif
>
>
> +       /* If CHAR_PER_VEC == 64 we can't combine matches from the last
> +          2x VEC so need a separate return label.  */
> +# if CHAR_PER_VEC == 64
> +L(return_vec_2_end):
> +       bsf     %VRCX, %VRCX
> +#  ifdef USE_AS_STRNCMP
> +       cmpq    %rcx, %rdx
> +       jbe     L(ret_zero_end)
> +#  endif
> +#  ifdef USE_AS_WCSCMP
> +       movl    (VEC_SIZE * 2)(%rdi, %rcx, SIZE_OF_CHAR), %edx
> +       xorl    %eax, %eax
> +       cmpl    (VEC_SIZE * 2)(%rsi, %rcx, SIZE_OF_CHAR), %edx
> +       je      L(ret31)
> +       setl    %al
> +       negl    %eax
> +       /* This is the non-zero case for `eax` so just xorl with `r8d`
> +          to flip the sign if `rdi` and `rsi` were swapped.  */
> +       xorl    %r8d, %eax
> +#  else
> +       movzbl  (VEC_SIZE * 2)(%rdi, %rcx), %eax
> +       movzbl  (VEC_SIZE * 2)(%rsi, %rcx), %ecx
> +       TOLOWER_gpr (%rax, %eax)
> +       TOLOWER_gpr (%rcx, %ecx)
> +       subl    %ecx, %eax
> +       /* Flip `eax` if `rdi` and `rsi` were swapped in page cross
> +          logic. Subtract `r8d` after xor for zero case.  */
> +       xorl    %r8d, %eax
> +       subl    %r8d, %eax
> +#  endif
> +L(ret13):
> +       ret
> +# endif
> +
> +
>         /* Page cross in rsi in next 4x VEC.  */
>
>         /* TODO: Improve logic here.  */
> @@ -778,11 +892,11 @@ L(page_cross_during_loop):
>         cmpl    $-(VEC_SIZE * 3), %eax
>         jle     L(less_1x_vec_till_page_cross)
>
> -       VMOVA   (%rdi), %YMM0
> -       VPTESTM %YMM0, %YMM0, %k2
> -       CMP_R1_S2_YMM (%YMM0, (%rsi), %YMM1, %k1){%k2}
> -       kmovd   %k1, %ecx
> -       TESTEQ  %ecx
> +       VMOVA   (%rdi), %VMM(0)
> +       VPTESTM %VMM(0), %VMM(0), %k2
> +       CMP_R1_S2_VMM (%VMM(0), (%rsi), %VMM(1), %k1){%k2}
> +       KMOV    %k1, %VRCX
> +       TESTEQ  %VRCX
>         jnz     L(return_vec_0_end)
>
>         /* if distance >= 2x VEC then eax > -(VEC_SIZE * 2).  */
> @@ -799,9 +913,9 @@ L(less_1x_vec_till_page_cross):
>            to read back -VEC_SIZE. If rdi is truly at the start of a page
>            here, it means the previous page (rdi - VEC_SIZE) has already
>            been loaded earlier so must be valid.  */
> -       VMOVU   -VEC_SIZE(%rdi, %rax), %YMM0
> -       VPTESTM %YMM0, %YMM0, %k2
> -       CMP_R1_S2_YMM (%YMM0, -VEC_SIZE(%rsi, %rax), %YMM1, %k1){%k2}
> +       VMOVU   -VEC_SIZE(%rdi, %rax), %VMM(0)
> +       VPTESTM %VMM(0), %VMM(0), %k2
> +       CMP_R1_S2_VMM (%VMM(0), -VEC_SIZE(%rsi, %rax), %VMM(1), %k1){%k2}
>         /* Mask of potentially valid bits. The lower bits can be out of
>            range comparisons (but safe regarding page crosses).  */
>
> @@ -811,14 +925,22 @@ L(less_1x_vec_till_page_cross):
>         andl    $(VEC_SIZE - 1), %ecx
>         shrl    $2, %ecx
>         shlxl   %ecx, %r10d, %ecx
> +       /* Depending on CHAR_PER_VEC extract mask for possible in-bound
> +          matches.  */
> +#  if CHAR_PER_VEC == 16
> +       movzwl  %cx, %r10d
> +#  elif CHAR_PER_VEC == 8
>         movzbl  %cl, %r10d
> +#  else
> +#   error "Invalid CHAR_SIZE or VEC_SIZE"
> +#  endif
>  # else
> -       movl    $-1, %ecx
> -       shlxl   %esi, %ecx, %r10d
> +       mov     $-1, %VRCX
> +       shlx    %VRSI, %VRCX, %VR10
>  # endif
>
> -       kmovd   %k1, %ecx
> -       notl    %ecx
> +       KMOV    %k1, %VRCX
> +       not     %VRCX
>
>
>  # ifdef USE_AS_STRNCMP
> @@ -838,12 +960,10 @@ L(less_1x_vec_till_page_cross):
>         /* Readjust eax before potentially returning to the loop.  */
>         addl    $(PAGE_SIZE - VEC_SIZE * 4), %eax
>
> -       andl    %r10d, %ecx
> +       and     %VR10, %VRCX
>         jz      L(loop_skip_page_cross_check)
>
> -       .p2align 4,, 3
> -L(return_page_cross_end):
> -       tzcntl  %ecx, %ecx
> +       bsf     %VRCX, %VRCX
>
>  # if (defined USE_AS_STRNCMP) || (defined USE_AS_WCSCMP)
>         leal    -VEC_SIZE(%OFFSET_REG64, %rcx, SIZE_OF_CHAR), %ecx
> @@ -874,8 +994,12 @@ L(ret8):
>  # ifdef USE_AS_STRNCMP
>         .p2align 4,, 10
>  L(return_page_cross_end_check):
> -       andl    %r10d, %ecx
> -       tzcntl  %ecx, %ecx
> +       and     %VR10, %VRCX
> +       /* Need to use tzcnt here as VRCX may be zero.  If VRCX is zero
> +          tzcnt(VRCX) will be CHAR_PER_VEC and the remaining length
> +          (edx) is guaranteed to be <= CHAR_PER_VEC so we will only use
> +          the return idx if VRCX was non-zero.  */
> +       tzcnt   %VRCX, %VRCX
>         leal    -VEC_SIZE(%rax, %rcx, SIZE_OF_CHAR), %ecx
>  #  ifdef USE_AS_WCSCMP
>         sall    $2, %edx
> @@ -892,11 +1016,11 @@ L(more_2x_vec_till_page_cross):
>         /* If more 2x vec till cross we will complete a full loop
>            iteration here.  */
>
> -       VMOVA   VEC_SIZE(%rdi), %YMM0
> -       VPTESTM %YMM0, %YMM0, %k2
> -       CMP_R1_S2_YMM (%YMM0, VEC_SIZE(%rsi), %YMM1, %k1){%k2}
> -       kmovd   %k1, %ecx
> -       TESTEQ  %ecx
> +       VMOVA   VEC_SIZE(%rdi), %VMM(0)
> +       VPTESTM %VMM(0), %VMM(0), %k2
> +       CMP_R1_S2_VMM (%VMM(0), VEC_SIZE(%rsi), %VMM(1), %k1){%k2}
> +       KMOV    %k1, %VRCX
> +       TESTEQ  %VRCX
>         jnz     L(return_vec_1_end)
>
>  # ifdef USE_AS_STRNCMP
> @@ -907,18 +1031,18 @@ L(more_2x_vec_till_page_cross):
>         subl    $-(VEC_SIZE * 4), %eax
>
>         /* Safe to include comparisons from lower bytes.  */
> -       VMOVU   -(VEC_SIZE * 2)(%rdi, %rax), %YMM0
> -       VPTESTM %YMM0, %YMM0, %k2
> -       CMP_R1_S2_YMM (%YMM0, -(VEC_SIZE * 2)(%rsi, %rax), %YMM1, %k1){%k2}
> -       kmovd   %k1, %ecx
> -       TESTEQ  %ecx
> +       VMOVU   -(VEC_SIZE * 2)(%rdi, %rax), %VMM(0)
> +       VPTESTM %VMM(0), %VMM(0), %k2
> +       CMP_R1_S2_VMM (%VMM(0), -(VEC_SIZE * 2)(%rsi, %rax), %VMM(1), %k1){%k2}
> +       KMOV    %k1, %VRCX
> +       TESTEQ  %VRCX
>         jnz     L(return_vec_page_cross_0)
>
> -       VMOVU   -(VEC_SIZE * 1)(%rdi, %rax), %YMM0
> -       VPTESTM %YMM0, %YMM0, %k2
> -       CMP_R1_S2_YMM (%YMM0, -(VEC_SIZE * 1)(%rsi, %rax), %YMM1, %k1){%k2}
> -       kmovd   %k1, %ecx
> -       TESTEQ  %ecx
> +       VMOVU   -(VEC_SIZE * 1)(%rdi, %rax), %VMM(0)
> +       VPTESTM %VMM(0), %VMM(0), %k2
> +       CMP_R1_S2_VMM (%VMM(0), -(VEC_SIZE * 1)(%rsi, %rax), %VMM(1), %k1){%k2}
> +       KMOV    %k1, %VRCX
> +       TESTEQ  %VRCX
>         jnz     L(return_vec_page_cross_1)
>
>  # ifdef USE_AS_STRNCMP
> @@ -937,30 +1061,30 @@ L(more_2x_vec_till_page_cross):
>  # endif
>
>         /* Finish the loop.  */
> -       VMOVA   (VEC_SIZE * 2)(%rdi), %YMM4
> -       VMOVA   (VEC_SIZE * 3)(%rdi), %YMM6
> -       VPMINU  %YMM4, %YMM6, %YMM9
> -       VPTESTM %YMM9, %YMM9, %k1
> +       VMOVA   (VEC_SIZE * 2)(%rdi), %VMM(4)
> +       VMOVA   (VEC_SIZE * 3)(%rdi), %VMM(6)
> +       VPMINU  %VMM(4), %VMM(6), %VMM(9)
> +       VPTESTM %VMM(9), %VMM(9), %k1
>  # ifndef USE_AS_STRCASECMP_L
> -       vpxorq  (VEC_SIZE * 2)(%rsi), %YMM4, %YMM5
> +       vpxorq  (VEC_SIZE * 2)(%rsi), %VMM(4), %VMM(5)
>         /* YMM6 = YMM5 | ((VEC_SIZE * 3)(%rsi) ^ YMM6).  */
> -       vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM5, %YMM6
> +       vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %VMM(5), %VMM(6)
>  # else
> -       VMOVU   (VEC_SIZE * 2)(%rsi), %YMM5
> -       TOLOWER_YMM (%YMM4, %YMM5)
> -       VMOVU   (VEC_SIZE * 3)(%rsi), %YMM7
> -       TOLOWER_YMM (%YMM6, %YMM7)
> -       vpxorq  %YMM4, %YMM5, %YMM5
> -       vpternlogd $0xde, %YMM7, %YMM5, %YMM6
> -# endif
> -       VPTESTNM %YMM6, %YMM6, %k0{%k1}
> -       kmovd   %k0, %LOOP_REG
> +       VMOVU   (VEC_SIZE * 2)(%rsi), %VMM(5)
> +       TOLOWER_VMM (%VMM(4), %VMM(5))
> +       VMOVU   (VEC_SIZE * 3)(%rsi), %VMM(7)
> +       TOLOWER_VMM (%VMM(6), %VMM(7))
> +       vpxorq  %VMM(4), %VMM(5), %VMM(5)
> +       vpternlogd $0xde, %VMM(7), %VMM(5), %VMM(6)
> +# endif
> +       VPTESTNM %VMM(6), %VMM(6), %k0{%k1}
> +       KMOV    %k0, %LOOP_REG
>         TESTEQ  %LOOP_REG
>         jnz     L(return_vec_2_3_end)
>
>         /* Best for code size to include ucond-jmp here. Would be faster
> -          if this case is hot to duplicate the L(return_vec_2_3_end) code
> -          as fall-through and have jump back to loop on mismatch
> +          if this case is hot to duplicate the L(return_vec_2_3_end)
> +          code as fall-through and jump back to the loop on the mismatch
>            comparison.  */
>         subq    $-(VEC_SIZE * 4), %rdi
>         subq    $-(VEC_SIZE * 4), %rsi
> @@ -980,7 +1104,7 @@ L(ret_zero_in_loop_page_cross):
>  L(return_vec_page_cross_0):
>         addl    $-VEC_SIZE, %eax
>  L(return_vec_page_cross_1):
> -       tzcntl  %ecx, %ecx
> +       bsf     %VRCX, %VRCX
>  # if defined USE_AS_STRNCMP || defined USE_AS_WCSCMP
>         leal    -VEC_SIZE(%rax, %rcx, SIZE_OF_CHAR), %ecx
>  #  ifdef USE_AS_STRNCMP
> @@ -1023,8 +1147,8 @@ L(ret9):
>  L(page_cross):
>  # ifndef USE_AS_STRNCMP
>         /* If both are VEC aligned we don't need any special logic here.
> -          Only valid for strcmp where stop condition is guranteed to be
> -          reachable by just reading memory.  */
> +          Only valid for strcmp where the stop condition is guaranteed
> +          to be reachable by just reading memory.  */
>         testl   $((VEC_SIZE - 1) << 20), %eax
>         jz      L(no_page_cross)
>  # endif
> @@ -1065,11 +1189,11 @@ L(page_cross):
>            loadable memory until within 1x VEC of page cross.  */
>         .p2align 4,, 8
>  L(page_cross_loop):
> -       VMOVU   (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0
> -       VPTESTM %YMM0, %YMM0, %k2
> -       CMP_R1_S2_YMM (%YMM0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM1, %k1){%k2}
> -       kmovd   %k1, %ecx
> -       TESTEQ  %ecx
> +       VMOVU   (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %VMM(0)
> +       VPTESTM %VMM(0), %VMM(0), %k2
> +       CMP_R1_S2_VMM (%VMM(0), (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %VMM(1), %k1){%k2}
> +       KMOV    %k1, %VRCX
> +       TESTEQ  %VRCX
>         jnz     L(check_ret_vec_page_cross)
>         addl    $CHAR_PER_VEC, %OFFSET_REG
>  # ifdef USE_AS_STRNCMP
> @@ -1087,13 +1211,13 @@ L(page_cross_loop):
>         subl    %eax, %OFFSET_REG
>         /* OFFSET_REG has distance to page cross - VEC_SIZE. Guranteed
>            to not cross page so is safe to load. Since we have already
> -          loaded at least 1 VEC from rsi it is also guranteed to be safe.
> -        */
> -       VMOVU   (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0
> -       VPTESTM %YMM0, %YMM0, %k2
> -       CMP_R1_S2_YMM (%YMM0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM1, %k1){%k2}
> +          loaded at least 1 VEC from rsi it is also guaranteed to be
> +          safe.  */
> +       VMOVU   (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %VMM(0)
> +       VPTESTM %VMM(0), %VMM(0), %k2
> +       CMP_R1_S2_VMM (%VMM(0), (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %VMM(1), %k1){%k2}
>
> -       kmovd   %k1, %ecx
> +       KMOV    %k1, %VRCX
>  # ifdef USE_AS_STRNCMP
>         leal    CHAR_PER_VEC(%OFFSET_REG64), %eax
>         cmpq    %rax, %rdx
> @@ -1104,7 +1228,7 @@ L(page_cross_loop):
>         addq    %rdi, %rdx
>  #  endif
>  # endif
> -       TESTEQ  %ecx
> +       TESTEQ  %VRCX
>         jz      L(prepare_loop_no_len)
>
>         .p2align 4,, 4
> @@ -1112,7 +1236,7 @@ L(ret_vec_page_cross):
>  # ifndef USE_AS_STRNCMP
>  L(check_ret_vec_page_cross):
>  # endif
> -       tzcntl  %ecx, %ecx
> +       tzcnt   %VRCX, %VRCX
>         addl    %OFFSET_REG, %ecx
>  L(ret_vec_page_cross_cont):
>  # ifdef USE_AS_WCSCMP
> @@ -1139,9 +1263,9 @@ L(ret12):
>  # ifdef USE_AS_STRNCMP
>         .p2align 4,, 10
>  L(check_ret_vec_page_cross2):
> -       TESTEQ  %ecx
> +       TESTEQ  %VRCX
>  L(check_ret_vec_page_cross):
> -       tzcntl  %ecx, %ecx
> +       tzcnt   %VRCX, %VRCX
>         addl    %OFFSET_REG, %ecx
>         cmpq    %rcx, %rdx
>         ja      L(ret_vec_page_cross_cont)
> @@ -1180,8 +1304,71 @@ L(less_1x_vec_till_page):
>  # ifdef USE_AS_WCSCMP
>         shrl    $2, %eax
>  # endif
> +
> +       /* Find largest load size we can use. For VEC_SIZE == 64 only
> +          check if we can do a full ymm (32 byte) load.  */
> +# if VEC_SIZE == 64
> +
> +       cmpl    $((VEC_SIZE - 32) / SIZE_OF_CHAR), %eax
> +       ja      L(less_32_till_page)
> +
> +
> +       /* Use 32 byte comparison.  */
> +       VMOVU   (%rdi), %VMM_256(0)
> +       VPTESTM %VMM_256(0), %VMM_256(0), %k2
> +       CMP_R1_S2_YMM (%VMM_256(0), (%rsi), %VMM_256(1), %k1){%k2}
> +       kmovd   %k1, %ecx
> +#  ifdef USE_AS_WCSCMP
> +       subl    $0xff, %ecx
> +#  else
> +       incl    %ecx
> +#  endif
> +       jnz     L(check_ret_vec_page_cross)
> +       movl    $((VEC_SIZE - 32) / SIZE_OF_CHAR), %OFFSET_REG
> +#  ifdef USE_AS_STRNCMP
> +       cmpq    %OFFSET_REG64, %rdx
> +       jbe     L(ret_zero_page_cross_slow_case64)
> +       subl    %eax, %OFFSET_REG
> +#  else
> +       /* Explicit check for 32 byte alignment.  */
> +       subl    %eax, %OFFSET_REG
> +       jz      L(prepare_loop)
> +#  endif
> +       VMOVU   (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %VMM_256(0)
> +       VPTESTM %VMM_256(0), %VMM_256(0), %k2
> +       CMP_R1_S2_YMM (%VMM_256(0), (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %VMM_256(1), %k1){%k2}
> +       kmovd   %k1, %ecx
> +#  ifdef USE_AS_WCSCMP
> +       subl    $0xff, %ecx
> +#  else
> +       incl    %ecx
> +#  endif
> +       jnz     L(check_ret_vec_page_cross)
> +#  ifdef USE_AS_STRNCMP
> +       addl    $(32 / SIZE_OF_CHAR), %OFFSET_REG
> +       subq    %OFFSET_REG64, %rdx
> +       jbe     L(ret_zero_page_cross_slow_case64)
> +       subq    $-(CHAR_PER_VEC * 4), %rdx
> +
> +       leaq    -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
> +       leaq    -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
> +#  else
> +       leaq    (32 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi
> +       leaq    (32 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi
> +#  endif
> +       jmp     L(prepare_loop_aligned)
> +
> +#  ifdef USE_AS_STRNCMP
> +       .p2align 4,, 2
> +L(ret_zero_page_cross_slow_case64):
> +       xorl    %eax, %eax
> +       ret
> +#  endif
> +L(less_32_till_page):
> +# endif
> +
>         /* Find largest load size we can use.  */
> -       cmpl    $(16 / SIZE_OF_CHAR), %eax
> +       cmpl    $((VEC_SIZE - 16) / SIZE_OF_CHAR), %eax
>         ja      L(less_16_till_page)
>
>         /* Use 16 byte comparison.  */
> @@ -1195,9 +1382,14 @@ L(less_1x_vec_till_page):
>         incw    %cx
>  # endif
>         jnz     L(check_ret_vec_page_cross)
> -       movl    $(16 / SIZE_OF_CHAR), %OFFSET_REG
> +
> +       movl    $((VEC_SIZE - 16) / SIZE_OF_CHAR), %OFFSET_REG
>  # ifdef USE_AS_STRNCMP
> +#  if VEC_SIZE == 32
>         cmpq    %OFFSET_REG64, %rdx
> +#  else
> +       cmpq    $(16 / SIZE_OF_CHAR), %rdx
> +#  endif
>         jbe     L(ret_zero_page_cross_slow_case0)
>         subl    %eax, %OFFSET_REG
>  # else
> @@ -1239,7 +1431,7 @@ L(ret_zero_page_cross_slow_case0):
>
>         .p2align 4,, 10
>  L(less_16_till_page):
> -       cmpl    $(24 / SIZE_OF_CHAR), %eax
> +       cmpl    $((VEC_SIZE - 8) / SIZE_OF_CHAR), %eax
>         ja      L(less_8_till_page)
>
>         /* Use 8 byte comparison.  */
> @@ -1260,7 +1452,7 @@ L(less_16_till_page):
>         cmpq    $(8 / SIZE_OF_CHAR), %rdx
>         jbe     L(ret_zero_page_cross_slow_case0)
>  # endif
> -       movl    $(24 / SIZE_OF_CHAR), %OFFSET_REG
> +       movl    $((VEC_SIZE - 8) / SIZE_OF_CHAR), %OFFSET_REG
>         subl    %eax, %OFFSET_REG
>
>         vmovq   (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
> @@ -1320,7 +1512,7 @@ L(ret_less_8_wcs):
>         ret
>
>  # else
> -       cmpl    $28, %eax
> +       cmpl    $(VEC_SIZE - 4), %eax
>         ja      L(less_4_till_page)
>
>         vmovd   (%rdi), %xmm0
> @@ -1335,7 +1527,7 @@ L(ret_less_8_wcs):
>         cmpq    $4, %rdx
>         jbe     L(ret_zero_page_cross_slow_case1)
>  #  endif
> -       movl    $(28 / SIZE_OF_CHAR), %OFFSET_REG
> +       movl    $((VEC_SIZE - 4) / SIZE_OF_CHAR), %OFFSET_REG
>         subl    %eax, %OFFSET_REG
>
>         vmovd   (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0
> @@ -1386,7 +1578,7 @@ L(less_4_loop):
>  #  endif
>         incq    %rdi
>         /* end condition is reach page boundary (rdi is aligned).  */
> -       testl   $31, %edi
> +       testb   $(VEC_SIZE - 1), %dil
>         jnz     L(less_4_loop)
>         leaq    -(VEC_SIZE * 4)(%rdi, %rsi), %rsi
>         addq    $-(VEC_SIZE * 4), %rdi
> --
> 2.34.1
>
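
A note for reviewers on the tzcnt -> bsf substitutions above: they are
only safe on paths reached via a jnz on the same mask, since bsf
leaves its destination undefined when the source is zero (its encoding
is one byte shorter than tzcnt, hence the code-size win).  Where the
mask may legitimately be zero, tzcnt is kept because tzcnt(0) is
defined as the operand width, an out-of-range index that the
remaining-length check then rejects.  A minimal sketch of both
patterns (hypothetical labels and register choices, not code from the
patch):

	/* Pattern 1: mask tested first, so bsf sees a non-zero source.  */
	kmovd	%k1, %ecx
	incl	%ecx		/* All-equal mask (all 1s) overflows to 0;
				   otherwise the lowest set bit is the
				   first mismatch.  */
	jz	.Lall_equal
	bsf	%ecx, %ecx	/* Safe: %ecx known non-zero here.  */

	/* Pattern 2: mask may be zero after masking out out-of-range
	   bits, so tzcnt is required.  tzcnt(0) == 32 for a 32-bit
	   operand, which fails the length check below.  */
	andl	%r10d, %ecx
	tzcnt	%ecx, %ecx
	cmpq	%rcx, %rdx
	jbe	.Lret_zero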

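Also, on the CHAR_PER_VEC == 64 special cases: merging the VEC 2 and
VEC 3 masks with `salq $CHAR_PER_VEC` only works while both masks fit
in a single 64-bit GPR.  x86 masks 64-bit shift counts mod 64, so with
CHAR_PER_VEC == 64 the shift would be by zero, hence the separate
L(return_vec_2_end) path.  Sketch of the working CHAR_PER_VEC == 32
case (illustrative only, not code from the patch):

	kmovd	%k0, %eax	/* VEC 3 mask (32 bits, zero-extended).  */
	kmovd	%k1, %ecx	/* VEC 2 mask.  */
	salq	$32, %rax	/* Shift count < 64: well defined.  */
	orq	%rcx, %rax	/* VEC 2 in low half, VEC 3 in high.  */
	bsfq	%rax, %rax	/* First match index across both VECs.  */
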
LGTM.

Thanks.

-- 
H.J.

Thread overview: 41+ messages
2022-10-18  2:48 [PATCH v1 1/7] x86: Optimize memchr-evex.S and implement with VMM headers Noah Goldstein
2022-10-18  2:48 ` [PATCH v1 2/7] x86: Shrink / minorly optimize strchr-evex " Noah Goldstein
2022-10-18  2:51   ` Noah Goldstein
2022-10-18  2:48 ` [PATCH v1 3/7] x86: Optimize strnlen-evex.S " Noah Goldstein
2022-10-18  2:51   ` Noah Goldstein
2022-10-18  2:48 ` [PATCH v1 4/7] x86: Optimize memrchr-evex.S Noah Goldstein
2022-10-18  2:51   ` Noah Goldstein
2022-10-18  2:48 ` [PATCH v1 5/7] x86: Optimize strrchr-evex.S and implement with VMM headers Noah Goldstein
2022-10-18  2:52   ` Noah Goldstein
2022-10-18  2:49 ` [PATCH v1 6/7] x86: Add support for VEC_SIZE == 64 in strcmp-evex.S impl Noah Goldstein
2022-10-20  2:15   ` [PATCH v4] " Noah Goldstein
2022-10-20  3:46     ` H.J. Lu
2022-10-18  2:49 ` [PATCH v1 7/7] Bench: Improve benchtests for memchr, strchr, strnlen, strrchr Noah Goldstein
2022-10-18 21:00   ` H.J. Lu
2022-10-18 21:05     ` Noah Goldstein
2022-10-18 21:53       ` H.J. Lu
2022-10-18 22:58         ` Noah Goldstein
2022-10-18  2:50 ` [PATCH v1 1/7] x86: Optimize memchr-evex.S and implement with VMM headers Noah Goldstein
2022-10-18 23:19 ` [PATCH v2 " Noah Goldstein
2022-10-18 23:19   ` [PATCH v2 2/7] x86: Shrink / minorly optimize strchr-evex " Noah Goldstein
2022-10-18 23:19   ` [PATCH v2 3/7] x86: Optimize strnlen-evex.S " Noah Goldstein
2022-10-18 23:19   ` [PATCH v2 4/7] x86: Optimize memrchr-evex.S Noah Goldstein
2022-10-18 23:19   ` [PATCH v2 5/7] x86: Optimize strrchr-evex.S and implement with VMM headers Noah Goldstein
2022-10-18 23:19   ` [PATCH v2 6/7] x86: Add support for VEC_SIZE == 64 in strcmp-evex.S impl Noah Goldstein
2022-10-18 23:19   ` [PATCH v2 7/7] Bench: Improve benchtests for memchr, strchr, strnlen, strrchr Noah Goldstein
2022-10-19  0:01     ` H.J. Lu
2022-10-19  0:44       ` Noah Goldstein
2022-10-19  0:44 ` [PATCH v3 1/7] x86: Optimize memchr-evex.S and implement with VMM headers Noah Goldstein
2022-10-19  0:44   ` [PATCH v3 2/7] x86: Shrink / minorly optimize strchr-evex " Noah Goldstein
2022-10-19 16:53     ` H.J. Lu
2022-10-19  0:44   ` [PATCH v3 3/7] x86: Optimize strnlen-evex.S " Noah Goldstein
2022-10-19 16:57     ` H.J. Lu
2022-10-19  0:44   ` [PATCH v3 4/7] x86: Optimize memrchr-evex.S Noah Goldstein
2022-10-19 16:58     ` H.J. Lu
2022-10-19  0:44   ` [PATCH v3 5/7] x86: Optimize strrchr-evex.S and implement with VMM headers Noah Goldstein
2022-10-19 16:58     ` H.J. Lu
2022-10-19  0:44   ` [PATCH v3 6/7] x86: Add support for VEC_SIZE == 64 in strcmp-evex.S impl Noah Goldstein
2022-10-19 16:59     ` H.J. Lu
2022-10-19  0:44   ` [PATCH v3 7/7] Bench: Improve benchtests for memchr, strchr, strnlen, strrchr Noah Goldstein
2022-10-19 17:00     ` H.J. Lu
2022-10-19 16:52   ` [PATCH v3 1/7] x86: Optimize memchr-evex.S and implement with VMM headers H.J. Lu
