From mboxrd@z Thu Jan 1 00:00:00 1970
From: Noah Goldstein
To: glibc-cvs@sourceware.org
Subject: [glibc] x86: Optimize memchr-evex.S and implement with VMM headers
X-Act-Checkin: glibc
X-Git-Author: Noah Goldstein
X-Git-Refname: refs/heads/master
X-Git-Oldrev: 451c6e58540e8571e31581c04c4829e5d2cfe8ac
X-Git-Newrev: 330881763efff626d6b1cdf8de9ffee4ed7a1ba1
Message-Id: <20221020014457.E5949385417E@sourceware.org>
Date: Thu, 20 Oct 2022 01:44:57 +0000 (GMT)
List-Id:

https://sourceware.org/git/gitweb.cgi?p=glibc.git;h=330881763efff626d6b1cdf8de9ffee4ed7a1ba1

commit 330881763efff626d6b1cdf8de9ffee4ed7a1ba1
Author: Noah Goldstein
Date:   Tue Oct 18 17:44:03 2022 -0700

    x86: Optimize memchr-evex.S and implement with VMM headers

    The optimizations are:

    1. Use the fact that tzcnt(0) -> VEC_SIZE for memchr to save a branch
       in the short-string case.

    2. Restructure the code so that small strings get the hot path.
       - This is net-zero on the benchmark suite but makes sense in
         general, as smaller sizes are far more common.

    3. Use more code-size-efficient instructions.
       - tzcnt ...     -> bsf ...
       - vpcmpb $0 ... -> vpcmpeq ...

    4. Align labels less aggressively, especially where doing so does not
       save fetch blocks or causes a basic block to span extra cache
       lines.

    The optimizations (especially point 2) make the memchr and rawmemchr
    code essentially incompatible, so rawmemchr-evex is split out into its
    own file.

    Code size changes:

    memchr-evex.S    : -107 bytes
    rawmemchr-evex.S :  -53 bytes

    Net perf changes:

    Reported as the geometric mean of all improvements / regressions from
    N=10 runs of the benchtests.  The value is New Time / Old Time, so
    < 1.0 is an improvement and > 1.0 is a regression.

    memchr-evex.S    : 0.928
    rawmemchr-evex.S : 0.986 (fewer targets cross cache lines)

    Full results attached in email.

    Full check passes on x86-64.
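The tzcnt(0) -> VEC_SIZE trick from point 1 relies on BMI1 tzcnt returning
its operand width when the source is zero, so a single length comparison
covers both "no match in the first vector" and "match found but past the end
of the buffer".  A minimal C sketch of that first-vector check for the byte
version, assuming a 32-byte vector; first_vec_check, mask and keep_scanning
are illustrative names, not code from this patch:

#include <immintrin.h>  /* _tzcnt_u32 (BMI1); compile with -mbmi.  */
#include <stddef.h>

/* MASK has one bit per byte of a 32-byte compare against the target
   character.  tzcnt (0) == 32, so the single "len <= pos" test covers
   both "no match in this vector" and "match past the end".  */
static const char *
first_vec_check (const char *s, unsigned int mask, size_t len,
                 int *keep_scanning)
{
  unsigned int pos = _tzcnt_u32 (mask);  /* 32 when mask == 0.  */
  *keep_scanning = 0;
  if (len <= pos)
    return NULL;                /* Done: nothing to return within LEN.  */
  if (mask == 0)
    {
      *keep_scanning = 1;       /* LEN > 32: continue with later vectors.  */
      return NULL;
    }
  return s + pos;               /* In-bounds match.  */
}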
Diff: --- sysdeps/x86_64/multiarch/memchr-evex.S | 939 +++++++++++++++----------- sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S | 9 +- sysdeps/x86_64/multiarch/rawmemchr-evex.S | 313 ++++++++- 3 files changed, 851 insertions(+), 410 deletions(-) diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S index 0dd4f1dcce..23a1c0018e 100644 --- a/sysdeps/x86_64/multiarch/memchr-evex.S +++ b/sysdeps/x86_64/multiarch/memchr-evex.S @@ -21,17 +21,27 @@ #if ISA_SHOULD_BUILD (4) +# ifndef VEC_SIZE +# include "x86-evex256-vecs.h" +# endif + # ifndef MEMCHR # define MEMCHR __memchr_evex # endif # ifdef USE_AS_WMEMCHR +# define PC_SHIFT_GPR rcx +# define VPTESTN vptestnmd # define VPBROADCAST vpbroadcastd # define VPMINU vpminud # define VPCMP vpcmpd # define VPCMPEQ vpcmpeqd # define CHAR_SIZE 4 + +# define USE_WIDE_CHAR # else +# define PC_SHIFT_GPR rdi +# define VPTESTN vptestnmb # define VPBROADCAST vpbroadcastb # define VPMINU vpminub # define VPCMP vpcmpb @@ -39,534 +49,661 @@ # define CHAR_SIZE 1 # endif - /* In the 4x loop the RTM and non-RTM versions have data pointer - off by VEC_SIZE * 4 with RTM version being VEC_SIZE * 4 greater. - This is represented by BASE_OFFSET. As well because the RTM - version uses vpcmp which stores a bit per element compared where - the non-RTM version uses vpcmpeq which stores a bit per byte - compared RET_SCALE of CHAR_SIZE is only relevant for the RTM - version. */ -# ifdef USE_IN_RTM +# include "reg-macros.h" + + +/* If not in an RTM and VEC_SIZE != 64 (the VEC_SIZE = 64 + doesn't have VEX encoding), use VEX encoding in loop so we + can use vpcmpeqb + vptern which is more efficient than the + EVEX alternative. */ +# if defined USE_IN_RTM || VEC_SIZE == 64 +# undef COND_VZEROUPPER +# undef VZEROUPPER_RETURN +# undef VZEROUPPER + +# define COND_VZEROUPPER +# define VZEROUPPER_RETURN ret # define VZEROUPPER -# define BASE_OFFSET (VEC_SIZE * 4) -# define RET_SCALE CHAR_SIZE + +# define USE_TERN_IN_LOOP 0 # else +# define USE_TERN_IN_LOOP 1 +# undef VZEROUPPER # define VZEROUPPER vzeroupper -# define BASE_OFFSET 0 -# define RET_SCALE 1 # endif - /* In the return from 4x loop memchr and rawmemchr versions have - data pointers off by VEC_SIZE * 4 with memchr version being - VEC_SIZE * 4 greater. */ -# ifdef USE_AS_RAWMEMCHR -# define RET_OFFSET (BASE_OFFSET - (VEC_SIZE * 4)) -# define RAW_PTR_REG rcx -# define ALGN_PTR_REG rdi +# if USE_TERN_IN_LOOP + /* Resulting bitmask for vpmovmskb has 4-bits set for each wchar + so we don't want to multiply resulting index. 
*/ +# define TERN_CHAR_MULT 1 + +# ifdef USE_AS_WMEMCHR +# define TEST_END() inc %VRCX +# else +# define TEST_END() add %rdx, %rcx +# endif # else -# define RET_OFFSET BASE_OFFSET -# define RAW_PTR_REG rdi -# define ALGN_PTR_REG rcx +# define TERN_CHAR_MULT CHAR_SIZE +# define TEST_END() KORTEST %k2, %k3 # endif -# define XMMZERO xmm23 -# define YMMZERO ymm23 -# define XMMMATCH xmm16 -# define YMMMATCH ymm16 -# define YMM1 ymm17 -# define YMM2 ymm18 -# define YMM3 ymm19 -# define YMM4 ymm20 -# define YMM5 ymm21 -# define YMM6 ymm22 +# if defined USE_AS_WMEMCHR || !USE_TERN_IN_LOOP +# ifndef USE_AS_WMEMCHR +# define GPR_X0_IS_RET 1 +# else +# define GPR_X0_IS_RET 0 +# endif +# define GPR_X0 rax +# else +# define GPR_X0_IS_RET 0 +# define GPR_X0 rdx +# endif + +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) -# ifndef SECTION -# define SECTION(p) p##.evex +# if CHAR_PER_VEC == 64 +# define LAST_VEC_OFFSET (VEC_SIZE * 3) +# else +# define LAST_VEC_OFFSET (VEC_SIZE * 2) +# endif +# if CHAR_PER_VEC >= 32 +# define MASK_GPR(...) VGPR(__VA_ARGS__) +# elif CHAR_PER_VEC == 16 +# define MASK_GPR(reg) VGPR_SZ(reg, 16) +# else +# define MASK_GPR(reg) VGPR_SZ(reg, 8) # endif -# define VEC_SIZE 32 -# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) -# define PAGE_SIZE 4096 +# define VMATCH VMM(0) +# define VMATCH_LO VMM_lo(0) - .section SECTION(.text),"ax",@progbits +# define PAGE_SIZE 4096 + + + .section SECTION(.text), "ax", @progbits ENTRY_P2ALIGN (MEMCHR, 6) -# ifndef USE_AS_RAWMEMCHR /* Check for zero length. */ test %RDX_LP, %RDX_LP - jz L(zero) + jz L(zero_0) -# ifdef __ILP32__ +# ifdef __ILP32__ /* Clear the upper 32 bits. */ movl %edx, %edx -# endif # endif - /* Broadcast CHAR to YMMMATCH. */ - VPBROADCAST %esi, %YMMMATCH + VPBROADCAST %esi, %VMATCH /* Check if we may cross page boundary with one vector load. */ movl %edi, %eax andl $(PAGE_SIZE - 1), %eax cmpl $(PAGE_SIZE - VEC_SIZE), %eax - ja L(cross_page_boundary) + ja L(page_cross) + + VPCMPEQ (%rdi), %VMATCH, %k0 + KMOV %k0, %VRAX +# ifndef USE_AS_WMEMCHR + /* If rcx is zero then tzcnt -> CHAR_PER_VEC. NB: there is a + already a dependency between rcx and rsi so no worries about + false-dep here. */ + tzcnt %VRAX, %VRSI + /* If rdx <= rsi then either 1) rcx was non-zero (there was a + match) but it was out of bounds or 2) rcx was zero and rdx + was <= VEC_SIZE so we are done scanning. */ + cmpq %rsi, %rdx + /* NB: Use branch to return zero/non-zero. Common usage will + branch on result of function (if return is null/non-null). + This branch can be used to predict the ensuing one so there + is no reason to extend the data-dependency with cmovcc. */ + jbe L(zero_0) + + /* If rcx is zero then len must be > RDX, otherwise since we + already tested len vs lzcnt(rcx) (in rsi) we are good to + return this match. */ + test %VRAX, %VRAX + jz L(more_1x_vec) + leaq (%rdi, %rsi), %rax +# else - /* Check the first VEC_SIZE bytes. */ - VPCMP $0, (%rdi), %YMMMATCH, %k0 - kmovd %k0, %eax -# ifndef USE_AS_RAWMEMCHR - /* If length < CHAR_PER_VEC handle special. */ + /* We can't use the `tzcnt` trick for wmemchr because CHAR_SIZE + > 1 so if rcx is tzcnt != CHAR_PER_VEC. */ cmpq $CHAR_PER_VEC, %rdx - jbe L(first_vec_x0) -# endif - testl %eax, %eax - jz L(aligned_more) - tzcntl %eax, %eax -# ifdef USE_AS_WMEMCHR - /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. 
*/ + ja L(more_1x_vec) + tzcnt %VRAX, %VRAX + cmpl %eax, %edx + jbe L(zero_0) +L(first_vec_x0_ret): leaq (%rdi, %rax, CHAR_SIZE), %rax -# else - addq %rdi, %rax # endif ret -# ifndef USE_AS_RAWMEMCHR -L(zero): - xorl %eax, %eax - ret - - .p2align 4 -L(first_vec_x0): - /* Check if first match was before length. NB: tzcnt has false data- - dependency on destination. eax already had a data-dependency on esi - so this should have no affect here. */ - tzcntl %eax, %esi -# ifdef USE_AS_WMEMCHR - leaq (%rdi, %rsi, CHAR_SIZE), %rdi -# else - addq %rsi, %rdi -# endif + /* Only fits in first cache line for VEC_SIZE == 32. */ +# if VEC_SIZE == 32 + .p2align 4,, 2 +L(zero_0): xorl %eax, %eax - cmpl %esi, %edx - cmovg %rdi, %rax ret # endif - .p2align 4 -L(cross_page_boundary): - /* Save pointer before aligning as its original value is - necessary for computer return address if byte is found or - adjusting length if it is not and this is memchr. */ - movq %rdi, %rcx - /* Align data to VEC_SIZE. ALGN_PTR_REG is rcx for memchr and rdi - for rawmemchr. */ - andq $-VEC_SIZE, %ALGN_PTR_REG - VPCMP $0, (%ALGN_PTR_REG), %YMMMATCH, %k0 - kmovd %k0, %r8d + .p2align 4,, 9 +L(more_1x_vec): # ifdef USE_AS_WMEMCHR - /* NB: Divide shift count by 4 since each bit in K0 represent 4 - bytes. */ - sarl $2, %eax -# endif -# ifndef USE_AS_RAWMEMCHR - movl $(PAGE_SIZE / CHAR_SIZE), %esi - subl %eax, %esi + /* If wmemchr still need to test if there was a match in first + VEC. Use bsf to test here so we can reuse + L(first_vec_x0_ret). */ + bsf %VRAX, %VRAX + jnz L(first_vec_x0_ret) # endif + +L(page_cross_continue): # ifdef USE_AS_WMEMCHR - andl $(CHAR_PER_VEC - 1), %eax -# endif - /* Remove the leading bytes. */ - sarxl %eax, %r8d, %eax -# ifndef USE_AS_RAWMEMCHR - /* Check the end of data. */ - cmpq %rsi, %rdx - jbe L(first_vec_x0) + /* We can't use end of the buffer to re-calculate length for + wmemchr as len * CHAR_SIZE may overflow. */ + leaq -(VEC_SIZE + CHAR_SIZE)(%rdi), %rax + andq $(VEC_SIZE * -1), %rdi + subq %rdi, %rax + sarq $2, %rax + addq %rdx, %rax +# else + leaq -(VEC_SIZE + 1)(%rdx, %rdi), %rax + andq $(VEC_SIZE * -1), %rdi + subq %rdi, %rax # endif - testl %eax, %eax - jz L(cross_page_continue) - tzcntl %eax, %eax + + /* rax contains remaining length - 1. -1 so we can get imm8 + encoding in a few additional places saving code size. */ + + /* Needed regardless of remaining length. */ + VPCMPEQ VEC_SIZE(%rdi), %VMATCH, %k0 + KMOV %k0, %VRDX + + /* We cannot fold the above `sub %rdi, %rax` with the `cmp + $(CHAR_PER_VEC * 2), %rax` because its possible for a very + large length to overflow and cause the subtract to carry + despite length being above CHAR_PER_VEC * 2. */ + cmpq $(CHAR_PER_VEC * 2 - 1), %rax + ja L(more_2x_vec) +L(last_2x_vec): + + test %VRDX, %VRDX + jnz L(first_vec_x1_check) + + /* Check the end of data. NB: use 8-bit operations to save code + size. We no longer need the full-width of eax and will + perform a write-only operation over eax so there will be no + partial-register stalls. */ + subb $(CHAR_PER_VEC * 1 - 1), %al + jle L(zero_0) + + VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMATCH, %k0 + KMOV %k0, %VRCX # ifdef USE_AS_WMEMCHR - /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ - leaq (%RAW_PTR_REG, %rax, CHAR_SIZE), %rax + /* For wmemchr against we can't take advantage of tzcnt(0) == + VEC_SIZE as CHAR_PER_VEC != VEC_SIZE. */ + test %VRCX, %VRCX + jz L(zero_0) +# endif + tzcnt %VRCX, %VRCX + cmp %cl, %al + + /* Same CFG for VEC_SIZE == 64 and VEC_SIZE == 32. 
We give + fallthrough to L(zero_0) for VEC_SIZE == 64 here as there is + not enough space before the next cache line to fit the `lea` + for return. */ +# if VEC_SIZE == 64 + ja L(first_vec_x2_ret) +L(zero_0): + xorl %eax, %eax + ret # else - addq %RAW_PTR_REG, %rax + jbe L(zero_0) + leaq (VEC_SIZE * 2)(%rdi, %rcx, CHAR_SIZE), %rax + ret # endif + + .p2align 4,, 5 +L(first_vec_x1_check): + bsf %VRDX, %VRDX + cmpb %dl, %al + jb L(zero_4) + leaq (VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %rax ret - .p2align 4 -L(first_vec_x1): - tzcntl %eax, %eax - leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax + /* Fits at the end of the cache line here for VEC_SIZE == 32. + */ +# if VEC_SIZE == 32 +L(zero_4): + xorl %eax, %eax ret +# endif - .p2align 4 + + .p2align 4,, 4 L(first_vec_x2): - tzcntl %eax, %eax - leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax + bsf %VRCX, %VRCX +L(first_vec_x2_ret): + leaq (VEC_SIZE * 2)(%rdi, %rcx, CHAR_SIZE), %rax ret - .p2align 4 -L(first_vec_x3): - tzcntl %eax, %eax - leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax + /* Fits at the end of the cache line here for VEC_SIZE == 64. + */ +# if VEC_SIZE == 64 +L(zero_4): + xorl %eax, %eax ret +# endif - .p2align 4 -L(first_vec_x4): - tzcntl %eax, %eax - leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax + .p2align 4,, 4 +L(first_vec_x1): + bsf %VRDX, %VRDX + leaq (VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %rax ret - .p2align 5 -L(aligned_more): - /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time - since data is only aligned to VEC_SIZE. */ -# ifndef USE_AS_RAWMEMCHR - /* Align data to VEC_SIZE. */ -L(cross_page_continue): - xorl %ecx, %ecx - subl %edi, %ecx - andq $-VEC_SIZE, %rdi - /* esi is for adjusting length to see if near the end. */ - leal (VEC_SIZE * 5)(%rdi, %rcx), %esi -# ifdef USE_AS_WMEMCHR - /* NB: Divide bytes by 4 to get the wchar_t count. */ - sarl $2, %esi -# endif -# else - andq $-VEC_SIZE, %rdi -L(cross_page_continue): -# endif - /* Load first VEC regardless. */ - VPCMP $0, (VEC_SIZE)(%rdi), %YMMMATCH, %k0 - kmovd %k0, %eax -# ifndef USE_AS_RAWMEMCHR - /* Adjust length. If near end handle specially. */ - subq %rsi, %rdx - jbe L(last_4x_vec_or_less) -# endif - testl %eax, %eax + .p2align 4,, 5 +L(more_2x_vec): + /* Length > VEC_SIZE * 2 so check first 2x VEC before rechecking + length. */ + + + /* Already computed matches for first VEC in rdx. */ + test %VRDX, %VRDX jnz L(first_vec_x1) - VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0 - kmovd %k0, %eax - testl %eax, %eax + + VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMATCH, %k0 + KMOV %k0, %VRCX + test %VRCX, %VRCX jnz L(first_vec_x2) - VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0 - kmovd %k0, %eax - testl %eax, %eax + /* Needed regardless of next length check. */ + VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMATCH, %k0 + KMOV %k0, %VRCX + + /* Check if we are near the end. */ + cmpq $(CHAR_PER_VEC * 4 - 1), %rax + ja L(more_4x_vec) + + test %VRCX, %VRCX + jnz L(first_vec_x3_check) + + /* Use 8-bit instructions to save code size. We won't use full- + width eax again and will perform a write-only operation to + eax so no worries about partial-register stalls. */ + subb $(CHAR_PER_VEC * 3), %al + jb L(zero_2) +L(last_vec_check): + VPCMPEQ (VEC_SIZE * 4)(%rdi), %VMATCH, %k0 + KMOV %k0, %VRCX +# ifdef USE_AS_WMEMCHR + /* For wmemchr against we can't take advantage of tzcnt(0) == + VEC_SIZE as CHAR_PER_VEC != VEC_SIZE. 
*/ + test %VRCX, %VRCX + jz L(zero_2) +# endif + tzcnt %VRCX, %VRCX + cmp %cl, %al + jae L(first_vec_x4_ret) +L(zero_2): + xorl %eax, %eax + ret + + /* Fits at the end of the cache line here for VEC_SIZE == 64. + For VEC_SIZE == 32 we put the return label at the end of + L(first_vec_x4). */ +# if VEC_SIZE == 64 +L(first_vec_x4_ret): + leaq (VEC_SIZE * 4)(%rdi, %rcx, CHAR_SIZE), %rax + ret +# endif + + .p2align 4,, 6 +L(first_vec_x4): + bsf %VRCX, %VRCX +# if VEC_SIZE == 32 + /* Place L(first_vec_x4_ret) here as we can't fit it in the same + cache line as where it is called from so we might as well + save code size by reusing return of L(first_vec_x4). */ +L(first_vec_x4_ret): +# endif + leaq (VEC_SIZE * 4)(%rdi, %rcx, CHAR_SIZE), %rax + ret + + .p2align 4,, 6 +L(first_vec_x3_check): + /* Need to adjust remaining length before checking. */ + addb $-(CHAR_PER_VEC * 2), %al + bsf %VRCX, %VRCX + cmpb %cl, %al + jb L(zero_2) + leaq (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %rax + ret + + .p2align 4,, 6 +L(first_vec_x3): + bsf %VRCX, %VRCX + leaq (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %rax + ret + + .p2align 4,, 3 +# if !USE_TERN_IN_LOOP + .p2align 4,, 10 +# endif +L(more_4x_vec): + test %VRCX, %VRCX jnz L(first_vec_x3) - VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0 - kmovd %k0, %eax - testl %eax, %eax + VPCMPEQ (VEC_SIZE * 4)(%rdi), %VMATCH, %k0 + KMOV %k0, %VRCX + test %VRCX, %VRCX jnz L(first_vec_x4) + subq $-(VEC_SIZE * 5), %rdi + subq $(CHAR_PER_VEC * 8), %rax + jb L(last_4x_vec) -# ifndef USE_AS_RAWMEMCHR - /* Check if at last CHAR_PER_VEC * 4 length. */ - subq $(CHAR_PER_VEC * 4), %rdx - jbe L(last_4x_vec_or_less_cmpeq) - /* +VEC_SIZE if USE_IN_RTM otherwise +VEC_SIZE * 5. */ - addq $(VEC_SIZE + (VEC_SIZE * 4 - BASE_OFFSET)), %rdi - - /* Align data to VEC_SIZE * 4 for the loop and readjust length. - */ -# ifdef USE_AS_WMEMCHR +# ifdef USE_AS_WMEMCHR movl %edi, %ecx - andq $-(4 * VEC_SIZE), %rdi +# else + addq %rdi, %rax +# endif + + +# if VEC_SIZE == 64 + /* use xorb to do `andq $-(VEC_SIZE * 4), %rdi`. No evex + processor has partial register stalls (all have merging + uop). If that changes this can be removed. */ + xorb %dil, %dil +# else + andq $-(VEC_SIZE * 4), %rdi +# endif + +# ifdef USE_AS_WMEMCHR subl %edi, %ecx - /* NB: Divide bytes by 4 to get the wchar_t count. */ sarl $2, %ecx - addq %rcx, %rdx -# else - addq %rdi, %rdx - andq $-(4 * VEC_SIZE), %rdi - subq %rdi, %rdx -# endif + addq %rcx, %rax # else - addq $(VEC_SIZE + (VEC_SIZE * 4 - BASE_OFFSET)), %rdi - andq $-(4 * VEC_SIZE), %rdi + subq %rdi, %rax # endif -# ifdef USE_IN_RTM - vpxorq %XMMZERO, %XMMZERO, %XMMZERO -# else - /* copy ymmmatch to ymm0 so we can use vpcmpeq which is not - encodable with EVEX registers (ymm16-ymm31). */ - vmovdqa64 %YMMMATCH, %ymm0 + + + +# if USE_TERN_IN_LOOP + /* copy VMATCH to low ymm so we can use vpcmpeq which is not + encodable with EVEX registers. NB: this is VEC_SIZE == 32 + only as there is no way to encode vpcmpeq with zmm0-15. */ + vmovdqa64 %VMATCH, %VMATCH_LO # endif - /* Compare 4 * VEC at a time forward. */ - .p2align 4 + .p2align 4,, 11 L(loop_4x_vec): - /* Two versions of the loop. One that does not require - vzeroupper by not using ymm0-ymm15 and another does that require - vzeroupper because it uses ymm0-ymm15. The reason why ymm0-ymm15 - is used at all is because there is no EVEX encoding vpcmpeq and - with vpcmpeq this loop can be performed more efficiently. 
The - non-vzeroupper version is safe for RTM while the vzeroupper - version should be prefered if RTM are not supported. */ -# ifdef USE_IN_RTM - /* It would be possible to save some instructions using 4x VPCMP - but bottleneck on port 5 makes it not woth it. */ - VPCMP $4, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k1 - /* xor will set bytes match esi to zero. */ - vpxorq (VEC_SIZE * 5)(%rdi), %YMMMATCH, %YMM2 - vpxorq (VEC_SIZE * 6)(%rdi), %YMMMATCH, %YMM3 - VPCMP $0, (VEC_SIZE * 7)(%rdi), %YMMMATCH, %k3 - /* Reduce VEC2 / VEC3 with min and VEC1 with zero mask. */ - VPMINU %YMM2, %YMM3, %YMM3{%k1}{z} - VPCMP $0, %YMM3, %YMMZERO, %k2 -# else + /* Two versions of the loop. One that does not require + vzeroupper by not using ymmm0-15 and another does that + require vzeroupper because it uses ymmm0-15. The reason why + ymm0-15 is used at all is because there is no EVEX encoding + vpcmpeq and with vpcmpeq this loop can be performed more + efficiently. The non-vzeroupper version is safe for RTM + while the vzeroupper version should be prefered if RTM are + not supported. Which loop version we use is determined by + USE_TERN_IN_LOOP. */ + +# if USE_TERN_IN_LOOP /* Since vptern can only take 3x vectors fastest to do 1 vec seperately with EVEX vpcmp. */ # ifdef USE_AS_WMEMCHR /* vptern can only accept masks for epi32/epi64 so can only save - instruction using not equals mask on vptern with wmemchr. */ - VPCMP $4, (%rdi), %YMMMATCH, %k1 + instruction using not equals mask on vptern with wmemchr. + */ + VPCMP $4, (VEC_SIZE * 0)(%rdi), %VMATCH, %k1 # else - VPCMP $0, (%rdi), %YMMMATCH, %k1 + VPCMPEQ (VEC_SIZE * 0)(%rdi), %VMATCH, %k1 # endif /* Compare 3x with vpcmpeq and or them all together with vptern. */ - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm2 - VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm3 - VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm4 + VPCMPEQ (VEC_SIZE * 1)(%rdi), %VMATCH_LO, %VMM_lo(2) + VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMATCH_LO, %VMM_lo(3) + VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMATCH_LO, %VMM_lo(4) # ifdef USE_AS_WMEMCHR - /* This takes the not of or between ymm2, ymm3, ymm4 as well as - combines result from VEC0 with zero mask. */ - vpternlogd $1, %ymm2, %ymm3, %ymm4{%k1}{z} - vpmovmskb %ymm4, %ecx + /* This takes the not of or between VEC_lo(2), VEC_lo(3), + VEC_lo(4) as well as combines result from VEC(0) with zero + mask. */ + vpternlogd $1, %VMM_lo(2), %VMM_lo(3), %VMM_lo(4){%k1}{z} + vpmovmskb %VMM_lo(4), %VRCX # else - /* 254 is mask for oring ymm2, ymm3, ymm4 into ymm4. */ - vpternlogd $254, %ymm2, %ymm3, %ymm4 - vpmovmskb %ymm4, %ecx - kmovd %k1, %eax + /* 254 is mask for oring VEC_lo(2), VEC_lo(3), VEC_lo(4) into + VEC_lo(4). */ + vpternlogd $254, %VMM_lo(2), %VMM_lo(3), %VMM_lo(4) + vpmovmskb %VMM_lo(4), %VRCX + KMOV %k1, %edx # endif -# endif -# ifdef USE_AS_RAWMEMCHR - subq $-(VEC_SIZE * 4), %rdi -# endif -# ifdef USE_IN_RTM - kortestd %k2, %k3 # else -# ifdef USE_AS_WMEMCHR - /* ecx contains not of matches. All 1s means no matches. incl will - overflow and set zeroflag if that is the case. */ - incl %ecx -# else - /* If either VEC1 (eax) or VEC2-VEC4 (ecx) are not zero. Adding - to ecx is not an issue because if eax is non-zero it will be - used for returning the match. If it is zero the add does - nothing. */ - addq %rax, %rcx -# endif + /* Loop version that uses EVEX encoding. 
*/ + VPCMP $4, (VEC_SIZE * 0)(%rdi), %VMATCH, %k1 + vpxorq (VEC_SIZE * 1)(%rdi), %VMATCH, %VMM(2) + vpxorq (VEC_SIZE * 2)(%rdi), %VMATCH, %VMM(3) + VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMATCH, %k3 + VPMINU %VMM(2), %VMM(3), %VMM(3){%k1}{z} + VPTESTN %VMM(3), %VMM(3), %k2 # endif -# ifdef USE_AS_RAWMEMCHR - jz L(loop_4x_vec) -# else - jnz L(loop_4x_vec_end) + + + TEST_END () + jnz L(loop_vec_ret) subq $-(VEC_SIZE * 4), %rdi - subq $(CHAR_PER_VEC * 4), %rdx - ja L(loop_4x_vec) + subq $(CHAR_PER_VEC * 4), %rax + jae L(loop_4x_vec) - /* Fall through into less than 4 remaining vectors of length case. + /* COND_VZEROUPPER is vzeroupper if we use the VEX encoded loop. */ - VPCMP $0, BASE_OFFSET(%rdi), %YMMMATCH, %k0 - addq $(BASE_OFFSET - VEC_SIZE), %rdi - kmovd %k0, %eax - VZEROUPPER - -L(last_4x_vec_or_less): - /* Check if first VEC contained match. */ - testl %eax, %eax - jnz L(first_vec_x1_check) + COND_VZEROUPPER - /* If remaining length > CHAR_PER_VEC * 2. */ - addl $(CHAR_PER_VEC * 2), %edx - jg L(last_4x_vec) - -L(last_2x_vec): - /* If remaining length < CHAR_PER_VEC. */ - addl $CHAR_PER_VEC, %edx - jle L(zero_end) - - /* Check VEC2 and compare any match with remaining length. */ - VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0 - kmovd %k0, %eax - tzcntl %eax, %eax - cmpl %eax, %edx - jbe L(set_zero_end) - leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax -L(zero_end): - ret + .p2align 4,, 10 +L(last_4x_vec): + /* For CHAR_PER_VEC == 64 we don't need to mask as we use 8-bit + instructions on eax from here on out. */ +# if CHAR_PER_VEC != 64 + andl $(CHAR_PER_VEC * 4 - 1), %eax +# endif + VPCMPEQ (VEC_SIZE * 0)(%rdi), %VMATCH, %k0 + subq $(VEC_SIZE * 1), %rdi + KMOV %k0, %VRDX + cmpb $(CHAR_PER_VEC * 2 - 1), %al + jbe L(last_2x_vec) + test %VRDX, %VRDX + jnz L(last_vec_x1_novzero) + + VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMATCH, %k0 + KMOV %k0, %VRDX + test %VRDX, %VRDX + jnz L(last_vec_x2_novzero) + + VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMATCH, %k0 + KMOV %k0, %VRCX + test %VRCX, %VRCX + jnz L(first_vec_x3_check) + + subb $(CHAR_PER_VEC * 3), %al + jae L(last_vec_check) -L(set_zero_end): xorl %eax, %eax ret - .p2align 4 -L(first_vec_x1_check): - /* eax must be non-zero. Use bsfl to save code size. */ - bsfl %eax, %eax - /* Adjust length. */ - subl $-(CHAR_PER_VEC * 4), %edx - /* Check if match within remaining length. */ - cmpl %eax, %edx - jbe L(set_zero_end) - /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ - leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax +# if defined USE_AS_WMEMCHR && USE_TERN_IN_LOOP +L(last_vec_x2_novzero): + addq $VEC_SIZE, %rdi +L(last_vec_x1_novzero): + bsf %VRDX, %VRDX + leaq (VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %rax ret +# endif - .p2align 4 -L(loop_4x_vec_end): +# if CHAR_PER_VEC == 64 + /* Since we can't combine the last 2x VEC when CHAR_PER_VEC == + 64 it needs a seperate return label. */ + .p2align 4,, 4 +L(last_vec_x2): +L(last_vec_x2_novzero): + bsf %VRDX, %VRDX + leaq (VEC_SIZE * 2)(%rdi, %rdx, TERN_CHAR_MULT), %rax + ret # endif - /* rawmemchr will fall through into this if match was found in - loop. */ -# if defined USE_IN_RTM || defined USE_AS_WMEMCHR - /* k1 has not of matches with VEC1. */ - kmovd %k1, %eax -# ifdef USE_AS_WMEMCHR - subl $((1 << CHAR_PER_VEC) - 1), %eax -# else - incl %eax -# endif + .p2align 4,, 4 +L(loop_vec_ret): +# if defined USE_AS_WMEMCHR || !USE_TERN_IN_LOOP + KMOV %k1, %VRAX + inc %MASK_GPR(rax) # else - /* eax already has matches for VEC1. 
*/ - testl %eax, %eax + test %VRDX, %VRDX # endif - jnz L(last_vec_x1_return) + jnz L(last_vec_x0) -# ifdef USE_IN_RTM - VPCMP $0, %YMM2, %YMMZERO, %k0 - kmovd %k0, %eax + +# if USE_TERN_IN_LOOP + vpmovmskb %VMM_lo(2), %VRDX # else - vpmovmskb %ymm2, %eax + VPTESTN %VMM(2), %VMM(2), %k1 + KMOV %k1, %VRDX # endif - testl %eax, %eax - jnz L(last_vec_x2_return) + test %VRDX, %VRDX + jnz L(last_vec_x1) -# ifdef USE_IN_RTM - kmovd %k2, %eax - testl %eax, %eax - jnz L(last_vec_x3_return) - kmovd %k3, %eax - tzcntl %eax, %eax - leaq (VEC_SIZE * 3 + RET_OFFSET)(%rdi, %rax, CHAR_SIZE), %rax +# if USE_TERN_IN_LOOP + vpmovmskb %VMM_lo(3), %VRDX # else - vpmovmskb %ymm3, %eax - /* Combine matches in VEC3 (eax) with matches in VEC4 (ecx). */ - salq $VEC_SIZE, %rcx - orq %rcx, %rax - tzcntq %rax, %rax - leaq (VEC_SIZE * 2 + RET_OFFSET)(%rdi, %rax), %rax - VZEROUPPER + KMOV %k2, %VRDX # endif - ret - .p2align 4,, 10 -L(last_vec_x1_return): - tzcntl %eax, %eax -# if defined USE_AS_WMEMCHR || RET_OFFSET != 0 - /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ - leaq RET_OFFSET(%rdi, %rax, CHAR_SIZE), %rax + /* No longer need any of the lo vecs (ymm0-15) so vzeroupper + (only if used VEX encoded loop). */ + COND_VZEROUPPER + + /* Seperate logic for CHAR_PER_VEC == 64 vs the rest. For + CHAR_PER_VEC we test the last 2x VEC seperately, for + CHAR_PER_VEC <= 32 we can combine the results from the 2x + VEC in a single GPR. */ +# if CHAR_PER_VEC == 64 +# if USE_TERN_IN_LOOP +# error "Unsupported" +# endif + + + /* If CHAR_PER_VEC == 64 we can't combine the last two VEC. */ + test %VRDX, %VRDX + jnz L(last_vec_x2) + KMOV %k3, %VRDX # else - addq %rdi, %rax + /* CHAR_PER_VEC <= 32 so we can combine the results from the + last 2x VEC. */ + +# if !USE_TERN_IN_LOOP + KMOV %k3, %VRCX +# endif + salq $(VEC_SIZE / TERN_CHAR_MULT), %rcx + addq %rcx, %rdx +# if !defined USE_AS_WMEMCHR || !USE_TERN_IN_LOOP +L(last_vec_x2_novzero): +# endif # endif - VZEROUPPER + bsf %rdx, %rdx + leaq (LAST_VEC_OFFSET)(%rdi, %rdx, TERN_CHAR_MULT), %rax ret - .p2align 4 -L(last_vec_x2_return): - tzcntl %eax, %eax - /* NB: Multiply bytes by RET_SCALE to get the wchar_t count - if relevant (RET_SCALE = CHAR_SIZE if USE_AS_WMEMCHAR and - USE_IN_RTM are both defined. Otherwise RET_SCALE = 1. */ - leaq (VEC_SIZE + RET_OFFSET)(%rdi, %rax, RET_SCALE), %rax - VZEROUPPER + .p2align 4,, 8 +L(last_vec_x1): + COND_VZEROUPPER +# if !defined USE_AS_WMEMCHR || !USE_TERN_IN_LOOP +L(last_vec_x1_novzero): +# endif + bsf %VRDX, %VRDX + leaq (VEC_SIZE * 1)(%rdi, %rdx, TERN_CHAR_MULT), %rax ret -# ifdef USE_IN_RTM - .p2align 4 -L(last_vec_x3_return): - tzcntl %eax, %eax - /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ - leaq (VEC_SIZE * 2 + RET_OFFSET)(%rdi, %rax, CHAR_SIZE), %rax + + .p2align 4,, 4 +L(last_vec_x0): + COND_VZEROUPPER + bsf %VGPR(GPR_X0), %VGPR(GPR_X0) +# if GPR_X0_IS_RET + addq %rdi, %rax +# else + leaq (%rdi, %GPR_X0, CHAR_SIZE), %rax +# endif ret + + .p2align 4,, 6 +L(page_cross): + /* Need to preserve eax to compute inbound bytes we are + checking. */ +# ifdef USE_AS_WMEMCHR + movl %eax, %ecx +# else + xorl %ecx, %ecx + subl %eax, %ecx # endif -# ifndef USE_AS_RAWMEMCHR - .p2align 4,, 5 -L(last_4x_vec_or_less_cmpeq): - VPCMP $0, (VEC_SIZE * 5)(%rdi), %YMMMATCH, %k0 - kmovd %k0, %eax - subq $-(VEC_SIZE * 4), %rdi - /* Check first VEC regardless. 
*/ - testl %eax, %eax - jnz L(first_vec_x1_check) + xorq %rdi, %rax + VPCMPEQ (PAGE_SIZE - VEC_SIZE)(%rax), %VMATCH, %k0 + KMOV %k0, %VRAX - /* If remaining length <= CHAR_PER_VEC * 2. */ - addl $(CHAR_PER_VEC * 2), %edx - jle L(last_2x_vec) +# ifdef USE_AS_WMEMCHR + /* NB: Divide by CHAR_SIZE to shift out out of bounds bytes. */ + shrl $2, %ecx + andl $(CHAR_PER_VEC - 1), %ecx +# endif - .p2align 4 -L(last_4x_vec): - VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0 - kmovd %k0, %eax - testl %eax, %eax - jnz L(last_vec_x2) + shrx %VGPR(PC_SHIFT_GPR), %VRAX, %VRAX - VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0 - kmovd %k0, %eax - /* Create mask for possible matches within remaining length. */ -# ifdef USE_AS_WMEMCHR - movl $((1 << (CHAR_PER_VEC * 2)) - 1), %ecx - bzhil %edx, %ecx, %ecx -# else - movq $-1, %rcx - bzhiq %rdx, %rcx, %rcx -# endif - /* Test matches in data against length match. */ - andl %ecx, %eax - jnz L(last_vec_x3) +# ifdef USE_AS_WMEMCHR + negl %ecx +# endif - /* if remaining length <= CHAR_PER_VEC * 3 (Note this is after - remaining length was found to be > CHAR_PER_VEC * 2. */ - subl $CHAR_PER_VEC, %edx - jbe L(zero_end2) + /* mask lower bits from ecx (negative eax) to get bytes till + next VEC. */ + andl $(CHAR_PER_VEC - 1), %ecx + /* Check if VEC is entirely contained in the remainder of the + page. */ + cmpq %rcx, %rdx + jbe L(page_cross_ret) - VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0 - kmovd %k0, %eax - /* Shift remaining length mask for last VEC. */ -# ifdef USE_AS_WMEMCHR - shrl $CHAR_PER_VEC, %ecx -# else - shrq $CHAR_PER_VEC, %rcx -# endif - andl %ecx, %eax - jz L(zero_end2) - bsfl %eax, %eax - leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax -L(zero_end2): - ret + /* Length crosses the page so if rax is zero (no matches) + continue. */ + test %VRAX, %VRAX + jz L(page_cross_continue) -L(last_vec_x2): - tzcntl %eax, %eax - leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax + /* if rdx > rcx then any match here must be in [buf:buf + len]. + */ + tzcnt %VRAX, %VRAX +# ifdef USE_AS_WMEMCHR + leaq (%rdi, %rax, CHAR_SIZE), %rax +# else + addq %rdi, %rax +# endif ret - .p2align 4 -L(last_vec_x3): - tzcntl %eax, %eax - leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax + .p2align 4,, 2 +L(page_cross_zero): + xorl %eax, %eax ret + + .p2align 4,, 4 +L(page_cross_ret): + /* Search is entirely contained in page cross case. */ +# ifdef USE_AS_WMEMCHR + test %VRAX, %VRAX + jz L(page_cross_zero) +# endif + tzcnt %VRAX, %VRAX + cmpl %eax, %edx + jbe L(page_cross_zero) +# ifdef USE_AS_WMEMCHR + leaq (%rdi, %rax, CHAR_SIZE), %rax +# else + addq %rdi, %rax # endif - /* 7 bytes from next cache line. 
*/ + ret END (MEMCHR) #endif diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S b/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S index deda1ca395..2073eaa620 100644 --- a/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S +++ b/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S @@ -1,3 +1,6 @@ -#define MEMCHR __rawmemchr_evex_rtm -#define USE_AS_RAWMEMCHR 1 -#include "memchr-evex-rtm.S" +#define RAWMEMCHR __rawmemchr_evex_rtm + +#define USE_IN_RTM 1 +#define SECTION(p) p##.evex.rtm + +#include "rawmemchr-evex.S" diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex.S b/sysdeps/x86_64/multiarch/rawmemchr-evex.S index dc1c450699..dad54def2b 100644 --- a/sysdeps/x86_64/multiarch/rawmemchr-evex.S +++ b/sysdeps/x86_64/multiarch/rawmemchr-evex.S @@ -1,7 +1,308 @@ -#ifndef RAWMEMCHR -# define RAWMEMCHR __rawmemchr_evex -#endif -#define USE_AS_RAWMEMCHR 1 -#define MEMCHR RAWMEMCHR +/* rawmemchr optimized with 256-bit EVEX instructions. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include +#include + +#if ISA_SHOULD_BUILD (4) + +# ifndef VEC_SIZE +# include "x86-evex256-vecs.h" +# endif + +# ifndef RAWMEMCHR +# define RAWMEMCHR __rawmemchr_evex +# endif + + +# define PC_SHIFT_GPR rdi +# define REG_WIDTH VEC_SIZE +# define VPTESTN vptestnmb +# define VPBROADCAST vpbroadcastb +# define VPMINU vpminub +# define VPCMP vpcmpb +# define VPCMPEQ vpcmpeqb +# define CHAR_SIZE 1 + +# include "reg-macros.h" + +/* If not in an RTM and VEC_SIZE != 64 (the VEC_SIZE = 64 + doesn't have VEX encoding), use VEX encoding in loop so we + can use vpcmpeqb + vptern which is more efficient than the + EVEX alternative. */ +# if defined USE_IN_RTM || VEC_SIZE == 64 +# undef COND_VZEROUPPER +# undef VZEROUPPER_RETURN +# undef VZEROUPPER + + +# define COND_VZEROUPPER +# define VZEROUPPER_RETURN ret +# define VZEROUPPER + +# define USE_TERN_IN_LOOP 0 +# else +# define USE_TERN_IN_LOOP 1 +# undef VZEROUPPER +# define VZEROUPPER vzeroupper +# endif + +# define CHAR_PER_VEC VEC_SIZE + +# if CHAR_PER_VEC == 64 + +# define TAIL_RETURN_LBL first_vec_x2 +# define TAIL_RETURN_OFFSET (CHAR_PER_VEC * 2) + +# define FALLTHROUGH_RETURN_LBL first_vec_x3 +# define FALLTHROUGH_RETURN_OFFSET (CHAR_PER_VEC * 3) + +# else /* !(CHAR_PER_VEC == 64) */ + +# define TAIL_RETURN_LBL first_vec_x3 +# define TAIL_RETURN_OFFSET (CHAR_PER_VEC * 3) + +# define FALLTHROUGH_RETURN_LBL first_vec_x2 +# define FALLTHROUGH_RETURN_OFFSET (CHAR_PER_VEC * 2) +# endif /* !(CHAR_PER_VEC == 64) */ + + +# define VMATCH VMM(0) +# define VMATCH_LO VMM_lo(0) + +# define PAGE_SIZE 4096 + + .section SECTION(.text), "ax", @progbits +ENTRY_P2ALIGN (RAWMEMCHR, 6) + VPBROADCAST %esi, %VMATCH + /* Check if we may cross page boundary with one vector load. 
*/ + movl %edi, %eax + andl $(PAGE_SIZE - 1), %eax + cmpl $(PAGE_SIZE - VEC_SIZE), %eax + ja L(page_cross) + + VPCMPEQ (%rdi), %VMATCH, %k0 + KMOV %k0, %VRAX + + test %VRAX, %VRAX + jz L(aligned_more) +L(first_vec_x0): + bsf %VRAX, %VRAX + addq %rdi, %rax + ret + + .p2align 4,, 4 +L(first_vec_x4): + bsf %VRAX, %VRAX + leaq (VEC_SIZE * 4)(%rdi, %rax), %rax + ret -#include "memchr-evex.S" + /* For VEC_SIZE == 32 we can fit this in aligning bytes so might + as well place it more locally. For VEC_SIZE == 64 we reuse + return code at the end of loop's return. */ +# if VEC_SIZE == 32 + .p2align 4,, 4 +L(FALLTHROUGH_RETURN_LBL): + bsf %VRAX, %VRAX + leaq (FALLTHROUGH_RETURN_OFFSET)(%rdi, %rax), %rax + ret +# endif + + .p2align 4,, 6 +L(page_cross): + /* eax has lower page-offset bits of rdi so xor will zero them + out. */ + xorq %rdi, %rax + VPCMPEQ (PAGE_SIZE - VEC_SIZE)(%rax), %VMATCH, %k0 + KMOV %k0, %VRAX + + /* Shift out out-of-bounds matches. */ + shrx %VRDI, %VRAX, %VRAX + test %VRAX, %VRAX + jnz L(first_vec_x0) + + .p2align 4,, 10 +L(aligned_more): +L(page_cross_continue): + /* Align pointer. */ + andq $(VEC_SIZE * -1), %rdi + + VPCMPEQ VEC_SIZE(%rdi), %VMATCH, %k0 + KMOV %k0, %VRAX + test %VRAX, %VRAX + jnz L(first_vec_x1) + + VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMATCH, %k0 + KMOV %k0, %VRAX + test %VRAX, %VRAX + jnz L(first_vec_x2) + + VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMATCH, %k0 + KMOV %k0, %VRAX + test %VRAX, %VRAX + jnz L(first_vec_x3) + + VPCMPEQ (VEC_SIZE * 4)(%rdi), %VMATCH, %k0 + KMOV %k0, %VRAX + test %VRAX, %VRAX + jnz L(first_vec_x4) + + subq $-(VEC_SIZE * 1), %rdi +# if VEC_SIZE == 64 + /* Saves code size. No evex512 processor has partial register + stalls. If that change this can be replaced with `andq + $-(VEC_SIZE * 4), %rdi`. */ + xorb %dil, %dil +# else + andq $-(VEC_SIZE * 4), %rdi +# endif + +# if USE_TERN_IN_LOOP + /* copy VMATCH to low ymm so we can use vpcmpeq which is not + encodable with EVEX registers. NB: this is VEC_SIZE == 32 + only as there is no way to encode vpcmpeq with zmm0-15. */ + vmovdqa64 %VMATCH, %VMATCH_LO +# endif + + .p2align 4 +L(loop_4x_vec): + /* Two versions of the loop. One that does not require + vzeroupper by not using ymm0-15 and another does that + require vzeroupper because it uses ymm0-15. The reason why + ymm0-15 is used at all is because there is no EVEX encoding + vpcmpeq and with vpcmpeq this loop can be performed more + efficiently. The non-vzeroupper version is safe for RTM + while the vzeroupper version should be prefered if RTM are + not supported. Which loop version we use is determined by + USE_TERN_IN_LOOP. */ + +# if USE_TERN_IN_LOOP + /* Since vptern can only take 3x vectors fastest to do 1 vec + seperately with EVEX vpcmp. */ + VPCMPEQ (VEC_SIZE * 4)(%rdi), %VMATCH, %k1 + /* Compare 3x with vpcmpeq and or them all together with vptern. + */ + + VPCMPEQ (VEC_SIZE * 5)(%rdi), %VMATCH_LO, %VMM_lo(2) + subq $(VEC_SIZE * -4), %rdi + VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMATCH_LO, %VMM_lo(3) + VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMATCH_LO, %VMM_lo(4) + + /* 254 is mask for oring VEC_lo(2), VEC_lo(3), VEC_lo(4) into + VEC_lo(4). */ + vpternlogd $254, %VMM_lo(2), %VMM_lo(3), %VMM_lo(4) + vpmovmskb %VMM_lo(4), %VRCX + + KMOV %k1, %eax + + /* NB: rax has match from first VEC and rcx has matches from + VEC 2-4. If rax is non-zero we will return that match. If + rax is zero adding won't disturb the bits in rcx. */ + add %rax, %rcx +# else + /* Loop version that uses EVEX encoding. 
*/ + VPCMP $4, (VEC_SIZE * 4)(%rdi), %VMATCH, %k1 + vpxorq (VEC_SIZE * 5)(%rdi), %VMATCH, %VMM(2) + vpxorq (VEC_SIZE * 6)(%rdi), %VMATCH, %VMM(3) + VPCMPEQ (VEC_SIZE * 7)(%rdi), %VMATCH, %k3 + VPMINU %VMM(2), %VMM(3), %VMM(3){%k1}{z} + VPTESTN %VMM(3), %VMM(3), %k2 + subq $(VEC_SIZE * -4), %rdi + KORTEST %k2, %k3 +# endif + jz L(loop_4x_vec) + +# if USE_TERN_IN_LOOP + test %VRAX, %VRAX +# else + KMOV %k1, %VRAX + inc %VRAX +# endif + jnz L(last_vec_x0) + + +# if USE_TERN_IN_LOOP + vpmovmskb %VMM_lo(2), %VRAX +# else + VPTESTN %VMM(2), %VMM(2), %k1 + KMOV %k1, %VRAX +# endif + test %VRAX, %VRAX + jnz L(last_vec_x1) + + +# if USE_TERN_IN_LOOP + vpmovmskb %VMM_lo(3), %VRAX +# else + KMOV %k2, %VRAX +# endif + + /* No longer need any of the lo vecs (ymm0-15) so vzeroupper + (only if used VEX encoded loop). */ + COND_VZEROUPPER + + /* Seperate logic for VEC_SIZE == 64 and VEC_SIZE == 32 for + returning last 2x VEC. For VEC_SIZE == 64 we test each VEC + individually, for VEC_SIZE == 32 we combine them in a single + 64-bit GPR. */ +# if CHAR_PER_VEC == 64 +# if USE_TERN_IN_LOOP +# error "Unsupported" +# endif + + + /* If CHAR_PER_VEC == 64 we can't combine the last two VEC. */ + test %VRAX, %VRAX + jnz L(first_vec_x2) + KMOV %k3, %VRAX +L(FALLTHROUGH_RETURN_LBL): +# else + /* CHAR_PER_VEC <= 32 so we can combine the results from the + last 2x VEC. */ +# if !USE_TERN_IN_LOOP + KMOV %k3, %VRCX +# endif + salq $CHAR_PER_VEC, %rcx + addq %rcx, %rax +# endif + bsf %rax, %rax + leaq (FALLTHROUGH_RETURN_OFFSET)(%rdi, %rax), %rax + ret + + .p2align 4,, 8 +L(TAIL_RETURN_LBL): + bsf %rax, %rax + leaq (TAIL_RETURN_OFFSET)(%rdi, %rax), %rax + ret + + .p2align 4,, 8 +L(last_vec_x1): + COND_VZEROUPPER +L(first_vec_x1): + bsf %VRAX, %VRAX + leaq (VEC_SIZE * 1)(%rdi, %rax), %rax + ret + + .p2align 4,, 8 +L(last_vec_x0): + COND_VZEROUPPER + bsf %VRAX, %VRAX + addq %rdi, %rax + ret +END (RAWMEMCHR) +#endif
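The VEX-encoded loop in both files (USE_TERN_IN_LOOP) folds three vpcmpeqb
results into one register with a single vpternlogd whose immediate 254 (0xFE)
computes a | b | c, then reads the result with one vpmovmskb.  The sketch
below is a rough C-intrinsics rendering of that idea, assuming AVX2 plus
AVX512VL and a made-up helper name (match_mask_3x); note that the real loop
only uses the combined mask as a zero/non-zero test and re-examines the
individual vectors once a matching iteration is found, since the OR loses
which vector matched.

#include <immintrin.h>  /* Compile with -mavx512vl (pulls in the AVX2 parts).  */

/* Fold three 32-byte compares into one bitmask: imm8 == 0xFE (254) makes
   vpternlogd compute a | b | c, so a single vpmovmskb afterwards is enough
   to tell whether any of the 96 bytes matched.  */
static unsigned int
match_mask_3x (const void *p, char c)
{
  __m256i match = _mm256_set1_epi8 (c);
  __m256i m1 = _mm256_cmpeq_epi8 (_mm256_loadu_si256 ((const __m256i *) p + 0), match);
  __m256i m2 = _mm256_cmpeq_epi8 (_mm256_loadu_si256 ((const __m256i *) p + 1), match);
  __m256i m3 = _mm256_cmpeq_epi8 (_mm256_loadu_si256 ((const __m256i *) p + 2), match);
  __m256i any = _mm256_ternarylogic_epi32 (m1, m2, m3, 0xFE);
  return (unsigned int) _mm256_movemask_epi8 (any);
}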