public inbox for libc-stable@sourceware.org
 help / color / mirror / Atom feed
* Re: [PATCH v3 1/3] x86: Optimize memcmp SSE2 in memcmp.S
       [not found]             ` <CAFUsyfKre7d72QZtAx_E67vP7x-N8fuePorgg5vhe+5HDPQaVA@mail.gmail.com>
@ 2022-05-12 19:59               ` Sunil Pandey
  0 siblings, 0 replies; 3+ messages in thread
From: Sunil Pandey @ 2022-05-12 19:59 UTC (permalink / raw)
  To: Noah Goldstein, Libc-stable Mailing List; +Cc: Joseph Myers, GNU C Library

[-- Attachment #1: Type: text/plain, Size: 1780 bytes --]

On Tue, Apr 19, 2022 at 6:21 PM Noah Goldstein via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> On Tue, Apr 19, 2022 at 5:56 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > On Tue, Apr 19, 2022 at 3:53 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > >
> > > On Tue, Apr 19, 2022 at 3:30 PM Joseph Myers <joseph@codesourcery.com> wrote:
> > > >
> > > > A recent commit, probably this one, broke the build for x86_64-linux-gnu
> > > > --disable-multi-arch.
> > > >
> > > > /scratch/jmyers/glibc-bot/install/compilers/x86_64-linux-gnu/lib/gcc/x86_64-glibc-linux-gnu/11.2.1/../../../../x86_64-glibc-linux-gnu/bin/ld: /scratch/jmyers/glibc-bot/build/glibcs/x86_64-linux-gnu-minimal/glibc/libc_pic.os: in function `internal_fnwmatch':
> > > > /scratch/jmyers/glibc-bot/src/glibc/posix/fnmatch_loop.c:513: undefined reference to `__wmemcmp'
> > > > /scratch/jmyers/glibc-bot/install/compilers/x86_64-linux-gnu/lib/gcc/x86_64-glibc-linux-gnu/11.2.1/../../../../x86_64-glibc-linux-gnu/bin/ld: /scratch/jmyers/glibc-bot/src/glibc/posix/fnmatch_loop.c:536: undefined reference to `__wmemcmp'
> > > > /scratch/jmyers/glibc-bot/install/compilers/x86_64-linux-gnu/lib/gcc/x86_64-glibc-linux-gnu/11.2.1/../../../../x86_64-glibc-linux-gnu/bin/ld: /scratch/jmyers/glibc-bot/src/glibc/posix/fnmatch_loop.c:704: undefined reference to `__wmemcmp'
> > > >
> > > > https://sourceware.org/pipermail/libc-testresults/2022q2/009576.html
> > >
> > > Reproduced issue, think have fix. Testing it now.
> >
> > Posted a patch.
>
> Pushed fix.
> > > >
> > > > --
> > > > Joseph S. Myers
> > > > joseph@codesourcery.com

I would like to backport this patch to release branches.
Any comments or objections?

Conflict resolution patch attached.

--Sunil

[-- Attachment #2: 0019-x86-Optimize-memcmp-SSE2-in-memcmp.S.patch --]
[-- Type: application/octet-stream, Size: 25841 bytes --]

From 06e92fca9937b2255538ae703b398e0e6a609445 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Fri, 15 Apr 2022 12:27:59 -0500
Subject: [PATCH 19/26] x86: Optimize memcmp SSE2 in memcmp.S

New code saves size (-303 bytes) and has significantly better
performance.

geometric_mean(N=20) of page cross cases New / Original: 0.634
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

(cherry picked from commit 8804157ad9da39631703b92315460808eac86b0c)
---
 sysdeps/x86_64/memcmp.S                  | 884 ++++++++++++++---------
 sysdeps/x86_64/memcmpeq.S                |   2 +-
 sysdeps/x86_64/multiarch/Makefile        |   2 +-
 sysdeps/x86_64/multiarch/memcmp-sse2.S   |   4 +-
 sysdeps/x86_64/multiarch/memcmpeq-sse2.S |   4 +-
 sysdeps/x86_64/multiarch/wmemcmp-c.c     |   9 -
 sysdeps/x86_64/multiarch/wmemcmp-sse2.S  |  25 +
 sysdeps/x86_64/wmemcmp.S                 |  21 +
 8 files changed, 575 insertions(+), 376 deletions(-)
 delete mode 100644 sysdeps/x86_64/multiarch/wmemcmp-c.c
 create mode 100644 sysdeps/x86_64/multiarch/wmemcmp-sse2.S
 create mode 100644 sysdeps/x86_64/wmemcmp.S

diff --git a/sysdeps/x86_64/memcmp.S b/sysdeps/x86_64/memcmp.S
index e02a53ea1e..b153694048 100644
--- a/sysdeps/x86_64/memcmp.S
+++ b/sysdeps/x86_64/memcmp.S
@@ -18,395 +18,557 @@
 
 #include <sysdep.h>
 
-	.text
-ENTRY (memcmp)
-#ifdef __ILP32__
-	/* Clear the upper 32 bits.  */
-	movl	%edx, %edx
+#ifdef USE_AS_WMEMCMP
+# define PCMPEQ	pcmpeqd
+# define CHAR_SIZE	4
+# define SIZE_OFFSET	(0)
+#else
+# define PCMPEQ	pcmpeqb
+# define CHAR_SIZE	1
 #endif
-	test	%RDX_LP, %RDX_LP
-	jz	L(finz)
-	cmpq	$1, %rdx
-	jbe	L(finr1b)
-	subq	%rdi, %rsi
-	movq	%rdx, %r10
-	cmpq	$32, %r10
-	jae	L(gt32)
-	/* Handle small chunks and last block of less than 32 bytes.  */
-L(small):
-	testq	$1, %r10
-	jz	L(s2b)
-	movzbl	(%rdi),	%eax
-	movzbl	(%rdi, %rsi), %edx
-	subq    $1, %r10
-	je	L(finz1)
-	addq	$1, %rdi
-	subl	%edx, %eax
-	jnz	L(exit)
-L(s2b):
-	testq	$2, %r10
-	jz	L(s4b)
-	movzwl	(%rdi),	%eax
-	movzwl	(%rdi, %rsi), %edx
-	subq    $2, %r10
+
 #ifdef USE_AS_MEMCMPEQ
-	je	L(finz1)
+# define SIZE_OFFSET	(0)
+# define CHECK_CMP(x, y)	subl x, y
 #else
-	je	L(fin2_7)
+# ifndef SIZE_OFFSET
+#  define SIZE_OFFSET	(CHAR_PER_VEC * 2)
+# endif
+# define CHECK_CMP(x, y)	cmpl x, y
 #endif
-	addq	$2, %rdi
-	cmpl	%edx, %eax
-#ifdef USE_AS_MEMCMPEQ
-	jnz	L(neq_early)
+
+#define VEC_SIZE	16
+#define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
+
+#ifndef MEMCMP
+# define MEMCMP	memcmp
+#endif
+
+	.text
+ENTRY(MEMCMP)
+#ifdef USE_AS_WMEMCMP
+	/* Use 0xffff to test for mismatches on pmovmskb bitmask. Store
+	   in ecx for code size. This is preferable to using `incw` as
+	   it avoids partial register stalls on older hardware (pre
+	   SnB).  */
+	movl	$0xffff, %ecx
+#endif
+	cmpq	$CHAR_PER_VEC, %rdx
+	ja	L(more_1x_vec)
+
+#ifdef USE_AS_WMEMCMP
+	/* saves a byte of code keeping the fall through path n = [2, 4]
+	   in the initial cache line.  */
+	decl	%edx
+	jle	L(cmp_0_1)
+
+	movq	(%rsi), %xmm0
+	movq	(%rdi), %xmm1
+	PCMPEQ	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+	subl	%ecx, %eax
+	jnz	L(ret_nonzero_vec_start_0)
+
+	movq	-4(%rsi, %rdx, CHAR_SIZE), %xmm0
+	movq	-4(%rdi, %rdx, CHAR_SIZE), %xmm1
+	PCMPEQ	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+	subl	%ecx, %eax
+	jnz	L(ret_nonzero_vec_end_0_adj)
 #else
-	jnz	L(fin2_7)
+	cmpl	$8, %edx
+	ja	L(cmp_9_16)
+
+	cmpl	$4, %edx
+	jb	L(cmp_0_3)
+
+# ifdef USE_AS_MEMCMPEQ
+	movl	(%rsi), %eax
+	subl	(%rdi), %eax
+
+	movl	-4(%rsi, %rdx), %esi
+	subl	-4(%rdi, %rdx), %esi
+
+	orl	%esi, %eax
+	ret
+# else
+	/* Combine comparisons for lo and hi 4-byte comparisons.  */
+	movl	-4(%rsi, %rdx), %ecx
+	movl	-4(%rdi, %rdx), %eax
+	shlq	$32, %rcx
+	shlq	$32, %rax
+	movl	(%rsi), %esi
+	movl	(%rdi), %edi
+	orq	%rsi, %rcx
+	orq	%rdi, %rax
+	/* Only compute proper return if not-equal.  */
+	cmpq	%rcx, %rax
+	jnz	L(ret_nonzero)
+	xorl	%eax, %eax
+	ret
+# endif
+
+	.p2align 4,, 10
+L(cmp_9_16):
+# ifdef USE_AS_MEMCMPEQ
+	movq	(%rsi), %rax
+	subq	(%rdi), %rax
+
+	movq	-8(%rsi, %rdx), %rcx
+	subq	-8(%rdi, %rdx), %rcx
+	orq	%rcx, %rax
+	/* Convert 64 bit -> 32 bit boolean (we should have made the ABI
+	   return long).  */
+	setnz	%cl
+	movzbl	%cl, %eax
+# else
+	movq	(%rsi), %rcx
+	movq	(%rdi), %rax
+	/* Only compute proper return if not-equal.  */
+	cmpq	%rcx, %rax
+	jnz	L(ret_nonzero)
+
+	movq	-8(%rsi, %rdx, CHAR_SIZE), %rcx
+	movq	-8(%rdi, %rdx, CHAR_SIZE), %rax
+	/* Only compute proper return if not-equal.  */
+	cmpq	%rcx, %rax
+	jnz	L(ret_nonzero)
+	xorl	%eax, %eax
+# endif
 #endif
-L(s4b):
-	testq	$4, %r10
-	jz	L(s8b)
-	movl	(%rdi),	%eax
-	movl	(%rdi, %rsi), %edx
-	subq    $4, %r10
-#ifdef USE_AS_MEMCMPEQ
-	je	L(finz1)
+	ret
+
+	.p2align 4,, 8
+L(cmp_0_1):
+	/* Flag set by earlier comparison against 1.  */
+	jne	L(cmp_0_0)
+#ifdef USE_AS_WMEMCMP
+	movl	(%rdi), %ecx
+	xorl	%edx, %edx
+	cmpl	(%rsi), %ecx
+	je	L(cmp_0_0)
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
 #else
-	je	L(fin2_7)
+	movzbl	(%rdi), %eax
+	movzbl	(%rsi), %ecx
+	subl	%ecx, %eax
 #endif
-	addq	$4, %rdi
-	cmpl	%edx, %eax
-#ifdef USE_AS_MEMCMPEQ
-	jnz	L(neq_early)
+	ret
+
+	/* Fits in aligning bytes.  */
+L(cmp_0_0):
+	xorl	%eax, %eax
+	ret
+
+#ifdef USE_AS_WMEMCMP
+	.p2align 4
+L(ret_nonzero_vec_start_0):
+	bsfl	%eax, %eax
+	movl	(%rdi, %rax), %ecx
+	xorl	%edx, %edx
+	cmpl	(%rsi, %rax), %ecx
+	/* NB: no partial register stall here because xorl zero idiom
+	   above.  */
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+	ret
+#else
+
+# ifndef USE_AS_MEMCMPEQ
+	.p2align 4,, 14
+L(ret_nonzero):
+	/* Need to bswap to get proper return without branch.  */
+	bswapq	%rcx
+	bswapq	%rax
+	subq	%rcx, %rax
+	sbbl	%eax, %eax
+	orl	$1, %eax
+	ret
+# endif
+
+	.p2align 4
+L(cmp_0_3):
+# ifdef USE_AS_MEMCMPEQ
+	/* No reason to add to dependency chain on rdx. Saving the
+	   bytes here doesn't change number of fetch blocks.  */
+	cmpl	$1, %edx
+	jbe	L(cmp_0_1)
+# else
+	/* We need the code size to prevent taking an extra fetch block.
+	 */
+	decl	%edx
+	jle	L(cmp_0_1)
+# endif
+	movzwl	(%rsi), %ecx
+	movzwl	(%rdi), %eax
+
+# ifdef USE_AS_MEMCMPEQ
+	subl	%ecx, %eax
+
+	movzbl	-1(%rsi, %rdx), %esi
+	movzbl	-1(%rdi, %rdx), %edi
+	subl	%edi, %esi
+	orl	%esi, %eax
+# else
+	bswapl	%ecx
+	bswapl	%eax
+
+	/* Implicit right shift by one. We just need to displace the
+	   sign bits.  */
+	shrl	%ecx
+	shrl	%eax
+
+	/* Eat a partial register stall here. Saves code stopping
+	   L(cmp_0_3) from bleeding into the next fetch block and saves
+	   an ALU.  */
+	movb	(%rsi, %rdx), %cl
+	movzbl	(%rdi, %rdx), %edi
+	orl	%edi, %eax
+	subl	%ecx, %eax
+# endif
+	ret
+#endif
+
+	.p2align 5
+L(more_1x_vec):
+#ifndef USE_AS_WMEMCMP
+	/* Use 0xffff to test for mismatches on pmovmskb bitmask. Store
+	   in ecx for code size. This is preferable to using `incw` as
+	   it avoids partial register stalls on older hardware (pre
+	   SnB).  */
+	movl	$0xffff, %ecx
+#endif
+	movups	(%rsi), %xmm0
+	movups	(%rdi), %xmm1
+	PCMPEQ	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+	subl	%ecx, %eax
+	jnz	L(ret_nonzero_vec_start_0)
+#if SIZE_OFFSET == 0
+	cmpq	$(CHAR_PER_VEC * 2), %rdx
 #else
-	jnz	L(fin2_7)
+	/* Offset rdx. Saves just enough code size to keep the
+	   L(last_2x_vec) case and the non-zero return in a single
+	   cache line.  */
+	subq	$(CHAR_PER_VEC * 2), %rdx
 #endif
-L(s8b):
-	testq	$8, %r10
-	jz	L(s16b)
-	movq	(%rdi),	%rax
-	movq	(%rdi, %rsi), %rdx
-	subq    $8, %r10
-#ifdef USE_AS_MEMCMPEQ
-	je	L(sub_return8)
+	ja	L(more_2x_vec)
+
+	movups	(VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm0
+	movups	(VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm1
+	PCMPEQ	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+	subl	%ecx, %eax
+#ifndef USE_AS_MEMCMPEQ
+	/* Don't use `incw ax` as machines this code runs on are liable
+	   to have partial register stall.  */
+	jnz	L(ret_nonzero_vec_end_0)
 #else
-	je	L(fin2_7)
+	/* Various return targets for memcmpeq. Will always be hot in
+	   Icache and get short encoding.  */
+L(ret_nonzero_vec_start_1):
+L(ret_nonzero_vec_start_0):
+L(ret_nonzero_vec_end_0):
 #endif
-	addq	$8, %rdi
-	cmpq	%rdx, %rax
-#ifdef USE_AS_MEMCMPEQ
-	jnz	L(neq_early)
+	ret
+
+#ifndef USE_AS_MEMCMPEQ
+# ifdef USE_AS_WMEMCMP
+	.p2align 4
+L(ret_nonzero_vec_end_0_adj):
+	addl	$3, %edx
+# else
+	.p2align 4,, 8
+# endif
+L(ret_nonzero_vec_end_0):
+	bsfl	%eax, %eax
+# ifdef USE_AS_WMEMCMP
+	leal	(%rax, %rdx, CHAR_SIZE), %eax
+	movl	(VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rax), %ecx
+	xorl	%edx, %edx
+	cmpl	(VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rax), %ecx
+	/* NB: no partial register stall here because xorl zero idiom
+	   above.  */
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+# else
+	addl	%edx, %eax
+	movzbl	(VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rax), %ecx
+	movzbl	(VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rax), %eax
+	subl	%ecx, %eax
+# endif
+	ret
+# ifndef USE_AS_WMEMCMP
+	.p2align 4,, 10
+L(ret_nonzero_vec_start_0):
+	bsfl	%eax, %eax
+	movzbl	(%rsi, %rax), %ecx
+	movzbl	(%rdi, %rax), %eax
+	subl	%ecx, %eax
+	ret
+# endif
 #else
-	jnz	L(fin2_7)
 #endif
-L(s16b):
-	movdqu    (%rdi), %xmm1
-	movdqu    (%rdi, %rsi), %xmm0
-	pcmpeqb   %xmm0, %xmm1
+
+	.p2align 5
+L(more_2x_vec):
+	movups	(VEC_SIZE * 1)(%rsi), %xmm0
+	movups	(VEC_SIZE * 1)(%rdi), %xmm1
+	PCMPEQ	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+	subl	%ecx, %eax
+	jnz	L(ret_nonzero_vec_start_1)
+
+	cmpq	$(CHAR_PER_VEC * 4 - SIZE_OFFSET), %rdx
+	jbe	L(last_2x_vec)
+
+	cmpq	$(CHAR_PER_VEC * 8 - SIZE_OFFSET), %rdx
+	ja	L(more_8x_vec)
+
+	/* Do comparisons for [65, 96] and [97, 128] 2x VEC at a time.
+	   This can harm performance if non-zero return in [65, 80] or
+	   [97, 112] but helps performance otherwise. Generally zero-
+	   return is hotter.  */
+	movups	(VEC_SIZE * 2)(%rsi), %xmm0
+	movups	(VEC_SIZE * 2)(%rdi), %xmm1
+	PCMPEQ	%xmm0, %xmm1
+	movups	(VEC_SIZE * 3)(%rsi), %xmm2
+	movups	(VEC_SIZE * 3)(%rdi), %xmm3
+	PCMPEQ	%xmm2, %xmm3
+	pand	%xmm1, %xmm3
+
+	pmovmskb %xmm3, %eax
+	CHECK_CMP (%ecx, %eax)
+	jnz	L(ret_nonzero_vec_start_2_3)
+
+	cmpl	$(CHAR_PER_VEC * 6 - SIZE_OFFSET), %edx
+	jbe	L(last_2x_vec)
+
+	movups	(VEC_SIZE * 4)(%rsi), %xmm0
+	movups	(VEC_SIZE * 4)(%rdi), %xmm1
+	PCMPEQ	%xmm0, %xmm1
+	movups	(VEC_SIZE * 5)(%rsi), %xmm2
+	movups	(VEC_SIZE * 5)(%rdi), %xmm3
+	PCMPEQ	%xmm2, %xmm3
+	pand	%xmm1, %xmm3
+
+	pmovmskb %xmm3, %eax
+	CHECK_CMP (%ecx, %eax)
 #ifdef USE_AS_MEMCMPEQ
-	pmovmskb  %xmm1, %eax
-	subl      $0xffff, %eax
+	jz	L(last_2x_vec)
 	ret
 #else
-	pmovmskb  %xmm1, %edx
-	xorl	  %eax, %eax
-	subl      $0xffff, %edx
-	jz	  L(finz)
-	bsfl      %edx, %ecx
-	leaq	 (%rdi, %rcx), %rcx
-	movzbl	 (%rcx), %eax
-	movzbl	 (%rsi, %rcx), %edx
-	jmp	 L(finz1)
+	jnz	L(ret_nonzero_vec_start_4_5)
 #endif
-	.p2align 4,, 4
-L(finr1b):
-	movzbl	(%rdi), %eax
-	movzbl  (%rsi), %edx
-L(finz1):
-	subl	%edx, %eax
-L(exit):
-	ret
+	.p2align 4
+L(last_2x_vec):
+	movups	(VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm0
+	movups	(VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm1
+	PCMPEQ	%xmm0, %xmm1
+	movups	(VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm2
+	movups	(VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm3
+	PCMPEQ	%xmm2, %xmm3
+	pand	%xmm1, %xmm3
+	pmovmskb %xmm3, %eax
+	subl	%ecx, %eax
 #ifdef USE_AS_MEMCMPEQ
-	.p2align 4,, 4
-L(sub_return8):
-	subq	%rdx, %rax
-	movl	%eax, %edx
-	shrq	$32, %rax
-	orl	%edx, %eax
+	/* Various return targets for memcmpeq. Will always be hot in
+	   Icache and get short encoding.  */
+L(ret_nonzero_vec_start_2_3):
+L(ret_nonzero_vec_start_4_5):
 	ret
 #else
-	.p2align 4,, 4
-L(fin2_7):
-	cmpq	%rdx, %rax
-	jz	L(finz)
-	movq	%rax, %r11
-	subq	%rdx, %r11
-	bsfq	%r11, %rcx
-	sarq	$3, %rcx
-	salq	$3, %rcx
-	sarq	%cl, %rax
-	movzbl  %al, %eax
-	sarq	%cl, %rdx
-	movzbl  %dl, %edx
-	subl	%edx, %eax
+	jnz	L(ret_nonzero_vec_end_1)
 	ret
-#endif
-	.p2align 4,, 4
-L(finz):
-	xorl	%eax, %eax
+
+	.p2align 4,, 8
+L(ret_nonzero_vec_end_1):
+	pmovmskb %xmm1, %ecx
+	/* High 16 bits of eax guaranteed to be all ones. Rotate them in
+	   so we can do `or + not` with just `xor`.  */
+	rorl	$16, %eax
+	xorl	%ecx, %eax
+	/* Partial register stall.  */
+
+	bsfl	%eax, %eax
+# ifdef USE_AS_WMEMCMP
+	leal	(%rax, %rdx, CHAR_SIZE), %eax
+	movl	(VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rax), %ecx
+	xorl	%edx, %edx
+	cmpl	(VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rax), %ecx
+	/* NB: no partial register stall here because xorl zero idiom
+	   above.  */
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+# else
+	addl	%edx, %eax
+	movzbl	(VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rax), %ecx
+	movzbl	(VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rax), %eax
+	subl	%ecx, %eax
+# endif
 	ret
-#ifdef USE_AS_MEMCMPEQ
-	.p2align 4,, 4
-L(neq_early):
-	movl	$1, %eax
+
+	.p2align 4
+L(ret_nonzero_vec_start_4_5):
+	pmovmskb %xmm1, %edx
+	sall	$16, %eax
+	leal	1(%rax, %rdx), %eax
+	bsfl	%eax, %eax
+# ifdef USE_AS_WMEMCMP
+	movl	(VEC_SIZE * 4)(%rdi, %rax), %ecx
+	xorl	%edx, %edx
+	cmpl	(VEC_SIZE * 4)(%rsi, %rax), %ecx
+	/* NB: no partial register stall here because xorl zero idiom
+	   above.  */
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+# else
+	movzbl	(VEC_SIZE * 4)(%rsi, %rax), %ecx
+	movzbl	(VEC_SIZE * 4)(%rdi, %rax), %eax
+	subl	%ecx, %eax
+# endif
+	ret
+
+	.p2align 4,, 8
+L(ret_nonzero_vec_start_1):
+	bsfl	%eax, %eax
+# ifdef USE_AS_WMEMCMP
+	movl	(VEC_SIZE * 1)(%rdi, %rax), %ecx
+	xorl	%edx, %edx
+	cmpl	(VEC_SIZE * 1)(%rsi, %rax), %ecx
+	/* NB: no partial register stall here because xorl zero idiom
+	   above.  */
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+# else
+	movzbl	(VEC_SIZE * 1)(%rsi, %rax), %ecx
+	movzbl	(VEC_SIZE * 1)(%rdi, %rax), %eax
+	subl	%ecx, %eax
+# endif
 	ret
 #endif
-	/* For blocks bigger than 32 bytes
-	   1. Advance one of the addr pointer to be 16B aligned.
-	   2. Treat the case of both addr pointers aligned to 16B
-	      separately to avoid movdqu.
-	   3. Handle any blocks of greater than 64 consecutive bytes with
-	      unrolling to reduce branches.
-	   4. At least one addr pointer is 16B aligned, use memory version
-	      of pcmbeqb.
-	*/
-	.p2align 4,, 4
-L(gt32):
-	movq	%rdx, %r11
-	addq	%rdi, %r11
-	movq	%rdi, %r8
-
-	andq	$15, %r8
-	jz	L(16am)
-	/* Both pointers may be misaligned.  */
-	movdqu	(%rdi),	%xmm1
-	movdqu	(%rdi, %rsi), %xmm0
-	pcmpeqb   %xmm0, %xmm1
-	pmovmskb  %xmm1, %edx
-	subl      $0xffff, %edx
-	jnz       L(neq)
-	neg	 %r8
-	leaq    16(%rdi, %r8), %rdi
-L(16am):
-	/* Handle two 16B aligned pointers separately.  */
-	testq   $15, %rsi
-	jz      L(ATR)
-	testq	$16, %rdi
-	jz	L(A32)
-	movdqu	(%rdi, %rsi), %xmm0
-	pcmpeqb   (%rdi), %xmm0
-	pmovmskb  %xmm0, %edx
-	subl      $0xffff, %edx
-	jnz       L(neq)
-	addq	$16, %rdi
-L(A32):
-	movq	%r11, %r10
-	andq	$-32, %r10
-	cmpq	%r10, %rdi
-        jae	L(mt16)
-	/* Pre-unroll to be ready for unrolled 64B loop.  */
-	testq	$32, %rdi
-	jz	L(A64)
-	movdqu    (%rdi,%rsi), %xmm0
-	pcmpeqb   (%rdi), %xmm0
-	pmovmskb  %xmm0, %edx
-	subl      $0xffff, %edx
-	jnz       L(neq)
-	addq       $16, %rdi
-
-	movdqu    (%rdi,%rsi), %xmm0
-	pcmpeqb  (%rdi), %xmm0
-	pmovmskb  %xmm0, %edx
-	subl      $0xffff, %edx
-	jnz       L(neq)
-	addq       $16, %rdi
-
-L(A64):
-	movq	%r11, %r10
-	andq	$-64, %r10
-	cmpq	%r10, %rdi
-        jae	L(mt32)
-
-L(A64main):
-	movdqu    (%rdi,%rsi), %xmm0
-	pcmpeqb   (%rdi), %xmm0
-	pmovmskb  %xmm0, %edx
-	subl      $0xffff, %edx
-	jnz       L(neq)
-	addq       $16, %rdi
-
-	movdqu    (%rdi,%rsi), %xmm0
-	pcmpeqb   (%rdi), %xmm0
-	pmovmskb  %xmm0, %edx
-	subl      $0xffff, %edx
-	jnz       L(neq)
-	addq       $16, %rdi
-
-	movdqu    (%rdi,%rsi), %xmm0
-	pcmpeqb   (%rdi), %xmm0
-	pmovmskb  %xmm0, %edx
-	subl      $0xffff, %edx
-	jnz       L(neq)
-	addq       $16, %rdi
-
-	movdqu    (%rdi,%rsi), %xmm0
-	pcmpeqb  (%rdi), %xmm0
-	pmovmskb  %xmm0, %edx
-	subl      $0xffff, %edx
-	jnz       L(neq)
-	addq       $16, %rdi
-
-	cmpq       %rdi, %r10
-	jne       L(A64main)
-
-L(mt32):
-	movq	%r11, %r10
-	andq	$-32, %r10
-	cmpq	%r10, %rdi
-        jae	L(mt16)
-
-L(A32main):
-	movdqu    (%rdi,%rsi), %xmm0
-	pcmpeqb   (%rdi), %xmm0
-	pmovmskb  %xmm0, %edx
-	subl      $0xffff, %edx
-	jnz       L(neq)
-	addq       $16, %rdi
-
-	movdqu    (%rdi,%rsi), %xmm0
-	pcmpeqb  (%rdi), %xmm0
-	pmovmskb  %xmm0, %edx
-	subl      $0xffff, %edx
-	jnz       L(neq)
-	addq       $16, %rdi
-
-	cmpq       %rdi, %r10
-	jne       L(A32main)
-L(mt16):
-	subq       %rdi, %r11
-	je	  L(finz)
-	movq	  %r11, %r10
-	jmp	  L(small)
-
-	.p2align 4,, 4
-L(neq):
-#ifdef USE_AS_MEMCMPEQ
-	movl	$1, %eax
-    ret
-#else
-	bsfl      %edx, %ecx
-	movzbl	 (%rdi, %rcx), %eax
-	addq	 %rdi, %rsi
-	movzbl	 (%rsi,%rcx), %edx
-	jmp	 L(finz1)
+
+	.p2align 4
+L(more_8x_vec):
+	subq	%rdi, %rsi
+	leaq	(VEC_SIZE * -6 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %rdx
+	andq	$(VEC_SIZE * -1), %rdi
+	addq	%rdi, %rsi
+	.p2align 4
+L(loop_4x):
+	movups	(VEC_SIZE * 2)(%rsi), %xmm0
+	movups	(VEC_SIZE * 3)(%rsi), %xmm1
+
+	PCMPEQ	(VEC_SIZE * 2)(%rdi), %xmm0
+	PCMPEQ	(VEC_SIZE * 3)(%rdi), %xmm1
+
+	movups	(VEC_SIZE * 4)(%rsi), %xmm2
+	movups	(VEC_SIZE * 5)(%rsi), %xmm3
+
+	PCMPEQ	(VEC_SIZE * 4)(%rdi), %xmm2
+	PCMPEQ	(VEC_SIZE * 5)(%rdi), %xmm3
+
+	pand	%xmm0, %xmm1
+	pand	%xmm2, %xmm3
+	pand	%xmm1, %xmm3
+
+	pmovmskb %xmm3, %eax
+	subl	%ecx, %eax
+	jnz	L(ret_nonzero_loop)
+
+	addq	$(VEC_SIZE * 4), %rdi
+	addq	$(VEC_SIZE * 4), %rsi
+	cmpq	%rdi, %rdx
+	ja	L(loop_4x)
+	/* Get remaining length in edx.  */
+	subl	%edi, %edx
+	/* Restore offset so we can reuse L(last_2x_vec).  */
+	addl	$(VEC_SIZE * 6 - SIZE_OFFSET), %edx
+#ifdef USE_AS_WMEMCMP
+	shrl	$2, %edx
 #endif
+	cmpl	$(CHAR_PER_VEC * 4 - SIZE_OFFSET), %edx
+	jbe	L(last_2x_vec)
+
 
-	.p2align 4,, 4
-L(ATR):
-	movq	%r11, %r10
-	andq	$-32, %r10
-	cmpq	%r10, %rdi
-        jae	L(mt16)
-	testq	$16, %rdi
-	jz	L(ATR32)
-
-	movdqa    (%rdi,%rsi), %xmm0
-	pcmpeqb   (%rdi), %xmm0
-	pmovmskb  %xmm0, %edx
-	subl      $0xffff, %edx
-	jnz       L(neq)
-	addq       $16, %rdi
-	cmpq       %rdi, %r10
-	je       L(mt16)
-
-L(ATR32):
-	movq	%r11, %r10
-	andq	$-64, %r10
-	testq	$32, %rdi
-	jz	L(ATR64)
-
-	movdqa    (%rdi,%rsi), %xmm0
-	pcmpeqb   (%rdi), %xmm0
-	pmovmskb  %xmm0, %edx
-	subl      $0xffff, %edx
-	jnz       L(neq)
-	addq       $16, %rdi
-
-	movdqa    (%rdi,%rsi), %xmm0
-	pcmpeqb   (%rdi), %xmm0
-	pmovmskb  %xmm0, %edx
-	subl      $0xffff, %edx
-	jnz       L(neq)
-	addq       $16, %rdi
-
-L(ATR64):
-	cmpq       %rdi, %r10
-	je	   L(mt32)
-
-L(ATR64main):
-	movdqa    (%rdi,%rsi), %xmm0
-	pcmpeqb   (%rdi), %xmm0
-	pmovmskb  %xmm0, %edx
-	subl      $0xffff, %edx
-	jnz       L(neq)
-	addq       $16, %rdi
-
-	movdqa    (%rdi,%rsi), %xmm0
-	pcmpeqb   (%rdi), %xmm0
-	pmovmskb  %xmm0, %edx
-	subl      $0xffff, %edx
-	jnz       L(neq)
-	addq       $16, %rdi
-
-	movdqa    (%rdi,%rsi), %xmm0
-	pcmpeqb   (%rdi), %xmm0
-	pmovmskb  %xmm0, %edx
-	subl      $0xffff, %edx
-	jnz       L(neq)
-	addq       $16, %rdi
-
-	movdqa    (%rdi,%rsi), %xmm0
-	pcmpeqb   (%rdi), %xmm0
-	pmovmskb  %xmm0, %edx
-	subl      $0xffff, %edx
-	jnz       L(neq)
-	addq       $16, %rdi
-	cmpq       %rdi, %r10
-	jne       L(ATR64main)
-
-	movq	%r11, %r10
-	andq	$-32, %r10
-	cmpq	%r10, %rdi
-        jae	L(mt16)
-
-L(ATR32res):
-	movdqa    (%rdi,%rsi), %xmm0
-	pcmpeqb   (%rdi), %xmm0
-	pmovmskb  %xmm0, %edx
-	subl      $0xffff, %edx
-	jnz       L(neq)
-	addq       $16, %rdi
-
-	movdqa    (%rdi,%rsi), %xmm0
-	pcmpeqb   (%rdi), %xmm0
-	pmovmskb  %xmm0, %edx
-	subl      $0xffff, %edx
-	jnz       L(neq)
-	addq       $16, %rdi
-
-	cmpq	  %r10, %rdi
-	jne       L(ATR32res)
-
-	subq       %rdi, %r11
-	je	  L(finz)
-	movq	  %r11, %r10
-	jmp	  L(small)
-	/* Align to 16byte to improve instruction fetch.  */
-	.p2align 4,, 4
-END(memcmp)
+	movups	(VEC_SIZE * 2)(%rsi), %xmm0
+	movups	(VEC_SIZE * 2)(%rdi), %xmm1
+	PCMPEQ	%xmm0, %xmm1
+	movups	(VEC_SIZE * 3)(%rsi), %xmm2
+	movups	(VEC_SIZE * 3)(%rdi), %xmm3
+	PCMPEQ	%xmm2, %xmm3
+	pand	%xmm1, %xmm3
 
+	pmovmskb %xmm3, %eax
+	CHECK_CMP (%ecx, %eax)
+	jz	L(last_2x_vec)
 #ifdef USE_AS_MEMCMPEQ
-libc_hidden_def (memcmp)
+L(ret_nonzero_loop):
+	ret
 #else
-# undef bcmp
-weak_alias (memcmp, bcmp)
-libc_hidden_builtin_def (memcmp)
+
+	.p2align 4
+L(ret_nonzero_vec_start_2_3):
+	pmovmskb %xmm1, %edx
+	sall	$16, %eax
+	leal	1(%rax, %rdx), %eax
+
+	bsfl	%eax, %eax
+# ifdef USE_AS_WMEMCMP
+	movl	(VEC_SIZE * 2)(%rdi, %rax), %ecx
+	xorl	%edx, %edx
+	cmpl	(VEC_SIZE * 2)(%rsi, %rax), %ecx
+	/* NB: no partial register stall here because xorl zero idiom
+	   above.  */
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+# else
+	movzbl	(VEC_SIZE * 2)(%rsi, %rax), %ecx
+	movzbl	(VEC_SIZE * 2)(%rdi, %rax), %eax
+	subl	%ecx, %eax
+# endif
+	ret
+
+	.p2align 4
+L(ret_nonzero_loop):
+	pmovmskb %xmm0, %ecx
+	pmovmskb %xmm1, %edx
+	sall	$(VEC_SIZE * 1), %edx
+	leal	1(%rcx, %rdx), %edx
+	pmovmskb %xmm2, %ecx
+	/* High 16 bits of eax guaranteed to be all ones. Rotate them in
+	   so we can do `or + not` with just `xor`.  */
+	rorl	$16, %eax
+	xorl	%ecx, %eax
+
+	salq	$32, %rax
+	orq	%rdx, %rax
+
+	bsfq	%rax, %rax
+# ifdef USE_AS_WMEMCMP
+	movl	(VEC_SIZE * 2)(%rdi, %rax), %ecx
+	xorl	%edx, %edx
+	cmpl	(VEC_SIZE * 2)(%rsi, %rax), %ecx
+	/* NB: no partial register stall here because xorl zero idiom
+	   above.  */
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+# else
+	movzbl	(VEC_SIZE * 2)(%rsi, %rax), %ecx
+	movzbl	(VEC_SIZE * 2)(%rdi, %rax), %eax
+	subl	%ecx, %eax
+# endif
+	ret
+#endif
+END(MEMCMP)
+
+#ifndef USE_AS_WMEMCMP
+# ifdef USE_AS_MEMCMPEQ
+libc_hidden_def (MEMCMP)
+# else
+#  undef bcmp
+weak_alias (MEMCMP, bcmp)
+libc_hidden_builtin_def (MEMCMP)
+# endif
 #endif
diff --git a/sysdeps/x86_64/memcmpeq.S b/sysdeps/x86_64/memcmpeq.S
index 2cee881fed..80c5e912a6 100644
--- a/sysdeps/x86_64/memcmpeq.S
+++ b/sysdeps/x86_64/memcmpeq.S
@@ -16,6 +16,6 @@
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
-#define memcmp	__memcmpeq
+#define MEMCMP	__memcmpeq
 #define USE_AS_MEMCMPEQ	1
 #include "multiarch/memcmp-sse2.S"
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 6507d1b7fa..ecb4dea190 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -176,8 +176,8 @@ sysdep_routines += \
   wmemchr-sse2 \
   wmemcmp-avx2-movbe \
   wmemcmp-avx2-movbe-rtm \
-  wmemcmp-c \
   wmemcmp-evex-movbe \
+  wmemcmp-sse2 \
   wmemcmp-sse4 \
   wmemcmp-ssse3 \
 # sysdep_routines
diff --git a/sysdeps/x86_64/multiarch/memcmp-sse2.S b/sysdeps/x86_64/multiarch/memcmp-sse2.S
index e10555638d..4080fc1875 100644
--- a/sysdeps/x86_64/multiarch/memcmp-sse2.S
+++ b/sysdeps/x86_64/multiarch/memcmp-sse2.S
@@ -17,8 +17,8 @@
    <https://www.gnu.org/licenses/>.  */
 
 #if IS_IN (libc)
-# ifndef memcmp
-#  define memcmp __memcmp_sse2
+# ifndef MEMCMP
+#  define MEMCMP __memcmp_sse2
 # endif
 
 # ifdef SHARED
diff --git a/sysdeps/x86_64/multiarch/memcmpeq-sse2.S b/sysdeps/x86_64/multiarch/memcmpeq-sse2.S
index de7f5a7525..9d991e5c74 100644
--- a/sysdeps/x86_64/multiarch/memcmpeq-sse2.S
+++ b/sysdeps/x86_64/multiarch/memcmpeq-sse2.S
@@ -17,9 +17,9 @@
    <https://www.gnu.org/licenses/>.  */
 
 #if IS_IN (libc)
-# define memcmp	__memcmpeq_sse2
+# define MEMCMP	__memcmpeq_sse2
 #else
-# define memcmp	__memcmpeq
+# define MEMCMP	__memcmpeq
 #endif
 #define USE_AS_MEMCMPEQ	1
 #include "memcmp-sse2.S"
diff --git a/sysdeps/x86_64/multiarch/wmemcmp-c.c b/sysdeps/x86_64/multiarch/wmemcmp-c.c
deleted file mode 100644
index 46b6715e18..0000000000
--- a/sysdeps/x86_64/multiarch/wmemcmp-c.c
+++ /dev/null
@@ -1,9 +0,0 @@
-#if IS_IN (libc)
-# include <wchar.h>
-
-# define WMEMCMP  __wmemcmp_sse2
-
-extern __typeof (wmemcmp) __wmemcmp_sse2;
-#endif
-
-#include "wcsmbs/wmemcmp.c"
diff --git a/sysdeps/x86_64/multiarch/wmemcmp-sse2.S b/sysdeps/x86_64/multiarch/wmemcmp-sse2.S
new file mode 100644
index 0000000000..57be1c446e
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wmemcmp-sse2.S
@@ -0,0 +1,25 @@
+/* wmemcmp optimized with SSE2.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+# define MEMCMP	__wmemcmp_sse2
+#else
+# define MEMCMP	wmemcmp
+#endif
+#define USE_AS_WMEMCMP	1
+#include "memcmp-sse2.S"
diff --git a/sysdeps/x86_64/wmemcmp.S b/sysdeps/x86_64/wmemcmp.S
new file mode 100644
index 0000000000..032f389158
--- /dev/null
+++ b/sysdeps/x86_64/wmemcmp.S
@@ -0,0 +1,21 @@
+/* wmemcmp optimized with SSE2.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define MEMCMP	wmemcmp
+#define USE_AS_WMEMCMP	1
+#include "multiarch/memcmp-sse2.S"
-- 
2.35.1


^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [PATCH v3 2/3] x86: Remove memcmp-sse4.S
       [not found]     ` <CAMe9rOr2Cr4FRJDY2p7J3HVtDt4VmQAkt-MPtETj-+cCfvUJ0w@mail.gmail.com>
@ 2022-05-12 20:01       ` Sunil Pandey
  0 siblings, 0 replies; 3+ messages in thread
From: Sunil Pandey @ 2022-05-12 20:01 UTC (permalink / raw)
  To: H.J. Lu, Libc-stable Mailing List; +Cc: Noah Goldstein, GNU C Library

[-- Attachment #1: Type: text/plain, Size: 29709 bytes --]

On Fri, Apr 15, 2022 at 10:33 AM H.J. Lu via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> On Fri, Apr 15, 2022 at 10:28 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > Code didn't actually use any sse4 instructions since `ptest` was
> > removed in:
> >
> > commit 2f9062d7171850451e6044ef78d91ff8c017b9c0
> > Author: Noah Goldstein <goldstein.w.n@gmail.com>
> > Date:   Wed Nov 10 16:18:56 2021 -0600
> >
> >     x86: Shrink memcmp-sse4.S code size
> >
> > The new memcmp-sse2 implementation is also faster.
> >
> > geometric_mean(N=20) of page cross cases SSE2 / SSE4: 0.905
> >
> > Note there are two regressions prefering SSE2 for Size = 1 and Size =
>                                                      preferring
>
> LGTM with the commit log typo fix.
>
> Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
>
> Thanks.
>
> > 65.
> >
> > Size = 1:
> > size, align0, align1, ret, New Time/Old Time
> >    1,      1,      1,   0,               1.2
> >    1,      1,      1,   1,             1.197
> >    1,      1,      1,  -1,               1.2
> >
> > This is intentional. Size == 1 is significantly less hot based on
> > profiles of GCC11 and Python3 than sizes [4, 8] (which is made
> > hotter).
> >
> > Python3 Size = 1        -> 13.64%
> > Python3 Size = [4, 8]   -> 60.92%
> >
> > GCC11   Size = 1        ->  1.29%
> > GCC11   Size = [4, 8]   -> 33.86%
> >
> > size, align0, align1, ret, New Time/Old Time
> >    4,      4,      4,   0,             0.622
> >    4,      4,      4,   1,             0.797
> >    4,      4,      4,  -1,             0.805
> >    5,      5,      5,   0,             0.623
> >    5,      5,      5,   1,             0.777
> >    5,      5,      5,  -1,             0.802
> >    6,      6,      6,   0,             0.625
> >    6,      6,      6,   1,             0.813
> >    6,      6,      6,  -1,             0.788
> >    7,      7,      7,   0,             0.625
> >    7,      7,      7,   1,             0.799
> >    7,      7,      7,  -1,             0.795
> >    8,      8,      8,   0,             0.625
> >    8,      8,      8,   1,             0.848
> >    8,      8,      8,  -1,             0.914
> >    9,      9,      9,   0,             0.625
> >
> > Size = 65:
> > size, align0, align1, ret, New Time/Old Time
> >   65,      0,      0,   0,             1.103
> >   65,      0,      0,   1,             1.216
> >   65,      0,      0,  -1,             1.227
> >   65,     65,      0,   0,             1.091
> >   65,      0,     65,   1,              1.19
> >   65,     65,     65,  -1,             1.215
> >
> > This is because A) the checks in range [65, 96] are now unrolled 2x
> > and B) because smaller values <= 16 are now given a hotter path. By
> > contrast the SSE4 version has a branch for Size = 80. The unrolled
> > version has get better performance for returns which need both
> > comparisons.
> >
> > size, align0, align1, ret, New Time/Old Time
> >  128,      4,      8,   0,             0.858
> >  128,      4,      8,   1,             0.879
> >  128,      4,      8,  -1,             0.888
> >
> > As well, out of microbenchmark environments that are not full
> > predictable the branch will have a real-cost.
> > ---
> >  sysdeps/x86_64/multiarch/Makefile          |   2 -
> >  sysdeps/x86_64/multiarch/ifunc-impl-list.c |   4 -
> >  sysdeps/x86_64/multiarch/ifunc-memcmp.h    |   4 -
> >  sysdeps/x86_64/multiarch/memcmp-sse4.S     | 803 ---------------------
> >  4 files changed, 813 deletions(-)
> >  delete mode 100644 sysdeps/x86_64/multiarch/memcmp-sse4.S
> >
> > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> > index b573966966..0400ea332b 100644
> > --- a/sysdeps/x86_64/multiarch/Makefile
> > +++ b/sysdeps/x86_64/multiarch/Makefile
> > @@ -11,7 +11,6 @@ sysdep_routines += \
> >    memcmp-avx2-movbe-rtm \
> >    memcmp-evex-movbe \
> >    memcmp-sse2 \
> > -  memcmp-sse4 \
> >    memcmpeq-avx2 \
> >    memcmpeq-avx2-rtm \
> >    memcmpeq-evex \
> > @@ -164,7 +163,6 @@ sysdep_routines += \
> >    wmemcmp-avx2-movbe-rtm \
> >    wmemcmp-evex-movbe \
> >    wmemcmp-sse2 \
> > -  wmemcmp-sse4 \
> >  # sysdep_routines
> >  endif
> >
> > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > index c6008a73ed..a8afcf81bb 100644
> > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > @@ -96,8 +96,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> >                                && CPU_FEATURE_USABLE (BMI2)
> >                                && CPU_FEATURE_USABLE (MOVBE)),
> >                               __memcmp_evex_movbe)
> > -             IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSE4_1),
> > -                             __memcmp_sse4_1)
> >               IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_sse2))
> >
> >  #ifdef SHARED
> > @@ -809,8 +807,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> >                                && CPU_FEATURE_USABLE (BMI2)
> >                                && CPU_FEATURE_USABLE (MOVBE)),
> >                               __wmemcmp_evex_movbe)
> > -             IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSE4_1),
> > -                             __wmemcmp_sse4_1)
> >               IFUNC_IMPL_ADD (array, i, wmemcmp, 1, __wmemcmp_sse2))
> >
> >    /* Support sysdeps/x86_64/multiarch/wmemset.c.  */
> > diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
> > index 44759a3ad5..c743970fe3 100644
> > --- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h
> > +++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
> > @@ -20,7 +20,6 @@
> >  # include <init-arch.h>
> >
> >  extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
> > -extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
> >  extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe) attribute_hidden;
> >  extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe_rtm) attribute_hidden;
> >  extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_movbe) attribute_hidden;
> > @@ -46,8 +45,5 @@ IFUNC_SELECTOR (void)
> >         return OPTIMIZE (avx2_movbe);
> >      }
> >
> > -  if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1))
> > -    return OPTIMIZE (sse4_1);
> > -
> >    return OPTIMIZE (sse2);
> >  }
> > diff --git a/sysdeps/x86_64/multiarch/memcmp-sse4.S b/sysdeps/x86_64/multiarch/memcmp-sse4.S
> > deleted file mode 100644
> > index cd57c1e2c7..0000000000
> > --- a/sysdeps/x86_64/multiarch/memcmp-sse4.S
> > +++ /dev/null
> > @@ -1,803 +0,0 @@
> > -/* memcmp with SSE4.1, wmemcmp with SSE4.1
> > -   Copyright (C) 2010-2022 Free Software Foundation, Inc.
> > -   This file is part of the GNU C Library.
> > -
> > -   The GNU C Library is free software; you can redistribute it and/or
> > -   modify it under the terms of the GNU Lesser General Public
> > -   License as published by the Free Software Foundation; either
> > -   version 2.1 of the License, or (at your option) any later version.
> > -
> > -   The GNU C Library is distributed in the hope that it will be useful,
> > -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > -   Lesser General Public License for more details.
> > -
> > -   You should have received a copy of the GNU Lesser General Public
> > -   License along with the GNU C Library; if not, see
> > -   <https://www.gnu.org/licenses/>.  */
> > -
> > -#if IS_IN (libc)
> > -
> > -# include <sysdep.h>
> > -
> > -# ifndef MEMCMP
> > -#  define MEMCMP       __memcmp_sse4_1
> > -# endif
> > -
> > -#ifdef USE_AS_WMEMCMP
> > -# define CMPEQ pcmpeqd
> > -# define CHAR_SIZE     4
> > -#else
> > -# define CMPEQ pcmpeqb
> > -# define CHAR_SIZE     1
> > -#endif
> > -
> > -
> > -/* Warning!
> > -           wmemcmp has to use SIGNED comparison for elements.
> > -           memcmp has to use UNSIGNED comparison for elemnts.
> > -*/
> > -
> > -       .section .text.sse4.1,"ax",@progbits
> > -ENTRY (MEMCMP)
> > -# ifdef USE_AS_WMEMCMP
> > -       shl     $2, %RDX_LP
> > -# elif defined __ILP32__
> > -       /* Clear the upper 32 bits.  */
> > -       mov     %edx, %edx
> > -# endif
> > -       cmp     $79, %RDX_LP
> > -       ja      L(79bytesormore)
> > -
> > -       cmp     $CHAR_SIZE, %RDX_LP
> > -       jbe     L(firstbyte)
> > -
> > -       /* N in (CHAR_SIZE, 79) bytes.  */
> > -       cmpl    $32, %edx
> > -       ja      L(more_32_bytes)
> > -
> > -       cmpl    $16, %edx
> > -       jae     L(16_to_32_bytes)
> > -
> > -# ifndef USE_AS_WMEMCMP
> > -       cmpl    $8, %edx
> > -       jae     L(8_to_16_bytes)
> > -
> > -       cmpl    $4, %edx
> > -       jb      L(2_to_3_bytes)
> > -
> > -       movl    (%rdi), %eax
> > -       movl    (%rsi), %ecx
> > -
> > -       bswap   %eax
> > -       bswap   %ecx
> > -
> > -       shlq    $32, %rax
> > -       shlq    $32, %rcx
> > -
> > -       movl    -4(%rdi, %rdx), %edi
> > -       movl    -4(%rsi, %rdx), %esi
> > -
> > -       bswap   %edi
> > -       bswap   %esi
> > -
> > -       orq     %rdi, %rax
> > -       orq     %rsi, %rcx
> > -       subq    %rcx, %rax
> > -       cmovne  %edx, %eax
> > -       sbbl    %ecx, %ecx
> > -       orl     %ecx, %eax
> > -       ret
> > -
> > -       .p2align 4,, 8
> > -L(2_to_3_bytes):
> > -       movzwl  (%rdi), %eax
> > -       movzwl  (%rsi), %ecx
> > -       shll    $8, %eax
> > -       shll    $8, %ecx
> > -       bswap   %eax
> > -       bswap   %ecx
> > -       movzbl  -1(%rdi, %rdx), %edi
> > -       movzbl  -1(%rsi, %rdx), %esi
> > -       orl     %edi, %eax
> > -       orl     %esi, %ecx
> > -       subl    %ecx, %eax
> > -       ret
> > -
> > -       .p2align 4,, 8
> > -L(8_to_16_bytes):
> > -       movq    (%rdi), %rax
> > -       movq    (%rsi), %rcx
> > -
> > -       bswap   %rax
> > -       bswap   %rcx
> > -
> > -       subq    %rcx, %rax
> > -       jne     L(8_to_16_bytes_done)
> > -
> > -       movq    -8(%rdi, %rdx), %rax
> > -       movq    -8(%rsi, %rdx), %rcx
> > -
> > -       bswap   %rax
> > -       bswap   %rcx
> > -
> > -       subq    %rcx, %rax
> > -
> > -L(8_to_16_bytes_done):
> > -       cmovne  %edx, %eax
> > -       sbbl    %ecx, %ecx
> > -       orl     %ecx, %eax
> > -       ret
> > -# else
> > -       xorl    %eax, %eax
> > -       movl    (%rdi), %ecx
> > -       cmpl    (%rsi), %ecx
> > -       jne     L(8_to_16_bytes_done)
> > -       movl    4(%rdi), %ecx
> > -       cmpl    4(%rsi), %ecx
> > -       jne     L(8_to_16_bytes_done)
> > -       movl    -4(%rdi, %rdx), %ecx
> > -       cmpl    -4(%rsi, %rdx), %ecx
> > -       jne     L(8_to_16_bytes_done)
> > -       ret
> > -# endif
> > -
> > -       .p2align 4,, 3
> > -L(ret_zero):
> > -       xorl    %eax, %eax
> > -L(zero):
> > -       ret
> > -
> > -       .p2align 4,, 8
> > -L(firstbyte):
> > -       jb      L(ret_zero)
> > -# ifdef USE_AS_WMEMCMP
> > -       xorl    %eax, %eax
> > -       movl    (%rdi), %ecx
> > -       cmpl    (%rsi), %ecx
> > -       je      L(zero)
> > -L(8_to_16_bytes_done):
> > -       setg    %al
> > -       leal    -1(%rax, %rax), %eax
> > -# else
> > -       movzbl  (%rdi), %eax
> > -       movzbl  (%rsi), %ecx
> > -       sub     %ecx, %eax
> > -# endif
> > -       ret
> > -
> > -       .p2align 4
> > -L(vec_return_begin_48):
> > -       addq    $16, %rdi
> > -       addq    $16, %rsi
> > -L(vec_return_begin_32):
> > -       bsfl    %eax, %eax
> > -# ifdef USE_AS_WMEMCMP
> > -       movl    32(%rdi, %rax), %ecx
> > -       xorl    %edx, %edx
> > -       cmpl    32(%rsi, %rax), %ecx
> > -       setg    %dl
> > -       leal    -1(%rdx, %rdx), %eax
> > -# else
> > -       movzbl  32(%rsi, %rax), %ecx
> > -       movzbl  32(%rdi, %rax), %eax
> > -       subl    %ecx, %eax
> > -# endif
> > -       ret
> > -
> > -       .p2align 4
> > -L(vec_return_begin_16):
> > -       addq    $16, %rdi
> > -       addq    $16, %rsi
> > -L(vec_return_begin):
> > -       bsfl    %eax, %eax
> > -# ifdef USE_AS_WMEMCMP
> > -       movl    (%rdi, %rax), %ecx
> > -       xorl    %edx, %edx
> > -       cmpl    (%rsi, %rax), %ecx
> > -       setg    %dl
> > -       leal    -1(%rdx, %rdx), %eax
> > -# else
> > -       movzbl  (%rsi, %rax), %ecx
> > -       movzbl  (%rdi, %rax), %eax
> > -       subl    %ecx, %eax
> > -# endif
> > -       ret
> > -
> > -       .p2align 4
> > -L(vec_return_end_16):
> > -       subl    $16, %edx
> > -L(vec_return_end):
> > -       bsfl    %eax, %eax
> > -       addl    %edx, %eax
> > -# ifdef USE_AS_WMEMCMP
> > -       movl    -16(%rdi, %rax), %ecx
> > -       xorl    %edx, %edx
> > -       cmpl    -16(%rsi, %rax), %ecx
> > -       setg    %dl
> > -       leal    -1(%rdx, %rdx), %eax
> > -# else
> > -       movzbl  -16(%rsi, %rax), %ecx
> > -       movzbl  -16(%rdi, %rax), %eax
> > -       subl    %ecx, %eax
> > -# endif
> > -       ret
> > -
> > -       .p2align 4,, 8
> > -L(more_32_bytes):
> > -       movdqu  (%rdi), %xmm0
> > -       movdqu  (%rsi), %xmm1
> > -       CMPEQ   %xmm0, %xmm1
> > -       pmovmskb %xmm1, %eax
> > -       incw    %ax
> > -       jnz     L(vec_return_begin)
> > -
> > -       movdqu  16(%rdi), %xmm0
> > -       movdqu  16(%rsi), %xmm1
> > -       CMPEQ   %xmm0, %xmm1
> > -       pmovmskb %xmm1, %eax
> > -       incw    %ax
> > -       jnz     L(vec_return_begin_16)
> > -
> > -       cmpl    $64, %edx
> > -       jbe     L(32_to_64_bytes)
> > -       movdqu  32(%rdi), %xmm0
> > -       movdqu  32(%rsi), %xmm1
> > -       CMPEQ   %xmm0, %xmm1
> > -       pmovmskb %xmm1, %eax
> > -       incw    %ax
> > -       jnz     L(vec_return_begin_32)
> > -
> > -       .p2align 4,, 6
> > -L(32_to_64_bytes):
> > -       movdqu  -32(%rdi, %rdx), %xmm0
> > -       movdqu  -32(%rsi, %rdx), %xmm1
> > -       CMPEQ   %xmm0, %xmm1
> > -       pmovmskb %xmm1, %eax
> > -       incw    %ax
> > -       jnz     L(vec_return_end_16)
> > -
> > -       movdqu  -16(%rdi, %rdx), %xmm0
> > -       movdqu  -16(%rsi, %rdx), %xmm1
> > -       CMPEQ   %xmm0, %xmm1
> > -       pmovmskb %xmm1, %eax
> > -       incw    %ax
> > -       jnz     L(vec_return_end)
> > -       ret
> > -
> > -       .p2align 4
> > -L(16_to_32_bytes):
> > -       movdqu  (%rdi), %xmm0
> > -       movdqu  (%rsi), %xmm1
> > -       CMPEQ   %xmm0, %xmm1
> > -       pmovmskb %xmm1, %eax
> > -       incw    %ax
> > -       jnz     L(vec_return_begin)
> > -
> > -       movdqu  -16(%rdi, %rdx), %xmm0
> > -       movdqu  -16(%rsi, %rdx), %xmm1
> > -       CMPEQ   %xmm0, %xmm1
> > -       pmovmskb %xmm1, %eax
> > -       incw    %ax
> > -       jnz     L(vec_return_end)
> > -       ret
> > -
> > -
> > -       .p2align 4
> > -L(79bytesormore):
> > -       movdqu  (%rdi), %xmm0
> > -       movdqu  (%rsi), %xmm1
> > -       CMPEQ   %xmm0, %xmm1
> > -       pmovmskb %xmm1, %eax
> > -       incw    %ax
> > -       jnz     L(vec_return_begin)
> > -
> > -
> > -       mov     %rsi, %rcx
> > -       and     $-16, %rsi
> > -       add     $16, %rsi
> > -       sub     %rsi, %rcx
> > -
> > -       sub     %rcx, %rdi
> > -       add     %rcx, %rdx
> > -       test    $0xf, %rdi
> > -       jz      L(2aligned)
> > -
> > -       cmp     $128, %rdx
> > -       ja      L(128bytesormore)
> > -
> > -       .p2align 4,, 6
> > -L(less128bytes):
> > -       movdqu  (%rdi), %xmm1
> > -       CMPEQ   (%rsi), %xmm1
> > -       pmovmskb %xmm1, %eax
> > -       incw    %ax
> > -       jnz     L(vec_return_begin)
> > -
> > -       movdqu  16(%rdi), %xmm1
> > -       CMPEQ   16(%rsi), %xmm1
> > -       pmovmskb %xmm1, %eax
> > -       incw    %ax
> > -       jnz     L(vec_return_begin_16)
> > -
> > -       movdqu  32(%rdi), %xmm1
> > -       CMPEQ   32(%rsi), %xmm1
> > -       pmovmskb %xmm1, %eax
> > -       incw    %ax
> > -       jnz     L(vec_return_begin_32)
> > -
> > -       movdqu  48(%rdi), %xmm1
> > -       CMPEQ   48(%rsi), %xmm1
> > -       pmovmskb %xmm1, %eax
> > -       incw    %ax
> > -       jnz     L(vec_return_begin_48)
> > -
> > -       cmp     $96, %rdx
> > -       jb      L(32_to_64_bytes)
> > -
> > -       addq    $64, %rdi
> > -       addq    $64, %rsi
> > -       subq    $64, %rdx
> > -
> > -       .p2align 4,, 6
> > -L(last_64_bytes):
> > -       movdqu  (%rdi), %xmm1
> > -       CMPEQ   (%rsi), %xmm1
> > -       pmovmskb %xmm1, %eax
> > -       incw    %ax
> > -       jnz     L(vec_return_begin)
> > -
> > -       movdqu  16(%rdi), %xmm1
> > -       CMPEQ   16(%rsi), %xmm1
> > -       pmovmskb %xmm1, %eax
> > -       incw    %ax
> > -       jnz     L(vec_return_begin_16)
> > -
> > -       movdqu  -32(%rdi, %rdx), %xmm0
> > -       movdqu  -32(%rsi, %rdx), %xmm1
> > -       CMPEQ   %xmm0, %xmm1
> > -       pmovmskb %xmm1, %eax
> > -       incw    %ax
> > -       jnz     L(vec_return_end_16)
> > -
> > -       movdqu  -16(%rdi, %rdx), %xmm0
> > -       movdqu  -16(%rsi, %rdx), %xmm1
> > -       CMPEQ   %xmm0, %xmm1
> > -       pmovmskb %xmm1, %eax
> > -       incw    %ax
> > -       jnz     L(vec_return_end)
> > -       ret
> > -
> > -       .p2align 4
> > -L(128bytesormore):
> > -       cmp     $256, %rdx
> > -       ja      L(unaligned_loop)
> > -L(less256bytes):
> > -       movdqu  (%rdi), %xmm1
> > -       CMPEQ   (%rsi), %xmm1
> > -       pmovmskb %xmm1, %eax
> > -       incw    %ax
> > -       jnz     L(vec_return_begin)
> > -
> > -       movdqu  16(%rdi), %xmm1
> > -       CMPEQ   16(%rsi), %xmm1
> > -       pmovmskb %xmm1, %eax
> > -       incw    %ax
> > -       jnz     L(vec_return_begin_16)
> > -
> > -       movdqu  32(%rdi), %xmm1
> > -       CMPEQ   32(%rsi), %xmm1
> > -       pmovmskb %xmm1, %eax
> > -       incw    %ax
> > -       jnz     L(vec_return_begin_32)
> > -
> > -       movdqu  48(%rdi), %xmm1
> > -       CMPEQ   48(%rsi), %xmm1
> > -       pmovmskb %xmm1, %eax
> > -       incw    %ax
> > -       jnz     L(vec_return_begin_48)
> > -
> > -       addq    $64, %rdi
> > -       addq    $64, %rsi
> > -
> > -       movdqu  (%rdi), %xmm1
> > -       CMPEQ   (%rsi), %xmm1
> > -       pmovmskb %xmm1, %eax
> > -       incw    %ax
> > -       jnz     L(vec_return_begin)
> > -
> > -       movdqu  16(%rdi), %xmm1
> > -       CMPEQ   16(%rsi), %xmm1
> > -       pmovmskb %xmm1, %eax
> > -       incw    %ax
> > -       jnz     L(vec_return_begin_16)
> > -
> > -       movdqu  32(%rdi), %xmm1
> > -       CMPEQ   32(%rsi), %xmm1
> > -       pmovmskb %xmm1, %eax
> > -       incw    %ax
> > -       jnz     L(vec_return_begin_32)
> > -
> > -       movdqu  48(%rdi), %xmm1
> > -       CMPEQ   48(%rsi), %xmm1
> > -       pmovmskb %xmm1, %eax
> > -       incw    %ax
> > -       jnz     L(vec_return_begin_48)
> > -
> > -       addq    $-128, %rdx
> > -       subq    $-64, %rsi
> > -       subq    $-64, %rdi
> > -
> > -       cmp     $64, %rdx
> > -       ja      L(less128bytes)
> > -
> > -       cmp     $32, %rdx
> > -       ja      L(last_64_bytes)
> > -
> > -       movdqu  -32(%rdi, %rdx), %xmm0
> > -       movdqu  -32(%rsi, %rdx), %xmm1
> > -       CMPEQ   %xmm0, %xmm1
> > -       pmovmskb %xmm1, %eax
> > -       incw    %ax
> > -       jnz     L(vec_return_end_16)
> > -
> > -       movdqu  -16(%rdi, %rdx), %xmm0
> > -       movdqu  -16(%rsi, %rdx), %xmm1
> > -       CMPEQ   %xmm0, %xmm1
> > -       pmovmskb %xmm1, %eax
> > -       incw    %ax
> > -       jnz     L(vec_return_end)
> > -       ret
> > -
> > -       .p2align 4
> > -L(unaligned_loop):
> > -# ifdef DATA_CACHE_SIZE_HALF
> > -       mov     $DATA_CACHE_SIZE_HALF, %R8_LP
> > -# else
> > -       mov     __x86_data_cache_size_half(%rip), %R8_LP
> > -# endif
> > -       movq    %r8, %r9
> > -       addq    %r8, %r8
> > -       addq    %r9, %r8
> > -       cmpq    %r8, %rdx
> > -       ja      L(L2_L3_cache_unaligned)
> > -       sub     $64, %rdx
> > -       .p2align 4
> > -L(64bytesormore_loop):
> > -       movdqu  (%rdi), %xmm0
> > -       movdqu  16(%rdi), %xmm1
> > -       movdqu  32(%rdi), %xmm2
> > -       movdqu  48(%rdi), %xmm3
> > -
> > -       CMPEQ   (%rsi), %xmm0
> > -       CMPEQ   16(%rsi), %xmm1
> > -       CMPEQ   32(%rsi), %xmm2
> > -       CMPEQ   48(%rsi), %xmm3
> > -
> > -       pand    %xmm0, %xmm1
> > -       pand    %xmm2, %xmm3
> > -       pand    %xmm1, %xmm3
> > -
> > -       pmovmskb %xmm3, %eax
> > -       incw    %ax
> > -       jnz     L(64bytesormore_loop_end)
> > -
> > -       add     $64, %rsi
> > -       add     $64, %rdi
> > -       sub     $64, %rdx
> > -       ja      L(64bytesormore_loop)
> > -
> > -       .p2align 4,, 6
> > -L(loop_tail):
> > -       addq    %rdx, %rdi
> > -       movdqu  (%rdi), %xmm0
> > -       movdqu  16(%rdi), %xmm1
> > -       movdqu  32(%rdi), %xmm2
> > -       movdqu  48(%rdi), %xmm3
> > -
> > -       addq    %rdx, %rsi
> > -       movdqu  (%rsi), %xmm4
> > -       movdqu  16(%rsi), %xmm5
> > -       movdqu  32(%rsi), %xmm6
> > -       movdqu  48(%rsi), %xmm7
> > -
> > -       CMPEQ   %xmm4, %xmm0
> > -       CMPEQ   %xmm5, %xmm1
> > -       CMPEQ   %xmm6, %xmm2
> > -       CMPEQ   %xmm7, %xmm3
> > -
> > -       pand    %xmm0, %xmm1
> > -       pand    %xmm2, %xmm3
> > -       pand    %xmm1, %xmm3
> > -
> > -       pmovmskb %xmm3, %eax
> > -       incw    %ax
> > -       jnz     L(64bytesormore_loop_end)
> > -       ret
> > -
> > -L(L2_L3_cache_unaligned):
> > -       subq    $64, %rdx
> > -       .p2align 4
> > -L(L2_L3_unaligned_128bytes_loop):
> > -       prefetchnta 0x1c0(%rdi)
> > -       prefetchnta 0x1c0(%rsi)
> > -
> > -       movdqu  (%rdi), %xmm0
> > -       movdqu  16(%rdi), %xmm1
> > -       movdqu  32(%rdi), %xmm2
> > -       movdqu  48(%rdi), %xmm3
> > -
> > -       CMPEQ   (%rsi), %xmm0
> > -       CMPEQ   16(%rsi), %xmm1
> > -       CMPEQ   32(%rsi), %xmm2
> > -       CMPEQ   48(%rsi), %xmm3
> > -
> > -       pand    %xmm0, %xmm1
> > -       pand    %xmm2, %xmm3
> > -       pand    %xmm1, %xmm3
> > -
> > -       pmovmskb %xmm3, %eax
> > -       incw    %ax
> > -       jnz     L(64bytesormore_loop_end)
> > -
> > -       add     $64, %rsi
> > -       add     $64, %rdi
> > -       sub     $64, %rdx
> > -       ja      L(L2_L3_unaligned_128bytes_loop)
> > -       jmp     L(loop_tail)
> > -
> > -
> > -       /* This case is for machines which are sensitive for unaligned
> > -        * instructions.  */
> > -       .p2align 4
> > -L(2aligned):
> > -       cmp     $128, %rdx
> > -       ja      L(128bytesormorein2aligned)
> > -L(less128bytesin2aligned):
> > -       movdqa  (%rdi), %xmm1
> > -       CMPEQ   (%rsi), %xmm1
> > -       pmovmskb %xmm1, %eax
> > -       incw    %ax
> > -       jnz     L(vec_return_begin)
> > -
> > -       movdqa  16(%rdi), %xmm1
> > -       CMPEQ   16(%rsi), %xmm1
> > -       pmovmskb %xmm1, %eax
> > -       incw    %ax
> > -       jnz     L(vec_return_begin_16)
> > -
> > -       movdqa  32(%rdi), %xmm1
> > -       CMPEQ   32(%rsi), %xmm1
> > -       pmovmskb %xmm1, %eax
> > -       incw    %ax
> > -       jnz     L(vec_return_begin_32)
> > -
> > -       movdqa  48(%rdi), %xmm1
> > -       CMPEQ   48(%rsi), %xmm1
> > -       pmovmskb %xmm1, %eax
> > -       incw    %ax
> > -       jnz     L(vec_return_begin_48)
> > -
> > -       cmp     $96, %rdx
> > -       jb      L(32_to_64_bytes)
> > -
> > -       addq    $64, %rdi
> > -       addq    $64, %rsi
> > -       subq    $64, %rdx
> > -
> > -       .p2align 4,, 6
> > -L(aligned_last_64_bytes):
> > -       movdqa  (%rdi), %xmm1
> > -       CMPEQ   (%rsi), %xmm1
> > -       pmovmskb %xmm1, %eax
> > -       incw    %ax
> > -       jnz     L(vec_return_begin)
> > -
> > -       movdqa  16(%rdi), %xmm1
> > -       CMPEQ   16(%rsi), %xmm1
> > -       pmovmskb %xmm1, %eax
> > -       incw    %ax
> > -       jnz     L(vec_return_begin_16)
> > -
> > -       movdqu  -32(%rdi, %rdx), %xmm0
> > -       movdqu  -32(%rsi, %rdx), %xmm1
> > -       CMPEQ   %xmm0, %xmm1
> > -       pmovmskb %xmm1, %eax
> > -       incw    %ax
> > -       jnz     L(vec_return_end_16)
> > -
> > -       movdqu  -16(%rdi, %rdx), %xmm0
> > -       movdqu  -16(%rsi, %rdx), %xmm1
> > -       CMPEQ   %xmm0, %xmm1
> > -       pmovmskb %xmm1, %eax
> > -       incw    %ax
> > -       jnz     L(vec_return_end)
> > -       ret
> > -
> > -       .p2align 4
> > -L(128bytesormorein2aligned):
> > -       cmp     $256, %rdx
> > -       ja      L(aligned_loop)
> > -L(less256bytesin2alinged):
> > -       movdqa  (%rdi), %xmm1
> > -       CMPEQ   (%rsi), %xmm1
> > -       pmovmskb %xmm1, %eax
> > -       incw    %ax
> > -       jnz     L(vec_return_begin)
> > -
> > -       movdqa  16(%rdi), %xmm1
> > -       CMPEQ   16(%rsi), %xmm1
> > -       pmovmskb %xmm1, %eax
> > -       incw    %ax
> > -       jnz     L(vec_return_begin_16)
> > -
> > -       movdqa  32(%rdi), %xmm1
> > -       CMPEQ   32(%rsi), %xmm1
> > -       pmovmskb %xmm1, %eax
> > -       incw    %ax
> > -       jnz     L(vec_return_begin_32)
> > -
> > -       movdqa  48(%rdi), %xmm1
> > -       CMPEQ   48(%rsi), %xmm1
> > -       pmovmskb %xmm1, %eax
> > -       incw    %ax
> > -       jnz     L(vec_return_begin_48)
> > -
> > -       addq    $64, %rdi
> > -       addq    $64, %rsi
> > -
> > -       movdqa  (%rdi), %xmm1
> > -       CMPEQ   (%rsi), %xmm1
> > -       pmovmskb %xmm1, %eax
> > -       incw    %ax
> > -       jnz     L(vec_return_begin)
> > -
> > -       movdqa  16(%rdi), %xmm1
> > -       CMPEQ   16(%rsi), %xmm1
> > -       pmovmskb %xmm1, %eax
> > -       incw    %ax
> > -       jnz     L(vec_return_begin_16)
> > -
> > -       movdqa  32(%rdi), %xmm1
> > -       CMPEQ   32(%rsi), %xmm1
> > -       pmovmskb %xmm1, %eax
> > -       incw    %ax
> > -       jnz     L(vec_return_begin_32)
> > -
> > -       movdqa  48(%rdi), %xmm1
> > -       CMPEQ   48(%rsi), %xmm1
> > -       pmovmskb %xmm1, %eax
> > -       incw    %ax
> > -       jnz     L(vec_return_begin_48)
> > -
> > -       addq    $-128, %rdx
> > -       subq    $-64, %rsi
> > -       subq    $-64, %rdi
> > -
> > -       cmp     $64, %rdx
> > -       ja      L(less128bytesin2aligned)
> > -
> > -       cmp     $32, %rdx
> > -       ja      L(aligned_last_64_bytes)
> > -
> > -       movdqu  -32(%rdi, %rdx), %xmm0
> > -       movdqu  -32(%rsi, %rdx), %xmm1
> > -       CMPEQ   %xmm0, %xmm1
> > -       pmovmskb %xmm1, %eax
> > -       incw    %ax
> > -       jnz     L(vec_return_end_16)
> > -
> > -       movdqu  -16(%rdi, %rdx), %xmm0
> > -       movdqu  -16(%rsi, %rdx), %xmm1
> > -       CMPEQ   %xmm0, %xmm1
> > -       pmovmskb %xmm1, %eax
> > -       incw    %ax
> > -       jnz     L(vec_return_end)
> > -       ret
> > -
> > -       .p2align 4
> > -L(aligned_loop):
> > -# ifdef DATA_CACHE_SIZE_HALF
> > -       mov     $DATA_CACHE_SIZE_HALF, %R8_LP
> > -# else
> > -       mov     __x86_data_cache_size_half(%rip), %R8_LP
> > -# endif
> > -       movq    %r8, %r9
> > -       addq    %r8, %r8
> > -       addq    %r9, %r8
> > -       cmpq    %r8, %rdx
> > -       ja      L(L2_L3_cache_aligned)
> > -
> > -       sub     $64, %rdx
> > -       .p2align 4
> > -L(64bytesormore_loopin2aligned):
> > -       movdqa  (%rdi), %xmm0
> > -       movdqa  16(%rdi), %xmm1
> > -       movdqa  32(%rdi), %xmm2
> > -       movdqa  48(%rdi), %xmm3
> > -
> > -       CMPEQ   (%rsi), %xmm0
> > -       CMPEQ   16(%rsi), %xmm1
> > -       CMPEQ   32(%rsi), %xmm2
> > -       CMPEQ   48(%rsi), %xmm3
> > -
> > -       pand    %xmm0, %xmm1
> > -       pand    %xmm2, %xmm3
> > -       pand    %xmm1, %xmm3
> > -
> > -       pmovmskb %xmm3, %eax
> > -       incw    %ax
> > -       jnz     L(64bytesormore_loop_end)
> > -       add     $64, %rsi
> > -       add     $64, %rdi
> > -       sub     $64, %rdx
> > -       ja      L(64bytesormore_loopin2aligned)
> > -       jmp     L(loop_tail)
> > -
> > -L(L2_L3_cache_aligned):
> > -       subq    $64, %rdx
> > -       .p2align 4
> > -L(L2_L3_aligned_128bytes_loop):
> > -       prefetchnta 0x1c0(%rdi)
> > -       prefetchnta 0x1c0(%rsi)
> > -       movdqa  (%rdi), %xmm0
> > -       movdqa  16(%rdi), %xmm1
> > -       movdqa  32(%rdi), %xmm2
> > -       movdqa  48(%rdi), %xmm3
> > -
> > -       CMPEQ   (%rsi), %xmm0
> > -       CMPEQ   16(%rsi), %xmm1
> > -       CMPEQ   32(%rsi), %xmm2
> > -       CMPEQ   48(%rsi), %xmm3
> > -
> > -       pand    %xmm0, %xmm1
> > -       pand    %xmm2, %xmm3
> > -       pand    %xmm1, %xmm3
> > -
> > -       pmovmskb %xmm3, %eax
> > -       incw    %ax
> > -       jnz     L(64bytesormore_loop_end)
> > -
> > -       addq    $64, %rsi
> > -       addq    $64, %rdi
> > -       subq    $64, %rdx
> > -       ja      L(L2_L3_aligned_128bytes_loop)
> > -       jmp     L(loop_tail)
> > -
> > -       .p2align 4
> > -L(64bytesormore_loop_end):
> > -       pmovmskb %xmm0, %ecx
> > -       incw    %cx
> > -       jnz     L(loop_end_ret)
> > -
> > -       pmovmskb %xmm1, %ecx
> > -       notw    %cx
> > -       sall    $16, %ecx
> > -       jnz     L(loop_end_ret)
> > -
> > -       pmovmskb %xmm2, %ecx
> > -       notw    %cx
> > -       shlq    $32, %rcx
> > -       jnz     L(loop_end_ret)
> > -
> > -       addq    $48, %rdi
> > -       addq    $48, %rsi
> > -       movq    %rax, %rcx
> > -
> > -       .p2align 4,, 6
> > -L(loop_end_ret):
> > -       bsfq    %rcx, %rcx
> > -# ifdef USE_AS_WMEMCMP
> > -       movl    (%rdi, %rcx), %eax
> > -       xorl    %edx, %edx
> > -       cmpl    (%rsi, %rcx), %eax
> > -       setg    %dl
> > -       leal    -1(%rdx, %rdx), %eax
> > -# else
> > -       movzbl  (%rdi, %rcx), %eax
> > -       movzbl  (%rsi, %rcx), %ecx
> > -       subl    %ecx, %eax
> > -# endif
> > -       ret
> > -END (MEMCMP)
> > -#endif
> > --
> > 2.25.1
> >
>
>
> --
> H.J.

I would like to backport this patch to release branches.
Any comments or objections?

Conflict resolution patch attached.

--Sunil

[-- Attachment #2: 0020-x86-Remove-memcmp-sse4.S.patch --]
[-- Type: application/octet-stream, Size: 20997 bytes --]

From 350ed366ce9b21c817ce280102bd1ea9ff843bab Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Fri, 15 Apr 2022 12:28:00 -0500
Subject: [PATCH 20/26] x86: Remove memcmp-sse4.S

Code didn't actually use any sse4 instructions since `ptest` was
removed in:

commit 2f9062d7171850451e6044ef78d91ff8c017b9c0
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date:   Wed Nov 10 16:18:56 2021 -0600

    x86: Shrink memcmp-sse4.S code size

The new memcmp-sse2 implementation is also faster.

geometric_mean(N=20) of page cross cases SSE2 / SSE4: 0.905

Note there are two regressions preferring SSE2 for Size = 1 and Size =
65.

Size = 1:
size, align0, align1, ret, New Time/Old Time
   1,      1,      1,   0,               1.2
   1,      1,      1,   1,             1.197
   1,      1,      1,  -1,               1.2

This is intentional. Size == 1 is significantly less hot based on
profiles of GCC11 and Python3 than sizes [4, 8] (which is made
hotter).

Python3 Size = 1        -> 13.64%
Python3 Size = [4, 8]   -> 60.92%

GCC11   Size = 1        ->  1.29%
GCC11   Size = [4, 8]   -> 33.86%

size, align0, align1, ret, New Time/Old Time
   4,      4,      4,   0,             0.622
   4,      4,      4,   1,             0.797
   4,      4,      4,  -1,             0.805
   5,      5,      5,   0,             0.623
   5,      5,      5,   1,             0.777
   5,      5,      5,  -1,             0.802
   6,      6,      6,   0,             0.625
   6,      6,      6,   1,             0.813
   6,      6,      6,  -1,             0.788
   7,      7,      7,   0,             0.625
   7,      7,      7,   1,             0.799
   7,      7,      7,  -1,             0.795
   8,      8,      8,   0,             0.625
   8,      8,      8,   1,             0.848
   8,      8,      8,  -1,             0.914
   9,      9,      9,   0,             0.625

Size = 65:
size, align0, align1, ret, New Time/Old Time
  65,      0,      0,   0,             1.103
  65,      0,      0,   1,             1.216
  65,      0,      0,  -1,             1.227
  65,     65,      0,   0,             1.091
  65,      0,     65,   1,              1.19
  65,     65,     65,  -1,             1.215

This is because A) the checks in range [65, 96] are now unrolled 2x
and B) because smaller values <= 16 are now given a hotter path. By
contrast the SSE4 version has a branch for Size = 80. The unrolled
version gets better performance for returns which need both
comparisons.

size, align0, align1, ret, New Time/Old Time
 128,      4,      8,   0,             0.858
 128,      4,      8,   1,             0.879
 128,      4,      8,  -1,             0.888

As well, outside of microbenchmark environments, where branches are not
fully predictable, the branch will have a real cost.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

(cherry picked from commit 7cbc03d03091d5664060924789afe46d30a5477e)
---
 sysdeps/x86_64/multiarch/Makefile          |   2 -
 sysdeps/x86_64/multiarch/ifunc-impl-list.c |   4 -
 sysdeps/x86_64/multiarch/ifunc-memcmp.h    |   4 -
 sysdeps/x86_64/multiarch/memcmp-sse4.S     | 803 ---------------------
 4 files changed, 813 deletions(-)
 delete mode 100644 sysdeps/x86_64/multiarch/memcmp-sse4.S

diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index ecb4dea190..2d8dc26ac2 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -11,7 +11,6 @@ sysdep_routines += \
   memcmp-avx2-movbe-rtm \
   memcmp-evex-movbe \
   memcmp-sse2 \
-  memcmp-sse4 \
   memcmp-ssse3 \
   memcmpeq-avx2 \
   memcmpeq-avx2-rtm \
@@ -178,7 +177,6 @@ sysdep_routines += \
   wmemcmp-avx2-movbe-rtm \
   wmemcmp-evex-movbe \
   wmemcmp-sse2 \
-  wmemcmp-sse4 \
   wmemcmp-ssse3 \
 # sysdep_routines
 endif
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 40cc6cc49e..516f7f10e0 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -96,8 +96,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			       && CPU_FEATURE_USABLE (BMI2)
 			       && CPU_FEATURE_USABLE (MOVBE)),
 			      __memcmp_evex_movbe)
-	      IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSE4_1),
-			      __memcmp_sse4_1)
 	      IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSSE3),
 			      __memcmp_ssse3)
 	      IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_sse2))
@@ -842,8 +840,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			       && CPU_FEATURE_USABLE (BMI2)
 			       && CPU_FEATURE_USABLE (MOVBE)),
 			      __wmemcmp_evex_movbe)
-	      IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSE4_1),
-			      __wmemcmp_sse4_1)
 	      IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSSE3),
 			      __wmemcmp_ssse3)
 	      IFUNC_IMPL_ADD (array, i, wmemcmp, 1, __wmemcmp_sse2))
diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
index cd12613699..4518b0f98c 100644
--- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
@@ -21,7 +21,6 @@
 
 extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe_rtm) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_movbe) attribute_hidden;
@@ -47,9 +46,6 @@ IFUNC_SELECTOR (void)
 	return OPTIMIZE (avx2_movbe);
     }
 
-  if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1))
-    return OPTIMIZE (sse4_1);
-
   if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3))
     return OPTIMIZE (ssse3);
 
diff --git a/sysdeps/x86_64/multiarch/memcmp-sse4.S b/sysdeps/x86_64/multiarch/memcmp-sse4.S
deleted file mode 100644
index cd57c1e2c7..0000000000
--- a/sysdeps/x86_64/multiarch/memcmp-sse4.S
+++ /dev/null
@@ -1,803 +0,0 @@
-/* memcmp with SSE4.1, wmemcmp with SSE4.1
-   Copyright (C) 2010-2022 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#if IS_IN (libc)
-
-# include <sysdep.h>
-
-# ifndef MEMCMP
-#  define MEMCMP	__memcmp_sse4_1
-# endif
-
-#ifdef USE_AS_WMEMCMP
-# define CMPEQ	pcmpeqd
-# define CHAR_SIZE	4
-#else
-# define CMPEQ	pcmpeqb
-# define CHAR_SIZE	1
-#endif
-
-
-/* Warning!
-           wmemcmp has to use SIGNED comparison for elements.
-           memcmp has to use UNSIGNED comparison for elemnts.
-*/
-
-	.section .text.sse4.1,"ax",@progbits
-ENTRY (MEMCMP)
-# ifdef USE_AS_WMEMCMP
-	shl	$2, %RDX_LP
-# elif defined __ILP32__
-	/* Clear the upper 32 bits.  */
-	mov	%edx, %edx
-# endif
-	cmp	$79, %RDX_LP
-	ja	L(79bytesormore)
-
-	cmp	$CHAR_SIZE, %RDX_LP
-	jbe	L(firstbyte)
-
-	/* N in (CHAR_SIZE, 79) bytes.  */
-	cmpl	$32, %edx
-	ja	L(more_32_bytes)
-
-	cmpl	$16, %edx
-	jae	L(16_to_32_bytes)
-
-# ifndef USE_AS_WMEMCMP
-	cmpl	$8, %edx
-	jae	L(8_to_16_bytes)
-
-	cmpl	$4, %edx
-	jb	L(2_to_3_bytes)
-
-	movl	(%rdi), %eax
-	movl	(%rsi), %ecx
-
-	bswap	%eax
-	bswap	%ecx
-
-	shlq	$32, %rax
-	shlq	$32, %rcx
-
-	movl	-4(%rdi, %rdx), %edi
-	movl	-4(%rsi, %rdx), %esi
-
-	bswap	%edi
-	bswap	%esi
-
-	orq	%rdi, %rax
-	orq	%rsi, %rcx
-	subq	%rcx, %rax
-	cmovne	%edx, %eax
-	sbbl	%ecx, %ecx
-	orl	%ecx, %eax
-	ret
-
-	.p2align 4,, 8
-L(2_to_3_bytes):
-	movzwl	(%rdi), %eax
-	movzwl	(%rsi), %ecx
-	shll	$8, %eax
-	shll	$8, %ecx
-	bswap	%eax
-	bswap	%ecx
-	movzbl	-1(%rdi, %rdx), %edi
-	movzbl	-1(%rsi, %rdx), %esi
-	orl	%edi, %eax
-	orl	%esi, %ecx
-	subl	%ecx, %eax
-	ret
-
-	.p2align 4,, 8
-L(8_to_16_bytes):
-	movq	(%rdi), %rax
-	movq	(%rsi), %rcx
-
-	bswap	%rax
-	bswap	%rcx
-
-	subq	%rcx, %rax
-	jne	L(8_to_16_bytes_done)
-
-	movq	-8(%rdi, %rdx), %rax
-	movq	-8(%rsi, %rdx), %rcx
-
-	bswap	%rax
-	bswap	%rcx
-
-	subq	%rcx, %rax
-
-L(8_to_16_bytes_done):
-	cmovne	%edx, %eax
-	sbbl	%ecx, %ecx
-	orl	%ecx, %eax
-	ret
-# else
-	xorl	%eax, %eax
-	movl	(%rdi), %ecx
-	cmpl	(%rsi), %ecx
-	jne	L(8_to_16_bytes_done)
-	movl	4(%rdi), %ecx
-	cmpl	4(%rsi), %ecx
-	jne	L(8_to_16_bytes_done)
-	movl	-4(%rdi, %rdx), %ecx
-	cmpl	-4(%rsi, %rdx), %ecx
-	jne	L(8_to_16_bytes_done)
-	ret
-# endif
-
-	.p2align 4,, 3
-L(ret_zero):
-	xorl	%eax, %eax
-L(zero):
-	ret
-
-	.p2align 4,, 8
-L(firstbyte):
-	jb	L(ret_zero)
-# ifdef USE_AS_WMEMCMP
-	xorl	%eax, %eax
-	movl	(%rdi), %ecx
-	cmpl	(%rsi), %ecx
-	je	L(zero)
-L(8_to_16_bytes_done):
-	setg	%al
-	leal	-1(%rax, %rax), %eax
-# else
-	movzbl	(%rdi), %eax
-	movzbl	(%rsi), %ecx
-	sub	%ecx, %eax
-# endif
-	ret
-
-	.p2align 4
-L(vec_return_begin_48):
-	addq	$16, %rdi
-	addq	$16, %rsi
-L(vec_return_begin_32):
-	bsfl	%eax, %eax
-# ifdef USE_AS_WMEMCMP
-	movl	32(%rdi, %rax), %ecx
-	xorl	%edx, %edx
-	cmpl	32(%rsi, %rax), %ecx
-	setg	%dl
-	leal	-1(%rdx, %rdx), %eax
-# else
-	movzbl	32(%rsi, %rax), %ecx
-	movzbl	32(%rdi, %rax), %eax
-	subl	%ecx, %eax
-# endif
-	ret
-
-	.p2align 4
-L(vec_return_begin_16):
-	addq	$16, %rdi
-	addq	$16, %rsi
-L(vec_return_begin):
-	bsfl	%eax, %eax
-# ifdef USE_AS_WMEMCMP
-	movl	(%rdi, %rax), %ecx
-	xorl	%edx, %edx
-	cmpl	(%rsi, %rax), %ecx
-	setg	%dl
-	leal	-1(%rdx, %rdx), %eax
-# else
-	movzbl	(%rsi, %rax), %ecx
-	movzbl	(%rdi, %rax), %eax
-	subl	%ecx, %eax
-# endif
-	ret
-
-	.p2align 4
-L(vec_return_end_16):
-	subl	$16, %edx
-L(vec_return_end):
-	bsfl	%eax, %eax
-	addl	%edx, %eax
-# ifdef USE_AS_WMEMCMP
-	movl	-16(%rdi, %rax), %ecx
-	xorl	%edx, %edx
-	cmpl	-16(%rsi, %rax), %ecx
-	setg	%dl
-	leal	-1(%rdx, %rdx), %eax
-# else
-	movzbl	-16(%rsi, %rax), %ecx
-	movzbl	-16(%rdi, %rax), %eax
-	subl	%ecx, %eax
-# endif
-	ret
-
-	.p2align 4,, 8
-L(more_32_bytes):
-	movdqu	(%rdi), %xmm0
-	movdqu	(%rsi), %xmm1
-	CMPEQ	%xmm0, %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin)
-
-	movdqu	16(%rdi), %xmm0
-	movdqu	16(%rsi), %xmm1
-	CMPEQ	%xmm0, %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin_16)
-
-	cmpl	$64, %edx
-	jbe	L(32_to_64_bytes)
-	movdqu	32(%rdi), %xmm0
-	movdqu	32(%rsi), %xmm1
-	CMPEQ	%xmm0, %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin_32)
-
-	.p2align 4,, 6
-L(32_to_64_bytes):
-	movdqu	-32(%rdi, %rdx), %xmm0
-	movdqu	-32(%rsi, %rdx), %xmm1
-	CMPEQ	%xmm0, %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_end_16)
-
-	movdqu	-16(%rdi, %rdx), %xmm0
-	movdqu	-16(%rsi, %rdx), %xmm1
-	CMPEQ	%xmm0, %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_end)
-	ret
-
-	.p2align 4
-L(16_to_32_bytes):
-	movdqu	(%rdi), %xmm0
-	movdqu	(%rsi), %xmm1
-	CMPEQ	%xmm0, %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin)
-
-	movdqu	-16(%rdi, %rdx), %xmm0
-	movdqu	-16(%rsi, %rdx), %xmm1
-	CMPEQ	%xmm0, %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_end)
-	ret
-
-
-	.p2align 4
-L(79bytesormore):
-	movdqu	(%rdi), %xmm0
-	movdqu	(%rsi), %xmm1
-	CMPEQ	%xmm0, %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin)
-
-
-	mov	%rsi, %rcx
-	and	$-16, %rsi
-	add	$16, %rsi
-	sub	%rsi, %rcx
-
-	sub	%rcx, %rdi
-	add	%rcx, %rdx
-	test	$0xf, %rdi
-	jz	L(2aligned)
-
-	cmp	$128, %rdx
-	ja	L(128bytesormore)
-
-	.p2align 4,, 6
-L(less128bytes):
-	movdqu	(%rdi), %xmm1
-	CMPEQ	(%rsi), %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin)
-
-	movdqu	16(%rdi), %xmm1
-	CMPEQ	16(%rsi), %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin_16)
-
-	movdqu	32(%rdi), %xmm1
-	CMPEQ	32(%rsi), %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin_32)
-
-	movdqu	48(%rdi), %xmm1
-	CMPEQ	48(%rsi), %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin_48)
-
-	cmp	$96, %rdx
-	jb	L(32_to_64_bytes)
-
-	addq	$64, %rdi
-	addq	$64, %rsi
-	subq	$64, %rdx
-
-	.p2align 4,, 6
-L(last_64_bytes):
-	movdqu	(%rdi), %xmm1
-	CMPEQ	(%rsi), %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin)
-
-	movdqu	16(%rdi), %xmm1
-	CMPEQ	16(%rsi), %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin_16)
-
-	movdqu	-32(%rdi, %rdx), %xmm0
-	movdqu	-32(%rsi, %rdx), %xmm1
-	CMPEQ	%xmm0, %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_end_16)
-
-	movdqu	-16(%rdi, %rdx), %xmm0
-	movdqu	-16(%rsi, %rdx), %xmm1
-	CMPEQ	%xmm0, %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_end)
-	ret
-
-	.p2align 4
-L(128bytesormore):
-	cmp	$256, %rdx
-	ja	L(unaligned_loop)
-L(less256bytes):
-	movdqu	(%rdi), %xmm1
-	CMPEQ	(%rsi), %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin)
-
-	movdqu	16(%rdi), %xmm1
-	CMPEQ	16(%rsi), %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin_16)
-
-	movdqu	32(%rdi), %xmm1
-	CMPEQ	32(%rsi), %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin_32)
-
-	movdqu	48(%rdi), %xmm1
-	CMPEQ	48(%rsi), %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin_48)
-
-	addq	$64, %rdi
-	addq	$64, %rsi
-
-	movdqu	(%rdi), %xmm1
-	CMPEQ	(%rsi), %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin)
-
-	movdqu	16(%rdi), %xmm1
-	CMPEQ	16(%rsi), %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin_16)
-
-	movdqu	32(%rdi), %xmm1
-	CMPEQ	32(%rsi), %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin_32)
-
-	movdqu	48(%rdi), %xmm1
-	CMPEQ	48(%rsi), %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin_48)
-
-	addq	$-128, %rdx
-	subq	$-64, %rsi
-	subq	$-64, %rdi
-
-	cmp	$64, %rdx
-	ja	L(less128bytes)
-
-	cmp	$32, %rdx
-	ja	L(last_64_bytes)
-
-	movdqu	-32(%rdi, %rdx), %xmm0
-	movdqu	-32(%rsi, %rdx), %xmm1
-	CMPEQ	%xmm0, %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_end_16)
-
-	movdqu	-16(%rdi, %rdx), %xmm0
-	movdqu	-16(%rsi, %rdx), %xmm1
-	CMPEQ	%xmm0, %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_end)
-	ret
-
-	.p2align 4
-L(unaligned_loop):
-# ifdef DATA_CACHE_SIZE_HALF
-	mov	$DATA_CACHE_SIZE_HALF, %R8_LP
-# else
-	mov	__x86_data_cache_size_half(%rip), %R8_LP
-# endif
-	movq	%r8, %r9
-	addq	%r8, %r8
-	addq	%r9, %r8
-	cmpq	%r8, %rdx
-	ja	L(L2_L3_cache_unaligned)
-	sub	$64, %rdx
-	.p2align 4
-L(64bytesormore_loop):
-	movdqu	(%rdi), %xmm0
-	movdqu	16(%rdi), %xmm1
-	movdqu	32(%rdi), %xmm2
-	movdqu	48(%rdi), %xmm3
-
-	CMPEQ	(%rsi), %xmm0
-	CMPEQ	16(%rsi), %xmm1
-	CMPEQ	32(%rsi), %xmm2
-	CMPEQ	48(%rsi), %xmm3
-
-	pand	%xmm0, %xmm1
-	pand	%xmm2, %xmm3
-	pand	%xmm1, %xmm3
-
-	pmovmskb %xmm3, %eax
-	incw	%ax
-	jnz	L(64bytesormore_loop_end)
-
-	add	$64, %rsi
-	add	$64, %rdi
-	sub	$64, %rdx
-	ja	L(64bytesormore_loop)
-
-	.p2align 4,, 6
-L(loop_tail):
-	addq	%rdx, %rdi
-	movdqu	(%rdi), %xmm0
-	movdqu	16(%rdi), %xmm1
-	movdqu	32(%rdi), %xmm2
-	movdqu	48(%rdi), %xmm3
-
-	addq	%rdx, %rsi
-	movdqu	(%rsi), %xmm4
-	movdqu	16(%rsi), %xmm5
-	movdqu	32(%rsi), %xmm6
-	movdqu	48(%rsi), %xmm7
-
-	CMPEQ	%xmm4, %xmm0
-	CMPEQ	%xmm5, %xmm1
-	CMPEQ	%xmm6, %xmm2
-	CMPEQ	%xmm7, %xmm3
-
-	pand	%xmm0, %xmm1
-	pand	%xmm2, %xmm3
-	pand	%xmm1, %xmm3
-
-	pmovmskb %xmm3, %eax
-	incw	%ax
-	jnz	L(64bytesormore_loop_end)
-	ret
-
-L(L2_L3_cache_unaligned):
-	subq	$64, %rdx
-	.p2align 4
-L(L2_L3_unaligned_128bytes_loop):
-	prefetchnta 0x1c0(%rdi)
-	prefetchnta 0x1c0(%rsi)
-
-	movdqu	(%rdi), %xmm0
-	movdqu	16(%rdi), %xmm1
-	movdqu	32(%rdi), %xmm2
-	movdqu	48(%rdi), %xmm3
-
-	CMPEQ	(%rsi), %xmm0
-	CMPEQ	16(%rsi), %xmm1
-	CMPEQ	32(%rsi), %xmm2
-	CMPEQ	48(%rsi), %xmm3
-
-	pand	%xmm0, %xmm1
-	pand	%xmm2, %xmm3
-	pand	%xmm1, %xmm3
-
-	pmovmskb %xmm3, %eax
-	incw	%ax
-	jnz	L(64bytesormore_loop_end)
-
-	add	$64, %rsi
-	add	$64, %rdi
-	sub	$64, %rdx
-	ja	L(L2_L3_unaligned_128bytes_loop)
-	jmp	L(loop_tail)
-
-
-	/* This case is for machines which are sensitive for unaligned
-	 * instructions.  */
-	.p2align 4
-L(2aligned):
-	cmp	$128, %rdx
-	ja	L(128bytesormorein2aligned)
-L(less128bytesin2aligned):
-	movdqa	(%rdi), %xmm1
-	CMPEQ	(%rsi), %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin)
-
-	movdqa	16(%rdi), %xmm1
-	CMPEQ	16(%rsi), %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin_16)
-
-	movdqa	32(%rdi), %xmm1
-	CMPEQ	32(%rsi), %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin_32)
-
-	movdqa	48(%rdi), %xmm1
-	CMPEQ	48(%rsi), %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin_48)
-
-	cmp	$96, %rdx
-	jb	L(32_to_64_bytes)
-
-	addq	$64, %rdi
-	addq	$64, %rsi
-	subq	$64, %rdx
-
-	.p2align 4,, 6
-L(aligned_last_64_bytes):
-	movdqa	(%rdi), %xmm1
-	CMPEQ	(%rsi), %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin)
-
-	movdqa	16(%rdi), %xmm1
-	CMPEQ	16(%rsi), %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin_16)
-
-	movdqu	-32(%rdi, %rdx), %xmm0
-	movdqu	-32(%rsi, %rdx), %xmm1
-	CMPEQ	%xmm0, %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_end_16)
-
-	movdqu	-16(%rdi, %rdx), %xmm0
-	movdqu	-16(%rsi, %rdx), %xmm1
-	CMPEQ	%xmm0, %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_end)
-	ret
-
-	.p2align 4
-L(128bytesormorein2aligned):
-	cmp	$256, %rdx
-	ja	L(aligned_loop)
-L(less256bytesin2alinged):
-	movdqa	(%rdi), %xmm1
-	CMPEQ	(%rsi), %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin)
-
-	movdqa	16(%rdi), %xmm1
-	CMPEQ	16(%rsi), %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin_16)
-
-	movdqa	32(%rdi), %xmm1
-	CMPEQ	32(%rsi), %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin_32)
-
-	movdqa	48(%rdi), %xmm1
-	CMPEQ	48(%rsi), %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin_48)
-
-	addq	$64, %rdi
-	addq	$64, %rsi
-
-	movdqa	(%rdi), %xmm1
-	CMPEQ	(%rsi), %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin)
-
-	movdqa	16(%rdi), %xmm1
-	CMPEQ	16(%rsi), %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin_16)
-
-	movdqa	32(%rdi), %xmm1
-	CMPEQ	32(%rsi), %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin_32)
-
-	movdqa	48(%rdi), %xmm1
-	CMPEQ	48(%rsi), %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_begin_48)
-
-	addq	$-128, %rdx
-	subq	$-64, %rsi
-	subq	$-64, %rdi
-
-	cmp	$64, %rdx
-	ja	L(less128bytesin2aligned)
-
-	cmp	$32, %rdx
-	ja	L(aligned_last_64_bytes)
-
-	movdqu	-32(%rdi, %rdx), %xmm0
-	movdqu	-32(%rsi, %rdx), %xmm1
-	CMPEQ	%xmm0, %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_end_16)
-
-	movdqu	-16(%rdi, %rdx), %xmm0
-	movdqu	-16(%rsi, %rdx), %xmm1
-	CMPEQ	%xmm0, %xmm1
-	pmovmskb %xmm1, %eax
-	incw	%ax
-	jnz	L(vec_return_end)
-	ret
-
-	.p2align 4
-L(aligned_loop):
-# ifdef DATA_CACHE_SIZE_HALF
-	mov	$DATA_CACHE_SIZE_HALF, %R8_LP
-# else
-	mov	__x86_data_cache_size_half(%rip), %R8_LP
-# endif
-	movq	%r8, %r9
-	addq	%r8, %r8
-	addq	%r9, %r8
-	cmpq	%r8, %rdx
-	ja	L(L2_L3_cache_aligned)
-
-	sub	$64, %rdx
-	.p2align 4
-L(64bytesormore_loopin2aligned):
-	movdqa	(%rdi), %xmm0
-	movdqa	16(%rdi), %xmm1
-	movdqa	32(%rdi), %xmm2
-	movdqa	48(%rdi), %xmm3
-
-	CMPEQ	(%rsi), %xmm0
-	CMPEQ	16(%rsi), %xmm1
-	CMPEQ	32(%rsi), %xmm2
-	CMPEQ	48(%rsi), %xmm3
-
-	pand	%xmm0, %xmm1
-	pand	%xmm2, %xmm3
-	pand	%xmm1, %xmm3
-
-	pmovmskb %xmm3, %eax
-	incw	%ax
-	jnz	L(64bytesormore_loop_end)
-	add	$64, %rsi
-	add	$64, %rdi
-	sub	$64, %rdx
-	ja	L(64bytesormore_loopin2aligned)
-	jmp	L(loop_tail)
-
-L(L2_L3_cache_aligned):
-	subq	$64, %rdx
-	.p2align 4
-L(L2_L3_aligned_128bytes_loop):
-	prefetchnta 0x1c0(%rdi)
-	prefetchnta 0x1c0(%rsi)
-	movdqa	(%rdi), %xmm0
-	movdqa	16(%rdi), %xmm1
-	movdqa	32(%rdi), %xmm2
-	movdqa	48(%rdi), %xmm3
-
-	CMPEQ	(%rsi), %xmm0
-	CMPEQ	16(%rsi), %xmm1
-	CMPEQ	32(%rsi), %xmm2
-	CMPEQ	48(%rsi), %xmm3
-
-	pand	%xmm0, %xmm1
-	pand	%xmm2, %xmm3
-	pand	%xmm1, %xmm3
-
-	pmovmskb %xmm3, %eax
-	incw	%ax
-	jnz	L(64bytesormore_loop_end)
-
-	addq	$64, %rsi
-	addq	$64, %rdi
-	subq	$64, %rdx
-	ja	L(L2_L3_aligned_128bytes_loop)
-	jmp	L(loop_tail)
-
-	.p2align 4
-L(64bytesormore_loop_end):
-	pmovmskb %xmm0, %ecx
-	incw	%cx
-	jnz	L(loop_end_ret)
-
-	pmovmskb %xmm1, %ecx
-	notw	%cx
-	sall	$16, %ecx
-	jnz	L(loop_end_ret)
-
-	pmovmskb %xmm2, %ecx
-	notw	%cx
-	shlq	$32, %rcx
-	jnz	L(loop_end_ret)
-
-	addq	$48, %rdi
-	addq	$48, %rsi
-	movq	%rax, %rcx
-
-	.p2align 4,, 6
-L(loop_end_ret):
-	bsfq	%rcx, %rcx
-# ifdef USE_AS_WMEMCMP
-	movl	(%rdi, %rcx), %eax
-	xorl	%edx, %edx
-	cmpl	(%rsi, %rcx), %eax
-	setg	%dl
-	leal	-1(%rdx, %rdx), %eax
-# else
-	movzbl	(%rdi, %rcx), %eax
-	movzbl	(%rsi, %rcx), %ecx
-	subl	%ecx, %eax
-# endif
-	ret
-END (MEMCMP)
-#endif
-- 
2.35.1


^ permalink raw reply	[flat|nested] 3+ messages in thread

* Re: [PATCH v3 3/3] x86: Cleanup page cross code in memcmp-avx2-movbe.S
       [not found]     ` <CAMe9rOofRMrG0Hvjq42r_c-24juXbS+rRb7QXKGjetmSp93Uwg@mail.gmail.com>
@ 2022-05-12 20:03       ` Sunil Pandey
  0 siblings, 0 replies; 3+ messages in thread
From: Sunil Pandey @ 2022-05-12 20:03 UTC (permalink / raw)
  To: H.J. Lu, Libc-stable Mailing List; +Cc: Noah Goldstein, GNU C Library

[-- Attachment #1: Type: text/plain, Size: 10139 bytes --]

On Fri, Apr 15, 2022 at 10:34 AM H.J. Lu via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> On Fri, Apr 15, 2022 at 10:28 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > Old code was both inefficient and wasted code size. New code is smaller
> > (-62 bytes) and has comparable or better performance in the page cross case.
> >
> > geometric_mean(N=20) of page cross cases New / Original: 0.960
> >
> > size, align0, align1, ret, New Time/Old Time
> >    1,   4095,      0,   0,             1.001
> >    1,   4095,      0,   1,             0.999
> >    1,   4095,      0,  -1,               1.0
> >    2,   4094,      0,   0,               1.0
> >    2,   4094,      0,   1,               1.0
> >    2,   4094,      0,  -1,               1.0
> >    3,   4093,      0,   0,               1.0
> >    3,   4093,      0,   1,               1.0
> >    3,   4093,      0,  -1,               1.0
> >    4,   4092,      0,   0,             0.987
> >    4,   4092,      0,   1,               1.0
> >    4,   4092,      0,  -1,               1.0
> >    5,   4091,      0,   0,             0.984
> >    5,   4091,      0,   1,             1.002
> >    5,   4091,      0,  -1,             1.005
> >    6,   4090,      0,   0,             0.993
> >    6,   4090,      0,   1,             1.001
> >    6,   4090,      0,  -1,             1.003
> >    7,   4089,      0,   0,             0.991
> >    7,   4089,      0,   1,               1.0
> >    7,   4089,      0,  -1,             1.001
> >    8,   4088,      0,   0,             0.875
> >    8,   4088,      0,   1,             0.881
> >    8,   4088,      0,  -1,             0.888
> >    9,   4087,      0,   0,             0.872
> >    9,   4087,      0,   1,             0.879
> >    9,   4087,      0,  -1,             0.883
> >   10,   4086,      0,   0,             0.878
> >   10,   4086,      0,   1,             0.886
> >   10,   4086,      0,  -1,             0.873
> >   11,   4085,      0,   0,             0.878
> >   11,   4085,      0,   1,             0.881
> >   11,   4085,      0,  -1,             0.879
> >   12,   4084,      0,   0,             0.873
> >   12,   4084,      0,   1,             0.889
> >   12,   4084,      0,  -1,             0.875
> >   13,   4083,      0,   0,             0.873
> >   13,   4083,      0,   1,             0.863
> >   13,   4083,      0,  -1,             0.863
> >   14,   4082,      0,   0,             0.838
> >   14,   4082,      0,   1,             0.869
> >   14,   4082,      0,  -1,             0.877
> >   15,   4081,      0,   0,             0.841
> >   15,   4081,      0,   1,             0.869
> >   15,   4081,      0,  -1,             0.876
> >   16,   4080,      0,   0,             0.988
> >   16,   4080,      0,   1,              0.99
> >   16,   4080,      0,  -1,             0.989
> >   17,   4079,      0,   0,             0.978
> >   17,   4079,      0,   1,             0.981
> >   17,   4079,      0,  -1,              0.98
> >   18,   4078,      0,   0,             0.981
> >   18,   4078,      0,   1,              0.98
> >   18,   4078,      0,  -1,             0.985
> >   19,   4077,      0,   0,             0.977
> >   19,   4077,      0,   1,             0.979
> >   19,   4077,      0,  -1,             0.986
> >   20,   4076,      0,   0,             0.977
> >   20,   4076,      0,   1,             0.986
> >   20,   4076,      0,  -1,             0.984
> >   21,   4075,      0,   0,             0.977
> >   21,   4075,      0,   1,             0.983
> >   21,   4075,      0,  -1,             0.988
> >   22,   4074,      0,   0,             0.983
> >   22,   4074,      0,   1,             0.994
> >   22,   4074,      0,  -1,             0.993
> >   23,   4073,      0,   0,              0.98
> >   23,   4073,      0,   1,             0.992
> >   23,   4073,      0,  -1,             0.995
> >   24,   4072,      0,   0,             0.989
> >   24,   4072,      0,   1,             0.989
> >   24,   4072,      0,  -1,             0.991
> >   25,   4071,      0,   0,              0.99
> >   25,   4071,      0,   1,             0.999
> >   25,   4071,      0,  -1,             0.996
> >   26,   4070,      0,   0,             0.993
> >   26,   4070,      0,   1,             0.995
> >   26,   4070,      0,  -1,             0.998
> >   27,   4069,      0,   0,             0.993
> >   27,   4069,      0,   1,             0.999
> >   27,   4069,      0,  -1,               1.0
> >   28,   4068,      0,   0,             0.997
> >   28,   4068,      0,   1,               1.0
> >   28,   4068,      0,  -1,             0.999
> >   29,   4067,      0,   0,             0.996
> >   29,   4067,      0,   1,             0.999
> >   29,   4067,      0,  -1,             0.999
> >   30,   4066,      0,   0,             0.991
> >   30,   4066,      0,   1,             1.001
> >   30,   4066,      0,  -1,             0.999
> >   31,   4065,      0,   0,             0.988
> >   31,   4065,      0,   1,             0.998
> >   31,   4065,      0,  -1,             0.998
> > ---
> >  sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S | 98 ++++++++++++--------
> >  1 file changed, 61 insertions(+), 37 deletions(-)
> >
> > diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
> > index a34ea1645d..210c9925b6 100644
> > --- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
> > +++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
> > @@ -429,22 +429,21 @@ L(page_cross_less_vec):
> >  # ifndef USE_AS_WMEMCMP
> >         cmpl    $8, %edx
> >         jae     L(between_8_15)
> > +       /* Fall through for [4, 7].  */
> >         cmpl    $4, %edx
> > -       jae     L(between_4_7)
> > +       jb      L(between_2_3)
> >
> > -       /* Load as big endian to avoid branches.  */
> > -       movzwl  (%rdi), %eax
> > -       movzwl  (%rsi), %ecx
> > -       shll    $8, %eax
> > -       shll    $8, %ecx
> > -       bswap   %eax
> > -       bswap   %ecx
> > -       movzbl  -1(%rdi, %rdx), %edi
> > -       movzbl  -1(%rsi, %rdx), %esi
> > -       orl     %edi, %eax
> > -       orl     %esi, %ecx
> > -       /* Subtraction is okay because the upper 8 bits are zero.  */
> > -       subl    %ecx, %eax
> > +       movbe   (%rdi), %eax
> > +       movbe   (%rsi), %ecx
> > +       shlq    $32, %rax
> > +       shlq    $32, %rcx
> > +       movbe   -4(%rdi, %rdx), %edi
> > +       movbe   -4(%rsi, %rdx), %esi
> > +       orq     %rdi, %rax
> > +       orq     %rsi, %rcx
> > +       subq    %rcx, %rax
> > +       /* Fast path for return zero.  */
> > +       jnz     L(ret_nonzero)
> >         /* No ymm register was touched.  */
> >         ret
> >
> > @@ -457,9 +456,33 @@ L(one_or_less):
> >         /* No ymm register was touched.  */
> >         ret
> >
> > +       .p2align 4,, 5
> > +L(ret_nonzero):
> > +       sbbl    %eax, %eax
> > +       orl     $1, %eax
> > +       /* No ymm register was touched.  */
> > +       ret
> > +
> > +       .p2align 4,, 2
> > +L(zero):
> > +       xorl    %eax, %eax
> > +       /* No ymm register was touched.  */
> > +       ret
> > +
> >         .p2align 4
> >  L(between_8_15):
> > -# endif
> > +       movbe   (%rdi), %rax
> > +       movbe   (%rsi), %rcx
> > +       subq    %rcx, %rax
> > +       jnz     L(ret_nonzero)
> > +       movbe   -8(%rdi, %rdx), %rax
> > +       movbe   -8(%rsi, %rdx), %rcx
> > +       subq    %rcx, %rax
> > +       /* Fast path for return zero.  */
> > +       jnz     L(ret_nonzero)
> > +       /* No ymm register was touched.  */
> > +       ret
> > +# else
> >         /* If USE_AS_WMEMCMP fall through into 8-15 byte case.  */
> >         vmovq   (%rdi), %xmm1
> >         vmovq   (%rsi), %xmm2
> > @@ -475,16 +498,13 @@ L(between_8_15):
> >         VPCMPEQ %xmm1, %xmm2, %xmm2
> >         vpmovmskb %xmm2, %eax
> >         subl    $0xffff, %eax
> > +       /* Fast path for return zero.  */
> >         jnz     L(return_vec_0)
> >         /* No ymm register was touched.  */
> >         ret
> > +# endif
> >
> > -       .p2align 4
> > -L(zero):
> > -       xorl    %eax, %eax
> > -       ret
> > -
> > -       .p2align 4
> > +       .p2align 4,, 10
> >  L(between_16_31):
> >         /* From 16 to 31 bytes.  No branch when size == 16.  */
> >         vmovdqu (%rsi), %xmm2
> > @@ -501,11 +521,17 @@ L(between_16_31):
> >         VPCMPEQ (%rdi), %xmm2, %xmm2
> >         vpmovmskb %xmm2, %eax
> >         subl    $0xffff, %eax
> > +       /* Fast path for return zero.  */
> >         jnz     L(return_vec_0)
> >         /* No ymm register was touched.  */
> >         ret
> >
> >  # ifdef USE_AS_WMEMCMP
> > +       .p2align 4,, 2
> > +L(zero):
> > +       xorl    %eax, %eax
> > +       ret
> > +
> >         .p2align 4
> >  L(one_or_less):
> >         jb      L(zero)
> > @@ -520,22 +546,20 @@ L(one_or_less):
> >  # else
> >
> >         .p2align 4
> > -L(between_4_7):
> > -       /* Load as big endian with overlapping movbe to avoid branches.
> > -        */
> > -       movbe   (%rdi), %eax
> > -       movbe   (%rsi), %ecx
> > -       shlq    $32, %rax
> > -       shlq    $32, %rcx
> > -       movbe   -4(%rdi, %rdx), %edi
> > -       movbe   -4(%rsi, %rdx), %esi
> > -       orq     %rdi, %rax
> > -       orq     %rsi, %rcx
> > -       subq    %rcx, %rax
> > -       jz      L(zero_4_7)
> > -       sbbl    %eax, %eax
> > -       orl     $1, %eax
> > -L(zero_4_7):
> > +L(between_2_3):
> > +       /* Load as big endian to avoid branches.  */
> > +       movzwl  (%rdi), %eax
> > +       movzwl  (%rsi), %ecx
> > +       bswap   %eax
> > +       bswap   %ecx
> > +       shrl    %eax
> > +       shrl    %ecx
> > +       movzbl  -1(%rdi, %rdx), %edi
> > +       movzbl  -1(%rsi, %rdx), %esi
> > +       orl     %edi, %eax
> > +       orl     %esi, %ecx
> > +       /* Subtraction is okay because the upper bit is zero.  */
> > +       subl    %ecx, %eax
> >         /* No ymm register was touched.  */
> >         ret
> >  # endif
> > --
> > 2.25.1
> >
>
> LGTM.
>
> Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
>
> Thanks.
>
> --
> H.J.

I would like to backport this patch to release branches.
Any comments or objections?

Conflict resolution patch attached.

--Sunil

[-- Attachment #2: 0021-x86-Cleanup-page-cross-code-in-memcmp-avx2-movbe.S.patch --]
[-- Type: application/octet-stream, Size: 8202 bytes --]

From 3d74dd4ebc4098bc7f4b91b2794684427c7694dd Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Fri, 15 Apr 2022 12:28:01 -0500
Subject: [PATCH 21/26] x86: Cleanup page cross code in memcmp-avx2-movbe.S

Old code was both inefficient and wasted code size. New code is smaller
(-62 bytes) and has comparable or better performance in the page cross case.

geometric_mean(N=20) of page cross cases New / Original: 0.960

size, align0, align1, ret, New Time/Old Time
   1,   4095,      0,   0,             1.001
   1,   4095,      0,   1,             0.999
   1,   4095,      0,  -1,               1.0
   2,   4094,      0,   0,               1.0
   2,   4094,      0,   1,               1.0
   2,   4094,      0,  -1,               1.0
   3,   4093,      0,   0,               1.0
   3,   4093,      0,   1,               1.0
   3,   4093,      0,  -1,               1.0
   4,   4092,      0,   0,             0.987
   4,   4092,      0,   1,               1.0
   4,   4092,      0,  -1,               1.0
   5,   4091,      0,   0,             0.984
   5,   4091,      0,   1,             1.002
   5,   4091,      0,  -1,             1.005
   6,   4090,      0,   0,             0.993
   6,   4090,      0,   1,             1.001
   6,   4090,      0,  -1,             1.003
   7,   4089,      0,   0,             0.991
   7,   4089,      0,   1,               1.0
   7,   4089,      0,  -1,             1.001
   8,   4088,      0,   0,             0.875
   8,   4088,      0,   1,             0.881
   8,   4088,      0,  -1,             0.888
   9,   4087,      0,   0,             0.872
   9,   4087,      0,   1,             0.879
   9,   4087,      0,  -1,             0.883
  10,   4086,      0,   0,             0.878
  10,   4086,      0,   1,             0.886
  10,   4086,      0,  -1,             0.873
  11,   4085,      0,   0,             0.878
  11,   4085,      0,   1,             0.881
  11,   4085,      0,  -1,             0.879
  12,   4084,      0,   0,             0.873
  12,   4084,      0,   1,             0.889
  12,   4084,      0,  -1,             0.875
  13,   4083,      0,   0,             0.873
  13,   4083,      0,   1,             0.863
  13,   4083,      0,  -1,             0.863
  14,   4082,      0,   0,             0.838
  14,   4082,      0,   1,             0.869
  14,   4082,      0,  -1,             0.877
  15,   4081,      0,   0,             0.841
  15,   4081,      0,   1,             0.869
  15,   4081,      0,  -1,             0.876
  16,   4080,      0,   0,             0.988
  16,   4080,      0,   1,              0.99
  16,   4080,      0,  -1,             0.989
  17,   4079,      0,   0,             0.978
  17,   4079,      0,   1,             0.981
  17,   4079,      0,  -1,              0.98
  18,   4078,      0,   0,             0.981
  18,   4078,      0,   1,              0.98
  18,   4078,      0,  -1,             0.985
  19,   4077,      0,   0,             0.977
  19,   4077,      0,   1,             0.979
  19,   4077,      0,  -1,             0.986
  20,   4076,      0,   0,             0.977
  20,   4076,      0,   1,             0.986
  20,   4076,      0,  -1,             0.984
  21,   4075,      0,   0,             0.977
  21,   4075,      0,   1,             0.983
  21,   4075,      0,  -1,             0.988
  22,   4074,      0,   0,             0.983
  22,   4074,      0,   1,             0.994
  22,   4074,      0,  -1,             0.993
  23,   4073,      0,   0,              0.98
  23,   4073,      0,   1,             0.992
  23,   4073,      0,  -1,             0.995
  24,   4072,      0,   0,             0.989
  24,   4072,      0,   1,             0.989
  24,   4072,      0,  -1,             0.991
  25,   4071,      0,   0,              0.99
  25,   4071,      0,   1,             0.999
  25,   4071,      0,  -1,             0.996
  26,   4070,      0,   0,             0.993
  26,   4070,      0,   1,             0.995
  26,   4070,      0,  -1,             0.998
  27,   4069,      0,   0,             0.993
  27,   4069,      0,   1,             0.999
  27,   4069,      0,  -1,               1.0
  28,   4068,      0,   0,             0.997
  28,   4068,      0,   1,               1.0
  28,   4068,      0,  -1,             0.999
  29,   4067,      0,   0,             0.996
  29,   4067,      0,   1,             0.999
  29,   4067,      0,  -1,             0.999
  30,   4066,      0,   0,             0.991
  30,   4066,      0,   1,             1.001
  30,   4066,      0,  -1,             0.999
  31,   4065,      0,   0,             0.988
  31,   4065,      0,   1,             0.998
  31,   4065,      0,  -1,             0.998
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

(cherry picked from commit 23102686ec67b856a2d4fd25ddaa1c0b8d175c4f)
---
 sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S | 98 ++++++++++++--------
 1 file changed, 61 insertions(+), 37 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
index a34ea1645d..210c9925b6 100644
--- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
+++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
@@ -429,22 +429,21 @@ L(page_cross_less_vec):
 # ifndef USE_AS_WMEMCMP
 	cmpl	$8, %edx
 	jae	L(between_8_15)
+	/* Fall through for [4, 7].  */
 	cmpl	$4, %edx
-	jae	L(between_4_7)
+	jb	L(between_2_3)
 
-	/* Load as big endian to avoid branches.  */
-	movzwl	(%rdi), %eax
-	movzwl	(%rsi), %ecx
-	shll	$8, %eax
-	shll	$8, %ecx
-	bswap	%eax
-	bswap	%ecx
-	movzbl	-1(%rdi, %rdx), %edi
-	movzbl	-1(%rsi, %rdx), %esi
-	orl	%edi, %eax
-	orl	%esi, %ecx
-	/* Subtraction is okay because the upper 8 bits are zero.  */
-	subl	%ecx, %eax
+	movbe	(%rdi), %eax
+	movbe	(%rsi), %ecx
+	shlq	$32, %rax
+	shlq	$32, %rcx
+	movbe	-4(%rdi, %rdx), %edi
+	movbe	-4(%rsi, %rdx), %esi
+	orq	%rdi, %rax
+	orq	%rsi, %rcx
+	subq	%rcx, %rax
+	/* Fast path for return zero.  */
+	jnz	L(ret_nonzero)
 	/* No ymm register was touched.  */
 	ret
 
@@ -457,9 +456,33 @@ L(one_or_less):
 	/* No ymm register was touched.  */
 	ret
 
+	.p2align 4,, 5
+L(ret_nonzero):
+	sbbl	%eax, %eax
+	orl	$1, %eax
+	/* No ymm register was touched.  */
+	ret
+
+	.p2align 4,, 2
+L(zero):
+	xorl	%eax, %eax
+	/* No ymm register was touched.  */
+	ret
+
 	.p2align 4
 L(between_8_15):
-# endif
+	movbe	(%rdi), %rax
+	movbe	(%rsi), %rcx
+	subq	%rcx, %rax
+	jnz	L(ret_nonzero)
+	movbe	-8(%rdi, %rdx), %rax
+	movbe	-8(%rsi, %rdx), %rcx
+	subq	%rcx, %rax
+	/* Fast path for return zero.  */
+	jnz	L(ret_nonzero)
+	/* No ymm register was touched.  */
+	ret
+# else
 	/* If USE_AS_WMEMCMP fall through into 8-15 byte case.  */
 	vmovq	(%rdi), %xmm1
 	vmovq	(%rsi), %xmm2
@@ -475,16 +498,13 @@ L(between_8_15):
 	VPCMPEQ	%xmm1, %xmm2, %xmm2
 	vpmovmskb %xmm2, %eax
 	subl	$0xffff, %eax
+	/* Fast path for return zero.  */
 	jnz	L(return_vec_0)
 	/* No ymm register was touched.  */
 	ret
+# endif
 
-	.p2align 4
-L(zero):
-	xorl	%eax, %eax
-	ret
-
-	.p2align 4
+	.p2align 4,, 10
 L(between_16_31):
 	/* From 16 to 31 bytes.  No branch when size == 16.  */
 	vmovdqu	(%rsi), %xmm2
@@ -501,11 +521,17 @@ L(between_16_31):
 	VPCMPEQ	(%rdi), %xmm2, %xmm2
 	vpmovmskb %xmm2, %eax
 	subl	$0xffff, %eax
+	/* Fast path for return zero.  */
 	jnz	L(return_vec_0)
 	/* No ymm register was touched.  */
 	ret
 
 # ifdef USE_AS_WMEMCMP
+	.p2align 4,, 2
+L(zero):
+	xorl	%eax, %eax
+	ret
+
 	.p2align 4
 L(one_or_less):
 	jb	L(zero)
@@ -520,22 +546,20 @@ L(one_or_less):
 # else
 
 	.p2align 4
-L(between_4_7):
-	/* Load as big endian with overlapping movbe to avoid branches.
-	 */
-	movbe	(%rdi), %eax
-	movbe	(%rsi), %ecx
-	shlq	$32, %rax
-	shlq	$32, %rcx
-	movbe	-4(%rdi, %rdx), %edi
-	movbe	-4(%rsi, %rdx), %esi
-	orq	%rdi, %rax
-	orq	%rsi, %rcx
-	subq	%rcx, %rax
-	jz	L(zero_4_7)
-	sbbl	%eax, %eax
-	orl	$1, %eax
-L(zero_4_7):
+L(between_2_3):
+	/* Load as big endian to avoid branches.  */
+	movzwl	(%rdi), %eax
+	movzwl	(%rsi), %ecx
+	bswap	%eax
+	bswap	%ecx
+	shrl	%eax
+	shrl	%ecx
+	movzbl	-1(%rdi, %rdx), %edi
+	movzbl	-1(%rsi, %rdx), %esi
+	orl	%edi, %eax
+	orl	%esi, %ecx
+	/* Subtraction is okay because the upper bit is zero.  */
+	subl	%ecx, %eax
 	/* No ymm register was touched.  */
 	ret
 # endif
-- 
2.35.1


^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2022-05-12 20:04 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <20220415055132.1257272-1-goldstein.w.n@gmail.com>
     [not found] ` <20220415172801.1525674-1-goldstein.w.n@gmail.com>
     [not found]   ` <CAMe9rOrZGGy6-wRu1V+20bBJUZH_D_aOWzwxTp-BAyZ8ANB7YA@mail.gmail.com>
     [not found]     ` <CAFUsyfLpQLwN=H5Tnhsqn0vK3i=a5LE-G_rV3Z6QxShtkJRm3g@mail.gmail.com>
     [not found]       ` <alpine.DEB.2.22.394.2204192028050.18024@digraph.polyomino.org.uk>
     [not found]         ` <CAFUsyfJt_HOQ1msmj_xSfAYm_7jB-XZg-7jNepXSD-Xd7Ui2nw@mail.gmail.com>
     [not found]           ` <CAFUsyfLOTir+QsJOnhmQiU1n7jw4NP8=TrAecgMbww5HEFG=-g@mail.gmail.com>
     [not found]             ` <CAFUsyfKre7d72QZtAx_E67vP7x-N8fuePorgg5vhe+5HDPQaVA@mail.gmail.com>
2022-05-12 19:59               ` [PATCH v3 1/3] x86: Optimize memcmp SSE2 in memcmp.S Sunil Pandey
     [not found]   ` <20220415172801.1525674-2-goldstein.w.n@gmail.com>
     [not found]     ` <CAMe9rOr2Cr4FRJDY2p7J3HVtDt4VmQAkt-MPtETj-+cCfvUJ0w@mail.gmail.com>
2022-05-12 20:01       ` [PATCH v3 2/3] x86: Remove memcmp-sse4.S Sunil Pandey
     [not found]   ` <20220415172801.1525674-3-goldstein.w.n@gmail.com>
     [not found]     ` <CAMe9rOofRMrG0Hvjq42r_c-24juXbS+rRb7QXKGjetmSp93Uwg@mail.gmail.com>
2022-05-12 20:03       ` [PATCH v3 3/3] x86: Cleanup page cross code in memcmp-avx2-movbe.S Sunil Pandey

This is a public inbox; see the mirroring instructions
for how to clone and mirror all data and code used for this inbox,
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).