public inbox for libc-alpha@sourceware.org
* [PATCH v1] x86: Move strlen SSE2 implementation to multiarch/strlen-sse2.S
@ 2022-07-12 19:29 Noah Goldstein
  2022-07-12 19:29 ` [PATCH v1] x86: Move strcpy SSE2 implementation to multiarch/strcpy-sse2.S Noah Goldstein
                   ` (9 more replies)
  0 siblings, 10 replies; 21+ messages in thread
From: Noah Goldstein @ 2022-07-12 19:29 UTC (permalink / raw)
  To: libc-alpha

This commit doesn't affect libc.so.6; it's just housekeeping to prepare
for adding explicit ISA level support.

Tested build on x86_64 and x86_32 with/without multiarch.
---
 sysdeps/x86_64/multiarch/rtld-strlen.S    |  18 ++
 sysdeps/x86_64/multiarch/rtld-strnlen.S   |  18 ++
 sysdeps/x86_64/multiarch/strlen-sse2.S    | 260 ++++++++++++++++++++-
 sysdeps/x86_64/multiarch/strlen-vec.S     | 267 ----------------------
 sysdeps/x86_64/multiarch/strnlen-sse2.S   |  12 +-
 sysdeps/x86_64/multiarch/wcslen-sse4_1.S  |   4 +-
 sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S |   4 +-
 sysdeps/x86_64/strlen.S                   |   3 +-
 sysdeps/x86_64/strnlen.S                  |   6 +-
 9 files changed, 306 insertions(+), 286 deletions(-)
 create mode 100644 sysdeps/x86_64/multiarch/rtld-strlen.S
 create mode 100644 sysdeps/x86_64/multiarch/rtld-strnlen.S
 delete mode 100644 sysdeps/x86_64/multiarch/strlen-vec.S

diff --git a/sysdeps/x86_64/multiarch/rtld-strlen.S b/sysdeps/x86_64/multiarch/rtld-strlen.S
new file mode 100644
index 0000000000..609d26256e
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/rtld-strlen.S
@@ -0,0 +1,18 @@
+/* Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include "../strlen.S"
diff --git a/sysdeps/x86_64/multiarch/rtld-strnlen.S b/sysdeps/x86_64/multiarch/rtld-strnlen.S
new file mode 100644
index 0000000000..ef2d64abc2
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/rtld-strnlen.S
@@ -0,0 +1,18 @@
+/* Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include "../strnlen.S"
diff --git a/sysdeps/x86_64/multiarch/strlen-sse2.S b/sysdeps/x86_64/multiarch/strlen-sse2.S
index 660b327ed2..5be72267d5 100644
--- a/sysdeps/x86_64/multiarch/strlen-sse2.S
+++ b/sysdeps/x86_64/multiarch/strlen-sse2.S
@@ -16,8 +16,260 @@
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
-#if IS_IN (libc)
-# define strlen __strlen_sse2
-#endif
+#if IS_IN (libc) || defined STRLEN
+
+# ifndef STRLEN
+#  define STRLEN __strlen_sse2
+# endif
+
+
+# include <sysdep.h>
+
+# ifdef AS_WCSLEN
+#  define PMINU		pminud
+#  define PCMPEQ		pcmpeqd
+#  define SHIFT_RETURN	shrq $2, %rax
+# else
+#  define PMINU		pminub
+#  define PCMPEQ		pcmpeqb
+#  define SHIFT_RETURN
+# endif
+
+# ifndef SECTION
+#  define SECTION(p)	p
+# endif
+
+/* Long lived registers in strlen(s), strnlen(s, n) are:
+
+	%xmm3 - zero
+	%rdi   - s
+	%r10  (s+n) & (~(64-1))
+	%r11   s+n
+*/
+
+
+	.section SECTION(.text),"ax",@progbits
+ENTRY(STRLEN)
+
+/* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx.  */
+# define FIND_ZERO	\
+	PCMPEQ	(%rax), %xmm0;	\
+	PCMPEQ	16(%rax), %xmm1;	\
+	PCMPEQ	32(%rax), %xmm2;	\
+	PCMPEQ	48(%rax), %xmm3;	\
+	pmovmskb	%xmm0, %esi;	\
+	pmovmskb	%xmm1, %edx;	\
+	pmovmskb	%xmm2, %r8d;	\
+	pmovmskb	%xmm3, %ecx;	\
+	salq	$16, %rdx;	\
+	salq	$16, %rcx;	\
+	orq	%rsi, %rdx;	\
+	orq	%r8, %rcx;	\
+	salq	$32, %rcx;	\
+	orq	%rcx, %rdx;
+
+# ifdef AS_STRNLEN
+/* Do not read anything when n==0.  */
+	test	%RSI_LP, %RSI_LP
+	jne	L(n_nonzero)
+	xor	%rax, %rax
+	ret
+L(n_nonzero):
+#  ifdef AS_WCSLEN
+/* Check for overflow from maxlen * sizeof(wchar_t). If it would
+   overflow, the only way this program doesn't have undefined behavior
+   is if there is a null terminator in valid memory, so wcslen will
+   suffice.  */
+	mov	%RSI_LP, %R10_LP
+	sar	$62, %R10_LP
+	jnz	__wcslen_sse4_1
+	sal	$2, %RSI_LP
+#  endif
+
+/* Initialize long lived registers.  */
+	add	%RDI_LP, %RSI_LP
+	mov	%RSI_LP, %R10_LP
+	and	$-64, %R10_LP
+	mov	%RSI_LP, %R11_LP
+# endif
+
+	pxor	%xmm0, %xmm0
+	pxor	%xmm1, %xmm1
+	pxor	%xmm2, %xmm2
+	pxor	%xmm3, %xmm3
+	movq	%rdi, %rax
+	movq	%rdi, %rcx
+	andq	$4095, %rcx
+/* Offsets 4032-4047 will be aligned down to 4032 and thus fit into the page.  */
+	cmpq	$4047, %rcx
+/* We cannot unify this branching as it would be ~6 cycles slower.  */
+	ja	L(cross_page)
+
+# ifdef AS_STRNLEN
+/* Test if end is among first 64 bytes.  */
+#  define STRNLEN_PROLOG	\
+	mov	%r11, %rsi;	\
+	subq	%rax, %rsi;	\
+	andq	$-64, %rax;	\
+	testq	$-64, %rsi;	\
+	je	L(strnlen_ret)
+# else
+#  define STRNLEN_PROLOG  andq $-64, %rax;
+# endif
+
+/* Ignore bits in mask that come before start of string.  */
+# define PROLOG(lab)	\
+	movq	%rdi, %rcx;	\
+	xorq	%rax, %rcx;	\
+	STRNLEN_PROLOG;	\
+	sarq	%cl, %rdx;	\
+	test	%rdx, %rdx;	\
+	je	L(lab);	\
+	bsfq	%rdx, %rax;	\
+	SHIFT_RETURN;		\
+	ret
+
+# ifdef AS_STRNLEN
+	andq	$-16, %rax
+	FIND_ZERO
+# else
+	/* Test first 16 bytes unaligned.  */
+	movdqu	(%rax), %xmm4
+	PCMPEQ	%xmm0, %xmm4
+	pmovmskb	%xmm4, %edx
+	test	%edx, %edx
+	je 	L(next48_bytes)
+	bsf	%edx, %eax /* If eax is zeroed 16bit bsf can be used.  */
+	SHIFT_RETURN
+	ret
+
+L(next48_bytes):
+/* Same as FIND_ZERO except we do not check first 16 bytes.  */
+	andq	$-16, %rax
+	PCMPEQ 16(%rax), %xmm1
+	PCMPEQ 32(%rax), %xmm2
+	PCMPEQ 48(%rax), %xmm3
+	pmovmskb	%xmm1, %edx
+	pmovmskb	%xmm2, %r8d
+	pmovmskb	%xmm3, %ecx
+	salq	$16, %rdx
+	salq	$16, %rcx
+	orq	%r8, %rcx
+	salq	$32, %rcx
+	orq	%rcx, %rdx
+# endif
 
-#include "strlen-vec.S"
+	/* When no zero byte is found xmm1-3 are zero so we do not have to
+	   zero them.  */
+	PROLOG(loop)
+
+	.p2align 4
+L(cross_page):
+	andq	$-64, %rax
+	FIND_ZERO
+	PROLOG(loop_init)
+
+# ifdef AS_STRNLEN
+/* We must do this check to correctly handle strnlen (s, -1).  */
+L(strnlen_ret):
+	bts	%rsi, %rdx
+	sarq	%cl, %rdx
+	test	%rdx, %rdx
+	je	L(loop_init)
+	bsfq	%rdx, %rax
+	SHIFT_RETURN
+	ret
+# endif
+	.p2align 4
+L(loop_init):
+	pxor	%xmm1, %xmm1
+	pxor	%xmm2, %xmm2
+	pxor	%xmm3, %xmm3
+# ifdef AS_STRNLEN
+	.p2align 4
+L(loop):
+
+	addq	$64, %rax
+	cmpq	%rax, %r10
+	je	L(exit_end)
+
+	movdqa	(%rax), %xmm0
+	PMINU	16(%rax), %xmm0
+	PMINU	32(%rax), %xmm0
+	PMINU	48(%rax), %xmm0
+	PCMPEQ	%xmm3, %xmm0
+	pmovmskb	%xmm0, %edx
+	testl	%edx, %edx
+	jne	L(exit)
+	jmp	L(loop)
+
+	.p2align 4
+L(exit_end):
+	cmp	%rax, %r11
+	je	L(first) /* Do not read when end is at page boundary.  */
+	pxor	%xmm0, %xmm0
+	FIND_ZERO
+
+L(first):
+	bts	%r11, %rdx
+	bsfq	%rdx, %rdx
+	addq	%rdx, %rax
+	subq	%rdi, %rax
+	SHIFT_RETURN
+	ret
+
+	.p2align 4
+L(exit):
+	pxor	%xmm0, %xmm0
+	FIND_ZERO
+
+	bsfq	%rdx, %rdx
+	addq	%rdx, %rax
+	subq	%rdi, %rax
+	SHIFT_RETURN
+	ret
+
+# else
+
+	/* Main loop.  Unrolled twice to improve L2 cache performance on core2.  */
+	.p2align 4
+L(loop):
+
+	movdqa	64(%rax), %xmm0
+	PMINU	80(%rax), %xmm0
+	PMINU	96(%rax), %xmm0
+	PMINU	112(%rax), %xmm0
+	PCMPEQ	%xmm3, %xmm0
+	pmovmskb	%xmm0, %edx
+	testl	%edx, %edx
+	jne	L(exit64)
+
+	subq	$-128, %rax
+
+	movdqa	(%rax), %xmm0
+	PMINU	16(%rax), %xmm0
+	PMINU	32(%rax), %xmm0
+	PMINU	48(%rax), %xmm0
+	PCMPEQ	%xmm3, %xmm0
+	pmovmskb	%xmm0, %edx
+	testl	%edx, %edx
+	jne	L(exit0)
+	jmp	L(loop)
+
+	.p2align 4
+L(exit64):
+	addq	$64, %rax
+L(exit0):
+	pxor	%xmm0, %xmm0
+	FIND_ZERO
+
+	bsfq	%rdx, %rdx
+	addq	%rdx, %rax
+	subq	%rdi, %rax
+	SHIFT_RETURN
+	ret
+
+# endif
+
+END(STRLEN)
+#endif
diff --git a/sysdeps/x86_64/multiarch/strlen-vec.S b/sysdeps/x86_64/multiarch/strlen-vec.S
deleted file mode 100644
index 874123d604..0000000000
--- a/sysdeps/x86_64/multiarch/strlen-vec.S
+++ /dev/null
@@ -1,267 +0,0 @@
-/* SSE2 version of strlen and SSE4.1 version of wcslen.
-   Copyright (C) 2012-2022 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#include <sysdep.h>
-
-#ifdef AS_WCSLEN
-# define PMINU		pminud
-# define PCMPEQ		pcmpeqd
-# define SHIFT_RETURN	shrq $2, %rax
-#else
-# define PMINU		pminub
-# define PCMPEQ		pcmpeqb
-# define SHIFT_RETURN
-#endif
-
-#ifndef SECTION
-# define SECTION(p)	p
-#endif
-
-/* Long lived register in strlen(s), strnlen(s, n) are:
-
-	%xmm3 - zero
-	%rdi   - s
-	%r10  (s+n) & (~(64-1))
-	%r11   s+n
-*/
-
-
-	.section SECTION(.text),"ax",@progbits
-ENTRY(strlen)
-
-/* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx.  */
-#define FIND_ZERO	\
-	PCMPEQ	(%rax), %xmm0;	\
-	PCMPEQ	16(%rax), %xmm1;	\
-	PCMPEQ	32(%rax), %xmm2;	\
-	PCMPEQ	48(%rax), %xmm3;	\
-	pmovmskb	%xmm0, %esi;	\
-	pmovmskb	%xmm1, %edx;	\
-	pmovmskb	%xmm2, %r8d;	\
-	pmovmskb	%xmm3, %ecx;	\
-	salq	$16, %rdx;	\
-	salq	$16, %rcx;	\
-	orq	%rsi, %rdx;	\
-	orq	%r8, %rcx;	\
-	salq	$32, %rcx;	\
-	orq	%rcx, %rdx;
-
-#ifdef AS_STRNLEN
-/* Do not read anything when n==0.  */
-	test	%RSI_LP, %RSI_LP
-	jne	L(n_nonzero)
-	xor	%rax, %rax
-	ret
-L(n_nonzero):
-# ifdef AS_WCSLEN
-/* Check for overflow from maxlen * sizeof(wchar_t). If it would
-   overflow the only way this program doesn't have undefined behavior
-   is if there is a null terminator in valid memory so wcslen will
-   suffice.  */
-	mov	%RSI_LP, %R10_LP
-	sar	$62, %R10_LP
-	jnz	__wcslen_sse4_1
-	sal	$2, %RSI_LP
-# endif
-
-/* Initialize long lived registers.  */
-	add	%RDI_LP, %RSI_LP
-	mov	%RSI_LP, %R10_LP
-	and	$-64, %R10_LP
-	mov	%RSI_LP, %R11_LP
-#endif
-
-	pxor	%xmm0, %xmm0
-	pxor	%xmm1, %xmm1
-	pxor	%xmm2, %xmm2
-	pxor	%xmm3, %xmm3
-	movq	%rdi, %rax
-	movq	%rdi, %rcx
-	andq	$4095, %rcx
-/* Offsets 4032-4047 will be aligned into 4032 thus fit into page.  */
-	cmpq	$4047, %rcx
-/* We cannot unify this branching as it would be ~6 cycles slower.  */
-	ja	L(cross_page)
-
-#ifdef AS_STRNLEN
-/* Test if end is among first 64 bytes.  */
-# define STRNLEN_PROLOG	\
-	mov	%r11, %rsi;	\
-	subq	%rax, %rsi;	\
-	andq	$-64, %rax;	\
-	testq	$-64, %rsi;	\
-	je	L(strnlen_ret)
-#else
-# define STRNLEN_PROLOG  andq $-64, %rax;
-#endif
-
-/* Ignore bits in mask that come before start of string.  */
-#define PROLOG(lab)	\
-	movq	%rdi, %rcx;	\
-	xorq	%rax, %rcx;	\
-	STRNLEN_PROLOG;	\
-	sarq	%cl, %rdx;	\
-	test	%rdx, %rdx;	\
-	je	L(lab);	\
-	bsfq	%rdx, %rax;	\
-	SHIFT_RETURN;		\
-	ret
-
-#ifdef AS_STRNLEN
-	andq	$-16, %rax
-	FIND_ZERO
-#else
-	/* Test first 16 bytes unaligned.  */
-	movdqu	(%rax), %xmm4
-	PCMPEQ	%xmm0, %xmm4
-	pmovmskb	%xmm4, %edx
-	test	%edx, %edx
-	je 	L(next48_bytes)
-	bsf	%edx, %eax /* If eax is zeroed 16bit bsf can be used.  */
-	SHIFT_RETURN
-	ret
-
-L(next48_bytes):
-/* Same as FIND_ZERO except we do not check first 16 bytes.  */
-	andq	$-16, %rax
-	PCMPEQ 16(%rax), %xmm1
-	PCMPEQ 32(%rax), %xmm2
-	PCMPEQ 48(%rax), %xmm3
-	pmovmskb	%xmm1, %edx
-	pmovmskb	%xmm2, %r8d
-	pmovmskb	%xmm3, %ecx
-	salq	$16, %rdx
-	salq	$16, %rcx
-	orq	%r8, %rcx
-	salq	$32, %rcx
-	orq	%rcx, %rdx
-#endif
-
-	/* When no zero byte is found xmm1-3 are zero so we do not have to
-	   zero them.  */
-	PROLOG(loop)
-
-	.p2align 4
-L(cross_page):
-	andq	$-64, %rax
-	FIND_ZERO
-	PROLOG(loop_init)
-
-#ifdef AS_STRNLEN
-/* We must do this check to correctly handle strnlen (s, -1).  */
-L(strnlen_ret):
-	bts	%rsi, %rdx
-	sarq	%cl, %rdx
-	test	%rdx, %rdx
-	je	L(loop_init)
-	bsfq	%rdx, %rax
-	SHIFT_RETURN
-	ret
-#endif
-	.p2align 4
-L(loop_init):
-	pxor	%xmm1, %xmm1
-	pxor	%xmm2, %xmm2
-	pxor	%xmm3, %xmm3
-#ifdef AS_STRNLEN
-	.p2align 4
-L(loop):
-
-	addq	$64, %rax
-	cmpq	%rax, %r10
-	je	L(exit_end)
-
-	movdqa	(%rax), %xmm0
-	PMINU	16(%rax), %xmm0
-	PMINU	32(%rax), %xmm0
-	PMINU	48(%rax), %xmm0
-	PCMPEQ	%xmm3, %xmm0
-	pmovmskb	%xmm0, %edx
-	testl	%edx, %edx
-	jne	L(exit)
-	jmp	L(loop)
-
-	.p2align 4
-L(exit_end):
-	cmp	%rax, %r11
-	je	L(first) /* Do not read when end is at page boundary.  */
-	pxor	%xmm0, %xmm0
-	FIND_ZERO
-
-L(first):
-	bts	%r11, %rdx
-	bsfq	%rdx, %rdx
-	addq	%rdx, %rax
-	subq	%rdi, %rax
-	SHIFT_RETURN
-	ret
-
-	.p2align 4
-L(exit):
-	pxor	%xmm0, %xmm0
-	FIND_ZERO
-
-	bsfq	%rdx, %rdx
-	addq	%rdx, %rax
-	subq	%rdi, %rax
-	SHIFT_RETURN
-	ret
-
-#else
-
-	/* Main loop.  Unrolled twice to improve L2 cache performance on core2.  */
-	.p2align 4
-L(loop):
-
-	movdqa	64(%rax), %xmm0
-	PMINU	80(%rax), %xmm0
-	PMINU	96(%rax), %xmm0
-	PMINU	112(%rax), %xmm0
-	PCMPEQ	%xmm3, %xmm0
-	pmovmskb	%xmm0, %edx
-	testl	%edx, %edx
-	jne	L(exit64)
-
-	subq	$-128, %rax
-
-	movdqa	(%rax), %xmm0
-	PMINU	16(%rax), %xmm0
-	PMINU	32(%rax), %xmm0
-	PMINU	48(%rax), %xmm0
-	PCMPEQ	%xmm3, %xmm0
-	pmovmskb	%xmm0, %edx
-	testl	%edx, %edx
-	jne	L(exit0)
-	jmp	L(loop)
-
-	.p2align 4
-L(exit64):
-	addq	$64, %rax
-L(exit0):
-	pxor	%xmm0, %xmm0
-	FIND_ZERO
-
-	bsfq	%rdx, %rdx
-	addq	%rdx, %rax
-	subq	%rdi, %rax
-	SHIFT_RETURN
-	ret
-
-#endif
-
-END(strlen)
diff --git a/sysdeps/x86_64/multiarch/strnlen-sse2.S b/sysdeps/x86_64/multiarch/strnlen-sse2.S
index c4f395c210..a50c7d6a28 100644
--- a/sysdeps/x86_64/multiarch/strnlen-sse2.S
+++ b/sysdeps/x86_64/multiarch/strnlen-sse2.S
@@ -17,12 +17,10 @@
    <https://www.gnu.org/licenses/>.  */
 
 #if IS_IN (libc)
-# define __strnlen __strnlen_sse2
-
-# undef weak_alias
-# define weak_alias(__strnlen, strnlen)
-# undef libc_hidden_builtin_def
-# define libc_hidden_builtin_def(strnlen)
+# ifndef STRLEN
+#  define STRLEN	__strnlen_sse2
+# endif
 #endif
 
-#include "../strnlen.S"
+#define AS_STRNLEN
+#include "strlen-sse2.S"
diff --git a/sysdeps/x86_64/multiarch/wcslen-sse4_1.S b/sysdeps/x86_64/multiarch/wcslen-sse4_1.S
index e306a77f51..c88e8342a1 100644
--- a/sysdeps/x86_64/multiarch/wcslen-sse4_1.S
+++ b/sysdeps/x86_64/multiarch/wcslen-sse4_1.S
@@ -1,5 +1,5 @@
 #define AS_WCSLEN
-#define strlen	__wcslen_sse4_1
+#define STRLEN	__wcslen_sse4_1
 #define SECTION(p)	p##.sse4.1
 
-#include "strlen-vec.S"
+#include "strlen-sse2.S"
diff --git a/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S b/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S
index d2f7dd6e22..17cdedc2a9 100644
--- a/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S
+++ b/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S
@@ -1,6 +1,6 @@
 #define AS_WCSLEN
 #define AS_STRNLEN
-#define strlen	__wcsnlen_sse4_1
+#define STRLEN	__wcsnlen_sse4_1
 #define SECTION(p)	p##.sse4.1
 
-#include "strlen-vec.S"
+#include "strlen-sse2.S"
diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
index e1f0b19f2f..c2f5674f8d 100644
--- a/sysdeps/x86_64/strlen.S
+++ b/sysdeps/x86_64/strlen.S
@@ -16,6 +16,7 @@
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
-#include "multiarch/strlen-vec.S"
+#define STRLEN	strlen
+#include "multiarch/strlen-sse2.S"
 
 libc_hidden_builtin_def (strlen)
diff --git a/sysdeps/x86_64/strnlen.S b/sysdeps/x86_64/strnlen.S
index d3c43ac482..174970d58f 100644
--- a/sysdeps/x86_64/strnlen.S
+++ b/sysdeps/x86_64/strnlen.S
@@ -1,6 +1,6 @@
-#define AS_STRNLEN
-#define strlen __strnlen
-#include "strlen.S"
+#define STRLEN __strnlen
+#include "multiarch/strnlen-sse2.S"
 
+libc_hidden_def (__strnlen)
 weak_alias (__strnlen, strnlen);
 libc_hidden_builtin_def (strnlen)
-- 
2.34.1


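The routine moved above tests 64 bytes per iteration: the FIND_ZERO macro compares four 16-byte vectors against zero and packs the four pcmpeqb/pmovmskb results into one 64-bit mask whose lowest set bit gives the offset of the first NUL. A minimal C sketch of that step with SSE2 intrinsics is below; the function name and layout are illustrative only and are not part of the patch.

  #include <emmintrin.h>  /* SSE2 intrinsics.  */
  #include <stdint.h>

  /* Illustrative equivalent of the FIND_ZERO macro: build a 64-bit mask
     for a 16-byte-aligned, 64-byte block in which each set bit marks a
     zero byte.  */
  static inline uint64_t
  find_zero_mask64 (const char *p)
  {
    const __m128i zero = _mm_setzero_si128 ();
    uint64_t m0 = (uint16_t) _mm_movemask_epi8
      (_mm_cmpeq_epi8 (_mm_load_si128 ((const __m128i *) p), zero));
    uint64_t m1 = (uint16_t) _mm_movemask_epi8
      (_mm_cmpeq_epi8 (_mm_load_si128 ((const __m128i *) (p + 16)), zero));
    uint64_t m2 = (uint16_t) _mm_movemask_epi8
      (_mm_cmpeq_epi8 (_mm_load_si128 ((const __m128i *) (p + 32)), zero));
    uint64_t m3 = (uint16_t) _mm_movemask_epi8
      (_mm_cmpeq_epi8 (_mm_load_si128 ((const __m128i *) (p + 48)), zero));
    return m0 | (m1 << 16) | (m2 << 32) | (m3 << 48);
  }

When the mask is non-zero, __builtin_ctzll (mask) gives the offset of the first NUL within the block, which corresponds to the bsfq in the PROLOG macro after it has shifted out the bits belonging to bytes before the start of the string.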

* [PATCH v1] x86: Move strcpy SSE2 implementation to multiarch/strcpy-sse2.S
  2022-07-12 19:29 [PATCH v1] x86: Move strlen SSE2 implementation to multiarch/strlen-sse2.S Noah Goldstein
@ 2022-07-12 19:29 ` Noah Goldstein
  2022-07-12 23:23   ` H.J. Lu
  2022-07-12 19:29 ` [PATCH v1] x86: Move memrchr SSE2 implementation to multiarch/memrchr-sse2.S Noah Goldstein
                   ` (8 subsequent siblings)
  9 siblings, 1 reply; 21+ messages in thread
From: Noah Goldstein @ 2022-07-12 19:29 UTC (permalink / raw)
  To: libc-alpha

This commit doesn't affect libc.so.6; it's just housekeeping to prepare
for adding explicit ISA level support.

Tested build on x86_64 and x86_32 with/without multiarch.
---
 sysdeps/x86_64/multiarch/rtld-stpcpy.S |  18 ++++
 sysdeps/x86_64/multiarch/stpcpy-sse2.S |  15 +--
 sysdeps/x86_64/multiarch/strcpy-sse2.S | 137 ++++++++++++++++++++++--
 sysdeps/x86_64/stpcpy.S                |   3 +-
 sysdeps/x86_64/strcpy.S                | 138 +------------------------
 5 files changed, 156 insertions(+), 155 deletions(-)
 create mode 100644 sysdeps/x86_64/multiarch/rtld-stpcpy.S

diff --git a/sysdeps/x86_64/multiarch/rtld-stpcpy.S b/sysdeps/x86_64/multiarch/rtld-stpcpy.S
new file mode 100644
index 0000000000..914141f07f
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/rtld-stpcpy.S
@@ -0,0 +1,18 @@
+/* Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include "../stpcpy.S"
diff --git a/sysdeps/x86_64/multiarch/stpcpy-sse2.S b/sysdeps/x86_64/multiarch/stpcpy-sse2.S
index 078504a44e..ea9f973af3 100644
--- a/sysdeps/x86_64/multiarch/stpcpy-sse2.S
+++ b/sysdeps/x86_64/multiarch/stpcpy-sse2.S
@@ -17,17 +17,10 @@
    <https://www.gnu.org/licenses/>.  */
 
 #if IS_IN (libc)
-
-# include <sysdep.h>
-# define __stpcpy __stpcpy_sse2
-
-# undef weak_alias
-# define weak_alias(ignored1, ignored2)
-# undef libc_hidden_def
-# define libc_hidden_def(__stpcpy)
-# undef libc_hidden_builtin_def
-# define libc_hidden_builtin_def(stpcpy)
+# ifndef STRCPY
+#  define STRCPY	__stpcpy_sse2
+# endif
 #endif
 
 #define USE_AS_STPCPY
-#include <sysdeps/x86_64/stpcpy.S>
+#include "strcpy-sse2.S"
diff --git a/sysdeps/x86_64/multiarch/strcpy-sse2.S b/sysdeps/x86_64/multiarch/strcpy-sse2.S
index f37967c441..8b5db8b13d 100644
--- a/sysdeps/x86_64/multiarch/strcpy-sse2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-sse2.S
@@ -17,12 +17,137 @@
    <https://www.gnu.org/licenses/>.  */
 
 #if IS_IN (libc)
+# ifndef STRCPY
+#  define STRCPY __strcpy_sse2
+# endif
+#endif
 
-# include <sysdep.h>
-# define strcpy __strcpy_sse2
+#include <sysdep.h>
 
-# undef libc_hidden_builtin_def
-# define libc_hidden_builtin_def(strcpy)
-#endif
+	.text
+ENTRY (STRCPY)
+	movq %rsi, %rcx		/* Source register. */
+	andl $7, %ecx		/* mask alignment bits */
+	movq %rdi, %rdx		/* Duplicate destination pointer.  */
+
+	jz 5f			/* aligned => start loop */
+
+	neg %ecx		/* We need to align to 8 bytes.  */
+	addl $8,%ecx
+	/* Search the first bytes directly.  */
+0:
+	movb	(%rsi), %al	/* Fetch a byte */
+	testb	%al, %al	/* Is it NUL? */
+	movb	%al, (%rdx)	/* Store it */
+	jz	4f		/* If it was NUL, done! */
+	incq	%rsi
+	incq	%rdx
+	decl	%ecx
+	jnz	0b
+
+5:
+	movq $0xfefefefefefefeff,%r8
+
+	/* Now the source is aligned.  Unfortunately we cannot force
+	   both source and destination to be aligned, so ignore the
+	   alignment of the destination.  */
+	.p2align 4
+1:
+	/* 1st unroll.  */
+	movq	(%rsi), %rax	/* Read double word (8 bytes).  */
+	addq	$8, %rsi	/* Adjust pointer for next word.  */
+	movq	%rax, %r9	/* Save a copy for NUL finding.  */
+	addq	%r8, %r9	/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc	3f		/* highest byte is NUL => return pointer */
+	xorq	%rax, %r9	/* (word+magic)^word */
+	orq	%r8, %r9	/* set all non-carry bits */
+	incq	%r9		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+
+	jnz	3f		/* found NUL => return pointer */
+
+	movq	%rax, (%rdx)	/* Write value to destination.  */
+	addq	$8, %rdx	/* Adjust pointer.  */
+
+	/* 2nd unroll.  */
+	movq	(%rsi), %rax	/* Read double word (8 bytes).  */
+	addq	$8, %rsi	/* Adjust pointer for next word.  */
+	movq	%rax, %r9	/* Save a copy for NUL finding.  */
+	addq	%r8, %r9	/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc	3f		/* highest byte is NUL => return pointer */
+	xorq	%rax, %r9	/* (word+magic)^word */
+	orq	%r8, %r9	/* set all non-carry bits */
+	incq	%r9		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+
+	jnz	3f		/* found NUL => return pointer */
 
-#include <sysdeps/x86_64/strcpy.S>
+	movq	%rax, (%rdx)	/* Write value to destination.  */
+	addq	$8, %rdx	/* Adjust pointer.  */
+
+	/* 3rd unroll.  */
+	movq	(%rsi), %rax	/* Read double word (8 bytes).  */
+	addq	$8, %rsi	/* Adjust pointer for next word.  */
+	movq	%rax, %r9	/* Save a copy for NUL finding.  */
+	addq	%r8, %r9	/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc	3f		/* highest byte is NUL => return pointer */
+	xorq	%rax, %r9	/* (word+magic)^word */
+	orq	%r8, %r9	/* set all non-carry bits */
+	incq	%r9		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+
+	jnz	3f		/* found NUL => return pointer */
+
+	movq	%rax, (%rdx)	/* Write value to destination.  */
+	addq	$8, %rdx	/* Adjust pointer.  */
+
+	/* 4th unroll.  */
+	movq	(%rsi), %rax	/* Read double word (8 bytes).  */
+	addq	$8, %rsi	/* Adjust pointer for next word.  */
+	movq	%rax, %r9	/* Save a copy for NUL finding.  */
+	addq	%r8, %r9	/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc	3f		/* highest byte is NUL => return pointer */
+	xorq	%rax, %r9	/* (word+magic)^word */
+	orq	%r8, %r9	/* set all non-carry bits */
+	incq	%r9		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+
+	jnz	3f		/* found NUL => return pointer */
+
+	movq	%rax, (%rdx)	/* Write value to destination.  */
+	addq	$8, %rdx	/* Adjust pointer.  */
+	jmp	1b		/* Next iteration.  */
+
+	/* Do the last few bytes. %rax contains the value to write.
+	   The loop is unrolled twice.  */
+	.p2align 4
+3:
+	/* Note that stpcpy needs to return with the value of the NUL
+	   byte.  */
+	movb	%al, (%rdx)	/* 1st byte.  */
+	testb	%al, %al	/* Is it NUL.  */
+	jz	4f		/* yes, finish.  */
+	incq	%rdx		/* Increment destination.  */
+	movb	%ah, (%rdx)	/* 2nd byte.  */
+	testb	%ah, %ah	/* Is it NUL?.  */
+	jz	4f		/* yes, finish.  */
+	incq	%rdx		/* Increment destination.  */
+	shrq	$16, %rax	/* Shift...  */
+	jmp	3b		/* and look at next two bytes in %rax.  */
+
+4:
+#ifdef USE_AS_STPCPY
+	movq	%rdx, %rax	/* Destination is return value.  */
+#else
+	movq	%rdi, %rax	/* Source is return value.  */
+#endif
+	retq
+END (STRCPY)
diff --git a/sysdeps/x86_64/stpcpy.S b/sysdeps/x86_64/stpcpy.S
index ec23de1416..b097c203dd 100644
--- a/sysdeps/x86_64/stpcpy.S
+++ b/sysdeps/x86_64/stpcpy.S
@@ -1,7 +1,6 @@
-#define USE_AS_STPCPY
 #define STRCPY __stpcpy
 
-#include <sysdeps/x86_64/strcpy.S>
+#include "multiarch/stpcpy-sse2.S"
 
 weak_alias (__stpcpy, stpcpy)
 libc_hidden_def (__stpcpy)
diff --git a/sysdeps/x86_64/strcpy.S b/sysdeps/x86_64/strcpy.S
index 17e8073550..05f19e6e94 100644
--- a/sysdeps/x86_64/strcpy.S
+++ b/sysdeps/x86_64/strcpy.S
@@ -16,140 +16,6 @@
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
-#include <sysdep.h>
-#include "asm-syntax.h"
-
-#ifndef USE_AS_STPCPY
-# define STRCPY strcpy
-#endif
-
-	.text
-ENTRY (STRCPY)
-	movq %rsi, %rcx		/* Source register. */
-	andl $7, %ecx		/* mask alignment bits */
-	movq %rdi, %rdx		/* Duplicate destination pointer.  */
-
-	jz 5f			/* aligned => start loop */
-
-	neg %ecx		/* We need to align to 8 bytes.  */
-	addl $8,%ecx
-	/* Search the first bytes directly.  */
-0:
-	movb	(%rsi), %al	/* Fetch a byte */
-	testb	%al, %al	/* Is it NUL? */
-	movb	%al, (%rdx)	/* Store it */
-	jz	4f		/* If it was NUL, done! */
-	incq	%rsi
-	incq	%rdx
-	decl	%ecx
-	jnz	0b
-
-5:
-	movq $0xfefefefefefefeff,%r8
-
-	/* Now the sources is aligned.  Unfortunatly we cannot force
-	   to have both source and destination aligned, so ignore the
-	   alignment of the destination.  */
-	.p2align 4
-1:
-	/* 1st unroll.  */
-	movq	(%rsi), %rax	/* Read double word (8 bytes).  */
-	addq	$8, %rsi	/* Adjust pointer for next word.  */
-	movq	%rax, %r9	/* Save a copy for NUL finding.  */
-	addq	%r8, %r9	/* add the magic value to the word.  We get
-				   carry bits reported for each byte which
-				   is *not* 0 */
-	jnc	3f		/* highest byte is NUL => return pointer */
-	xorq	%rax, %r9	/* (word+magic)^word */
-	orq	%r8, %r9	/* set all non-carry bits */
-	incq	%r9		/* add 1: if one carry bit was *not* set
-				   the addition will not result in 0.  */
-
-	jnz	3f		/* found NUL => return pointer */
-
-	movq	%rax, (%rdx)	/* Write value to destination.  */
-	addq	$8, %rdx	/* Adjust pointer.  */
-
-	/* 2nd unroll.  */
-	movq	(%rsi), %rax	/* Read double word (8 bytes).  */
-	addq	$8, %rsi	/* Adjust pointer for next word.  */
-	movq	%rax, %r9	/* Save a copy for NUL finding.  */
-	addq	%r8, %r9	/* add the magic value to the word.  We get
-				   carry bits reported for each byte which
-				   is *not* 0 */
-	jnc	3f		/* highest byte is NUL => return pointer */
-	xorq	%rax, %r9	/* (word+magic)^word */
-	orq	%r8, %r9	/* set all non-carry bits */
-	incq	%r9		/* add 1: if one carry bit was *not* set
-				   the addition will not result in 0.  */
-
-	jnz	3f		/* found NUL => return pointer */
-
-	movq	%rax, (%rdx)	/* Write value to destination.  */
-	addq	$8, %rdx	/* Adjust pointer.  */
-
-	/* 3rd unroll.  */
-	movq	(%rsi), %rax	/* Read double word (8 bytes).  */
-	addq	$8, %rsi	/* Adjust pointer for next word.  */
-	movq	%rax, %r9	/* Save a copy for NUL finding.  */
-	addq	%r8, %r9	/* add the magic value to the word.  We get
-				   carry bits reported for each byte which
-				   is *not* 0 */
-	jnc	3f		/* highest byte is NUL => return pointer */
-	xorq	%rax, %r9	/* (word+magic)^word */
-	orq	%r8, %r9	/* set all non-carry bits */
-	incq	%r9		/* add 1: if one carry bit was *not* set
-				   the addition will not result in 0.  */
-
-	jnz	3f		/* found NUL => return pointer */
-
-	movq	%rax, (%rdx)	/* Write value to destination.  */
-	addq	$8, %rdx	/* Adjust pointer.  */
-
-	/* 4th unroll.  */
-	movq	(%rsi), %rax	/* Read double word (8 bytes).  */
-	addq	$8, %rsi	/* Adjust pointer for next word.  */
-	movq	%rax, %r9	/* Save a copy for NUL finding.  */
-	addq	%r8, %r9	/* add the magic value to the word.  We get
-				   carry bits reported for each byte which
-				   is *not* 0 */
-	jnc	3f		/* highest byte is NUL => return pointer */
-	xorq	%rax, %r9	/* (word+magic)^word */
-	orq	%r8, %r9	/* set all non-carry bits */
-	incq	%r9		/* add 1: if one carry bit was *not* set
-				   the addition will not result in 0.  */
-
-	jnz	3f		/* found NUL => return pointer */
-
-	movq	%rax, (%rdx)	/* Write value to destination.  */
-	addq	$8, %rdx	/* Adjust pointer.  */
-	jmp	1b		/* Next iteration.  */
-
-	/* Do the last few bytes. %rax contains the value to write.
-	   The loop is unrolled twice.  */
-	.p2align 4
-3:
-	/* Note that stpcpy needs to return with the value of the NUL
-	   byte.  */
-	movb	%al, (%rdx)	/* 1st byte.  */
-	testb	%al, %al	/* Is it NUL.  */
-	jz	4f		/* yes, finish.  */
-	incq	%rdx		/* Increment destination.  */
-	movb	%ah, (%rdx)	/* 2nd byte.  */
-	testb	%ah, %ah	/* Is it NUL?.  */
-	jz	4f		/* yes, finish.  */
-	incq	%rdx		/* Increment destination.  */
-	shrq	$16, %rax	/* Shift...  */
-	jmp	3b		/* and look at next two bytes in %rax.  */
-
-4:
-#ifdef USE_AS_STPCPY
-	movq	%rdx, %rax	/* Destination is return value.  */
-#else
-	movq	%rdi, %rax	/* Source is return value.  */
-#endif
-	retq
-END (STRCPY)
-#ifndef USE_AS_STPCPY
+#define STRCPY	strcpy
+#include "multiarch/strcpy-sse2.S"
 libc_hidden_builtin_def (strcpy)
-#endif
-- 
2.34.1


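The word-at-a-time loop above detects a NUL inside each 8-byte chunk with the constant 0xfefefefefefefeff, which is -0x0101010101010101 modulo 2^64, so adding it is the same as subtracting 1 from every byte position and watching the per-byte carries. A more familiar formulation of the same zero-byte test is sketched below in C purely to illustrate the principle; it is not a transcription of the exact instruction sequence in the patch.

  #include <stdint.h>

  /* Non-zero exactly when some byte of w is zero; the lowest set 0x80
     bit corresponds to the first (lowest-addressed on little-endian)
     zero byte.  The assembly above uses an equivalent carry-based
     variant built on 0xfefefefefefefeff.  */
  static inline int
  word_has_zero_byte (uint64_t w)
  {
    return ((w - 0x0101010101010101ULL) & ~w & 0x8080808080808080ULL) != 0;
  }

Once the test fires, the tail loop at label 3 stores the remaining bytes of %rax one at a time, which is also what lets stpcpy return a pointer to the copied NUL terminator.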

* [PATCH v1] x86: Move memrchr SSE2 implementation to multiarch/memrchr-sse2.S
  2022-07-12 19:29 [PATCH v1] x86: Move strlen SSE2 implementation to multiarch/strlen-sse2.S Noah Goldstein
  2022-07-12 19:29 ` [PATCH v1] x86: Move strcpy SSE2 implementation to multiarch/strcpy-sse2.S Noah Goldstein
@ 2022-07-12 19:29 ` Noah Goldstein
  2022-07-12 22:58   ` H.J. Lu
  2022-07-12 19:29 ` [PATCH v1] x86: Move strrchr SSE2 implementation to multiarch/strrchr-sse2.S Noah Goldstein
                   ` (7 subsequent siblings)
  9 siblings, 1 reply; 21+ messages in thread
From: Noah Goldstein @ 2022-07-12 19:29 UTC (permalink / raw)
  To: libc-alpha

This commit doesn't affect libc.so.6; it's just housekeeping to prepare
for adding explicit ISA level support.

Tested build on x86_64 and x86_32 with/without multiarch.
---
 sysdeps/x86_64/memrchr.S                | 332 +----------------------
 sysdeps/x86_64/multiarch/memrchr-sse2.S | 336 +++++++++++++++++++++++-
 2 files changed, 334 insertions(+), 334 deletions(-)

diff --git a/sysdeps/x86_64/memrchr.S b/sysdeps/x86_64/memrchr.S
index b0dffd2ae2..385e2c5668 100644
--- a/sysdeps/x86_64/memrchr.S
+++ b/sysdeps/x86_64/memrchr.S
@@ -17,334 +17,6 @@
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
-#include <sysdep.h>
-#define VEC_SIZE			16
-#define PAGE_SIZE			4096
-
-	.text
-ENTRY_P2ALIGN(__memrchr, 6)
-#ifdef __ILP32__
-	/* Clear upper bits.  */
-	mov	%RDX_LP, %RDX_LP
-#endif
-	movd	%esi, %xmm0
-
-	/* Get end pointer.  */
-	leaq	(%rdx, %rdi), %rcx
-
-	punpcklbw %xmm0, %xmm0
-	punpcklwd %xmm0, %xmm0
-	pshufd	$0, %xmm0, %xmm0
-
-	/* Check if we can load 1x VEC without cross a page.  */
-	testl	$(PAGE_SIZE - VEC_SIZE), %ecx
-	jz	L(page_cross)
-
-	/* NB: This load happens regardless of whether rdx (len) is zero. Since
-	   it doesn't cross a page and the standard gurantees any pointer have
-	   at least one-valid byte this load must be safe. For the entire
-	   history of the x86 memrchr implementation this has been possible so
-	   no code "should" be relying on a zero-length check before this load.
-	   The zero-length check is moved to the page cross case because it is
-	   1) pretty cold and including it pushes the hot case len <= VEC_SIZE
-	   into 2-cache lines.  */
-	movups	-(VEC_SIZE)(%rcx), %xmm1
-	pcmpeqb	%xmm0, %xmm1
-	pmovmskb %xmm1, %eax
-
-	subq	$VEC_SIZE, %rdx
-	ja	L(more_1x_vec)
-L(ret_vec_x0_test):
-	/* Zero-flag set if eax (src) is zero. Destination unchanged if src is
-	   zero.  */
-	bsrl	%eax, %eax
-	jz	L(ret_0)
-	/* Check if the CHAR match is in bounds. Need to truly zero `eax` here
-	   if out of bounds.  */
-	addl	%edx, %eax
-	jl	L(zero_0)
-	/* Since we subtracted VEC_SIZE from rdx earlier we can just add to base
-	   ptr.  */
-	addq	%rdi, %rax
-L(ret_0):
-	ret
-
-	.p2align 4,, 5
-L(ret_vec_x0):
-	bsrl	%eax, %eax
-	leaq	-(VEC_SIZE)(%rcx, %rax), %rax
-	ret
-
-	.p2align 4,, 2
-L(zero_0):
-	xorl	%eax, %eax
-	ret
-
-
-	.p2align 4,, 8
-L(more_1x_vec):
-	testl	%eax, %eax
-	jnz	L(ret_vec_x0)
-
-	/* Align rcx (pointer to string).  */
-	decq	%rcx
-	andq	$-VEC_SIZE, %rcx
-
-	movq	%rcx, %rdx
-	/* NB: We could consistenyl save 1-byte in this pattern with `movaps
-	   %xmm0, %xmm1; pcmpeq IMM8(r), %xmm1; ...`. The reason against it is
-	   it adds more frontend uops (even if the moves can be eliminated) and
-	   some percentage of the time actual backend uops.  */
-	movaps	-(VEC_SIZE)(%rcx), %xmm1
-	pcmpeqb	%xmm0, %xmm1
-	subq	%rdi, %rdx
-	pmovmskb %xmm1, %eax
-
-	cmpq	$(VEC_SIZE * 2), %rdx
-	ja	L(more_2x_vec)
-L(last_2x_vec):
-	subl	$VEC_SIZE, %edx
-	jbe	L(ret_vec_x0_test)
-
-	testl	%eax, %eax
-	jnz	L(ret_vec_x0)
-
-	movaps	-(VEC_SIZE * 2)(%rcx), %xmm1
-	pcmpeqb	%xmm0, %xmm1
-	pmovmskb %xmm1, %eax
-
-	subl	$VEC_SIZE, %edx
-	bsrl	%eax, %eax
-	jz	L(ret_1)
-	addl	%edx, %eax
-	jl	L(zero_0)
-	addq	%rdi, %rax
-L(ret_1):
-	ret
-
-	/* Don't align. Otherwise lose 2-byte encoding in jump to L(page_cross)
-	   causes the hot pause (length <= VEC_SIZE) to span multiple cache
-	   lines.  Naturally aligned % 16 to 8-bytes.  */
-L(page_cross):
-	/* Zero length check.  */
-	testq	%rdx, %rdx
-	jz	L(zero_0)
-
-	leaq	-1(%rcx), %r8
-	andq	$-(VEC_SIZE), %r8
-
-	movaps	(%r8), %xmm1
-	pcmpeqb	%xmm0, %xmm1
-	pmovmskb %xmm1, %esi
-	/* Shift out negative alignment (because we are starting from endptr and
-	   working backwards).  */
-	negl	%ecx
-	/* 32-bit shift but VEC_SIZE=16 so need to mask the shift count
-	   explicitly.  */
-	andl	$(VEC_SIZE - 1), %ecx
-	shl	%cl, %esi
-	movzwl	%si, %eax
-	leaq	(%rdi, %rdx), %rcx
-	cmpq	%rdi, %r8
-	ja	L(more_1x_vec)
-	subl	$VEC_SIZE, %edx
-	bsrl	%eax, %eax
-	jz	L(ret_2)
-	addl	%edx, %eax
-	jl	L(zero_1)
-	addq	%rdi, %rax
-L(ret_2):
-	ret
-
-	/* Fits in aliging bytes.  */
-L(zero_1):
-	xorl	%eax, %eax
-	ret
-
-	.p2align 4,, 5
-L(ret_vec_x1):
-	bsrl	%eax, %eax
-	leaq	-(VEC_SIZE * 2)(%rcx, %rax), %rax
-	ret
-
-	.p2align 4,, 8
-L(more_2x_vec):
-	testl	%eax, %eax
-	jnz	L(ret_vec_x0)
-
-	movaps	-(VEC_SIZE * 2)(%rcx), %xmm1
-	pcmpeqb	%xmm0, %xmm1
-	pmovmskb %xmm1, %eax
-	testl	%eax, %eax
-	jnz	L(ret_vec_x1)
-
-
-	movaps	-(VEC_SIZE * 3)(%rcx), %xmm1
-	pcmpeqb	%xmm0, %xmm1
-	pmovmskb %xmm1, %eax
-
-	subq	$(VEC_SIZE * 4), %rdx
-	ja	L(more_4x_vec)
-
-	addl	$(VEC_SIZE), %edx
-	jle	L(ret_vec_x2_test)
-
-L(last_vec):
-	testl	%eax, %eax
-	jnz	L(ret_vec_x2)
-
-	movaps	-(VEC_SIZE * 4)(%rcx), %xmm1
-	pcmpeqb	%xmm0, %xmm1
-	pmovmskb %xmm1, %eax
-
-	subl	$(VEC_SIZE), %edx
-	bsrl	%eax, %eax
-	jz	L(ret_3)
-	addl	%edx, %eax
-	jl	L(zero_2)
-	addq	%rdi, %rax
-L(ret_3):
-	ret
-
-	.p2align 4,, 6
-L(ret_vec_x2_test):
-	bsrl	%eax, %eax
-	jz	L(zero_2)
-	addl	%edx, %eax
-	jl	L(zero_2)
-	addq	%rdi, %rax
-	ret
-
-L(zero_2):
-	xorl	%eax, %eax
-	ret
-
-
-	.p2align 4,, 5
-L(ret_vec_x2):
-	bsrl	%eax, %eax
-	leaq	-(VEC_SIZE * 3)(%rcx, %rax), %rax
-	ret
-
-	.p2align 4,, 5
-L(ret_vec_x3):
-	bsrl	%eax, %eax
-	leaq	-(VEC_SIZE * 4)(%rcx, %rax), %rax
-	ret
-
-	.p2align 4,, 8
-L(more_4x_vec):
-	testl	%eax, %eax
-	jnz	L(ret_vec_x2)
-
-	movaps	-(VEC_SIZE * 4)(%rcx), %xmm1
-	pcmpeqb	%xmm0, %xmm1
-	pmovmskb %xmm1, %eax
-
-	testl	%eax, %eax
-	jnz	L(ret_vec_x3)
-
-	addq	$-(VEC_SIZE * 4), %rcx
-	cmpq	$(VEC_SIZE * 4), %rdx
-	jbe	L(last_4x_vec)
-
-	/* Offset everything by 4x VEC_SIZE here to save a few bytes at the end
-	   keeping the code from spilling to the next cache line.  */
-	addq	$(VEC_SIZE * 4 - 1), %rcx
-	andq	$-(VEC_SIZE * 4), %rcx
-	leaq	(VEC_SIZE * 4)(%rdi), %rdx
-	andq	$-(VEC_SIZE * 4), %rdx
-
-	.p2align 4,, 11
-L(loop_4x_vec):
-	movaps	(VEC_SIZE * -1)(%rcx), %xmm1
-	movaps	(VEC_SIZE * -2)(%rcx), %xmm2
-	movaps	(VEC_SIZE * -3)(%rcx), %xmm3
-	movaps	(VEC_SIZE * -4)(%rcx), %xmm4
-	pcmpeqb	%xmm0, %xmm1
-	pcmpeqb	%xmm0, %xmm2
-	pcmpeqb	%xmm0, %xmm3
-	pcmpeqb	%xmm0, %xmm4
-
-	por	%xmm1, %xmm2
-	por	%xmm3, %xmm4
-	por	%xmm2, %xmm4
-
-	pmovmskb %xmm4, %esi
-	testl	%esi, %esi
-	jnz	L(loop_end)
-
-	addq	$-(VEC_SIZE * 4), %rcx
-	cmpq	%rdx, %rcx
-	jne	L(loop_4x_vec)
-
-	subl	%edi, %edx
-
-	/* Ends up being 1-byte nop.  */
-	.p2align 4,, 2
-L(last_4x_vec):
-	movaps	-(VEC_SIZE)(%rcx), %xmm1
-	pcmpeqb	%xmm0, %xmm1
-	pmovmskb %xmm1, %eax
-
-	cmpl	$(VEC_SIZE * 2), %edx
-	jbe	L(last_2x_vec)
-
-	testl	%eax, %eax
-	jnz	L(ret_vec_x0)
-
-
-	movaps	-(VEC_SIZE * 2)(%rcx), %xmm1
-	pcmpeqb	%xmm0, %xmm1
-	pmovmskb %xmm1, %eax
-
-	testl	%eax, %eax
-	jnz	L(ret_vec_end)
-
-	movaps	-(VEC_SIZE * 3)(%rcx), %xmm1
-	pcmpeqb	%xmm0, %xmm1
-	pmovmskb %xmm1, %eax
-
-	subl	$(VEC_SIZE * 3), %edx
-	ja	L(last_vec)
-	bsrl	%eax, %eax
-	jz	L(ret_4)
-	addl	%edx, %eax
-	jl	L(zero_3)
-	addq	%rdi, %rax
-L(ret_4):
-	ret
-
-	/* Ends up being 1-byte nop.  */
-	.p2align 4,, 3
-L(loop_end):
-	pmovmskb %xmm1, %eax
-	sall	$16, %eax
-	jnz	L(ret_vec_end)
-
-	pmovmskb %xmm2, %eax
-	testl	%eax, %eax
-	jnz	L(ret_vec_end)
-
-	pmovmskb %xmm3, %eax
-	/* Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3)
-	   then it won't affect the result in esi (VEC4). If ecx is non-zero
-	   then CHAR in VEC3 and bsrq will use that position.  */
-	sall	$16, %eax
-	orl	%esi, %eax
-	bsrl	%eax, %eax
-	leaq	-(VEC_SIZE * 4)(%rcx, %rax), %rax
-	ret
-
-L(ret_vec_end):
-	bsrl	%eax, %eax
-	leaq	(VEC_SIZE * -2)(%rax, %rcx), %rax
-	ret
-	/* Use in L(last_4x_vec). In the same cache line. This is just a spare
-	   aligning bytes.  */
-L(zero_3):
-	xorl	%eax, %eax
-	ret
-	/* 2-bytes from next cache line.  */
-END(__memrchr)
+#define MEMRCHR	__memrchr
+#include "multiarch/memrchr-sse2.S"
 weak_alias (__memrchr, memrchr)
diff --git a/sysdeps/x86_64/multiarch/memrchr-sse2.S b/sysdeps/x86_64/multiarch/memrchr-sse2.S
index b04202e171..d92a4022dc 100644
--- a/sysdeps/x86_64/multiarch/memrchr-sse2.S
+++ b/sysdeps/x86_64/multiarch/memrchr-sse2.S
@@ -17,10 +17,338 @@
    <https://www.gnu.org/licenses/>.  */
 
 #if IS_IN (libc)
-# define __memrchr __memrchr_sse2
+# ifndef MEMRCHR
+#  define MEMRCHR __memrchr_sse2
+# endif
+#endif
+
+#include <sysdep.h>
+#define VEC_SIZE			16
+#define PAGE_SIZE			4096
 
-# undef weak_alias
-# define weak_alias(__memrchr, memrchr)
+	.text
+ENTRY_P2ALIGN(MEMRCHR, 6)
+#ifdef __ILP32__
+	/* Clear upper bits.  */
+	mov	%RDX_LP, %RDX_LP
 #endif
+	movd	%esi, %xmm0
+
+	/* Get end pointer.  */
+	leaq	(%rdx, %rdi), %rcx
+
+	punpcklbw %xmm0, %xmm0
+	punpcklwd %xmm0, %xmm0
+	pshufd	$0, %xmm0, %xmm0
+
+	/* Check if we can load 1x VEC without cross a page.  */
+	testl	$(PAGE_SIZE - VEC_SIZE), %ecx
+	jz	L(page_cross)
+
+	/* NB: This load happens regardless of whether rdx (len) is zero. Since
+	   it doesn't cross a page and the standard guarantees any pointer has
+	   at least one valid byte, this load must be safe. For the entire
+	   history of the x86 memrchr implementation this has been possible so
+	   no code "should" be relying on a zero-length check before this load.
+	   The zero-length check is moved to the page cross case because it is
+	   1) pretty cold and including it pushes the hot case len <= VEC_SIZE
+	   into 2 cache lines.  */
+	movups	-(VEC_SIZE)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+
+	subq	$VEC_SIZE, %rdx
+	ja	L(more_1x_vec)
+L(ret_vec_x0_test):
+	/* Zero-flag set if eax (src) is zero. Destination unchanged if src is
+	   zero.  */
+	bsrl	%eax, %eax
+	jz	L(ret_0)
+	/* Check if the CHAR match is in bounds. Need to truly zero `eax` here
+	   if out of bounds.  */
+	addl	%edx, %eax
+	jl	L(zero_0)
+	/* Since we subtracted VEC_SIZE from rdx earlier we can just add to base
+	   ptr.  */
+	addq	%rdi, %rax
+L(ret_0):
+	ret
+
+	.p2align 4,, 5
+L(ret_vec_x0):
+	bsrl	%eax, %eax
+	leaq	-(VEC_SIZE)(%rcx, %rax), %rax
+	ret
+
+	.p2align 4,, 2
+L(zero_0):
+	xorl	%eax, %eax
+	ret
+
+
+	.p2align 4,, 8
+L(more_1x_vec):
+	testl	%eax, %eax
+	jnz	L(ret_vec_x0)
+
+	/* Align rcx (pointer to string).  */
+	decq	%rcx
+	andq	$-VEC_SIZE, %rcx
+
+	movq	%rcx, %rdx
+	/* NB: We could consistently save 1 byte in this pattern with `movaps
+	   %xmm0, %xmm1; pcmpeq IMM8(r), %xmm1; ...`. The reason against it is
+	   it adds more frontend uops (even if the moves can be eliminated) and
+	   some percentage of the time actual backend uops.  */
+	movaps	-(VEC_SIZE)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	subq	%rdi, %rdx
+	pmovmskb %xmm1, %eax
+
+	cmpq	$(VEC_SIZE * 2), %rdx
+	ja	L(more_2x_vec)
+L(last_2x_vec):
+	subl	$VEC_SIZE, %edx
+	jbe	L(ret_vec_x0_test)
+
+	testl	%eax, %eax
+	jnz	L(ret_vec_x0)
+
+	movaps	-(VEC_SIZE * 2)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+
+	subl	$VEC_SIZE, %edx
+	bsrl	%eax, %eax
+	jz	L(ret_1)
+	addl	%edx, %eax
+	jl	L(zero_0)
+	addq	%rdi, %rax
+L(ret_1):
+	ret
+
+	/* Don't align. Otherwise losing the 2-byte encoding of the jump to
+	   L(page_cross) causes the hot path (length <= VEC_SIZE) to span
+	   multiple cache lines.  Naturally aligned % 16 to 8 bytes.  */
+L(page_cross):
+	/* Zero length check.  */
+	testq	%rdx, %rdx
+	jz	L(zero_0)
+
+	leaq	-1(%rcx), %r8
+	andq	$-(VEC_SIZE), %r8
+
+	movaps	(%r8), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	/* Shift out negative alignment (because we are starting from endptr and
+	   working backwards).  */
+	negl	%ecx
+	/* 32-bit shift but VEC_SIZE=16 so need to mask the shift count
+	   explicitly.  */
+	andl	$(VEC_SIZE - 1), %ecx
+	shl	%cl, %esi
+	movzwl	%si, %eax
+	leaq	(%rdi, %rdx), %rcx
+	cmpq	%rdi, %r8
+	ja	L(more_1x_vec)
+	subl	$VEC_SIZE, %edx
+	bsrl	%eax, %eax
+	jz	L(ret_2)
+	addl	%edx, %eax
+	jl	L(zero_1)
+	addq	%rdi, %rax
+L(ret_2):
+	ret
+
+	/* Fits in aligning bytes.  */
+L(zero_1):
+	xorl	%eax, %eax
+	ret
+
+	.p2align 4,, 5
+L(ret_vec_x1):
+	bsrl	%eax, %eax
+	leaq	-(VEC_SIZE * 2)(%rcx, %rax), %rax
+	ret
+
+	.p2align 4,, 8
+L(more_2x_vec):
+	testl	%eax, %eax
+	jnz	L(ret_vec_x0)
+
+	movaps	-(VEC_SIZE * 2)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+	testl	%eax, %eax
+	jnz	L(ret_vec_x1)
+
+
+	movaps	-(VEC_SIZE * 3)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+
+	subq	$(VEC_SIZE * 4), %rdx
+	ja	L(more_4x_vec)
+
+	addl	$(VEC_SIZE), %edx
+	jle	L(ret_vec_x2_test)
+
+L(last_vec):
+	testl	%eax, %eax
+	jnz	L(ret_vec_x2)
+
+	movaps	-(VEC_SIZE * 4)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+
+	subl	$(VEC_SIZE), %edx
+	bsrl	%eax, %eax
+	jz	L(ret_3)
+	addl	%edx, %eax
+	jl	L(zero_2)
+	addq	%rdi, %rax
+L(ret_3):
+	ret
+
+	.p2align 4,, 6
+L(ret_vec_x2_test):
+	bsrl	%eax, %eax
+	jz	L(zero_2)
+	addl	%edx, %eax
+	jl	L(zero_2)
+	addq	%rdi, %rax
+	ret
+
+L(zero_2):
+	xorl	%eax, %eax
+	ret
+
+
+	.p2align 4,, 5
+L(ret_vec_x2):
+	bsrl	%eax, %eax
+	leaq	-(VEC_SIZE * 3)(%rcx, %rax), %rax
+	ret
+
+	.p2align 4,, 5
+L(ret_vec_x3):
+	bsrl	%eax, %eax
+	leaq	-(VEC_SIZE * 4)(%rcx, %rax), %rax
+	ret
+
+	.p2align 4,, 8
+L(more_4x_vec):
+	testl	%eax, %eax
+	jnz	L(ret_vec_x2)
+
+	movaps	-(VEC_SIZE * 4)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+
+	testl	%eax, %eax
+	jnz	L(ret_vec_x3)
+
+	addq	$-(VEC_SIZE * 4), %rcx
+	cmpq	$(VEC_SIZE * 4), %rdx
+	jbe	L(last_4x_vec)
+
+	/* Offset everything by 4x VEC_SIZE here to save a few bytes at the end
+	   keeping the code from spilling to the next cache line.  */
+	addq	$(VEC_SIZE * 4 - 1), %rcx
+	andq	$-(VEC_SIZE * 4), %rcx
+	leaq	(VEC_SIZE * 4)(%rdi), %rdx
+	andq	$-(VEC_SIZE * 4), %rdx
+
+	.p2align 4,, 11
+L(loop_4x_vec):
+	movaps	(VEC_SIZE * -1)(%rcx), %xmm1
+	movaps	(VEC_SIZE * -2)(%rcx), %xmm2
+	movaps	(VEC_SIZE * -3)(%rcx), %xmm3
+	movaps	(VEC_SIZE * -4)(%rcx), %xmm4
+	pcmpeqb	%xmm0, %xmm1
+	pcmpeqb	%xmm0, %xmm2
+	pcmpeqb	%xmm0, %xmm3
+	pcmpeqb	%xmm0, %xmm4
+
+	por	%xmm1, %xmm2
+	por	%xmm3, %xmm4
+	por	%xmm2, %xmm4
+
+	pmovmskb %xmm4, %esi
+	testl	%esi, %esi
+	jnz	L(loop_end)
+
+	addq	$-(VEC_SIZE * 4), %rcx
+	cmpq	%rdx, %rcx
+	jne	L(loop_4x_vec)
+
+	subl	%edi, %edx
+
+	/* Ends up being 1-byte nop.  */
+	.p2align 4,, 2
+L(last_4x_vec):
+	movaps	-(VEC_SIZE)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+
+	cmpl	$(VEC_SIZE * 2), %edx
+	jbe	L(last_2x_vec)
+
+	testl	%eax, %eax
+	jnz	L(ret_vec_x0)
+
+
+	movaps	-(VEC_SIZE * 2)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+
+	testl	%eax, %eax
+	jnz	L(ret_vec_end)
+
+	movaps	-(VEC_SIZE * 3)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+
+	subl	$(VEC_SIZE * 3), %edx
+	ja	L(last_vec)
+	bsrl	%eax, %eax
+	jz	L(ret_4)
+	addl	%edx, %eax
+	jl	L(zero_3)
+	addq	%rdi, %rax
+L(ret_4):
+	ret
+
+	/* Ends up being 1-byte nop.  */
+	.p2align 4,, 3
+L(loop_end):
+	pmovmskb %xmm1, %eax
+	sall	$16, %eax
+	jnz	L(ret_vec_end)
+
+	pmovmskb %xmm2, %eax
+	testl	%eax, %eax
+	jnz	L(ret_vec_end)
+
+	pmovmskb %xmm3, %eax
+	/* Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3)
+	   then it won't affect the result in esi (VEC4). If ecx is non-zero
+	   then CHAR in VEC3 and bsrq will use that position.  */
+	sall	$16, %eax
+	orl	%esi, %eax
+	bsrl	%eax, %eax
+	leaq	-(VEC_SIZE * 4)(%rcx, %rax), %rax
+	ret
 
-#include "../memrchr.S"
+L(ret_vec_end):
+	bsrl	%eax, %eax
+	leaq	(VEC_SIZE * -2)(%rax, %rcx), %rax
+	ret
+	/* Used in L(last_4x_vec). In the same cache line. This is just
+	   spare aligning bytes.  */
+L(zero_3):
+	xorl	%eax, %eax
+	ret
+	/* 2-bytes from next cache line.  */
+END(MEMRCHR)
-- 
2.34.1


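The building block repeated throughout the routine above is: broadcast the search byte into an XMM register (movd + punpcklbw + punpcklwd + pshufd), compare a 16-byte vector with pcmpeqb, and take the highest set bit of the pmovmskb mask with bsr. A sketch of that single step in C with SSE2 intrinsics follows; the name and signature are illustrative, and the real code additionally handles the page-cross case, the length bound, and the 4x-unrolled main loop.

  #include <emmintrin.h>
  #include <stdint.h>

  /* Illustrative single step of a backwards scan: offset (0..15) of the
     last byte equal to c in the 16-byte-aligned block at p, or -1 if
     there is none.  */
  static inline int
  last_match_in_vec (const unsigned char *p, int c)
  {
    __m128i needle = _mm_set1_epi8 ((char) c);
    __m128i eq = _mm_cmpeq_epi8 (_mm_load_si128 ((const __m128i *) p), needle);
    unsigned int mask = (unsigned int) _mm_movemask_epi8 (eq);
    return mask ? 31 - __builtin_clz (mask) : -1;  /* bsr on the mask.  */
  }

In the full routine the resulting mask is also checked against the remaining length (the addl %edx, %eax / jl sequences) so that matches beyond the caller-supplied bound are rejected.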

* [PATCH v1] x86: Move strrchr SSE2 implementation to multiarch/strrchr-sse2.S
  2022-07-12 19:29 [PATCH v1] x86: Move strlen SSE2 implementation to multiarch/strlen-sse2.S Noah Goldstein
  2022-07-12 19:29 ` [PATCH v1] x86: Move strcpy SSE2 implementation to multiarch/strcpy-sse2.S Noah Goldstein
  2022-07-12 19:29 ` [PATCH v1] x86: Move memrchr SSE2 implementation to multiarch/memrchr-sse2.S Noah Goldstein
@ 2022-07-12 19:29 ` Noah Goldstein
  2022-07-12 22:28   ` H.J. Lu
  2022-07-12 19:29 ` [PATCH v1] x86: Move strchr SSE2 implementation to multiarch/strchr-sse2.S Noah Goldstein
                   ` (6 subsequent siblings)
  9 siblings, 1 reply; 21+ messages in thread
From: Noah Goldstein @ 2022-07-12 19:29 UTC (permalink / raw)
  To: libc-alpha

This commit doesn't affect libc.so.6; it's just housekeeping to prepare
for adding explicit ISA level support.

Tested build on x86_64 and x86_32 with/without multiarch.
---
 sysdeps/x86_64/multiarch/strrchr-sse2.S | 358 ++++++++++++++++++++++-
 sysdeps/x86_64/multiarch/wcsrchr-sse2.S |  10 +-
 sysdeps/x86_64/strrchr.S                | 364 +-----------------------
 sysdeps/x86_64/wcsrchr.S                |  11 +-
 4 files changed, 366 insertions(+), 377 deletions(-)
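As context for the diff below: the SSE2 strrchr scans forward 2x16 bytes per iteration, remembering the most recent vectors that contained a match so that, once the NUL is located, the last occurrence of the character can be recovered with bsr. A scalar outline of that strategy, shown only as an illustration (it is not part of the patch):

  #include <stddef.h>

  /* Illustrative scalar outline of the strategy: scan forward, remember
     the most recent occurrence of c, stop at the NUL terminator.  */
  static const char *
  strrchr_outline (const char *s, int c)
  {
    const char *last = NULL;
    for (;; s++)
      {
        if (*s == (char) c)
          last = s;
        if (*s == '\0')
          return last;
      }
  }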

diff --git a/sysdeps/x86_64/multiarch/strrchr-sse2.S b/sysdeps/x86_64/multiarch/strrchr-sse2.S
index 866396e947..6ee7a5e33a 100644
--- a/sysdeps/x86_64/multiarch/strrchr-sse2.S
+++ b/sysdeps/x86_64/multiarch/strrchr-sse2.S
@@ -17,12 +17,358 @@
    <https://www.gnu.org/licenses/>.  */
 
 #if IS_IN (libc)
-# define STRRCHR __strrchr_sse2
+# ifndef STRRCHR
+#  define STRRCHR __strrchr_sse2
+# endif
+#endif
+
+#include <sysdep.h>
+
+#ifdef USE_AS_WCSRCHR
+# define PCMPEQ	pcmpeqd
+# define CHAR_SIZE	4
+# define PMINU	pminud
+#else
+# define PCMPEQ	pcmpeqb
+# define CHAR_SIZE	1
+# define PMINU	pminub
+#endif
+
+#define PAGE_SIZE	4096
+#define VEC_SIZE	16
+
+	.text
+ENTRY(STRRCHR)
+	movd	%esi, %xmm0
+	movq	%rdi, %rax
+	andl	$(PAGE_SIZE - 1), %eax
+#ifndef USE_AS_WCSRCHR
+	punpcklbw %xmm0, %xmm0
+	punpcklwd %xmm0, %xmm0
+#endif
+	pshufd	$0, %xmm0, %xmm0
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	ja	L(cross_page)
+
+L(cross_page_continue):
+	movups	(%rdi), %xmm1
+	pxor	%xmm2, %xmm2
+	PCMPEQ	%xmm1, %xmm2
+	pmovmskb %xmm2, %ecx
+	testl	%ecx, %ecx
+	jz	L(aligned_more)
+
+	PCMPEQ	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+	leal	-1(%rcx), %edx
+	xorl	%edx, %ecx
+	andl	%ecx, %eax
+	jz	L(ret0)
+	bsrl	%eax, %eax
+	addq	%rdi, %rax
+	/* We are off by 3 for wcsrchr if search CHAR is non-zero. If
+	   search CHAR is zero we are correct. Either way `andq
+	   -CHAR_SIZE, %rax` gets the correct result.  */
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+L(ret0):
+	ret
+
+	/* Returns for first vec x1/x2 have hard coded backward search
+	   path for earlier matches.  */
+	.p2align 4
+L(first_vec_x0_test):
+	PCMPEQ	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+	testl	%eax, %eax
+	jz	L(ret0)
+	bsrl	%eax, %eax
+	addq	%r8, %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+	ret
+
+	.p2align 4
+L(first_vec_x1):
+	PCMPEQ	%xmm0, %xmm2
+	pmovmskb %xmm2, %eax
+	leal	-1(%rcx), %edx
+	xorl	%edx, %ecx
+	andl	%ecx, %eax
+	jz	L(first_vec_x0_test)
+	bsrl	%eax, %eax
+	leaq	(VEC_SIZE)(%rdi, %rax), %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+	ret
+
+	.p2align 4
+L(first_vec_x1_test):
+	PCMPEQ	%xmm0, %xmm2
+	pmovmskb %xmm2, %eax
+	testl	%eax, %eax
+	jz	L(first_vec_x0_test)
+	bsrl	%eax, %eax
+	leaq	(VEC_SIZE)(%rdi, %rax), %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+	ret
+
+	.p2align 4
+L(first_vec_x2):
+	PCMPEQ	%xmm0, %xmm3
+	pmovmskb %xmm3, %eax
+	leal	-1(%rcx), %edx
+	xorl	%edx, %ecx
+	andl	%ecx, %eax
+	jz	L(first_vec_x1_test)
+	bsrl	%eax, %eax
+	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+	ret
+
+	.p2align 4
+L(aligned_more):
+	/* Save original pointer if match was in VEC 0.  */
+	movq	%rdi, %r8
+	andq	$-VEC_SIZE, %rdi
+
+	movaps	VEC_SIZE(%rdi), %xmm2
+	pxor	%xmm3, %xmm3
+	PCMPEQ	%xmm2, %xmm3
+	pmovmskb %xmm3, %ecx
+	testl	%ecx, %ecx
+	jnz	L(first_vec_x1)
+
+	movaps	(VEC_SIZE * 2)(%rdi), %xmm3
+	pxor	%xmm4, %xmm4
+	PCMPEQ	%xmm3, %xmm4
+	pmovmskb %xmm4, %ecx
+	testl	%ecx, %ecx
+	jnz	L(first_vec_x2)
+
+	addq	$VEC_SIZE, %rdi
+	/* Save pointer again before realigning.  */
+	movq	%rdi, %rsi
+	andq	$-(VEC_SIZE * 2), %rdi
+	.p2align 4
+L(first_loop):
+	/* Do 2x VEC at a time.  */
+	movaps	(VEC_SIZE * 2)(%rdi), %xmm4
+	movaps	(VEC_SIZE * 3)(%rdi), %xmm5
+	/* Since SSE2 has no pminud, wcsrchr needs separate logic for
+	   detecting zero. Note if this is found to be a bottleneck it
+	   may be worth adding an SSE4.1 wcsrchr implementation.  */
+#ifdef USE_AS_WCSRCHR
+	movaps	%xmm5, %xmm6
+	pxor	%xmm8, %xmm8
+
+	PCMPEQ	%xmm8, %xmm5
+	PCMPEQ	%xmm4, %xmm8
+	por	%xmm5, %xmm8
+#else
+	movaps	%xmm5, %xmm6
+	PMINU	%xmm4, %xmm5
+#endif
+
+	movaps	%xmm4, %xmm9
+	PCMPEQ	%xmm0, %xmm4
+	PCMPEQ	%xmm0, %xmm6
+	movaps	%xmm6, %xmm7
+	por	%xmm4, %xmm6
+#ifndef USE_AS_WCSRCHR
+	pxor	%xmm8, %xmm8
+	PCMPEQ	%xmm5, %xmm8
+#endif
+	pmovmskb %xmm8, %ecx
+	pmovmskb %xmm6, %eax
 
-# undef weak_alias
-# define weak_alias(strrchr, rindex)
-# undef libc_hidden_builtin_def
-# define libc_hidden_builtin_def(strrchr)
+	addq	$(VEC_SIZE * 2), %rdi
+	/* Use `addl` 1) so we can undo it with `subl` and 2) it can
+	   macro-fuse with `jz`.  */
+	addl	%ecx, %eax
+	jz	L(first_loop)
+
+	/* Check if there is zero match.  */
+	testl	%ecx, %ecx
+	jz	L(second_loop_match)
+
+	/* Check if there was a match in last iteration.  */
+	subl	%ecx, %eax
+	jnz	L(new_match)
+
+L(first_loop_old_match):
+	PCMPEQ	%xmm0, %xmm2
+	PCMPEQ	%xmm0, %xmm3
+	pmovmskb %xmm2, %ecx
+	pmovmskb %xmm3, %eax
+	addl	%eax, %ecx
+	jz	L(first_vec_x0_test)
+	/* NB: We could move this shift to before the branch and save a
+	   bit of code size / performance on the fall through. The
+	   branch leads to the null case which generally seems hotter
+	   than char in first 3x VEC.  */
+	sall	$16, %eax
+	orl	%ecx, %eax
+
+	bsrl	%eax, %eax
+	addq	%rsi, %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+	ret
+
+	.p2align 4
+L(new_match):
+	pxor	%xmm6, %xmm6
+	PCMPEQ	%xmm9, %xmm6
+	pmovmskb %xmm6, %eax
+	sall	$16, %ecx
+	orl	%eax, %ecx
+
+	/* We can't reuse either of the old comparisons since we mask
+	   off zeros after the first zero (instead of using the full
+	   comparison), so we can't guarantee no interference between a match
+	   after the end of the string and a valid match.  */
+	pmovmskb %xmm4, %eax
+	pmovmskb %xmm7, %edx
+	sall	$16, %edx
+	orl	%edx, %eax
+
+	leal	-1(%ecx), %edx
+	xorl	%edx, %ecx
+	andl	%ecx, %eax
+	jz	L(first_loop_old_match)
+	bsrl	%eax, %eax
+	addq	%rdi, %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+	ret
+
+	/* Save minimum state for getting most recent match. We can
+	   throw out all previous work.  */
+	.p2align 4
+L(second_loop_match):
+	movq	%rdi, %rsi
+	movaps	%xmm4, %xmm2
+	movaps	%xmm7, %xmm3
+
+	.p2align 4
+L(second_loop):
+	movaps	(VEC_SIZE * 2)(%rdi), %xmm4
+	movaps	(VEC_SIZE * 3)(%rdi), %xmm5
+	/* Since SSE2 has no pminud, wcsrchr needs separate logic for
+	   detecting zero. Note if this is found to be a bottleneck it
+	   may be worth adding an SSE4.1 wcsrchr implementation.  */
+#ifdef USE_AS_WCSRCHR
+	movaps	%xmm5, %xmm6
+	pxor	%xmm8, %xmm8
+
+	PCMPEQ	%xmm8, %xmm5
+	PCMPEQ	%xmm4, %xmm8
+	por	%xmm5, %xmm8
+#else
+	movaps	%xmm5, %xmm6
+	PMINU	%xmm4, %xmm5
+#endif
+
+	movaps	%xmm4, %xmm9
+	PCMPEQ	%xmm0, %xmm4
+	PCMPEQ	%xmm0, %xmm6
+	movaps	%xmm6, %xmm7
+	por	%xmm4, %xmm6
+#ifndef USE_AS_WCSRCHR
+	pxor	%xmm8, %xmm8
+	PCMPEQ	%xmm5, %xmm8
 #endif
 
-#include "../strrchr.S"
+	pmovmskb %xmm8, %ecx
+	pmovmskb %xmm6, %eax
+
+	addq	$(VEC_SIZE * 2), %rdi
+	/* Either null term or new occurrence of CHAR.  */
+	addl	%ecx, %eax
+	jz	L(second_loop)
+
+	/* No null term so it must be a new occurrence of CHAR.  */
+	testl	%ecx, %ecx
+	jz	L(second_loop_match)
+
+
+	subl	%ecx, %eax
+	jnz	L(second_loop_new_match)
+
+L(second_loop_old_match):
+	pmovmskb %xmm2, %ecx
+	pmovmskb %xmm3, %eax
+	sall	$16, %eax
+	orl	%ecx, %eax
+	bsrl	%eax, %eax
+	addq	%rsi, %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+	ret
+
+	.p2align 4
+L(second_loop_new_match):
+	pxor	%xmm6, %xmm6
+	PCMPEQ	%xmm9, %xmm6
+	pmovmskb %xmm6, %eax
+	sall	$16, %ecx
+	orl	%eax, %ecx
+
+	/* We can't reuse either of the old comparisons since we mask
+	   off zeros after the first zero (instead of using the full
+	   comparison), so we can't guarantee no interference between a match
+	   after the end of the string and a valid match.  */
+	pmovmskb %xmm4, %eax
+	pmovmskb %xmm7, %edx
+	sall	$16, %edx
+	orl	%edx, %eax
+
+	leal	-1(%ecx), %edx
+	xorl	%edx, %ecx
+	andl	%ecx, %eax
+	jz	L(second_loop_old_match)
+	bsrl	%eax, %eax
+	addq	%rdi, %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+	ret
+
+	.p2align 4,, 4
+L(cross_page):
+	movq	%rdi, %rsi
+	andq	$-VEC_SIZE, %rsi
+	movaps	(%rsi), %xmm1
+	pxor	%xmm2, %xmm2
+	PCMPEQ	%xmm1, %xmm2
+	pmovmskb %xmm2, %edx
+	movl	%edi, %ecx
+	andl	$(VEC_SIZE - 1), %ecx
+	sarl	%cl, %edx
+	jz	L(cross_page_continue)
+	PCMPEQ	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+	sarl	%cl, %eax
+	leal	-1(%rdx), %ecx
+	xorl	%edx, %ecx
+	andl	%ecx, %eax
+	jz	L(ret1)
+	bsrl	%eax, %eax
+	addq	%rdi, %rax
+#ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+#endif
+L(ret1):
+	ret
+END(STRRCHR)
diff --git a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
index 69d2f3cdb1..d9259720f8 100644
--- a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
+++ b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
@@ -17,6 +17,12 @@
    <https://www.gnu.org/licenses/>.  */
 
 #if IS_IN (libc)
-# define STRRCHR	__wcsrchr_sse2
+# ifndef STRRCHR
+#  define STRRCHR	__wcsrchr_sse2
+# endif
 #endif
-#include "../wcsrchr.S"
+
+#define USE_AS_WCSRCHR	1
+#define NO_PMINU	1
+
+#include "strrchr-sse2.S"
diff --git a/sysdeps/x86_64/strrchr.S b/sysdeps/x86_64/strrchr.S
index 4d7ba4ceb2..f39da60454 100644
--- a/sysdeps/x86_64/strrchr.S
+++ b/sysdeps/x86_64/strrchr.S
@@ -16,363 +16,7 @@
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
-
-#include <sysdep.h>
-
-#ifndef STRRCHR
-# define STRRCHR	strrchr
-#endif
-
-#ifdef USE_AS_WCSRCHR
-# define PCMPEQ	pcmpeqd
-# define CHAR_SIZE	4
-# define PMINU	pminud
-#else
-# define PCMPEQ	pcmpeqb
-# define CHAR_SIZE	1
-# define PMINU	pminub
-#endif
-
-#define PAGE_SIZE	4096
-#define VEC_SIZE	16
-
-	.text
-ENTRY(STRRCHR)
-	movd	%esi, %xmm0
-	movq	%rdi, %rax
-	andl	$(PAGE_SIZE - 1), %eax
-#ifndef USE_AS_WCSRCHR
-	punpcklbw %xmm0, %xmm0
-	punpcklwd %xmm0, %xmm0
-#endif
-	pshufd	$0, %xmm0, %xmm0
-	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
-	ja	L(cross_page)
-
-L(cross_page_continue):
-	movups	(%rdi), %xmm1
-	pxor	%xmm2, %xmm2
-	PCMPEQ	%xmm1, %xmm2
-	pmovmskb %xmm2, %ecx
-	testl	%ecx, %ecx
-	jz	L(aligned_more)
-
-	PCMPEQ	%xmm0, %xmm1
-	pmovmskb %xmm1, %eax
-	leal	-1(%rcx), %edx
-	xorl	%edx, %ecx
-	andl	%ecx, %eax
-	jz	L(ret0)
-	bsrl	%eax, %eax
-	addq	%rdi, %rax
-	/* We are off by 3 for wcsrchr if search CHAR is non-zero. If
-	   search CHAR is zero we are correct. Either way `andq
-	   -CHAR_SIZE, %rax` gets the correct result.  */
-#ifdef USE_AS_WCSRCHR
-	andq	$-CHAR_SIZE, %rax
-#endif
-L(ret0):
-	ret
-
-	/* Returns for first vec x1/x2 have hard coded backward search
-	   path for earlier matches.  */
-	.p2align 4
-L(first_vec_x0_test):
-	PCMPEQ	%xmm0, %xmm1
-	pmovmskb %xmm1, %eax
-	testl	%eax, %eax
-	jz	L(ret0)
-	bsrl	%eax, %eax
-	addq	%r8, %rax
-#ifdef USE_AS_WCSRCHR
-	andq	$-CHAR_SIZE, %rax
-#endif
-	ret
-
-	.p2align 4
-L(first_vec_x1):
-	PCMPEQ	%xmm0, %xmm2
-	pmovmskb %xmm2, %eax
-	leal	-1(%rcx), %edx
-	xorl	%edx, %ecx
-	andl	%ecx, %eax
-	jz	L(first_vec_x0_test)
-	bsrl	%eax, %eax
-	leaq	(VEC_SIZE)(%rdi, %rax), %rax
-#ifdef USE_AS_WCSRCHR
-	andq	$-CHAR_SIZE, %rax
-#endif
-	ret
-
-	.p2align 4
-L(first_vec_x1_test):
-	PCMPEQ	%xmm0, %xmm2
-	pmovmskb %xmm2, %eax
-	testl	%eax, %eax
-	jz	L(first_vec_x0_test)
-	bsrl	%eax, %eax
-	leaq	(VEC_SIZE)(%rdi, %rax), %rax
-#ifdef USE_AS_WCSRCHR
-	andq	$-CHAR_SIZE, %rax
-#endif
-	ret
-
-	.p2align 4
-L(first_vec_x2):
-	PCMPEQ	%xmm0, %xmm3
-	pmovmskb %xmm3, %eax
-	leal	-1(%rcx), %edx
-	xorl	%edx, %ecx
-	andl	%ecx, %eax
-	jz	L(first_vec_x1_test)
-	bsrl	%eax, %eax
-	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
-#ifdef USE_AS_WCSRCHR
-	andq	$-CHAR_SIZE, %rax
-#endif
-	ret
-
-	.p2align 4
-L(aligned_more):
-	/* Save original pointer if match was in VEC 0.  */
-	movq	%rdi, %r8
-	andq	$-VEC_SIZE, %rdi
-
-	movaps	VEC_SIZE(%rdi), %xmm2
-	pxor	%xmm3, %xmm3
-	PCMPEQ	%xmm2, %xmm3
-	pmovmskb %xmm3, %ecx
-	testl	%ecx, %ecx
-	jnz	L(first_vec_x1)
-
-	movaps	(VEC_SIZE * 2)(%rdi), %xmm3
-	pxor	%xmm4, %xmm4
-	PCMPEQ	%xmm3, %xmm4
-	pmovmskb %xmm4, %ecx
-	testl	%ecx, %ecx
-	jnz	L(first_vec_x2)
-
-	addq	$VEC_SIZE, %rdi
-	/* Save pointer again before realigning.  */
-	movq	%rdi, %rsi
-	andq	$-(VEC_SIZE * 2), %rdi
-	.p2align 4
-L(first_loop):
-	/* Do 2x VEC at a time.  */
-	movaps	(VEC_SIZE * 2)(%rdi), %xmm4
-	movaps	(VEC_SIZE * 3)(%rdi), %xmm5
-	/* Since SSE2 no pminud so wcsrchr needs seperate logic for
-	   detecting zero. Note if this is found to be a bottleneck it
-	   may be worth adding an SSE4.1 wcsrchr implementation.  */
-#ifdef USE_AS_WCSRCHR
-	movaps	%xmm5, %xmm6
-	pxor	%xmm8, %xmm8
-
-	PCMPEQ	%xmm8, %xmm5
-	PCMPEQ	%xmm4, %xmm8
-	por	%xmm5, %xmm8
-#else
-	movaps	%xmm5, %xmm6
-	PMINU	%xmm4, %xmm5
-#endif
-
-	movaps	%xmm4, %xmm9
-	PCMPEQ	%xmm0, %xmm4
-	PCMPEQ	%xmm0, %xmm6
-	movaps	%xmm6, %xmm7
-	por	%xmm4, %xmm6
-#ifndef USE_AS_WCSRCHR
-	pxor	%xmm8, %xmm8
-	PCMPEQ	%xmm5, %xmm8
-#endif
-	pmovmskb %xmm8, %ecx
-	pmovmskb %xmm6, %eax
-
-	addq	$(VEC_SIZE * 2), %rdi
-	/* Use `addl` 1) so we can undo it with `subl` and 2) it can
-	   macro-fuse with `jz`.  */
-	addl	%ecx, %eax
-	jz	L(first_loop)
-
-	/* Check if there is zero match.  */
-	testl	%ecx, %ecx
-	jz	L(second_loop_match)
-
-	/* Check if there was a match in last iteration.  */
-	subl	%ecx, %eax
-	jnz	L(new_match)
-
-L(first_loop_old_match):
-	PCMPEQ	%xmm0, %xmm2
-	PCMPEQ	%xmm0, %xmm3
-	pmovmskb %xmm2, %ecx
-	pmovmskb %xmm3, %eax
-	addl	%eax, %ecx
-	jz	L(first_vec_x0_test)
-	/* NB: We could move this shift to before the branch and save a
-	   bit of code size / performance on the fall through. The
-	   branch leads to the null case which generally seems hotter
-	   than char in first 3x VEC.  */
-	sall	$16, %eax
-	orl	%ecx, %eax
-
-	bsrl	%eax, %eax
-	addq	%rsi, %rax
-#ifdef USE_AS_WCSRCHR
-	andq	$-CHAR_SIZE, %rax
-#endif
-	ret
-
-	.p2align 4
-L(new_match):
-	pxor	%xmm6, %xmm6
-	PCMPEQ	%xmm9, %xmm6
-	pmovmskb %xmm6, %eax
-	sall	$16, %ecx
-	orl	%eax, %ecx
-
-	/* We can't reuse either of the old comparisons as since we mask
-	   of zeros after first zero (instead of using the full
-	   comparison) we can't gurantee no interference between match
-	   after end of string and valid match.  */
-	pmovmskb %xmm4, %eax
-	pmovmskb %xmm7, %edx
-	sall	$16, %edx
-	orl	%edx, %eax
-
-	leal	-1(%ecx), %edx
-	xorl	%edx, %ecx
-	andl	%ecx, %eax
-	jz	L(first_loop_old_match)
-	bsrl	%eax, %eax
-	addq	%rdi, %rax
-#ifdef USE_AS_WCSRCHR
-	andq	$-CHAR_SIZE, %rax
-#endif
-	ret
-
-	/* Save minimum state for getting most recent match. We can
-	   throw out all previous work.  */
-	.p2align 4
-L(second_loop_match):
-	movq	%rdi, %rsi
-	movaps	%xmm4, %xmm2
-	movaps	%xmm7, %xmm3
-
-	.p2align 4
-L(second_loop):
-	movaps	(VEC_SIZE * 2)(%rdi), %xmm4
-	movaps	(VEC_SIZE * 3)(%rdi), %xmm5
-	/* Since SSE2 no pminud so wcsrchr needs seperate logic for
-	   detecting zero. Note if this is found to be a bottleneck it
-	   may be worth adding an SSE4.1 wcsrchr implementation.  */
-#ifdef USE_AS_WCSRCHR
-	movaps	%xmm5, %xmm6
-	pxor	%xmm8, %xmm8
-
-	PCMPEQ	%xmm8, %xmm5
-	PCMPEQ	%xmm4, %xmm8
-	por	%xmm5, %xmm8
-#else
-	movaps	%xmm5, %xmm6
-	PMINU	%xmm4, %xmm5
-#endif
-
-	movaps	%xmm4, %xmm9
-	PCMPEQ	%xmm0, %xmm4
-	PCMPEQ	%xmm0, %xmm6
-	movaps	%xmm6, %xmm7
-	por	%xmm4, %xmm6
-#ifndef USE_AS_WCSRCHR
-	pxor	%xmm8, %xmm8
-	PCMPEQ	%xmm5, %xmm8
-#endif
-
-	pmovmskb %xmm8, %ecx
-	pmovmskb %xmm6, %eax
-
-	addq	$(VEC_SIZE * 2), %rdi
-	/* Either null term or new occurence of CHAR.  */
-	addl	%ecx, %eax
-	jz	L(second_loop)
-
-	/* No null term so much be new occurence of CHAR.  */
-	testl	%ecx, %ecx
-	jz	L(second_loop_match)
-
-
-	subl	%ecx, %eax
-	jnz	L(second_loop_new_match)
-
-L(second_loop_old_match):
-	pmovmskb %xmm2, %ecx
-	pmovmskb %xmm3, %eax
-	sall	$16, %eax
-	orl	%ecx, %eax
-	bsrl	%eax, %eax
-	addq	%rsi, %rax
-#ifdef USE_AS_WCSRCHR
-	andq	$-CHAR_SIZE, %rax
-#endif
-	ret
-
-	.p2align 4
-L(second_loop_new_match):
-	pxor	%xmm6, %xmm6
-	PCMPEQ	%xmm9, %xmm6
-	pmovmskb %xmm6, %eax
-	sall	$16, %ecx
-	orl	%eax, %ecx
-
-	/* We can't reuse either of the old comparisons as since we mask
-	   of zeros after first zero (instead of using the full
-	   comparison) we can't gurantee no interference between match
-	   after end of string and valid match.  */
-	pmovmskb %xmm4, %eax
-	pmovmskb %xmm7, %edx
-	sall	$16, %edx
-	orl	%edx, %eax
-
-	leal	-1(%ecx), %edx
-	xorl	%edx, %ecx
-	andl	%ecx, %eax
-	jz	L(second_loop_old_match)
-	bsrl	%eax, %eax
-	addq	%rdi, %rax
-#ifdef USE_AS_WCSRCHR
-	andq	$-CHAR_SIZE, %rax
-#endif
-	ret
-
-	.p2align 4,, 4
-L(cross_page):
-	movq	%rdi, %rsi
-	andq	$-VEC_SIZE, %rsi
-	movaps	(%rsi), %xmm1
-	pxor	%xmm2, %xmm2
-	PCMPEQ	%xmm1, %xmm2
-	pmovmskb %xmm2, %edx
-	movl	%edi, %ecx
-	andl	$(VEC_SIZE - 1), %ecx
-	sarl	%cl, %edx
-	jz	L(cross_page_continue)
-	PCMPEQ	%xmm0, %xmm1
-	pmovmskb %xmm1, %eax
-	sarl	%cl, %eax
-	leal	-1(%rdx), %ecx
-	xorl	%edx, %ecx
-	andl	%ecx, %eax
-	jz	L(ret1)
-	bsrl	%eax, %eax
-	addq	%rdi, %rax
-#ifdef USE_AS_WCSRCHR
-	andq	$-CHAR_SIZE, %rax
-#endif
-L(ret1):
-	ret
-END(STRRCHR)
-
-#ifndef USE_AS_WCSRCHR
-	weak_alias (STRRCHR, rindex)
-	libc_hidden_builtin_def (STRRCHR)
-#endif
+#define STRRCHR	strrchr
+#include "multiarch/strrchr-sse2.S"
+weak_alias (strrchr, rindex)
+libc_hidden_builtin_def (strrchr)
diff --git a/sysdeps/x86_64/wcsrchr.S b/sysdeps/x86_64/wcsrchr.S
index 2b80efc5ef..1d4b1eb21c 100644
--- a/sysdeps/x86_64/wcsrchr.S
+++ b/sysdeps/x86_64/wcsrchr.S
@@ -16,12 +16,5 @@
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
-
-#define USE_AS_WCSRCHR	1
-#define NO_PMINU	1
-
-#ifndef STRRCHR
-# define STRRCHR	wcsrchr
-#endif
-
-#include "../strrchr.S"
+#define STRRCHR	wcsrchr
+#include "multiarch/wcsrchr-sse2.S"
-- 
2.34.1
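
For readers tracing the mask trick in the hunks above (the `leal -1(%rcx), %edx; xorl %edx, %ecx; andl %ecx, %eax` sequence): it keeps only the CHAR matches that sit at or before the first null terminator in the chunk, and `bsrl` then picks the last of them.  A rough scalar C model follows; the helper name is illustrative and 16-bit pmovmskb masks are assumed (this is a sketch, not code from the patch):

/* Sketch only: byte index of the last CHAR match at or before the
   first NUL in one 16-byte chunk, or -1 if there is none.  */
static inline int
last_match_before_nul (unsigned int match_mask, unsigned int nul_mask)
{
  if (nul_mask != 0)
    /* nul_mask ^ (nul_mask - 1) sets every bit up to and including the
       lowest NUL bit, exactly what the lea/xor/and pair computes.  */
    match_mask &= nul_mask ^ (nul_mask - 1);
  return match_mask != 0 ? 31 - __builtin_clz (match_mask) : -1;
}

Everything else in the two loops is bookkeeping to remember the most recent pair of vectors that still contained a match, so the final answer can be recovered once the terminator is found.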


^ permalink raw reply	[flat|nested] 21+ messages in thread

* [PATCH v1] x86: Move strchr SSE2 implementation to multiarch/strchr-sse2.S
  2022-07-12 19:29 [PATCH v1] x86: Move strlen SSE2 implementation to multiarch/strlen-sse2.S Noah Goldstein
                   ` (2 preceding siblings ...)
  2022-07-12 19:29 ` [PATCH v1] x86: Move strrchr SSE2 implementation to multiarch/strrchr-sse2.S Noah Goldstein
@ 2022-07-12 19:29 ` Noah Goldstein
  2022-07-12 21:27   ` H.J. Lu
  2022-07-12 19:29 ` [PATCH v1] x86: Move strcat SSE2 implementation to multiarch/strcat-sse2.S Noah Goldstein
                   ` (5 subsequent siblings)
  9 siblings, 1 reply; 21+ messages in thread
From: Noah Goldstein @ 2022-07-12 19:29 UTC (permalink / raw)
  To: libc-alpha

This commit doesn't affect libc.so.6; it's just housekeeping to prepare
for adding explicit ISA level support.

Tested build on x86_64 and x86_32 with/without multiarch.
---
 sysdeps/x86_64/multiarch/rtld-strchr.S    |  18 +++
 sysdeps/x86_64/multiarch/rtld-strchrnul.S |  18 +++
 sysdeps/x86_64/multiarch/strchr-sse2.S    | 175 +++++++++++++++++++++-
 sysdeps/x86_64/multiarch/strchrnul-sse2.S |  11 +-
 sysdeps/x86_64/strchr.S                   | 167 +--------------------
 sysdeps/x86_64/strchrnul.S                |   7 +-
 6 files changed, 213 insertions(+), 183 deletions(-)
 create mode 100644 sysdeps/x86_64/multiarch/rtld-strchr.S
 create mode 100644 sysdeps/x86_64/multiarch/rtld-strchrnul.S

diff --git a/sysdeps/x86_64/multiarch/rtld-strchr.S b/sysdeps/x86_64/multiarch/rtld-strchr.S
new file mode 100644
index 0000000000..2b7b879e37
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/rtld-strchr.S
@@ -0,0 +1,18 @@
+/* Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include "../strchr.S"
diff --git a/sysdeps/x86_64/multiarch/rtld-strchrnul.S b/sysdeps/x86_64/multiarch/rtld-strchrnul.S
new file mode 100644
index 0000000000..0cc5becc88
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/rtld-strchrnul.S
@@ -0,0 +1,18 @@
+/* Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include "../strchrnul.S"
diff --git a/sysdeps/x86_64/multiarch/strchr-sse2.S b/sysdeps/x86_64/multiarch/strchr-sse2.S
index 992f700077..f7767ca543 100644
--- a/sysdeps/x86_64/multiarch/strchr-sse2.S
+++ b/sysdeps/x86_64/multiarch/strchr-sse2.S
@@ -16,13 +16,172 @@
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
-#if IS_IN (libc)
-# define strchr __strchr_sse2
+#if IS_IN (libc) || defined STRCHR
+# ifndef STRCHR
+#  define STRCHR __strchr_sse2
+# endif
 
-# undef weak_alias
-# define weak_alias(strchr, index)
-# undef libc_hidden_builtin_def
-# define libc_hidden_builtin_def(strchr)
-#endif
+# include <sysdep.h>
+
+	.text
+ENTRY (STRCHR)
+	movd	%esi, %xmm1
+	movl	%edi, %eax
+	andl	$4095, %eax
+	punpcklbw %xmm1, %xmm1
+	cmpl	$4032, %eax
+	punpcklwd %xmm1, %xmm1
+	pshufd	$0, %xmm1, %xmm1
+	jg	L(cross_page)
+	movdqu	(%rdi), %xmm0
+	pxor	%xmm3, %xmm3
+	movdqa	%xmm0, %xmm4
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm3, %xmm4
+	por	%xmm4, %xmm0
+	pmovmskb %xmm0, %eax
+	test	%eax, %eax
+	je	L(next_48_bytes)
+	bsf	%eax, %eax
+# ifdef AS_STRCHRNUL
+	leaq	(%rdi,%rax), %rax
+# else
+	movl	$0, %edx
+	leaq	(%rdi,%rax), %rax
+	cmpb	%sil, (%rax)
+	cmovne	%rdx, %rax
+# endif
+	ret
+
+	.p2align 3
+L(next_48_bytes):
+	movdqu	16(%rdi), %xmm0
+	movdqa	%xmm0, %xmm4
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm3, %xmm4
+	por	%xmm4, %xmm0
+	pmovmskb %xmm0, %ecx
+	movdqu	32(%rdi), %xmm0
+	movdqa	%xmm0, %xmm4
+	pcmpeqb	%xmm1, %xmm0
+	salq	$16, %rcx
+	pcmpeqb	%xmm3, %xmm4
+	por	%xmm4, %xmm0
+	pmovmskb %xmm0, %eax
+	movdqu	48(%rdi), %xmm0
+	pcmpeqb	%xmm0, %xmm3
+	salq	$32, %rax
+	pcmpeqb	%xmm1, %xmm0
+	orq	%rcx, %rax
+	por	%xmm3, %xmm0
+	pmovmskb %xmm0, %ecx
+	salq	$48, %rcx
+	orq	%rcx, %rax
+	testq	%rax, %rax
+	jne	L(return)
+L(loop_start):
+	/* We use this alignment to force the loop to be aligned to 8 but
+	   not 16 bytes.  This gives better scheduling on AMD processors.  */
+	.p2align 4
+	pxor	%xmm6, %xmm6
+	andq	$-64, %rdi
+	.p2align 3
+L(loop64):
+	addq	$64, %rdi
+	movdqa	(%rdi), %xmm5
+	movdqa	16(%rdi), %xmm2
+	movdqa	32(%rdi), %xmm3
+	pxor	%xmm1, %xmm5
+	movdqa	48(%rdi), %xmm4
+	pxor	%xmm1, %xmm2
+	pxor	%xmm1, %xmm3
+	pminub	(%rdi), %xmm5
+	pxor	%xmm1, %xmm4
+	pminub	16(%rdi), %xmm2
+	pminub	32(%rdi), %xmm3
+	pminub	%xmm2, %xmm5
+	pminub	48(%rdi), %xmm4
+	pminub	%xmm3, %xmm5
+	pminub	%xmm4, %xmm5
+	pcmpeqb %xmm6, %xmm5
+	pmovmskb %xmm5, %eax
+
+	testl	%eax, %eax
+	je	L(loop64)
 
-#include "../strchr.S"
+	movdqa	(%rdi), %xmm5
+	movdqa	%xmm5, %xmm0
+	pcmpeqb	%xmm1, %xmm5
+	pcmpeqb	%xmm6, %xmm0
+	por	%xmm0, %xmm5
+	pcmpeqb %xmm6, %xmm2
+	pcmpeqb %xmm6, %xmm3
+	pcmpeqb %xmm6, %xmm4
+
+	pmovmskb %xmm5, %ecx
+	pmovmskb %xmm2, %eax
+	salq	$16, %rax
+	pmovmskb %xmm3, %r8d
+	pmovmskb %xmm4, %edx
+	salq	$32, %r8
+	orq	%r8, %rax
+	orq	%rcx, %rax
+	salq	$48, %rdx
+	orq	%rdx, %rax
+	.p2align 3
+L(return):
+	bsfq	%rax, %rax
+# ifdef AS_STRCHRNUL
+	leaq	(%rdi,%rax), %rax
+# else
+	movl	$0, %edx
+	leaq	(%rdi,%rax), %rax
+	cmpb	%sil, (%rax)
+	cmovne	%rdx, %rax
+# endif
+	ret
+	.p2align 4
+
+L(cross_page):
+	movq	%rdi, %rdx
+	pxor	%xmm2, %xmm2
+	andq	$-64, %rdx
+	movdqa	%xmm1, %xmm0
+	movdqa	(%rdx), %xmm3
+	movdqa	%xmm3, %xmm4
+	pcmpeqb	%xmm1, %xmm3
+	pcmpeqb	%xmm2, %xmm4
+	por	%xmm4, %xmm3
+	pmovmskb %xmm3, %r8d
+	movdqa	16(%rdx), %xmm3
+	movdqa	%xmm3, %xmm4
+	pcmpeqb	%xmm1, %xmm3
+	pcmpeqb	%xmm2, %xmm4
+	por	%xmm4, %xmm3
+	pmovmskb %xmm3, %eax
+	movdqa	32(%rdx), %xmm3
+	movdqa	%xmm3, %xmm4
+	pcmpeqb	%xmm1, %xmm3
+	salq	$16, %rax
+	pcmpeqb	%xmm2, %xmm4
+	por	%xmm4, %xmm3
+	pmovmskb %xmm3, %r9d
+	movdqa	48(%rdx), %xmm3
+	pcmpeqb	%xmm3, %xmm2
+	salq	$32, %r9
+	pcmpeqb	%xmm3, %xmm0
+	orq	%r9, %rax
+	orq	%r8, %rax
+	por	%xmm2, %xmm0
+	pmovmskb %xmm0, %ecx
+	salq	$48, %rcx
+	orq	%rcx, %rax
+	movl	%edi, %ecx
+	subb	%dl, %cl
+	shrq	%cl, %rax
+	testq	%rax, %rax
+	jne	L(return)
+	jmp	L(loop_start)
+
+END (STRCHR)
+#endif
diff --git a/sysdeps/x86_64/multiarch/strchrnul-sse2.S b/sysdeps/x86_64/multiarch/strchrnul-sse2.S
index f91c670369..7238977a21 100644
--- a/sysdeps/x86_64/multiarch/strchrnul-sse2.S
+++ b/sysdeps/x86_64/multiarch/strchrnul-sse2.S
@@ -17,10 +17,11 @@
    <https://www.gnu.org/licenses/>.  */
 
 #if IS_IN (libc)
-# define __strchrnul __strchrnul_sse2
-
-# undef weak_alias
-# define weak_alias(__strchrnul, strchrnul)
+# ifndef STRCHR
+#  define STRCHR	__strchrnul_sse2
+# endif
 #endif
 
-#include "../strchrnul.S"
+#define AS_STRCHRNUL
+
+#include "strchr-sse2.S"
diff --git a/sysdeps/x86_64/strchr.S b/sysdeps/x86_64/strchr.S
index dda7c0431d..77c956c92c 100644
--- a/sysdeps/x86_64/strchr.S
+++ b/sysdeps/x86_64/strchr.S
@@ -17,171 +17,8 @@
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
-#include <sysdep.h>
 
-	.text
-ENTRY (strchr)
-	movd	%esi, %xmm1
-	movl	%edi, %eax
-	andl	$4095, %eax
-	punpcklbw %xmm1, %xmm1
-	cmpl	$4032, %eax
-	punpcklwd %xmm1, %xmm1
-	pshufd	$0, %xmm1, %xmm1
-	jg	L(cross_page)
-	movdqu	(%rdi), %xmm0
-	pxor	%xmm3, %xmm3
-	movdqa	%xmm0, %xmm4
-	pcmpeqb	%xmm1, %xmm0
-	pcmpeqb	%xmm3, %xmm4
-	por	%xmm4, %xmm0
-	pmovmskb %xmm0, %eax
-	test	%eax, %eax
-	je	L(next_48_bytes)
-	bsf	%eax, %eax
-#ifdef AS_STRCHRNUL
-	leaq	(%rdi,%rax), %rax
-#else
-	movl	$0, %edx
-	leaq	(%rdi,%rax), %rax
-	cmpb	%sil, (%rax)
-	cmovne	%rdx, %rax
-#endif
-	ret
-
-	.p2align 3
-	L(next_48_bytes):
-	movdqu	16(%rdi), %xmm0
-	movdqa	%xmm0, %xmm4
-	pcmpeqb	%xmm1, %xmm0
-	pcmpeqb	%xmm3, %xmm4
-	por	%xmm4, %xmm0
-	pmovmskb %xmm0, %ecx
-	movdqu	32(%rdi), %xmm0
-	movdqa	%xmm0, %xmm4
-	pcmpeqb	%xmm1, %xmm0
-	salq	$16, %rcx
-	pcmpeqb	%xmm3, %xmm4
-	por	%xmm4, %xmm0
-	pmovmskb %xmm0, %eax
-	movdqu	48(%rdi), %xmm0
-	pcmpeqb	%xmm0, %xmm3
-	salq	$32, %rax
-	pcmpeqb	%xmm1, %xmm0
-	orq	%rcx, %rax
-	por	%xmm3, %xmm0
-	pmovmskb %xmm0, %ecx
-	salq	$48, %rcx
-	orq	%rcx, %rax
-	testq	%rax, %rax
-	jne	L(return)
-L(loop_start):
-	/* We use this alignment to force loop be aligned to 8 but not
-	   16 bytes.  This gives better sheduling on AMD processors.  */
-	.p2align 4
-	pxor	%xmm6, %xmm6
-	andq	$-64, %rdi
-	.p2align 3
-L(loop64):
-	addq	$64, %rdi
-	movdqa	(%rdi), %xmm5
-	movdqa	16(%rdi), %xmm2
-	movdqa	32(%rdi), %xmm3
-	pxor	%xmm1, %xmm5
-	movdqa	48(%rdi), %xmm4
-	pxor	%xmm1, %xmm2
-	pxor	%xmm1, %xmm3
-	pminub	(%rdi), %xmm5
-	pxor	%xmm1, %xmm4
-	pminub	16(%rdi), %xmm2
-	pminub	32(%rdi), %xmm3
-	pminub	%xmm2, %xmm5
-	pminub	48(%rdi), %xmm4
-	pminub	%xmm3, %xmm5
-	pminub	%xmm4, %xmm5
-	pcmpeqb %xmm6, %xmm5
-	pmovmskb %xmm5, %eax
-
-	testl	%eax, %eax
-	je	L(loop64)
-
-	movdqa	(%rdi), %xmm5
-	movdqa	%xmm5, %xmm0
-	pcmpeqb	%xmm1, %xmm5
-	pcmpeqb	%xmm6, %xmm0
-	por	%xmm0, %xmm5
-	pcmpeqb %xmm6, %xmm2
-	pcmpeqb %xmm6, %xmm3
-	pcmpeqb %xmm6, %xmm4
-
-	pmovmskb %xmm5, %ecx
-	pmovmskb %xmm2, %eax
-	salq	$16, %rax
-	pmovmskb %xmm3, %r8d
-	pmovmskb %xmm4, %edx
-	salq	$32, %r8
-	orq	%r8, %rax
-	orq	%rcx, %rax
-	salq	$48, %rdx
-	orq	%rdx, %rax
-	.p2align 3
-L(return):
-	bsfq	%rax, %rax
-#ifdef AS_STRCHRNUL
-	leaq	(%rdi,%rax), %rax
-#else
-	movl	$0, %edx
-	leaq	(%rdi,%rax), %rax
-	cmpb	%sil, (%rax)
-	cmovne	%rdx, %rax
-#endif
-	ret
-	.p2align 4
-
-L(cross_page):
-	movq	%rdi, %rdx
-	pxor	%xmm2, %xmm2
-	andq	$-64, %rdx
-	movdqa	%xmm1, %xmm0
-	movdqa	(%rdx), %xmm3
-	movdqa	%xmm3, %xmm4
-	pcmpeqb	%xmm1, %xmm3
-	pcmpeqb	%xmm2, %xmm4
-	por	%xmm4, %xmm3
-	pmovmskb %xmm3, %r8d
-	movdqa	16(%rdx), %xmm3
-	movdqa	%xmm3, %xmm4
-	pcmpeqb	%xmm1, %xmm3
-	pcmpeqb	%xmm2, %xmm4
-	por	%xmm4, %xmm3
-	pmovmskb %xmm3, %eax
-	movdqa	32(%rdx), %xmm3
-	movdqa	%xmm3, %xmm4
-	pcmpeqb	%xmm1, %xmm3
-	salq	$16, %rax
-	pcmpeqb	%xmm2, %xmm4
-	por	%xmm4, %xmm3
-	pmovmskb %xmm3, %r9d
-	movdqa	48(%rdx), %xmm3
-	pcmpeqb	%xmm3, %xmm2
-	salq	$32, %r9
-	pcmpeqb	%xmm3, %xmm0
-	orq	%r9, %rax
-	orq	%r8, %rax
-	por	%xmm2, %xmm0
-	pmovmskb %xmm0, %ecx
-	salq	$48, %rcx
-	orq	%rcx, %rax
-	movl	%edi, %ecx
-	subb	%dl, %cl
-	shrq	%cl, %rax
-	testq	%rax, %rax
-	jne	L(return)
-	jmp	L(loop_start)
-
-END (strchr)
-
-#ifndef AS_STRCHRNUL
+#define STRCHR strchr
+#include "multiarch/strchr-sse2.S"
 weak_alias (strchr, index)
 libc_hidden_builtin_def (strchr)
-#endif
diff --git a/sysdeps/x86_64/strchrnul.S b/sysdeps/x86_64/strchrnul.S
index ec2e652e25..508e42db26 100644
--- a/sysdeps/x86_64/strchrnul.S
+++ b/sysdeps/x86_64/strchrnul.S
@@ -18,10 +18,7 @@
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
-#include <sysdep.h>
-
-#define strchr __strchrnul
-#define AS_STRCHRNUL
-#include "strchr.S"
+#define STRCHR __strchrnul
+#include "multiarch/strchrnul-sse2.S"
 
 weak_alias (__strchrnul, strchrnul)
-- 
2.34.1
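
As a reading aid for the merged strchr/strchrnul body above: the header and cross-page steps build a single mask with a bit set wherever a chunk holds either the search character or a NUL (the main loop gets the same effect with pxor/pminub), and `bsf` on the accumulated 64-bit mask finds the first hit.  A hedged intrinsics sketch of one such 16-byte step (illustrative only, not code from the patch):

#include <emmintrin.h>

/* Sketch only: VC is the search character broadcast to all 16 lanes
   (what the punpcklbw/punpcklwd/pshufd prologue builds); the result is
   the same bitmask pmovmskb leaves in a general register.  */
static inline unsigned int
strchr_chunk_mask (const char *p, __m128i vc)
{
  __m128i chunk = _mm_loadu_si128 ((const __m128i *) p);
  __m128i eq_c = _mm_cmpeq_epi8 (chunk, vc);
  __m128i eq_nul = _mm_cmpeq_epi8 (chunk, _mm_setzero_si128 ());
  return _mm_movemask_epi8 (_mm_or_si128 (eq_c, eq_nul));
}

The only divergence between the two entry points is the tail: with AS_STRCHRNUL the pointer at the first set bit is returned unconditionally, while plain strchr (the cmovne path) returns NULL when that byte turns out to be the terminator rather than the search character.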


^ permalink raw reply	[flat|nested] 21+ messages in thread

* [PATCH v1] x86: Move strcat SSE2 implementation to multiarch/strcat-sse2.S
  2022-07-12 19:29 [PATCH v1] x86: Move strlen SSE2 implementation to multiarch/strlen-sse2.S Noah Goldstein
                   ` (3 preceding siblings ...)
  2022-07-12 19:29 ` [PATCH v1] x86: Move strchr SSE2 implementation to multiarch/strchr-sse2.S Noah Goldstein
@ 2022-07-12 19:29 ` Noah Goldstein
  2022-07-12 21:16   ` H.J. Lu
  2022-07-12 19:29 ` [PATCH v1] x86: Move wcschr SSE2 implementation to multiarch/wcschr-sse2.S Noah Goldstein
                   ` (4 subsequent siblings)
  9 siblings, 1 reply; 21+ messages in thread
From: Noah Goldstein @ 2022-07-12 19:29 UTC (permalink / raw)
  To: libc-alpha

This commit doesn't affect libc.so.6; it's just housekeeping to prepare
for adding explicit ISA level support.

Tested build on x86_64 and x86_32 with/without multiarch.
---
 sysdeps/x86_64/multiarch/strcat-sse2.S | 242 ++++++++++++++++++++++++-
 sysdeps/x86_64/strcat.S                | 239 +-----------------------
 2 files changed, 238 insertions(+), 243 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/strcat-sse2.S b/sysdeps/x86_64/multiarch/strcat-sse2.S
index 449e102438..244c4a6d74 100644
--- a/sysdeps/x86_64/multiarch/strcat-sse2.S
+++ b/sysdeps/x86_64/multiarch/strcat-sse2.S
@@ -17,12 +17,242 @@
    <https://www.gnu.org/licenses/>.  */
 
 #if IS_IN (libc)
+# ifndef STRCAT
+#  define STRCAT __strcat_sse2
+# endif
+#endif
 
-# include <sysdep.h>
-# define strcat __strcat_sse2
+#include <sysdep.h>
+
+	.text
+ENTRY (STRCAT)
+	movq %rdi, %rcx		/* Dest. register. */
+	andl $7, %ecx		/* mask alignment bits */
+	movq %rdi, %rax		/* Duplicate destination pointer.  */
+	movq $0xfefefefefefefeff,%r8
+
+	/* First step: Find end of destination.  */
+	jz 4f			/* aligned => start loop */
+
+	neg %ecx		/* We need to align to 8 bytes.  */
+	addl $8,%ecx
+	/* Search the first bytes directly.  */
+0:	cmpb $0x0,(%rax)	/* is byte NUL? */
+	je 2f			/* yes => start copy */
+	incq %rax		/* increment pointer */
+	decl %ecx
+	jnz 0b
+
+
+
+	/* Now the source is aligned.  Scan for NUL byte.  */
+	.p2align 4
+4:
+	/* First unroll.  */
+	movq (%rax), %rcx	/* get double word (= 8 bytes) in question */
+	addq $8,%rax		/* adjust pointer for next word */
+	movq %r8, %rdx		/* magic value */
+	addq %rcx, %rdx		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc 3f			/* highest byte is NUL => return pointer */
+	xorq %rcx, %rdx		/* (word+magic)^word */
+	orq %r8, %rdx		/* set all non-carry bits */
+	incq %rdx		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz 3f			/* found NUL => return pointer */
+
+	/* Second unroll.  */
+	movq (%rax), %rcx	/* get double word (= 8 bytes) in question */
+	addq $8,%rax		/* adjust pointer for next word */
+	movq %r8, %rdx		/* magic value */
+	addq %rcx, %rdx		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc 3f			/* highest byte is NUL => return pointer */
+	xorq %rcx, %rdx		/* (word+magic)^word */
+	orq %r8, %rdx		/* set all non-carry bits */
+	incq %rdx		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz 3f			/* found NUL => return pointer */
+
+	/* Third unroll.  */
+	movq (%rax), %rcx	/* get double word (= 8 bytes) in question */
+	addq $8,%rax		/* adjust pointer for next word */
+	movq %r8, %rdx		/* magic value */
+	addq %rcx, %rdx		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc 3f			/* highest byte is NUL => return pointer */
+	xorq %rcx, %rdx		/* (word+magic)^word */
+	orq %r8, %rdx		/* set all non-carry bits */
+	incq %rdx		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jnz 3f			/* found NUL => return pointer */
+
+	/* Fourth unroll.  */
+	movq (%rax), %rcx	/* get double word (= 8 bytes) in question */
+	addq $8,%rax		/* adjust pointer for next word */
+	movq %r8, %rdx		/* magic value */
+	addq %rcx, %rdx		/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc 3f			/* highest byte is NUL => return pointer */
+	xorq %rcx, %rdx		/* (word+magic)^word */
+	orq %r8, %rdx		/* set all non-carry bits */
+	incq %rdx		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+	jz 4b			/* no NUL found => continue loop */
+
+	.p2align 4		/* Align, it's a jump target.  */
+3:	subq $8,%rax		/* correct pointer increment.  */
+
+	testb %cl, %cl		/* is first byte NUL? */
+	jz 2f			/* yes => return */
+	incq %rax		/* increment pointer */
+
+	testb %ch, %ch		/* is second byte NUL? */
+	jz 2f			/* yes => return */
+	incq %rax		/* increment pointer */
+
+	testl $0x00ff0000, %ecx /* is third byte NUL? */
+	jz 2f			/* yes => return pointer */
+	incq %rax		/* increment pointer */
+
+	testl $0xff000000, %ecx /* is fourth byte NUL? */
+	jz 2f			/* yes => return pointer */
+	incq %rax		/* increment pointer */
+
+	shrq $32, %rcx		/* look at other half.  */
+
+	testb %cl, %cl		/* is first byte NUL? */
+	jz 2f			/* yes => return */
+	incq %rax		/* increment pointer */
+
+	testb %ch, %ch		/* is second byte NUL? */
+	jz 2f			/* yes => return */
+	incq %rax		/* increment pointer */
+
+	testl $0xff0000, %ecx	/* is third byte NUL? */
+	jz 2f			/* yes => return pointer */
+	incq %rax		/* increment pointer */
+
+2:
+	/* Second step: Copy source to destination.  */
+
+	movq	%rsi, %rcx	/* duplicate  */
+	andl	$7,%ecx		/* mask alignment bits */
+	movq	%rax, %rdx	/* move around */
+	jz	22f		/* aligned => start loop */
+
+	neg	%ecx		/* align to 8 bytes.  */
+	addl	$8, %ecx
+	/* Align the source pointer.  */
+21:
+	movb	(%rsi), %al	/* Fetch a byte */
+	testb	%al, %al	/* Is it NUL? */
+	movb	%al, (%rdx)	/* Store it */
+	jz	24f		/* If it was NUL, done! */
+	incq	%rsi
+	incq	%rdx
+	decl	%ecx
+	jnz	21b
+
+	/* Now the source is aligned.  Unfortunately we cannot force
+	   both source and destination to be aligned, so ignore the
+	   alignment of the destination.  */
+	.p2align 4
+22:
+	/* 1st unroll.  */
+	movq	(%rsi), %rax	/* Read double word (8 bytes).  */
+	addq	$8, %rsi	/* Adjust pointer for next word.  */
+	movq	%rax, %r9	/* Save a copy for NUL finding.  */
+	addq	%r8, %r9	/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc	23f		/* highest byte is NUL => return pointer */
+	xorq	%rax, %r9	/* (word+magic)^word */
+	orq	%r8, %r9	/* set all non-carry bits */
+	incq	%r9		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+
+	jnz	23f		/* found NUL => return pointer */
+
+	movq	%rax, (%rdx)	/* Write value to destination.  */
+	addq	$8, %rdx	/* Adjust pointer.  */
+
+	/* 2nd unroll.  */
+	movq	(%rsi), %rax	/* Read double word (8 bytes).  */
+	addq	$8, %rsi	/* Adjust pointer for next word.  */
+	movq	%rax, %r9	/* Save a copy for NUL finding.  */
+	addq	%r8, %r9	/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc	23f		/* highest byte is NUL => return pointer */
+	xorq	%rax, %r9	/* (word+magic)^word */
+	orq	%r8, %r9	/* set all non-carry bits */
+	incq	%r9		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+
+	jnz	23f		/* found NUL => return pointer */
+
+	movq	%rax, (%rdx)	/* Write value to destination.  */
+	addq	$8, %rdx	/* Adjust pointer.  */
+
+	/* 3rd unroll.  */
+	movq	(%rsi), %rax	/* Read double word (8 bytes).  */
+	addq	$8, %rsi	/* Adjust pointer for next word.  */
+	movq	%rax, %r9	/* Save a copy for NUL finding.  */
+	addq	%r8, %r9	/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc	23f		/* highest byte is NUL => return pointer */
+	xorq	%rax, %r9	/* (word+magic)^word */
+	orq	%r8, %r9	/* set all non-carry bits */
+	incq	%r9		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+
+	jnz	23f		/* found NUL => return pointer */
+
+	movq	%rax, (%rdx)	/* Write value to destination.  */
+	addq	$8, %rdx	/* Adjust pointer.  */
+
+	/* 4th unroll.  */
+	movq	(%rsi), %rax	/* Read double word (8 bytes).  */
+	addq	$8, %rsi	/* Adjust pointer for next word.  */
+	movq	%rax, %r9	/* Save a copy for NUL finding.  */
+	addq	%r8, %r9	/* add the magic value to the word.  We get
+				   carry bits reported for each byte which
+				   is *not* 0 */
+	jnc	23f		/* highest byte is NUL => return pointer */
+	xorq	%rax, %r9	/* (word+magic)^word */
+	orq	%r8, %r9	/* set all non-carry bits */
+	incq	%r9		/* add 1: if one carry bit was *not* set
+				   the addition will not result in 0.  */
+
+	jnz	23f		/* found NUL => return pointer */
+
+	movq	%rax, (%rdx)	/* Write value to destination.  */
+	addq	$8, %rdx	/* Adjust pointer.  */
+	jmp	22b		/* Next iteration.  */
+
+	/* Do the last few bytes. %rax contains the value to write.
+	   The loop is unrolled twice.  */
+	.p2align 4
+23:
+	movb	%al, (%rdx)	/* 1st byte.  */
+	testb	%al, %al	/* Is it NUL?  */
+	jz	24f		/* yes, finish.  */
+	incq	%rdx		/* Increment destination.  */
+	movb	%ah, (%rdx)	/* 2nd byte.  */
+	testb	%ah, %ah	/* Is it NUL?  */
+	jz	24f		/* yes, finish.  */
+	incq	%rdx		/* Increment destination.  */
+	shrq	$16, %rax	/* Shift...  */
+	jmp	23b		/* and look at next two bytes in %rax.  */
 
-# undef libc_hidden_builtin_def
-# define libc_hidden_builtin_def(strcat)
-#endif
 
-#include <sysdeps/x86_64/strcat.S>
+24:
+	movq	%rdi, %rax	/* Source is return value.  */
+	retq
+END (STRCAT)
diff --git a/sysdeps/x86_64/strcat.S b/sysdeps/x86_64/strcat.S
index 565a9c785a..fc3e8a9bcf 100644
--- a/sysdeps/x86_64/strcat.S
+++ b/sysdeps/x86_64/strcat.S
@@ -17,241 +17,6 @@
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
-#include <sysdep.h>
-#include "asm-syntax.h"
-
-/* Will be removed when new strcpy implementation gets merged.  */
-
-	.text
-ENTRY (strcat)
-	movq %rdi, %rcx		/* Dest. register. */
-	andl $7, %ecx		/* mask alignment bits */
-	movq %rdi, %rax		/* Duplicate destination pointer.  */
-	movq $0xfefefefefefefeff,%r8
-
-	/* First step: Find end of destination.  */
-	jz 4f			/* aligned => start loop */
-
-	neg %ecx		/* We need to align to 8 bytes.  */
-	addl $8,%ecx
-	/* Search the first bytes directly.  */
-0:	cmpb $0x0,(%rax)	/* is byte NUL? */
-	je 2f			/* yes => start copy */
-	incq %rax		/* increment pointer */
-	decl %ecx
-	jnz 0b
-
-
-
-	/* Now the source is aligned.  Scan for NUL byte.  */
-	.p2align 4
-4:
-	/* First unroll.  */
-	movq (%rax), %rcx	/* get double word (= 8 bytes) in question */
-	addq $8,%rax		/* adjust pointer for next word */
-	movq %r8, %rdx		/* magic value */
-	addq %rcx, %rdx		/* add the magic value to the word.  We get
-				   carry bits reported for each byte which
-				   is *not* 0 */
-	jnc 3f			/* highest byte is NUL => return pointer */
-	xorq %rcx, %rdx		/* (word+magic)^word */
-	orq %r8, %rdx		/* set all non-carry bits */
-	incq %rdx		/* add 1: if one carry bit was *not* set
-				   the addition will not result in 0.  */
-	jnz 3f			/* found NUL => return pointer */
-
-	/* Second unroll.  */
-	movq (%rax), %rcx	/* get double word (= 8 bytes) in question */
-	addq $8,%rax		/* adjust pointer for next word */
-	movq %r8, %rdx		/* magic value */
-	addq %rcx, %rdx		/* add the magic value to the word.  We get
-				   carry bits reported for each byte which
-				   is *not* 0 */
-	jnc 3f			/* highest byte is NUL => return pointer */
-	xorq %rcx, %rdx		/* (word+magic)^word */
-	orq %r8, %rdx		/* set all non-carry bits */
-	incq %rdx		/* add 1: if one carry bit was *not* set
-				   the addition will not result in 0.  */
-	jnz 3f			/* found NUL => return pointer */
-
-	/* Third unroll.  */
-	movq (%rax), %rcx	/* get double word (= 8 bytes) in question */
-	addq $8,%rax		/* adjust pointer for next word */
-	movq %r8, %rdx		/* magic value */
-	addq %rcx, %rdx		/* add the magic value to the word.  We get
-				   carry bits reported for each byte which
-				   is *not* 0 */
-	jnc 3f			/* highest byte is NUL => return pointer */
-	xorq %rcx, %rdx		/* (word+magic)^word */
-	orq %r8, %rdx		/* set all non-carry bits */
-	incq %rdx		/* add 1: if one carry bit was *not* set
-				   the addition will not result in 0.  */
-	jnz 3f			/* found NUL => return pointer */
-
-	/* Fourth unroll.  */
-	movq (%rax), %rcx	/* get double word (= 8 bytes) in question */
-	addq $8,%rax		/* adjust pointer for next word */
-	movq %r8, %rdx		/* magic value */
-	addq %rcx, %rdx		/* add the magic value to the word.  We get
-				   carry bits reported for each byte which
-				   is *not* 0 */
-	jnc 3f			/* highest byte is NUL => return pointer */
-	xorq %rcx, %rdx		/* (word+magic)^word */
-	orq %r8, %rdx		/* set all non-carry bits */
-	incq %rdx		/* add 1: if one carry bit was *not* set
-				   the addition will not result in 0.  */
-	jz 4b			/* no NUL found => continue loop */
-
-	.p2align 4		/* Align, it's a jump target.  */
-3:	subq $8,%rax		/* correct pointer increment.  */
-
-	testb %cl, %cl		/* is first byte NUL? */
-	jz 2f			/* yes => return */
-	incq %rax		/* increment pointer */
-
-	testb %ch, %ch		/* is second byte NUL? */
-	jz 2f			/* yes => return */
-	incq %rax		/* increment pointer */
-
-	testl $0x00ff0000, %ecx /* is third byte NUL? */
-	jz 2f			/* yes => return pointer */
-	incq %rax		/* increment pointer */
-
-	testl $0xff000000, %ecx /* is fourth byte NUL? */
-	jz 2f			/* yes => return pointer */
-	incq %rax		/* increment pointer */
-
-	shrq $32, %rcx		/* look at other half.  */
-
-	testb %cl, %cl		/* is first byte NUL? */
-	jz 2f			/* yes => return */
-	incq %rax		/* increment pointer */
-
-	testb %ch, %ch		/* is second byte NUL? */
-	jz 2f			/* yes => return */
-	incq %rax		/* increment pointer */
-
-	testl $0xff0000, %ecx	/* is third byte NUL? */
-	jz 2f			/* yes => return pointer */
-	incq %rax		/* increment pointer */
-
-2:
-	/* Second step: Copy source to destination.  */
-
-	movq	%rsi, %rcx	/* duplicate  */
-	andl	$7,%ecx		/* mask alignment bits */
-	movq	%rax, %rdx	/* move around */
-	jz	22f		/* aligned => start loop */
-
-	neg	%ecx		/* align to 8 bytes.  */
-	addl	$8, %ecx
-	/* Align the source pointer.  */
-21:
-	movb	(%rsi), %al	/* Fetch a byte */
-	testb	%al, %al	/* Is it NUL? */
-	movb	%al, (%rdx)	/* Store it */
-	jz	24f		/* If it was NUL, done! */
-	incq	%rsi
-	incq	%rdx
-	decl	%ecx
-	jnz	21b
-
-	/* Now the sources is aligned.  Unfortunatly we cannot force
-	   to have both source and destination aligned, so ignore the
-	   alignment of the destination.  */
-	.p2align 4
-22:
-	/* 1st unroll.  */
-	movq	(%rsi), %rax	/* Read double word (8 bytes).  */
-	addq	$8, %rsi	/* Adjust pointer for next word.  */
-	movq	%rax, %r9	/* Save a copy for NUL finding.  */
-	addq	%r8, %r9	/* add the magic value to the word.  We get
-				   carry bits reported for each byte which
-				   is *not* 0 */
-	jnc	23f		/* highest byte is NUL => return pointer */
-	xorq	%rax, %r9	/* (word+magic)^word */
-	orq	%r8, %r9	/* set all non-carry bits */
-	incq	%r9		/* add 1: if one carry bit was *not* set
-				   the addition will not result in 0.  */
-
-	jnz	23f		/* found NUL => return pointer */
-
-	movq	%rax, (%rdx)	/* Write value to destination.  */
-	addq	$8, %rdx	/* Adjust pointer.  */
-
-	/* 2nd unroll.  */
-	movq	(%rsi), %rax	/* Read double word (8 bytes).  */
-	addq	$8, %rsi	/* Adjust pointer for next word.  */
-	movq	%rax, %r9	/* Save a copy for NUL finding.  */
-	addq	%r8, %r9	/* add the magic value to the word.  We get
-				   carry bits reported for each byte which
-				   is *not* 0 */
-	jnc	23f		/* highest byte is NUL => return pointer */
-	xorq	%rax, %r9	/* (word+magic)^word */
-	orq	%r8, %r9	/* set all non-carry bits */
-	incq	%r9		/* add 1: if one carry bit was *not* set
-				   the addition will not result in 0.  */
-
-	jnz	23f		/* found NUL => return pointer */
-
-	movq	%rax, (%rdx)	/* Write value to destination.  */
-	addq	$8, %rdx	/* Adjust pointer.  */
-
-	/* 3rd unroll.  */
-	movq	(%rsi), %rax	/* Read double word (8 bytes).  */
-	addq	$8, %rsi	/* Adjust pointer for next word.  */
-	movq	%rax, %r9	/* Save a copy for NUL finding.  */
-	addq	%r8, %r9	/* add the magic value to the word.  We get
-				   carry bits reported for each byte which
-				   is *not* 0 */
-	jnc	23f		/* highest byte is NUL => return pointer */
-	xorq	%rax, %r9	/* (word+magic)^word */
-	orq	%r8, %r9	/* set all non-carry bits */
-	incq	%r9		/* add 1: if one carry bit was *not* set
-				   the addition will not result in 0.  */
-
-	jnz	23f		/* found NUL => return pointer */
-
-	movq	%rax, (%rdx)	/* Write value to destination.  */
-	addq	$8, %rdx	/* Adjust pointer.  */
-
-	/* 4th unroll.  */
-	movq	(%rsi), %rax	/* Read double word (8 bytes).  */
-	addq	$8, %rsi	/* Adjust pointer for next word.  */
-	movq	%rax, %r9	/* Save a copy for NUL finding.  */
-	addq	%r8, %r9	/* add the magic value to the word.  We get
-				   carry bits reported for each byte which
-				   is *not* 0 */
-	jnc	23f		/* highest byte is NUL => return pointer */
-	xorq	%rax, %r9	/* (word+magic)^word */
-	orq	%r8, %r9	/* set all non-carry bits */
-	incq	%r9		/* add 1: if one carry bit was *not* set
-				   the addition will not result in 0.  */
-
-	jnz	23f		/* found NUL => return pointer */
-
-	movq	%rax, (%rdx)	/* Write value to destination.  */
-	addq	$8, %rdx	/* Adjust pointer.  */
-	jmp	22b		/* Next iteration.  */
-
-	/* Do the last few bytes. %rax contains the value to write.
-	   The loop is unrolled twice.  */
-	.p2align 4
-23:
-	movb	%al, (%rdx)	/* 1st byte.  */
-	testb	%al, %al	/* Is it NUL.  */
-	jz	24f		/* yes, finish.  */
-	incq	%rdx		/* Increment destination.  */
-	movb	%ah, (%rdx)	/* 2nd byte.  */
-	testb	%ah, %ah	/* Is it NUL?.  */
-	jz	24f		/* yes, finish.  */
-	incq	%rdx		/* Increment destination.  */
-	shrq	$16, %rax	/* Shift...  */
-	jmp	23b		/* and look at next two bytes in %rax.  */
-
-
-24:
-	movq	%rdi, %rax	/* Source is return value.  */
-	retq
-END (strcat)
+#define STRCAT strcat
+#include "multiarch/strcat-sse2.S"
 libc_hidden_builtin_def (strcat)
-- 
2.34.1
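
The NUL scan in both halves of this file is the classic word-at-a-time trick built around the 0xfefefefefefefeff constant and the carry flag.  A portable scalar sketch of the same idea, using the better-known subtract-and-mask formulation rather than the exact add/jnc sequence from the assembly (illustrative only):

/* Sketch only: true if the 8-byte word V contains a zero byte.  */
static inline int
has_zero_byte (unsigned long long v)
{
  return ((v - 0x0101010101010101ULL) & ~v & 0x8080808080808080ULL) != 0;
}

Once a word is known to contain a NUL, the 3: and 23: tails above finish byte by byte (testb on %cl/%ch or %al/%ah, then a shift to reach the upper half) to find the exact position.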


^ permalink raw reply	[flat|nested] 21+ messages in thread

* [PATCH v1] x86: Move wcschr SSE2 implementation to multiarch/wcschr-sse2.S
  2022-07-12 19:29 [PATCH v1] x86: Move strlen SSE2 implementation to multiarch/strlen-sse2.S Noah Goldstein
                   ` (4 preceding siblings ...)
  2022-07-12 19:29 ` [PATCH v1] x86: Move strcat SSE2 implementation to multiarch/strcat-sse2.S Noah Goldstein
@ 2022-07-12 19:29 ` Noah Goldstein
  2022-07-12 20:55   ` H.J. Lu
  2022-07-12 19:29 ` [PATCH v1] x86: Move wcslen SSE2 implementation to multiarch/wcslen-sse2.S Noah Goldstein
                   ` (3 subsequent siblings)
  9 siblings, 1 reply; 21+ messages in thread
From: Noah Goldstein @ 2022-07-12 19:29 UTC (permalink / raw)
  To: libc-alpha

This commit doesn't affect libc.so.6; it's just housekeeping to prepare
for adding explicit ISA level support.

Tested build on x86_64 and x86_32 with/without multiarch.
---
 sysdeps/x86_64/multiarch/wcschr-sse2.S | 145 +++++++++++++++++++++++--
 sysdeps/x86_64/wcschr.S                | 135 +----------------------
 2 files changed, 138 insertions(+), 142 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/wcschr-sse2.S b/sysdeps/x86_64/multiarch/wcschr-sse2.S
index 218ea609b9..c872926ba9 100644
--- a/sysdeps/x86_64/multiarch/wcschr-sse2.S
+++ b/sysdeps/x86_64/multiarch/wcschr-sse2.S
@@ -17,14 +17,141 @@
    <https://www.gnu.org/licenses/>.  */
 
 #if IS_IN (libc)
-# define __wcschr __wcschr_sse2
-
-# undef weak_alias
-# define weak_alias(__wcschr, wcschr)
-# undef libc_hidden_def
-# define libc_hidden_def(__wcschr)
-# undef libc_hidden_weak
-# define libc_hidden_weak(wcschr)
+# ifndef WCSCHR
+#  define WCSCHR __wcschr_sse2
+# endif
 #endif
 
-#include "../wcschr.S"
+#include <sysdep.h>
+
+	.text
+ENTRY (WCSCHR)
+
+	movd	%rsi, %xmm1
+	pxor	%xmm2, %xmm2
+	mov	%rdi, %rcx
+	punpckldq %xmm1, %xmm1
+	punpckldq %xmm1, %xmm1
+
+	and	$63, %rcx
+	cmp	$48, %rcx
+	ja	L(cross_cache)
+
+	movdqu	(%rdi), %xmm0
+	pcmpeqd	%xmm0, %xmm2
+	add	$16, %rdi
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %rdx
+	pmovmskb %xmm0, %rax
+	or	%rax, %rdx
+	jnz	L(matches)
+
+	and	$-16, %rdi
+
+	movdqa	(%rdi), %xmm0
+	pcmpeqd	%xmm0, %xmm2
+	add	$16, %rdi
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %rdx
+	pmovmskb %xmm0, %rax
+	or	%rax, %rdx
+	jnz	L(matches)
+
+	jmp	L(loop)
+
+L(cross_cache):
+	and	$15, %rcx
+	and	$-16, %rdi
+	movdqa	(%rdi), %xmm0
+	pcmpeqd	%xmm0, %xmm2
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %rdx
+	pmovmskb %xmm0, %rax
+
+	sar	%cl, %rdx
+	sar	%cl, %rax
+	test	%rax, %rax
+	je	L(unaligned_no_match)
+
+	bsf	%rax, %rax
+	test	%rdx, %rdx
+	je	L(unaligned_match)
+	bsf	%rdx, %rdx
+	cmp	%rdx, %rax
+	ja	L(return_null)
+
+L(unaligned_match):
+	add	%rdi, %rax
+	add	%rcx, %rax
+	ret
+
+	.p2align 4
+L(unaligned_no_match):
+	test	%rdx, %rdx
+	jne	L(return_null)
+	pxor	%xmm2, %xmm2
+
+	add	$16, %rdi
+
+	.p2align 4
+/* Loop start on aligned string.  */
+L(loop):
+	movdqa	(%rdi), %xmm0
+	pcmpeqd	%xmm0, %xmm2
+	add	$16, %rdi
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %rdx
+	pmovmskb %xmm0, %rax
+	or	%rax, %rdx
+	jnz	L(matches)
+
+	movdqa	(%rdi), %xmm0
+	pcmpeqd	%xmm0, %xmm2
+	add	$16, %rdi
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %rdx
+	pmovmskb %xmm0, %rax
+	or	%rax, %rdx
+	jnz	L(matches)
+
+	movdqa	(%rdi), %xmm0
+	pcmpeqd	%xmm0, %xmm2
+	add	$16, %rdi
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %rdx
+	pmovmskb %xmm0, %rax
+	or	%rax, %rdx
+	jnz	L(matches)
+
+	movdqa	(%rdi), %xmm0
+	pcmpeqd	%xmm0, %xmm2
+	add	$16, %rdi
+	pcmpeqd	%xmm1, %xmm0
+	pmovmskb %xmm2, %rdx
+	pmovmskb %xmm0, %rax
+	or	%rax, %rdx
+	jnz	L(matches)
+	jmp	L(loop)
+
+	.p2align 4
+L(matches):
+	pmovmskb %xmm2, %rdx
+	test	%rax, %rax
+	jz	L(return_null)
+	bsf	%rax, %rax
+	test	%rdx, %rdx
+	je	L(match)
+	bsf	%rdx, %rcx
+	cmp	%rcx, %rax
+	ja	L(return_null)
+L(match):
+	sub	$16, %rdi
+	add	%rdi, %rax
+	ret
+
+	.p2align 4
+L(return_null):
+	xor	%rax, %rax
+	ret
+
+END (WCSCHR)
diff --git a/sysdeps/x86_64/wcschr.S b/sysdeps/x86_64/wcschr.S
index 2131220382..80b12c4286 100644
--- a/sysdeps/x86_64/wcschr.S
+++ b/sysdeps/x86_64/wcschr.S
@@ -16,140 +16,9 @@
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
-#include <sysdep.h>
-
-	.text
-ENTRY (__wcschr)
-
-	movd	%rsi, %xmm1
-	pxor	%xmm2, %xmm2
-	mov	%rdi, %rcx
-	punpckldq %xmm1, %xmm1
-	punpckldq %xmm1, %xmm1
-
-	and	$63, %rcx
-	cmp	$48, %rcx
-	ja	L(cross_cache)
-
-	movdqu	(%rdi), %xmm0
-	pcmpeqd	%xmm0, %xmm2
-	add	$16, %rdi
-	pcmpeqd	%xmm1, %xmm0
-	pmovmskb %xmm2, %rdx
-	pmovmskb %xmm0, %rax
-	or	%rax, %rdx
-	jnz	L(matches)
-
-	and	$-16, %rdi
-
-	movdqa	(%rdi), %xmm0
-	pcmpeqd	%xmm0, %xmm2
-	add	$16, %rdi
-	pcmpeqd	%xmm1, %xmm0
-	pmovmskb %xmm2, %rdx
-	pmovmskb %xmm0, %rax
-	or	%rax, %rdx
-	jnz	L(matches)
-
-	jmp	L(loop)
-
-L(cross_cache):
-	and	$15, %rcx
-	and	$-16, %rdi
-	movdqa	(%rdi), %xmm0
-	pcmpeqd	%xmm0, %xmm2
-	pcmpeqd	%xmm1, %xmm0
-	pmovmskb %xmm2, %rdx
-	pmovmskb %xmm0, %rax
-
-	sar	%cl, %rdx
-	sar	%cl, %rax
-	test	%rax, %rax
-	je	L(unaligned_no_match)
-
-	bsf	%rax, %rax
-	test	%rdx, %rdx
-	je	L(unaligned_match)
-	bsf	%rdx, %rdx
-	cmp	%rdx, %rax
-	ja	L(return_null)
-
-L(unaligned_match):
-	add	%rdi, %rax
-	add	%rcx, %rax
-	ret
-
-	.p2align 4
-L(unaligned_no_match):
-	test	%rdx, %rdx
-	jne	L(return_null)
-	pxor	%xmm2, %xmm2
-
-	add	$16, %rdi
-
-	.p2align 4
-/* Loop start on aligned string.  */
-L(loop):
-	movdqa	(%rdi), %xmm0
-	pcmpeqd	%xmm0, %xmm2
-	add	$16, %rdi
-	pcmpeqd	%xmm1, %xmm0
-	pmovmskb %xmm2, %rdx
-	pmovmskb %xmm0, %rax
-	or	%rax, %rdx
-	jnz	L(matches)
-
-	movdqa	(%rdi), %xmm0
-	pcmpeqd	%xmm0, %xmm2
-	add	$16, %rdi
-	pcmpeqd	%xmm1, %xmm0
-	pmovmskb %xmm2, %rdx
-	pmovmskb %xmm0, %rax
-	or	%rax, %rdx
-	jnz	L(matches)
-
-	movdqa	(%rdi), %xmm0
-	pcmpeqd	%xmm0, %xmm2
-	add	$16, %rdi
-	pcmpeqd	%xmm1, %xmm0
-	pmovmskb %xmm2, %rdx
-	pmovmskb %xmm0, %rax
-	or	%rax, %rdx
-	jnz	L(matches)
-
-	movdqa	(%rdi), %xmm0
-	pcmpeqd	%xmm0, %xmm2
-	add	$16, %rdi
-	pcmpeqd	%xmm1, %xmm0
-	pmovmskb %xmm2, %rdx
-	pmovmskb %xmm0, %rax
-	or	%rax, %rdx
-	jnz	L(matches)
-	jmp	L(loop)
-
-	.p2align 4
-L(matches):
-	pmovmskb %xmm2, %rdx
-	test	%rax, %rax
-	jz	L(return_null)
-	bsf	%rax, %rax
-	test	%rdx, %rdx
-	je	L(match)
-	bsf	%rdx, %rcx
-	cmp	%rcx, %rax
-	ja	L(return_null)
-L(match):
-	sub	$16, %rdi
-	add	%rdi, %rax
-	ret
-
-	.p2align 4
-L(return_null):
-	xor	%rax, %rax
-	ret
-
-END (__wcschr)
 
+#define WCSCHR	__wcschr
+#include "multiarch/wcschr-sse2.S"
 libc_hidden_def(__wcschr)
 weak_alias (__wcschr, wcschr)
 libc_hidden_weak (wcschr)
-- 
2.34.1
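
A hedged scalar model of the L(matches) logic above may help: the candidate match only counts if no terminator precedes it within the chunk.  The helper name is illustrative and a 4-byte wchar_t (as on this target) is assumed; this is not code from the patch:

#include <stddef.h>

/* Sketch only: MATCH and NUL are the pmovmskb byte masks for "== C"
   and "== L'\0'" over the 16-byte chunk starting at CHUNK.  */
static inline wchar_t *
wcschr_pick (wchar_t *chunk, unsigned int match, unsigned int nul)
{
  if (match == 0)
    return NULL;
  unsigned int first = __builtin_ctz (match);	/* bsf %rax, %rax */
  if (nul != 0 && __builtin_ctz (nul) < first)
    return NULL;				/* terminator comes first */
  return chunk + first / sizeof (wchar_t);	/* byte offset -> element */
}

Allowing the equal case through is what makes wcschr (s, L'\0') return a pointer to the terminator, matching the bsf/cmp/ja sequence in the assembly.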


^ permalink raw reply	[flat|nested] 21+ messages in thread

* [PATCH v1] x86: Move wcslen SSE2 implementation to multiarch/wcslen-sse2.S
  2022-07-12 19:29 [PATCH v1] x86: Move strlen SSE2 implementation to multiarch/strlen-sse2.S Noah Goldstein
                   ` (5 preceding siblings ...)
  2022-07-12 19:29 ` [PATCH v1] x86: Move wcschr SSE2 implementation to multiarch/wcschr-sse2.S Noah Goldstein
@ 2022-07-12 19:29 ` Noah Goldstein
  2022-07-12 20:26   ` H.J. Lu
  2022-07-12 19:29 ` [PATCH v1] x86: Remove unneeded rtld-wmemcmp Noah Goldstein
                   ` (2 subsequent siblings)
  9 siblings, 1 reply; 21+ messages in thread
From: Noah Goldstein @ 2022-07-12 19:29 UTC (permalink / raw)
  To: libc-alpha

This commit doesn't affect libc.so.6; it's just housekeeping to prepare
for adding explicit ISA level support.

Tested build on x86_64 and x86_32 with/without multiarch.
---
 sysdeps/x86_64/multiarch/wcslen-sse2.S | 221 ++++++++++++++++++++++++-
 sysdeps/x86_64/wcslen.S                | 216 +-----------------------
 2 files changed, 218 insertions(+), 219 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/wcslen-sse2.S b/sysdeps/x86_64/multiarch/wcslen-sse2.S
index 2b3a9efd64..944c3bd9c6 100644
--- a/sysdeps/x86_64/multiarch/wcslen-sse2.S
+++ b/sysdeps/x86_64/multiarch/wcslen-sse2.S
@@ -17,10 +17,221 @@
    <https://www.gnu.org/licenses/>.  */
 
 #if IS_IN (libc)
-# define __wcslen __wcslen_sse2
-
-# undef weak_alias
-# define weak_alias(__wcslen, wcslen)
+# ifndef WCSLEN
+#  define WCSLEN __wcslen_sse2
+# endif
 #endif
 
-#include "../wcslen.S"
+#include <sysdep.h>
+
+	.text
+ENTRY (WCSLEN)
+	cmpl	$0, (%rdi)
+	jz	L(exit_tail0)
+	cmpl	$0, 4(%rdi)
+	jz	L(exit_tail1)
+	cmpl	$0, 8(%rdi)
+	jz	L(exit_tail2)
+	cmpl	$0, 12(%rdi)
+	jz	L(exit_tail3)
+	cmpl	$0, 16(%rdi)
+	jz	L(exit_tail4)
+	cmpl	$0, 20(%rdi)
+	jz	L(exit_tail5)
+	cmpl	$0, 24(%rdi)
+	jz	L(exit_tail6)
+	cmpl	$0, 28(%rdi)
+	jz	L(exit_tail7)
+
+	pxor	%xmm0, %xmm0
+
+	lea	32(%rdi), %rax
+	addq	$16, %rdi
+	and	$-16, %rax
+
+	pcmpeqd	(%rax), %xmm0
+	pmovmskb %xmm0, %edx
+	pxor	%xmm1, %xmm1
+	addq	$16, %rax
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pcmpeqd	(%rax), %xmm1
+	pmovmskb %xmm1, %edx
+	pxor	%xmm2, %xmm2
+	addq	$16, %rax
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pcmpeqd	(%rax), %xmm2
+	pmovmskb %xmm2, %edx
+	pxor	%xmm3, %xmm3
+	addq	$16, %rax
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pcmpeqd	(%rax), %xmm3
+	pmovmskb %xmm3, %edx
+	addq	$16, %rax
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pcmpeqd	(%rax), %xmm0
+	pmovmskb %xmm0, %edx
+	addq	$16, %rax
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pcmpeqd	(%rax), %xmm1
+	pmovmskb %xmm1, %edx
+	addq	$16, %rax
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pcmpeqd	(%rax), %xmm2
+	pmovmskb %xmm2, %edx
+	addq	$16, %rax
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pcmpeqd	(%rax), %xmm3
+	pmovmskb %xmm3, %edx
+	addq	$16, %rax
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pcmpeqd	(%rax), %xmm0
+	pmovmskb %xmm0, %edx
+	addq	$16, %rax
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pcmpeqd	(%rax), %xmm1
+	pmovmskb %xmm1, %edx
+	addq	$16, %rax
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pcmpeqd	(%rax), %xmm2
+	pmovmskb %xmm2, %edx
+	addq	$16, %rax
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pcmpeqd	(%rax), %xmm3
+	pmovmskb %xmm3, %edx
+	addq	$16, %rax
+	test	%edx, %edx
+	jnz	L(exit)
+
+	and	$-0x40, %rax
+
+	.p2align 4
+L(aligned_64_loop):
+	movaps	(%rax), %xmm0
+	movaps	16(%rax), %xmm1
+	movaps	32(%rax), %xmm2
+	movaps	48(%rax), %xmm6
+
+	pminub	%xmm1, %xmm0
+	pminub	%xmm6, %xmm2
+	pminub	%xmm0, %xmm2
+	pcmpeqd	%xmm3, %xmm2
+	pmovmskb %xmm2, %edx
+	addq	$64, %rax
+	test	%edx, %edx
+	jz	L(aligned_64_loop)
+
+	pcmpeqd	-64(%rax), %xmm3
+	pmovmskb %xmm3, %edx
+	addq	$48, %rdi
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pcmpeqd	%xmm1, %xmm3
+	pmovmskb %xmm3, %edx
+	addq	$-16, %rdi
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pcmpeqd	-32(%rax), %xmm3
+	pmovmskb %xmm3, %edx
+	addq	$-16, %rdi
+	test	%edx, %edx
+	jnz	L(exit)
+
+	pcmpeqd	%xmm6, %xmm3
+	pmovmskb %xmm3, %edx
+	addq	$-16, %rdi
+	test	%edx, %edx
+	jz	L(aligned_64_loop)
+
+	.p2align 4
+L(exit):
+	sub	%rdi, %rax
+	shr	$2, %rax
+	test	%dl, %dl
+	jz	L(exit_high)
+
+	andl	$15, %edx
+	jz	L(exit_1)
+	ret
+
+	/* No align here. Naturally aligned % 16 == 1.  */
+L(exit_high):
+	andl	$(15 << 8), %edx
+	jz	L(exit_3)
+	add	$2, %rax
+	ret
+
+	.p2align 3
+L(exit_1):
+	add	$1, %rax
+	ret
+
+	.p2align 3
+L(exit_3):
+	add	$3, %rax
+	ret
+
+	.p2align 3
+L(exit_tail0):
+	xorl	%eax, %eax
+	ret
+
+	.p2align 3
+L(exit_tail1):
+	movl	$1, %eax
+	ret
+
+	.p2align 3
+L(exit_tail2):
+	movl	$2, %eax
+	ret
+
+	.p2align 3
+L(exit_tail3):
+	movl	$3, %eax
+	ret
+
+	.p2align 3
+L(exit_tail4):
+	movl	$4, %eax
+	ret
+
+	.p2align 3
+L(exit_tail5):
+	movl	$5, %eax
+	ret
+
+	.p2align 3
+L(exit_tail6):
+	movl	$6, %eax
+	ret
+
+	.p2align 3
+L(exit_tail7):
+	movl	$7, %eax
+	ret
+
+END (WCSLEN)
diff --git a/sysdeps/x86_64/wcslen.S b/sysdeps/x86_64/wcslen.S
index d641141d75..588a0fbe01 100644
--- a/sysdeps/x86_64/wcslen.S
+++ b/sysdeps/x86_64/wcslen.S
@@ -16,218 +16,6 @@
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
-#include <sysdep.h>
-
-	.text
-ENTRY (__wcslen)
-	cmpl	$0, (%rdi)
-	jz	L(exit_tail0)
-	cmpl	$0, 4(%rdi)
-	jz	L(exit_tail1)
-	cmpl	$0, 8(%rdi)
-	jz	L(exit_tail2)
-	cmpl	$0, 12(%rdi)
-	jz	L(exit_tail3)
-	cmpl	$0, 16(%rdi)
-	jz	L(exit_tail4)
-	cmpl	$0, 20(%rdi)
-	jz	L(exit_tail5)
-	cmpl	$0, 24(%rdi)
-	jz	L(exit_tail6)
-	cmpl	$0, 28(%rdi)
-	jz	L(exit_tail7)
-
-	pxor	%xmm0, %xmm0
-
-	lea	32(%rdi), %rax
-	addq	$16, %rdi
-	and	$-16, %rax
-
-	pcmpeqd	(%rax), %xmm0
-	pmovmskb %xmm0, %edx
-	pxor	%xmm1, %xmm1
-	addq	$16, %rax
-	test	%edx, %edx
-	jnz	L(exit)
-
-	pcmpeqd	(%rax), %xmm1
-	pmovmskb %xmm1, %edx
-	pxor	%xmm2, %xmm2
-	addq	$16, %rax
-	test	%edx, %edx
-	jnz	L(exit)
-
-	pcmpeqd	(%rax), %xmm2
-	pmovmskb %xmm2, %edx
-	pxor	%xmm3, %xmm3
-	addq	$16, %rax
-	test	%edx, %edx
-	jnz	L(exit)
-
-	pcmpeqd	(%rax), %xmm3
-	pmovmskb %xmm3, %edx
-	addq	$16, %rax
-	test	%edx, %edx
-	jnz	L(exit)
-
-	pcmpeqd	(%rax), %xmm0
-	pmovmskb %xmm0, %edx
-	addq	$16, %rax
-	test	%edx, %edx
-	jnz	L(exit)
-
-	pcmpeqd	(%rax), %xmm1
-	pmovmskb %xmm1, %edx
-	addq	$16, %rax
-	test	%edx, %edx
-	jnz	L(exit)
-
-	pcmpeqd	(%rax), %xmm2
-	pmovmskb %xmm2, %edx
-	addq	$16, %rax
-	test	%edx, %edx
-	jnz	L(exit)
-
-	pcmpeqd	(%rax), %xmm3
-	pmovmskb %xmm3, %edx
-	addq	$16, %rax
-	test	%edx, %edx
-	jnz	L(exit)
-
-	pcmpeqd	(%rax), %xmm0
-	pmovmskb %xmm0, %edx
-	addq	$16, %rax
-	test	%edx, %edx
-	jnz	L(exit)
-
-	pcmpeqd	(%rax), %xmm1
-	pmovmskb %xmm1, %edx
-	addq	$16, %rax
-	test	%edx, %edx
-	jnz	L(exit)
-
-	pcmpeqd	(%rax), %xmm2
-	pmovmskb %xmm2, %edx
-	addq	$16, %rax
-	test	%edx, %edx
-	jnz	L(exit)
-
-	pcmpeqd	(%rax), %xmm3
-	pmovmskb %xmm3, %edx
-	addq	$16, %rax
-	test	%edx, %edx
-	jnz	L(exit)
-
-	and	$-0x40, %rax
-
-	.p2align 4
-L(aligned_64_loop):
-	movaps	(%rax), %xmm0
-	movaps	16(%rax), %xmm1
-	movaps	32(%rax), %xmm2
-	movaps	48(%rax), %xmm6
-
-	pminub	%xmm1, %xmm0
-	pminub	%xmm6, %xmm2
-	pminub	%xmm0, %xmm2
-	pcmpeqd	%xmm3, %xmm2
-	pmovmskb %xmm2, %edx
-	addq	$64, %rax
-	test	%edx, %edx
-	jz	L(aligned_64_loop)
-
-	pcmpeqd	-64(%rax), %xmm3
-	pmovmskb %xmm3, %edx
-    addq	$48, %rdi
-	test	%edx, %edx
-	jnz	L(exit)
-
-	pcmpeqd	%xmm1, %xmm3
-	pmovmskb %xmm3, %edx
-    addq	$-16, %rdi
-	test	%edx, %edx
-	jnz	L(exit)
-
-	pcmpeqd	-32(%rax), %xmm3
-	pmovmskb %xmm3, %edx
-    addq	$-16, %rdi
-	test	%edx, %edx
-	jnz	L(exit)
-
-	pcmpeqd	%xmm6, %xmm3
-	pmovmskb %xmm3, %edx
-    addq	$-16, %rdi
-	test	%edx, %edx
-	jz	L(aligned_64_loop)
-
-	.p2align 4
-L(exit):
-	sub	%rdi, %rax
-	shr	$2, %rax
-	test	%dl, %dl
-	jz	L(exit_high)
-
-	andl	$15, %edx
-	jz	L(exit_1)
-	ret
-
-	/* No align here. Naturally aligned % 16 == 1.  */
-L(exit_high):
-	andl	$(15 << 8), %edx
-	jz	L(exit_3)
-	add	$2, %rax
-	ret
-
-	.p2align 3
-L(exit_1):
-	add	$1, %rax
-	ret
-
-	.p2align 3
-L(exit_3):
-	add	$3, %rax
-	ret
-
-	.p2align 3
-L(exit_tail0):
-	xorl	%eax, %eax
-	ret
-
-	.p2align 3
-L(exit_tail1):
-	movl	$1, %eax
-	ret
-
-	.p2align 3
-L(exit_tail2):
-	movl	$2, %eax
-	ret
-
-	.p2align 3
-L(exit_tail3):
-	movl	$3, %eax
-	ret
-
-	.p2align 3
-L(exit_tail4):
-	movl	$4, %eax
-	ret
-
-	.p2align 3
-L(exit_tail5):
-	movl	$5, %eax
-	ret
-
-	.p2align 3
-L(exit_tail6):
-	movl	$6, %eax
-	ret
-
-	.p2align 3
-L(exit_tail7):
-	movl	$7, %eax
-	ret
-
-END (__wcslen)
-
+#define WCSLEN	__wcslen
+#include "multiarch/wcslen-sse2.S"
 weak_alias(__wcslen, wcslen)
-- 
2.34.1
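
The exit paths above (the %dl test plus the exit_1/exit_high/exit_3 fixups) only decode which of the four wchar_t slots in the final 16-byte chunk held the terminator.  A hedged C equivalent, assuming the 4-byte wchar_t of this target (illustrative only, not code from the patch):

/* Sketch only: index (0..3) of the first zero wchar_t within a
   16-byte chunk, given its pmovmskb mask.  */
static inline unsigned int
zero_slot (unsigned int mask)
{
  return __builtin_ctz (mask) / 4;	/* 4 bytes per wchar_t */
}

The %rdi adjustments sprinkled through the unrolled code exist so that `sub %rdi, %rax; shr $2` already yields the element count up to the start of that chunk; the slot index is what the exit paths then add on top.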


^ permalink raw reply	[flat|nested] 21+ messages in thread

* [PATCH v1] x86: Remove unneeded rtld-wmemcmp
  2022-07-12 19:29 [PATCH v1] x86: Move strlen SSE2 implementation to multiarch/strlen-sse2.S Noah Goldstein
                   ` (6 preceding siblings ...)
  2022-07-12 19:29 ` [PATCH v1] x86: Move wcslen SSE2 implementation to multiarch/wcslen-sse2.S Noah Goldstein
@ 2022-07-12 19:29 ` Noah Goldstein
  2022-07-12 19:44   ` H.J. Lu
  2022-07-12 19:29 ` [PATCH v1] x86: Add missing rtm tests for strcmp family Noah Goldstein
  2022-07-12 23:29 ` [PATCH v1] x86: Move strlen SSE2 implementation to multiarch/strlen-sse2.S H.J. Lu
  9 siblings, 1 reply; 21+ messages in thread
From: Noah Goldstein @ 2022-07-12 19:29 UTC (permalink / raw)
  To: libc-alpha

wmemcmp isn't used by the dynamic loader, so there's no need to add an
RTLD stub for it.

Tested with and without multiarch on x86_64 for ISA levels:
{generic, x86-64-v2, x86-64-v3, x86-64-v4}

And m32 with and without multiarch.
---
 sysdeps/x86_64/multiarch/rtld-wmemcmp.S | 18 ------------------
 1 file changed, 18 deletions(-)
 delete mode 100644 sysdeps/x86_64/multiarch/rtld-wmemcmp.S

diff --git a/sysdeps/x86_64/multiarch/rtld-wmemcmp.S b/sysdeps/x86_64/multiarch/rtld-wmemcmp.S
deleted file mode 100644
index 71a6f0affa..0000000000
--- a/sysdeps/x86_64/multiarch/rtld-wmemcmp.S
+++ /dev/null
@@ -1,18 +0,0 @@
-/* Copyright (C) 2022 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#include "../wmemcmp.S"
-- 
2.34.1


^ permalink raw reply	[flat|nested] 21+ messages in thread

* [PATCH v1] x86: Add missing rtm tests for strcmp family
  2022-07-12 19:29 [PATCH v1] x86: Move strlen SSE2 implementation to multiarch/strlen-sse2.S Noah Goldstein
                   ` (7 preceding siblings ...)
  2022-07-12 19:29 ` [PATCH v1] x86: Remove unneeded rtld-wmemcmp Noah Goldstein
@ 2022-07-12 19:29 ` Noah Goldstein
  2022-07-12 19:59   ` H.J. Lu
  2022-07-12 23:29 ` [PATCH v1] x86: Move strlen SSE2 implementation to multiarch/strlen-sse2.S H.J. Lu
  9 siblings, 1 reply; 21+ messages in thread
From: Noah Goldstein @ 2022-07-12 19:29 UTC (permalink / raw)
  To: libc-alpha

Add new tests for:
    strcasecmp
    strncasecmp
    strcmp
    wcscmp

These functions all have avx2_rtm implementations, so they should be tested.
---
 sysdeps/x86/Makefile              |  8 ++++
 sysdeps/x86/tst-strcasecmp-rtm.c  | 23 ++++++++++
 sysdeps/x86/tst-strcmp-rtm.c      | 70 +++++++++++++++++++++++++++++++
 sysdeps/x86/tst-strncasecmp-rtm.c | 23 ++++++++++
 sysdeps/x86/tst-strncmp-rtm.c     |  6 ++-
 sysdeps/x86/tst-wcscmp-rtm.c      | 22 ++++++++++
 6 files changed, 150 insertions(+), 2 deletions(-)
 create mode 100644 sysdeps/x86/tst-strcasecmp-rtm.c
 create mode 100644 sysdeps/x86/tst-strcmp-rtm.c
 create mode 100644 sysdeps/x86/tst-strncasecmp-rtm.c
 create mode 100644 sysdeps/x86/tst-wcscmp-rtm.c

diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile
index c6bee981f8..56fd5fc805 100644
--- a/sysdeps/x86/Makefile
+++ b/sysdeps/x86/Makefile
@@ -95,11 +95,15 @@ tests += \
   tst-memmove-rtm \
   tst-memrchr-rtm \
   tst-memset-rtm \
+  tst-strcasecmp-rtm \
   tst-strchr-rtm \
+  tst-strcmp-rtm \
   tst-strcpy-rtm \
   tst-strlen-rtm \
+  tst-strncasecmp-rtm \
   tst-strncmp-rtm \
   tst-strrchr-rtm \
+  tst-wcscmp-rtm \
   tst-wcsncmp-rtm \
 # tests
 
@@ -108,11 +112,15 @@ CFLAGS-tst-memcmp-rtm.c += -mrtm
 CFLAGS-tst-memmove-rtm.c += -mrtm
 CFLAGS-tst-memrchr-rtm.c += -mrtm
 CFLAGS-tst-memset-rtm.c += -mrtm
+CFLAGS-tst-strcasecmp-rtm.c += -mrtm
 CFLAGS-tst-strchr-rtm.c += -mrtm
+CFLAGS-tst-strcmp-rtm.c += -mrtm
 CFLAGS-tst-strcpy-rtm.c += -mrtm
 CFLAGS-tst-strlen-rtm.c += -mrtm
+CFLAGS-tst-strncasecmp-rtm.c += -mrtm -Wno-error
 CFLAGS-tst-strncmp-rtm.c += -mrtm -Wno-error
 CFLAGS-tst-strrchr-rtm.c += -mrtm
+CFLAGS-tst-wcscmp-rtm.c += -mrtm
 CFLAGS-tst-wcsncmp-rtm.c += -mrtm -Wno-error
 endif
 
diff --git a/sysdeps/x86/tst-strcasecmp-rtm.c b/sysdeps/x86/tst-strcasecmp-rtm.c
new file mode 100644
index 0000000000..da460799ce
--- /dev/null
+++ b/sysdeps/x86/tst-strcasecmp-rtm.c
@@ -0,0 +1,23 @@
+/* Test case for strcasecmp inside a transactionally executing RTM
+   region.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define STRCMP strcasecmp
+#define TEST_NAME "strcasecmp"
+
+#include "tst-strcmp-rtm.c"
diff --git a/sysdeps/x86/tst-strcmp-rtm.c b/sysdeps/x86/tst-strcmp-rtm.c
new file mode 100644
index 0000000000..371916a2f0
--- /dev/null
+++ b/sysdeps/x86/tst-strcmp-rtm.c
@@ -0,0 +1,70 @@
+/* Test case for strcmp inside a transactionally executing RTM
+   region.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <stdint.h>
+#include <tst-string-rtm.h>
+
+#ifdef WIDE
+# define CHAR wchar_t
+# define MEMSET wmemset
+# define STRCMP wcscmp
+# define TEST_NAME "wcscmp"
+#else /* !WIDE */
+# define CHAR char
+# define MEMSET memset
+
+# ifndef STRCMP
+#  define STRCMP strcmp
+#  define TEST_NAME "strcmp"
+# endif
+#endif
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+CHAR string1[STRING_SIZE];
+CHAR string2[STRING_SIZE];
+
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)
+{
+  MEMSET (string1, 'a', STRING_SIZE - 1);
+  MEMSET (string2, 'a', STRING_SIZE - 1);
+  if (STRCMP (string1, string2) == 0)
+    return EXIT_SUCCESS;
+  else
+    return EXIT_FAILURE;
+}
+
+__attribute__ ((noinline, noclone))
+static int
+function (void)
+{
+  if (STRCMP (string1, string2) == 0)
+    return 0;
+  else
+    return 1;
+}
+
+
+static int
+do_test (void)
+{
+  return do_test_1 (TEST_NAME, LOOP, prepare, function);
+}
diff --git a/sysdeps/x86/tst-strncasecmp-rtm.c b/sysdeps/x86/tst-strncasecmp-rtm.c
new file mode 100644
index 0000000000..4ebe58951b
--- /dev/null
+++ b/sysdeps/x86/tst-strncasecmp-rtm.c
@@ -0,0 +1,23 @@
+/* Test case for strncasecmp inside a transactionally executing RTM
+   region.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define STRNCMP strncasecmp
+#define TEST_NAME "strncasecmp"
+
+#include "tst-strncmp-rtm.c"
diff --git a/sysdeps/x86/tst-strncmp-rtm.c b/sysdeps/x86/tst-strncmp-rtm.c
index a3b14e72ff..2d27b20a68 100644
--- a/sysdeps/x86/tst-strncmp-rtm.c
+++ b/sysdeps/x86/tst-strncmp-rtm.c
@@ -27,8 +27,10 @@
 #else /* !WIDE */
 # define CHAR char
 # define MEMSET memset
-# define STRNCMP strncmp
-# define TEST_NAME "strncmp"
+# ifndef STRNCMP
+#  define STRNCMP strncmp
+#  define TEST_NAME "strncmp"
+# endif
 #endif /* !WIDE */
 
 
diff --git a/sysdeps/x86/tst-wcscmp-rtm.c b/sysdeps/x86/tst-wcscmp-rtm.c
new file mode 100644
index 0000000000..28a5b4b82d
--- /dev/null
+++ b/sysdeps/x86/tst-wcscmp-rtm.c
@@ -0,0 +1,22 @@
+/* Test case for wcscmp inside a transactionally executing RTM
+   region.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define WIDE 1
+#include <wchar.h>
+#include "tst-strcmp-rtm.c"
-- 
2.34.1


^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v1] x86: Remove unneeded rtld-wmemcmp
  2022-07-12 19:29 ` [PATCH v1] x86: Remove unneeded rtld-wmemcmp Noah Goldstein
@ 2022-07-12 19:44   ` H.J. Lu
  0 siblings, 0 replies; 21+ messages in thread
From: H.J. Lu @ 2022-07-12 19:44 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell

On Tue, Jul 12, 2022 at 12:29 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> wmemcmp isn't used by the dynamic loader, so there's no need to add an
> RTLD stub for it.
>
> Tested with and without multiarch on x86_64 for ISA levels:
> {generic, x86-64-v2, x86-64-v3, x86-64-v4}
>
> And m32 with and without multiarch.
> ---
>  sysdeps/x86_64/multiarch/rtld-wmemcmp.S | 18 ------------------
>  1 file changed, 18 deletions(-)
>  delete mode 100644 sysdeps/x86_64/multiarch/rtld-wmemcmp.S
>
> diff --git a/sysdeps/x86_64/multiarch/rtld-wmemcmp.S b/sysdeps/x86_64/multiarch/rtld-wmemcmp.S
> deleted file mode 100644
> index 71a6f0affa..0000000000
> --- a/sysdeps/x86_64/multiarch/rtld-wmemcmp.S
> +++ /dev/null
> @@ -1,18 +0,0 @@
> -/* Copyright (C) 2022 Free Software Foundation, Inc.
> -   This file is part of the GNU C Library.
> -
> -   The GNU C Library is free software; you can redistribute it and/or
> -   modify it under the terms of the GNU Lesser General Public
> -   License as published by the Free Software Foundation; either
> -   version 2.1 of the License, or (at your option) any later version.
> -
> -   The GNU C Library is distributed in the hope that it will be useful,
> -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> -   Lesser General Public License for more details.
> -
> -   You should have received a copy of the GNU Lesser General Public
> -   License along with the GNU C Library; if not, see
> -   <https://www.gnu.org/licenses/>.  */
> -
> -#include "../wmemcmp.S"
> --
> 2.34.1
>

LGTM.

Thanks.

-- 
H.J.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v1] x86: Add missing rtm tests for strcmp family
  2022-07-12 19:29 ` [PATCH v1] x86: Add missing rtm tests for strcmp family Noah Goldstein
@ 2022-07-12 19:59   ` H.J. Lu
  0 siblings, 0 replies; 21+ messages in thread
From: H.J. Lu @ 2022-07-12 19:59 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell

On Tue, Jul 12, 2022 at 12:29 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Add new tests for:
>     strcasecmp
>     strncasecmp
>     strcmp
>     wcscmp
>
> These functions all have avx2_rtm implementations, so they should be tested.
> ---
>  sysdeps/x86/Makefile              |  8 ++++
>  sysdeps/x86/tst-strcasecmp-rtm.c  | 23 ++++++++++
>  sysdeps/x86/tst-strcmp-rtm.c      | 70 +++++++++++++++++++++++++++++++
>  sysdeps/x86/tst-strncasecmp-rtm.c | 23 ++++++++++
>  sysdeps/x86/tst-strncmp-rtm.c     |  6 ++-
>  sysdeps/x86/tst-wcscmp-rtm.c      | 22 ++++++++++
>  6 files changed, 150 insertions(+), 2 deletions(-)
>  create mode 100644 sysdeps/x86/tst-strcasecmp-rtm.c
>  create mode 100644 sysdeps/x86/tst-strcmp-rtm.c
>  create mode 100644 sysdeps/x86/tst-strncasecmp-rtm.c
>  create mode 100644 sysdeps/x86/tst-wcscmp-rtm.c
>
> diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile
> index c6bee981f8..56fd5fc805 100644
> --- a/sysdeps/x86/Makefile
> +++ b/sysdeps/x86/Makefile
> @@ -95,11 +95,15 @@ tests += \
>    tst-memmove-rtm \
>    tst-memrchr-rtm \
>    tst-memset-rtm \
> +  tst-strcasecmp-rtm \
>    tst-strchr-rtm \
> +  tst-strcmp-rtm \
>    tst-strcpy-rtm \
>    tst-strlen-rtm \
> +  tst-strncasecmp-rtm \
>    tst-strncmp-rtm \
>    tst-strrchr-rtm \
> +  tst-wcscmp-rtm \
>    tst-wcsncmp-rtm \
>  # tests
>
> @@ -108,11 +112,15 @@ CFLAGS-tst-memcmp-rtm.c += -mrtm
>  CFLAGS-tst-memmove-rtm.c += -mrtm
>  CFLAGS-tst-memrchr-rtm.c += -mrtm
>  CFLAGS-tst-memset-rtm.c += -mrtm
> +CFLAGS-tst-strcasecmp-rtm.c += -mrtm
>  CFLAGS-tst-strchr-rtm.c += -mrtm
> +CFLAGS-tst-strcmp-rtm.c += -mrtm
>  CFLAGS-tst-strcpy-rtm.c += -mrtm
>  CFLAGS-tst-strlen-rtm.c += -mrtm
> +CFLAGS-tst-strncasecmp-rtm.c += -mrtm -Wno-error
>  CFLAGS-tst-strncmp-rtm.c += -mrtm -Wno-error
>  CFLAGS-tst-strrchr-rtm.c += -mrtm
> +CFLAGS-tst-wcscmp-rtm.c += -mrtm
>  CFLAGS-tst-wcsncmp-rtm.c += -mrtm -Wno-error
>  endif
>
> diff --git a/sysdeps/x86/tst-strcasecmp-rtm.c b/sysdeps/x86/tst-strcasecmp-rtm.c
> new file mode 100644
> index 0000000000..da460799ce
> --- /dev/null
> +++ b/sysdeps/x86/tst-strcasecmp-rtm.c
> @@ -0,0 +1,23 @@
> +/* Test case for strcasecmp inside a transactionally executing RTM
> +   region.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#define STRCMP strcasecmp
> +#define TEST_NAME "strcasecmp"
> +
> +#include "tst-strcmp-rtm.c"
> diff --git a/sysdeps/x86/tst-strcmp-rtm.c b/sysdeps/x86/tst-strcmp-rtm.c
> new file mode 100644
> index 0000000000..371916a2f0
> --- /dev/null
> +++ b/sysdeps/x86/tst-strcmp-rtm.c
> @@ -0,0 +1,70 @@
> +/* Test case for strcmp inside a transactionally executing RTM
> +   region.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <stdint.h>
> +#include <tst-string-rtm.h>
> +
> +#ifdef WIDE
> +# define CHAR wchar_t
> +# define MEMSET wmemset
> +# define STRCMP wcscmp
> +# define TEST_NAME "wcscmp"
> +#else /* !WIDE */
> +# define CHAR char
> +# define MEMSET memset
> +
> +# ifndef STRCMP
> +#  define STRCMP strcmp
> +#  define TEST_NAME "strcmp"
> +# endif
> +#endif
> +
> +#define LOOP 3000
> +#define STRING_SIZE 1024
> +CHAR string1[STRING_SIZE];
> +CHAR string2[STRING_SIZE];
> +
> +__attribute__ ((noinline, noclone))
> +static int
> +prepare (void)
> +{
> +  MEMSET (string1, 'a', STRING_SIZE - 1);
> +  MEMSET (string2, 'a', STRING_SIZE - 1);
> +  if (STRCMP (string1, string2) == 0)
> +    return EXIT_SUCCESS;
> +  else
> +    return EXIT_FAILURE;
> +}
> +
> +__attribute__ ((noinline, noclone))
> +static int
> +function (void)
> +{
> +  if (STRCMP (string1, string2) == 0)
> +    return 0;
> +  else
> +    return 1;
> +}
> +
> +
> +static int
> +do_test (void)
> +{
> +  return do_test_1 (TEST_NAME, LOOP, prepare, function);
> +}
> diff --git a/sysdeps/x86/tst-strncasecmp-rtm.c b/sysdeps/x86/tst-strncasecmp-rtm.c
> new file mode 100644
> index 0000000000..4ebe58951b
> --- /dev/null
> +++ b/sysdeps/x86/tst-strncasecmp-rtm.c
> @@ -0,0 +1,23 @@
> +/* Test case for strncasecmp inside a transactionally executing RTM
> +   region.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#define STRNCMP strncasecmp
> +#define TEST_NAME "strncasecmp"
> +
> +#include "tst-strncmp-rtm.c"
> diff --git a/sysdeps/x86/tst-strncmp-rtm.c b/sysdeps/x86/tst-strncmp-rtm.c
> index a3b14e72ff..2d27b20a68 100644
> --- a/sysdeps/x86/tst-strncmp-rtm.c
> +++ b/sysdeps/x86/tst-strncmp-rtm.c
> @@ -27,8 +27,10 @@
>  #else /* !WIDE */
>  # define CHAR char
>  # define MEMSET memset
> -# define STRNCMP strncmp
> -# define TEST_NAME "strncmp"
> +# ifndef STRNCMP
> +#  define STRNCMP strncmp
> +#  define TEST_NAME "strncmp"
> +# endif
>  #endif /* !WIDE */
>
>
> diff --git a/sysdeps/x86/tst-wcscmp-rtm.c b/sysdeps/x86/tst-wcscmp-rtm.c
> new file mode 100644
> index 0000000000..28a5b4b82d
> --- /dev/null
> +++ b/sysdeps/x86/tst-wcscmp-rtm.c
> @@ -0,0 +1,22 @@
> +/* Test case for wcscmp inside a transactionally executing RTM
> +   region.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#define WIDE 1
> +#include <wchar.h>
> +#include "tst-strcmp-rtm.c"
> --
> 2.34.1
>

LGTM.

Thanks.

-- 
H.J.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v1] x86: Move wcslen SSE2 implementation to multiarch/wcslen-sse2.S
  2022-07-12 19:29 ` [PATCH v1] x86: Move wcslen SSE2 implementation to multiarch/wcslen-sse2.S Noah Goldstein
@ 2022-07-12 20:26   ` H.J. Lu
  0 siblings, 0 replies; 21+ messages in thread
From: H.J. Lu @ 2022-07-12 20:26 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell

On Tue, Jul 12, 2022 at 12:29 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> This commit doesn't affect libc.so.6; it's just housekeeping to prepare
> for adding explicit ISA level support.
>
> Tested build on x86_64 and x86_32 with/without multiarch.
> ---
>  sysdeps/x86_64/multiarch/wcslen-sse2.S | 221 ++++++++++++++++++++++++-
>  sysdeps/x86_64/wcslen.S                | 216 +-----------------------
>  2 files changed, 218 insertions(+), 219 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/wcslen-sse2.S b/sysdeps/x86_64/multiarch/wcslen-sse2.S
> index 2b3a9efd64..944c3bd9c6 100644
> --- a/sysdeps/x86_64/multiarch/wcslen-sse2.S
> +++ b/sysdeps/x86_64/multiarch/wcslen-sse2.S
> @@ -17,10 +17,221 @@
>     <https://www.gnu.org/licenses/>.  */
>
>  #if IS_IN (libc)
> -# define __wcslen __wcslen_sse2
> -
> -# undef weak_alias
> -# define weak_alias(__wcslen, wcslen)
> +# ifndef WCSLEN
> +#  define WCSLEN __wcslen_sse2
> +# endif
>  #endif
>
> -#include "../wcslen.S"
> +#include <sysdep.h>
> +
> +       .text
> +ENTRY (WCSLEN)
> +       cmpl    $0, (%rdi)
> +       jz      L(exit_tail0)
> +       cmpl    $0, 4(%rdi)
> +       jz      L(exit_tail1)
> +       cmpl    $0, 8(%rdi)
> +       jz      L(exit_tail2)
> +       cmpl    $0, 12(%rdi)
> +       jz      L(exit_tail3)
> +       cmpl    $0, 16(%rdi)
> +       jz      L(exit_tail4)
> +       cmpl    $0, 20(%rdi)
> +       jz      L(exit_tail5)
> +       cmpl    $0, 24(%rdi)
> +       jz      L(exit_tail6)
> +       cmpl    $0, 28(%rdi)
> +       jz      L(exit_tail7)
> +
> +       pxor    %xmm0, %xmm0
> +
> +       lea     32(%rdi), %rax
> +       addq    $16, %rdi
> +       and     $-16, %rax
> +
> +       pcmpeqd (%rax), %xmm0
> +       pmovmskb %xmm0, %edx
> +       pxor    %xmm1, %xmm1
> +       addq    $16, %rax
> +       test    %edx, %edx
> +       jnz     L(exit)
> +
> +       pcmpeqd (%rax), %xmm1
> +       pmovmskb %xmm1, %edx
> +       pxor    %xmm2, %xmm2
> +       addq    $16, %rax
> +       test    %edx, %edx
> +       jnz     L(exit)
> +
> +       pcmpeqd (%rax), %xmm2
> +       pmovmskb %xmm2, %edx
> +       pxor    %xmm3, %xmm3
> +       addq    $16, %rax
> +       test    %edx, %edx
> +       jnz     L(exit)
> +
> +       pcmpeqd (%rax), %xmm3
> +       pmovmskb %xmm3, %edx
> +       addq    $16, %rax
> +       test    %edx, %edx
> +       jnz     L(exit)
> +
> +       pcmpeqd (%rax), %xmm0
> +       pmovmskb %xmm0, %edx
> +       addq    $16, %rax
> +       test    %edx, %edx
> +       jnz     L(exit)
> +
> +       pcmpeqd (%rax), %xmm1
> +       pmovmskb %xmm1, %edx
> +       addq    $16, %rax
> +       test    %edx, %edx
> +       jnz     L(exit)
> +
> +       pcmpeqd (%rax), %xmm2
> +       pmovmskb %xmm2, %edx
> +       addq    $16, %rax
> +       test    %edx, %edx
> +       jnz     L(exit)
> +
> +       pcmpeqd (%rax), %xmm3
> +       pmovmskb %xmm3, %edx
> +       addq    $16, %rax
> +       test    %edx, %edx
> +       jnz     L(exit)
> +
> +       pcmpeqd (%rax), %xmm0
> +       pmovmskb %xmm0, %edx
> +       addq    $16, %rax
> +       test    %edx, %edx
> +       jnz     L(exit)
> +
> +       pcmpeqd (%rax), %xmm1
> +       pmovmskb %xmm1, %edx
> +       addq    $16, %rax
> +       test    %edx, %edx
> +       jnz     L(exit)
> +
> +       pcmpeqd (%rax), %xmm2
> +       pmovmskb %xmm2, %edx
> +       addq    $16, %rax
> +       test    %edx, %edx
> +       jnz     L(exit)
> +
> +       pcmpeqd (%rax), %xmm3
> +       pmovmskb %xmm3, %edx
> +       addq    $16, %rax
> +       test    %edx, %edx
> +       jnz     L(exit)
> +
> +       and     $-0x40, %rax
> +
> +       .p2align 4
> +L(aligned_64_loop):
> +       movaps  (%rax), %xmm0
> +       movaps  16(%rax), %xmm1
> +       movaps  32(%rax), %xmm2
> +       movaps  48(%rax), %xmm6
> +
> +       pminub  %xmm1, %xmm0
> +       pminub  %xmm6, %xmm2
> +       pminub  %xmm0, %xmm2
> +       pcmpeqd %xmm3, %xmm2
> +       pmovmskb %xmm2, %edx
> +       addq    $64, %rax
> +       test    %edx, %edx
> +       jz      L(aligned_64_loop)
> +
> +       pcmpeqd -64(%rax), %xmm3
> +       pmovmskb %xmm3, %edx
> +    addq       $48, %rdi
> +       test    %edx, %edx
> +       jnz     L(exit)
> +
> +       pcmpeqd %xmm1, %xmm3
> +       pmovmskb %xmm3, %edx
> +    addq       $-16, %rdi
> +       test    %edx, %edx
> +       jnz     L(exit)
> +
> +       pcmpeqd -32(%rax), %xmm3
> +       pmovmskb %xmm3, %edx
> +    addq       $-16, %rdi
> +       test    %edx, %edx
> +       jnz     L(exit)
> +
> +       pcmpeqd %xmm6, %xmm3
> +       pmovmskb %xmm3, %edx
> +    addq       $-16, %rdi
> +       test    %edx, %edx
> +       jz      L(aligned_64_loop)
> +
> +       .p2align 4
> +L(exit):
> +       sub     %rdi, %rax
> +       shr     $2, %rax
> +       test    %dl, %dl
> +       jz      L(exit_high)
> +
> +       andl    $15, %edx
> +       jz      L(exit_1)
> +       ret
> +
> +       /* No align here. Naturally aligned % 16 == 1.  */
> +L(exit_high):
> +       andl    $(15 << 8), %edx
> +       jz      L(exit_3)
> +       add     $2, %rax
> +       ret
> +
> +       .p2align 3
> +L(exit_1):
> +       add     $1, %rax
> +       ret
> +
> +       .p2align 3
> +L(exit_3):
> +       add     $3, %rax
> +       ret
> +
> +       .p2align 3
> +L(exit_tail0):
> +       xorl    %eax, %eax
> +       ret
> +
> +       .p2align 3
> +L(exit_tail1):
> +       movl    $1, %eax
> +       ret
> +
> +       .p2align 3
> +L(exit_tail2):
> +       movl    $2, %eax
> +       ret
> +
> +       .p2align 3
> +L(exit_tail3):
> +       movl    $3, %eax
> +       ret
> +
> +       .p2align 3
> +L(exit_tail4):
> +       movl    $4, %eax
> +       ret
> +
> +       .p2align 3
> +L(exit_tail5):
> +       movl    $5, %eax
> +       ret
> +
> +       .p2align 3
> +L(exit_tail6):
> +       movl    $6, %eax
> +       ret
> +
> +       .p2align 3
> +L(exit_tail7):
> +       movl    $7, %eax
> +       ret
> +
> +END (WCSLEN)
> diff --git a/sysdeps/x86_64/wcslen.S b/sysdeps/x86_64/wcslen.S
> index d641141d75..588a0fbe01 100644
> --- a/sysdeps/x86_64/wcslen.S
> +++ b/sysdeps/x86_64/wcslen.S
> @@ -16,218 +16,6 @@
>     License along with the GNU C Library; if not, see
>     <https://www.gnu.org/licenses/>.  */
>
> -#include <sysdep.h>
> -
> -       .text
> -ENTRY (__wcslen)
> -       cmpl    $0, (%rdi)
> -       jz      L(exit_tail0)
> -       cmpl    $0, 4(%rdi)
> -       jz      L(exit_tail1)
> -       cmpl    $0, 8(%rdi)
> -       jz      L(exit_tail2)
> -       cmpl    $0, 12(%rdi)
> -       jz      L(exit_tail3)
> -       cmpl    $0, 16(%rdi)
> -       jz      L(exit_tail4)
> -       cmpl    $0, 20(%rdi)
> -       jz      L(exit_tail5)
> -       cmpl    $0, 24(%rdi)
> -       jz      L(exit_tail6)
> -       cmpl    $0, 28(%rdi)
> -       jz      L(exit_tail7)
> -
> -       pxor    %xmm0, %xmm0
> -
> -       lea     32(%rdi), %rax
> -       addq    $16, %rdi
> -       and     $-16, %rax
> -
> -       pcmpeqd (%rax), %xmm0
> -       pmovmskb %xmm0, %edx
> -       pxor    %xmm1, %xmm1
> -       addq    $16, %rax
> -       test    %edx, %edx
> -       jnz     L(exit)
> -
> -       pcmpeqd (%rax), %xmm1
> -       pmovmskb %xmm1, %edx
> -       pxor    %xmm2, %xmm2
> -       addq    $16, %rax
> -       test    %edx, %edx
> -       jnz     L(exit)
> -
> -       pcmpeqd (%rax), %xmm2
> -       pmovmskb %xmm2, %edx
> -       pxor    %xmm3, %xmm3
> -       addq    $16, %rax
> -       test    %edx, %edx
> -       jnz     L(exit)
> -
> -       pcmpeqd (%rax), %xmm3
> -       pmovmskb %xmm3, %edx
> -       addq    $16, %rax
> -       test    %edx, %edx
> -       jnz     L(exit)
> -
> -       pcmpeqd (%rax), %xmm0
> -       pmovmskb %xmm0, %edx
> -       addq    $16, %rax
> -       test    %edx, %edx
> -       jnz     L(exit)
> -
> -       pcmpeqd (%rax), %xmm1
> -       pmovmskb %xmm1, %edx
> -       addq    $16, %rax
> -       test    %edx, %edx
> -       jnz     L(exit)
> -
> -       pcmpeqd (%rax), %xmm2
> -       pmovmskb %xmm2, %edx
> -       addq    $16, %rax
> -       test    %edx, %edx
> -       jnz     L(exit)
> -
> -       pcmpeqd (%rax), %xmm3
> -       pmovmskb %xmm3, %edx
> -       addq    $16, %rax
> -       test    %edx, %edx
> -       jnz     L(exit)
> -
> -       pcmpeqd (%rax), %xmm0
> -       pmovmskb %xmm0, %edx
> -       addq    $16, %rax
> -       test    %edx, %edx
> -       jnz     L(exit)
> -
> -       pcmpeqd (%rax), %xmm1
> -       pmovmskb %xmm1, %edx
> -       addq    $16, %rax
> -       test    %edx, %edx
> -       jnz     L(exit)
> -
> -       pcmpeqd (%rax), %xmm2
> -       pmovmskb %xmm2, %edx
> -       addq    $16, %rax
> -       test    %edx, %edx
> -       jnz     L(exit)
> -
> -       pcmpeqd (%rax), %xmm3
> -       pmovmskb %xmm3, %edx
> -       addq    $16, %rax
> -       test    %edx, %edx
> -       jnz     L(exit)
> -
> -       and     $-0x40, %rax
> -
> -       .p2align 4
> -L(aligned_64_loop):
> -       movaps  (%rax), %xmm0
> -       movaps  16(%rax), %xmm1
> -       movaps  32(%rax), %xmm2
> -       movaps  48(%rax), %xmm6
> -
> -       pminub  %xmm1, %xmm0
> -       pminub  %xmm6, %xmm2
> -       pminub  %xmm0, %xmm2
> -       pcmpeqd %xmm3, %xmm2
> -       pmovmskb %xmm2, %edx
> -       addq    $64, %rax
> -       test    %edx, %edx
> -       jz      L(aligned_64_loop)
> -
> -       pcmpeqd -64(%rax), %xmm3
> -       pmovmskb %xmm3, %edx
> -    addq       $48, %rdi
> -       test    %edx, %edx
> -       jnz     L(exit)
> -
> -       pcmpeqd %xmm1, %xmm3
> -       pmovmskb %xmm3, %edx
> -    addq       $-16, %rdi
> -       test    %edx, %edx
> -       jnz     L(exit)
> -
> -       pcmpeqd -32(%rax), %xmm3
> -       pmovmskb %xmm3, %edx
> -    addq       $-16, %rdi
> -       test    %edx, %edx
> -       jnz     L(exit)
> -
> -       pcmpeqd %xmm6, %xmm3
> -       pmovmskb %xmm3, %edx
> -    addq       $-16, %rdi
> -       test    %edx, %edx
> -       jz      L(aligned_64_loop)
> -
> -       .p2align 4
> -L(exit):
> -       sub     %rdi, %rax
> -       shr     $2, %rax
> -       test    %dl, %dl
> -       jz      L(exit_high)
> -
> -       andl    $15, %edx
> -       jz      L(exit_1)
> -       ret
> -
> -       /* No align here. Naturally aligned % 16 == 1.  */
> -L(exit_high):
> -       andl    $(15 << 8), %edx
> -       jz      L(exit_3)
> -       add     $2, %rax
> -       ret
> -
> -       .p2align 3
> -L(exit_1):
> -       add     $1, %rax
> -       ret
> -
> -       .p2align 3
> -L(exit_3):
> -       add     $3, %rax
> -       ret
> -
> -       .p2align 3
> -L(exit_tail0):
> -       xorl    %eax, %eax
> -       ret
> -
> -       .p2align 3
> -L(exit_tail1):
> -       movl    $1, %eax
> -       ret
> -
> -       .p2align 3
> -L(exit_tail2):
> -       movl    $2, %eax
> -       ret
> -
> -       .p2align 3
> -L(exit_tail3):
> -       movl    $3, %eax
> -       ret
> -
> -       .p2align 3
> -L(exit_tail4):
> -       movl    $4, %eax
> -       ret
> -
> -       .p2align 3
> -L(exit_tail5):
> -       movl    $5, %eax
> -       ret
> -
> -       .p2align 3
> -L(exit_tail6):
> -       movl    $6, %eax
> -       ret
> -
> -       .p2align 3
> -L(exit_tail7):
> -       movl    $7, %eax
> -       ret
> -
> -END (__wcslen)
> -
> +#define WCSLEN __wcslen
> +#include "multiarch/wcslen-sse2.S"
>  weak_alias(__wcslen, wcslen)
> --
> 2.34.1
>

LGTM.

Thanks.

-- 
H.J.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v1] x86: Move wcschr SSE2 implementation to multiarch/wcschr-sse2.S
  2022-07-12 19:29 ` [PATCH v1] x86: Move wcschr SSE2 implementation to multiarch/wcschr-sse2.S Noah Goldstein
@ 2022-07-12 20:55   ` H.J. Lu
  0 siblings, 0 replies; 21+ messages in thread
From: H.J. Lu @ 2022-07-12 20:55 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell

On Tue, Jul 12, 2022 at 12:29 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> This commit doesn't affect libc.so.6; it's just housekeeping to prepare
> for adding explicit ISA level support.
>
> Tested build on x86_64 and x86_32 with/without multiarch.
> ---
>  sysdeps/x86_64/multiarch/wcschr-sse2.S | 145 +++++++++++++++++++++++--
>  sysdeps/x86_64/wcschr.S                | 135 +----------------------
>  2 files changed, 138 insertions(+), 142 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/wcschr-sse2.S b/sysdeps/x86_64/multiarch/wcschr-sse2.S
> index 218ea609b9..c872926ba9 100644
> --- a/sysdeps/x86_64/multiarch/wcschr-sse2.S
> +++ b/sysdeps/x86_64/multiarch/wcschr-sse2.S
> @@ -17,14 +17,141 @@
>     <https://www.gnu.org/licenses/>.  */
>
>  #if IS_IN (libc)
> -# define __wcschr __wcschr_sse2
> -
> -# undef weak_alias
> -# define weak_alias(__wcschr, wcschr)
> -# undef libc_hidden_def
> -# define libc_hidden_def(__wcschr)
> -# undef libc_hidden_weak
> -# define libc_hidden_weak(wcschr)
> +# ifndef WCSCHR
> +#  define WCSCHR __wcschr_sse2
> +# endif
>  #endif
>
> -#include "../wcschr.S"
> +#include <sysdep.h>
> +
> +       .text
> +ENTRY (WCSCHR)
> +
> +       movd    %rsi, %xmm1
> +       pxor    %xmm2, %xmm2
> +       mov     %rdi, %rcx
> +       punpckldq %xmm1, %xmm1
> +       punpckldq %xmm1, %xmm1
> +
> +       and     $63, %rcx
> +       cmp     $48, %rcx
> +       ja      L(cross_cache)
> +
> +       movdqu  (%rdi), %xmm0
> +       pcmpeqd %xmm0, %xmm2
> +       add     $16, %rdi
> +       pcmpeqd %xmm1, %xmm0
> +       pmovmskb %xmm2, %rdx
> +       pmovmskb %xmm0, %rax
> +       or      %rax, %rdx
> +       jnz     L(matches)
> +
> +       and     $-16, %rdi
> +
> +       movdqa  (%rdi), %xmm0
> +       pcmpeqd %xmm0, %xmm2
> +       add     $16, %rdi
> +       pcmpeqd %xmm1, %xmm0
> +       pmovmskb %xmm2, %rdx
> +       pmovmskb %xmm0, %rax
> +       or      %rax, %rdx
> +       jnz     L(matches)
> +
> +       jmp     L(loop)
> +
> +L(cross_cache):
> +       and     $15, %rcx
> +       and     $-16, %rdi
> +       movdqa  (%rdi), %xmm0
> +       pcmpeqd %xmm0, %xmm2
> +       pcmpeqd %xmm1, %xmm0
> +       pmovmskb %xmm2, %rdx
> +       pmovmskb %xmm0, %rax
> +
> +       sar     %cl, %rdx
> +       sar     %cl, %rax
> +       test    %rax, %rax
> +       je      L(unaligned_no_match)
> +
> +       bsf     %rax, %rax
> +       test    %rdx, %rdx
> +       je      L(unaligned_match)
> +       bsf     %rdx, %rdx
> +       cmp     %rdx, %rax
> +       ja      L(return_null)
> +
> +L(unaligned_match):
> +       add     %rdi, %rax
> +       add     %rcx, %rax
> +       ret
> +
> +       .p2align 4
> +L(unaligned_no_match):
> +       test    %rdx, %rdx
> +       jne     L(return_null)
> +       pxor    %xmm2, %xmm2
> +
> +       add     $16, %rdi
> +
> +       .p2align 4
> +/* Loop start on aligned string.  */
> +L(loop):
> +       movdqa  (%rdi), %xmm0
> +       pcmpeqd %xmm0, %xmm2
> +       add     $16, %rdi
> +       pcmpeqd %xmm1, %xmm0
> +       pmovmskb %xmm2, %rdx
> +       pmovmskb %xmm0, %rax
> +       or      %rax, %rdx
> +       jnz     L(matches)
> +
> +       movdqa  (%rdi), %xmm0
> +       pcmpeqd %xmm0, %xmm2
> +       add     $16, %rdi
> +       pcmpeqd %xmm1, %xmm0
> +       pmovmskb %xmm2, %rdx
> +       pmovmskb %xmm0, %rax
> +       or      %rax, %rdx
> +       jnz     L(matches)
> +
> +       movdqa  (%rdi), %xmm0
> +       pcmpeqd %xmm0, %xmm2
> +       add     $16, %rdi
> +       pcmpeqd %xmm1, %xmm0
> +       pmovmskb %xmm2, %rdx
> +       pmovmskb %xmm0, %rax
> +       or      %rax, %rdx
> +       jnz     L(matches)
> +
> +       movdqa  (%rdi), %xmm0
> +       pcmpeqd %xmm0, %xmm2
> +       add     $16, %rdi
> +       pcmpeqd %xmm1, %xmm0
> +       pmovmskb %xmm2, %rdx
> +       pmovmskb %xmm0, %rax
> +       or      %rax, %rdx
> +       jnz     L(matches)
> +       jmp     L(loop)
> +
> +       .p2align 4
> +L(matches):
> +       pmovmskb %xmm2, %rdx
> +       test    %rax, %rax
> +       jz      L(return_null)
> +       bsf     %rax, %rax
> +       test    %rdx, %rdx
> +       je      L(match)
> +       bsf     %rdx, %rcx
> +       cmp     %rcx, %rax
> +       ja      L(return_null)
> +L(match):
> +       sub     $16, %rdi
> +       add     %rdi, %rax
> +       ret
> +
> +       .p2align 4
> +L(return_null):
> +       xor     %rax, %rax
> +       ret
> +
> +END (WCSCHR)
> diff --git a/sysdeps/x86_64/wcschr.S b/sysdeps/x86_64/wcschr.S
> index 2131220382..80b12c4286 100644
> --- a/sysdeps/x86_64/wcschr.S
> +++ b/sysdeps/x86_64/wcschr.S
> @@ -16,140 +16,9 @@
>     License along with the GNU C Library; if not, see
>     <https://www.gnu.org/licenses/>.  */
>
> -#include <sysdep.h>
> -
> -       .text
> -ENTRY (__wcschr)
> -
> -       movd    %rsi, %xmm1
> -       pxor    %xmm2, %xmm2
> -       mov     %rdi, %rcx
> -       punpckldq %xmm1, %xmm1
> -       punpckldq %xmm1, %xmm1
> -
> -       and     $63, %rcx
> -       cmp     $48, %rcx
> -       ja      L(cross_cache)
> -
> -       movdqu  (%rdi), %xmm0
> -       pcmpeqd %xmm0, %xmm2
> -       add     $16, %rdi
> -       pcmpeqd %xmm1, %xmm0
> -       pmovmskb %xmm2, %rdx
> -       pmovmskb %xmm0, %rax
> -       or      %rax, %rdx
> -       jnz     L(matches)
> -
> -       and     $-16, %rdi
> -
> -       movdqa  (%rdi), %xmm0
> -       pcmpeqd %xmm0, %xmm2
> -       add     $16, %rdi
> -       pcmpeqd %xmm1, %xmm0
> -       pmovmskb %xmm2, %rdx
> -       pmovmskb %xmm0, %rax
> -       or      %rax, %rdx
> -       jnz     L(matches)
> -
> -       jmp     L(loop)
> -
> -L(cross_cache):
> -       and     $15, %rcx
> -       and     $-16, %rdi
> -       movdqa  (%rdi), %xmm0
> -       pcmpeqd %xmm0, %xmm2
> -       pcmpeqd %xmm1, %xmm0
> -       pmovmskb %xmm2, %rdx
> -       pmovmskb %xmm0, %rax
> -
> -       sar     %cl, %rdx
> -       sar     %cl, %rax
> -       test    %rax, %rax
> -       je      L(unaligned_no_match)
> -
> -       bsf     %rax, %rax
> -       test    %rdx, %rdx
> -       je      L(unaligned_match)
> -       bsf     %rdx, %rdx
> -       cmp     %rdx, %rax
> -       ja      L(return_null)
> -
> -L(unaligned_match):
> -       add     %rdi, %rax
> -       add     %rcx, %rax
> -       ret
> -
> -       .p2align 4
> -L(unaligned_no_match):
> -       test    %rdx, %rdx
> -       jne     L(return_null)
> -       pxor    %xmm2, %xmm2
> -
> -       add     $16, %rdi
> -
> -       .p2align 4
> -/* Loop start on aligned string.  */
> -L(loop):
> -       movdqa  (%rdi), %xmm0
> -       pcmpeqd %xmm0, %xmm2
> -       add     $16, %rdi
> -       pcmpeqd %xmm1, %xmm0
> -       pmovmskb %xmm2, %rdx
> -       pmovmskb %xmm0, %rax
> -       or      %rax, %rdx
> -       jnz     L(matches)
> -
> -       movdqa  (%rdi), %xmm0
> -       pcmpeqd %xmm0, %xmm2
> -       add     $16, %rdi
> -       pcmpeqd %xmm1, %xmm0
> -       pmovmskb %xmm2, %rdx
> -       pmovmskb %xmm0, %rax
> -       or      %rax, %rdx
> -       jnz     L(matches)
> -
> -       movdqa  (%rdi), %xmm0
> -       pcmpeqd %xmm0, %xmm2
> -       add     $16, %rdi
> -       pcmpeqd %xmm1, %xmm0
> -       pmovmskb %xmm2, %rdx
> -       pmovmskb %xmm0, %rax
> -       or      %rax, %rdx
> -       jnz     L(matches)
> -
> -       movdqa  (%rdi), %xmm0
> -       pcmpeqd %xmm0, %xmm2
> -       add     $16, %rdi
> -       pcmpeqd %xmm1, %xmm0
> -       pmovmskb %xmm2, %rdx
> -       pmovmskb %xmm0, %rax
> -       or      %rax, %rdx
> -       jnz     L(matches)
> -       jmp     L(loop)
> -
> -       .p2align 4
> -L(matches):
> -       pmovmskb %xmm2, %rdx
> -       test    %rax, %rax
> -       jz      L(return_null)
> -       bsf     %rax, %rax
> -       test    %rdx, %rdx
> -       je      L(match)
> -       bsf     %rdx, %rcx
> -       cmp     %rcx, %rax
> -       ja      L(return_null)
> -L(match):
> -       sub     $16, %rdi
> -       add     %rdi, %rax
> -       ret
> -
> -       .p2align 4
> -L(return_null):
> -       xor     %rax, %rax
> -       ret
> -
> -END (__wcschr)
>
> +#define WCSCHR __wcschr
> +#include "multiarch/wcschr-sse2.S"
>  libc_hidden_def(__wcschr)
>  weak_alias (__wcschr, wcschr)
>  libc_hidden_weak (wcschr)
> --
> 2.34.1
>

LGTM.

Thanks.

-- 
H.J.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v1] x86: Move strcat SSE2 implementation to multiarch/strcat-sse2.S
  2022-07-12 19:29 ` [PATCH v1] x86: Move strcat SSE2 implementation to multiarch/strcat-sse2.S Noah Goldstein
@ 2022-07-12 21:16   ` H.J. Lu
  0 siblings, 0 replies; 21+ messages in thread
From: H.J. Lu @ 2022-07-12 21:16 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell

On Tue, Jul 12, 2022 at 12:29 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> This commit doesn't affect libc.so.6; it's just housekeeping to prepare
> for adding explicit ISA level support.
>
> Tested build on x86_64 and x86_32 with/without multiarch.
> ---
>  sysdeps/x86_64/multiarch/strcat-sse2.S | 242 ++++++++++++++++++++++++-
>  sysdeps/x86_64/strcat.S                | 239 +-----------------------
>  2 files changed, 238 insertions(+), 243 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/strcat-sse2.S b/sysdeps/x86_64/multiarch/strcat-sse2.S
> index 449e102438..244c4a6d74 100644
> --- a/sysdeps/x86_64/multiarch/strcat-sse2.S
> +++ b/sysdeps/x86_64/multiarch/strcat-sse2.S
> @@ -17,12 +17,242 @@
>     <https://www.gnu.org/licenses/>.  */
>
>  #if IS_IN (libc)
> +# ifndef STRCAT
> +#  define STRCAT __strcat_sse2
> +# endif
> +#endif
>
> -# include <sysdep.h>
> -# define strcat __strcat_sse2
> +#include <sysdep.h>
> +
> +       .text
> +ENTRY (STRCAT)
> +       movq %rdi, %rcx         /* Dest. register. */
> +       andl $7, %ecx           /* mask alignment bits */
> +       movq %rdi, %rax         /* Duplicate destination pointer.  */
> +       movq $0xfefefefefefefeff,%r8
> +
> +       /* First step: Find end of destination.  */
> +       jz 4f                   /* aligned => start loop */
> +
> +       neg %ecx                /* We need to align to 8 bytes.  */
> +       addl $8,%ecx
> +       /* Search the first bytes directly.  */
> +0:     cmpb $0x0,(%rax)        /* is byte NUL? */
> +       je 2f                   /* yes => start copy */
> +       incq %rax               /* increment pointer */
> +       decl %ecx
> +       jnz 0b
> +
> +
> +
> +       /* Now the source is aligned.  Scan for NUL byte.  */
> +       .p2align 4
> +4:
> +       /* First unroll.  */
> +       movq (%rax), %rcx       /* get double word (= 8 bytes) in question */
> +       addq $8,%rax            /* adjust pointer for next word */
> +       movq %r8, %rdx          /* magic value */
> +       addq %rcx, %rdx         /* add the magic value to the word.  We get
> +                                  carry bits reported for each byte which
> +                                  is *not* 0 */
> +       jnc 3f                  /* highest byte is NUL => return pointer */
> +       xorq %rcx, %rdx         /* (word+magic)^word */
> +       orq %r8, %rdx           /* set all non-carry bits */
> +       incq %rdx               /* add 1: if one carry bit was *not* set
> +                                  the addition will not result in 0.  */
> +       jnz 3f                  /* found NUL => return pointer */
> +
> +       /* Second unroll.  */
> +       movq (%rax), %rcx       /* get double word (= 8 bytes) in question */
> +       addq $8,%rax            /* adjust pointer for next word */
> +       movq %r8, %rdx          /* magic value */
> +       addq %rcx, %rdx         /* add the magic value to the word.  We get
> +                                  carry bits reported for each byte which
> +                                  is *not* 0 */
> +       jnc 3f                  /* highest byte is NUL => return pointer */
> +       xorq %rcx, %rdx         /* (word+magic)^word */
> +       orq %r8, %rdx           /* set all non-carry bits */
> +       incq %rdx               /* add 1: if one carry bit was *not* set
> +                                  the addition will not result in 0.  */
> +       jnz 3f                  /* found NUL => return pointer */
> +
> +       /* Third unroll.  */
> +       movq (%rax), %rcx       /* get double word (= 8 bytes) in question */
> +       addq $8,%rax            /* adjust pointer for next word */
> +       movq %r8, %rdx          /* magic value */
> +       addq %rcx, %rdx         /* add the magic value to the word.  We get
> +                                  carry bits reported for each byte which
> +                                  is *not* 0 */
> +       jnc 3f                  /* highest byte is NUL => return pointer */
> +       xorq %rcx, %rdx         /* (word+magic)^word */
> +       orq %r8, %rdx           /* set all non-carry bits */
> +       incq %rdx               /* add 1: if one carry bit was *not* set
> +                                  the addition will not result in 0.  */
> +       jnz 3f                  /* found NUL => return pointer */
> +
> +       /* Fourth unroll.  */
> +       movq (%rax), %rcx       /* get double word (= 8 bytes) in question */
> +       addq $8,%rax            /* adjust pointer for next word */
> +       movq %r8, %rdx          /* magic value */
> +       addq %rcx, %rdx         /* add the magic value to the word.  We get
> +                                  carry bits reported for each byte which
> +                                  is *not* 0 */
> +       jnc 3f                  /* highest byte is NUL => return pointer */
> +       xorq %rcx, %rdx         /* (word+magic)^word */
> +       orq %r8, %rdx           /* set all non-carry bits */
> +       incq %rdx               /* add 1: if one carry bit was *not* set
> +                                  the addition will not result in 0.  */
> +       jz 4b                   /* no NUL found => continue loop */
> +
> +       .p2align 4              /* Align, it's a jump target.  */
> +3:     subq $8,%rax            /* correct pointer increment.  */
> +
> +       testb %cl, %cl          /* is first byte NUL? */
> +       jz 2f                   /* yes => return */
> +       incq %rax               /* increment pointer */
> +
> +       testb %ch, %ch          /* is second byte NUL? */
> +       jz 2f                   /* yes => return */
> +       incq %rax               /* increment pointer */
> +
> +       testl $0x00ff0000, %ecx /* is third byte NUL? */
> +       jz 2f                   /* yes => return pointer */
> +       incq %rax               /* increment pointer */
> +
> +       testl $0xff000000, %ecx /* is fourth byte NUL? */
> +       jz 2f                   /* yes => return pointer */
> +       incq %rax               /* increment pointer */
> +
> +       shrq $32, %rcx          /* look at other half.  */
> +
> +       testb %cl, %cl          /* is first byte NUL? */
> +       jz 2f                   /* yes => return */
> +       incq %rax               /* increment pointer */
> +
> +       testb %ch, %ch          /* is second byte NUL? */
> +       jz 2f                   /* yes => return */
> +       incq %rax               /* increment pointer */
> +
> +       testl $0xff0000, %ecx   /* is third byte NUL? */
> +       jz 2f                   /* yes => return pointer */
> +       incq %rax               /* increment pointer */
> +
> +2:
> +       /* Second step: Copy source to destination.  */
> +
> +       movq    %rsi, %rcx      /* duplicate  */
> +       andl    $7,%ecx         /* mask alignment bits */
> +       movq    %rax, %rdx      /* move around */
> +       jz      22f             /* aligned => start loop */
> +
> +       neg     %ecx            /* align to 8 bytes.  */
> +       addl    $8, %ecx
> +       /* Align the source pointer.  */
> +21:
> +       movb    (%rsi), %al     /* Fetch a byte */
> +       testb   %al, %al        /* Is it NUL? */
> +       movb    %al, (%rdx)     /* Store it */
> +       jz      24f             /* If it was NUL, done! */
> +       incq    %rsi
> +       incq    %rdx
> +       decl    %ecx
> +       jnz     21b
> +
> +       /* Now the sources is aligned.  Unfortunatly we cannot force
> +          to have both source and destination aligned, so ignore the
> +          alignment of the destination.  */
> +       .p2align 4
> +22:
> +       /* 1st unroll.  */
> +       movq    (%rsi), %rax    /* Read double word (8 bytes).  */
> +       addq    $8, %rsi        /* Adjust pointer for next word.  */
> +       movq    %rax, %r9       /* Save a copy for NUL finding.  */
> +       addq    %r8, %r9        /* add the magic value to the word.  We get
> +                                  carry bits reported for each byte which
> +                                  is *not* 0 */
> +       jnc     23f             /* highest byte is NUL => return pointer */
> +       xorq    %rax, %r9       /* (word+magic)^word */
> +       orq     %r8, %r9        /* set all non-carry bits */
> +       incq    %r9             /* add 1: if one carry bit was *not* set
> +                                  the addition will not result in 0.  */
> +
> +       jnz     23f             /* found NUL => return pointer */
> +
> +       movq    %rax, (%rdx)    /* Write value to destination.  */
> +       addq    $8, %rdx        /* Adjust pointer.  */
> +
> +       /* 2nd unroll.  */
> +       movq    (%rsi), %rax    /* Read double word (8 bytes).  */
> +       addq    $8, %rsi        /* Adjust pointer for next word.  */
> +       movq    %rax, %r9       /* Save a copy for NUL finding.  */
> +       addq    %r8, %r9        /* add the magic value to the word.  We get
> +                                  carry bits reported for each byte which
> +                                  is *not* 0 */
> +       jnc     23f             /* highest byte is NUL => return pointer */
> +       xorq    %rax, %r9       /* (word+magic)^word */
> +       orq     %r8, %r9        /* set all non-carry bits */
> +       incq    %r9             /* add 1: if one carry bit was *not* set
> +                                  the addition will not result in 0.  */
> +
> +       jnz     23f             /* found NUL => return pointer */
> +
> +       movq    %rax, (%rdx)    /* Write value to destination.  */
> +       addq    $8, %rdx        /* Adjust pointer.  */
> +
> +       /* 3rd unroll.  */
> +       movq    (%rsi), %rax    /* Read double word (8 bytes).  */
> +       addq    $8, %rsi        /* Adjust pointer for next word.  */
> +       movq    %rax, %r9       /* Save a copy for NUL finding.  */
> +       addq    %r8, %r9        /* add the magic value to the word.  We get
> +                                  carry bits reported for each byte which
> +                                  is *not* 0 */
> +       jnc     23f             /* highest byte is NUL => return pointer */
> +       xorq    %rax, %r9       /* (word+magic)^word */
> +       orq     %r8, %r9        /* set all non-carry bits */
> +       incq    %r9             /* add 1: if one carry bit was *not* set
> +                                  the addition will not result in 0.  */
> +
> +       jnz     23f             /* found NUL => return pointer */
> +
> +       movq    %rax, (%rdx)    /* Write value to destination.  */
> +       addq    $8, %rdx        /* Adjust pointer.  */
> +
> +       /* 4th unroll.  */
> +       movq    (%rsi), %rax    /* Read double word (8 bytes).  */
> +       addq    $8, %rsi        /* Adjust pointer for next word.  */
> +       movq    %rax, %r9       /* Save a copy for NUL finding.  */
> +       addq    %r8, %r9        /* add the magic value to the word.  We get
> +                                  carry bits reported for each byte which
> +                                  is *not* 0 */
> +       jnc     23f             /* highest byte is NUL => return pointer */
> +       xorq    %rax, %r9       /* (word+magic)^word */
> +       orq     %r8, %r9        /* set all non-carry bits */
> +       incq    %r9             /* add 1: if one carry bit was *not* set
> +                                  the addition will not result in 0.  */
> +
> +       jnz     23f             /* found NUL => return pointer */
> +
> +       movq    %rax, (%rdx)    /* Write value to destination.  */
> +       addq    $8, %rdx        /* Adjust pointer.  */
> +       jmp     22b             /* Next iteration.  */
> +
> +       /* Do the last few bytes. %rax contains the value to write.
> +          The loop is unrolled twice.  */
> +       .p2align 4
> +23:
> +       movb    %al, (%rdx)     /* 1st byte.  */
> +       testb   %al, %al        /* Is it NUL.  */
> +       jz      24f             /* yes, finish.  */
> +       incq    %rdx            /* Increment destination.  */
> +       movb    %ah, (%rdx)     /* 2nd byte.  */
> +       testb   %ah, %ah        /* Is it NUL?.  */
> +       jz      24f             /* yes, finish.  */
> +       incq    %rdx            /* Increment destination.  */
> +       shrq    $16, %rax       /* Shift...  */
> +       jmp     23b             /* and look at next two bytes in %rax.  */
>
> -# undef libc_hidden_builtin_def
> -# define libc_hidden_builtin_def(strcat)
> -#endif
>
> -#include <sysdeps/x86_64/strcat.S>
> +24:
> +       movq    %rdi, %rax      /* Source is return value.  */
> +       retq
> +END (STRCAT)
> diff --git a/sysdeps/x86_64/strcat.S b/sysdeps/x86_64/strcat.S
> index 565a9c785a..fc3e8a9bcf 100644
> --- a/sysdeps/x86_64/strcat.S
> +++ b/sysdeps/x86_64/strcat.S
> @@ -17,241 +17,6 @@
>     License along with the GNU C Library; if not, see
>     <https://www.gnu.org/licenses/>.  */
>
> -#include <sysdep.h>
> -#include "asm-syntax.h"
> -
> -/* Will be removed when new strcpy implementation gets merged.  */
> -
> -       .text
> -ENTRY (strcat)
> -       movq %rdi, %rcx         /* Dest. register. */
> -       andl $7, %ecx           /* mask alignment bits */
> -       movq %rdi, %rax         /* Duplicate destination pointer.  */
> -       movq $0xfefefefefefefeff,%r8
> -
> -       /* First step: Find end of destination.  */
> -       jz 4f                   /* aligned => start loop */
> -
> -       neg %ecx                /* We need to align to 8 bytes.  */
> -       addl $8,%ecx
> -       /* Search the first bytes directly.  */
> -0:     cmpb $0x0,(%rax)        /* is byte NUL? */
> -       je 2f                   /* yes => start copy */
> -       incq %rax               /* increment pointer */
> -       decl %ecx
> -       jnz 0b
> -
> -
> -
> -       /* Now the source is aligned.  Scan for NUL byte.  */
> -       .p2align 4
> -4:
> -       /* First unroll.  */
> -       movq (%rax), %rcx       /* get double word (= 8 bytes) in question */
> -       addq $8,%rax            /* adjust pointer for next word */
> -       movq %r8, %rdx          /* magic value */
> -       addq %rcx, %rdx         /* add the magic value to the word.  We get
> -                                  carry bits reported for each byte which
> -                                  is *not* 0 */
> -       jnc 3f                  /* highest byte is NUL => return pointer */
> -       xorq %rcx, %rdx         /* (word+magic)^word */
> -       orq %r8, %rdx           /* set all non-carry bits */
> -       incq %rdx               /* add 1: if one carry bit was *not* set
> -                                  the addition will not result in 0.  */
> -       jnz 3f                  /* found NUL => return pointer */
> -
> -       /* Second unroll.  */
> -       movq (%rax), %rcx       /* get double word (= 8 bytes) in question */
> -       addq $8,%rax            /* adjust pointer for next word */
> -       movq %r8, %rdx          /* magic value */
> -       addq %rcx, %rdx         /* add the magic value to the word.  We get
> -                                  carry bits reported for each byte which
> -                                  is *not* 0 */
> -       jnc 3f                  /* highest byte is NUL => return pointer */
> -       xorq %rcx, %rdx         /* (word+magic)^word */
> -       orq %r8, %rdx           /* set all non-carry bits */
> -       incq %rdx               /* add 1: if one carry bit was *not* set
> -                                  the addition will not result in 0.  */
> -       jnz 3f                  /* found NUL => return pointer */
> -
> -       /* Third unroll.  */
> -       movq (%rax), %rcx       /* get double word (= 8 bytes) in question */
> -       addq $8,%rax            /* adjust pointer for next word */
> -       movq %r8, %rdx          /* magic value */
> -       addq %rcx, %rdx         /* add the magic value to the word.  We get
> -                                  carry bits reported for each byte which
> -                                  is *not* 0 */
> -       jnc 3f                  /* highest byte is NUL => return pointer */
> -       xorq %rcx, %rdx         /* (word+magic)^word */
> -       orq %r8, %rdx           /* set all non-carry bits */
> -       incq %rdx               /* add 1: if one carry bit was *not* set
> -                                  the addition will not result in 0.  */
> -       jnz 3f                  /* found NUL => return pointer */
> -
> -       /* Fourth unroll.  */
> -       movq (%rax), %rcx       /* get double word (= 8 bytes) in question */
> -       addq $8,%rax            /* adjust pointer for next word */
> -       movq %r8, %rdx          /* magic value */
> -       addq %rcx, %rdx         /* add the magic value to the word.  We get
> -                                  carry bits reported for each byte which
> -                                  is *not* 0 */
> -       jnc 3f                  /* highest byte is NUL => return pointer */
> -       xorq %rcx, %rdx         /* (word+magic)^word */
> -       orq %r8, %rdx           /* set all non-carry bits */
> -       incq %rdx               /* add 1: if one carry bit was *not* set
> -                                  the addition will not result in 0.  */
> -       jz 4b                   /* no NUL found => continue loop */
> -
> -       .p2align 4              /* Align, it's a jump target.  */
> -3:     subq $8,%rax            /* correct pointer increment.  */
> -
> -       testb %cl, %cl          /* is first byte NUL? */
> -       jz 2f                   /* yes => return */
> -       incq %rax               /* increment pointer */
> -
> -       testb %ch, %ch          /* is second byte NUL? */
> -       jz 2f                   /* yes => return */
> -       incq %rax               /* increment pointer */
> -
> -       testl $0x00ff0000, %ecx /* is third byte NUL? */
> -       jz 2f                   /* yes => return pointer */
> -       incq %rax               /* increment pointer */
> -
> -       testl $0xff000000, %ecx /* is fourth byte NUL? */
> -       jz 2f                   /* yes => return pointer */
> -       incq %rax               /* increment pointer */
> -
> -       shrq $32, %rcx          /* look at other half.  */
> -
> -       testb %cl, %cl          /* is first byte NUL? */
> -       jz 2f                   /* yes => return */
> -       incq %rax               /* increment pointer */
> -
> -       testb %ch, %ch          /* is second byte NUL? */
> -       jz 2f                   /* yes => return */
> -       incq %rax               /* increment pointer */
> -
> -       testl $0xff0000, %ecx   /* is third byte NUL? */
> -       jz 2f                   /* yes => return pointer */
> -       incq %rax               /* increment pointer */
> -
> -2:
> -       /* Second step: Copy source to destination.  */
> -
> -       movq    %rsi, %rcx      /* duplicate  */
> -       andl    $7,%ecx         /* mask alignment bits */
> -       movq    %rax, %rdx      /* move around */
> -       jz      22f             /* aligned => start loop */
> -
> -       neg     %ecx            /* align to 8 bytes.  */
> -       addl    $8, %ecx
> -       /* Align the source pointer.  */
> -21:
> -       movb    (%rsi), %al     /* Fetch a byte */
> -       testb   %al, %al        /* Is it NUL? */
> -       movb    %al, (%rdx)     /* Store it */
> -       jz      24f             /* If it was NUL, done! */
> -       incq    %rsi
> -       incq    %rdx
> -       decl    %ecx
> -       jnz     21b
> -
> -       /* Now the source is aligned.  Unfortunately we cannot force
> -          both source and destination to be aligned, so ignore the
> -          alignment of the destination.  */
> -       .p2align 4
> -22:
> -       /* 1st unroll.  */
> -       movq    (%rsi), %rax    /* Read double word (8 bytes).  */
> -       addq    $8, %rsi        /* Adjust pointer for next word.  */
> -       movq    %rax, %r9       /* Save a copy for NUL finding.  */
> -       addq    %r8, %r9        /* add the magic value to the word.  We get
> -                                  carry bits reported for each byte which
> -                                  is *not* 0 */
> -       jnc     23f             /* highest byte is NUL => return pointer */
> -       xorq    %rax, %r9       /* (word+magic)^word */
> -       orq     %r8, %r9        /* set all non-carry bits */
> -       incq    %r9             /* add 1: if one carry bit was *not* set
> -                                  the addition will not result in 0.  */
> -
> -       jnz     23f             /* found NUL => return pointer */
> -
> -       movq    %rax, (%rdx)    /* Write value to destination.  */
> -       addq    $8, %rdx        /* Adjust pointer.  */
> -
> -       /* 2nd unroll.  */
> -       movq    (%rsi), %rax    /* Read double word (8 bytes).  */
> -       addq    $8, %rsi        /* Adjust pointer for next word.  */
> -       movq    %rax, %r9       /* Save a copy for NUL finding.  */
> -       addq    %r8, %r9        /* add the magic value to the word.  We get
> -                                  carry bits reported for each byte which
> -                                  is *not* 0 */
> -       jnc     23f             /* highest byte is NUL => return pointer */
> -       xorq    %rax, %r9       /* (word+magic)^word */
> -       orq     %r8, %r9        /* set all non-carry bits */
> -       incq    %r9             /* add 1: if one carry bit was *not* set
> -                                  the addition will not result in 0.  */
> -
> -       jnz     23f             /* found NUL => return pointer */
> -
> -       movq    %rax, (%rdx)    /* Write value to destination.  */
> -       addq    $8, %rdx        /* Adjust pointer.  */
> -
> -       /* 3rd unroll.  */
> -       movq    (%rsi), %rax    /* Read double word (8 bytes).  */
> -       addq    $8, %rsi        /* Adjust pointer for next word.  */
> -       movq    %rax, %r9       /* Save a copy for NUL finding.  */
> -       addq    %r8, %r9        /* add the magic value to the word.  We get
> -                                  carry bits reported for each byte which
> -                                  is *not* 0 */
> -       jnc     23f             /* highest byte is NUL => return pointer */
> -       xorq    %rax, %r9       /* (word+magic)^word */
> -       orq     %r8, %r9        /* set all non-carry bits */
> -       incq    %r9             /* add 1: if one carry bit was *not* set
> -                                  the addition will not result in 0.  */
> -
> -       jnz     23f             /* found NUL => return pointer */
> -
> -       movq    %rax, (%rdx)    /* Write value to destination.  */
> -       addq    $8, %rdx        /* Adjust pointer.  */
> -
> -       /* 4th unroll.  */
> -       movq    (%rsi), %rax    /* Read double word (8 bytes).  */
> -       addq    $8, %rsi        /* Adjust pointer for next word.  */
> -       movq    %rax, %r9       /* Save a copy for NUL finding.  */
> -       addq    %r8, %r9        /* add the magic value to the word.  We get
> -                                  carry bits reported for each byte which
> -                                  is *not* 0 */
> -       jnc     23f             /* highest byte is NUL => return pointer */
> -       xorq    %rax, %r9       /* (word+magic)^word */
> -       orq     %r8, %r9        /* set all non-carry bits */
> -       incq    %r9             /* add 1: if one carry bit was *not* set
> -                                  the addition will not result in 0.  */
> -
> -       jnz     23f             /* found NUL => return pointer */
> -
> -       movq    %rax, (%rdx)    /* Write value to destination.  */
> -       addq    $8, %rdx        /* Adjust pointer.  */
> -       jmp     22b             /* Next iteration.  */
> -
> -       /* Do the last few bytes. %rax contains the value to write.
> -          The loop is unrolled twice.  */
> -       .p2align 4
> -23:
> -       movb    %al, (%rdx)     /* 1st byte.  */
> -       testb   %al, %al        /* Is it NUL.  */
> -       jz      24f             /* yes, finish.  */
> -       incq    %rdx            /* Increment destination.  */
> -       movb    %ah, (%rdx)     /* 2nd byte.  */
> -       testb   %ah, %ah        /* Is it NUL?  */
> -       jz      24f             /* yes, finish.  */
> -       incq    %rdx            /* Increment destination.  */
> -       shrq    $16, %rax       /* Shift...  */
> -       jmp     23b             /* and look at next two bytes in %rax.  */
> -
> -
> -24:
> -       movq    %rdi, %rax      /* Source is return value.  */
> -       retq
> -END (strcat)
> +#define STRCAT strcat
> +#include "multiarch/strcat-sse2.S"
>  libc_hidden_builtin_def (strcat)
> --
> 2.34.1
>

LGTM.

Thanks.

-- 
H.J.

^ permalink raw reply	[flat|nested] 21+ messages in thread
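
The unrolled loop deleted above finds the terminating NUL eight bytes at a
time: a "magic" constant is added so that every non-zero byte produces a
carry, and a byte that produces no carry must be zero.  Here is a minimal C
sketch of the same idea, using the better-known has-zero-byte formulation
rather than the exact magic-constant/carry sequence of the assembly (the
helper names are illustrative, not glibc interfaces):

    #include <stddef.h>
    #include <stdint.h>

    /* Return non-zero if the 8-byte word V contains a zero byte.  */
    static inline int
    word_has_zero_byte (uint64_t v)
    {
      return ((v - 0x0101010101010101ULL) & ~v & 0x8080808080808080ULL) != 0;
    }

    /* Word-at-a-time strlen sketch; assumes S is already 8-byte aligned,
       as the deleted code arranges before entering its unrolled loop.  */
    static size_t
    wordwise_strlen (const char *s)
    {
      const uint64_t *w = (const uint64_t *) s;
      while (!word_has_zero_byte (*w))
        w++;
      const char *p = (const char *) w;  /* find the exact NUL byte by byte */
      while (*p != '\0')
        p++;
      return (size_t) (p - s);
    }

One branch is taken per eight input bytes instead of per byte; the deleted
strcat used this both to find the end of the destination and to copy the
source word by word.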

* Re: [PATCH v1] x86: Move strchr SSE2 implementation to multiarch/strchr-sse2.S
  2022-07-12 19:29 ` [PATCH v1] x86: Move strchr SSE2 implementation to multiarch/strchr-sse2.S Noah Goldstein
@ 2022-07-12 21:27   ` H.J. Lu
  0 siblings, 0 replies; 21+ messages in thread
From: H.J. Lu @ 2022-07-12 21:27 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell

On Tue, Jul 12, 2022 at 12:29 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> This commit doesn't affect libc.so.6, it's just housekeeping to prepare
> for adding explicit ISA level support.
>
> Tested build on x86_64 and x86_32 with/without multiarch.
> ---
>  sysdeps/x86_64/multiarch/rtld-strchr.S    |  18 +++
>  sysdeps/x86_64/multiarch/rtld-strchrnul.S |  18 +++
>  sysdeps/x86_64/multiarch/strchr-sse2.S    | 175 +++++++++++++++++++++-
>  sysdeps/x86_64/multiarch/strchrnul-sse2.S |  11 +-
>  sysdeps/x86_64/strchr.S                   | 167 +--------------------
>  sysdeps/x86_64/strchrnul.S                |   7 +-
>  6 files changed, 213 insertions(+), 183 deletions(-)
>  create mode 100644 sysdeps/x86_64/multiarch/rtld-strchr.S
>  create mode 100644 sysdeps/x86_64/multiarch/rtld-strchrnul.S
>
> diff --git a/sysdeps/x86_64/multiarch/rtld-strchr.S b/sysdeps/x86_64/multiarch/rtld-strchr.S
> new file mode 100644
> index 0000000000..2b7b879e37
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/rtld-strchr.S
> @@ -0,0 +1,18 @@
> +/* Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include "../strchr.S"
> diff --git a/sysdeps/x86_64/multiarch/rtld-strchrnul.S b/sysdeps/x86_64/multiarch/rtld-strchrnul.S
> new file mode 100644
> index 0000000000..0cc5becc88
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/rtld-strchrnul.S
> @@ -0,0 +1,18 @@
> +/* Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include "../strchrnul.S"
> diff --git a/sysdeps/x86_64/multiarch/strchr-sse2.S b/sysdeps/x86_64/multiarch/strchr-sse2.S
> index 992f700077..f7767ca543 100644
> --- a/sysdeps/x86_64/multiarch/strchr-sse2.S
> +++ b/sysdeps/x86_64/multiarch/strchr-sse2.S
> @@ -16,13 +16,172 @@
>     License along with the GNU C Library; if not, see
>     <https://www.gnu.org/licenses/>.  */
>
> -#if IS_IN (libc)
> -# define strchr __strchr_sse2
> +#if IS_IN (libc) || defined STRCHR
> +# ifndef STRCHR
> +#  define STRCHR __strchr_sse2
> +# endif
>
> -# undef weak_alias
> -# define weak_alias(strchr, index)
> -# undef libc_hidden_builtin_def
> -# define libc_hidden_builtin_def(strchr)
> -#endif
> +# include <sysdep.h>
> +
> +       .text
> +ENTRY (STRCHR)
> +       movd    %esi, %xmm1
> +       movl    %edi, %eax
> +       andl    $4095, %eax
> +       punpcklbw %xmm1, %xmm1
> +       cmpl    $4032, %eax
> +       punpcklwd %xmm1, %xmm1
> +       pshufd  $0, %xmm1, %xmm1
> +       jg      L(cross_page)
> +       movdqu  (%rdi), %xmm0
> +       pxor    %xmm3, %xmm3
> +       movdqa  %xmm0, %xmm4
> +       pcmpeqb %xmm1, %xmm0
> +       pcmpeqb %xmm3, %xmm4
> +       por     %xmm4, %xmm0
> +       pmovmskb %xmm0, %eax
> +       test    %eax, %eax
> +       je      L(next_48_bytes)
> +       bsf     %eax, %eax
> +# ifdef AS_STRCHRNUL
> +       leaq    (%rdi,%rax), %rax
> +# else
> +       movl    $0, %edx
> +       leaq    (%rdi,%rax), %rax
> +       cmpb    %sil, (%rax)
> +       cmovne  %rdx, %rax
> +# endif
> +       ret
> +
> +       .p2align 3
> +L(next_48_bytes):
> +       movdqu  16(%rdi), %xmm0
> +       movdqa  %xmm0, %xmm4
> +       pcmpeqb %xmm1, %xmm0
> +       pcmpeqb %xmm3, %xmm4
> +       por     %xmm4, %xmm0
> +       pmovmskb %xmm0, %ecx
> +       movdqu  32(%rdi), %xmm0
> +       movdqa  %xmm0, %xmm4
> +       pcmpeqb %xmm1, %xmm0
> +       salq    $16, %rcx
> +       pcmpeqb %xmm3, %xmm4
> +       por     %xmm4, %xmm0
> +       pmovmskb %xmm0, %eax
> +       movdqu  48(%rdi), %xmm0
> +       pcmpeqb %xmm0, %xmm3
> +       salq    $32, %rax
> +       pcmpeqb %xmm1, %xmm0
> +       orq     %rcx, %rax
> +       por     %xmm3, %xmm0
> +       pmovmskb %xmm0, %ecx
> +       salq    $48, %rcx
> +       orq     %rcx, %rax
> +       testq   %rax, %rax
> +       jne     L(return)
> +L(loop_start):
> +       /* We use this alignment to force the loop to be aligned to 8 but
> +          not 16 bytes.  This gives better scheduling on AMD processors.  */
> +       .p2align 4
> +       pxor    %xmm6, %xmm6
> +       andq    $-64, %rdi
> +       .p2align 3
> +L(loop64):
> +       addq    $64, %rdi
> +       movdqa  (%rdi), %xmm5
> +       movdqa  16(%rdi), %xmm2
> +       movdqa  32(%rdi), %xmm3
> +       pxor    %xmm1, %xmm5
> +       movdqa  48(%rdi), %xmm4
> +       pxor    %xmm1, %xmm2
> +       pxor    %xmm1, %xmm3
> +       pminub  (%rdi), %xmm5
> +       pxor    %xmm1, %xmm4
> +       pminub  16(%rdi), %xmm2
> +       pminub  32(%rdi), %xmm3
> +       pminub  %xmm2, %xmm5
> +       pminub  48(%rdi), %xmm4
> +       pminub  %xmm3, %xmm5
> +       pminub  %xmm4, %xmm5
> +       pcmpeqb %xmm6, %xmm5
> +       pmovmskb %xmm5, %eax
> +
> +       testl   %eax, %eax
> +       je      L(loop64)
>
> -#include "../strchr.S"
> +       movdqa  (%rdi), %xmm5
> +       movdqa  %xmm5, %xmm0
> +       pcmpeqb %xmm1, %xmm5
> +       pcmpeqb %xmm6, %xmm0
> +       por     %xmm0, %xmm5
> +       pcmpeqb %xmm6, %xmm2
> +       pcmpeqb %xmm6, %xmm3
> +       pcmpeqb %xmm6, %xmm4
> +
> +       pmovmskb %xmm5, %ecx
> +       pmovmskb %xmm2, %eax
> +       salq    $16, %rax
> +       pmovmskb %xmm3, %r8d
> +       pmovmskb %xmm4, %edx
> +       salq    $32, %r8
> +       orq     %r8, %rax
> +       orq     %rcx, %rax
> +       salq    $48, %rdx
> +       orq     %rdx, %rax
> +       .p2align 3
> +L(return):
> +       bsfq    %rax, %rax
> +# ifdef AS_STRCHRNUL
> +       leaq    (%rdi,%rax), %rax
> +# else
> +       movl    $0, %edx
> +       leaq    (%rdi,%rax), %rax
> +       cmpb    %sil, (%rax)
> +       cmovne  %rdx, %rax
> +# endif
> +       ret
> +       .p2align 4
> +
> +L(cross_page):
> +       movq    %rdi, %rdx
> +       pxor    %xmm2, %xmm2
> +       andq    $-64, %rdx
> +       movdqa  %xmm1, %xmm0
> +       movdqa  (%rdx), %xmm3
> +       movdqa  %xmm3, %xmm4
> +       pcmpeqb %xmm1, %xmm3
> +       pcmpeqb %xmm2, %xmm4
> +       por     %xmm4, %xmm3
> +       pmovmskb %xmm3, %r8d
> +       movdqa  16(%rdx), %xmm3
> +       movdqa  %xmm3, %xmm4
> +       pcmpeqb %xmm1, %xmm3
> +       pcmpeqb %xmm2, %xmm4
> +       por     %xmm4, %xmm3
> +       pmovmskb %xmm3, %eax
> +       movdqa  32(%rdx), %xmm3
> +       movdqa  %xmm3, %xmm4
> +       pcmpeqb %xmm1, %xmm3
> +       salq    $16, %rax
> +       pcmpeqb %xmm2, %xmm4
> +       por     %xmm4, %xmm3
> +       pmovmskb %xmm3, %r9d
> +       movdqa  48(%rdx), %xmm3
> +       pcmpeqb %xmm3, %xmm2
> +       salq    $32, %r9
> +       pcmpeqb %xmm3, %xmm0
> +       orq     %r9, %rax
> +       orq     %r8, %rax
> +       por     %xmm2, %xmm0
> +       pmovmskb %xmm0, %ecx
> +       salq    $48, %rcx
> +       orq     %rcx, %rax
> +       movl    %edi, %ecx
> +       subb    %dl, %cl
> +       shrq    %cl, %rax
> +       testq   %rax, %rax
> +       jne     L(return)
> +       jmp     L(loop_start)
> +
> +END (STRCHR)
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/strchrnul-sse2.S b/sysdeps/x86_64/multiarch/strchrnul-sse2.S
> index f91c670369..7238977a21 100644
> --- a/sysdeps/x86_64/multiarch/strchrnul-sse2.S
> +++ b/sysdeps/x86_64/multiarch/strchrnul-sse2.S
> @@ -17,10 +17,11 @@
>     <https://www.gnu.org/licenses/>.  */
>
>  #if IS_IN (libc)
> -# define __strchrnul __strchrnul_sse2
> -
> -# undef weak_alias
> -# define weak_alias(__strchrnul, strchrnul)
> +# ifndef STRCHR
> +#  define STRCHR       __strchrnul_sse2
> +# endif
>  #endif
>
> -#include "../strchrnul.S"
> +#define AS_STRCHRNUL
> +
> +#include "strchr-sse2.S"
> diff --git a/sysdeps/x86_64/strchr.S b/sysdeps/x86_64/strchr.S
> index dda7c0431d..77c956c92c 100644
> --- a/sysdeps/x86_64/strchr.S
> +++ b/sysdeps/x86_64/strchr.S
> @@ -17,171 +17,8 @@
>     License along with the GNU C Library; if not, see
>     <https://www.gnu.org/licenses/>.  */
>
> -#include <sysdep.h>
>
> -       .text
> -ENTRY (strchr)
> -       movd    %esi, %xmm1
> -       movl    %edi, %eax
> -       andl    $4095, %eax
> -       punpcklbw %xmm1, %xmm1
> -       cmpl    $4032, %eax
> -       punpcklwd %xmm1, %xmm1
> -       pshufd  $0, %xmm1, %xmm1
> -       jg      L(cross_page)
> -       movdqu  (%rdi), %xmm0
> -       pxor    %xmm3, %xmm3
> -       movdqa  %xmm0, %xmm4
> -       pcmpeqb %xmm1, %xmm0
> -       pcmpeqb %xmm3, %xmm4
> -       por     %xmm4, %xmm0
> -       pmovmskb %xmm0, %eax
> -       test    %eax, %eax
> -       je      L(next_48_bytes)
> -       bsf     %eax, %eax
> -#ifdef AS_STRCHRNUL
> -       leaq    (%rdi,%rax), %rax
> -#else
> -       movl    $0, %edx
> -       leaq    (%rdi,%rax), %rax
> -       cmpb    %sil, (%rax)
> -       cmovne  %rdx, %rax
> -#endif
> -       ret
> -
> -       .p2align 3
> -       L(next_48_bytes):
> -       movdqu  16(%rdi), %xmm0
> -       movdqa  %xmm0, %xmm4
> -       pcmpeqb %xmm1, %xmm0
> -       pcmpeqb %xmm3, %xmm4
> -       por     %xmm4, %xmm0
> -       pmovmskb %xmm0, %ecx
> -       movdqu  32(%rdi), %xmm0
> -       movdqa  %xmm0, %xmm4
> -       pcmpeqb %xmm1, %xmm0
> -       salq    $16, %rcx
> -       pcmpeqb %xmm3, %xmm4
> -       por     %xmm4, %xmm0
> -       pmovmskb %xmm0, %eax
> -       movdqu  48(%rdi), %xmm0
> -       pcmpeqb %xmm0, %xmm3
> -       salq    $32, %rax
> -       pcmpeqb %xmm1, %xmm0
> -       orq     %rcx, %rax
> -       por     %xmm3, %xmm0
> -       pmovmskb %xmm0, %ecx
> -       salq    $48, %rcx
> -       orq     %rcx, %rax
> -       testq   %rax, %rax
> -       jne     L(return)
> -L(loop_start):
> -       /* We use this alignment to force the loop to be aligned to 8 but
> -          not 16 bytes.  This gives better scheduling on AMD processors.  */
> -       .p2align 4
> -       pxor    %xmm6, %xmm6
> -       andq    $-64, %rdi
> -       .p2align 3
> -L(loop64):
> -       addq    $64, %rdi
> -       movdqa  (%rdi), %xmm5
> -       movdqa  16(%rdi), %xmm2
> -       movdqa  32(%rdi), %xmm3
> -       pxor    %xmm1, %xmm5
> -       movdqa  48(%rdi), %xmm4
> -       pxor    %xmm1, %xmm2
> -       pxor    %xmm1, %xmm3
> -       pminub  (%rdi), %xmm5
> -       pxor    %xmm1, %xmm4
> -       pminub  16(%rdi), %xmm2
> -       pminub  32(%rdi), %xmm3
> -       pminub  %xmm2, %xmm5
> -       pminub  48(%rdi), %xmm4
> -       pminub  %xmm3, %xmm5
> -       pminub  %xmm4, %xmm5
> -       pcmpeqb %xmm6, %xmm5
> -       pmovmskb %xmm5, %eax
> -
> -       testl   %eax, %eax
> -       je      L(loop64)
> -
> -       movdqa  (%rdi), %xmm5
> -       movdqa  %xmm5, %xmm0
> -       pcmpeqb %xmm1, %xmm5
> -       pcmpeqb %xmm6, %xmm0
> -       por     %xmm0, %xmm5
> -       pcmpeqb %xmm6, %xmm2
> -       pcmpeqb %xmm6, %xmm3
> -       pcmpeqb %xmm6, %xmm4
> -
> -       pmovmskb %xmm5, %ecx
> -       pmovmskb %xmm2, %eax
> -       salq    $16, %rax
> -       pmovmskb %xmm3, %r8d
> -       pmovmskb %xmm4, %edx
> -       salq    $32, %r8
> -       orq     %r8, %rax
> -       orq     %rcx, %rax
> -       salq    $48, %rdx
> -       orq     %rdx, %rax
> -       .p2align 3
> -L(return):
> -       bsfq    %rax, %rax
> -#ifdef AS_STRCHRNUL
> -       leaq    (%rdi,%rax), %rax
> -#else
> -       movl    $0, %edx
> -       leaq    (%rdi,%rax), %rax
> -       cmpb    %sil, (%rax)
> -       cmovne  %rdx, %rax
> -#endif
> -       ret
> -       .p2align 4
> -
> -L(cross_page):
> -       movq    %rdi, %rdx
> -       pxor    %xmm2, %xmm2
> -       andq    $-64, %rdx
> -       movdqa  %xmm1, %xmm0
> -       movdqa  (%rdx), %xmm3
> -       movdqa  %xmm3, %xmm4
> -       pcmpeqb %xmm1, %xmm3
> -       pcmpeqb %xmm2, %xmm4
> -       por     %xmm4, %xmm3
> -       pmovmskb %xmm3, %r8d
> -       movdqa  16(%rdx), %xmm3
> -       movdqa  %xmm3, %xmm4
> -       pcmpeqb %xmm1, %xmm3
> -       pcmpeqb %xmm2, %xmm4
> -       por     %xmm4, %xmm3
> -       pmovmskb %xmm3, %eax
> -       movdqa  32(%rdx), %xmm3
> -       movdqa  %xmm3, %xmm4
> -       pcmpeqb %xmm1, %xmm3
> -       salq    $16, %rax
> -       pcmpeqb %xmm2, %xmm4
> -       por     %xmm4, %xmm3
> -       pmovmskb %xmm3, %r9d
> -       movdqa  48(%rdx), %xmm3
> -       pcmpeqb %xmm3, %xmm2
> -       salq    $32, %r9
> -       pcmpeqb %xmm3, %xmm0
> -       orq     %r9, %rax
> -       orq     %r8, %rax
> -       por     %xmm2, %xmm0
> -       pmovmskb %xmm0, %ecx
> -       salq    $48, %rcx
> -       orq     %rcx, %rax
> -       movl    %edi, %ecx
> -       subb    %dl, %cl
> -       shrq    %cl, %rax
> -       testq   %rax, %rax
> -       jne     L(return)
> -       jmp     L(loop_start)
> -
> -END (strchr)
> -
> -#ifndef AS_STRCHRNUL
> +#define STRCHR strchr
> +#include "multiarch/strchr-sse2.S"
>  weak_alias (strchr, index)
>  libc_hidden_builtin_def (strchr)
> -#endif
> diff --git a/sysdeps/x86_64/strchrnul.S b/sysdeps/x86_64/strchrnul.S
> index ec2e652e25..508e42db26 100644
> --- a/sysdeps/x86_64/strchrnul.S
> +++ b/sysdeps/x86_64/strchrnul.S
> @@ -18,10 +18,7 @@
>     License along with the GNU C Library; if not, see
>     <https://www.gnu.org/licenses/>.  */
>
> -#include <sysdep.h>
> -
> -#define strchr __strchrnul
> -#define AS_STRCHRNUL
> -#include "strchr.S"
> +#define STRCHR __strchrnul
> +#include "multiarch/strchrnul-sse2.S"
>
>  weak_alias (__strchrnul, strchrnul)
> --
> 2.34.1
>

LGTM.

Thanks.

-- 
H.J.

^ permalink raw reply	[flat|nested] 21+ messages in thread
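
The kernel that strchr-sse2.S gains above examines 16 bytes per compare: the
search character is broadcast into an XMM register, the data is compared both
against that character and against zero, the two results are ORed, and
pmovmskb turns them into a bitmask whose lowest set bit is either the first
match or the terminator.  A rough C sketch of one such step with SSE2
intrinsics (the function name and the strchrnul flag are illustrative, not
the glibc interface; it assumes the 16 bytes at P are readable, which the
assembly guarantees with its page-cross check):

    #include <emmintrin.h>
    #include <stddef.h>

    static const char *
    strchr_step_sse2 (const char *p, int c, int strchrnul_semantics)
    {
      __m128i needle = _mm_set1_epi8 ((char) c);  /* punpcklbw/punpcklwd/pshufd */
      __m128i zero   = _mm_setzero_si128 ();
      __m128i data   = _mm_loadu_si128 ((const __m128i *) p);

      __m128i eq_c   = _mm_cmpeq_epi8 (data, needle);
      __m128i eq_nul = _mm_cmpeq_epi8 (data, zero);
      int mask = _mm_movemask_epi8 (_mm_or_si128 (eq_c, eq_nul));

      if (mask == 0)
        return NULL;                    /* no match or NUL here; keep scanning */

      const char *hit = p + __builtin_ctz (mask);  /* bsf on the mask */

      /* strchr returns NULL when the scan stopped at the terminator rather
         than at C; the assembly does this branchlessly with cmovne.
         strchrnul returns the position either way.  */
      if (!strchrnul_semantics && *hit != (char) c)
        return NULL;
      return hit;
    }

This is also why strchrnul-sse2.S can now be a thin wrapper that defines
AS_STRCHRNUL and includes strchr-sse2.S: the only difference between the two
routines is that final NULL-vs-pointer decision.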

* Re: [PATCH v1] x86: Move strrchr SSE2 implementation to multiarch/strrchr-sse2.S
  2022-07-12 19:29 ` [PATCH v1] x86: Move strrchr SSE2 implementation to multiarch/strrchr-sse2.S Noah Goldstein
@ 2022-07-12 22:28   ` H.J. Lu
  0 siblings, 0 replies; 21+ messages in thread
From: H.J. Lu @ 2022-07-12 22:28 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell

On Tue, Jul 12, 2022 at 12:29 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> This commit doesn't affect libc.so.6, it's just housekeeping to prepare
> for adding explicit ISA level support.
>
> Tested build on x86_64 and x86_32 with/without multiarch.
> ---
>  sysdeps/x86_64/multiarch/strrchr-sse2.S | 358 ++++++++++++++++++++++-
>  sysdeps/x86_64/multiarch/wcsrchr-sse2.S |  10 +-
>  sysdeps/x86_64/strrchr.S                | 364 +-----------------------
>  sysdeps/x86_64/wcsrchr.S                |  11 +-
>  4 files changed, 366 insertions(+), 377 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/strrchr-sse2.S b/sysdeps/x86_64/multiarch/strrchr-sse2.S
> index 866396e947..6ee7a5e33a 100644
> --- a/sysdeps/x86_64/multiarch/strrchr-sse2.S
> +++ b/sysdeps/x86_64/multiarch/strrchr-sse2.S
> @@ -17,12 +17,358 @@
>     <https://www.gnu.org/licenses/>.  */
>
>  #if IS_IN (libc)
> -# define STRRCHR __strrchr_sse2
> +# ifndef STRRCHR
> +#  define STRRCHR __strrchr_sse2
> +# endif
> +#endif
> +
> +#include <sysdep.h>
> +
> +#ifdef USE_AS_WCSRCHR
> +# define PCMPEQ        pcmpeqd
> +# define CHAR_SIZE     4
> +# define PMINU pminud
> +#else
> +# define PCMPEQ        pcmpeqb
> +# define CHAR_SIZE     1
> +# define PMINU pminub
> +#endif
> +
> +#define PAGE_SIZE      4096
> +#define VEC_SIZE       16
> +
> +       .text
> +ENTRY(STRRCHR)
> +       movd    %esi, %xmm0
> +       movq    %rdi, %rax
> +       andl    $(PAGE_SIZE - 1), %eax
> +#ifndef USE_AS_WCSRCHR
> +       punpcklbw %xmm0, %xmm0
> +       punpcklwd %xmm0, %xmm0
> +#endif
> +       pshufd  $0, %xmm0, %xmm0
> +       cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
> +       ja      L(cross_page)
> +
> +L(cross_page_continue):
> +       movups  (%rdi), %xmm1
> +       pxor    %xmm2, %xmm2
> +       PCMPEQ  %xmm1, %xmm2
> +       pmovmskb %xmm2, %ecx
> +       testl   %ecx, %ecx
> +       jz      L(aligned_more)
> +
> +       PCMPEQ  %xmm0, %xmm1
> +       pmovmskb %xmm1, %eax
> +       leal    -1(%rcx), %edx
> +       xorl    %edx, %ecx
> +       andl    %ecx, %eax
> +       jz      L(ret0)
> +       bsrl    %eax, %eax
> +       addq    %rdi, %rax
> +       /* We are off by 3 for wcsrchr if search CHAR is non-zero. If
> +          search CHAR is zero we are correct. Either way `andq
> +          -CHAR_SIZE, %rax` gets the correct result.  */
> +#ifdef USE_AS_WCSRCHR
> +       andq    $-CHAR_SIZE, %rax
> +#endif
> +L(ret0):
> +       ret
> +
> +       /* Returns for first vec x1/x2 have hard coded backward search
> +          path for earlier matches.  */
> +       .p2align 4
> +L(first_vec_x0_test):
> +       PCMPEQ  %xmm0, %xmm1
> +       pmovmskb %xmm1, %eax
> +       testl   %eax, %eax
> +       jz      L(ret0)
> +       bsrl    %eax, %eax
> +       addq    %r8, %rax
> +#ifdef USE_AS_WCSRCHR
> +       andq    $-CHAR_SIZE, %rax
> +#endif
> +       ret
> +
> +       .p2align 4
> +L(first_vec_x1):
> +       PCMPEQ  %xmm0, %xmm2
> +       pmovmskb %xmm2, %eax
> +       leal    -1(%rcx), %edx
> +       xorl    %edx, %ecx
> +       andl    %ecx, %eax
> +       jz      L(first_vec_x0_test)
> +       bsrl    %eax, %eax
> +       leaq    (VEC_SIZE)(%rdi, %rax), %rax
> +#ifdef USE_AS_WCSRCHR
> +       andq    $-CHAR_SIZE, %rax
> +#endif
> +       ret
> +
> +       .p2align 4
> +L(first_vec_x1_test):
> +       PCMPEQ  %xmm0, %xmm2
> +       pmovmskb %xmm2, %eax
> +       testl   %eax, %eax
> +       jz      L(first_vec_x0_test)
> +       bsrl    %eax, %eax
> +       leaq    (VEC_SIZE)(%rdi, %rax), %rax
> +#ifdef USE_AS_WCSRCHR
> +       andq    $-CHAR_SIZE, %rax
> +#endif
> +       ret
> +
> +       .p2align 4
> +L(first_vec_x2):
> +       PCMPEQ  %xmm0, %xmm3
> +       pmovmskb %xmm3, %eax
> +       leal    -1(%rcx), %edx
> +       xorl    %edx, %ecx
> +       andl    %ecx, %eax
> +       jz      L(first_vec_x1_test)
> +       bsrl    %eax, %eax
> +       leaq    (VEC_SIZE * 2)(%rdi, %rax), %rax
> +#ifdef USE_AS_WCSRCHR
> +       andq    $-CHAR_SIZE, %rax
> +#endif
> +       ret
> +
> +       .p2align 4
> +L(aligned_more):
> +       /* Save original pointer if match was in VEC 0.  */
> +       movq    %rdi, %r8
> +       andq    $-VEC_SIZE, %rdi
> +
> +       movaps  VEC_SIZE(%rdi), %xmm2
> +       pxor    %xmm3, %xmm3
> +       PCMPEQ  %xmm2, %xmm3
> +       pmovmskb %xmm3, %ecx
> +       testl   %ecx, %ecx
> +       jnz     L(first_vec_x1)
> +
> +       movaps  (VEC_SIZE * 2)(%rdi), %xmm3
> +       pxor    %xmm4, %xmm4
> +       PCMPEQ  %xmm3, %xmm4
> +       pmovmskb %xmm4, %ecx
> +       testl   %ecx, %ecx
> +       jnz     L(first_vec_x2)
> +
> +       addq    $VEC_SIZE, %rdi
> +       /* Save pointer again before realigning.  */
> +       movq    %rdi, %rsi
> +       andq    $-(VEC_SIZE * 2), %rdi
> +       .p2align 4
> +L(first_loop):
> +       /* Do 2x VEC at a time.  */
> +       movaps  (VEC_SIZE * 2)(%rdi), %xmm4
> +       movaps  (VEC_SIZE * 3)(%rdi), %xmm5
> +       /* Since SSE2 has no pminud, wcsrchr needs separate logic for
> +          detecting zero.  Note if this is found to be a bottleneck it
> +          may be worth adding an SSE4.1 wcsrchr implementation.  */
> +#ifdef USE_AS_WCSRCHR
> +       movaps  %xmm5, %xmm6
> +       pxor    %xmm8, %xmm8
> +
> +       PCMPEQ  %xmm8, %xmm5
> +       PCMPEQ  %xmm4, %xmm8
> +       por     %xmm5, %xmm8
> +#else
> +       movaps  %xmm5, %xmm6
> +       PMINU   %xmm4, %xmm5
> +#endif
> +
> +       movaps  %xmm4, %xmm9
> +       PCMPEQ  %xmm0, %xmm4
> +       PCMPEQ  %xmm0, %xmm6
> +       movaps  %xmm6, %xmm7
> +       por     %xmm4, %xmm6
> +#ifndef USE_AS_WCSRCHR
> +       pxor    %xmm8, %xmm8
> +       PCMPEQ  %xmm5, %xmm8
> +#endif
> +       pmovmskb %xmm8, %ecx
> +       pmovmskb %xmm6, %eax
>
> -# undef weak_alias
> -# define weak_alias(strrchr, rindex)
> -# undef libc_hidden_builtin_def
> -# define libc_hidden_builtin_def(strrchr)
> +       addq    $(VEC_SIZE * 2), %rdi
> +       /* Use `addl` 1) so we can undo it with `subl` and 2) it can
> +          macro-fuse with `jz`.  */
> +       addl    %ecx, %eax
> +       jz      L(first_loop)
> +
> +       /* Check if there is zero match.  */
> +       testl   %ecx, %ecx
> +       jz      L(second_loop_match)
> +
> +       /* Check if there was a match in last iteration.  */
> +       subl    %ecx, %eax
> +       jnz     L(new_match)
> +
> +L(first_loop_old_match):
> +       PCMPEQ  %xmm0, %xmm2
> +       PCMPEQ  %xmm0, %xmm3
> +       pmovmskb %xmm2, %ecx
> +       pmovmskb %xmm3, %eax
> +       addl    %eax, %ecx
> +       jz      L(first_vec_x0_test)
> +       /* NB: We could move this shift to before the branch and save a
> +          bit of code size / performance on the fall through. The
> +          branch leads to the null case which generally seems hotter
> +          than char in first 3x VEC.  */
> +       sall    $16, %eax
> +       orl     %ecx, %eax
> +
> +       bsrl    %eax, %eax
> +       addq    %rsi, %rax
> +#ifdef USE_AS_WCSRCHR
> +       andq    $-CHAR_SIZE, %rax
> +#endif
> +       ret
> +
> +       .p2align 4
> +L(new_match):
> +       pxor    %xmm6, %xmm6
> +       PCMPEQ  %xmm9, %xmm6
> +       pmovmskb %xmm6, %eax
> +       sall    $16, %ecx
> +       orl     %eax, %ecx
> +
> +       /* We can't reuse either of the old comparisons since we mask
> +          off zeros after the first zero (instead of using the full
> +          comparison), so we can't guarantee no interference between a
> +          match after the end of the string and a valid match.  */
> +       pmovmskb %xmm4, %eax
> +       pmovmskb %xmm7, %edx
> +       sall    $16, %edx
> +       orl     %edx, %eax
> +
> +       leal    -1(%ecx), %edx
> +       xorl    %edx, %ecx
> +       andl    %ecx, %eax
> +       jz      L(first_loop_old_match)
> +       bsrl    %eax, %eax
> +       addq    %rdi, %rax
> +#ifdef USE_AS_WCSRCHR
> +       andq    $-CHAR_SIZE, %rax
> +#endif
> +       ret
> +
> +       /* Save minimum state for getting most recent match. We can
> +          throw out all previous work.  */
> +       .p2align 4
> +L(second_loop_match):
> +       movq    %rdi, %rsi
> +       movaps  %xmm4, %xmm2
> +       movaps  %xmm7, %xmm3
> +
> +       .p2align 4
> +L(second_loop):
> +       movaps  (VEC_SIZE * 2)(%rdi), %xmm4
> +       movaps  (VEC_SIZE * 3)(%rdi), %xmm5
> +       /* Since SSE2 has no pminud, wcsrchr needs separate logic for
> +          detecting zero.  Note if this is found to be a bottleneck it
> +          may be worth adding an SSE4.1 wcsrchr implementation.  */
> +#ifdef USE_AS_WCSRCHR
> +       movaps  %xmm5, %xmm6
> +       pxor    %xmm8, %xmm8
> +
> +       PCMPEQ  %xmm8, %xmm5
> +       PCMPEQ  %xmm4, %xmm8
> +       por     %xmm5, %xmm8
> +#else
> +       movaps  %xmm5, %xmm6
> +       PMINU   %xmm4, %xmm5
> +#endif
> +
> +       movaps  %xmm4, %xmm9
> +       PCMPEQ  %xmm0, %xmm4
> +       PCMPEQ  %xmm0, %xmm6
> +       movaps  %xmm6, %xmm7
> +       por     %xmm4, %xmm6
> +#ifndef USE_AS_WCSRCHR
> +       pxor    %xmm8, %xmm8
> +       PCMPEQ  %xmm5, %xmm8
>  #endif
>
> -#include "../strrchr.S"
> +       pmovmskb %xmm8, %ecx
> +       pmovmskb %xmm6, %eax
> +
> +       addq    $(VEC_SIZE * 2), %rdi
> +       /* Either null term or new occurrence of CHAR.  */
> +       addl    %ecx, %eax
> +       jz      L(second_loop)
> +
> +       /* No null term so it must be a new occurrence of CHAR.  */
> +       testl   %ecx, %ecx
> +       jz      L(second_loop_match)
> +
> +
> +       subl    %ecx, %eax
> +       jnz     L(second_loop_new_match)
> +
> +L(second_loop_old_match):
> +       pmovmskb %xmm2, %ecx
> +       pmovmskb %xmm3, %eax
> +       sall    $16, %eax
> +       orl     %ecx, %eax
> +       bsrl    %eax, %eax
> +       addq    %rsi, %rax
> +#ifdef USE_AS_WCSRCHR
> +       andq    $-CHAR_SIZE, %rax
> +#endif
> +       ret
> +
> +       .p2align 4
> +L(second_loop_new_match):
> +       pxor    %xmm6, %xmm6
> +       PCMPEQ  %xmm9, %xmm6
> +       pmovmskb %xmm6, %eax
> +       sall    $16, %ecx
> +       orl     %eax, %ecx
> +
> +       /* We can't reuse either of the old comparisons since we mask
> +          off zeros after the first zero (instead of using the full
> +          comparison), so we can't guarantee no interference between a
> +          match after the end of the string and a valid match.  */
> +       pmovmskb %xmm4, %eax
> +       pmovmskb %xmm7, %edx
> +       sall    $16, %edx
> +       orl     %edx, %eax
> +
> +       leal    -1(%ecx), %edx
> +       xorl    %edx, %ecx
> +       andl    %ecx, %eax
> +       jz      L(second_loop_old_match)
> +       bsrl    %eax, %eax
> +       addq    %rdi, %rax
> +#ifdef USE_AS_WCSRCHR
> +       andq    $-CHAR_SIZE, %rax
> +#endif
> +       ret
> +
> +       .p2align 4,, 4
> +L(cross_page):
> +       movq    %rdi, %rsi
> +       andq    $-VEC_SIZE, %rsi
> +       movaps  (%rsi), %xmm1
> +       pxor    %xmm2, %xmm2
> +       PCMPEQ  %xmm1, %xmm2
> +       pmovmskb %xmm2, %edx
> +       movl    %edi, %ecx
> +       andl    $(VEC_SIZE - 1), %ecx
> +       sarl    %cl, %edx
> +       jz      L(cross_page_continue)
> +       PCMPEQ  %xmm0, %xmm1
> +       pmovmskb %xmm1, %eax
> +       sarl    %cl, %eax
> +       leal    -1(%rdx), %ecx
> +       xorl    %edx, %ecx
> +       andl    %ecx, %eax
> +       jz      L(ret1)
> +       bsrl    %eax, %eax
> +       addq    %rdi, %rax
> +#ifdef USE_AS_WCSRCHR
> +       andq    $-CHAR_SIZE, %rax
> +#endif
> +L(ret1):
> +       ret
> +END(STRRCHR)
> diff --git a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
> index 69d2f3cdb1..d9259720f8 100644
> --- a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
> +++ b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
> @@ -17,6 +17,12 @@
>     <https://www.gnu.org/licenses/>.  */
>
>  #if IS_IN (libc)
> -# define STRRCHR       __wcsrchr_sse2
> +# ifndef STRRCHR
> +#  define STRRCHR      __wcsrchr_sse2
> +# endif
>  #endif
> -#include "../wcsrchr.S"
> +
> +#define USE_AS_WCSRCHR 1
> +#define NO_PMINU       1
> +
> +#include "strrchr-sse2.S"
> diff --git a/sysdeps/x86_64/strrchr.S b/sysdeps/x86_64/strrchr.S
> index 4d7ba4ceb2..f39da60454 100644
> --- a/sysdeps/x86_64/strrchr.S
> +++ b/sysdeps/x86_64/strrchr.S
> @@ -16,363 +16,7 @@
>     License along with the GNU C Library; if not, see
>     <https://www.gnu.org/licenses/>.  */
>
> -
> -#include <sysdep.h>
> -
> -#ifndef STRRCHR
> -# define STRRCHR       strrchr
> -#endif
> -
> -#ifdef USE_AS_WCSRCHR
> -# define PCMPEQ        pcmpeqd
> -# define CHAR_SIZE     4
> -# define PMINU pminud
> -#else
> -# define PCMPEQ        pcmpeqb
> -# define CHAR_SIZE     1
> -# define PMINU pminub
> -#endif
> -
> -#define PAGE_SIZE      4096
> -#define VEC_SIZE       16
> -
> -       .text
> -ENTRY(STRRCHR)
> -       movd    %esi, %xmm0
> -       movq    %rdi, %rax
> -       andl    $(PAGE_SIZE - 1), %eax
> -#ifndef USE_AS_WCSRCHR
> -       punpcklbw %xmm0, %xmm0
> -       punpcklwd %xmm0, %xmm0
> -#endif
> -       pshufd  $0, %xmm0, %xmm0
> -       cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
> -       ja      L(cross_page)
> -
> -L(cross_page_continue):
> -       movups  (%rdi), %xmm1
> -       pxor    %xmm2, %xmm2
> -       PCMPEQ  %xmm1, %xmm2
> -       pmovmskb %xmm2, %ecx
> -       testl   %ecx, %ecx
> -       jz      L(aligned_more)
> -
> -       PCMPEQ  %xmm0, %xmm1
> -       pmovmskb %xmm1, %eax
> -       leal    -1(%rcx), %edx
> -       xorl    %edx, %ecx
> -       andl    %ecx, %eax
> -       jz      L(ret0)
> -       bsrl    %eax, %eax
> -       addq    %rdi, %rax
> -       /* We are off by 3 for wcsrchr if search CHAR is non-zero. If
> -          search CHAR is zero we are correct. Either way `andq
> -          -CHAR_SIZE, %rax` gets the correct result.  */
> -#ifdef USE_AS_WCSRCHR
> -       andq    $-CHAR_SIZE, %rax
> -#endif
> -L(ret0):
> -       ret
> -
> -       /* Returns for first vec x1/x2 have hard coded backward search
> -          path for earlier matches.  */
> -       .p2align 4
> -L(first_vec_x0_test):
> -       PCMPEQ  %xmm0, %xmm1
> -       pmovmskb %xmm1, %eax
> -       testl   %eax, %eax
> -       jz      L(ret0)
> -       bsrl    %eax, %eax
> -       addq    %r8, %rax
> -#ifdef USE_AS_WCSRCHR
> -       andq    $-CHAR_SIZE, %rax
> -#endif
> -       ret
> -
> -       .p2align 4
> -L(first_vec_x1):
> -       PCMPEQ  %xmm0, %xmm2
> -       pmovmskb %xmm2, %eax
> -       leal    -1(%rcx), %edx
> -       xorl    %edx, %ecx
> -       andl    %ecx, %eax
> -       jz      L(first_vec_x0_test)
> -       bsrl    %eax, %eax
> -       leaq    (VEC_SIZE)(%rdi, %rax), %rax
> -#ifdef USE_AS_WCSRCHR
> -       andq    $-CHAR_SIZE, %rax
> -#endif
> -       ret
> -
> -       .p2align 4
> -L(first_vec_x1_test):
> -       PCMPEQ  %xmm0, %xmm2
> -       pmovmskb %xmm2, %eax
> -       testl   %eax, %eax
> -       jz      L(first_vec_x0_test)
> -       bsrl    %eax, %eax
> -       leaq    (VEC_SIZE)(%rdi, %rax), %rax
> -#ifdef USE_AS_WCSRCHR
> -       andq    $-CHAR_SIZE, %rax
> -#endif
> -       ret
> -
> -       .p2align 4
> -L(first_vec_x2):
> -       PCMPEQ  %xmm0, %xmm3
> -       pmovmskb %xmm3, %eax
> -       leal    -1(%rcx), %edx
> -       xorl    %edx, %ecx
> -       andl    %ecx, %eax
> -       jz      L(first_vec_x1_test)
> -       bsrl    %eax, %eax
> -       leaq    (VEC_SIZE * 2)(%rdi, %rax), %rax
> -#ifdef USE_AS_WCSRCHR
> -       andq    $-CHAR_SIZE, %rax
> -#endif
> -       ret
> -
> -       .p2align 4
> -L(aligned_more):
> -       /* Save original pointer if match was in VEC 0.  */
> -       movq    %rdi, %r8
> -       andq    $-VEC_SIZE, %rdi
> -
> -       movaps  VEC_SIZE(%rdi), %xmm2
> -       pxor    %xmm3, %xmm3
> -       PCMPEQ  %xmm2, %xmm3
> -       pmovmskb %xmm3, %ecx
> -       testl   %ecx, %ecx
> -       jnz     L(first_vec_x1)
> -
> -       movaps  (VEC_SIZE * 2)(%rdi), %xmm3
> -       pxor    %xmm4, %xmm4
> -       PCMPEQ  %xmm3, %xmm4
> -       pmovmskb %xmm4, %ecx
> -       testl   %ecx, %ecx
> -       jnz     L(first_vec_x2)
> -
> -       addq    $VEC_SIZE, %rdi
> -       /* Save pointer again before realigning.  */
> -       movq    %rdi, %rsi
> -       andq    $-(VEC_SIZE * 2), %rdi
> -       .p2align 4
> -L(first_loop):
> -       /* Do 2x VEC at a time.  */
> -       movaps  (VEC_SIZE * 2)(%rdi), %xmm4
> -       movaps  (VEC_SIZE * 3)(%rdi), %xmm5
> -       /* Since SSE2 has no pminud, wcsrchr needs separate logic for
> -          detecting zero.  Note if this is found to be a bottleneck it
> -          may be worth adding an SSE4.1 wcsrchr implementation.  */
> -#ifdef USE_AS_WCSRCHR
> -       movaps  %xmm5, %xmm6
> -       pxor    %xmm8, %xmm8
> -
> -       PCMPEQ  %xmm8, %xmm5
> -       PCMPEQ  %xmm4, %xmm8
> -       por     %xmm5, %xmm8
> -#else
> -       movaps  %xmm5, %xmm6
> -       PMINU   %xmm4, %xmm5
> -#endif
> -
> -       movaps  %xmm4, %xmm9
> -       PCMPEQ  %xmm0, %xmm4
> -       PCMPEQ  %xmm0, %xmm6
> -       movaps  %xmm6, %xmm7
> -       por     %xmm4, %xmm6
> -#ifndef USE_AS_WCSRCHR
> -       pxor    %xmm8, %xmm8
> -       PCMPEQ  %xmm5, %xmm8
> -#endif
> -       pmovmskb %xmm8, %ecx
> -       pmovmskb %xmm6, %eax
> -
> -       addq    $(VEC_SIZE * 2), %rdi
> -       /* Use `addl` 1) so we can undo it with `subl` and 2) it can
> -          macro-fuse with `jz`.  */
> -       addl    %ecx, %eax
> -       jz      L(first_loop)
> -
> -       /* Check if there is zero match.  */
> -       testl   %ecx, %ecx
> -       jz      L(second_loop_match)
> -
> -       /* Check if there was a match in last iteration.  */
> -       subl    %ecx, %eax
> -       jnz     L(new_match)
> -
> -L(first_loop_old_match):
> -       PCMPEQ  %xmm0, %xmm2
> -       PCMPEQ  %xmm0, %xmm3
> -       pmovmskb %xmm2, %ecx
> -       pmovmskb %xmm3, %eax
> -       addl    %eax, %ecx
> -       jz      L(first_vec_x0_test)
> -       /* NB: We could move this shift to before the branch and save a
> -          bit of code size / performance on the fall through. The
> -          branch leads to the null case which generally seems hotter
> -          than char in first 3x VEC.  */
> -       sall    $16, %eax
> -       orl     %ecx, %eax
> -
> -       bsrl    %eax, %eax
> -       addq    %rsi, %rax
> -#ifdef USE_AS_WCSRCHR
> -       andq    $-CHAR_SIZE, %rax
> -#endif
> -       ret
> -
> -       .p2align 4
> -L(new_match):
> -       pxor    %xmm6, %xmm6
> -       PCMPEQ  %xmm9, %xmm6
> -       pmovmskb %xmm6, %eax
> -       sall    $16, %ecx
> -       orl     %eax, %ecx
> -
> -       /* We can't reuse either of the old comparisons since we mask
> -          off zeros after the first zero (instead of using the full
> -          comparison), so we can't guarantee no interference between a
> -          match after the end of the string and a valid match.  */
> -       pmovmskb %xmm4, %eax
> -       pmovmskb %xmm7, %edx
> -       sall    $16, %edx
> -       orl     %edx, %eax
> -
> -       leal    -1(%ecx), %edx
> -       xorl    %edx, %ecx
> -       andl    %ecx, %eax
> -       jz      L(first_loop_old_match)
> -       bsrl    %eax, %eax
> -       addq    %rdi, %rax
> -#ifdef USE_AS_WCSRCHR
> -       andq    $-CHAR_SIZE, %rax
> -#endif
> -       ret
> -
> -       /* Save minimum state for getting most recent match. We can
> -          throw out all previous work.  */
> -       .p2align 4
> -L(second_loop_match):
> -       movq    %rdi, %rsi
> -       movaps  %xmm4, %xmm2
> -       movaps  %xmm7, %xmm3
> -
> -       .p2align 4
> -L(second_loop):
> -       movaps  (VEC_SIZE * 2)(%rdi), %xmm4
> -       movaps  (VEC_SIZE * 3)(%rdi), %xmm5
> -       /* Since SSE2 has no pminud, wcsrchr needs separate logic for
> -          detecting zero.  Note if this is found to be a bottleneck it
> -          may be worth adding an SSE4.1 wcsrchr implementation.  */
> -#ifdef USE_AS_WCSRCHR
> -       movaps  %xmm5, %xmm6
> -       pxor    %xmm8, %xmm8
> -
> -       PCMPEQ  %xmm8, %xmm5
> -       PCMPEQ  %xmm4, %xmm8
> -       por     %xmm5, %xmm8
> -#else
> -       movaps  %xmm5, %xmm6
> -       PMINU   %xmm4, %xmm5
> -#endif
> -
> -       movaps  %xmm4, %xmm9
> -       PCMPEQ  %xmm0, %xmm4
> -       PCMPEQ  %xmm0, %xmm6
> -       movaps  %xmm6, %xmm7
> -       por     %xmm4, %xmm6
> -#ifndef USE_AS_WCSRCHR
> -       pxor    %xmm8, %xmm8
> -       PCMPEQ  %xmm5, %xmm8
> -#endif
> -
> -       pmovmskb %xmm8, %ecx
> -       pmovmskb %xmm6, %eax
> -
> -       addq    $(VEC_SIZE * 2), %rdi
> -       /* Either null term or new occurrence of CHAR.  */
> -       addl    %ecx, %eax
> -       jz      L(second_loop)
> -
> -       /* No null term so it must be a new occurrence of CHAR.  */
> -       testl   %ecx, %ecx
> -       jz      L(second_loop_match)
> -
> -
> -       subl    %ecx, %eax
> -       jnz     L(second_loop_new_match)
> -
> -L(second_loop_old_match):
> -       pmovmskb %xmm2, %ecx
> -       pmovmskb %xmm3, %eax
> -       sall    $16, %eax
> -       orl     %ecx, %eax
> -       bsrl    %eax, %eax
> -       addq    %rsi, %rax
> -#ifdef USE_AS_WCSRCHR
> -       andq    $-CHAR_SIZE, %rax
> -#endif
> -       ret
> -
> -       .p2align 4
> -L(second_loop_new_match):
> -       pxor    %xmm6, %xmm6
> -       PCMPEQ  %xmm9, %xmm6
> -       pmovmskb %xmm6, %eax
> -       sall    $16, %ecx
> -       orl     %eax, %ecx
> -
> -       /* We can't reuse either of the old comparisons since we mask
> -          off zeros after the first zero (instead of using the full
> -          comparison), so we can't guarantee no interference between a
> -          match after the end of the string and a valid match.  */
> -       pmovmskb %xmm4, %eax
> -       pmovmskb %xmm7, %edx
> -       sall    $16, %edx
> -       orl     %edx, %eax
> -
> -       leal    -1(%ecx), %edx
> -       xorl    %edx, %ecx
> -       andl    %ecx, %eax
> -       jz      L(second_loop_old_match)
> -       bsrl    %eax, %eax
> -       addq    %rdi, %rax
> -#ifdef USE_AS_WCSRCHR
> -       andq    $-CHAR_SIZE, %rax
> -#endif
> -       ret
> -
> -       .p2align 4,, 4
> -L(cross_page):
> -       movq    %rdi, %rsi
> -       andq    $-VEC_SIZE, %rsi
> -       movaps  (%rsi), %xmm1
> -       pxor    %xmm2, %xmm2
> -       PCMPEQ  %xmm1, %xmm2
> -       pmovmskb %xmm2, %edx
> -       movl    %edi, %ecx
> -       andl    $(VEC_SIZE - 1), %ecx
> -       sarl    %cl, %edx
> -       jz      L(cross_page_continue)
> -       PCMPEQ  %xmm0, %xmm1
> -       pmovmskb %xmm1, %eax
> -       sarl    %cl, %eax
> -       leal    -1(%rdx), %ecx
> -       xorl    %edx, %ecx
> -       andl    %ecx, %eax
> -       jz      L(ret1)
> -       bsrl    %eax, %eax
> -       addq    %rdi, %rax
> -#ifdef USE_AS_WCSRCHR
> -       andq    $-CHAR_SIZE, %rax
> -#endif
> -L(ret1):
> -       ret
> -END(STRRCHR)
> -
> -#ifndef USE_AS_WCSRCHR
> -       weak_alias (STRRCHR, rindex)
> -       libc_hidden_builtin_def (STRRCHR)
> -#endif
> +#define STRRCHR        strrchr
> +#include "multiarch/strrchr-sse2.S"
> +weak_alias (strrchr, rindex)
> +libc_hidden_builtin_def (strrchr)
> diff --git a/sysdeps/x86_64/wcsrchr.S b/sysdeps/x86_64/wcsrchr.S
> index 2b80efc5ef..1d4b1eb21c 100644
> --- a/sysdeps/x86_64/wcsrchr.S
> +++ b/sysdeps/x86_64/wcsrchr.S
> @@ -16,12 +16,5 @@
>     License along with the GNU C Library; if not, see
>     <https://www.gnu.org/licenses/>.  */
>
> -
> -#define USE_AS_WCSRCHR 1
> -#define NO_PMINU       1
> -
> -#ifndef STRRCHR
> -# define STRRCHR       wcsrchr
> -#endif
> -
> -#include "../strrchr.S"
> +#define STRRCHR        wcsrchr
> +#include "multiarch/wcsrchr-sse2.S"
> --
> 2.34.1
>

LGTM.

Thanks.

-- 
H.J.

^ permalink raw reply	[flat|nested] 21+ messages in thread
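
All of the return paths in the strrchr-sse2.S code above share one bit trick:
given ecx = pmovmskb mask of NUL positions and eax = mask of character
matches, "leal -1(%rcx), %edx; xorl %edx, %ecx; andl %ecx, %eax" keeps only
the matches at or before the first terminator, and bsrl then picks the last
of them.  A small C sketch of that step (the helper name is illustrative):

    /* CHAR_MASK and NUL_MASK are pmovmskb results for "byte == C" and
       "byte == 0"; NUL_MASK is known to be non-zero.  Returns the byte
       index of the last valid match, or -1 if there is none.  */
    static int
    last_match_before_nul (unsigned int char_mask, unsigned int nul_mask)
    {
      /* (nul_mask - 1) ^ nul_mask sets every bit up to and including the
         lowest set bit of nul_mask, i.e. every position at or before the
         first terminator.  */
      unsigned int upto_first_nul = (nul_mask - 1) ^ nul_mask;
      unsigned int valid = char_mask & upto_first_nul;
      if (valid == 0)
        return -1;
      return 31 - __builtin_clz (valid);   /* bsrl: highest set bit */
    }

For wcsrchr the resulting byte index can land in the middle of a 4-byte
character when the search character is non-zero, which is why the code rounds
the final pointer down with "andq $-CHAR_SIZE, %rax", as its comments note.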

* Re: [PATCH v1] x86: Move memrchr SSE2 implementation to multiarch/memrchr-sse2.S
  2022-07-12 19:29 ` [PATCH v1] x86: Move memrchr SSE2 implementation to multiarch/memrchr-sse2.S Noah Goldstein
@ 2022-07-12 22:58   ` H.J. Lu
  0 siblings, 0 replies; 21+ messages in thread
From: H.J. Lu @ 2022-07-12 22:58 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell

On Tue, Jul 12, 2022 at 12:29 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> This commit doesn't affect libc.so.6, it's just housekeeping to prepare
> for adding explicit ISA level support.
>
> Tested build on x86_64 and x86_32 with/without multiarch.
> ---
>  sysdeps/x86_64/memrchr.S                | 332 +----------------------
>  sysdeps/x86_64/multiarch/memrchr-sse2.S | 336 +++++++++++++++++++++++-
>  2 files changed, 334 insertions(+), 334 deletions(-)
>
> diff --git a/sysdeps/x86_64/memrchr.S b/sysdeps/x86_64/memrchr.S
> index b0dffd2ae2..385e2c5668 100644
> --- a/sysdeps/x86_64/memrchr.S
> +++ b/sysdeps/x86_64/memrchr.S
> @@ -17,334 +17,6 @@
>     License along with the GNU C Library; if not, see
>     <https://www.gnu.org/licenses/>.  */
>
> -#include <sysdep.h>
> -#define VEC_SIZE                       16
> -#define PAGE_SIZE                      4096
> -
> -       .text
> -ENTRY_P2ALIGN(__memrchr, 6)
> -#ifdef __ILP32__
> -       /* Clear upper bits.  */
> -       mov     %RDX_LP, %RDX_LP
> -#endif
> -       movd    %esi, %xmm0
> -
> -       /* Get end pointer.  */
> -       leaq    (%rdx, %rdi), %rcx
> -
> -       punpcklbw %xmm0, %xmm0
> -       punpcklwd %xmm0, %xmm0
> -       pshufd  $0, %xmm0, %xmm0
> -
> -       /* Check if we can load 1x VEC without crossing a page.  */
> -       testl   $(PAGE_SIZE - VEC_SIZE), %ecx
> -       jz      L(page_cross)
> -
> -       /* NB: This load happens regardless of whether rdx (len) is zero.  Since
> -          it doesn't cross a page and the standard guarantees any pointer has
> -          at least one valid byte, this load must be safe.  For the entire
> -          history of the x86 memrchr implementation this has been possible, so
> -          no code "should" be relying on a zero-length check before this load.
> -          The zero-length check is moved to the page-cross case because it is
> -          1) pretty cold and 2) including it pushes the hot case (len <= VEC_SIZE)
> -          into 2 cache lines.  */
> -       movups  -(VEC_SIZE)(%rcx), %xmm1
> -       pcmpeqb %xmm0, %xmm1
> -       pmovmskb %xmm1, %eax
> -
> -       subq    $VEC_SIZE, %rdx
> -       ja      L(more_1x_vec)
> -L(ret_vec_x0_test):
> -       /* Zero-flag set if eax (src) is zero. Destination unchanged if src is
> -          zero.  */
> -       bsrl    %eax, %eax
> -       jz      L(ret_0)
> -       /* Check if the CHAR match is in bounds. Need to truly zero `eax` here
> -          if out of bounds.  */
> -       addl    %edx, %eax
> -       jl      L(zero_0)
> -       /* Since we subtracted VEC_SIZE from rdx earlier we can just add to base
> -          ptr.  */
> -       addq    %rdi, %rax
> -L(ret_0):
> -       ret
> -
> -       .p2align 4,, 5
> -L(ret_vec_x0):
> -       bsrl    %eax, %eax
> -       leaq    -(VEC_SIZE)(%rcx, %rax), %rax
> -       ret
> -
> -       .p2align 4,, 2
> -L(zero_0):
> -       xorl    %eax, %eax
> -       ret
> -
> -
> -       .p2align 4,, 8
> -L(more_1x_vec):
> -       testl   %eax, %eax
> -       jnz     L(ret_vec_x0)
> -
> -       /* Align rcx (pointer to string).  */
> -       decq    %rcx
> -       andq    $-VEC_SIZE, %rcx
> -
> -       movq    %rcx, %rdx
> -       /* NB: We could consistently save 1 byte in this pattern with `movaps
> -          %xmm0, %xmm1; pcmpeq IMM8(r), %xmm1; ...`.  The reason against it is
> -          that it adds more frontend uops (even if the moves can be eliminated)
> -          and, some percentage of the time, actual backend uops.  */
> -       movaps  -(VEC_SIZE)(%rcx), %xmm1
> -       pcmpeqb %xmm0, %xmm1
> -       subq    %rdi, %rdx
> -       pmovmskb %xmm1, %eax
> -
> -       cmpq    $(VEC_SIZE * 2), %rdx
> -       ja      L(more_2x_vec)
> -L(last_2x_vec):
> -       subl    $VEC_SIZE, %edx
> -       jbe     L(ret_vec_x0_test)
> -
> -       testl   %eax, %eax
> -       jnz     L(ret_vec_x0)
> -
> -       movaps  -(VEC_SIZE * 2)(%rcx), %xmm1
> -       pcmpeqb %xmm0, %xmm1
> -       pmovmskb %xmm1, %eax
> -
> -       subl    $VEC_SIZE, %edx
> -       bsrl    %eax, %eax
> -       jz      L(ret_1)
> -       addl    %edx, %eax
> -       jl      L(zero_0)
> -       addq    %rdi, %rax
> -L(ret_1):
> -       ret
> -
> -       /* Don't align.  Otherwise losing the 2-byte encoding in the jump to
> -          L(page_cross) causes the hot path (length <= VEC_SIZE) to span
> -          multiple cache lines.  Naturally aligned % 16 to 8 bytes.  */
> -L(page_cross):
> -       /* Zero length check.  */
> -       testq   %rdx, %rdx
> -       jz      L(zero_0)
> -
> -       leaq    -1(%rcx), %r8
> -       andq    $-(VEC_SIZE), %r8
> -
> -       movaps  (%r8), %xmm1
> -       pcmpeqb %xmm0, %xmm1
> -       pmovmskb %xmm1, %esi
> -       /* Shift out negative alignment (because we are starting from endptr and
> -          working backwards).  */
> -       negl    %ecx
> -       /* 32-bit shift but VEC_SIZE=16 so need to mask the shift count
> -          explicitly.  */
> -       andl    $(VEC_SIZE - 1), %ecx
> -       shl     %cl, %esi
> -       movzwl  %si, %eax
> -       leaq    (%rdi, %rdx), %rcx
> -       cmpq    %rdi, %r8
> -       ja      L(more_1x_vec)
> -       subl    $VEC_SIZE, %edx
> -       bsrl    %eax, %eax
> -       jz      L(ret_2)
> -       addl    %edx, %eax
> -       jl      L(zero_1)
> -       addq    %rdi, %rax
> -L(ret_2):
> -       ret
> -
> -       /* Fits in the aligning bytes.  */
> -L(zero_1):
> -       xorl    %eax, %eax
> -       ret
> -
> -       .p2align 4,, 5
> -L(ret_vec_x1):
> -       bsrl    %eax, %eax
> -       leaq    -(VEC_SIZE * 2)(%rcx, %rax), %rax
> -       ret
> -
> -       .p2align 4,, 8
> -L(more_2x_vec):
> -       testl   %eax, %eax
> -       jnz     L(ret_vec_x0)
> -
> -       movaps  -(VEC_SIZE * 2)(%rcx), %xmm1
> -       pcmpeqb %xmm0, %xmm1
> -       pmovmskb %xmm1, %eax
> -       testl   %eax, %eax
> -       jnz     L(ret_vec_x1)
> -
> -
> -       movaps  -(VEC_SIZE * 3)(%rcx), %xmm1
> -       pcmpeqb %xmm0, %xmm1
> -       pmovmskb %xmm1, %eax
> -
> -       subq    $(VEC_SIZE * 4), %rdx
> -       ja      L(more_4x_vec)
> -
> -       addl    $(VEC_SIZE), %edx
> -       jle     L(ret_vec_x2_test)
> -
> -L(last_vec):
> -       testl   %eax, %eax
> -       jnz     L(ret_vec_x2)
> -
> -       movaps  -(VEC_SIZE * 4)(%rcx), %xmm1
> -       pcmpeqb %xmm0, %xmm1
> -       pmovmskb %xmm1, %eax
> -
> -       subl    $(VEC_SIZE), %edx
> -       bsrl    %eax, %eax
> -       jz      L(ret_3)
> -       addl    %edx, %eax
> -       jl      L(zero_2)
> -       addq    %rdi, %rax
> -L(ret_3):
> -       ret
> -
> -       .p2align 4,, 6
> -L(ret_vec_x2_test):
> -       bsrl    %eax, %eax
> -       jz      L(zero_2)
> -       addl    %edx, %eax
> -       jl      L(zero_2)
> -       addq    %rdi, %rax
> -       ret
> -
> -L(zero_2):
> -       xorl    %eax, %eax
> -       ret
> -
> -
> -       .p2align 4,, 5
> -L(ret_vec_x2):
> -       bsrl    %eax, %eax
> -       leaq    -(VEC_SIZE * 3)(%rcx, %rax), %rax
> -       ret
> -
> -       .p2align 4,, 5
> -L(ret_vec_x3):
> -       bsrl    %eax, %eax
> -       leaq    -(VEC_SIZE * 4)(%rcx, %rax), %rax
> -       ret
> -
> -       .p2align 4,, 8
> -L(more_4x_vec):
> -       testl   %eax, %eax
> -       jnz     L(ret_vec_x2)
> -
> -       movaps  -(VEC_SIZE * 4)(%rcx), %xmm1
> -       pcmpeqb %xmm0, %xmm1
> -       pmovmskb %xmm1, %eax
> -
> -       testl   %eax, %eax
> -       jnz     L(ret_vec_x3)
> -
> -       addq    $-(VEC_SIZE * 4), %rcx
> -       cmpq    $(VEC_SIZE * 4), %rdx
> -       jbe     L(last_4x_vec)
> -
> -       /* Offset everything by 4x VEC_SIZE here to save a few bytes at the end
> -          keeping the code from spilling to the next cache line.  */
> -       addq    $(VEC_SIZE * 4 - 1), %rcx
> -       andq    $-(VEC_SIZE * 4), %rcx
> -       leaq    (VEC_SIZE * 4)(%rdi), %rdx
> -       andq    $-(VEC_SIZE * 4), %rdx
> -
> -       .p2align 4,, 11
> -L(loop_4x_vec):
> -       movaps  (VEC_SIZE * -1)(%rcx), %xmm1
> -       movaps  (VEC_SIZE * -2)(%rcx), %xmm2
> -       movaps  (VEC_SIZE * -3)(%rcx), %xmm3
> -       movaps  (VEC_SIZE * -4)(%rcx), %xmm4
> -       pcmpeqb %xmm0, %xmm1
> -       pcmpeqb %xmm0, %xmm2
> -       pcmpeqb %xmm0, %xmm3
> -       pcmpeqb %xmm0, %xmm4
> -
> -       por     %xmm1, %xmm2
> -       por     %xmm3, %xmm4
> -       por     %xmm2, %xmm4
> -
> -       pmovmskb %xmm4, %esi
> -       testl   %esi, %esi
> -       jnz     L(loop_end)
> -
> -       addq    $-(VEC_SIZE * 4), %rcx
> -       cmpq    %rdx, %rcx
> -       jne     L(loop_4x_vec)
> -
> -       subl    %edi, %edx
> -
> -       /* Ends up being 1-byte nop.  */
> -       .p2align 4,, 2
> -L(last_4x_vec):
> -       movaps  -(VEC_SIZE)(%rcx), %xmm1
> -       pcmpeqb %xmm0, %xmm1
> -       pmovmskb %xmm1, %eax
> -
> -       cmpl    $(VEC_SIZE * 2), %edx
> -       jbe     L(last_2x_vec)
> -
> -       testl   %eax, %eax
> -       jnz     L(ret_vec_x0)
> -
> -
> -       movaps  -(VEC_SIZE * 2)(%rcx), %xmm1
> -       pcmpeqb %xmm0, %xmm1
> -       pmovmskb %xmm1, %eax
> -
> -       testl   %eax, %eax
> -       jnz     L(ret_vec_end)
> -
> -       movaps  -(VEC_SIZE * 3)(%rcx), %xmm1
> -       pcmpeqb %xmm0, %xmm1
> -       pmovmskb %xmm1, %eax
> -
> -       subl    $(VEC_SIZE * 3), %edx
> -       ja      L(last_vec)
> -       bsrl    %eax, %eax
> -       jz      L(ret_4)
> -       addl    %edx, %eax
> -       jl      L(zero_3)
> -       addq    %rdi, %rax
> -L(ret_4):
> -       ret
> -
> -       /* Ends up being 1-byte nop.  */
> -       .p2align 4,, 3
> -L(loop_end):
> -       pmovmskb %xmm1, %eax
> -       sall    $16, %eax
> -       jnz     L(ret_vec_end)
> -
> -       pmovmskb %xmm2, %eax
> -       testl   %eax, %eax
> -       jnz     L(ret_vec_end)
> -
> -       pmovmskb %xmm3, %eax
> -       /* Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3)
> -          then it won't affect the result in esi (VEC4). If ecx is non-zero
> -          then CHAR in VEC3 and bsrq will use that position.  */
> -       sall    $16, %eax
> -       orl     %esi, %eax
> -       bsrl    %eax, %eax
> -       leaq    -(VEC_SIZE * 4)(%rcx, %rax), %rax
> -       ret
> -
> -L(ret_vec_end):
> -       bsrl    %eax, %eax
> -       leaq    (VEC_SIZE * -2)(%rax, %rcx), %rax
> -       ret
> -       /* Used in L(last_4x_vec).  In the same cache line.  These are just
> -          spare aligning bytes.  */
> -L(zero_3):
> -       xorl    %eax, %eax
> -       ret
> -       /* 2-bytes from next cache line.  */
> -END(__memrchr)
> +#define MEMRCHR        __memrchr
> +#include "multiarch/memrchr-sse2.S"
>  weak_alias (__memrchr, memrchr)
> diff --git a/sysdeps/x86_64/multiarch/memrchr-sse2.S b/sysdeps/x86_64/multiarch/memrchr-sse2.S
> index b04202e171..d92a4022dc 100644
> --- a/sysdeps/x86_64/multiarch/memrchr-sse2.S
> +++ b/sysdeps/x86_64/multiarch/memrchr-sse2.S
> @@ -17,10 +17,338 @@
>     <https://www.gnu.org/licenses/>.  */
>
>  #if IS_IN (libc)
> -# define __memrchr __memrchr_sse2
> +# ifndef MEMRCHR
> +#  define MEMRCHR __memrchr_sse2
> +# endif
> +#endif
> +
> +#include <sysdep.h>
> +#define VEC_SIZE                       16
> +#define PAGE_SIZE                      4096
>
> -# undef weak_alias
> -# define weak_alias(__memrchr, memrchr)
> +       .text
> +ENTRY_P2ALIGN(MEMRCHR, 6)
> +#ifdef __ILP32__
> +       /* Clear upper bits.  */
> +       mov     %RDX_LP, %RDX_LP
>  #endif
> +       movd    %esi, %xmm0
> +
> +       /* Get end pointer.  */
> +       leaq    (%rdx, %rdi), %rcx
> +
> +       punpcklbw %xmm0, %xmm0
> +       punpcklwd %xmm0, %xmm0
> +       pshufd  $0, %xmm0, %xmm0
> +
> +       /* Check if we can load 1x VEC without crossing a page.  */
> +       testl   $(PAGE_SIZE - VEC_SIZE), %ecx
> +       jz      L(page_cross)
> +
> +       /* NB: This load happens regardless of whether rdx (len) is zero.  Since
> +          it doesn't cross a page and the standard guarantees any pointer has
> +          at least one valid byte, this load must be safe.  For the entire
> +          history of the x86 memrchr implementation this has been possible, so
> +          no code "should" be relying on a zero-length check before this load.
> +          The zero-length check is moved to the page-cross case because it is
> +          1) pretty cold and 2) including it pushes the hot case (len <= VEC_SIZE)
> +          into 2 cache lines.  */
> +       movups  -(VEC_SIZE)(%rcx), %xmm1
> +       pcmpeqb %xmm0, %xmm1
> +       pmovmskb %xmm1, %eax
> +
> +       subq    $VEC_SIZE, %rdx
> +       ja      L(more_1x_vec)
> +L(ret_vec_x0_test):
> +       /* Zero-flag set if eax (src) is zero. Destination unchanged if src is
> +          zero.  */
> +       bsrl    %eax, %eax
> +       jz      L(ret_0)
> +       /* Check if the CHAR match is in bounds. Need to truly zero `eax` here
> +          if out of bounds.  */
> +       addl    %edx, %eax
> +       jl      L(zero_0)
> +       /* Since we subtracted VEC_SIZE from rdx earlier we can just add to base
> +          ptr.  */
> +       addq    %rdi, %rax
> +L(ret_0):
> +       ret
> +
> +       .p2align 4,, 5
> +L(ret_vec_x0):
> +       bsrl    %eax, %eax
> +       leaq    -(VEC_SIZE)(%rcx, %rax), %rax
> +       ret
> +
> +       .p2align 4,, 2
> +L(zero_0):
> +       xorl    %eax, %eax
> +       ret
> +
> +
> +       .p2align 4,, 8
> +L(more_1x_vec):
> +       testl   %eax, %eax
> +       jnz     L(ret_vec_x0)
> +
> +       /* Align rcx (pointer to string).  */
> +       decq    %rcx
> +       andq    $-VEC_SIZE, %rcx
> +
> +       movq    %rcx, %rdx
> +       /* NB: We could consistently save 1 byte in this pattern with `movaps
> +          %xmm0, %xmm1; pcmpeq IMM8(r), %xmm1; ...`. The reason against it is
> +          that it adds more frontend uops (even if the moves can be eliminated)
> +          and, some percentage of the time, actual backend uops.  */
> +       movaps  -(VEC_SIZE)(%rcx), %xmm1
> +       pcmpeqb %xmm0, %xmm1
> +       subq    %rdi, %rdx
> +       pmovmskb %xmm1, %eax
> +
> +       cmpq    $(VEC_SIZE * 2), %rdx
> +       ja      L(more_2x_vec)
> +L(last_2x_vec):
> +       subl    $VEC_SIZE, %edx
> +       jbe     L(ret_vec_x0_test)
> +
> +       testl   %eax, %eax
> +       jnz     L(ret_vec_x0)
> +
> +       movaps  -(VEC_SIZE * 2)(%rcx), %xmm1
> +       pcmpeqb %xmm0, %xmm1
> +       pmovmskb %xmm1, %eax
> +
> +       subl    $VEC_SIZE, %edx
> +       bsrl    %eax, %eax
> +       jz      L(ret_1)
> +       addl    %edx, %eax
> +       jl      L(zero_0)
> +       addq    %rdi, %rax
> +L(ret_1):
> +       ret
> +
> +       /* Don't align. Otherwise losing the 2-byte encoding of the jump to
> +          L(page_cross) causes the hot path (length <= VEC_SIZE) to span
> +          multiple cache lines.  Naturally aligned % 16 to 8 bytes.  */
> +L(page_cross):
> +       /* Zero length check.  */
> +       testq   %rdx, %rdx
> +       jz      L(zero_0)
> +
> +       leaq    -1(%rcx), %r8
> +       andq    $-(VEC_SIZE), %r8
> +
> +       movaps  (%r8), %xmm1
> +       pcmpeqb %xmm0, %xmm1
> +       pmovmskb %xmm1, %esi
> +       /* Shift out negative alignment (because we are starting from endptr and
> +          working backwards).  */
> +       negl    %ecx
> +       /* 32-bit shift but VEC_SIZE=16 so need to mask the shift count
> +          explicitly.  */
> +       andl    $(VEC_SIZE - 1), %ecx
> +       shl     %cl, %esi
> +       movzwl  %si, %eax
> +       leaq    (%rdi, %rdx), %rcx
> +       cmpq    %rdi, %r8
> +       ja      L(more_1x_vec)
> +       subl    $VEC_SIZE, %edx
> +       bsrl    %eax, %eax
> +       jz      L(ret_2)
> +       addl    %edx, %eax
> +       jl      L(zero_1)
> +       addq    %rdi, %rax
> +L(ret_2):
> +       ret
> +
> +       /* Fits in aligning bytes.  */
> +L(zero_1):
> +       xorl    %eax, %eax
> +       ret
> +
> +       .p2align 4,, 5
> +L(ret_vec_x1):
> +       bsrl    %eax, %eax
> +       leaq    -(VEC_SIZE * 2)(%rcx, %rax), %rax
> +       ret
> +
> +       .p2align 4,, 8
> +L(more_2x_vec):
> +       testl   %eax, %eax
> +       jnz     L(ret_vec_x0)
> +
> +       movaps  -(VEC_SIZE * 2)(%rcx), %xmm1
> +       pcmpeqb %xmm0, %xmm1
> +       pmovmskb %xmm1, %eax
> +       testl   %eax, %eax
> +       jnz     L(ret_vec_x1)
> +
> +
> +       movaps  -(VEC_SIZE * 3)(%rcx), %xmm1
> +       pcmpeqb %xmm0, %xmm1
> +       pmovmskb %xmm1, %eax
> +
> +       subq    $(VEC_SIZE * 4), %rdx
> +       ja      L(more_4x_vec)
> +
> +       addl    $(VEC_SIZE), %edx
> +       jle     L(ret_vec_x2_test)
> +
> +L(last_vec):
> +       testl   %eax, %eax
> +       jnz     L(ret_vec_x2)
> +
> +       movaps  -(VEC_SIZE * 4)(%rcx), %xmm1
> +       pcmpeqb %xmm0, %xmm1
> +       pmovmskb %xmm1, %eax
> +
> +       subl    $(VEC_SIZE), %edx
> +       bsrl    %eax, %eax
> +       jz      L(ret_3)
> +       addl    %edx, %eax
> +       jl      L(zero_2)
> +       addq    %rdi, %rax
> +L(ret_3):
> +       ret
> +
> +       .p2align 4,, 6
> +L(ret_vec_x2_test):
> +       bsrl    %eax, %eax
> +       jz      L(zero_2)
> +       addl    %edx, %eax
> +       jl      L(zero_2)
> +       addq    %rdi, %rax
> +       ret
> +
> +L(zero_2):
> +       xorl    %eax, %eax
> +       ret
> +
> +
> +       .p2align 4,, 5
> +L(ret_vec_x2):
> +       bsrl    %eax, %eax
> +       leaq    -(VEC_SIZE * 3)(%rcx, %rax), %rax
> +       ret
> +
> +       .p2align 4,, 5
> +L(ret_vec_x3):
> +       bsrl    %eax, %eax
> +       leaq    -(VEC_SIZE * 4)(%rcx, %rax), %rax
> +       ret
> +
> +       .p2align 4,, 8
> +L(more_4x_vec):
> +       testl   %eax, %eax
> +       jnz     L(ret_vec_x2)
> +
> +       movaps  -(VEC_SIZE * 4)(%rcx), %xmm1
> +       pcmpeqb %xmm0, %xmm1
> +       pmovmskb %xmm1, %eax
> +
> +       testl   %eax, %eax
> +       jnz     L(ret_vec_x3)
> +
> +       addq    $-(VEC_SIZE * 4), %rcx
> +       cmpq    $(VEC_SIZE * 4), %rdx
> +       jbe     L(last_4x_vec)
> +
> +       /* Offset everything by 4x VEC_SIZE here to save a few bytes at the end
> +          keeping the code from spilling to the next cache line.  */
> +       addq    $(VEC_SIZE * 4 - 1), %rcx
> +       andq    $-(VEC_SIZE * 4), %rcx
> +       leaq    (VEC_SIZE * 4)(%rdi), %rdx
> +       andq    $-(VEC_SIZE * 4), %rdx
> +
> +       .p2align 4,, 11
> +L(loop_4x_vec):
> +       movaps  (VEC_SIZE * -1)(%rcx), %xmm1
> +       movaps  (VEC_SIZE * -2)(%rcx), %xmm2
> +       movaps  (VEC_SIZE * -3)(%rcx), %xmm3
> +       movaps  (VEC_SIZE * -4)(%rcx), %xmm4
> +       pcmpeqb %xmm0, %xmm1
> +       pcmpeqb %xmm0, %xmm2
> +       pcmpeqb %xmm0, %xmm3
> +       pcmpeqb %xmm0, %xmm4
> +
> +       por     %xmm1, %xmm2
> +       por     %xmm3, %xmm4
> +       por     %xmm2, %xmm4
> +
> +       pmovmskb %xmm4, %esi
> +       testl   %esi, %esi
> +       jnz     L(loop_end)
> +
> +       addq    $-(VEC_SIZE * 4), %rcx
> +       cmpq    %rdx, %rcx
> +       jne     L(loop_4x_vec)
> +
> +       subl    %edi, %edx
> +
> +       /* Ends up being 1-byte nop.  */
> +       .p2align 4,, 2
> +L(last_4x_vec):
> +       movaps  -(VEC_SIZE)(%rcx), %xmm1
> +       pcmpeqb %xmm0, %xmm1
> +       pmovmskb %xmm1, %eax
> +
> +       cmpl    $(VEC_SIZE * 2), %edx
> +       jbe     L(last_2x_vec)
> +
> +       testl   %eax, %eax
> +       jnz     L(ret_vec_x0)
> +
> +
> +       movaps  -(VEC_SIZE * 2)(%rcx), %xmm1
> +       pcmpeqb %xmm0, %xmm1
> +       pmovmskb %xmm1, %eax
> +
> +       testl   %eax, %eax
> +       jnz     L(ret_vec_end)
> +
> +       movaps  -(VEC_SIZE * 3)(%rcx), %xmm1
> +       pcmpeqb %xmm0, %xmm1
> +       pmovmskb %xmm1, %eax
> +
> +       subl    $(VEC_SIZE * 3), %edx
> +       ja      L(last_vec)
> +       bsrl    %eax, %eax
> +       jz      L(ret_4)
> +       addl    %edx, %eax
> +       jl      L(zero_3)
> +       addq    %rdi, %rax
> +L(ret_4):
> +       ret
> +
> +       /* Ends up being 1-byte nop.  */
> +       .p2align 4,, 3
> +L(loop_end):
> +       pmovmskb %xmm1, %eax
> +       sall    $16, %eax
> +       jnz     L(ret_vec_end)
> +
> +       pmovmskb %xmm2, %eax
> +       testl   %eax, %eax
> +       jnz     L(ret_vec_end)
> +
> +       pmovmskb %xmm3, %eax
> +       /* Combine the last 2 VEC matches. If eax (VEC3) is zero (no CHAR in
> +          VEC3) then it won't affect the result in esi (VEC4). If eax is
> +          non-zero there is a CHAR in VEC3 and bsrl will use that position.  */
> +       sall    $16, %eax
> +       orl     %esi, %eax
> +       bsrl    %eax, %eax
> +       leaq    -(VEC_SIZE * 4)(%rcx, %rax), %rax
> +       ret
>
> -#include "../memrchr.S"
> +L(ret_vec_end):
> +       bsrl    %eax, %eax
> +       leaq    (VEC_SIZE * -2)(%rax, %rcx), %rax
> +       ret
> +       /* Used in L(last_4x_vec). In the same cache line. This just reuses
> +          spare aligning bytes.  */
> +L(zero_3):
> +       xorl    %eax, %eax
> +       ret
> +       /* 2-bytes from next cache line.  */
> +END(MEMRCHR)
> --
> 2.34.1
>
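
As a reading aid for the mask handling above: pcmpeqb/pmovmskb turn each
16-byte vector into a 16-bit match mask, and because memrchr scans
backwards from the end pointer, bsr on that mask picks the highest
matching byte. A rough C model of the last-vector step (illustration
only; cmpeq_mask16 is a made-up stand-in for the pcmpeqb/pmovmskb pair,
not anything in glibc):

#include <stdint.h>
#include <stddef.h>

/* Stand-in for pcmpeqb + pmovmskb: bit i is set iff p[i] == c.  */
static unsigned int
cmpeq_mask16 (const unsigned char *p, unsigned char c)
{
  unsigned int m = 0;
  for (int i = 0; i < 16; i++)
    if (p[i] == c)
      m |= 1u << i;
  return m;
}

/* Highest match in the 16 bytes ending at `end', or NULL.  Mirrors
   the bsrl + leaq -(VEC_SIZE)(%rcx, %rax) pattern above.  */
static const void *
last_vec_match (const unsigned char *end, unsigned char c)
{
  unsigned int m = cmpeq_mask16 (end - 16, c);
  if (m == 0)
    return NULL;
  return end - 16 + (31 - __builtin_clz (m));   /* bsrl  */
}

The L(page_cross) path computes the same mask from an aligned load, then
shifts it left by ((-end) & (VEC_SIZE - 1)) and truncates it to 16 bits,
so byte positions at or past the end pointer can never be reported.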

LGTM.

Thanks.

-- 
H.J.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v1] x86: Move strcpy SSE2 implementation to multiarch/strcpy-sse2.S
  2022-07-12 19:29 ` [PATCH v1] x86: Move strcpy SSE2 implementation to multiarch/strcpy-sse2.S Noah Goldstein
@ 2022-07-12 23:23   ` H.J. Lu
  0 siblings, 0 replies; 21+ messages in thread
From: H.J. Lu @ 2022-07-12 23:23 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell

On Tue, Jul 12, 2022 at 12:29 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> This commit doesn't affect libc.so.6, it's just housekeeping to prepare
> for adding explicit ISA level support.
>
> Tested build on x86_64 and x86_32 with/without multiarch.
> ---
>  sysdeps/x86_64/multiarch/rtld-stpcpy.S |  18 ++++
>  sysdeps/x86_64/multiarch/stpcpy-sse2.S |  15 +--
>  sysdeps/x86_64/multiarch/strcpy-sse2.S | 137 ++++++++++++++++++++++--
>  sysdeps/x86_64/stpcpy.S                |   3 +-
>  sysdeps/x86_64/strcpy.S                | 138 +------------------------
>  5 files changed, 156 insertions(+), 155 deletions(-)
>  create mode 100644 sysdeps/x86_64/multiarch/rtld-stpcpy.S
>
> diff --git a/sysdeps/x86_64/multiarch/rtld-stpcpy.S b/sysdeps/x86_64/multiarch/rtld-stpcpy.S
> new file mode 100644
> index 0000000000..914141f07f
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/rtld-stpcpy.S
> @@ -0,0 +1,18 @@
> +/* Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include "../stpcpy.S"
> diff --git a/sysdeps/x86_64/multiarch/stpcpy-sse2.S b/sysdeps/x86_64/multiarch/stpcpy-sse2.S
> index 078504a44e..ea9f973af3 100644
> --- a/sysdeps/x86_64/multiarch/stpcpy-sse2.S
> +++ b/sysdeps/x86_64/multiarch/stpcpy-sse2.S
> @@ -17,17 +17,10 @@
>     <https://www.gnu.org/licenses/>.  */
>
>  #if IS_IN (libc)
> -
> -# include <sysdep.h>
> -# define __stpcpy __stpcpy_sse2
> -
> -# undef weak_alias
> -# define weak_alias(ignored1, ignored2)
> -# undef libc_hidden_def
> -# define libc_hidden_def(__stpcpy)
> -# undef libc_hidden_builtin_def
> -# define libc_hidden_builtin_def(stpcpy)
> +# ifndef STRCPY
> +#  define STRCPY       __stpcpy_sse2
> +# endif
>  #endif
>
>  #define USE_AS_STPCPY
> -#include <sysdeps/x86_64/stpcpy.S>
> +#include "strcpy-sse2.S"
> diff --git a/sysdeps/x86_64/multiarch/strcpy-sse2.S b/sysdeps/x86_64/multiarch/strcpy-sse2.S
> index f37967c441..8b5db8b13d 100644
> --- a/sysdeps/x86_64/multiarch/strcpy-sse2.S
> +++ b/sysdeps/x86_64/multiarch/strcpy-sse2.S
> @@ -17,12 +17,137 @@
>     <https://www.gnu.org/licenses/>.  */
>
>  #if IS_IN (libc)
> +# ifndef STRCPY
> +#  define STRCPY __strcpy_sse2
> +# endif
> +#endif
>
> -# include <sysdep.h>
> -# define strcpy __strcpy_sse2
> +#include <sysdep.h>
>
> -# undef libc_hidden_builtin_def
> -# define libc_hidden_builtin_def(strcpy)
> -#endif
> +       .text
> +ENTRY (STRCPY)
> +       movq %rsi, %rcx         /* Source register. */
> +       andl $7, %ecx           /* mask alignment bits */
> +       movq %rdi, %rdx         /* Duplicate destination pointer.  */
> +
> +       jz 5f                   /* aligned => start loop */
> +
> +       neg %ecx                /* We need to align to 8 bytes.  */
> +       addl $8,%ecx
> +       /* Search the first bytes directly.  */
> +0:
> +       movb    (%rsi), %al     /* Fetch a byte */
> +       testb   %al, %al        /* Is it NUL? */
> +       movb    %al, (%rdx)     /* Store it */
> +       jz      4f              /* If it was NUL, done! */
> +       incq    %rsi
> +       incq    %rdx
> +       decl    %ecx
> +       jnz     0b
> +
> +5:
> +       movq $0xfefefefefefefeff,%r8
> +
> +       /* Now the source is aligned.  Unfortunately we cannot force
> +          both source and destination to be aligned, so ignore the
> +          alignment of the destination.  */
> +       .p2align 4
> +1:
> +       /* 1st unroll.  */
> +       movq    (%rsi), %rax    /* Read double word (8 bytes).  */
> +       addq    $8, %rsi        /* Adjust pointer for next word.  */
> +       movq    %rax, %r9       /* Save a copy for NUL finding.  */
> +       addq    %r8, %r9        /* add the magic value to the word.  We get
> +                                  carry bits reported for each byte which
> +                                  is *not* 0 */
> +       jnc     3f              /* highest byte is NUL => return pointer */
> +       xorq    %rax, %r9       /* (word+magic)^word */
> +       orq     %r8, %r9        /* set all non-carry bits */
> +       incq    %r9             /* add 1: if one carry bit was *not* set
> +                                  the addition will not result in 0.  */
> +
> +       jnz     3f              /* found NUL => return pointer */
> +
> +       movq    %rax, (%rdx)    /* Write value to destination.  */
> +       addq    $8, %rdx        /* Adjust pointer.  */
> +
> +       /* 2nd unroll.  */
> +       movq    (%rsi), %rax    /* Read double word (8 bytes).  */
> +       addq    $8, %rsi        /* Adjust pointer for next word.  */
> +       movq    %rax, %r9       /* Save a copy for NUL finding.  */
> +       addq    %r8, %r9        /* add the magic value to the word.  We get
> +                                  carry bits reported for each byte which
> +                                  is *not* 0 */
> +       jnc     3f              /* highest byte is NUL => return pointer */
> +       xorq    %rax, %r9       /* (word+magic)^word */
> +       orq     %r8, %r9        /* set all non-carry bits */
> +       incq    %r9             /* add 1: if one carry bit was *not* set
> +                                  the addition will not result in 0.  */
> +
> +       jnz     3f              /* found NUL => return pointer */
>
> -#include <sysdeps/x86_64/strcpy.S>
> +       movq    %rax, (%rdx)    /* Write value to destination.  */
> +       addq    $8, %rdx        /* Adjust pointer.  */
> +
> +       /* 3rd unroll.  */
> +       movq    (%rsi), %rax    /* Read double word (8 bytes).  */
> +       addq    $8, %rsi        /* Adjust pointer for next word.  */
> +       movq    %rax, %r9       /* Save a copy for NUL finding.  */
> +       addq    %r8, %r9        /* add the magic value to the word.  We get
> +                                  carry bits reported for each byte which
> +                                  is *not* 0 */
> +       jnc     3f              /* highest byte is NUL => return pointer */
> +       xorq    %rax, %r9       /* (word+magic)^word */
> +       orq     %r8, %r9        /* set all non-carry bits */
> +       incq    %r9             /* add 1: if one carry bit was *not* set
> +                                  the addition will not result in 0.  */
> +
> +       jnz     3f              /* found NUL => return pointer */
> +
> +       movq    %rax, (%rdx)    /* Write value to destination.  */
> +       addq    $8, %rdx        /* Adjust pointer.  */
> +
> +       /* 4th unroll.  */
> +       movq    (%rsi), %rax    /* Read double word (8 bytes).  */
> +       addq    $8, %rsi        /* Adjust pointer for next word.  */
> +       movq    %rax, %r9       /* Save a copy for NUL finding.  */
> +       addq    %r8, %r9        /* add the magic value to the word.  We get
> +                                  carry bits reported for each byte which
> +                                  is *not* 0 */
> +       jnc     3f              /* highest byte is NUL => return pointer */
> +       xorq    %rax, %r9       /* (word+magic)^word */
> +       orq     %r8, %r9        /* set all non-carry bits */
> +       incq    %r9             /* add 1: if one carry bit was *not* set
> +                                  the addition will not result in 0.  */
> +
> +       jnz     3f              /* found NUL => return pointer */
> +
> +       movq    %rax, (%rdx)    /* Write value to destination.  */
> +       addq    $8, %rdx        /* Adjust pointer.  */
> +       jmp     1b              /* Next iteration.  */
> +
> +       /* Do the last few bytes. %rax contains the value to write.
> +          The loop is unrolled twice.  */
> +       .p2align 4
> +3:
> +       /* Note that stpcpy needs to return a pointer to the terminating
> +          NUL byte.  */
> +       movb    %al, (%rdx)     /* 1st byte.  */
> +       testb   %al, %al        /* Is it NUL?  */
> +       jz      4f              /* yes, finish.  */
> +       incq    %rdx            /* Increment destination.  */
> +       movb    %ah, (%rdx)     /* 2nd byte.  */
> +       testb   %ah, %ah        /* Is it NUL?  */
> +       jz      4f              /* yes, finish.  */
> +       incq    %rdx            /* Increment destination.  */
> +       shrq    $16, %rax       /* Shift...  */
> +       jmp     3b              /* and look at next two bytes in %rax.  */
> +
> +4:
> +#ifdef USE_AS_STPCPY
> +       movq    %rdx, %rax      /* Destination is return value.  */
> +#else
> +       movq    %rdi, %rax      /* Source is return value.  */
> +#endif
> +       retq
> +END (STRCPY)
> diff --git a/sysdeps/x86_64/stpcpy.S b/sysdeps/x86_64/stpcpy.S
> index ec23de1416..b097c203dd 100644
> --- a/sysdeps/x86_64/stpcpy.S
> +++ b/sysdeps/x86_64/stpcpy.S
> @@ -1,7 +1,6 @@
> -#define USE_AS_STPCPY
>  #define STRCPY __stpcpy
>
> -#include <sysdeps/x86_64/strcpy.S>
> +#include "multiarch/stpcpy-sse2.S"
>
>  weak_alias (__stpcpy, stpcpy)
>  libc_hidden_def (__stpcpy)
> diff --git a/sysdeps/x86_64/strcpy.S b/sysdeps/x86_64/strcpy.S
> index 17e8073550..05f19e6e94 100644
> --- a/sysdeps/x86_64/strcpy.S
> +++ b/sysdeps/x86_64/strcpy.S
> @@ -16,140 +16,6 @@
>     License along with the GNU C Library; if not, see
>     <https://www.gnu.org/licenses/>.  */
>
> -#include <sysdep.h>
> -#include "asm-syntax.h"
> -
> -#ifndef USE_AS_STPCPY
> -# define STRCPY strcpy
> -#endif
> -
> -       .text
> -ENTRY (STRCPY)
> -       movq %rsi, %rcx         /* Source register. */
> -       andl $7, %ecx           /* mask alignment bits */
> -       movq %rdi, %rdx         /* Duplicate destination pointer.  */
> -
> -       jz 5f                   /* aligned => start loop */
> -
> -       neg %ecx                /* We need to align to 8 bytes.  */
> -       addl $8,%ecx
> -       /* Search the first bytes directly.  */
> -0:
> -       movb    (%rsi), %al     /* Fetch a byte */
> -       testb   %al, %al        /* Is it NUL? */
> -       movb    %al, (%rdx)     /* Store it */
> -       jz      4f              /* If it was NUL, done! */
> -       incq    %rsi
> -       incq    %rdx
> -       decl    %ecx
> -       jnz     0b
> -
> -5:
> -       movq $0xfefefefefefefeff,%r8
> -
> -       /* Now the sources is aligned.  Unfortunatly we cannot force
> -          to have both source and destination aligned, so ignore the
> -          alignment of the destination.  */
> -       .p2align 4
> -1:
> -       /* 1st unroll.  */
> -       movq    (%rsi), %rax    /* Read double word (8 bytes).  */
> -       addq    $8, %rsi        /* Adjust pointer for next word.  */
> -       movq    %rax, %r9       /* Save a copy for NUL finding.  */
> -       addq    %r8, %r9        /* add the magic value to the word.  We get
> -                                  carry bits reported for each byte which
> -                                  is *not* 0 */
> -       jnc     3f              /* highest byte is NUL => return pointer */
> -       xorq    %rax, %r9       /* (word+magic)^word */
> -       orq     %r8, %r9        /* set all non-carry bits */
> -       incq    %r9             /* add 1: if one carry bit was *not* set
> -                                  the addition will not result in 0.  */
> -
> -       jnz     3f              /* found NUL => return pointer */
> -
> -       movq    %rax, (%rdx)    /* Write value to destination.  */
> -       addq    $8, %rdx        /* Adjust pointer.  */
> -
> -       /* 2nd unroll.  */
> -       movq    (%rsi), %rax    /* Read double word (8 bytes).  */
> -       addq    $8, %rsi        /* Adjust pointer for next word.  */
> -       movq    %rax, %r9       /* Save a copy for NUL finding.  */
> -       addq    %r8, %r9        /* add the magic value to the word.  We get
> -                                  carry bits reported for each byte which
> -                                  is *not* 0 */
> -       jnc     3f              /* highest byte is NUL => return pointer */
> -       xorq    %rax, %r9       /* (word+magic)^word */
> -       orq     %r8, %r9        /* set all non-carry bits */
> -       incq    %r9             /* add 1: if one carry bit was *not* set
> -                                  the addition will not result in 0.  */
> -
> -       jnz     3f              /* found NUL => return pointer */
> -
> -       movq    %rax, (%rdx)    /* Write value to destination.  */
> -       addq    $8, %rdx        /* Adjust pointer.  */
> -
> -       /* 3rd unroll.  */
> -       movq    (%rsi), %rax    /* Read double word (8 bytes).  */
> -       addq    $8, %rsi        /* Adjust pointer for next word.  */
> -       movq    %rax, %r9       /* Save a copy for NUL finding.  */
> -       addq    %r8, %r9        /* add the magic value to the word.  We get
> -                                  carry bits reported for each byte which
> -                                  is *not* 0 */
> -       jnc     3f              /* highest byte is NUL => return pointer */
> -       xorq    %rax, %r9       /* (word+magic)^word */
> -       orq     %r8, %r9        /* set all non-carry bits */
> -       incq    %r9             /* add 1: if one carry bit was *not* set
> -                                  the addition will not result in 0.  */
> -
> -       jnz     3f              /* found NUL => return pointer */
> -
> -       movq    %rax, (%rdx)    /* Write value to destination.  */
> -       addq    $8, %rdx        /* Adjust pointer.  */
> -
> -       /* 4th unroll.  */
> -       movq    (%rsi), %rax    /* Read double word (8 bytes).  */
> -       addq    $8, %rsi        /* Adjust pointer for next word.  */
> -       movq    %rax, %r9       /* Save a copy for NUL finding.  */
> -       addq    %r8, %r9        /* add the magic value to the word.  We get
> -                                  carry bits reported for each byte which
> -                                  is *not* 0 */
> -       jnc     3f              /* highest byte is NUL => return pointer */
> -       xorq    %rax, %r9       /* (word+magic)^word */
> -       orq     %r8, %r9        /* set all non-carry bits */
> -       incq    %r9             /* add 1: if one carry bit was *not* set
> -                                  the addition will not result in 0.  */
> -
> -       jnz     3f              /* found NUL => return pointer */
> -
> -       movq    %rax, (%rdx)    /* Write value to destination.  */
> -       addq    $8, %rdx        /* Adjust pointer.  */
> -       jmp     1b              /* Next iteration.  */
> -
> -       /* Do the last few bytes. %rax contains the value to write.
> -          The loop is unrolled twice.  */
> -       .p2align 4
> -3:
> -       /* Note that stpcpy needs to return with the value of the NUL
> -          byte.  */
> -       movb    %al, (%rdx)     /* 1st byte.  */
> -       testb   %al, %al        /* Is it NUL.  */
> -       jz      4f              /* yes, finish.  */
> -       incq    %rdx            /* Increment destination.  */
> -       movb    %ah, (%rdx)     /* 2nd byte.  */
> -       testb   %ah, %ah        /* Is it NUL?.  */
> -       jz      4f              /* yes, finish.  */
> -       incq    %rdx            /* Increment destination.  */
> -       shrq    $16, %rax       /* Shift...  */
> -       jmp     3b              /* and look at next two bytes in %rax.  */
> -
> -4:
> -#ifdef USE_AS_STPCPY
> -       movq    %rdx, %rax      /* Destination is return value.  */
> -#else
> -       movq    %rdi, %rax      /* Source is return value.  */
> -#endif
> -       retq
> -END (STRCPY)
> -#ifndef USE_AS_STPCPY
> +#define STRCPY strcpy
> +#include "multiarch/strcpy-sse2.S"
>  libc_hidden_builtin_def (strcpy)
> -#endif
> --
> 2.34.1
>
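
One note for readers less familiar with the word-at-a-time trick above:
0xfefefefefefefeff is just -0x0101010101010101, so the add/jnc/xor/or/inc
sequence is testing whether the 8-byte word contains a NUL. In C the same
predicate is usually written with the well-known
(w - 0x01...01) & ~w & 0x80...80 test. A sketch of the equivalent logic
(illustration only, not glibc code; the alignment prologue and the 4x
unrolling are omitted):

#include <stdint.h>
#include <stdbool.h>
#include <string.h>

/* True iff the 64-bit word contains at least one zero byte.  */
static bool
has_zero_byte (uint64_t w)
{
  return ((w - 0x0101010101010101ULL) & ~w & 0x8080808080808080ULL) != 0;
}

/* Word-at-a-time copy in the shape of the loop above: copy 8 bytes
   while no NUL is present, then finish byte by byte.  Like the
   assembly, this only stays in bounds if the 8-byte reads cannot run
   into an unmapped page (the assembly guarantees that by aligning the
   source first).  Returns a pointer to the written NUL, i.e. the
   stpcpy convention.  */
static char *
copy_words_demo (char *dst, const char *src)
{
  uint64_t w;
  for (;;)
    {
      memcpy (&w, src, 8);           /* movq (%rsi), %rax  */
      if (has_zero_byte (w))
        break;
      memcpy (dst, &w, 8);           /* movq %rax, (%rdx)  */
      src += 8;
      dst += 8;
    }
  while ((*dst = *src++) != '\0')    /* byte loop at label 3  */
    dst++;
  return dst;
}

The jnc shortcut in the assembly covers the case where the addition does
not carry out of the register at all, which can only happen when the
word already contains a zero byte.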

LGTM.

Thanks.

-- 
H.J.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v1] x86: Move strlen SSE2 implementation to multiarch/strlen-sse2.S
  2022-07-12 19:29 [PATCH v1] x86: Move strlen SSE2 implementation to multiarch/strlen-sse2.S Noah Goldstein
                   ` (8 preceding siblings ...)
  2022-07-12 19:29 ` [PATCH v1] x86: Add missing rtm tests for strcmp family Noah Goldstein
@ 2022-07-12 23:29 ` H.J. Lu
  2022-07-13  4:06   ` Noah Goldstein
  9 siblings, 1 reply; 21+ messages in thread
From: H.J. Lu @ 2022-07-12 23:29 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell

On Tue, Jul 12, 2022 at 12:29 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> This commit doesn't affect libc.so.6, it's just housekeeping to prepare
> for adding explicit ISA level support.
>
> Tested build on x86_64 and x86_32 with/without multiarch.
> ---
>  sysdeps/x86_64/multiarch/rtld-strlen.S    |  18 ++
>  sysdeps/x86_64/multiarch/rtld-strnlen.S   |  18 ++
>  sysdeps/x86_64/multiarch/strlen-sse2.S    | 260 ++++++++++++++++++++-
>  sysdeps/x86_64/multiarch/strlen-vec.S     | 267 ----------------------
>  sysdeps/x86_64/multiarch/strnlen-sse2.S   |  12 +-
>  sysdeps/x86_64/multiarch/wcslen-sse4_1.S  |   4 +-
>  sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S |   4 +-
>  sysdeps/x86_64/strlen.S                   |   3 +-
>  sysdeps/x86_64/strnlen.S                  |   6 +-
>  9 files changed, 306 insertions(+), 286 deletions(-)
>  create mode 100644 sysdeps/x86_64/multiarch/rtld-strlen.S
>  create mode 100644 sysdeps/x86_64/multiarch/rtld-strnlen.S
>  delete mode 100644 sysdeps/x86_64/multiarch/strlen-vec.S
>
> diff --git a/sysdeps/x86_64/multiarch/rtld-strlen.S b/sysdeps/x86_64/multiarch/rtld-strlen.S
> new file mode 100644
> index 0000000000..609d26256e
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/rtld-strlen.S
> @@ -0,0 +1,18 @@
> +/* Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include "../strlen.S"
> diff --git a/sysdeps/x86_64/multiarch/rtld-strnlen.S b/sysdeps/x86_64/multiarch/rtld-strnlen.S
> new file mode 100644
> index 0000000000..ef2d64abc2
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/rtld-strnlen.S
> @@ -0,0 +1,18 @@
> +/* Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include "../strnlen.S"
> diff --git a/sysdeps/x86_64/multiarch/strlen-sse2.S b/sysdeps/x86_64/multiarch/strlen-sse2.S
> index 660b327ed2..5be72267d5 100644
> --- a/sysdeps/x86_64/multiarch/strlen-sse2.S
> +++ b/sysdeps/x86_64/multiarch/strlen-sse2.S
> @@ -16,8 +16,260 @@
>     License along with the GNU C Library; if not, see
>     <https://www.gnu.org/licenses/>.  */
>
> -#if IS_IN (libc)
> -# define strlen __strlen_sse2
> -#endif
> +#if IS_IN (libc) || defined STRLEN
> +
> +# ifndef STRLEN
> +#  define STRLEN __strlen_sse2
> +# endif
> +
> +
> +# include <sysdep.h>
> +
> +# ifdef AS_WCSLEN
> +#  define PMINU                pminud
> +#  define PCMPEQ               pcmpeqd
> +#  define SHIFT_RETURN shrq $2, %rax
> +# else
> +#  define PMINU                pminub
> +#  define PCMPEQ               pcmpeqb
> +#  define SHIFT_RETURN
> +# endif
> +
> +# ifndef SECTION
> +#  define SECTION(p)   p
> +# endif
> +
> +/* Long lived registers in strlen(s), strnlen(s, n) are:
> +
> +       %xmm3 - zero
> +       %rdi   - s
> +       %r10  (s+n) & (~(64-1))
> +       %r11   s+n
> +*/
> +
> +
> +       .section SECTION(.text),"ax",@progbits
> +ENTRY(STRLEN)
> +
> +/* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx.  */
> +# define FIND_ZERO     \
> +       PCMPEQ  (%rax), %xmm0;  \
> +       PCMPEQ  16(%rax), %xmm1;        \
> +       PCMPEQ  32(%rax), %xmm2;        \
> +       PCMPEQ  48(%rax), %xmm3;        \
> +       pmovmskb        %xmm0, %esi;    \
> +       pmovmskb        %xmm1, %edx;    \
> +       pmovmskb        %xmm2, %r8d;    \
> +       pmovmskb        %xmm3, %ecx;    \
> +       salq    $16, %rdx;      \
> +       salq    $16, %rcx;      \
> +       orq     %rsi, %rdx;     \
> +       orq     %r8, %rcx;      \
> +       salq    $32, %rcx;      \
> +       orq     %rcx, %rdx;
> +
> +# ifdef AS_STRNLEN
> +/* Do not read anything when n==0.  */
> +       test    %RSI_LP, %RSI_LP
> +       jne     L(n_nonzero)
> +       xor     %rax, %rax
> +       ret
> +L(n_nonzero):
> +#  ifdef AS_WCSLEN
> +/* Check for overflow from maxlen * sizeof(wchar_t). If it would
> +   overflow, the only way this program doesn't have undefined behavior
> +   is if there is a null terminator in valid memory, so wcslen will
> +   suffice.  */
> +       mov     %RSI_LP, %R10_LP
> +       sar     $62, %R10_LP
> +       jnz     __wcslen_sse4_1
> +       sal     $2, %RSI_LP
> +#  endif
> +
> +/* Initialize long lived registers.  */
> +       add     %RDI_LP, %RSI_LP
> +       mov     %RSI_LP, %R10_LP
> +       and     $-64, %R10_LP
> +       mov     %RSI_LP, %R11_LP
> +# endif
> +
> +       pxor    %xmm0, %xmm0
> +       pxor    %xmm1, %xmm1
> +       pxor    %xmm2, %xmm2
> +       pxor    %xmm3, %xmm3
> +       movq    %rdi, %rax
> +       movq    %rdi, %rcx
> +       andq    $4095, %rcx
> +/* Offsets 4032-4047 will be aligned into 4032 thus fit into page.  */
> +       cmpq    $4047, %rcx
> +/* We cannot unify this branching as it would be ~6 cycles slower.  */
> +       ja      L(cross_page)
> +
> +# ifdef AS_STRNLEN
> +/* Test if end is among first 64 bytes.  */
> +#  define STRNLEN_PROLOG       \
> +       mov     %r11, %rsi;     \
> +       subq    %rax, %rsi;     \
> +       andq    $-64, %rax;     \
> +       testq   $-64, %rsi;     \
> +       je      L(strnlen_ret)
> +# else
> +#  define STRNLEN_PROLOG  andq $-64, %rax;
> +# endif
> +
> +/* Ignore bits in mask that come before start of string.  */
> +# define PROLOG(lab)   \
> +       movq    %rdi, %rcx;     \
> +       xorq    %rax, %rcx;     \
> +       STRNLEN_PROLOG; \
> +       sarq    %cl, %rdx;      \
> +       test    %rdx, %rdx;     \
> +       je      L(lab); \
> +       bsfq    %rdx, %rax;     \
> +       SHIFT_RETURN;           \
> +       ret
> +
> +# ifdef AS_STRNLEN
> +       andq    $-16, %rax
> +       FIND_ZERO
> +# else
> +       /* Test first 16 bytes unaligned.  */
> +       movdqu  (%rax), %xmm4
> +       PCMPEQ  %xmm0, %xmm4
> +       pmovmskb        %xmm4, %edx
> +       test    %edx, %edx
> +       je      L(next48_bytes)
> +       bsf     %edx, %eax /* If eax is zeroed 16bit bsf can be used.  */
> +       SHIFT_RETURN
> +       ret
> +
> +L(next48_bytes):
> +/* Same as FIND_ZERO except we do not check first 16 bytes.  */
> +       andq    $-16, %rax
> +       PCMPEQ 16(%rax), %xmm1
> +       PCMPEQ 32(%rax), %xmm2
> +       PCMPEQ 48(%rax), %xmm3
> +       pmovmskb        %xmm1, %edx
> +       pmovmskb        %xmm2, %r8d
> +       pmovmskb        %xmm3, %ecx
> +       salq    $16, %rdx
> +       salq    $16, %rcx
> +       orq     %r8, %rcx
> +       salq    $32, %rcx
> +       orq     %rcx, %rdx
> +# endif
>
> -#include "strlen-vec.S"
> +       /* When no zero byte is found xmm1-3 are zero so we do not have to
> +          zero them.  */
> +       PROLOG(loop)
> +
> +       .p2align 4
> +L(cross_page):
> +       andq    $-64, %rax
> +       FIND_ZERO
> +       PROLOG(loop_init)
> +
> +# ifdef AS_STRNLEN
> +/* We must do this check to correctly handle strnlen (s, -1).  */
> +L(strnlen_ret):
> +       bts     %rsi, %rdx
> +       sarq    %cl, %rdx
> +       test    %rdx, %rdx
> +       je      L(loop_init)
> +       bsfq    %rdx, %rax
> +       SHIFT_RETURN
> +       ret
> +# endif
> +       .p2align 4
> +L(loop_init):
> +       pxor    %xmm1, %xmm1
> +       pxor    %xmm2, %xmm2
> +       pxor    %xmm3, %xmm3
> +# ifdef AS_STRNLEN
> +       .p2align 4
> +L(loop):
> +
> +       addq    $64, %rax
> +       cmpq    %rax, %r10
> +       je      L(exit_end)
> +
> +       movdqa  (%rax), %xmm0
> +       PMINU   16(%rax), %xmm0
> +       PMINU   32(%rax), %xmm0
> +       PMINU   48(%rax), %xmm0
> +       PCMPEQ  %xmm3, %xmm0
> +       pmovmskb        %xmm0, %edx
> +       testl   %edx, %edx
> +       jne     L(exit)
> +       jmp     L(loop)
> +
> +       .p2align 4
> +L(exit_end):
> +       cmp     %rax, %r11
> +       je      L(first) /* Do not read when end is at page boundary.  */
> +       pxor    %xmm0, %xmm0
> +       FIND_ZERO
> +
> +L(first):
> +       bts     %r11, %rdx
> +       bsfq    %rdx, %rdx
> +       addq    %rdx, %rax
> +       subq    %rdi, %rax
> +       SHIFT_RETURN
> +       ret
> +
> +       .p2align 4
> +L(exit):
> +       pxor    %xmm0, %xmm0
> +       FIND_ZERO
> +
> +       bsfq    %rdx, %rdx
> +       addq    %rdx, %rax
> +       subq    %rdi, %rax
> +       SHIFT_RETURN
> +       ret
> +
> +# else
> +
> +       /* Main loop.  Unrolled twice to improve L2 cache performance on core2.  */
> +       .p2align 4
> +L(loop):
> +
> +       movdqa  64(%rax), %xmm0
> +       PMINU   80(%rax), %xmm0
> +       PMINU   96(%rax), %xmm0
> +       PMINU   112(%rax), %xmm0
> +       PCMPEQ  %xmm3, %xmm0
> +       pmovmskb        %xmm0, %edx
> +       testl   %edx, %edx
> +       jne     L(exit64)
> +
> +       subq    $-128, %rax
> +
> +       movdqa  (%rax), %xmm0
> +       PMINU   16(%rax), %xmm0
> +       PMINU   32(%rax), %xmm0
> +       PMINU   48(%rax), %xmm0
> +       PCMPEQ  %xmm3, %xmm0
> +       pmovmskb        %xmm0, %edx
> +       testl   %edx, %edx
> +       jne     L(exit0)
> +       jmp     L(loop)
> +
> +       .p2align 4
> +L(exit64):
> +       addq    $64, %rax
> +L(exit0):
> +       pxor    %xmm0, %xmm0
> +       FIND_ZERO
> +
> +       bsfq    %rdx, %rdx
> +       addq    %rdx, %rax
> +       subq    %rdi, %rax
> +       SHIFT_RETURN
> +       ret
> +
> +# endif
> +
> +END(STRLEN)
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/strlen-vec.S b/sysdeps/x86_64/multiarch/strlen-vec.S
> deleted file mode 100644
> index 874123d604..0000000000
> --- a/sysdeps/x86_64/multiarch/strlen-vec.S
> +++ /dev/null
> @@ -1,267 +0,0 @@
> -/* SSE2 version of strlen and SSE4.1 version of wcslen.
> -   Copyright (C) 2012-2022 Free Software Foundation, Inc.
> -   This file is part of the GNU C Library.
> -
> -   The GNU C Library is free software; you can redistribute it and/or
> -   modify it under the terms of the GNU Lesser General Public
> -   License as published by the Free Software Foundation; either
> -   version 2.1 of the License, or (at your option) any later version.
> -
> -   The GNU C Library is distributed in the hope that it will be useful,
> -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> -   Lesser General Public License for more details.
> -
> -   You should have received a copy of the GNU Lesser General Public
> -   License along with the GNU C Library; if not, see
> -   <https://www.gnu.org/licenses/>.  */
> -
> -#include <sysdep.h>
> -
> -#ifdef AS_WCSLEN
> -# define PMINU         pminud
> -# define PCMPEQ                pcmpeqd
> -# define SHIFT_RETURN  shrq $2, %rax
> -#else
> -# define PMINU         pminub
> -# define PCMPEQ                pcmpeqb
> -# define SHIFT_RETURN
> -#endif
> -
> -#ifndef SECTION
> -# define SECTION(p)    p
> -#endif
> -
> -/* Long lived register in strlen(s), strnlen(s, n) are:
> -
> -       %xmm3 - zero
> -       %rdi   - s
> -       %r10  (s+n) & (~(64-1))
> -       %r11   s+n
> -*/
> -
> -
> -       .section SECTION(.text),"ax",@progbits
> -ENTRY(strlen)
> -
> -/* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx.  */
> -#define FIND_ZERO      \
> -       PCMPEQ  (%rax), %xmm0;  \
> -       PCMPEQ  16(%rax), %xmm1;        \
> -       PCMPEQ  32(%rax), %xmm2;        \
> -       PCMPEQ  48(%rax), %xmm3;        \
> -       pmovmskb        %xmm0, %esi;    \
> -       pmovmskb        %xmm1, %edx;    \
> -       pmovmskb        %xmm2, %r8d;    \
> -       pmovmskb        %xmm3, %ecx;    \
> -       salq    $16, %rdx;      \
> -       salq    $16, %rcx;      \
> -       orq     %rsi, %rdx;     \
> -       orq     %r8, %rcx;      \
> -       salq    $32, %rcx;      \
> -       orq     %rcx, %rdx;
> -
> -#ifdef AS_STRNLEN
> -/* Do not read anything when n==0.  */
> -       test    %RSI_LP, %RSI_LP
> -       jne     L(n_nonzero)
> -       xor     %rax, %rax
> -       ret
> -L(n_nonzero):
> -# ifdef AS_WCSLEN
> -/* Check for overflow from maxlen * sizeof(wchar_t). If it would
> -   overflow the only way this program doesn't have undefined behavior
> -   is if there is a null terminator in valid memory so wcslen will
> -   suffice.  */
> -       mov     %RSI_LP, %R10_LP
> -       sar     $62, %R10_LP
> -       jnz     __wcslen_sse4_1
> -       sal     $2, %RSI_LP
> -# endif
> -
> -/* Initialize long lived registers.  */
> -       add     %RDI_LP, %RSI_LP
> -       mov     %RSI_LP, %R10_LP
> -       and     $-64, %R10_LP
> -       mov     %RSI_LP, %R11_LP
> -#endif
> -
> -       pxor    %xmm0, %xmm0
> -       pxor    %xmm1, %xmm1
> -       pxor    %xmm2, %xmm2
> -       pxor    %xmm3, %xmm3
> -       movq    %rdi, %rax
> -       movq    %rdi, %rcx
> -       andq    $4095, %rcx
> -/* Offsets 4032-4047 will be aligned into 4032 thus fit into page.  */
> -       cmpq    $4047, %rcx
> -/* We cannot unify this branching as it would be ~6 cycles slower.  */
> -       ja      L(cross_page)
> -
> -#ifdef AS_STRNLEN
> -/* Test if end is among first 64 bytes.  */
> -# define STRNLEN_PROLOG        \
> -       mov     %r11, %rsi;     \
> -       subq    %rax, %rsi;     \
> -       andq    $-64, %rax;     \
> -       testq   $-64, %rsi;     \
> -       je      L(strnlen_ret)
> -#else
> -# define STRNLEN_PROLOG  andq $-64, %rax;
> -#endif
> -
> -/* Ignore bits in mask that come before start of string.  */
> -#define PROLOG(lab)    \
> -       movq    %rdi, %rcx;     \
> -       xorq    %rax, %rcx;     \
> -       STRNLEN_PROLOG; \
> -       sarq    %cl, %rdx;      \
> -       test    %rdx, %rdx;     \
> -       je      L(lab); \
> -       bsfq    %rdx, %rax;     \
> -       SHIFT_RETURN;           \
> -       ret
> -
> -#ifdef AS_STRNLEN
> -       andq    $-16, %rax
> -       FIND_ZERO
> -#else
> -       /* Test first 16 bytes unaligned.  */
> -       movdqu  (%rax), %xmm4
> -       PCMPEQ  %xmm0, %xmm4
> -       pmovmskb        %xmm4, %edx
> -       test    %edx, %edx
> -       je      L(next48_bytes)
> -       bsf     %edx, %eax /* If eax is zeroed 16bit bsf can be used.  */
> -       SHIFT_RETURN
> -       ret
> -
> -L(next48_bytes):
> -/* Same as FIND_ZERO except we do not check first 16 bytes.  */
> -       andq    $-16, %rax
> -       PCMPEQ 16(%rax), %xmm1
> -       PCMPEQ 32(%rax), %xmm2
> -       PCMPEQ 48(%rax), %xmm3
> -       pmovmskb        %xmm1, %edx
> -       pmovmskb        %xmm2, %r8d
> -       pmovmskb        %xmm3, %ecx
> -       salq    $16, %rdx
> -       salq    $16, %rcx
> -       orq     %r8, %rcx
> -       salq    $32, %rcx
> -       orq     %rcx, %rdx
> -#endif
> -
> -       /* When no zero byte is found xmm1-3 are zero so we do not have to
> -          zero them.  */
> -       PROLOG(loop)
> -
> -       .p2align 4
> -L(cross_page):
> -       andq    $-64, %rax
> -       FIND_ZERO
> -       PROLOG(loop_init)
> -
> -#ifdef AS_STRNLEN
> -/* We must do this check to correctly handle strnlen (s, -1).  */
> -L(strnlen_ret):
> -       bts     %rsi, %rdx
> -       sarq    %cl, %rdx
> -       test    %rdx, %rdx
> -       je      L(loop_init)
> -       bsfq    %rdx, %rax
> -       SHIFT_RETURN
> -       ret
> -#endif
> -       .p2align 4
> -L(loop_init):
> -       pxor    %xmm1, %xmm1
> -       pxor    %xmm2, %xmm2
> -       pxor    %xmm3, %xmm3
> -#ifdef AS_STRNLEN
> -       .p2align 4
> -L(loop):
> -
> -       addq    $64, %rax
> -       cmpq    %rax, %r10
> -       je      L(exit_end)
> -
> -       movdqa  (%rax), %xmm0
> -       PMINU   16(%rax), %xmm0
> -       PMINU   32(%rax), %xmm0
> -       PMINU   48(%rax), %xmm0
> -       PCMPEQ  %xmm3, %xmm0
> -       pmovmskb        %xmm0, %edx
> -       testl   %edx, %edx
> -       jne     L(exit)
> -       jmp     L(loop)
> -
> -       .p2align 4
> -L(exit_end):
> -       cmp     %rax, %r11
> -       je      L(first) /* Do not read when end is at page boundary.  */
> -       pxor    %xmm0, %xmm0
> -       FIND_ZERO
> -
> -L(first):
> -       bts     %r11, %rdx
> -       bsfq    %rdx, %rdx
> -       addq    %rdx, %rax
> -       subq    %rdi, %rax
> -       SHIFT_RETURN
> -       ret
> -
> -       .p2align 4
> -L(exit):
> -       pxor    %xmm0, %xmm0
> -       FIND_ZERO
> -
> -       bsfq    %rdx, %rdx
> -       addq    %rdx, %rax
> -       subq    %rdi, %rax
> -       SHIFT_RETURN
> -       ret
> -
> -#else
> -
> -       /* Main loop.  Unrolled twice to improve L2 cache performance on core2.  */
> -       .p2align 4
> -L(loop):
> -
> -       movdqa  64(%rax), %xmm0
> -       PMINU   80(%rax), %xmm0
> -       PMINU   96(%rax), %xmm0
> -       PMINU   112(%rax), %xmm0
> -       PCMPEQ  %xmm3, %xmm0
> -       pmovmskb        %xmm0, %edx
> -       testl   %edx, %edx
> -       jne     L(exit64)
> -
> -       subq    $-128, %rax
> -
> -       movdqa  (%rax), %xmm0
> -       PMINU   16(%rax), %xmm0
> -       PMINU   32(%rax), %xmm0
> -       PMINU   48(%rax), %xmm0
> -       PCMPEQ  %xmm3, %xmm0
> -       pmovmskb        %xmm0, %edx
> -       testl   %edx, %edx
> -       jne     L(exit0)
> -       jmp     L(loop)
> -
> -       .p2align 4
> -L(exit64):
> -       addq    $64, %rax
> -L(exit0):
> -       pxor    %xmm0, %xmm0
> -       FIND_ZERO
> -
> -       bsfq    %rdx, %rdx
> -       addq    %rdx, %rax
> -       subq    %rdi, %rax
> -       SHIFT_RETURN
> -       ret
> -
> -#endif
> -
> -END(strlen)
> diff --git a/sysdeps/x86_64/multiarch/strnlen-sse2.S b/sysdeps/x86_64/multiarch/strnlen-sse2.S
> index c4f395c210..a50c7d6a28 100644
> --- a/sysdeps/x86_64/multiarch/strnlen-sse2.S
> +++ b/sysdeps/x86_64/multiarch/strnlen-sse2.S
> @@ -17,12 +17,10 @@
>     <https://www.gnu.org/licenses/>.  */
>
>  #if IS_IN (libc)
> -# define __strnlen __strnlen_sse2
> -
> -# undef weak_alias
> -# define weak_alias(__strnlen, strnlen)
> -# undef libc_hidden_builtin_def
> -# define libc_hidden_builtin_def(strnlen)
> +# ifndef STRLEN
> +#  define STRLEN       __strnlen_sse2
> +# endif
>  #endif
>
> -#include "../strnlen.S"
> +#define AS_STRNLEN
> +#include "strlen-sse2.S"
> diff --git a/sysdeps/x86_64/multiarch/wcslen-sse4_1.S b/sysdeps/x86_64/multiarch/wcslen-sse4_1.S
> index e306a77f51..c88e8342a1 100644
> --- a/sysdeps/x86_64/multiarch/wcslen-sse4_1.S
> +++ b/sysdeps/x86_64/multiarch/wcslen-sse4_1.S
> @@ -1,5 +1,5 @@
>  #define AS_WCSLEN
> -#define strlen __wcslen_sse4_1
> +#define STRLEN __wcslen_sse4_1
>  #define SECTION(p)     p##.sse4.1
>
> -#include "strlen-vec.S"
> +#include "strlen-sse2.S"
> diff --git a/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S b/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S
> index d2f7dd6e22..17cdedc2a9 100644
> --- a/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S
> +++ b/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S
> @@ -1,6 +1,6 @@
>  #define AS_WCSLEN
>  #define AS_STRNLEN
> -#define strlen __wcsnlen_sse4_1
> +#define STRLEN __wcsnlen_sse4_1
>  #define SECTION(p)     p##.sse4.1
>
> -#include "strlen-vec.S"
> +#include "strlen-sse2.S"
> diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
> index e1f0b19f2f..c2f5674f8d 100644
> --- a/sysdeps/x86_64/strlen.S
> +++ b/sysdeps/x86_64/strlen.S
> @@ -16,6 +16,7 @@
>     License along with the GNU C Library; if not, see
>     <https://www.gnu.org/licenses/>.  */
>
> -#include "multiarch/strlen-vec.S"
> +#define STRLEN strlen
> +#include "multiarch/strlen-sse2.S"
>
>  libc_hidden_builtin_def (strlen)
> diff --git a/sysdeps/x86_64/strnlen.S b/sysdeps/x86_64/strnlen.S
> index d3c43ac482..174970d58f 100644
> --- a/sysdeps/x86_64/strnlen.S
> +++ b/sysdeps/x86_64/strnlen.S
> @@ -1,6 +1,6 @@
> -#define AS_STRNLEN
> -#define strlen __strnlen
> -#include "strlen.S"
> +#define STRLEN __strnlen
> +#include "multiarch/strnlen-sse2.S"
>
> +libc_hidden_def (__strnlen)
>  weak_alias (__strnlen, strnlen);
>  libc_hidden_builtin_def (strnlen)
> --
> 2.34.1
>
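
A small aside that may help when reading FIND_ZERO and PROLOG above:
each PCMPEQ/pmovmskb pair yields a 16-bit NUL mask for one 16-byte
block, the four masks are merged into a single 64-bit mask, and bsf of
that mask is the offset of the first NUL in the 64-byte window; PROLOG
then shifts away the bits that lie before the start of the string. A C
model of that bookkeeping (sketch only; nul_mask16 is a made-up stand-in
for pcmpeqb/pmovmskb against a zeroed register):

#include <stdint.h>
#include <stddef.h>

/* Stand-in for pcmpeqb + pmovmskb against an all-zero xmm register:
   bit i is set iff p[i] == 0.  */
static uint64_t
nul_mask16 (const unsigned char *p)
{
  uint64_t m = 0;
  for (int i = 0; i < 16; i++)
    if (p[i] == 0)
      m |= (uint64_t) 1 << i;
  return m;
}

/* FIND_ZERO + PROLOG over one aligned 64-byte window at `base':
   returns the length measured from `start' if the first NUL lies in
   this window, otherwise (size_t) -1.  `start' must point into the
   window.  */
static size_t
len_in_window (const unsigned char *base, const unsigned char *start)
{
  uint64_t m = nul_mask16 (base)
               | nul_mask16 (base + 16) << 16
               | nul_mask16 (base + 32) << 32
               | nul_mask16 (base + 48) << 48;
  m >>= start - base;                  /* sarq %cl, %rdx  */
  return m != 0 ? (size_t) __builtin_ctzll (m) : (size_t) -1;
}

For the wcslen variants pcmpeqd is used instead, so every matching wide
character sets four mask bits and SHIFT_RETURN converts the byte offset
back to an element count at the end; that is also why the wcsnlen path
checks whether maxlen * sizeof (wchar_t) would overflow before scaling
it, falling back to plain wcslen in that case.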

LGTM.

Thanks.

-- 
H.J.

^ permalink raw reply	[flat|nested] 21+ messages in thread

* Re: [PATCH v1] x86: Move strlen SSE2 implementation to multiarch/strlen-sse2.S
  2022-07-12 23:29 ` [PATCH v1] x86: Move strlen SSE2 implementation to multiarch/strlen-sse2.S H.J. Lu
@ 2022-07-13  4:06   ` Noah Goldstein
  0 siblings, 0 replies; 21+ messages in thread
From: Noah Goldstein @ 2022-07-13  4:06 UTC (permalink / raw)
  To: H.J. Lu; +Cc: GNU C Library, Carlos O'Donell

Carlos,

Any issue with pushing the "move <func> SSE2 implementation to
multiarch/<func>-sse2.S" commits?

The follow on patchwork:
https://patchwork.sourceware.org/project/glibc/patch/20220712192910.351121-8-goldstein.w.n@gmail.com/
https://patchwork.sourceware.org/project/glibc/patch/20220712192910.351121-7-goldstein.w.n@gmail.com/
https://patchwork.sourceware.org/project/glibc/patch/20220712192910.351121-6-goldstein.w.n@gmail.com/
https://patchwork.sourceware.org/project/glibc/patch/20220712192910.351121-5-goldstein.w.n@gmail.com/
https://patchwork.sourceware.org/project/glibc/patch/20220712192910.351121-4-goldstein.w.n@gmail.com/
https://patchwork.sourceware.org/project/glibc/patch/20220712192910.351121-3-goldstein.w.n@gmail.com/
https://patchwork.sourceware.org/project/glibc/patch/20220712192910.351121-2-goldstein.w.n@gmail.com/
https://patchwork.sourceware.org/project/glibc/patch/20220712192910.351121-1-goldstein.w.n@gmail.com/
https://patchwork.sourceware.org/project/glibc/patch/20220712192808.335531-4-goldstein.w.n@gmail.com/
https://patchwork.sourceware.org/project/glibc/patch/20220712192808.335531-3-goldstein.w.n@gmail.com/
https://patchwork.sourceware.org/project/glibc/patch/20220712192808.335531-2-goldstein.w.n@gmail.com/

They are necessary for an upcoming ISA-level raising patch that I hope
to get into 2.36, but it may be too near the release date for such
large changes.


On Tue, Jul 12, 2022 at 4:30 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Tue, Jul 12, 2022 at 12:29 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > This commit doesn't affect libc.so.6, it's just housekeeping to prepare
> > for adding explicit ISA level support.
> >
> > Tested build on x86_64 and x86_32 with/without multiarch.
> > ---
> >  sysdeps/x86_64/multiarch/rtld-strlen.S    |  18 ++
> >  sysdeps/x86_64/multiarch/rtld-strnlen.S   |  18 ++
> >  sysdeps/x86_64/multiarch/strlen-sse2.S    | 260 ++++++++++++++++++++-
> >  sysdeps/x86_64/multiarch/strlen-vec.S     | 267 ----------------------
> >  sysdeps/x86_64/multiarch/strnlen-sse2.S   |  12 +-
> >  sysdeps/x86_64/multiarch/wcslen-sse4_1.S  |   4 +-
> >  sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S |   4 +-
> >  sysdeps/x86_64/strlen.S                   |   3 +-
> >  sysdeps/x86_64/strnlen.S                  |   6 +-
> >  9 files changed, 306 insertions(+), 286 deletions(-)
> >  create mode 100644 sysdeps/x86_64/multiarch/rtld-strlen.S
> >  create mode 100644 sysdeps/x86_64/multiarch/rtld-strnlen.S
> >  delete mode 100644 sysdeps/x86_64/multiarch/strlen-vec.S
> >
> > diff --git a/sysdeps/x86_64/multiarch/rtld-strlen.S b/sysdeps/x86_64/multiarch/rtld-strlen.S
> > new file mode 100644
> > index 0000000000..609d26256e
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/rtld-strlen.S
> > @@ -0,0 +1,18 @@
> > +/* Copyright (C) 2022 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#include "../strlen.S"
> > diff --git a/sysdeps/x86_64/multiarch/rtld-strnlen.S b/sysdeps/x86_64/multiarch/rtld-strnlen.S
> > new file mode 100644
> > index 0000000000..ef2d64abc2
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/rtld-strnlen.S
> > @@ -0,0 +1,18 @@
> > +/* Copyright (C) 2022 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#include "../strnlen.S"
> > diff --git a/sysdeps/x86_64/multiarch/strlen-sse2.S b/sysdeps/x86_64/multiarch/strlen-sse2.S
> > index 660b327ed2..5be72267d5 100644
> > --- a/sysdeps/x86_64/multiarch/strlen-sse2.S
> > +++ b/sysdeps/x86_64/multiarch/strlen-sse2.S
> > @@ -16,8 +16,260 @@
> >     License along with the GNU C Library; if not, see
> >     <https://www.gnu.org/licenses/>.  */
> >
> > -#if IS_IN (libc)
> > -# define strlen __strlen_sse2
> > -#endif
> > +#if IS_IN (libc) || defined STRLEN
> > +
> > +# ifndef STRLEN
> > +#  define STRLEN __strlen_sse2
> > +# endif
> > +
> > +
> > +# include <sysdep.h>
> > +
> > +# ifdef AS_WCSLEN
> > +#  define PMINU                pminud
> > +#  define PCMPEQ               pcmpeqd
> > +#  define SHIFT_RETURN shrq $2, %rax
> > +# else
> > +#  define PMINU                pminub
> > +#  define PCMPEQ               pcmpeqb
> > +#  define SHIFT_RETURN
> > +# endif
> > +
> > +# ifndef SECTION
> > +#  define SECTION(p)   p
> > +# endif
> > +
> > +/* Long lived registers in strlen(s), strnlen(s, n) are:
> > +
> > +       %xmm3 - zero
> > +       %rdi   - s
> > +       %r10  (s+n) & (~(64-1))
> > +       %r11   s+n
> > +*/
> > +
> > +
> > +       .section SECTION(.text),"ax",@progbits
> > +ENTRY(STRLEN)
> > +
> > +/* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx.  */
> > +# define FIND_ZERO     \
> > +       PCMPEQ  (%rax), %xmm0;  \
> > +       PCMPEQ  16(%rax), %xmm1;        \
> > +       PCMPEQ  32(%rax), %xmm2;        \
> > +       PCMPEQ  48(%rax), %xmm3;        \
> > +       pmovmskb        %xmm0, %esi;    \
> > +       pmovmskb        %xmm1, %edx;    \
> > +       pmovmskb        %xmm2, %r8d;    \
> > +       pmovmskb        %xmm3, %ecx;    \
> > +       salq    $16, %rdx;      \
> > +       salq    $16, %rcx;      \
> > +       orq     %rsi, %rdx;     \
> > +       orq     %r8, %rcx;      \
> > +       salq    $32, %rcx;      \
> > +       orq     %rcx, %rdx;
> > +
> > +# ifdef AS_STRNLEN
> > +/* Do not read anything when n==0.  */
> > +       test    %RSI_LP, %RSI_LP
> > +       jne     L(n_nonzero)
> > +       xor     %rax, %rax
> > +       ret
> > +L(n_nonzero):
> > +#  ifdef AS_WCSLEN
> > +/* Check for overflow from maxlen * sizeof(wchar_t). If it would
> > +   overflow, the only way this program doesn't have undefined behavior
> > +   is if there is a null terminator in valid memory, so wcslen will
> > +   suffice.  */
> > +       mov     %RSI_LP, %R10_LP
> > +       sar     $62, %R10_LP
> > +       jnz     __wcslen_sse4_1
> > +       sal     $2, %RSI_LP
> > +#  endif
> > +
> > +/* Initialize long lived registers.  */
> > +       add     %RDI_LP, %RSI_LP
> > +       mov     %RSI_LP, %R10_LP
> > +       and     $-64, %R10_LP
> > +       mov     %RSI_LP, %R11_LP
> > +# endif
> > +
> > +       pxor    %xmm0, %xmm0
> > +       pxor    %xmm1, %xmm1
> > +       pxor    %xmm2, %xmm2
> > +       pxor    %xmm3, %xmm3
> > +       movq    %rdi, %rax
> > +       movq    %rdi, %rcx
> > +       andq    $4095, %rcx
> > +/* Offsets 4032-4047 will be aligned into 4032 thus fit into page.  */
> > +       cmpq    $4047, %rcx
> > +/* We cannot unify this branching as it would be ~6 cycles slower.  */
> > +       ja      L(cross_page)
> > +
> > +# ifdef AS_STRNLEN
> > +/* Test if end is among first 64 bytes.  */
> > +#  define STRNLEN_PROLOG       \
> > +       mov     %r11, %rsi;     \
> > +       subq    %rax, %rsi;     \
> > +       andq    $-64, %rax;     \
> > +       testq   $-64, %rsi;     \
> > +       je      L(strnlen_ret)
> > +# else
> > +#  define STRNLEN_PROLOG  andq $-64, %rax;
> > +# endif
> > +
> > +/* Ignore bits in mask that come before start of string.  */
> > +# define PROLOG(lab)   \
> > +       movq    %rdi, %rcx;     \
> > +       xorq    %rax, %rcx;     \
> > +       STRNLEN_PROLOG; \
> > +       sarq    %cl, %rdx;      \
> > +       test    %rdx, %rdx;     \
> > +       je      L(lab); \
> > +       bsfq    %rdx, %rax;     \
> > +       SHIFT_RETURN;           \
> > +       ret
> > +
> > +# ifdef AS_STRNLEN
> > +       andq    $-16, %rax
> > +       FIND_ZERO
> > +# else
> > +       /* Test first 16 bytes unaligned.  */
> > +       movdqu  (%rax), %xmm4
> > +       PCMPEQ  %xmm0, %xmm4
> > +       pmovmskb        %xmm4, %edx
> > +       test    %edx, %edx
> > +       je      L(next48_bytes)
> > +       bsf     %edx, %eax /* If eax is zeroed 16bit bsf can be used.  */
> > +       SHIFT_RETURN
> > +       ret
> > +
> > +L(next48_bytes):
> > +/* Same as FIND_ZERO except we do not check first 16 bytes.  */
> > +       andq    $-16, %rax
> > +       PCMPEQ 16(%rax), %xmm1
> > +       PCMPEQ 32(%rax), %xmm2
> > +       PCMPEQ 48(%rax), %xmm3
> > +       pmovmskb        %xmm1, %edx
> > +       pmovmskb        %xmm2, %r8d
> > +       pmovmskb        %xmm3, %ecx
> > +       salq    $16, %rdx
> > +       salq    $16, %rcx
> > +       orq     %r8, %rcx
> > +       salq    $32, %rcx
> > +       orq     %rcx, %rdx
> > +# endif
> >
> > -#include "strlen-vec.S"
> > +       /* When no zero byte is found xmm1-3 are zero so we do not have to
> > +          zero them.  */
> > +       PROLOG(loop)
> > +
> > +       .p2align 4
> > +L(cross_page):
> > +       andq    $-64, %rax
> > +       FIND_ZERO
> > +       PROLOG(loop_init)
> > +
> > +# ifdef AS_STRNLEN
> > +/* We must do this check to correctly handle strnlen (s, -1).  */
> > +L(strnlen_ret):
> > +       bts     %rsi, %rdx
> > +       sarq    %cl, %rdx
> > +       test    %rdx, %rdx
> > +       je      L(loop_init)
> > +       bsfq    %rdx, %rax
> > +       SHIFT_RETURN
> > +       ret
> > +# endif
> > +       .p2align 4
> > +L(loop_init):
> > +       pxor    %xmm1, %xmm1
> > +       pxor    %xmm2, %xmm2
> > +       pxor    %xmm3, %xmm3
> > +# ifdef AS_STRNLEN
> > +       .p2align 4
> > +L(loop):
> > +
> > +       addq    $64, %rax
> > +       cmpq    %rax, %r10
> > +       je      L(exit_end)
> > +
> > +       movdqa  (%rax), %xmm0
> > +       PMINU   16(%rax), %xmm0
> > +       PMINU   32(%rax), %xmm0
> > +       PMINU   48(%rax), %xmm0
> > +       PCMPEQ  %xmm3, %xmm0
> > +       pmovmskb        %xmm0, %edx
> > +       testl   %edx, %edx
> > +       jne     L(exit)
> > +       jmp     L(loop)
> > +
> > +       .p2align 4
> > +L(exit_end):
> > +       cmp     %rax, %r11
> > +       je      L(first) /* Do not read when end is at page boundary.  */
> > +       pxor    %xmm0, %xmm0
> > +       FIND_ZERO
> > +
> > +L(first):
> > +       bts     %r11, %rdx
> > +       bsfq    %rdx, %rdx
> > +       addq    %rdx, %rax
> > +       subq    %rdi, %rax
> > +       SHIFT_RETURN
> > +       ret
> > +
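The bts here is the trick that caps the result at the caller's bound: planting
a set bit at the offset of s + n inside this final 64-byte block means bsf can
never report a position past the end, whether a NUL was found or not (and when
the end sits exactly on the block boundary, end & 63 == 0 forces a result of
exactly n). A hedged C model, names made up:

    #include <stddef.h>
    #include <stdint.h>

    /* block is the 64-byte-aligned block being finished ((s + n) & -64),
       mask is the FIND_ZERO result for it, end == s + n.  */
    static size_t finish_strnlen_block (uint64_t mask, uintptr_t block,
                                        uintptr_t s, uintptr_t end)
    {
      mask |= (uint64_t) 1 << (end & 63);        /* bts %r11, %rdx (index mod 64) */
      return block + __builtin_ctzll (mask) - s; /* bsfq; addq; subq              */
    }
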
> > +       .p2align 4
> > +L(exit):
> > +       pxor    %xmm0, %xmm0
> > +       FIND_ZERO
> > +
> > +       bsfq    %rdx, %rdx
> > +       addq    %rdx, %rax
> > +       subq    %rdi, %rax
> > +       SHIFT_RETURN
> > +       ret
> > +
> > +# else
> > +
> > +       /* Main loop.  Unrolled twice to improve L2 cache performance on core2.  */
> > +       .p2align 4
> > +L(loop):
> > +
> > +       movdqa  64(%rax), %xmm0
> > +       PMINU   80(%rax), %xmm0
> > +       PMINU   96(%rax), %xmm0
> > +       PMINU   112(%rax), %xmm0
> > +       PCMPEQ  %xmm3, %xmm0
> > +       pmovmskb        %xmm0, %edx
> > +       testl   %edx, %edx
> > +       jne     L(exit64)
> > +
> > +       subq    $-128, %rax
> > +
> > +       movdqa  (%rax), %xmm0
> > +       PMINU   16(%rax), %xmm0
> > +       PMINU   32(%rax), %xmm0
> > +       PMINU   48(%rax), %xmm0
> > +       PCMPEQ  %xmm3, %xmm0
> > +       pmovmskb        %xmm0, %edx
> > +       testl   %edx, %edx
> > +       jne     L(exit0)
> > +       jmp     L(loop)
> > +
> > +       .p2align 4
> > +L(exit64):
> > +       addq    $64, %rax
> > +L(exit0):
> > +       pxor    %xmm0, %xmm0
> > +       FIND_ZERO
> > +
> > +       bsfq    %rdx, %rdx
> > +       addq    %rdx, %rax
> > +       subq    %rdi, %rax
> > +       SHIFT_RETURN
> > +       ret
> > +
> > +# endif
> > +
> > +END(STRLEN)
> > +#endif
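Both main loops lean on the same observation: the byte-wise unsigned minimum of
the four 16-byte chunks has a zero byte exactly when any of the 64 input bytes
is zero, so one compare against zero and one pmovmskb per 64 bytes is enough
(and subq $-128 is just an add of 128 that fits in a sign-extended imm8). A
scalar C model of the pminub test (name made up):

    #include <stdbool.h>

    static bool block64_has_nul (const unsigned char *p)
    {
      bool any = false;
      for (int i = 0; i < 16; i++)
        {
          unsigned char m = p[i];                /* movdqa (%rax), %xmm0   */
          if (p[16 + i] < m) m = p[16 + i];      /* PMINU 16(%rax), %xmm0  */
          if (p[32 + i] < m) m = p[32 + i];      /* PMINU 32(%rax), %xmm0  */
          if (p[48 + i] < m) m = p[48 + i];      /* PMINU 48(%rax), %xmm0  */
          any |= (m == 0);                       /* PCMPEQ + pmovmskb      */
        }
      return any;
    }
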
> > diff --git a/sysdeps/x86_64/multiarch/strlen-vec.S b/sysdeps/x86_64/multiarch/strlen-vec.S
> > deleted file mode 100644
> > index 874123d604..0000000000
> > --- a/sysdeps/x86_64/multiarch/strlen-vec.S
> > +++ /dev/null
> > @@ -1,267 +0,0 @@
> > -/* SSE2 version of strlen and SSE4.1 version of wcslen.
> > -   Copyright (C) 2012-2022 Free Software Foundation, Inc.
> > -   This file is part of the GNU C Library.
> > -
> > -   The GNU C Library is free software; you can redistribute it and/or
> > -   modify it under the terms of the GNU Lesser General Public
> > -   License as published by the Free Software Foundation; either
> > -   version 2.1 of the License, or (at your option) any later version.
> > -
> > -   The GNU C Library is distributed in the hope that it will be useful,
> > -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > -   Lesser General Public License for more details.
> > -
> > -   You should have received a copy of the GNU Lesser General Public
> > -   License along with the GNU C Library; if not, see
> > -   <https://www.gnu.org/licenses/>.  */
> > -
> > -#include <sysdep.h>
> > -
> > -#ifdef AS_WCSLEN
> > -# define PMINU         pminud
> > -# define PCMPEQ                pcmpeqd
> > -# define SHIFT_RETURN  shrq $2, %rax
> > -#else
> > -# define PMINU         pminub
> > -# define PCMPEQ                pcmpeqb
> > -# define SHIFT_RETURN
> > -#endif
> > -
> > -#ifndef SECTION
> > -# define SECTION(p)    p
> > -#endif
> > -
> > -/* Long lived register in strlen(s), strnlen(s, n) are:
> > -
> > -       %xmm3 - zero
> > -       %rdi   - s
> > -       %r10  (s+n) & (~(64-1))
> > -       %r11   s+n
> > -*/
> > -
> > -
> > -       .section SECTION(.text),"ax",@progbits
> > -ENTRY(strlen)
> > -
> > -/* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx.  */
> > -#define FIND_ZERO      \
> > -       PCMPEQ  (%rax), %xmm0;  \
> > -       PCMPEQ  16(%rax), %xmm1;        \
> > -       PCMPEQ  32(%rax), %xmm2;        \
> > -       PCMPEQ  48(%rax), %xmm3;        \
> > -       pmovmskb        %xmm0, %esi;    \
> > -       pmovmskb        %xmm1, %edx;    \
> > -       pmovmskb        %xmm2, %r8d;    \
> > -       pmovmskb        %xmm3, %ecx;    \
> > -       salq    $16, %rdx;      \
> > -       salq    $16, %rcx;      \
> > -       orq     %rsi, %rdx;     \
> > -       orq     %r8, %rcx;      \
> > -       salq    $32, %rcx;      \
> > -       orq     %rcx, %rdx;
> > -
> > -#ifdef AS_STRNLEN
> > -/* Do not read anything when n==0.  */
> > -       test    %RSI_LP, %RSI_LP
> > -       jne     L(n_nonzero)
> > -       xor     %rax, %rax
> > -       ret
> > -L(n_nonzero):
> > -# ifdef AS_WCSLEN
> > -/* Check for overflow from maxlen * sizeof(wchar_t). If it would
> > -   overflow the only way this program doesn't have undefined behavior
> > -   is if there is a null terminator in valid memory so wcslen will
> > -   suffice.  */
> > -       mov     %RSI_LP, %R10_LP
> > -       sar     $62, %R10_LP
> > -       jnz     __wcslen_sse4_1
> > -       sal     $2, %RSI_LP
> > -# endif
> > -
> > -/* Initialize long lived registers.  */
> > -       add     %RDI_LP, %RSI_LP
> > -       mov     %RSI_LP, %R10_LP
> > -       and     $-64, %R10_LP
> > -       mov     %RSI_LP, %R11_LP
> > -#endif
> > -
> > -       pxor    %xmm0, %xmm0
> > -       pxor    %xmm1, %xmm1
> > -       pxor    %xmm2, %xmm2
> > -       pxor    %xmm3, %xmm3
> > -       movq    %rdi, %rax
> > -       movq    %rdi, %rcx
> > -       andq    $4095, %rcx
> > -/* Offsets 4032-4047 will be aligned into 4032 thus fit into page.  */
> > -       cmpq    $4047, %rcx
> > -/* We cannot unify this branching as it would be ~6 cycles slower.  */
> > -       ja      L(cross_page)
> > -
> > -#ifdef AS_STRNLEN
> > -/* Test if end is among first 64 bytes.  */
> > -# define STRNLEN_PROLOG        \
> > -       mov     %r11, %rsi;     \
> > -       subq    %rax, %rsi;     \
> > -       andq    $-64, %rax;     \
> > -       testq   $-64, %rsi;     \
> > -       je      L(strnlen_ret)
> > -#else
> > -# define STRNLEN_PROLOG  andq $-64, %rax;
> > -#endif
> > -
> > -/* Ignore bits in mask that come before start of string.  */
> > -#define PROLOG(lab)    \
> > -       movq    %rdi, %rcx;     \
> > -       xorq    %rax, %rcx;     \
> > -       STRNLEN_PROLOG; \
> > -       sarq    %cl, %rdx;      \
> > -       test    %rdx, %rdx;     \
> > -       je      L(lab); \
> > -       bsfq    %rdx, %rax;     \
> > -       SHIFT_RETURN;           \
> > -       ret
> > -
> > -#ifdef AS_STRNLEN
> > -       andq    $-16, %rax
> > -       FIND_ZERO
> > -#else
> > -       /* Test first 16 bytes unaligned.  */
> > -       movdqu  (%rax), %xmm4
> > -       PCMPEQ  %xmm0, %xmm4
> > -       pmovmskb        %xmm4, %edx
> > -       test    %edx, %edx
> > -       je      L(next48_bytes)
> > -       bsf     %edx, %eax /* If eax is zeroed 16bit bsf can be used.  */
> > -       SHIFT_RETURN
> > -       ret
> > -
> > -L(next48_bytes):
> > -/* Same as FIND_ZERO except we do not check first 16 bytes.  */
> > -       andq    $-16, %rax
> > -       PCMPEQ 16(%rax), %xmm1
> > -       PCMPEQ 32(%rax), %xmm2
> > -       PCMPEQ 48(%rax), %xmm3
> > -       pmovmskb        %xmm1, %edx
> > -       pmovmskb        %xmm2, %r8d
> > -       pmovmskb        %xmm3, %ecx
> > -       salq    $16, %rdx
> > -       salq    $16, %rcx
> > -       orq     %r8, %rcx
> > -       salq    $32, %rcx
> > -       orq     %rcx, %rdx
> > -#endif
> > -
> > -       /* When no zero byte is found xmm1-3 are zero so we do not have to
> > -          zero them.  */
> > -       PROLOG(loop)
> > -
> > -       .p2align 4
> > -L(cross_page):
> > -       andq    $-64, %rax
> > -       FIND_ZERO
> > -       PROLOG(loop_init)
> > -
> > -#ifdef AS_STRNLEN
> > -/* We must do this check to correctly handle strnlen (s, -1).  */
> > -L(strnlen_ret):
> > -       bts     %rsi, %rdx
> > -       sarq    %cl, %rdx
> > -       test    %rdx, %rdx
> > -       je      L(loop_init)
> > -       bsfq    %rdx, %rax
> > -       SHIFT_RETURN
> > -       ret
> > -#endif
> > -       .p2align 4
> > -L(loop_init):
> > -       pxor    %xmm1, %xmm1
> > -       pxor    %xmm2, %xmm2
> > -       pxor    %xmm3, %xmm3
> > -#ifdef AS_STRNLEN
> > -       .p2align 4
> > -L(loop):
> > -
> > -       addq    $64, %rax
> > -       cmpq    %rax, %r10
> > -       je      L(exit_end)
> > -
> > -       movdqa  (%rax), %xmm0
> > -       PMINU   16(%rax), %xmm0
> > -       PMINU   32(%rax), %xmm0
> > -       PMINU   48(%rax), %xmm0
> > -       PCMPEQ  %xmm3, %xmm0
> > -       pmovmskb        %xmm0, %edx
> > -       testl   %edx, %edx
> > -       jne     L(exit)
> > -       jmp     L(loop)
> > -
> > -       .p2align 4
> > -L(exit_end):
> > -       cmp     %rax, %r11
> > -       je      L(first) /* Do not read when end is at page boundary.  */
> > -       pxor    %xmm0, %xmm0
> > -       FIND_ZERO
> > -
> > -L(first):
> > -       bts     %r11, %rdx
> > -       bsfq    %rdx, %rdx
> > -       addq    %rdx, %rax
> > -       subq    %rdi, %rax
> > -       SHIFT_RETURN
> > -       ret
> > -
> > -       .p2align 4
> > -L(exit):
> > -       pxor    %xmm0, %xmm0
> > -       FIND_ZERO
> > -
> > -       bsfq    %rdx, %rdx
> > -       addq    %rdx, %rax
> > -       subq    %rdi, %rax
> > -       SHIFT_RETURN
> > -       ret
> > -
> > -#else
> > -
> > -       /* Main loop.  Unrolled twice to improve L2 cache performance on core2.  */
> > -       .p2align 4
> > -L(loop):
> > -
> > -       movdqa  64(%rax), %xmm0
> > -       PMINU   80(%rax), %xmm0
> > -       PMINU   96(%rax), %xmm0
> > -       PMINU   112(%rax), %xmm0
> > -       PCMPEQ  %xmm3, %xmm0
> > -       pmovmskb        %xmm0, %edx
> > -       testl   %edx, %edx
> > -       jne     L(exit64)
> > -
> > -       subq    $-128, %rax
> > -
> > -       movdqa  (%rax), %xmm0
> > -       PMINU   16(%rax), %xmm0
> > -       PMINU   32(%rax), %xmm0
> > -       PMINU   48(%rax), %xmm0
> > -       PCMPEQ  %xmm3, %xmm0
> > -       pmovmskb        %xmm0, %edx
> > -       testl   %edx, %edx
> > -       jne     L(exit0)
> > -       jmp     L(loop)
> > -
> > -       .p2align 4
> > -L(exit64):
> > -       addq    $64, %rax
> > -L(exit0):
> > -       pxor    %xmm0, %xmm0
> > -       FIND_ZERO
> > -
> > -       bsfq    %rdx, %rdx
> > -       addq    %rdx, %rax
> > -       subq    %rdi, %rax
> > -       SHIFT_RETURN
> > -       ret
> > -
> > -#endif
> > -
> > -END(strlen)
> > diff --git a/sysdeps/x86_64/multiarch/strnlen-sse2.S b/sysdeps/x86_64/multiarch/strnlen-sse2.S
> > index c4f395c210..a50c7d6a28 100644
> > --- a/sysdeps/x86_64/multiarch/strnlen-sse2.S
> > +++ b/sysdeps/x86_64/multiarch/strnlen-sse2.S
> > @@ -17,12 +17,10 @@
> >     <https://www.gnu.org/licenses/>.  */
> >
> >  #if IS_IN (libc)
> > -# define __strnlen __strnlen_sse2
> > -
> > -# undef weak_alias
> > -# define weak_alias(__strnlen, strnlen)
> > -# undef libc_hidden_builtin_def
> > -# define libc_hidden_builtin_def(strnlen)
> > +# ifndef STRLEN
> > +#  define STRLEN       __strnlen_sse2
> > +# endif
> >  #endif
> >
> > -#include "../strnlen.S"
> > +#define AS_STRNLEN
> > +#include "strlen-sse2.S"
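With the switch from redefining strlen/__strnlen to a single STRLEN macro,
every variant becomes a thin wrapper that names the symbol, optionally sets
AS_STRNLEN/AS_WCSLEN, and includes strlen-sse2.S. A hypothetical C analogue of
that template-by-macro scheme (everything below is made up for illustration):

    #include <stddef.h>

    #define MAKE_LEN(name, bounded)                          \
      size_t name (const char *s, size_t maxlen)             \
      {                                                      \
        size_t i = 0;                                        \
        while ((!(bounded) || i < maxlen) && s[i] != '\0')   \
          i++;                                               \
        return i;                                            \
      }

    MAKE_LEN (strlen_sketch, 0)   /* like strlen.S: STRLEN is strlen          */
    MAKE_LEN (strnlen_sketch, 1)  /* like strnlen-sse2.S: adds AS_STRNLEN     */
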
> > diff --git a/sysdeps/x86_64/multiarch/wcslen-sse4_1.S b/sysdeps/x86_64/multiarch/wcslen-sse4_1.S
> > index e306a77f51..c88e8342a1 100644
> > --- a/sysdeps/x86_64/multiarch/wcslen-sse4_1.S
> > +++ b/sysdeps/x86_64/multiarch/wcslen-sse4_1.S
> > @@ -1,5 +1,5 @@
> >  #define AS_WCSLEN
> > -#define strlen __wcslen_sse4_1
> > +#define STRLEN __wcslen_sse4_1
> >  #define SECTION(p)     p##.sse4.1
> >
> > -#include "strlen-vec.S"
> > +#include "strlen-sse2.S"
> > diff --git a/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S b/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S
> > index d2f7dd6e22..17cdedc2a9 100644
> > --- a/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S
> > +++ b/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S
> > @@ -1,6 +1,6 @@
> >  #define AS_WCSLEN
> >  #define AS_STRNLEN
> > -#define strlen __wcsnlen_sse4_1
> > +#define STRLEN __wcsnlen_sse4_1
> >  #define SECTION(p)     p##.sse4.1
> >
> > -#include "strlen-vec.S"
> > +#include "strlen-sse2.S"
> > diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
> > index e1f0b19f2f..c2f5674f8d 100644
> > --- a/sysdeps/x86_64/strlen.S
> > +++ b/sysdeps/x86_64/strlen.S
> > @@ -16,6 +16,7 @@
> >     License along with the GNU C Library; if not, see
> >     <https://www.gnu.org/licenses/>.  */
> >
> > -#include "multiarch/strlen-vec.S"
> > +#define STRLEN strlen
> > +#include "multiarch/strlen-sse2.S"
> >
> >  libc_hidden_builtin_def (strlen)
> > diff --git a/sysdeps/x86_64/strnlen.S b/sysdeps/x86_64/strnlen.S
> > index d3c43ac482..174970d58f 100644
> > --- a/sysdeps/x86_64/strnlen.S
> > +++ b/sysdeps/x86_64/strnlen.S
> > @@ -1,6 +1,6 @@
> > -#define AS_STRNLEN
> > -#define strlen __strnlen
> > -#include "strlen.S"
> > +#define STRLEN __strnlen
> > +#include "multiarch/strnlen-sse2.S"
> >
> > +libc_hidden_def (__strnlen)
> >  weak_alias (__strnlen, strnlen);
> >  libc_hidden_builtin_def (strnlen)
> > --
> > 2.34.1
> >
>
> LGTM.
>
> Thanks.
>
> --
> H.J.


end of thread, other threads:[~2022-07-13  4:06 UTC | newest]

Thread overview: 21+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-07-12 19:29 [PATCH v1] x86: Move strlen SSE2 implementation to multiarch/strlen-sse2.S Noah Goldstein
2022-07-12 19:29 ` [PATCH v1] x86: Move strcpy SSE2 implementation to multiarch/strcpy-sse2.S Noah Goldstein
2022-07-12 23:23   ` H.J. Lu
2022-07-12 19:29 ` [PATCH v1] x86: Move memrchr SSE2 implementation to multiarch/memrchr-sse2.S Noah Goldstein
2022-07-12 22:58   ` H.J. Lu
2022-07-12 19:29 ` [PATCH v1] x86: Move strrchr SSE2 implementation to multiarch/strrchr-sse2.S Noah Goldstein
2022-07-12 22:28   ` H.J. Lu
2022-07-12 19:29 ` [PATCH v1] x86: Move strchr SSE2 implementation to multiarch/strchr-sse2.S Noah Goldstein
2022-07-12 21:27   ` H.J. Lu
2022-07-12 19:29 ` [PATCH v1] x86: Move strcat SSE2 implementation to multiarch/strcat-sse2.S Noah Goldstein
2022-07-12 21:16   ` H.J. Lu
2022-07-12 19:29 ` [PATCH v1] x86: Move wcschr SSE2 implementation to multiarch/wcschr-sse2.S Noah Goldstein
2022-07-12 20:55   ` H.J. Lu
2022-07-12 19:29 ` [PATCH v1] x86: Move wcslen SSE2 implementation to multiarch/wcslen-sse2.S Noah Goldstein
2022-07-12 20:26   ` H.J. Lu
2022-07-12 19:29 ` [PATCH v1] x86: Remove unneeded rtld-wmemcmp Noah Goldstein
2022-07-12 19:44   ` H.J. Lu
2022-07-12 19:29 ` [PATCH v1] x86: Add missing rtm tests for strcmp family Noah Goldstein
2022-07-12 19:59   ` H.J. Lu
2022-07-12 23:29 ` [PATCH v1] x86: Move strlen SSE2 implementation to multiarch/strlen-sse2.S H.J. Lu
2022-07-13  4:06   ` Noah Goldstein
