From mboxrd@z Thu Jan  1 00:00:00 1970
Return-Path: <nwg@sourceware.org>
Received: by sourceware.org (Postfix, from userid 7844)
 id 73CAD3858010; Wed, 13 Jul 2022 22:54:58 +0000 (GMT)
DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 73CAD3858010
Content-Type: text/plain; charset="us-ascii"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
From: Noah Goldstein <nwg@sourceware.org>
To: glibc-cvs@sourceware.org
Subject: [glibc] x86: Move strchr SSE2 implementation to
 multiarch/strchr-sse2.S
X-Act-Checkin: glibc
X-Git-Author: Noah Goldstein <goldstein.w.n@gmail.com>
X-Git-Refname: refs/heads/master
X-Git-Oldrev: 425647458b03652526f670da7a0c2605513cf450
X-Git-Newrev: cd080d07410426c5ce211509eb0d8fd0901f673a
Message-Id: <20220713225458.73CAD3858010@sourceware.org>
Date: Wed, 13 Jul 2022 22:54:58 +0000 (GMT)
X-BeenThere: glibc-cvs@sourceware.org
X-Mailman-Version: 2.1.29
Precedence: list
List-Id: Glibc-cvs mailing list <glibc-cvs.sourceware.org>
List-Unsubscribe: <https://sourceware.org/mailman/options/glibc-cvs>,
 <mailto:glibc-cvs-request@sourceware.org?subject=unsubscribe>
List-Archive: <https://sourceware.org/pipermail/glibc-cvs/>
List-Help: <mailto:glibc-cvs-request@sourceware.org?subject=help>
List-Subscribe: <https://sourceware.org/mailman/listinfo/glibc-cvs>,
 <mailto:glibc-cvs-request@sourceware.org?subject=subscribe>
X-List-Received-Date: Wed, 13 Jul 2022 22:54:58 -0000

https://sourceware.org/git/gitweb.cgi?p=glibc.git;h=cd080d07410426c5ce211509eb0d8fd0901f673a

commit cd080d07410426c5ce211509eb0d8fd0901f673a
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date:   Tue Jul 12 12:29:05 2022 -0700

    x86: Move strchr SSE2 implementation to multiarch/strchr-sse2.S
    
    This commit doesn't affect libc.so.6, it's just housekeeping to prepare
    for adding explicit ISA level support.
    
    Tested build on x86_64 and x86_32 with/without multiarch.

Diff:
---
 sysdeps/x86_64/multiarch/rtld-strchr.S    |  18 +++
 sysdeps/x86_64/multiarch/rtld-strchrnul.S |  18 +++
 sysdeps/x86_64/multiarch/strchr-sse2.S    | 175 ++++++++++++++++++++++++++++--
 sysdeps/x86_64/multiarch/strchrnul-sse2.S |  11 +-
 sysdeps/x86_64/strchr.S                   | 167 +---------------------------
 sysdeps/x86_64/strchrnul.S                |   7 +-
 6 files changed, 213 insertions(+), 183 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/rtld-strchr.S b/sysdeps/x86_64/multiarch/rtld-strchr.S
new file mode 100644
index 0000000000..2b7b879e37
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/rtld-strchr.S
@@ -0,0 +1,18 @@
+/* Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include "../strchr.S"
diff --git a/sysdeps/x86_64/multiarch/rtld-strchrnul.S b/sysdeps/x86_64/multiarch/rtld-strchrnul.S
new file mode 100644
index 0000000000..0cc5becc88
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/rtld-strchrnul.S
@@ -0,0 +1,18 @@
+/* Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include "../strchrnul.S"
diff --git a/sysdeps/x86_64/multiarch/strchr-sse2.S b/sysdeps/x86_64/multiarch/strchr-sse2.S
index 992f700077..f7767ca543 100644
--- a/sysdeps/x86_64/multiarch/strchr-sse2.S
+++ b/sysdeps/x86_64/multiarch/strchr-sse2.S
@@ -16,13 +16,172 @@
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
-#if IS_IN (libc)
-# define strchr __strchr_sse2
+#if IS_IN (libc) || defined STRCHR
+# ifndef STRCHR
+#  define STRCHR __strchr_sse2
+# endif
 
-# undef weak_alias
-# define weak_alias(strchr, index)
-# undef libc_hidden_builtin_def
-# define libc_hidden_builtin_def(strchr)
-#endif
+# include <sysdep.h>
+
+	.text
+ENTRY (STRCHR)	/* In: rdi = s, esi = c.  Out: rax = first c in s (see AS_STRCHRNUL).  */
+	movd	%esi, %xmm1	/* Low dword of xmm1 = c.  */
+	movl	%edi, %eax
+	andl	$4095, %eax	/* eax = offset of s within its 4 KiB page.  */
+	punpcklbw %xmm1, %xmm1	/* Duplicate the low byte into the low word.  */
+	cmpl	$4032, %eax	/* 4032 = 4096 - 64; flags survive the SSE ops below.  */
+	punpcklwd %xmm1, %xmm1	/* Duplicate the low word into the low dword.  */
+	pshufd	$0, %xmm1, %xmm1	/* xmm1 = c broadcast to all 16 bytes.  */
+	jg	L(cross_page)	/* Reading 64 bytes from s could cross the page.  */
+	movdqu	(%rdi), %xmm0	/* Unaligned load of s[0..15].  */
+	pxor	%xmm3, %xmm3	/* xmm3 = 0, used for the NUL compares.  */
+	movdqa	%xmm0, %xmm4
+	pcmpeqb	%xmm1, %xmm0	/* 0xff where byte == c.  */
+	pcmpeqb	%xmm3, %xmm4	/* 0xff where byte == 0.  */
+	por	%xmm4, %xmm0	/* 0xff where byte is c or NUL.  */
+	pmovmskb %xmm0, %eax	/* One mask bit per byte.  */
+	test	%eax, %eax
+	je	L(next_48_bytes)	/* Neither c nor NUL in the first 16 bytes.  */
+	bsf	%eax, %eax	/* Index of the first c-or-NUL byte.  */
+# ifdef AS_STRCHRNUL
+	leaq	(%rdi,%rax), %rax	/* Return the hit, whether it is c or the NUL.  */
+# else
+	movl	$0, %edx	/* rdx = NULL for the cmovne below.  */
+	leaq	(%rdi,%rax), %rax
+	cmpb	%sil, (%rax)	/* Is the byte we stopped on actually c?  */
+	cmovne	%rdx, %rax	/* No: we stopped on the NUL, return NULL.  */
+# endif
+	ret
+
+	.p2align 3
+L(next_48_bytes):	/* Build a mask for s[16..63] in bits 16..63 of rax.  */
+	movdqu	16(%rdi), %xmm0
+	movdqa	%xmm0, %xmm4
+	pcmpeqb	%xmm1, %xmm0
+	pcmpeqb	%xmm3, %xmm4
+	por	%xmm4, %xmm0
+	pmovmskb %xmm0, %ecx	/* ecx = mask for s[16..31].  */
+	movdqu	32(%rdi), %xmm0
+	movdqa	%xmm0, %xmm4
+	pcmpeqb	%xmm1, %xmm0
+	salq	$16, %rcx	/* Move that mask to bits 16..31.  */
+	pcmpeqb	%xmm3, %xmm4
+	por	%xmm4, %xmm0
+	pmovmskb %xmm0, %eax	/* eax = mask for s[32..47].  */
+	movdqu	48(%rdi), %xmm0
+	pcmpeqb	%xmm0, %xmm3	/* NUL mask for s[48..63]; overwrites the zero in xmm3.  */
+	salq	$32, %rax	/* Move the s[32..47] mask to bits 32..47.  */
+	pcmpeqb	%xmm1, %xmm0
+	orq	%rcx, %rax
+	por	%xmm3, %xmm0
+	pmovmskb %xmm0, %ecx	/* ecx = mask for s[48..63].  */
+	salq	$48, %rcx
+	orq	%rcx, %rax	/* Bit i of rax set => s[i] is c or NUL.  */
+	testq	%rax, %rax
+	jne	L(return)
+L(loop_start):
+	/* We use this alignment to force the loop to be aligned to 8 but
+	   not 16 bytes.  This gives better scheduling on AMD processors.  */
+	.p2align 4
+	pxor	%xmm6, %xmm6	/* xmm6 = 0 for the whole loop.  */
+	andq	$-64, %rdi	/* Round rdi down to a 64-byte boundary.  */
+	.p2align 3
+L(loop64):	/* Scan one aligned 64-byte block per iteration.  */
+	addq	$64, %rdi	/* Advance to the next 64-byte block.  */
+	movdqa	(%rdi), %xmm5
+	movdqa	16(%rdi), %xmm2
+	movdqa	32(%rdi), %xmm3
+	pxor	%xmm1, %xmm5	/* byte ^ c is 0 exactly where byte == c.  */
+	movdqa	48(%rdi), %xmm4
+	pxor	%xmm1, %xmm2
+	pxor	%xmm1, %xmm3
+	pminub	(%rdi), %xmm5	/* min (byte ^ c, byte) is 0 iff byte is c or NUL.  */
+	pxor	%xmm1, %xmm4
+	pminub	16(%rdi), %xmm2
+	pminub	32(%rdi), %xmm3
+	pminub	%xmm2, %xmm5	/* Fold the four minima into xmm5.  */
+	pminub	48(%rdi), %xmm4
+	pminub	%xmm3, %xmm5
+	pminub	%xmm4, %xmm5
+	pcmpeqb %xmm6, %xmm5	/* 0xff where the folded minimum is 0.  */
+	pmovmskb %xmm5, %eax
+
+	testl	%eax, %eax
+	je	L(loop64)	/* No c or NUL anywhere in these 64 bytes.  */
 
-#include "../strchr.S"
+	movdqa	(%rdi), %xmm5	/* xmm5 was folded above; redo the first 16 bytes.  */
+	movdqa	%xmm5, %xmm0
+	pcmpeqb	%xmm1, %xmm5
+	pcmpeqb	%xmm6, %xmm0
+	por	%xmm0, %xmm5	/* c-or-NUL mask for bytes 0..15 of the block.  */
+	pcmpeqb %xmm6, %xmm2	/* xmm2..xmm4 still hold per-vector minima.  */
+	pcmpeqb %xmm6, %xmm3	/* Comparing them with 0 yields the hit masks.  */
+	pcmpeqb %xmm6, %xmm4
+
+	pmovmskb %xmm5, %ecx	/* Bits for bytes 0..15.  */
+	pmovmskb %xmm2, %eax	/* Bits for bytes 16..31.  */
+	salq	$16, %rax
+	pmovmskb %xmm3, %r8d	/* Bits for bytes 32..47.  */
+	pmovmskb %xmm4, %edx	/* Bits for bytes 48..63.  */
+	salq	$32, %r8
+	orq	%r8, %rax
+	orq	%rcx, %rax
+	salq	$48, %rdx
+	orq	%rdx, %rax	/* rax = 64-bit hit mask for the block at rdi.  */
+	.p2align 3
+L(return):	/* In: rax = nonzero hit mask, bit i <=> byte at rdi + i.  */
+	bsfq	%rax, %rax	/* Index of the first c-or-NUL byte.  */
+# ifdef AS_STRCHRNUL
+	leaq	(%rdi,%rax), %rax	/* Return the hit, whether it is c or the NUL.  */
+# else
+	movl	$0, %edx	/* rdx = NULL for the cmovne below.  */
+	leaq	(%rdi,%rax), %rax
+	cmpb	%sil, (%rax)	/* Is the byte we stopped on actually c?  */
+	cmovne	%rdx, %rax	/* No: we stopped on the NUL, return NULL.  */
+# endif
+	ret
+	.p2align 4
+
+L(cross_page):	/* Check the aligned 64-byte block containing s instead.  */
+	movq	%rdi, %rdx
+	pxor	%xmm2, %xmm2	/* xmm2 = 0 for the NUL compares.  */
+	andq	$-64, %rdx	/* rdx = s rounded down to a 64-byte boundary.  */
+	movdqa	%xmm1, %xmm0	/* Spare copy of the broadcast c for the last vector.  */
+	movdqa	(%rdx), %xmm3
+	movdqa	%xmm3, %xmm4
+	pcmpeqb	%xmm1, %xmm3
+	pcmpeqb	%xmm2, %xmm4
+	por	%xmm4, %xmm3
+	pmovmskb %xmm3, %r8d	/* r8d = mask for block bytes 0..15.  */
+	movdqa	16(%rdx), %xmm3
+	movdqa	%xmm3, %xmm4
+	pcmpeqb	%xmm1, %xmm3
+	pcmpeqb	%xmm2, %xmm4
+	por	%xmm4, %xmm3
+	pmovmskb %xmm3, %eax	/* eax = mask for block bytes 16..31.  */
+	movdqa	32(%rdx), %xmm3
+	movdqa	%xmm3, %xmm4
+	pcmpeqb	%xmm1, %xmm3
+	salq	$16, %rax
+	pcmpeqb	%xmm2, %xmm4
+	por	%xmm4, %xmm3
+	pmovmskb %xmm3, %r9d	/* r9d = mask for block bytes 32..47.  */
+	movdqa	48(%rdx), %xmm3
+	pcmpeqb	%xmm3, %xmm2	/* NUL mask for bytes 48..63; overwrites the zero in xmm2.  */
+	salq	$32, %r9
+	pcmpeqb	%xmm3, %xmm0	/* c mask for bytes 48..63.  */
+	orq	%r9, %rax
+	orq	%r8, %rax
+	por	%xmm2, %xmm0
+	pmovmskb %xmm0, %ecx
+	salq	$48, %rcx
+	orq	%rcx, %rax	/* rax = hit mask for the whole aligned block.  */
+	movl	%edi, %ecx
+	subb	%dl, %cl	/* cl = s - rdx, the offset of s inside the block.  */
+	shrq	%cl, %rax	/* Drop hits before s; bit i now maps to s[i].  */
+	testq	%rax, %rax
+	jne	L(return)	/* rdi is still s, so L(return) yields s + index.  */
+	jmp	L(loop_start)	/* Nothing from s to the block end; scan onwards.  */
+
+END (STRCHR)
+#endif
diff --git a/sysdeps/x86_64/multiarch/strchrnul-sse2.S b/sysdeps/x86_64/multiarch/strchrnul-sse2.S
index f91c670369..7238977a21 100644
--- a/sysdeps/x86_64/multiarch/strchrnul-sse2.S
+++ b/sysdeps/x86_64/multiarch/strchrnul-sse2.S
@@ -17,10 +17,11 @@
    <https://www.gnu.org/licenses/>.  */
 
 #if IS_IN (libc)
-# define __strchrnul __strchrnul_sse2
-
-# undef weak_alias
-# define weak_alias(__strchrnul, strchrnul)
+# ifndef STRCHR
+#  define STRCHR	__strchrnul_sse2
+# endif
 #endif
 
-#include "../strchrnul.S"
+#define AS_STRCHRNUL
+
+#include "strchr-sse2.S"
diff --git a/sysdeps/x86_64/strchr.S b/sysdeps/x86_64/strchr.S
index dda7c0431d..77c956c92c 100644
--- a/sysdeps/x86_64/strchr.S
+++ b/sysdeps/x86_64/strchr.S
@@ -17,171 +17,8 @@
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
-#include <sysdep.h>
 
-	.text
-ENTRY (strchr)
-	movd	%esi, %xmm1
-	movl	%edi, %eax
-	andl	$4095, %eax
-	punpcklbw %xmm1, %xmm1
-	cmpl	$4032, %eax
-	punpcklwd %xmm1, %xmm1
-	pshufd	$0, %xmm1, %xmm1
-	jg	L(cross_page)
-	movdqu	(%rdi), %xmm0
-	pxor	%xmm3, %xmm3
-	movdqa	%xmm0, %xmm4
-	pcmpeqb	%xmm1, %xmm0
-	pcmpeqb	%xmm3, %xmm4
-	por	%xmm4, %xmm0
-	pmovmskb %xmm0, %eax
-	test	%eax, %eax
-	je	L(next_48_bytes)
-	bsf	%eax, %eax
-#ifdef AS_STRCHRNUL
-	leaq	(%rdi,%rax), %rax
-#else
-	movl	$0, %edx
-	leaq	(%rdi,%rax), %rax
-	cmpb	%sil, (%rax)
-	cmovne	%rdx, %rax
-#endif
-	ret
-
-	.p2align 3
-	L(next_48_bytes):
-	movdqu	16(%rdi), %xmm0
-	movdqa	%xmm0, %xmm4
-	pcmpeqb	%xmm1, %xmm0
-	pcmpeqb	%xmm3, %xmm4
-	por	%xmm4, %xmm0
-	pmovmskb %xmm0, %ecx
-	movdqu	32(%rdi), %xmm0
-	movdqa	%xmm0, %xmm4
-	pcmpeqb	%xmm1, %xmm0
-	salq	$16, %rcx
-	pcmpeqb	%xmm3, %xmm4
-	por	%xmm4, %xmm0
-	pmovmskb %xmm0, %eax
-	movdqu	48(%rdi), %xmm0
-	pcmpeqb	%xmm0, %xmm3
-	salq	$32, %rax
-	pcmpeqb	%xmm1, %xmm0
-	orq	%rcx, %rax
-	por	%xmm3, %xmm0
-	pmovmskb %xmm0, %ecx
-	salq	$48, %rcx
-	orq	%rcx, %rax
-	testq	%rax, %rax
-	jne	L(return)
-L(loop_start):
-	/* We use this alignment to force loop be aligned to 8 but not
-	   16 bytes.  This gives better sheduling on AMD processors.  */
-	.p2align 4
-	pxor	%xmm6, %xmm6
-	andq	$-64, %rdi
-	.p2align 3
-L(loop64):
-	addq	$64, %rdi
-	movdqa	(%rdi), %xmm5
-	movdqa	16(%rdi), %xmm2
-	movdqa	32(%rdi), %xmm3
-	pxor	%xmm1, %xmm5
-	movdqa	48(%rdi), %xmm4
-	pxor	%xmm1, %xmm2
-	pxor	%xmm1, %xmm3
-	pminub	(%rdi), %xmm5
-	pxor	%xmm1, %xmm4
-	pminub	16(%rdi), %xmm2
-	pminub	32(%rdi), %xmm3
-	pminub	%xmm2, %xmm5
-	pminub	48(%rdi), %xmm4
-	pminub	%xmm3, %xmm5
-	pminub	%xmm4, %xmm5
-	pcmpeqb %xmm6, %xmm5
-	pmovmskb %xmm5, %eax
-
-	testl	%eax, %eax
-	je	L(loop64)
-
-	movdqa	(%rdi), %xmm5
-	movdqa	%xmm5, %xmm0
-	pcmpeqb	%xmm1, %xmm5
-	pcmpeqb	%xmm6, %xmm0
-	por	%xmm0, %xmm5
-	pcmpeqb %xmm6, %xmm2
-	pcmpeqb %xmm6, %xmm3
-	pcmpeqb %xmm6, %xmm4
-
-	pmovmskb %xmm5, %ecx
-	pmovmskb %xmm2, %eax
-	salq	$16, %rax
-	pmovmskb %xmm3, %r8d
-	pmovmskb %xmm4, %edx
-	salq	$32, %r8
-	orq	%r8, %rax
-	orq	%rcx, %rax
-	salq	$48, %rdx
-	orq	%rdx, %rax
-	.p2align 3
-L(return):
-	bsfq	%rax, %rax
-#ifdef AS_STRCHRNUL
-	leaq	(%rdi,%rax), %rax
-#else
-	movl	$0, %edx
-	leaq	(%rdi,%rax), %rax
-	cmpb	%sil, (%rax)
-	cmovne	%rdx, %rax
-#endif
-	ret
-	.p2align 4
-
-L(cross_page):
-	movq	%rdi, %rdx
-	pxor	%xmm2, %xmm2
-	andq	$-64, %rdx
-	movdqa	%xmm1, %xmm0
-	movdqa	(%rdx), %xmm3
-	movdqa	%xmm3, %xmm4
-	pcmpeqb	%xmm1, %xmm3
-	pcmpeqb	%xmm2, %xmm4
-	por	%xmm4, %xmm3
-	pmovmskb %xmm3, %r8d
-	movdqa	16(%rdx), %xmm3
-	movdqa	%xmm3, %xmm4
-	pcmpeqb	%xmm1, %xmm3
-	pcmpeqb	%xmm2, %xmm4
-	por	%xmm4, %xmm3
-	pmovmskb %xmm3, %eax
-	movdqa	32(%rdx), %xmm3
-	movdqa	%xmm3, %xmm4
-	pcmpeqb	%xmm1, %xmm3
-	salq	$16, %rax
-	pcmpeqb	%xmm2, %xmm4
-	por	%xmm4, %xmm3
-	pmovmskb %xmm3, %r9d
-	movdqa	48(%rdx), %xmm3
-	pcmpeqb	%xmm3, %xmm2
-	salq	$32, %r9
-	pcmpeqb	%xmm3, %xmm0
-	orq	%r9, %rax
-	orq	%r8, %rax
-	por	%xmm2, %xmm0
-	pmovmskb %xmm0, %ecx
-	salq	$48, %rcx
-	orq	%rcx, %rax
-	movl	%edi, %ecx
-	subb	%dl, %cl
-	shrq	%cl, %rax
-	testq	%rax, %rax
-	jne	L(return)
-	jmp	L(loop_start)
-
-END (strchr)
-
-#ifndef AS_STRCHRNUL
+#define STRCHR strchr
+#include "multiarch/strchr-sse2.S"
 weak_alias (strchr, index)
 libc_hidden_builtin_def (strchr)
-#endif
diff --git a/sysdeps/x86_64/strchrnul.S b/sysdeps/x86_64/strchrnul.S
index ec2e652e25..508e42db26 100644
--- a/sysdeps/x86_64/strchrnul.S
+++ b/sysdeps/x86_64/strchrnul.S
@@ -18,10 +18,7 @@
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
-#include <sysdep.h>
-
-#define strchr __strchrnul
-#define AS_STRCHRNUL
-#include "strchr.S"
+#define STRCHR __strchrnul
+#include "multiarch/strchrnul-sse2.S"
 
 weak_alias (__strchrnul, strchrnul)