* [PATCH v1] x86: Move strlen SSE2 implementation to multiarch/strlen-sse2.S
@ 2022-07-12 19:29 Noah Goldstein
2022-07-12 19:29 ` [PATCH v1] x86: Move strcpy SSE2 implementation to multiarch/strcpy-sse2.S Noah Goldstein
` (9 more replies)
0 siblings, 10 replies; 21+ messages in thread
From: Noah Goldstein @ 2022-07-12 19:29 UTC (permalink / raw)
To: libc-alpha
This commit doesn't affect libc.so.6; it's just housekeeping to prepare
for adding explicit ISA level support.
Tested build on x86_64 and x86_32 with/without multiarch.
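
As background for review: the core of the scan being moved compares 16
input bytes against zero with pcmpeqb, converts the result to a bitmask
with pmovmskb, and takes the index of the first set bit with bsf; the
FIND_ZERO macro in the diff below does this 64 bytes at a time. A
minimal C sketch of the idea, assuming SSE2 intrinsics and the
GCC/Clang __builtin_ctz builtin (the function name is illustrative, not
glibc's, and the AS_STRNLEN/AS_WCSLEN variants are omitted):

#include <emmintrin.h>
#include <stddef.h>
#include <stdint.h>

size_t
sse2_strlen_sketch (const char *s)
{
  /* Align down to 16 bytes; an aligned 16-byte load never crosses a
     page boundary, so reading a few bytes before S is safe.  */
  const __m128i *vp = (const __m128i *) ((uintptr_t) s & ~(uintptr_t) 15);
  const __m128i zero = _mm_setzero_si128 ();
  unsigned int mask
    = _mm_movemask_epi8 (_mm_cmpeq_epi8 (_mm_load_si128 (vp), zero));
  /* Discard match bits for bytes before the start of the string (the
     assembly does this with the `sarq %cl, %rdx` in PROLOG).  */
  mask >>= (uintptr_t) s & 15;
  if (mask != 0)
    return __builtin_ctz (mask);
  for (;;)
    {
      ++vp;
      mask = _mm_movemask_epi8 (_mm_cmpeq_epi8 (_mm_load_si128 (vp), zero));
      if (mask != 0)
        return (size_t) ((const char *) vp - s) + __builtin_ctz (mask);
    }
}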
---
sysdeps/x86_64/multiarch/rtld-strlen.S | 18 ++
sysdeps/x86_64/multiarch/rtld-strnlen.S | 18 ++
sysdeps/x86_64/multiarch/strlen-sse2.S | 260 ++++++++++++++++++++-
sysdeps/x86_64/multiarch/strlen-vec.S | 267 ----------------------
sysdeps/x86_64/multiarch/strnlen-sse2.S | 12 +-
sysdeps/x86_64/multiarch/wcslen-sse4_1.S | 4 +-
sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S | 4 +-
sysdeps/x86_64/strlen.S | 3 +-
sysdeps/x86_64/strnlen.S | 6 +-
9 files changed, 306 insertions(+), 286 deletions(-)
create mode 100644 sysdeps/x86_64/multiarch/rtld-strlen.S
create mode 100644 sysdeps/x86_64/multiarch/rtld-strnlen.S
delete mode 100644 sysdeps/x86_64/multiarch/strlen-vec.S
diff --git a/sysdeps/x86_64/multiarch/rtld-strlen.S b/sysdeps/x86_64/multiarch/rtld-strlen.S
new file mode 100644
index 0000000000..609d26256e
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/rtld-strlen.S
@@ -0,0 +1,18 @@
+/* Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "../strlen.S"
diff --git a/sysdeps/x86_64/multiarch/rtld-strnlen.S b/sysdeps/x86_64/multiarch/rtld-strnlen.S
new file mode 100644
index 0000000000..ef2d64abc2
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/rtld-strnlen.S
@@ -0,0 +1,18 @@
+/* Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "../strnlen.S"
diff --git a/sysdeps/x86_64/multiarch/strlen-sse2.S b/sysdeps/x86_64/multiarch/strlen-sse2.S
index 660b327ed2..5be72267d5 100644
--- a/sysdeps/x86_64/multiarch/strlen-sse2.S
+++ b/sysdeps/x86_64/multiarch/strlen-sse2.S
@@ -16,8 +16,260 @@
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
-#if IS_IN (libc)
-# define strlen __strlen_sse2
-#endif
+#if IS_IN (libc) || defined STRLEN
+
+# ifndef STRLEN
+# define STRLEN __strlen_sse2
+# endif
+
+
+# include <sysdep.h>
+
+# ifdef AS_WCSLEN
+# define PMINU pminud
+# define PCMPEQ pcmpeqd
+# define SHIFT_RETURN shrq $2, %rax
+# else
+# define PMINU pminub
+# define PCMPEQ pcmpeqb
+# define SHIFT_RETURN
+# endif
+
+# ifndef SECTION
+# define SECTION(p) p
+# endif
+
+/* Long lived registers in strlen (s), strnlen (s, n) are:
+
+ %xmm3 - zero
+ %rdi - s
+ %r10 (s+n) & (~(64-1))
+ %r11 s+n
+*/
+
+
+ .section SECTION(.text),"ax",@progbits
+ENTRY(STRLEN)
+
+/* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx. */
+# define FIND_ZERO \
+ PCMPEQ (%rax), %xmm0; \
+ PCMPEQ 16(%rax), %xmm1; \
+ PCMPEQ 32(%rax), %xmm2; \
+ PCMPEQ 48(%rax), %xmm3; \
+ pmovmskb %xmm0, %esi; \
+ pmovmskb %xmm1, %edx; \
+ pmovmskb %xmm2, %r8d; \
+ pmovmskb %xmm3, %ecx; \
+ salq $16, %rdx; \
+ salq $16, %rcx; \
+ orq %rsi, %rdx; \
+ orq %r8, %rcx; \
+ salq $32, %rcx; \
+ orq %rcx, %rdx;
+
+# ifdef AS_STRNLEN
+/* Do not read anything when n==0. */
+ test %RSI_LP, %RSI_LP
+ jne L(n_nonzero)
+ xor %rax, %rax
+ ret
+L(n_nonzero):
+# ifdef AS_WCSLEN
+/* Check for overflow from maxlen * sizeof(wchar_t). If it would
+ overflow, the only way this program avoids undefined behavior is if
+ there is a null terminator in valid memory, in which case wcslen
+ will suffice. */
+ mov %RSI_LP, %R10_LP
+ sar $62, %R10_LP
+ jnz __wcslen_sse4_1
+ sal $2, %RSI_LP
+# endif
+
+/* Initialize long lived registers. */
+ add %RDI_LP, %RSI_LP
+ mov %RSI_LP, %R10_LP
+ and $-64, %R10_LP
+ mov %RSI_LP, %R11_LP
+# endif
+
+ pxor %xmm0, %xmm0
+ pxor %xmm1, %xmm1
+ pxor %xmm2, %xmm2
+ pxor %xmm3, %xmm3
+ movq %rdi, %rax
+ movq %rdi, %rcx
+ andq $4095, %rcx
+/* Offsets 4032-4047 will be aligned down to 4032 and thus fit within the page. */
+ cmpq $4047, %rcx
+/* We cannot unify this branching as it would be ~6 cycles slower. */
+ ja L(cross_page)
+
+# ifdef AS_STRNLEN
+/* Test if end is among first 64 bytes. */
+# define STRNLEN_PROLOG \
+ mov %r11, %rsi; \
+ subq %rax, %rsi; \
+ andq $-64, %rax; \
+ testq $-64, %rsi; \
+ je L(strnlen_ret)
+# else
+# define STRNLEN_PROLOG andq $-64, %rax;
+# endif
+
+/* Ignore bits in mask that come before start of string. */
+# define PROLOG(lab) \
+ movq %rdi, %rcx; \
+ xorq %rax, %rcx; \
+ STRNLEN_PROLOG; \
+ sarq %cl, %rdx; \
+ test %rdx, %rdx; \
+ je L(lab); \
+ bsfq %rdx, %rax; \
+ SHIFT_RETURN; \
+ ret
+
+# ifdef AS_STRNLEN
+ andq $-16, %rax
+ FIND_ZERO
+# else
+ /* Test first 16 bytes unaligned. */
+ movdqu (%rax), %xmm4
+ PCMPEQ %xmm0, %xmm4
+ pmovmskb %xmm4, %edx
+ test %edx, %edx
+ je L(next48_bytes)
+ bsf %edx, %eax /* If eax is zeroed 16bit bsf can be used. */
+ SHIFT_RETURN
+ ret
+
+L(next48_bytes):
+/* Same as FIND_ZERO except we do not check first 16 bytes. */
+ andq $-16, %rax
+ PCMPEQ 16(%rax), %xmm1
+ PCMPEQ 32(%rax), %xmm2
+ PCMPEQ 48(%rax), %xmm3
+ pmovmskb %xmm1, %edx
+ pmovmskb %xmm2, %r8d
+ pmovmskb %xmm3, %ecx
+ salq $16, %rdx
+ salq $16, %rcx
+ orq %r8, %rcx
+ salq $32, %rcx
+ orq %rcx, %rdx
+# endif
-#include "strlen-vec.S"
+ /* When no zero byte is found xmm1-3 are zero so we do not have to
+ zero them. */
+ PROLOG(loop)
+
+ .p2align 4
+L(cross_page):
+ andq $-64, %rax
+ FIND_ZERO
+ PROLOG(loop_init)
+
+# ifdef AS_STRNLEN
+/* We must do this check to correctly handle strnlen (s, -1). */
+L(strnlen_ret):
+ bts %rsi, %rdx
+ sarq %cl, %rdx
+ test %rdx, %rdx
+ je L(loop_init)
+ bsfq %rdx, %rax
+ SHIFT_RETURN
+ ret
+# endif
+ .p2align 4
+L(loop_init):
+ pxor %xmm1, %xmm1
+ pxor %xmm2, %xmm2
+ pxor %xmm3, %xmm3
+# ifdef AS_STRNLEN
+ .p2align 4
+L(loop):
+
+ addq $64, %rax
+ cmpq %rax, %r10
+ je L(exit_end)
+
+ movdqa (%rax), %xmm0
+ PMINU 16(%rax), %xmm0
+ PMINU 32(%rax), %xmm0
+ PMINU 48(%rax), %xmm0
+ PCMPEQ %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ testl %edx, %edx
+ jne L(exit)
+ jmp L(loop)
+
+ .p2align 4
+L(exit_end):
+ cmp %rax, %r11
+ je L(first) /* Do not read when end is at page boundary. */
+ pxor %xmm0, %xmm0
+ FIND_ZERO
+
+L(first):
+ bts %r11, %rdx
+ bsfq %rdx, %rdx
+ addq %rdx, %rax
+ subq %rdi, %rax
+ SHIFT_RETURN
+ ret
+
+ .p2align 4
+L(exit):
+ pxor %xmm0, %xmm0
+ FIND_ZERO
+
+ bsfq %rdx, %rdx
+ addq %rdx, %rax
+ subq %rdi, %rax
+ SHIFT_RETURN
+ ret
+
+# else
+
+ /* Main loop. Unrolled twice to improve L2 cache performance on core2. */
+ .p2align 4
+L(loop):
+
+ movdqa 64(%rax), %xmm0
+ PMINU 80(%rax), %xmm0
+ PMINU 96(%rax), %xmm0
+ PMINU 112(%rax), %xmm0
+ PCMPEQ %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ testl %edx, %edx
+ jne L(exit64)
+
+ subq $-128, %rax
+
+ movdqa (%rax), %xmm0
+ PMINU 16(%rax), %xmm0
+ PMINU 32(%rax), %xmm0
+ PMINU 48(%rax), %xmm0
+ PCMPEQ %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ testl %edx, %edx
+ jne L(exit0)
+ jmp L(loop)
+
+ .p2align 4
+L(exit64):
+ addq $64, %rax
+L(exit0):
+ pxor %xmm0, %xmm0
+ FIND_ZERO
+
+ bsfq %rdx, %rdx
+ addq %rdx, %rax
+ subq %rdi, %rax
+ SHIFT_RETURN
+ ret
+
+# endif
+
+END(STRLEN)
+#endif
diff --git a/sysdeps/x86_64/multiarch/strlen-vec.S b/sysdeps/x86_64/multiarch/strlen-vec.S
deleted file mode 100644
index 874123d604..0000000000
--- a/sysdeps/x86_64/multiarch/strlen-vec.S
+++ /dev/null
@@ -1,267 +0,0 @@
-/* SSE2 version of strlen and SSE4.1 version of wcslen.
- Copyright (C) 2012-2022 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <https://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-
-#ifdef AS_WCSLEN
-# define PMINU pminud
-# define PCMPEQ pcmpeqd
-# define SHIFT_RETURN shrq $2, %rax
-#else
-# define PMINU pminub
-# define PCMPEQ pcmpeqb
-# define SHIFT_RETURN
-#endif
-
-#ifndef SECTION
-# define SECTION(p) p
-#endif
-
-/* Long lived registers in strlen (s), strnlen (s, n) are:
-
- %xmm3 - zero
- %rdi - s
- %r10 (s+n) & (~(64-1))
- %r11 s+n
-*/
-
-
- .section SECTION(.text),"ax",@progbits
-ENTRY(strlen)
-
-/* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx. */
-#define FIND_ZERO \
- PCMPEQ (%rax), %xmm0; \
- PCMPEQ 16(%rax), %xmm1; \
- PCMPEQ 32(%rax), %xmm2; \
- PCMPEQ 48(%rax), %xmm3; \
- pmovmskb %xmm0, %esi; \
- pmovmskb %xmm1, %edx; \
- pmovmskb %xmm2, %r8d; \
- pmovmskb %xmm3, %ecx; \
- salq $16, %rdx; \
- salq $16, %rcx; \
- orq %rsi, %rdx; \
- orq %r8, %rcx; \
- salq $32, %rcx; \
- orq %rcx, %rdx;
-
-#ifdef AS_STRNLEN
-/* Do not read anything when n==0. */
- test %RSI_LP, %RSI_LP
- jne L(n_nonzero)
- xor %rax, %rax
- ret
-L(n_nonzero):
-# ifdef AS_WCSLEN
-/* Check for overflow from maxlen * sizeof(wchar_t). If it would
- overflow, the only way this program avoids undefined behavior is if
- there is a null terminator in valid memory, in which case wcslen
- will suffice. */
- mov %RSI_LP, %R10_LP
- sar $62, %R10_LP
- jnz __wcslen_sse4_1
- sal $2, %RSI_LP
-# endif
-
-/* Initialize long lived registers. */
- add %RDI_LP, %RSI_LP
- mov %RSI_LP, %R10_LP
- and $-64, %R10_LP
- mov %RSI_LP, %R11_LP
-#endif
-
- pxor %xmm0, %xmm0
- pxor %xmm1, %xmm1
- pxor %xmm2, %xmm2
- pxor %xmm3, %xmm3
- movq %rdi, %rax
- movq %rdi, %rcx
- andq $4095, %rcx
-/* Offsets 4032-4047 will be aligned down to 4032 and thus fit within the page. */
- cmpq $4047, %rcx
-/* We cannot unify this branching as it would be ~6 cycles slower. */
- ja L(cross_page)
-
-#ifdef AS_STRNLEN
-/* Test if end is among first 64 bytes. */
-# define STRNLEN_PROLOG \
- mov %r11, %rsi; \
- subq %rax, %rsi; \
- andq $-64, %rax; \
- testq $-64, %rsi; \
- je L(strnlen_ret)
-#else
-# define STRNLEN_PROLOG andq $-64, %rax;
-#endif
-
-/* Ignore bits in mask that come before start of string. */
-#define PROLOG(lab) \
- movq %rdi, %rcx; \
- xorq %rax, %rcx; \
- STRNLEN_PROLOG; \
- sarq %cl, %rdx; \
- test %rdx, %rdx; \
- je L(lab); \
- bsfq %rdx, %rax; \
- SHIFT_RETURN; \
- ret
-
-#ifdef AS_STRNLEN
- andq $-16, %rax
- FIND_ZERO
-#else
- /* Test first 16 bytes unaligned. */
- movdqu (%rax), %xmm4
- PCMPEQ %xmm0, %xmm4
- pmovmskb %xmm4, %edx
- test %edx, %edx
- je L(next48_bytes)
- bsf %edx, %eax /* If eax is zeroed 16bit bsf can be used. */
- SHIFT_RETURN
- ret
-
-L(next48_bytes):
-/* Same as FIND_ZERO except we do not check first 16 bytes. */
- andq $-16, %rax
- PCMPEQ 16(%rax), %xmm1
- PCMPEQ 32(%rax), %xmm2
- PCMPEQ 48(%rax), %xmm3
- pmovmskb %xmm1, %edx
- pmovmskb %xmm2, %r8d
- pmovmskb %xmm3, %ecx
- salq $16, %rdx
- salq $16, %rcx
- orq %r8, %rcx
- salq $32, %rcx
- orq %rcx, %rdx
-#endif
-
- /* When no zero byte is found xmm1-3 are zero so we do not have to
- zero them. */
- PROLOG(loop)
-
- .p2align 4
-L(cross_page):
- andq $-64, %rax
- FIND_ZERO
- PROLOG(loop_init)
-
-#ifdef AS_STRNLEN
-/* We must do this check to correctly handle strnlen (s, -1). */
-L(strnlen_ret):
- bts %rsi, %rdx
- sarq %cl, %rdx
- test %rdx, %rdx
- je L(loop_init)
- bsfq %rdx, %rax
- SHIFT_RETURN
- ret
-#endif
- .p2align 4
-L(loop_init):
- pxor %xmm1, %xmm1
- pxor %xmm2, %xmm2
- pxor %xmm3, %xmm3
-#ifdef AS_STRNLEN
- .p2align 4
-L(loop):
-
- addq $64, %rax
- cmpq %rax, %r10
- je L(exit_end)
-
- movdqa (%rax), %xmm0
- PMINU 16(%rax), %xmm0
- PMINU 32(%rax), %xmm0
- PMINU 48(%rax), %xmm0
- PCMPEQ %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- testl %edx, %edx
- jne L(exit)
- jmp L(loop)
-
- .p2align 4
-L(exit_end):
- cmp %rax, %r11
- je L(first) /* Do not read when end is at page boundary. */
- pxor %xmm0, %xmm0
- FIND_ZERO
-
-L(first):
- bts %r11, %rdx
- bsfq %rdx, %rdx
- addq %rdx, %rax
- subq %rdi, %rax
- SHIFT_RETURN
- ret
-
- .p2align 4
-L(exit):
- pxor %xmm0, %xmm0
- FIND_ZERO
-
- bsfq %rdx, %rdx
- addq %rdx, %rax
- subq %rdi, %rax
- SHIFT_RETURN
- ret
-
-#else
-
- /* Main loop. Unrolled twice to improve L2 cache performance on core2. */
- .p2align 4
-L(loop):
-
- movdqa 64(%rax), %xmm0
- PMINU 80(%rax), %xmm0
- PMINU 96(%rax), %xmm0
- PMINU 112(%rax), %xmm0
- PCMPEQ %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- testl %edx, %edx
- jne L(exit64)
-
- subq $-128, %rax
-
- movdqa (%rax), %xmm0
- PMINU 16(%rax), %xmm0
- PMINU 32(%rax), %xmm0
- PMINU 48(%rax), %xmm0
- PCMPEQ %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- testl %edx, %edx
- jne L(exit0)
- jmp L(loop)
-
- .p2align 4
-L(exit64):
- addq $64, %rax
-L(exit0):
- pxor %xmm0, %xmm0
- FIND_ZERO
-
- bsfq %rdx, %rdx
- addq %rdx, %rax
- subq %rdi, %rax
- SHIFT_RETURN
- ret
-
-#endif
-
-END(strlen)
diff --git a/sysdeps/x86_64/multiarch/strnlen-sse2.S b/sysdeps/x86_64/multiarch/strnlen-sse2.S
index c4f395c210..a50c7d6a28 100644
--- a/sysdeps/x86_64/multiarch/strnlen-sse2.S
+++ b/sysdeps/x86_64/multiarch/strnlen-sse2.S
@@ -17,12 +17,10 @@
<https://www.gnu.org/licenses/>. */
#if IS_IN (libc)
-# define __strnlen __strnlen_sse2
-
-# undef weak_alias
-# define weak_alias(__strnlen, strnlen)
-# undef libc_hidden_builtin_def
-# define libc_hidden_builtin_def(strnlen)
+# ifndef STRLEN
+# define STRLEN __strnlen_sse2
+# endif
#endif
-#include "../strnlen.S"
+#define AS_STRNLEN
+#include "strlen-sse2.S"
diff --git a/sysdeps/x86_64/multiarch/wcslen-sse4_1.S b/sysdeps/x86_64/multiarch/wcslen-sse4_1.S
index e306a77f51..c88e8342a1 100644
--- a/sysdeps/x86_64/multiarch/wcslen-sse4_1.S
+++ b/sysdeps/x86_64/multiarch/wcslen-sse4_1.S
@@ -1,5 +1,5 @@
#define AS_WCSLEN
-#define strlen __wcslen_sse4_1
+#define STRLEN __wcslen_sse4_1
#define SECTION(p) p##.sse4.1
-#include "strlen-vec.S"
+#include "strlen-sse2.S"
diff --git a/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S b/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S
index d2f7dd6e22..17cdedc2a9 100644
--- a/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S
+++ b/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S
@@ -1,6 +1,6 @@
#define AS_WCSLEN
#define AS_STRNLEN
-#define strlen __wcsnlen_sse4_1
+#define STRLEN __wcsnlen_sse4_1
#define SECTION(p) p##.sse4.1
-#include "strlen-vec.S"
+#include "strlen-sse2.S"
diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
index e1f0b19f2f..c2f5674f8d 100644
--- a/sysdeps/x86_64/strlen.S
+++ b/sysdeps/x86_64/strlen.S
@@ -16,6 +16,7 @@
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
-#include "multiarch/strlen-vec.S"
+#define STRLEN strlen
+#include "multiarch/strlen-sse2.S"
libc_hidden_builtin_def (strlen)
diff --git a/sysdeps/x86_64/strnlen.S b/sysdeps/x86_64/strnlen.S
index d3c43ac482..174970d58f 100644
--- a/sysdeps/x86_64/strnlen.S
+++ b/sysdeps/x86_64/strnlen.S
@@ -1,6 +1,6 @@
-#define AS_STRNLEN
-#define strlen __strnlen
-#include "strlen.S"
+#define STRLEN __strnlen
+#include "multiarch/strnlen-sse2.S"
+libc_hidden_def (__strnlen)
weak_alias (__strnlen, strnlen);
libc_hidden_builtin_def (strnlen)
--
2.34.1
* [PATCH v1] x86: Move strcpy SSE2 implementation to multiarch/strcpy-sse2.S
2022-07-12 19:29 [PATCH v1] x86: Move strlen SSE2 implementation to multiarch/strlen-sse2.S Noah Goldstein
@ 2022-07-12 19:29 ` Noah Goldstein
2022-07-12 23:23 ` H.J. Lu
2022-07-12 19:29 ` [PATCH v1] x86: Move memrchr SSE2 implementation to multiarch/memrchr-sse2.S Noah Goldstein
` (8 subsequent siblings)
9 siblings, 1 reply; 21+ messages in thread
From: Noah Goldstein @ 2022-07-12 19:29 UTC (permalink / raw)
To: libc-alpha
This commit doesn't affect libc.so.6; it's just housekeeping to prepare
for adding explicit ISA level support.
Tested build on x86_64 and x86_32 with/without multiarch.
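
As background for review: the copy loop being moved detects a NUL
anywhere in an 8-byte word using the magic constant 0xfefefefefefefeff
and the carry flag. A more common C formulation of the same
word-at-a-time test, equivalent in effect though not the exact
instruction sequence the assembly uses:

#include <stdint.h>

/* Nonzero iff some byte of W is zero: subtracting 1 from a zero byte
   borrows into its high bit while the byte's own high bit is clear.  */
static inline int
word_has_zero_byte (uint64_t w)
{
  return ((w - 0x0101010101010101ULL) & ~w & 0x8080808080808080ULL) != 0;
}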
---
sysdeps/x86_64/multiarch/rtld-stpcpy.S | 18 ++++
sysdeps/x86_64/multiarch/stpcpy-sse2.S | 15 +--
sysdeps/x86_64/multiarch/strcpy-sse2.S | 137 ++++++++++++++++++++++--
sysdeps/x86_64/stpcpy.S | 3 +-
sysdeps/x86_64/strcpy.S | 138 +------------------------
5 files changed, 156 insertions(+), 155 deletions(-)
create mode 100644 sysdeps/x86_64/multiarch/rtld-stpcpy.S
diff --git a/sysdeps/x86_64/multiarch/rtld-stpcpy.S b/sysdeps/x86_64/multiarch/rtld-stpcpy.S
new file mode 100644
index 0000000000..914141f07f
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/rtld-stpcpy.S
@@ -0,0 +1,18 @@
+/* Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "../stpcpy.S"
diff --git a/sysdeps/x86_64/multiarch/stpcpy-sse2.S b/sysdeps/x86_64/multiarch/stpcpy-sse2.S
index 078504a44e..ea9f973af3 100644
--- a/sysdeps/x86_64/multiarch/stpcpy-sse2.S
+++ b/sysdeps/x86_64/multiarch/stpcpy-sse2.S
@@ -17,17 +17,10 @@
<https://www.gnu.org/licenses/>. */
#if IS_IN (libc)
-
-# include <sysdep.h>
-# define __stpcpy __stpcpy_sse2
-
-# undef weak_alias
-# define weak_alias(ignored1, ignored2)
-# undef libc_hidden_def
-# define libc_hidden_def(__stpcpy)
-# undef libc_hidden_builtin_def
-# define libc_hidden_builtin_def(stpcpy)
+# ifndef STRCPY
+# define STRCPY __stpcpy_sse2
+# endif
#endif
#define USE_AS_STPCPY
-#include <sysdeps/x86_64/stpcpy.S>
+#include "strcpy-sse2.S"
diff --git a/sysdeps/x86_64/multiarch/strcpy-sse2.S b/sysdeps/x86_64/multiarch/strcpy-sse2.S
index f37967c441..8b5db8b13d 100644
--- a/sysdeps/x86_64/multiarch/strcpy-sse2.S
+++ b/sysdeps/x86_64/multiarch/strcpy-sse2.S
@@ -17,12 +17,137 @@
<https://www.gnu.org/licenses/>. */
#if IS_IN (libc)
+# ifndef STRCPY
+# define STRCPY __strcpy_sse2
+# endif
+#endif
-# include <sysdep.h>
-# define strcpy __strcpy_sse2
+#include <sysdep.h>
-# undef libc_hidden_builtin_def
-# define libc_hidden_builtin_def(strcpy)
-#endif
+ .text
+ENTRY (STRCPY)
+ movq %rsi, %rcx /* Source register. */
+ andl $7, %ecx /* mask alignment bits */
+ movq %rdi, %rdx /* Duplicate destination pointer. */
+
+ jz 5f /* aligned => start loop */
+
+ neg %ecx /* We need to align to 8 bytes. */
+ addl $8,%ecx
+ /* Search the first bytes directly. */
+0:
+ movb (%rsi), %al /* Fetch a byte */
+ testb %al, %al /* Is it NUL? */
+ movb %al, (%rdx) /* Store it */
+ jz 4f /* If it was NUL, done! */
+ incq %rsi
+ incq %rdx
+ decl %ecx
+ jnz 0b
+
+5:
+ movq $0xfefefefefefefeff,%r8
+
+ /* Now the source is aligned. Unfortunately we cannot force
+ both source and destination to be aligned, so ignore the
+ alignment of the destination. */
+ .p2align 4
+1:
+ /* 1st unroll. */
+ movq (%rsi), %rax /* Read double word (8 bytes). */
+ addq $8, %rsi /* Adjust pointer for next word. */
+ movq %rax, %r9 /* Save a copy for NUL finding. */
+ addq %r8, %r9 /* add the magic value to the word. We get
+ carry bits reported for each byte which
+ is *not* 0 */
+ jnc 3f /* highest byte is NUL => return pointer */
+ xorq %rax, %r9 /* (word+magic)^word */
+ orq %r8, %r9 /* set all non-carry bits */
+ incq %r9 /* add 1: if one carry bit was *not* set
+ the addition will not result in 0. */
+
+ jnz 3f /* found NUL => return pointer */
+
+ movq %rax, (%rdx) /* Write value to destination. */
+ addq $8, %rdx /* Adjust pointer. */
+
+ /* 2nd unroll. */
+ movq (%rsi), %rax /* Read double word (8 bytes). */
+ addq $8, %rsi /* Adjust pointer for next word. */
+ movq %rax, %r9 /* Save a copy for NUL finding. */
+ addq %r8, %r9 /* add the magic value to the word. We get
+ carry bits reported for each byte which
+ is *not* 0 */
+ jnc 3f /* highest byte is NUL => return pointer */
+ xorq %rax, %r9 /* (word+magic)^word */
+ orq %r8, %r9 /* set all non-carry bits */
+ incq %r9 /* add 1: if one carry bit was *not* set
+ the addition will not result in 0. */
+
+ jnz 3f /* found NUL => return pointer */
-#include <sysdeps/x86_64/strcpy.S>
+ movq %rax, (%rdx) /* Write value to destination. */
+ addq $8, %rdx /* Adjust pointer. */
+
+ /* 3rd unroll. */
+ movq (%rsi), %rax /* Read double word (8 bytes). */
+ addq $8, %rsi /* Adjust pointer for next word. */
+ movq %rax, %r9 /* Save a copy for NUL finding. */
+ addq %r8, %r9 /* add the magic value to the word. We get
+ carry bits reported for each byte which
+ is *not* 0 */
+ jnc 3f /* highest byte is NUL => return pointer */
+ xorq %rax, %r9 /* (word+magic)^word */
+ orq %r8, %r9 /* set all non-carry bits */
+ incq %r9 /* add 1: if one carry bit was *not* set
+ the addition will not result in 0. */
+
+ jnz 3f /* found NUL => return pointer */
+
+ movq %rax, (%rdx) /* Write value to destination. */
+ addq $8, %rdx /* Adjust pointer. */
+
+ /* 4th unroll. */
+ movq (%rsi), %rax /* Read double word (8 bytes). */
+ addq $8, %rsi /* Adjust pointer for next word. */
+ movq %rax, %r9 /* Save a copy for NUL finding. */
+ addq %r8, %r9 /* add the magic value to the word. We get
+ carry bits reported for each byte which
+ is *not* 0 */
+ jnc 3f /* highest byte is NUL => return pointer */
+ xorq %rax, %r9 /* (word+magic)^word */
+ orq %r8, %r9 /* set all non-carry bits */
+ incq %r9 /* add 1: if one carry bit was *not* set
+ the addition will not result in 0. */
+
+ jnz 3f /* found NUL => return pointer */
+
+ movq %rax, (%rdx) /* Write value to destination. */
+ addq $8, %rdx /* Adjust pointer. */
+ jmp 1b /* Next iteration. */
+
+ /* Do the last few bytes. %rax contains the value to write.
+ The loop is unrolled twice. */
+ .p2align 4
+3:
+ /* Note that stpcpy needs to return with the value of the NUL
+ byte. */
+ movb %al, (%rdx) /* 1st byte. */
+ testb %al, %al /* Is it NUL? */
+ jz 4f /* yes, finish. */
+ incq %rdx /* Increment destination. */
+ movb %ah, (%rdx) /* 2nd byte. */
+ testb %ah, %ah /* Is it NUL? */
+ jz 4f /* yes, finish. */
+ incq %rdx /* Increment destination. */
+ shrq $16, %rax /* Shift... */
+ jmp 3b /* and look at next two bytes in %rax. */
+
+4:
+#ifdef USE_AS_STPCPY
+ movq %rdx, %rax /* Destination is return value. */
+#else
+ movq %rdi, %rax /* The original destination is the return value. */
+#endif
+ retq
+END (STRCPY)
diff --git a/sysdeps/x86_64/stpcpy.S b/sysdeps/x86_64/stpcpy.S
index ec23de1416..b097c203dd 100644
--- a/sysdeps/x86_64/stpcpy.S
+++ b/sysdeps/x86_64/stpcpy.S
@@ -1,7 +1,6 @@
-#define USE_AS_STPCPY
#define STRCPY __stpcpy
-#include <sysdeps/x86_64/strcpy.S>
+#include "multiarch/stpcpy-sse2.S"
weak_alias (__stpcpy, stpcpy)
libc_hidden_def (__stpcpy)
diff --git a/sysdeps/x86_64/strcpy.S b/sysdeps/x86_64/strcpy.S
index 17e8073550..05f19e6e94 100644
--- a/sysdeps/x86_64/strcpy.S
+++ b/sysdeps/x86_64/strcpy.S
@@ -16,140 +16,6 @@
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
-#include <sysdep.h>
-#include "asm-syntax.h"
-
-#ifndef USE_AS_STPCPY
-# define STRCPY strcpy
-#endif
-
- .text
-ENTRY (STRCPY)
- movq %rsi, %rcx /* Source register. */
- andl $7, %ecx /* mask alignment bits */
- movq %rdi, %rdx /* Duplicate destination pointer. */
-
- jz 5f /* aligned => start loop */
-
- neg %ecx /* We need to align to 8 bytes. */
- addl $8,%ecx
- /* Search the first bytes directly. */
-0:
- movb (%rsi), %al /* Fetch a byte */
- testb %al, %al /* Is it NUL? */
- movb %al, (%rdx) /* Store it */
- jz 4f /* If it was NUL, done! */
- incq %rsi
- incq %rdx
- decl %ecx
- jnz 0b
-
-5:
- movq $0xfefefefefefefeff,%r8
-
- /* Now the source is aligned. Unfortunately we cannot force
- both source and destination to be aligned, so ignore the
- alignment of the destination. */
- .p2align 4
-1:
- /* 1st unroll. */
- movq (%rsi), %rax /* Read double word (8 bytes). */
- addq $8, %rsi /* Adjust pointer for next word. */
- movq %rax, %r9 /* Save a copy for NUL finding. */
- addq %r8, %r9 /* add the magic value to the word. We get
- carry bits reported for each byte which
- is *not* 0 */
- jnc 3f /* highest byte is NUL => return pointer */
- xorq %rax, %r9 /* (word+magic)^word */
- orq %r8, %r9 /* set all non-carry bits */
- incq %r9 /* add 1: if one carry bit was *not* set
- the addition will not result in 0. */
-
- jnz 3f /* found NUL => return pointer */
-
- movq %rax, (%rdx) /* Write value to destination. */
- addq $8, %rdx /* Adjust pointer. */
-
- /* 2nd unroll. */
- movq (%rsi), %rax /* Read double word (8 bytes). */
- addq $8, %rsi /* Adjust pointer for next word. */
- movq %rax, %r9 /* Save a copy for NUL finding. */
- addq %r8, %r9 /* add the magic value to the word. We get
- carry bits reported for each byte which
- is *not* 0 */
- jnc 3f /* highest byte is NUL => return pointer */
- xorq %rax, %r9 /* (word+magic)^word */
- orq %r8, %r9 /* set all non-carry bits */
- incq %r9 /* add 1: if one carry bit was *not* set
- the addition will not result in 0. */
-
- jnz 3f /* found NUL => return pointer */
-
- movq %rax, (%rdx) /* Write value to destination. */
- addq $8, %rdx /* Adjust pointer. */
-
- /* 3rd unroll. */
- movq (%rsi), %rax /* Read double word (8 bytes). */
- addq $8, %rsi /* Adjust pointer for next word. */
- movq %rax, %r9 /* Save a copy for NUL finding. */
- addq %r8, %r9 /* add the magic value to the word. We get
- carry bits reported for each byte which
- is *not* 0 */
- jnc 3f /* highest byte is NUL => return pointer */
- xorq %rax, %r9 /* (word+magic)^word */
- orq %r8, %r9 /* set all non-carry bits */
- incq %r9 /* add 1: if one carry bit was *not* set
- the addition will not result in 0. */
-
- jnz 3f /* found NUL => return pointer */
-
- movq %rax, (%rdx) /* Write value to destination. */
- addq $8, %rdx /* Adjust pointer. */
-
- /* 4th unroll. */
- movq (%rsi), %rax /* Read double word (8 bytes). */
- addq $8, %rsi /* Adjust pointer for next word. */
- movq %rax, %r9 /* Save a copy for NUL finding. */
- addq %r8, %r9 /* add the magic value to the word. We get
- carry bits reported for each byte which
- is *not* 0 */
- jnc 3f /* highest byte is NUL => return pointer */
- xorq %rax, %r9 /* (word+magic)^word */
- orq %r8, %r9 /* set all non-carry bits */
- incq %r9 /* add 1: if one carry bit was *not* set
- the addition will not result in 0. */
-
- jnz 3f /* found NUL => return pointer */
-
- movq %rax, (%rdx) /* Write value to destination. */
- addq $8, %rdx /* Adjust pointer. */
- jmp 1b /* Next iteration. */
-
- /* Do the last few bytes. %rax contains the value to write.
- The loop is unrolled twice. */
- .p2align 4
-3:
- /* Note that stpcpy needs to return with the value of the NUL
- byte. */
- movb %al, (%rdx) /* 1st byte. */
- testb %al, %al /* Is it NUL? */
- jz 4f /* yes, finish. */
- incq %rdx /* Increment destination. */
- movb %ah, (%rdx) /* 2nd byte. */
- testb %ah, %ah /* Is it NUL? */
- jz 4f /* yes, finish. */
- incq %rdx /* Increment destination. */
- shrq $16, %rax /* Shift... */
- jmp 3b /* and look at next two bytes in %rax. */
-
-4:
-#ifdef USE_AS_STPCPY
- movq %rdx, %rax /* Destination is return value. */
-#else
- movq %rdi, %rax /* The original destination is the return value. */
-#endif
- retq
-END (STRCPY)
-#ifndef USE_AS_STPCPY
+#define STRCPY strcpy
+#include "multiarch/strcpy-sse2.S"
libc_hidden_builtin_def (strcpy)
-#endif
--
2.34.1
* [PATCH v1] x86: Move memrchr SSE2 implementation to multiarch/memrchr-sse2.S
2022-07-12 19:29 [PATCH v1] x86: Move strlen SSE2 implementation to multiarch/strlen-sse2.S Noah Goldstein
2022-07-12 19:29 ` [PATCH v1] x86: Move strcpy SSE2 implementation to multiarch/strcpy-sse2.S Noah Goldstein
@ 2022-07-12 19:29 ` Noah Goldstein
2022-07-12 22:58 ` H.J. Lu
2022-07-12 19:29 ` [PATCH v1] x86: Move strrchr SSE2 implementation to multiarch/strrchr-sse2.S Noah Goldstein
` (7 subsequent siblings)
9 siblings, 1 reply; 21+ messages in thread
From: Noah Goldstein @ 2022-07-12 19:29 UTC (permalink / raw)
To: libc-alpha
This commit doesn't affect libc.so.6; it's just housekeeping to prepare
for adding explicit ISA level support.
Tested build on x86_64 and x86_32 with/without multiarch.
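
As background for review: memrchr scans backwards from the end pointer,
so a match within a 16-byte chunk is resolved with bsr (the highest set
mask bit) instead of bsf. A hedged C sketch of a single backwards step,
assuming SSE2 intrinsics and __builtin_clz (the function name is
illustrative; the zero-length and page-cross handling in the real code
is omitted, so the caller must guarantee END - 16 is readable):

#include <emmintrin.h>
#include <stddef.h>

static const unsigned char *
memrchr_step_sketch (const unsigned char *end, int c)
{
  const __m128i needle = _mm_set1_epi8 ((char) c);
  const __m128i chunk = _mm_loadu_si128 ((const __m128i *) (end - 16));
  unsigned int mask = _mm_movemask_epi8 (_mm_cmpeq_epi8 (chunk, needle));
  if (mask == 0)
    return NULL;	/* No match in these 16 bytes.  */
  /* 31 - clz (mask) is the index of the highest set bit, i.e. the
     last matching byte in the chunk (what bsr computes).  */
  return end - 16 + (31 - __builtin_clz (mask));
}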
---
sysdeps/x86_64/memrchr.S | 332 +----------------------
sysdeps/x86_64/multiarch/memrchr-sse2.S | 336 +++++++++++++++++++++++-
2 files changed, 334 insertions(+), 334 deletions(-)
diff --git a/sysdeps/x86_64/memrchr.S b/sysdeps/x86_64/memrchr.S
index b0dffd2ae2..385e2c5668 100644
--- a/sysdeps/x86_64/memrchr.S
+++ b/sysdeps/x86_64/memrchr.S
@@ -17,334 +17,6 @@
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
-#include <sysdep.h>
-#define VEC_SIZE 16
-#define PAGE_SIZE 4096
-
- .text
-ENTRY_P2ALIGN(__memrchr, 6)
-#ifdef __ILP32__
- /* Clear upper bits. */
- mov %RDX_LP, %RDX_LP
-#endif
- movd %esi, %xmm0
-
- /* Get end pointer. */
- leaq (%rdx, %rdi), %rcx
-
- punpcklbw %xmm0, %xmm0
- punpcklwd %xmm0, %xmm0
- pshufd $0, %xmm0, %xmm0
-
- /* Check if we can load 1x VEC without crossing a page. */
- testl $(PAGE_SIZE - VEC_SIZE), %ecx
- jz L(page_cross)
-
- /* NB: This load happens regardless of whether rdx (len) is zero. Since
- it doesn't cross a page and the standard guarantees any pointer has
- at least one valid byte, this load must be safe. For the entire
- history of the x86 memrchr implementation this has been possible, so
- no code "should" be relying on a zero-length check before this load.
- The zero-length check is moved to the page-cross case because it is
- 1) pretty cold and 2) including it pushes the hot case len <= VEC_SIZE
- across 2 cache lines. */
- movups -(VEC_SIZE)(%rcx), %xmm1
- pcmpeqb %xmm0, %xmm1
- pmovmskb %xmm1, %eax
-
- subq $VEC_SIZE, %rdx
- ja L(more_1x_vec)
-L(ret_vec_x0_test):
- /* Zero-flag set if eax (src) is zero. Destination unchanged if src is
- zero. */
- bsrl %eax, %eax
- jz L(ret_0)
- /* Check if the CHAR match is in bounds. Need to truly zero `eax` here
- if out of bounds. */
- addl %edx, %eax
- jl L(zero_0)
- /* Since we subtracted VEC_SIZE from rdx earlier we can just add to base
- ptr. */
- addq %rdi, %rax
-L(ret_0):
- ret
-
- .p2align 4,, 5
-L(ret_vec_x0):
- bsrl %eax, %eax
- leaq -(VEC_SIZE)(%rcx, %rax), %rax
- ret
-
- .p2align 4,, 2
-L(zero_0):
- xorl %eax, %eax
- ret
-
-
- .p2align 4,, 8
-L(more_1x_vec):
- testl %eax, %eax
- jnz L(ret_vec_x0)
-
- /* Align rcx (pointer to string). */
- decq %rcx
- andq $-VEC_SIZE, %rcx
-
- movq %rcx, %rdx
- /* NB: We could consistently save 1 byte in this pattern with `movaps
- %xmm0, %xmm1; pcmpeq IMM8(r), %xmm1; ...`. The reason against it is
- it adds more frontend uops (even if the moves can be eliminated) and,
- some percentage of the time, actual backend uops. */
- movaps -(VEC_SIZE)(%rcx), %xmm1
- pcmpeqb %xmm0, %xmm1
- subq %rdi, %rdx
- pmovmskb %xmm1, %eax
-
- cmpq $(VEC_SIZE * 2), %rdx
- ja L(more_2x_vec)
-L(last_2x_vec):
- subl $VEC_SIZE, %edx
- jbe L(ret_vec_x0_test)
-
- testl %eax, %eax
- jnz L(ret_vec_x0)
-
- movaps -(VEC_SIZE * 2)(%rcx), %xmm1
- pcmpeqb %xmm0, %xmm1
- pmovmskb %xmm1, %eax
-
- subl $VEC_SIZE, %edx
- bsrl %eax, %eax
- jz L(ret_1)
- addl %edx, %eax
- jl L(zero_0)
- addq %rdi, %rax
-L(ret_1):
- ret
-
- /* Don't align. Otherwise losing the 2-byte encoding in the jump to
- L(page_cross) causes the hot path (length <= VEC_SIZE) to span
- multiple cache lines. Naturally aligned % 16 to 8 bytes. */
-L(page_cross):
- /* Zero length check. */
- testq %rdx, %rdx
- jz L(zero_0)
-
- leaq -1(%rcx), %r8
- andq $-(VEC_SIZE), %r8
-
- movaps (%r8), %xmm1
- pcmpeqb %xmm0, %xmm1
- pmovmskb %xmm1, %esi
- /* Shift out negative alignment (because we are starting from endptr and
- working backwards). */
- negl %ecx
- /* 32-bit shift but VEC_SIZE=16 so need to mask the shift count
- explicitly. */
- andl $(VEC_SIZE - 1), %ecx
- shl %cl, %esi
- movzwl %si, %eax
- leaq (%rdi, %rdx), %rcx
- cmpq %rdi, %r8
- ja L(more_1x_vec)
- subl $VEC_SIZE, %edx
- bsrl %eax, %eax
- jz L(ret_2)
- addl %edx, %eax
- jl L(zero_1)
- addq %rdi, %rax
-L(ret_2):
- ret
-
- /* Fits in aligning bytes. */
-L(zero_1):
- xorl %eax, %eax
- ret
-
- .p2align 4,, 5
-L(ret_vec_x1):
- bsrl %eax, %eax
- leaq -(VEC_SIZE * 2)(%rcx, %rax), %rax
- ret
-
- .p2align 4,, 8
-L(more_2x_vec):
- testl %eax, %eax
- jnz L(ret_vec_x0)
-
- movaps -(VEC_SIZE * 2)(%rcx), %xmm1
- pcmpeqb %xmm0, %xmm1
- pmovmskb %xmm1, %eax
- testl %eax, %eax
- jnz L(ret_vec_x1)
-
-
- movaps -(VEC_SIZE * 3)(%rcx), %xmm1
- pcmpeqb %xmm0, %xmm1
- pmovmskb %xmm1, %eax
-
- subq $(VEC_SIZE * 4), %rdx
- ja L(more_4x_vec)
-
- addl $(VEC_SIZE), %edx
- jle L(ret_vec_x2_test)
-
-L(last_vec):
- testl %eax, %eax
- jnz L(ret_vec_x2)
-
- movaps -(VEC_SIZE * 4)(%rcx), %xmm1
- pcmpeqb %xmm0, %xmm1
- pmovmskb %xmm1, %eax
-
- subl $(VEC_SIZE), %edx
- bsrl %eax, %eax
- jz L(ret_3)
- addl %edx, %eax
- jl L(zero_2)
- addq %rdi, %rax
-L(ret_3):
- ret
-
- .p2align 4,, 6
-L(ret_vec_x2_test):
- bsrl %eax, %eax
- jz L(zero_2)
- addl %edx, %eax
- jl L(zero_2)
- addq %rdi, %rax
- ret
-
-L(zero_2):
- xorl %eax, %eax
- ret
-
-
- .p2align 4,, 5
-L(ret_vec_x2):
- bsrl %eax, %eax
- leaq -(VEC_SIZE * 3)(%rcx, %rax), %rax
- ret
-
- .p2align 4,, 5
-L(ret_vec_x3):
- bsrl %eax, %eax
- leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax
- ret
-
- .p2align 4,, 8
-L(more_4x_vec):
- testl %eax, %eax
- jnz L(ret_vec_x2)
-
- movaps -(VEC_SIZE * 4)(%rcx), %xmm1
- pcmpeqb %xmm0, %xmm1
- pmovmskb %xmm1, %eax
-
- testl %eax, %eax
- jnz L(ret_vec_x3)
-
- addq $-(VEC_SIZE * 4), %rcx
- cmpq $(VEC_SIZE * 4), %rdx
- jbe L(last_4x_vec)
-
- /* Offset everything by 4x VEC_SIZE here to save a few bytes at the end
- keeping the code from spilling to the next cache line. */
- addq $(VEC_SIZE * 4 - 1), %rcx
- andq $-(VEC_SIZE * 4), %rcx
- leaq (VEC_SIZE * 4)(%rdi), %rdx
- andq $-(VEC_SIZE * 4), %rdx
-
- .p2align 4,, 11
-L(loop_4x_vec):
- movaps (VEC_SIZE * -1)(%rcx), %xmm1
- movaps (VEC_SIZE * -2)(%rcx), %xmm2
- movaps (VEC_SIZE * -3)(%rcx), %xmm3
- movaps (VEC_SIZE * -4)(%rcx), %xmm4
- pcmpeqb %xmm0, %xmm1
- pcmpeqb %xmm0, %xmm2
- pcmpeqb %xmm0, %xmm3
- pcmpeqb %xmm0, %xmm4
-
- por %xmm1, %xmm2
- por %xmm3, %xmm4
- por %xmm2, %xmm4
-
- pmovmskb %xmm4, %esi
- testl %esi, %esi
- jnz L(loop_end)
-
- addq $-(VEC_SIZE * 4), %rcx
- cmpq %rdx, %rcx
- jne L(loop_4x_vec)
-
- subl %edi, %edx
-
- /* Ends up being 1-byte nop. */
- .p2align 4,, 2
-L(last_4x_vec):
- movaps -(VEC_SIZE)(%rcx), %xmm1
- pcmpeqb %xmm0, %xmm1
- pmovmskb %xmm1, %eax
-
- cmpl $(VEC_SIZE * 2), %edx
- jbe L(last_2x_vec)
-
- testl %eax, %eax
- jnz L(ret_vec_x0)
-
-
- movaps -(VEC_SIZE * 2)(%rcx), %xmm1
- pcmpeqb %xmm0, %xmm1
- pmovmskb %xmm1, %eax
-
- testl %eax, %eax
- jnz L(ret_vec_end)
-
- movaps -(VEC_SIZE * 3)(%rcx), %xmm1
- pcmpeqb %xmm0, %xmm1
- pmovmskb %xmm1, %eax
-
- subl $(VEC_SIZE * 3), %edx
- ja L(last_vec)
- bsrl %eax, %eax
- jz L(ret_4)
- addl %edx, %eax
- jl L(zero_3)
- addq %rdi, %rax
-L(ret_4):
- ret
-
- /* Ends up being 1-byte nop. */
- .p2align 4,, 3
-L(loop_end):
- pmovmskb %xmm1, %eax
- sall $16, %eax
- jnz L(ret_vec_end)
-
- pmovmskb %xmm2, %eax
- testl %eax, %eax
- jnz L(ret_vec_end)
-
- pmovmskb %xmm3, %eax
- /* Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3)
- then it won't affect the result in esi (VEC4). If ecx is non-zero
- then CHAR is in VEC3 and bsrq will use that position. */
- sall $16, %eax
- orl %esi, %eax
- bsrl %eax, %eax
- leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax
- ret
-
-L(ret_vec_end):
- bsrl %eax, %eax
- leaq (VEC_SIZE * -2)(%rax, %rcx), %rax
- ret
- /* Used in L(last_4x_vec). In the same cache line. This just uses
- spare aligning bytes. */
-L(zero_3):
- xorl %eax, %eax
- ret
- /* 2-bytes from next cache line. */
-END(__memrchr)
+#define MEMRCHR __memrchr
+#include "multiarch/memrchr-sse2.S"
weak_alias (__memrchr, memrchr)
diff --git a/sysdeps/x86_64/multiarch/memrchr-sse2.S b/sysdeps/x86_64/multiarch/memrchr-sse2.S
index b04202e171..d92a4022dc 100644
--- a/sysdeps/x86_64/multiarch/memrchr-sse2.S
+++ b/sysdeps/x86_64/multiarch/memrchr-sse2.S
@@ -17,10 +17,338 @@
<https://www.gnu.org/licenses/>. */
#if IS_IN (libc)
-# define __memrchr __memrchr_sse2
+# ifndef MEMRCHR
+# define MEMRCHR __memrchr_sse2
+# endif
+#endif
+
+#include <sysdep.h>
+#define VEC_SIZE 16
+#define PAGE_SIZE 4096
-# undef weak_alias
-# define weak_alias(__memrchr, memrchr)
+ .text
+ENTRY_P2ALIGN(MEMRCHR, 6)
+#ifdef __ILP32__
+ /* Clear upper bits. */
+ mov %RDX_LP, %RDX_LP
#endif
+ movd %esi, %xmm0
+
+ /* Get end pointer. */
+ leaq (%rdx, %rdi), %rcx
+
+ punpcklbw %xmm0, %xmm0
+ punpcklwd %xmm0, %xmm0
+ pshufd $0, %xmm0, %xmm0
+
+ /* Check if we can load 1x VEC without crossing a page. */
+ testl $(PAGE_SIZE - VEC_SIZE), %ecx
+ jz L(page_cross)
+
+ /* NB: This load happens regardless of whether rdx (len) is zero. Since
+ it doesn't cross a page and the standard guarantees any pointer has
+ at least one valid byte, this load must be safe. For the entire
+ history of the x86 memrchr implementation this has been possible, so
+ no code "should" be relying on a zero-length check before this load.
+ The zero-length check is moved to the page-cross case because it is
+ 1) pretty cold and 2) including it pushes the hot case len <= VEC_SIZE
+ across 2 cache lines. */
+ movups -(VEC_SIZE)(%rcx), %xmm1
+ pcmpeqb %xmm0, %xmm1
+ pmovmskb %xmm1, %eax
+
+ subq $VEC_SIZE, %rdx
+ ja L(more_1x_vec)
+L(ret_vec_x0_test):
+ /* Zero-flag set if eax (src) is zero. Destination unchanged if src is
+ zero. */
+ bsrl %eax, %eax
+ jz L(ret_0)
+ /* Check if the CHAR match is in bounds. Need to truly zero `eax` here
+ if out of bounds. */
+ addl %edx, %eax
+ jl L(zero_0)
+ /* Since we subtracted VEC_SIZE from rdx earlier we can just add to base
+ ptr. */
+ addq %rdi, %rax
+L(ret_0):
+ ret
+
+ .p2align 4,, 5
+L(ret_vec_x0):
+ bsrl %eax, %eax
+ leaq -(VEC_SIZE)(%rcx, %rax), %rax
+ ret
+
+ .p2align 4,, 2
+L(zero_0):
+ xorl %eax, %eax
+ ret
+
+
+ .p2align 4,, 8
+L(more_1x_vec):
+ testl %eax, %eax
+ jnz L(ret_vec_x0)
+
+ /* Align rcx (pointer to string). */
+ decq %rcx
+ andq $-VEC_SIZE, %rcx
+
+ movq %rcx, %rdx
+ /* NB: We could consistently save 1 byte in this pattern with `movaps
+ %xmm0, %xmm1; pcmpeq IMM8(r), %xmm1; ...`. The reason against it is
+ it adds more frontend uops (even if the moves can be eliminated) and,
+ some percentage of the time, actual backend uops. */
+ movaps -(VEC_SIZE)(%rcx), %xmm1
+ pcmpeqb %xmm0, %xmm1
+ subq %rdi, %rdx
+ pmovmskb %xmm1, %eax
+
+ cmpq $(VEC_SIZE * 2), %rdx
+ ja L(more_2x_vec)
+L(last_2x_vec):
+ subl $VEC_SIZE, %edx
+ jbe L(ret_vec_x0_test)
+
+ testl %eax, %eax
+ jnz L(ret_vec_x0)
+
+ movaps -(VEC_SIZE * 2)(%rcx), %xmm1
+ pcmpeqb %xmm0, %xmm1
+ pmovmskb %xmm1, %eax
+
+ subl $VEC_SIZE, %edx
+ bsrl %eax, %eax
+ jz L(ret_1)
+ addl %edx, %eax
+ jl L(zero_0)
+ addq %rdi, %rax
+L(ret_1):
+ ret
+
+ /* Don't align. Otherwise losing the 2-byte encoding in the jump to
+ L(page_cross) causes the hot path (length <= VEC_SIZE) to span
+ multiple cache lines. Naturally aligned % 16 to 8 bytes. */
+L(page_cross):
+ /* Zero length check. */
+ testq %rdx, %rdx
+ jz L(zero_0)
+
+ leaq -1(%rcx), %r8
+ andq $-(VEC_SIZE), %r8
+
+ movaps (%r8), %xmm1
+ pcmpeqb %xmm0, %xmm1
+ pmovmskb %xmm1, %esi
+ /* Shift out negative alignment (because we are starting from endptr and
+ working backwards). */
+ negl %ecx
+ /* 32-bit shift but VEC_SIZE=16 so need to mask the shift count
+ explicitly. */
+ andl $(VEC_SIZE - 1), %ecx
+ shl %cl, %esi
+ movzwl %si, %eax
+ leaq (%rdi, %rdx), %rcx
+ cmpq %rdi, %r8
+ ja L(more_1x_vec)
+ subl $VEC_SIZE, %edx
+ bsrl %eax, %eax
+ jz L(ret_2)
+ addl %edx, %eax
+ jl L(zero_1)
+ addq %rdi, %rax
+L(ret_2):
+ ret
+
+ /* Fits in aligning bytes. */
+L(zero_1):
+ xorl %eax, %eax
+ ret
+
+ .p2align 4,, 5
+L(ret_vec_x1):
+ bsrl %eax, %eax
+ leaq -(VEC_SIZE * 2)(%rcx, %rax), %rax
+ ret
+
+ .p2align 4,, 8
+L(more_2x_vec):
+ testl %eax, %eax
+ jnz L(ret_vec_x0)
+
+ movaps -(VEC_SIZE * 2)(%rcx), %xmm1
+ pcmpeqb %xmm0, %xmm1
+ pmovmskb %xmm1, %eax
+ testl %eax, %eax
+ jnz L(ret_vec_x1)
+
+
+ movaps -(VEC_SIZE * 3)(%rcx), %xmm1
+ pcmpeqb %xmm0, %xmm1
+ pmovmskb %xmm1, %eax
+
+ subq $(VEC_SIZE * 4), %rdx
+ ja L(more_4x_vec)
+
+ addl $(VEC_SIZE), %edx
+ jle L(ret_vec_x2_test)
+
+L(last_vec):
+ testl %eax, %eax
+ jnz L(ret_vec_x2)
+
+ movaps -(VEC_SIZE * 4)(%rcx), %xmm1
+ pcmpeqb %xmm0, %xmm1
+ pmovmskb %xmm1, %eax
+
+ subl $(VEC_SIZE), %edx
+ bsrl %eax, %eax
+ jz L(ret_3)
+ addl %edx, %eax
+ jl L(zero_2)
+ addq %rdi, %rax
+L(ret_3):
+ ret
+
+ .p2align 4,, 6
+L(ret_vec_x2_test):
+ bsrl %eax, %eax
+ jz L(zero_2)
+ addl %edx, %eax
+ jl L(zero_2)
+ addq %rdi, %rax
+ ret
+
+L(zero_2):
+ xorl %eax, %eax
+ ret
+
+
+ .p2align 4,, 5
+L(ret_vec_x2):
+ bsrl %eax, %eax
+ leaq -(VEC_SIZE * 3)(%rcx, %rax), %rax
+ ret
+
+ .p2align 4,, 5
+L(ret_vec_x3):
+ bsrl %eax, %eax
+ leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax
+ ret
+
+ .p2align 4,, 8
+L(more_4x_vec):
+ testl %eax, %eax
+ jnz L(ret_vec_x2)
+
+ movaps -(VEC_SIZE * 4)(%rcx), %xmm1
+ pcmpeqb %xmm0, %xmm1
+ pmovmskb %xmm1, %eax
+
+ testl %eax, %eax
+ jnz L(ret_vec_x3)
+
+ addq $-(VEC_SIZE * 4), %rcx
+ cmpq $(VEC_SIZE * 4), %rdx
+ jbe L(last_4x_vec)
+
+ /* Offset everything by 4x VEC_SIZE here to save a few bytes at the end
+ keeping the code from spilling to the next cache line. */
+ addq $(VEC_SIZE * 4 - 1), %rcx
+ andq $-(VEC_SIZE * 4), %rcx
+ leaq (VEC_SIZE * 4)(%rdi), %rdx
+ andq $-(VEC_SIZE * 4), %rdx
+
+ .p2align 4,, 11
+L(loop_4x_vec):
+ movaps (VEC_SIZE * -1)(%rcx), %xmm1
+ movaps (VEC_SIZE * -2)(%rcx), %xmm2
+ movaps (VEC_SIZE * -3)(%rcx), %xmm3
+ movaps (VEC_SIZE * -4)(%rcx), %xmm4
+ pcmpeqb %xmm0, %xmm1
+ pcmpeqb %xmm0, %xmm2
+ pcmpeqb %xmm0, %xmm3
+ pcmpeqb %xmm0, %xmm4
+
+ por %xmm1, %xmm2
+ por %xmm3, %xmm4
+ por %xmm2, %xmm4
+
+ pmovmskb %xmm4, %esi
+ testl %esi, %esi
+ jnz L(loop_end)
+
+ addq $-(VEC_SIZE * 4), %rcx
+ cmpq %rdx, %rcx
+ jne L(loop_4x_vec)
+
+ subl %edi, %edx
+
+ /* Ends up being 1-byte nop. */
+ .p2align 4,, 2
+L(last_4x_vec):
+ movaps -(VEC_SIZE)(%rcx), %xmm1
+ pcmpeqb %xmm0, %xmm1
+ pmovmskb %xmm1, %eax
+
+ cmpl $(VEC_SIZE * 2), %edx
+ jbe L(last_2x_vec)
+
+ testl %eax, %eax
+ jnz L(ret_vec_x0)
+
+
+ movaps -(VEC_SIZE * 2)(%rcx), %xmm1
+ pcmpeqb %xmm0, %xmm1
+ pmovmskb %xmm1, %eax
+
+ testl %eax, %eax
+ jnz L(ret_vec_end)
+
+ movaps -(VEC_SIZE * 3)(%rcx), %xmm1
+ pcmpeqb %xmm0, %xmm1
+ pmovmskb %xmm1, %eax
+
+ subl $(VEC_SIZE * 3), %edx
+ ja L(last_vec)
+ bsrl %eax, %eax
+ jz L(ret_4)
+ addl %edx, %eax
+ jl L(zero_3)
+ addq %rdi, %rax
+L(ret_4):
+ ret
+
+ /* Ends up being 1-byte nop. */
+ .p2align 4,, 3
+L(loop_end):
+ pmovmskb %xmm1, %eax
+ sall $16, %eax
+ jnz L(ret_vec_end)
+
+ pmovmskb %xmm2, %eax
+ testl %eax, %eax
+ jnz L(ret_vec_end)
+
+ pmovmskb %xmm3, %eax
+ /* Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3)
+ then it won't affect the result in esi (VEC4). If ecx is non-zero
+ then CHAR is in VEC3 and bsrq will use that position. */
+ sall $16, %eax
+ orl %esi, %eax
+ bsrl %eax, %eax
+ leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax
+ ret
-#include "../memrchr.S"
+L(ret_vec_end):
+ bsrl %eax, %eax
+ leaq (VEC_SIZE * -2)(%rax, %rcx), %rax
+ ret
+ /* Used in L(last_4x_vec). In the same cache line. This just uses
+ spare aligning bytes. */
+L(zero_3):
+ xorl %eax, %eax
+ ret
+ /* 2-bytes from next cache line. */
+END(MEMRCHR)
--
2.34.1
* [PATCH v1] x86: Move strrchr SSE2 implementation to multiarch/strrchr-sse2.S
2022-07-12 19:29 [PATCH v1] x86: Move strlen SSE2 implementation to multiarch/strlen-sse2.S Noah Goldstein
2022-07-12 19:29 ` [PATCH v1] x86: Move strcpy SSE2 implementation to multiarch/strcpy-sse2.S Noah Goldstein
2022-07-12 19:29 ` [PATCH v1] x86: Move memrchr SSE2 implementation to multiarch/memrchr-sse2.S Noah Goldstein
@ 2022-07-12 19:29 ` Noah Goldstein
2022-07-12 22:28 ` H.J. Lu
2022-07-12 19:29 ` [PATCH v1] x86: Move strchr SSE2 implementation to multiarch/strchr-sse2.S Noah Goldstein
` (6 subsequent siblings)
9 siblings, 1 reply; 21+ messages in thread
From: Noah Goldstein @ 2022-07-12 19:29 UTC (permalink / raw)
To: libc-alpha
This commit doesn't affect libc.so.6; it's just housekeeping to prepare
for adding explicit ISA level support.
Tested build on x86_64 and x86_32 with/without multiarch.
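
As background for review: unlike strchr, strrchr cannot stop at the
first match, so the loops keep the most recent match vectors (saved
into %xmm2/%xmm3 with the corresponding base pointer in %rsi) and only
resolve them once the null terminator is found. A scalar model of that
strategy (the function name is illustrative):

#include <stddef.h>

static char *
strrchr_model (const char *s, int c)
{
  const char *last = NULL;	/* Most recent match seen so far.  */
  for (;; s++)
    {
      if (*s == (char) c)
	last = s;
      if (*s == '\0')		/* Also matches when C is '\0'.  */
	return (char *) last;
    }
}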
---
sysdeps/x86_64/multiarch/strrchr-sse2.S | 358 ++++++++++++++++++++++-
sysdeps/x86_64/multiarch/wcsrchr-sse2.S | 10 +-
sysdeps/x86_64/strrchr.S | 364 +-----------------------
sysdeps/x86_64/wcsrchr.S | 11 +-
4 files changed, 366 insertions(+), 377 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/strrchr-sse2.S b/sysdeps/x86_64/multiarch/strrchr-sse2.S
index 866396e947..6ee7a5e33a 100644
--- a/sysdeps/x86_64/multiarch/strrchr-sse2.S
+++ b/sysdeps/x86_64/multiarch/strrchr-sse2.S
@@ -17,12 +17,358 @@
<https://www.gnu.org/licenses/>. */
#if IS_IN (libc)
-# define STRRCHR __strrchr_sse2
+# ifndef STRRCHR
+# define STRRCHR __strrchr_sse2
+# endif
+#endif
+
+#include <sysdep.h>
+
+#ifdef USE_AS_WCSRCHR
+# define PCMPEQ pcmpeqd
+# define CHAR_SIZE 4
+# define PMINU pminud
+#else
+# define PCMPEQ pcmpeqb
+# define CHAR_SIZE 1
+# define PMINU pminub
+#endif
+
+#define PAGE_SIZE 4096
+#define VEC_SIZE 16
+
+ .text
+ENTRY(STRRCHR)
+ movd %esi, %xmm0
+ movq %rdi, %rax
+ andl $(PAGE_SIZE - 1), %eax
+#ifndef USE_AS_WCSRCHR
+ punpcklbw %xmm0, %xmm0
+ punpcklwd %xmm0, %xmm0
+#endif
+ pshufd $0, %xmm0, %xmm0
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
+ ja L(cross_page)
+
+L(cross_page_continue):
+ movups (%rdi), %xmm1
+ pxor %xmm2, %xmm2
+ PCMPEQ %xmm1, %xmm2
+ pmovmskb %xmm2, %ecx
+ testl %ecx, %ecx
+ jz L(aligned_more)
+
+ PCMPEQ %xmm0, %xmm1
+ pmovmskb %xmm1, %eax
+ leal -1(%rcx), %edx
+ xorl %edx, %ecx
+ andl %ecx, %eax
+ jz L(ret0)
+ bsrl %eax, %eax
+ addq %rdi, %rax
+ /* We are off by 3 for wcsrchr if search CHAR is non-zero. If
+ search CHAR is zero we are correct. Either way `andq
+ -CHAR_SIZE, %rax` gets the correct result. */
+#ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+#endif
+L(ret0):
+ ret
+
+ /* Returns for first vec x1/x2 have hard coded backward search
+ path for earlier matches. */
+ .p2align 4
+L(first_vec_x0_test):
+ PCMPEQ %xmm0, %xmm1
+ pmovmskb %xmm1, %eax
+ testl %eax, %eax
+ jz L(ret0)
+ bsrl %eax, %eax
+ addq %r8, %rax
+#ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+#endif
+ ret
+
+ .p2align 4
+L(first_vec_x1):
+ PCMPEQ %xmm0, %xmm2
+ pmovmskb %xmm2, %eax
+ leal -1(%rcx), %edx
+ xorl %edx, %ecx
+ andl %ecx, %eax
+ jz L(first_vec_x0_test)
+ bsrl %eax, %eax
+ leaq (VEC_SIZE)(%rdi, %rax), %rax
+#ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+#endif
+ ret
+
+ .p2align 4
+L(first_vec_x1_test):
+ PCMPEQ %xmm0, %xmm2
+ pmovmskb %xmm2, %eax
+ testl %eax, %eax
+ jz L(first_vec_x0_test)
+ bsrl %eax, %eax
+ leaq (VEC_SIZE)(%rdi, %rax), %rax
+#ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+#endif
+ ret
+
+ .p2align 4
+L(first_vec_x2):
+ PCMPEQ %xmm0, %xmm3
+ pmovmskb %xmm3, %eax
+ leal -1(%rcx), %edx
+ xorl %edx, %ecx
+ andl %ecx, %eax
+ jz L(first_vec_x1_test)
+ bsrl %eax, %eax
+ leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
+#ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+#endif
+ ret
+
+ .p2align 4
+L(aligned_more):
+ /* Save original pointer if match was in VEC 0. */
+ movq %rdi, %r8
+ andq $-VEC_SIZE, %rdi
+
+ movaps VEC_SIZE(%rdi), %xmm2
+ pxor %xmm3, %xmm3
+ PCMPEQ %xmm2, %xmm3
+ pmovmskb %xmm3, %ecx
+ testl %ecx, %ecx
+ jnz L(first_vec_x1)
+
+ movaps (VEC_SIZE * 2)(%rdi), %xmm3
+ pxor %xmm4, %xmm4
+ PCMPEQ %xmm3, %xmm4
+ pmovmskb %xmm4, %ecx
+ testl %ecx, %ecx
+ jnz L(first_vec_x2)
+
+ addq $VEC_SIZE, %rdi
+ /* Save pointer again before realigning. */
+ movq %rdi, %rsi
+ andq $-(VEC_SIZE * 2), %rdi
+ .p2align 4
+L(first_loop):
+ /* Do 2x VEC at a time. */
+ movaps (VEC_SIZE * 2)(%rdi), %xmm4
+ movaps (VEC_SIZE * 3)(%rdi), %xmm5
+ /* Since SSE2 has no pminud, wcsrchr needs separate logic for
+ detecting zero. Note if this is found to be a bottleneck it
+ may be worth adding an SSE4.1 wcsrchr implementation. */
+#ifdef USE_AS_WCSRCHR
+ movaps %xmm5, %xmm6
+ pxor %xmm8, %xmm8
+
+ PCMPEQ %xmm8, %xmm5
+ PCMPEQ %xmm4, %xmm8
+ por %xmm5, %xmm8
+#else
+ movaps %xmm5, %xmm6
+ PMINU %xmm4, %xmm5
+#endif
+
+ movaps %xmm4, %xmm9
+ PCMPEQ %xmm0, %xmm4
+ PCMPEQ %xmm0, %xmm6
+ movaps %xmm6, %xmm7
+ por %xmm4, %xmm6
+#ifndef USE_AS_WCSRCHR
+ pxor %xmm8, %xmm8
+ PCMPEQ %xmm5, %xmm8
+#endif
+ pmovmskb %xmm8, %ecx
+ pmovmskb %xmm6, %eax
-# undef weak_alias
-# define weak_alias(strrchr, rindex)
-# undef libc_hidden_builtin_def
-# define libc_hidden_builtin_def(strrchr)
+ addq $(VEC_SIZE * 2), %rdi
+ /* Use `addl` 1) so we can undo it with `subl` and 2) it can
+ macro-fuse with `jz`. */
+ addl %ecx, %eax
+ jz L(first_loop)
+
+ /* Check if there is zero match. */
+ testl %ecx, %ecx
+ jz L(second_loop_match)
+
+ /* Check if there was a match in last iteration. */
+ subl %ecx, %eax
+ jnz L(new_match)
+
+L(first_loop_old_match):
+ PCMPEQ %xmm0, %xmm2
+ PCMPEQ %xmm0, %xmm3
+ pmovmskb %xmm2, %ecx
+ pmovmskb %xmm3, %eax
+ addl %eax, %ecx
+ jz L(first_vec_x0_test)
+ /* NB: We could move this shift to before the branch and save a
+ bit of code size / performance on the fall through. The
+ branch leads to the null case which generally seems hotter
+ than char in first 3x VEC. */
+ sall $16, %eax
+ orl %ecx, %eax
+
+ bsrl %eax, %eax
+ addq %rsi, %rax
+#ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+#endif
+ ret
+
+ .p2align 4
+L(new_match):
+ pxor %xmm6, %xmm6
+ PCMPEQ %xmm9, %xmm6
+ pmovmskb %xmm6, %eax
+ sall $16, %ecx
+ orl %eax, %ecx
+
+ /* We can't reuse either of the old comparisons since we mask
+ off zeros after the first zero (instead of using the full
+ comparison), so we can't guarantee no interference between a
+ match after the end of the string and a valid match. */
+ pmovmskb %xmm4, %eax
+ pmovmskb %xmm7, %edx
+ sall $16, %edx
+ orl %edx, %eax
+
+ leal -1(%ecx), %edx
+ xorl %edx, %ecx
+ andl %ecx, %eax
+ jz L(first_loop_old_match)
+ bsrl %eax, %eax
+ addq %rdi, %rax
+#ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+#endif
+ ret
+
+ /* Save minimum state for getting most recent match. We can
+ throw out all previous work. */
+ .p2align 4
+L(second_loop_match):
+ movq %rdi, %rsi
+ movaps %xmm4, %xmm2
+ movaps %xmm7, %xmm3
+
+ .p2align 4
+L(second_loop):
+ movaps (VEC_SIZE * 2)(%rdi), %xmm4
+ movaps (VEC_SIZE * 3)(%rdi), %xmm5
+ /* Since SSE2 has no pminud, wcsrchr needs separate logic for
+ detecting zero. Note if this is found to be a bottleneck it
+ may be worth adding an SSE4.1 wcsrchr implementation. */
+#ifdef USE_AS_WCSRCHR
+ movaps %xmm5, %xmm6
+ pxor %xmm8, %xmm8
+
+ PCMPEQ %xmm8, %xmm5
+ PCMPEQ %xmm4, %xmm8
+ por %xmm5, %xmm8
+#else
+ movaps %xmm5, %xmm6
+ PMINU %xmm4, %xmm5
+#endif
+
+ movaps %xmm4, %xmm9
+ PCMPEQ %xmm0, %xmm4
+ PCMPEQ %xmm0, %xmm6
+ movaps %xmm6, %xmm7
+ por %xmm4, %xmm6
+#ifndef USE_AS_WCSRCHR
+ pxor %xmm8, %xmm8
+ PCMPEQ %xmm5, %xmm8
#endif
-#include "../strrchr.S"
+ pmovmskb %xmm8, %ecx
+ pmovmskb %xmm6, %eax
+
+ addq $(VEC_SIZE * 2), %rdi
+ /* Either null term or new occurrence of CHAR. */
+ addl %ecx, %eax
+ jz L(second_loop)
+
+ /* No null term so it must be a new occurrence of CHAR. */
+ testl %ecx, %ecx
+ jz L(second_loop_match)
+
+
+ subl %ecx, %eax
+ jnz L(second_loop_new_match)
+
+L(second_loop_old_match):
+ pmovmskb %xmm2, %ecx
+ pmovmskb %xmm3, %eax
+ sall $16, %eax
+ orl %ecx, %eax
+ bsrl %eax, %eax
+ addq %rsi, %rax
+#ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+#endif
+ ret
+
+ .p2align 4
+L(second_loop_new_match):
+ pxor %xmm6, %xmm6
+ PCMPEQ %xmm9, %xmm6
+ pmovmskb %xmm6, %eax
+ sall $16, %ecx
+ orl %eax, %ecx
+
+ /* We can't reuse either of the old comparisons since we mask
+ off zeros after the first zero (instead of using the full
+ comparison), so we can't guarantee no interference between a
+ match after the end of the string and a valid match. */
+ pmovmskb %xmm4, %eax
+ pmovmskb %xmm7, %edx
+ sall $16, %edx
+ orl %edx, %eax
+
+ leal -1(%ecx), %edx
+ xorl %edx, %ecx
+ andl %ecx, %eax
+ jz L(second_loop_old_match)
+ bsrl %eax, %eax
+ addq %rdi, %rax
+#ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+#endif
+ ret
+
+ .p2align 4,, 4
+L(cross_page):
+ movq %rdi, %rsi
+ andq $-VEC_SIZE, %rsi
+ movaps (%rsi), %xmm1
+ pxor %xmm2, %xmm2
+ PCMPEQ %xmm1, %xmm2
+ pmovmskb %xmm2, %edx
+ movl %edi, %ecx
+ andl $(VEC_SIZE - 1), %ecx
+ sarl %cl, %edx
+ jz L(cross_page_continue)
+ PCMPEQ %xmm0, %xmm1
+ pmovmskb %xmm1, %eax
+ sarl %cl, %eax
+ leal -1(%rdx), %ecx
+ xorl %edx, %ecx
+ andl %ecx, %eax
+ jz L(ret1)
+ bsrl %eax, %eax
+ addq %rdi, %rax
+#ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+#endif
+L(ret1):
+ ret
+END(STRRCHR)
diff --git a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
index 69d2f3cdb1..d9259720f8 100644
--- a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
+++ b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
@@ -17,6 +17,12 @@
<https://www.gnu.org/licenses/>. */
#if IS_IN (libc)
-# define STRRCHR __wcsrchr_sse2
+# ifndef STRRCHR
+# define STRRCHR __wcsrchr_sse2
+# endif
#endif
-#include "../wcsrchr.S"
+
+#define USE_AS_WCSRCHR 1
+#define NO_PMINU 1
+
+#include "strrchr-sse2.S"
diff --git a/sysdeps/x86_64/strrchr.S b/sysdeps/x86_64/strrchr.S
index 4d7ba4ceb2..f39da60454 100644
--- a/sysdeps/x86_64/strrchr.S
+++ b/sysdeps/x86_64/strrchr.S
@@ -16,363 +16,7 @@
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-
-#ifndef STRRCHR
-# define STRRCHR strrchr
-#endif
-
-#ifdef USE_AS_WCSRCHR
-# define PCMPEQ pcmpeqd
-# define CHAR_SIZE 4
-# define PMINU pminud
-#else
-# define PCMPEQ pcmpeqb
-# define CHAR_SIZE 1
-# define PMINU pminub
-#endif
-
-#define PAGE_SIZE 4096
-#define VEC_SIZE 16
-
- .text
-ENTRY(STRRCHR)
- movd %esi, %xmm0
- movq %rdi, %rax
- andl $(PAGE_SIZE - 1), %eax
-#ifndef USE_AS_WCSRCHR
- punpcklbw %xmm0, %xmm0
- punpcklwd %xmm0, %xmm0
-#endif
- pshufd $0, %xmm0, %xmm0
- cmpl $(PAGE_SIZE - VEC_SIZE), %eax
- ja L(cross_page)
-
-L(cross_page_continue):
- movups (%rdi), %xmm1
- pxor %xmm2, %xmm2
- PCMPEQ %xmm1, %xmm2
- pmovmskb %xmm2, %ecx
- testl %ecx, %ecx
- jz L(aligned_more)
-
- PCMPEQ %xmm0, %xmm1
- pmovmskb %xmm1, %eax
- leal -1(%rcx), %edx
- xorl %edx, %ecx
- andl %ecx, %eax
- jz L(ret0)
- bsrl %eax, %eax
- addq %rdi, %rax
- /* We are off by 3 for wcsrchr if search CHAR is non-zero. If
- search CHAR is zero we are correct. Either way `andq
- -CHAR_SIZE, %rax` gets the correct result. */
-#ifdef USE_AS_WCSRCHR
- andq $-CHAR_SIZE, %rax
-#endif
-L(ret0):
- ret
-
- /* Returns for first vec x1/x2 have hard coded backward search
- path for earlier matches. */
- .p2align 4
-L(first_vec_x0_test):
- PCMPEQ %xmm0, %xmm1
- pmovmskb %xmm1, %eax
- testl %eax, %eax
- jz L(ret0)
- bsrl %eax, %eax
- addq %r8, %rax
-#ifdef USE_AS_WCSRCHR
- andq $-CHAR_SIZE, %rax
-#endif
- ret
-
- .p2align 4
-L(first_vec_x1):
- PCMPEQ %xmm0, %xmm2
- pmovmskb %xmm2, %eax
- leal -1(%rcx), %edx
- xorl %edx, %ecx
- andl %ecx, %eax
- jz L(first_vec_x0_test)
- bsrl %eax, %eax
- leaq (VEC_SIZE)(%rdi, %rax), %rax
-#ifdef USE_AS_WCSRCHR
- andq $-CHAR_SIZE, %rax
-#endif
- ret
-
- .p2align 4
-L(first_vec_x1_test):
- PCMPEQ %xmm0, %xmm2
- pmovmskb %xmm2, %eax
- testl %eax, %eax
- jz L(first_vec_x0_test)
- bsrl %eax, %eax
- leaq (VEC_SIZE)(%rdi, %rax), %rax
-#ifdef USE_AS_WCSRCHR
- andq $-CHAR_SIZE, %rax
-#endif
- ret
-
- .p2align 4
-L(first_vec_x2):
- PCMPEQ %xmm0, %xmm3
- pmovmskb %xmm3, %eax
- leal -1(%rcx), %edx
- xorl %edx, %ecx
- andl %ecx, %eax
- jz L(first_vec_x1_test)
- bsrl %eax, %eax
- leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
-#ifdef USE_AS_WCSRCHR
- andq $-CHAR_SIZE, %rax
-#endif
- ret
-
- .p2align 4
-L(aligned_more):
- /* Save original pointer if match was in VEC 0. */
- movq %rdi, %r8
- andq $-VEC_SIZE, %rdi
-
- movaps VEC_SIZE(%rdi), %xmm2
- pxor %xmm3, %xmm3
- PCMPEQ %xmm2, %xmm3
- pmovmskb %xmm3, %ecx
- testl %ecx, %ecx
- jnz L(first_vec_x1)
-
- movaps (VEC_SIZE * 2)(%rdi), %xmm3
- pxor %xmm4, %xmm4
- PCMPEQ %xmm3, %xmm4
- pmovmskb %xmm4, %ecx
- testl %ecx, %ecx
- jnz L(first_vec_x2)
-
- addq $VEC_SIZE, %rdi
- /* Save pointer again before realigning. */
- movq %rdi, %rsi
- andq $-(VEC_SIZE * 2), %rdi
- .p2align 4
-L(first_loop):
- /* Do 2x VEC at a time. */
- movaps (VEC_SIZE * 2)(%rdi), %xmm4
- movaps (VEC_SIZE * 3)(%rdi), %xmm5
- /* Since SSE2 no pminud so wcsrchr needs seperate logic for
- detecting zero. Note if this is found to be a bottleneck it
- may be worth adding an SSE4.1 wcsrchr implementation. */
-#ifdef USE_AS_WCSRCHR
- movaps %xmm5, %xmm6
- pxor %xmm8, %xmm8
-
- PCMPEQ %xmm8, %xmm5
- PCMPEQ %xmm4, %xmm8
- por %xmm5, %xmm8
-#else
- movaps %xmm5, %xmm6
- PMINU %xmm4, %xmm5
-#endif
-
- movaps %xmm4, %xmm9
- PCMPEQ %xmm0, %xmm4
- PCMPEQ %xmm0, %xmm6
- movaps %xmm6, %xmm7
- por %xmm4, %xmm6
-#ifndef USE_AS_WCSRCHR
- pxor %xmm8, %xmm8
- PCMPEQ %xmm5, %xmm8
-#endif
- pmovmskb %xmm8, %ecx
- pmovmskb %xmm6, %eax
-
- addq $(VEC_SIZE * 2), %rdi
- /* Use `addl` 1) so we can undo it with `subl` and 2) it can
- macro-fuse with `jz`. */
- addl %ecx, %eax
- jz L(first_loop)
-
- /* Check if there is zero match. */
- testl %ecx, %ecx
- jz L(second_loop_match)
-
- /* Check if there was a match in last iteration. */
- subl %ecx, %eax
- jnz L(new_match)
-
-L(first_loop_old_match):
- PCMPEQ %xmm0, %xmm2
- PCMPEQ %xmm0, %xmm3
- pmovmskb %xmm2, %ecx
- pmovmskb %xmm3, %eax
- addl %eax, %ecx
- jz L(first_vec_x0_test)
- /* NB: We could move this shift to before the branch and save a
- bit of code size / performance on the fall through. The
- branch leads to the null case which generally seems hotter
- than char in first 3x VEC. */
- sall $16, %eax
- orl %ecx, %eax
-
- bsrl %eax, %eax
- addq %rsi, %rax
-#ifdef USE_AS_WCSRCHR
- andq $-CHAR_SIZE, %rax
-#endif
- ret
-
- .p2align 4
-L(new_match):
- pxor %xmm6, %xmm6
- PCMPEQ %xmm9, %xmm6
- pmovmskb %xmm6, %eax
- sall $16, %ecx
- orl %eax, %ecx
-
- /* We can't reuse either of the old comparisons as since we mask
- of zeros after first zero (instead of using the full
- comparison) we can't gurantee no interference between match
- after end of string and valid match. */
- pmovmskb %xmm4, %eax
- pmovmskb %xmm7, %edx
- sall $16, %edx
- orl %edx, %eax
-
- leal -1(%ecx), %edx
- xorl %edx, %ecx
- andl %ecx, %eax
- jz L(first_loop_old_match)
- bsrl %eax, %eax
- addq %rdi, %rax
-#ifdef USE_AS_WCSRCHR
- andq $-CHAR_SIZE, %rax
-#endif
- ret
-
- /* Save minimum state for getting most recent match. We can
- throw out all previous work. */
- .p2align 4
-L(second_loop_match):
- movq %rdi, %rsi
- movaps %xmm4, %xmm2
- movaps %xmm7, %xmm3
-
- .p2align 4
-L(second_loop):
- movaps (VEC_SIZE * 2)(%rdi), %xmm4
- movaps (VEC_SIZE * 3)(%rdi), %xmm5
- /* Since SSE2 no pminud so wcsrchr needs seperate logic for
- detecting zero. Note if this is found to be a bottleneck it
- may be worth adding an SSE4.1 wcsrchr implementation. */
-#ifdef USE_AS_WCSRCHR
- movaps %xmm5, %xmm6
- pxor %xmm8, %xmm8
-
- PCMPEQ %xmm8, %xmm5
- PCMPEQ %xmm4, %xmm8
- por %xmm5, %xmm8
-#else
- movaps %xmm5, %xmm6
- PMINU %xmm4, %xmm5
-#endif
-
- movaps %xmm4, %xmm9
- PCMPEQ %xmm0, %xmm4
- PCMPEQ %xmm0, %xmm6
- movaps %xmm6, %xmm7
- por %xmm4, %xmm6
-#ifndef USE_AS_WCSRCHR
- pxor %xmm8, %xmm8
- PCMPEQ %xmm5, %xmm8
-#endif
-
- pmovmskb %xmm8, %ecx
- pmovmskb %xmm6, %eax
-
- addq $(VEC_SIZE * 2), %rdi
- /* Either null term or new occurence of CHAR. */
- addl %ecx, %eax
- jz L(second_loop)
-
- /* No null term so much be new occurence of CHAR. */
- testl %ecx, %ecx
- jz L(second_loop_match)
-
-
- subl %ecx, %eax
- jnz L(second_loop_new_match)
-
-L(second_loop_old_match):
- pmovmskb %xmm2, %ecx
- pmovmskb %xmm3, %eax
- sall $16, %eax
- orl %ecx, %eax
- bsrl %eax, %eax
- addq %rsi, %rax
-#ifdef USE_AS_WCSRCHR
- andq $-CHAR_SIZE, %rax
-#endif
- ret
-
- .p2align 4
-L(second_loop_new_match):
- pxor %xmm6, %xmm6
- PCMPEQ %xmm9, %xmm6
- pmovmskb %xmm6, %eax
- sall $16, %ecx
- orl %eax, %ecx
-
- /* We can't reuse either of the old comparisons as since we mask
- of zeros after first zero (instead of using the full
- comparison) we can't gurantee no interference between match
- after end of string and valid match. */
- pmovmskb %xmm4, %eax
- pmovmskb %xmm7, %edx
- sall $16, %edx
- orl %edx, %eax
-
- leal -1(%ecx), %edx
- xorl %edx, %ecx
- andl %ecx, %eax
- jz L(second_loop_old_match)
- bsrl %eax, %eax
- addq %rdi, %rax
-#ifdef USE_AS_WCSRCHR
- andq $-CHAR_SIZE, %rax
-#endif
- ret
-
- .p2align 4,, 4
-L(cross_page):
- movq %rdi, %rsi
- andq $-VEC_SIZE, %rsi
- movaps (%rsi), %xmm1
- pxor %xmm2, %xmm2
- PCMPEQ %xmm1, %xmm2
- pmovmskb %xmm2, %edx
- movl %edi, %ecx
- andl $(VEC_SIZE - 1), %ecx
- sarl %cl, %edx
- jz L(cross_page_continue)
- PCMPEQ %xmm0, %xmm1
- pmovmskb %xmm1, %eax
- sarl %cl, %eax
- leal -1(%rdx), %ecx
- xorl %edx, %ecx
- andl %ecx, %eax
- jz L(ret1)
- bsrl %eax, %eax
- addq %rdi, %rax
-#ifdef USE_AS_WCSRCHR
- andq $-CHAR_SIZE, %rax
-#endif
-L(ret1):
- ret
-END(STRRCHR)
-
-#ifndef USE_AS_WCSRCHR
- weak_alias (STRRCHR, rindex)
- libc_hidden_builtin_def (STRRCHR)
-#endif
+#define STRRCHR strrchr
+#include "multiarch/strrchr-sse2.S"
+weak_alias (strrchr, rindex)
+libc_hidden_builtin_def (strrchr)
diff --git a/sysdeps/x86_64/wcsrchr.S b/sysdeps/x86_64/wcsrchr.S
index 2b80efc5ef..1d4b1eb21c 100644
--- a/sysdeps/x86_64/wcsrchr.S
+++ b/sysdeps/x86_64/wcsrchr.S
@@ -16,12 +16,5 @@
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
-
-#define USE_AS_WCSRCHR 1
-#define NO_PMINU 1
-
-#ifndef STRRCHR
-# define STRRCHR wcsrchr
-#endif
-
-#include "../strrchr.S"
+#define STRRCHR wcsrchr
+#include "multiarch/wcsrchr-sse2.S"
--
2.34.1
^ permalink raw reply [flat|nested] 21+ messages in thread
* [PATCH v1] x86: Move strchr SSE2 implementation to multiarch/strchr-sse2.S
2022-07-12 19:29 [PATCH v1] x86: Move strlen SSE2 implementation to multiarch/strlen-sse2.S Noah Goldstein
` (2 preceding siblings ...)
2022-07-12 19:29 ` [PATCH v1] x86: Move strrchr SSE2 implementation to multiarch/strrchr-sse2.S Noah Goldstein
@ 2022-07-12 19:29 ` Noah Goldstein
2022-07-12 21:27 ` H.J. Lu
2022-07-12 19:29 ` [PATCH v1] x86: Move strcat SSE2 implementation to multiarch/strcat-sse2.S Noah Goldstein
` (5 subsequent siblings)
9 siblings, 1 reply; 21+ messages in thread
From: Noah Goldstein @ 2022-07-12 19:29 UTC (permalink / raw)
To: libc-alpha
This commit doesn't affect libc.so.6; it's just housekeeping to prepare
for adding explicit ISA level support.
Tested build on x86_64 and x86_32 with/without multiarch.
---
sysdeps/x86_64/multiarch/rtld-strchr.S | 18 +++
sysdeps/x86_64/multiarch/rtld-strchrnul.S | 18 +++
sysdeps/x86_64/multiarch/strchr-sse2.S | 175 +++++++++++++++++++++-
sysdeps/x86_64/multiarch/strchrnul-sse2.S | 11 +-
sysdeps/x86_64/strchr.S | 167 +--------------------
sysdeps/x86_64/strchrnul.S | 7 +-
6 files changed, 213 insertions(+), 183 deletions(-)
create mode 100644 sysdeps/x86_64/multiarch/rtld-strchr.S
create mode 100644 sysdeps/x86_64/multiarch/rtld-strchrnul.S
diff --git a/sysdeps/x86_64/multiarch/rtld-strchr.S b/sysdeps/x86_64/multiarch/rtld-strchr.S
new file mode 100644
index 0000000000..2b7b879e37
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/rtld-strchr.S
@@ -0,0 +1,18 @@
+/* Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "../strchr.S"
diff --git a/sysdeps/x86_64/multiarch/rtld-strchrnul.S b/sysdeps/x86_64/multiarch/rtld-strchrnul.S
new file mode 100644
index 0000000000..0cc5becc88
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/rtld-strchrnul.S
@@ -0,0 +1,18 @@
+/* Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include "../strchrnul.S"
diff --git a/sysdeps/x86_64/multiarch/strchr-sse2.S b/sysdeps/x86_64/multiarch/strchr-sse2.S
index 992f700077..f7767ca543 100644
--- a/sysdeps/x86_64/multiarch/strchr-sse2.S
+++ b/sysdeps/x86_64/multiarch/strchr-sse2.S
@@ -16,13 +16,172 @@
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
-#if IS_IN (libc)
-# define strchr __strchr_sse2
+#if IS_IN (libc) || defined STRCHR
+# ifndef STRCHR
+# define STRCHR __strchr_sse2
+# endif
-# undef weak_alias
-# define weak_alias(strchr, index)
-# undef libc_hidden_builtin_def
-# define libc_hidden_builtin_def(strchr)
-#endif
+# include <sysdep.h>
+
+ .text
+ENTRY (STRCHR)
+ movd %esi, %xmm1
+ movl %edi, %eax
+ andl $4095, %eax
+ punpcklbw %xmm1, %xmm1
+ cmpl $4032, %eax
+ punpcklwd %xmm1, %xmm1
+ pshufd $0, %xmm1, %xmm1
+ jg L(cross_page)
+ movdqu (%rdi), %xmm0
+ pxor %xmm3, %xmm3
+ movdqa %xmm0, %xmm4
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm3, %xmm4
+ por %xmm4, %xmm0
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
+ je L(next_48_bytes)
+ bsf %eax, %eax
+# ifdef AS_STRCHRNUL
+ leaq (%rdi,%rax), %rax
+# else
+ movl $0, %edx
+ leaq (%rdi,%rax), %rax
+ cmpb %sil, (%rax)
+ cmovne %rdx, %rax
+# endif
+ ret
+
+ .p2align 3
+L(next_48_bytes):
+ movdqu 16(%rdi), %xmm0
+ movdqa %xmm0, %xmm4
+ pcmpeqb %xmm1, %xmm0
+ pcmpeqb %xmm3, %xmm4
+ por %xmm4, %xmm0
+ pmovmskb %xmm0, %ecx
+ movdqu 32(%rdi), %xmm0
+ movdqa %xmm0, %xmm4
+ pcmpeqb %xmm1, %xmm0
+ salq $16, %rcx
+ pcmpeqb %xmm3, %xmm4
+ por %xmm4, %xmm0
+ pmovmskb %xmm0, %eax
+ movdqu 48(%rdi), %xmm0
+ pcmpeqb %xmm0, %xmm3
+ salq $32, %rax
+ pcmpeqb %xmm1, %xmm0
+ orq %rcx, %rax
+ por %xmm3, %xmm0
+ pmovmskb %xmm0, %ecx
+ salq $48, %rcx
+ orq %rcx, %rax
+ testq %rax, %rax
+ jne L(return)
+L(loop_start):
+ /* We use this alignment to force the loop to be aligned to 8 but
+ not 16 bytes. This gives better scheduling on AMD processors. */
+ .p2align 4
+ pxor %xmm6, %xmm6
+ andq $-64, %rdi
+ .p2align 3
+L(loop64):
+ addq $64, %rdi
+ movdqa (%rdi), %xmm5
+ movdqa 16(%rdi), %xmm2
+ movdqa 32(%rdi), %xmm3
+ pxor %xmm1, %xmm5
+ movdqa 48(%rdi), %xmm4
+ pxor %xmm1, %xmm2
+ pxor %xmm1, %xmm3
+ pminub (%rdi), %xmm5
+ pxor %xmm1, %xmm4
+ pminub 16(%rdi), %xmm2
+ pminub 32(%rdi), %xmm3
+ pminub %xmm2, %xmm5
+ pminub 48(%rdi), %xmm4
+ pminub %xmm3, %xmm5
+ pminub %xmm4, %xmm5
+ pcmpeqb %xmm6, %xmm5
+ pmovmskb %xmm5, %eax
+
+ testl %eax, %eax
+ je L(loop64)
-#include "../strchr.S"
+ movdqa (%rdi), %xmm5
+ movdqa %xmm5, %xmm0
+ pcmpeqb %xmm1, %xmm5
+ pcmpeqb %xmm6, %xmm0
+ por %xmm0, %xmm5
+ pcmpeqb %xmm6, %xmm2
+ pcmpeqb %xmm6, %xmm3
+ pcmpeqb %xmm6, %xmm4
+
+ pmovmskb %xmm5, %ecx
+ pmovmskb %xmm2, %eax
+ salq $16, %rax
+ pmovmskb %xmm3, %r8d
+ pmovmskb %xmm4, %edx
+ salq $32, %r8
+ orq %r8, %rax
+ orq %rcx, %rax
+ salq $48, %rdx
+ orq %rdx, %rax
+ .p2align 3
+L(return):
+ bsfq %rax, %rax
+# ifdef AS_STRCHRNUL
+ leaq (%rdi,%rax), %rax
+# else
+ movl $0, %edx
+ leaq (%rdi,%rax), %rax
+ cmpb %sil, (%rax)
+ cmovne %rdx, %rax
+# endif
+ ret
+ .p2align 4
+
+L(cross_page):
+ movq %rdi, %rdx
+ pxor %xmm2, %xmm2
+ andq $-64, %rdx
+ movdqa %xmm1, %xmm0
+ movdqa (%rdx), %xmm3
+ movdqa %xmm3, %xmm4
+ pcmpeqb %xmm1, %xmm3
+ pcmpeqb %xmm2, %xmm4
+ por %xmm4, %xmm3
+ pmovmskb %xmm3, %r8d
+ movdqa 16(%rdx), %xmm3
+ movdqa %xmm3, %xmm4
+ pcmpeqb %xmm1, %xmm3
+ pcmpeqb %xmm2, %xmm4
+ por %xmm4, %xmm3
+ pmovmskb %xmm3, %eax
+ movdqa 32(%rdx), %xmm3
+ movdqa %xmm3, %xmm4
+ pcmpeqb %xmm1, %xmm3
+ salq $16, %rax
+ pcmpeqb %xmm2, %xmm4
+ por %xmm4, %xmm3
+ pmovmskb %xmm3, %r9d
+ movdqa 48(%rdx), %xmm3
+ pcmpeqb %xmm3, %xmm2
+ salq $32, %r9
+ pcmpeqb %xmm3, %xmm0
+ orq %r9, %rax
+ orq %r8, %rax
+ por %xmm2, %xmm0
+ pmovmskb %xmm0, %ecx
+ salq $48, %rcx
+ orq %rcx, %rax
+ movl %edi, %ecx
+ subb %dl, %cl
+ shrq %cl, %rax
+ testq %rax, %rax
+ jne L(return)
+ jmp L(loop_start)
+
+END (STRCHR)
+#endif
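(Two details of the moved code above, sketched in C -- illustrative
only, not glibc source.  The entry check reads 64 bytes from an
arbitrary pointer only when that span cannot cross a page, and the
shared strchr/strchrnul tail turns the stop byte into the return
value:

#include <stdint.h>
#include <stddef.h>

/* The andl $4095 / cmpl $4032 / jg guard: a 64-byte read starting at
   S is safe only when S is at most 4096 - 64 bytes into its page.  */
static int
safe_to_read_64 (const char *s)
{
  return ((uintptr_t) s & 4095) <= 4096 - 64;
}

/* The cmovne tail: the scan stops at the first byte equal to C or to
   the terminator; strchrnul returns that pointer directly, strchr
   substitutes NULL when the stop byte was the terminator.  */
static char *
strchr_tail (char *stop, int c, int as_strchrnul)
{
  return (as_strchrnul || *stop == (char) c) ? stop : NULL;
})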
diff --git a/sysdeps/x86_64/multiarch/strchrnul-sse2.S b/sysdeps/x86_64/multiarch/strchrnul-sse2.S
index f91c670369..7238977a21 100644
--- a/sysdeps/x86_64/multiarch/strchrnul-sse2.S
+++ b/sysdeps/x86_64/multiarch/strchrnul-sse2.S
@@ -17,10 +17,11 @@
<https://www.gnu.org/licenses/>. */
#if IS_IN (libc)
-# define __strchrnul __strchrnul_sse2
-
-# undef weak_alias
-# define weak_alias(__strchrnul, strchrnul)
+# ifndef STRCHR
+# define STRCHR __strchrnul_sse2
+# endif
#endif
-#include "../strchrnul.S"
+#define AS_STRCHRNUL
+
+#include "strchr-sse2.S"
diff --git a/sysdeps/x86_64/strchr.S b/sysdeps/x86_64/strchr.S
index dda7c0431d..77c956c92c 100644
--- a/sysdeps/x86_64/strchr.S
+++ b/sysdeps/x86_64/strchr.S
@@ -17,171 +17,8 @@
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
-#include <sysdep.h>
- .text
-ENTRY (strchr)
- movd %esi, %xmm1
- movl %edi, %eax
- andl $4095, %eax
- punpcklbw %xmm1, %xmm1
- cmpl $4032, %eax
- punpcklwd %xmm1, %xmm1
- pshufd $0, %xmm1, %xmm1
- jg L(cross_page)
- movdqu (%rdi), %xmm0
- pxor %xmm3, %xmm3
- movdqa %xmm0, %xmm4
- pcmpeqb %xmm1, %xmm0
- pcmpeqb %xmm3, %xmm4
- por %xmm4, %xmm0
- pmovmskb %xmm0, %eax
- test %eax, %eax
- je L(next_48_bytes)
- bsf %eax, %eax
-#ifdef AS_STRCHRNUL
- leaq (%rdi,%rax), %rax
-#else
- movl $0, %edx
- leaq (%rdi,%rax), %rax
- cmpb %sil, (%rax)
- cmovne %rdx, %rax
-#endif
- ret
-
- .p2align 3
- L(next_48_bytes):
- movdqu 16(%rdi), %xmm0
- movdqa %xmm0, %xmm4
- pcmpeqb %xmm1, %xmm0
- pcmpeqb %xmm3, %xmm4
- por %xmm4, %xmm0
- pmovmskb %xmm0, %ecx
- movdqu 32(%rdi), %xmm0
- movdqa %xmm0, %xmm4
- pcmpeqb %xmm1, %xmm0
- salq $16, %rcx
- pcmpeqb %xmm3, %xmm4
- por %xmm4, %xmm0
- pmovmskb %xmm0, %eax
- movdqu 48(%rdi), %xmm0
- pcmpeqb %xmm0, %xmm3
- salq $32, %rax
- pcmpeqb %xmm1, %xmm0
- orq %rcx, %rax
- por %xmm3, %xmm0
- pmovmskb %xmm0, %ecx
- salq $48, %rcx
- orq %rcx, %rax
- testq %rax, %rax
- jne L(return)
-L(loop_start):
- /* We use this alignment to force loop be aligned to 8 but not
- 16 bytes. This gives better sheduling on AMD processors. */
- .p2align 4
- pxor %xmm6, %xmm6
- andq $-64, %rdi
- .p2align 3
-L(loop64):
- addq $64, %rdi
- movdqa (%rdi), %xmm5
- movdqa 16(%rdi), %xmm2
- movdqa 32(%rdi), %xmm3
- pxor %xmm1, %xmm5
- movdqa 48(%rdi), %xmm4
- pxor %xmm1, %xmm2
- pxor %xmm1, %xmm3
- pminub (%rdi), %xmm5
- pxor %xmm1, %xmm4
- pminub 16(%rdi), %xmm2
- pminub 32(%rdi), %xmm3
- pminub %xmm2, %xmm5
- pminub 48(%rdi), %xmm4
- pminub %xmm3, %xmm5
- pminub %xmm4, %xmm5
- pcmpeqb %xmm6, %xmm5
- pmovmskb %xmm5, %eax
-
- testl %eax, %eax
- je L(loop64)
-
- movdqa (%rdi), %xmm5
- movdqa %xmm5, %xmm0
- pcmpeqb %xmm1, %xmm5
- pcmpeqb %xmm6, %xmm0
- por %xmm0, %xmm5
- pcmpeqb %xmm6, %xmm2
- pcmpeqb %xmm6, %xmm3
- pcmpeqb %xmm6, %xmm4
-
- pmovmskb %xmm5, %ecx
- pmovmskb %xmm2, %eax
- salq $16, %rax
- pmovmskb %xmm3, %r8d
- pmovmskb %xmm4, %edx
- salq $32, %r8
- orq %r8, %rax
- orq %rcx, %rax
- salq $48, %rdx
- orq %rdx, %rax
- .p2align 3
-L(return):
- bsfq %rax, %rax
-#ifdef AS_STRCHRNUL
- leaq (%rdi,%rax), %rax
-#else
- movl $0, %edx
- leaq (%rdi,%rax), %rax
- cmpb %sil, (%rax)
- cmovne %rdx, %rax
-#endif
- ret
- .p2align 4
-
-L(cross_page):
- movq %rdi, %rdx
- pxor %xmm2, %xmm2
- andq $-64, %rdx
- movdqa %xmm1, %xmm0
- movdqa (%rdx), %xmm3
- movdqa %xmm3, %xmm4
- pcmpeqb %xmm1, %xmm3
- pcmpeqb %xmm2, %xmm4
- por %xmm4, %xmm3
- pmovmskb %xmm3, %r8d
- movdqa 16(%rdx), %xmm3
- movdqa %xmm3, %xmm4
- pcmpeqb %xmm1, %xmm3
- pcmpeqb %xmm2, %xmm4
- por %xmm4, %xmm3
- pmovmskb %xmm3, %eax
- movdqa 32(%rdx), %xmm3
- movdqa %xmm3, %xmm4
- pcmpeqb %xmm1, %xmm3
- salq $16, %rax
- pcmpeqb %xmm2, %xmm4
- por %xmm4, %xmm3
- pmovmskb %xmm3, %r9d
- movdqa 48(%rdx), %xmm3
- pcmpeqb %xmm3, %xmm2
- salq $32, %r9
- pcmpeqb %xmm3, %xmm0
- orq %r9, %rax
- orq %r8, %rax
- por %xmm2, %xmm0
- pmovmskb %xmm0, %ecx
- salq $48, %rcx
- orq %rcx, %rax
- movl %edi, %ecx
- subb %dl, %cl
- shrq %cl, %rax
- testq %rax, %rax
- jne L(return)
- jmp L(loop_start)
-
-END (strchr)
-
-#ifndef AS_STRCHRNUL
+#define STRCHR strchr
+#include "multiarch/strchr-sse2.S"
weak_alias (strchr, index)
libc_hidden_builtin_def (strchr)
-#endif
diff --git a/sysdeps/x86_64/strchrnul.S b/sysdeps/x86_64/strchrnul.S
index ec2e652e25..508e42db26 100644
--- a/sysdeps/x86_64/strchrnul.S
+++ b/sysdeps/x86_64/strchrnul.S
@@ -18,10 +18,7 @@
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
-#include <sysdep.h>
-
-#define strchr __strchrnul
-#define AS_STRCHRNUL
-#include "strchr.S"
+#define STRCHR __strchrnul
+#include "multiarch/strchrnul-sse2.S"
weak_alias (__strchrnul, strchrnul)
--
2.34.1
^ permalink raw reply [flat|nested] 21+ messages in thread
* [PATCH v1] x86: Move strcat SSE2 implementation to multiarch/strcat-sse2.S
2022-07-12 19:29 [PATCH v1] x86: Move strlen SSE2 implementation to multiarch/strlen-sse2.S Noah Goldstein
` (3 preceding siblings ...)
2022-07-12 19:29 ` [PATCH v1] x86: Move strchr SSE2 implementation to multiarch/strchr-sse2.S Noah Goldstein
@ 2022-07-12 19:29 ` Noah Goldstein
2022-07-12 21:16 ` H.J. Lu
2022-07-12 19:29 ` [PATCH v1] x86: Move wcschr SSE2 implementation to multiarch/wcschr-sse2.S Noah Goldstein
` (4 subsequent siblings)
9 siblings, 1 reply; 21+ messages in thread
From: Noah Goldstein @ 2022-07-12 19:29 UTC (permalink / raw)
To: libc-alpha
This commit doesn't affect libc.so.6; it's just housekeeping to prepare
for adding explicit ISA level support.
Tested build on x86_64 and x86_32 with/without multiarch.
---
sysdeps/x86_64/multiarch/strcat-sse2.S | 242 ++++++++++++++++++++++++-
sysdeps/x86_64/strcat.S | 239 +-----------------------
2 files changed, 238 insertions(+), 243 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/strcat-sse2.S b/sysdeps/x86_64/multiarch/strcat-sse2.S
index 449e102438..244c4a6d74 100644
--- a/sysdeps/x86_64/multiarch/strcat-sse2.S
+++ b/sysdeps/x86_64/multiarch/strcat-sse2.S
@@ -17,12 +17,242 @@
<https://www.gnu.org/licenses/>. */
#if IS_IN (libc)
+# ifndef STRCAT
+# define STRCAT __strcat_sse2
+# endif
+#endif
-# include <sysdep.h>
-# define strcat __strcat_sse2
+#include <sysdep.h>
+
+ .text
+ENTRY (STRCAT)
+ movq %rdi, %rcx /* Dest. register. */
+ andl $7, %ecx /* mask alignment bits */
+ movq %rdi, %rax /* Duplicate destination pointer. */
+ movq $0xfefefefefefefeff,%r8
+
+ /* First step: Find end of destination. */
+ jz 4f /* aligned => start loop */
+
+ neg %ecx /* We need to align to 8 bytes. */
+ addl $8,%ecx
+ /* Search the first bytes directly. */
+0: cmpb $0x0,(%rax) /* is byte NUL? */
+ je 2f /* yes => start copy */
+ incq %rax /* increment pointer */
+ decl %ecx
+ jnz 0b
+
+
+
+ /* Now the source is aligned. Scan for NUL byte. */
+ .p2align 4
+4:
+ /* First unroll. */
+ movq (%rax), %rcx /* get double word (= 8 bytes) in question */
+ addq $8,%rax /* adjust pointer for next word */
+ movq %r8, %rdx /* magic value */
+ addq %rcx, %rdx /* add the magic value to the word. We get
+ carry bits reported for each byte which
+ is *not* 0 */
+ jnc 3f /* highest byte is NUL => return pointer */
+ xorq %rcx, %rdx /* (word+magic)^word */
+ orq %r8, %rdx /* set all non-carry bits */
+ incq %rdx /* add 1: if one carry bit was *not* set
+ the addition will not result in 0. */
+ jnz 3f /* found NUL => return pointer */
+
+ /* Second unroll. */
+ movq (%rax), %rcx /* get double word (= 8 bytes) in question */
+ addq $8,%rax /* adjust pointer for next word */
+ movq %r8, %rdx /* magic value */
+ addq %rcx, %rdx /* add the magic value to the word. We get
+ carry bits reported for each byte which
+ is *not* 0 */
+ jnc 3f /* highest byte is NUL => return pointer */
+ xorq %rcx, %rdx /* (word+magic)^word */
+ orq %r8, %rdx /* set all non-carry bits */
+ incq %rdx /* add 1: if one carry bit was *not* set
+ the addition will not result in 0. */
+ jnz 3f /* found NUL => return pointer */
+
+ /* Third unroll. */
+ movq (%rax), %rcx /* get double word (= 8 bytes) in question */
+ addq $8,%rax /* adjust pointer for next word */
+ movq %r8, %rdx /* magic value */
+ addq %rcx, %rdx /* add the magic value to the word. We get
+ carry bits reported for each byte which
+ is *not* 0 */
+ jnc 3f /* highest byte is NUL => return pointer */
+ xorq %rcx, %rdx /* (word+magic)^word */
+ orq %r8, %rdx /* set all non-carry bits */
+ incq %rdx /* add 1: if one carry bit was *not* set
+ the addition will not result in 0. */
+ jnz 3f /* found NUL => return pointer */
+
+ /* Fourth unroll. */
+ movq (%rax), %rcx /* get double word (= 8 bytes) in question */
+ addq $8,%rax /* adjust pointer for next word */
+ movq %r8, %rdx /* magic value */
+ addq %rcx, %rdx /* add the magic value to the word. We get
+ carry bits reported for each byte which
+ is *not* 0 */
+ jnc 3f /* highest byte is NUL => return pointer */
+ xorq %rcx, %rdx /* (word+magic)^word */
+ orq %r8, %rdx /* set all non-carry bits */
+ incq %rdx /* add 1: if one carry bit was *not* set
+ the addition will not result in 0. */
+ jz 4b /* no NUL found => continue loop */
+
+ .p2align 4 /* Align, it's a jump target. */
+3: subq $8,%rax /* correct pointer increment. */
+
+ testb %cl, %cl /* is first byte NUL? */
+ jz 2f /* yes => return */
+ incq %rax /* increment pointer */
+
+ testb %ch, %ch /* is second byte NUL? */
+ jz 2f /* yes => return */
+ incq %rax /* increment pointer */
+
+ testl $0x00ff0000, %ecx /* is third byte NUL? */
+ jz 2f /* yes => return pointer */
+ incq %rax /* increment pointer */
+
+ testl $0xff000000, %ecx /* is fourth byte NUL? */
+ jz 2f /* yes => return pointer */
+ incq %rax /* increment pointer */
+
+ shrq $32, %rcx /* look at other half. */
+
+ testb %cl, %cl /* is first byte NUL? */
+ jz 2f /* yes => return */
+ incq %rax /* increment pointer */
+
+ testb %ch, %ch /* is second byte NUL? */
+ jz 2f /* yes => return */
+ incq %rax /* increment pointer */
+
+ testl $0xff0000, %ecx /* is third byte NUL? */
+ jz 2f /* yes => return pointer */
+ incq %rax /* increment pointer */
+
+2:
+ /* Second step: Copy source to destination. */
+
+ movq %rsi, %rcx /* duplicate */
+ andl $7,%ecx /* mask alignment bits */
+ movq %rax, %rdx /* move around */
+ jz 22f /* aligned => start loop */
+
+ neg %ecx /* align to 8 bytes. */
+ addl $8, %ecx
+ /* Align the source pointer. */
+21:
+ movb (%rsi), %al /* Fetch a byte */
+ testb %al, %al /* Is it NUL? */
+ movb %al, (%rdx) /* Store it */
+ jz 24f /* If it was NUL, done! */
+ incq %rsi
+ incq %rdx
+ decl %ecx
+ jnz 21b
+
+ /* Now the source is aligned. Unfortunately we cannot force
+ both source and destination to be aligned, so ignore the
+ alignment of the destination. */
+ .p2align 4
+22:
+ /* 1st unroll. */
+ movq (%rsi), %rax /* Read double word (8 bytes). */
+ addq $8, %rsi /* Adjust pointer for next word. */
+ movq %rax, %r9 /* Save a copy for NUL finding. */
+ addq %r8, %r9 /* add the magic value to the word. We get
+ carry bits reported for each byte which
+ is *not* 0 */
+ jnc 23f /* highest byte is NUL => return pointer */
+ xorq %rax, %r9 /* (word+magic)^word */
+ orq %r8, %r9 /* set all non-carry bits */
+ incq %r9 /* add 1: if one carry bit was *not* set
+ the addition will not result in 0. */
+
+ jnz 23f /* found NUL => return pointer */
+
+ movq %rax, (%rdx) /* Write value to destination. */
+ addq $8, %rdx /* Adjust pointer. */
+
+ /* 2nd unroll. */
+ movq (%rsi), %rax /* Read double word (8 bytes). */
+ addq $8, %rsi /* Adjust pointer for next word. */
+ movq %rax, %r9 /* Save a copy for NUL finding. */
+ addq %r8, %r9 /* add the magic value to the word. We get
+ carry bits reported for each byte which
+ is *not* 0 */
+ jnc 23f /* highest byte is NUL => return pointer */
+ xorq %rax, %r9 /* (word+magic)^word */
+ orq %r8, %r9 /* set all non-carry bits */
+ incq %r9 /* add 1: if one carry bit was *not* set
+ the addition will not result in 0. */
+
+ jnz 23f /* found NUL => return pointer */
+
+ movq %rax, (%rdx) /* Write value to destination. */
+ addq $8, %rdx /* Adjust pointer. */
+
+ /* 3rd unroll. */
+ movq (%rsi), %rax /* Read double word (8 bytes). */
+ addq $8, %rsi /* Adjust pointer for next word. */
+ movq %rax, %r9 /* Save a copy for NUL finding. */
+ addq %r8, %r9 /* add the magic value to the word. We get
+ carry bits reported for each byte which
+ is *not* 0 */
+ jnc 23f /* highest byte is NUL => return pointer */
+ xorq %rax, %r9 /* (word+magic)^word */
+ orq %r8, %r9 /* set all non-carry bits */
+ incq %r9 /* add 1: if one carry bit was *not* set
+ the addition will not result in 0. */
+
+ jnz 23f /* found NUL => return pointer */
+
+ movq %rax, (%rdx) /* Write value to destination. */
+ addq $8, %rdx /* Adjust pointer. */
+
+ /* 4th unroll. */
+ movq (%rsi), %rax /* Read double word (8 bytes). */
+ addq $8, %rsi /* Adjust pointer for next word. */
+ movq %rax, %r9 /* Save a copy for NUL finding. */
+ addq %r8, %r9 /* add the magic value to the word. We get
+ carry bits reported for each byte which
+ is *not* 0 */
+ jnc 23f /* highest byte is NUL => return pointer */
+ xorq %rax, %r9 /* (word+magic)^word */
+ orq %r8, %r9 /* set all non-carry bits */
+ incq %r9 /* add 1: if one carry bit was *not* set
+ the addition will not result in 0. */
+
+ jnz 23f /* found NUL => return pointer */
+
+ movq %rax, (%rdx) /* Write value to destination. */
+ addq $8, %rdx /* Adjust pointer. */
+ jmp 22b /* Next iteration. */
+
+ /* Do the last few bytes. %rax contains the value to write.
+ The loop is unrolled twice. */
+ .p2align 4
+23:
+ movb %al, (%rdx) /* 1st byte. */
+ testb %al, %al /* Is it NUL? */
+ jz 24f /* yes, finish. */
+ incq %rdx /* Increment destination. */
+ movb %ah, (%rdx) /* 2nd byte. */
+ testb %ah, %ah /* Is it NUL? */
+ jz 24f /* yes, finish. */
+ incq %rdx /* Increment destination. */
+ shrq $16, %rax /* Shift... */
+ jmp 23b /* and look at next two bytes in %rax. */
-# undef libc_hidden_builtin_def
-# define libc_hidden_builtin_def(strcat)
-#endif
-#include <sysdeps/x86_64/strcat.S>
+24:
+ movq %rdi, %rax /* Source is return value. */
+ retq
+END (STRCAT)
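(The 0xfefefefefefefeff magic-value comments above describe a
carry-propagation test for a zero byte in an 8-byte word.  A C sketch
of the same test, following the reasoning in those comments --
illustrative only, not glibc source:

#include <stdint.h>

static int
has_zero_byte (uint64_t word)
{
  const uint64_t magic = 0xfefefefefefefeffULL;
  uint64_t sum = word + magic;

  /* Adding MAGIC makes a carry ripple out of every non-zero byte.
     No carry out of bit 63 (the jnc path) means the chain died, so
     some byte of WORD must be zero.  */
  if (sum >= word)
    return 1;

  /* A carry made it out, but it may have been regenerated past a
     zero byte.  (sum ^ word) holds each byte's carry-in at bit
     positions 8, 16, ..., 56; OR with MAGIC and add 1: the result is
     zero only when every internal carry was present, i.e. every byte
     of WORD was non-zero (the xor/or/inc path).  */
  return (((sum ^ word) | magic) + 1) != 0;
}

The payoff of this word-at-a-time test is one branch per 8 bytes
instead of one per byte in the scan and copy loops.)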
diff --git a/sysdeps/x86_64/strcat.S b/sysdeps/x86_64/strcat.S
index 565a9c785a..fc3e8a9bcf 100644
--- a/sysdeps/x86_64/strcat.S
+++ b/sysdeps/x86_64/strcat.S
@@ -17,241 +17,6 @@
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
-#include <sysdep.h>
-#include "asm-syntax.h"
-
-/* Will be removed when new strcpy implementation gets merged. */
-
- .text
-ENTRY (strcat)
- movq %rdi, %rcx /* Dest. register. */
- andl $7, %ecx /* mask alignment bits */
- movq %rdi, %rax /* Duplicate destination pointer. */
- movq $0xfefefefefefefeff,%r8
-
- /* First step: Find end of destination. */
- jz 4f /* aligned => start loop */
-
- neg %ecx /* We need to align to 8 bytes. */
- addl $8,%ecx
- /* Search the first bytes directly. */
-0: cmpb $0x0,(%rax) /* is byte NUL? */
- je 2f /* yes => start copy */
- incq %rax /* increment pointer */
- decl %ecx
- jnz 0b
-
-
-
- /* Now the source is aligned. Scan for NUL byte. */
- .p2align 4
-4:
- /* First unroll. */
- movq (%rax), %rcx /* get double word (= 8 bytes) in question */
- addq $8,%rax /* adjust pointer for next word */
- movq %r8, %rdx /* magic value */
- addq %rcx, %rdx /* add the magic value to the word. We get
- carry bits reported for each byte which
- is *not* 0 */
- jnc 3f /* highest byte is NUL => return pointer */
- xorq %rcx, %rdx /* (word+magic)^word */
- orq %r8, %rdx /* set all non-carry bits */
- incq %rdx /* add 1: if one carry bit was *not* set
- the addition will not result in 0. */
- jnz 3f /* found NUL => return pointer */
-
- /* Second unroll. */
- movq (%rax), %rcx /* get double word (= 8 bytes) in question */
- addq $8,%rax /* adjust pointer for next word */
- movq %r8, %rdx /* magic value */
- addq %rcx, %rdx /* add the magic value to the word. We get
- carry bits reported for each byte which
- is *not* 0 */
- jnc 3f /* highest byte is NUL => return pointer */
- xorq %rcx, %rdx /* (word+magic)^word */
- orq %r8, %rdx /* set all non-carry bits */
- incq %rdx /* add 1: if one carry bit was *not* set
- the addition will not result in 0. */
- jnz 3f /* found NUL => return pointer */
-
- /* Third unroll. */
- movq (%rax), %rcx /* get double word (= 8 bytes) in question */
- addq $8,%rax /* adjust pointer for next word */
- movq %r8, %rdx /* magic value */
- addq %rcx, %rdx /* add the magic value to the word. We get
- carry bits reported for each byte which
- is *not* 0 */
- jnc 3f /* highest byte is NUL => return pointer */
- xorq %rcx, %rdx /* (word+magic)^word */
- orq %r8, %rdx /* set all non-carry bits */
- incq %rdx /* add 1: if one carry bit was *not* set
- the addition will not result in 0. */
- jnz 3f /* found NUL => return pointer */
-
- /* Fourth unroll. */
- movq (%rax), %rcx /* get double word (= 8 bytes) in question */
- addq $8,%rax /* adjust pointer for next word */
- movq %r8, %rdx /* magic value */
- addq %rcx, %rdx /* add the magic value to the word. We get
- carry bits reported for each byte which
- is *not* 0 */
- jnc 3f /* highest byte is NUL => return pointer */
- xorq %rcx, %rdx /* (word+magic)^word */
- orq %r8, %rdx /* set all non-carry bits */
- incq %rdx /* add 1: if one carry bit was *not* set
- the addition will not result in 0. */
- jz 4b /* no NUL found => continue loop */
-
- .p2align 4 /* Align, it's a jump target. */
-3: subq $8,%rax /* correct pointer increment. */
-
- testb %cl, %cl /* is first byte NUL? */
- jz 2f /* yes => return */
- incq %rax /* increment pointer */
-
- testb %ch, %ch /* is second byte NUL? */
- jz 2f /* yes => return */
- incq %rax /* increment pointer */
-
- testl $0x00ff0000, %ecx /* is third byte NUL? */
- jz 2f /* yes => return pointer */
- incq %rax /* increment pointer */
-
- testl $0xff000000, %ecx /* is fourth byte NUL? */
- jz 2f /* yes => return pointer */
- incq %rax /* increment pointer */
-
- shrq $32, %rcx /* look at other half. */
-
- testb %cl, %cl /* is first byte NUL? */
- jz 2f /* yes => return */
- incq %rax /* increment pointer */
-
- testb %ch, %ch /* is second byte NUL? */
- jz 2f /* yes => return */
- incq %rax /* increment pointer */
-
- testl $0xff0000, %ecx /* is third byte NUL? */
- jz 2f /* yes => return pointer */
- incq %rax /* increment pointer */
-
-2:
- /* Second step: Copy source to destination. */
-
- movq %rsi, %rcx /* duplicate */
- andl $7,%ecx /* mask alignment bits */
- movq %rax, %rdx /* move around */
- jz 22f /* aligned => start loop */
-
- neg %ecx /* align to 8 bytes. */
- addl $8, %ecx
- /* Align the source pointer. */
-21:
- movb (%rsi), %al /* Fetch a byte */
- testb %al, %al /* Is it NUL? */
- movb %al, (%rdx) /* Store it */
- jz 24f /* If it was NUL, done! */
- incq %rsi
- incq %rdx
- decl %ecx
- jnz 21b
-
- /* Now the sources is aligned. Unfortunatly we cannot force
- to have both source and destination aligned, so ignore the
- alignment of the destination. */
- .p2align 4
-22:
- /* 1st unroll. */
- movq (%rsi), %rax /* Read double word (8 bytes). */
- addq $8, %rsi /* Adjust pointer for next word. */
- movq %rax, %r9 /* Save a copy for NUL finding. */
- addq %r8, %r9 /* add the magic value to the word. We get
- carry bits reported for each byte which
- is *not* 0 */
- jnc 23f /* highest byte is NUL => return pointer */
- xorq %rax, %r9 /* (word+magic)^word */
- orq %r8, %r9 /* set all non-carry bits */
- incq %r9 /* add 1: if one carry bit was *not* set
- the addition will not result in 0. */
-
- jnz 23f /* found NUL => return pointer */
-
- movq %rax, (%rdx) /* Write value to destination. */
- addq $8, %rdx /* Adjust pointer. */
-
- /* 2nd unroll. */
- movq (%rsi), %rax /* Read double word (8 bytes). */
- addq $8, %rsi /* Adjust pointer for next word. */
- movq %rax, %r9 /* Save a copy for NUL finding. */
- addq %r8, %r9 /* add the magic value to the word. We get
- carry bits reported for each byte which
- is *not* 0 */
- jnc 23f /* highest byte is NUL => return pointer */
- xorq %rax, %r9 /* (word+magic)^word */
- orq %r8, %r9 /* set all non-carry bits */
- incq %r9 /* add 1: if one carry bit was *not* set
- the addition will not result in 0. */
-
- jnz 23f /* found NUL => return pointer */
-
- movq %rax, (%rdx) /* Write value to destination. */
- addq $8, %rdx /* Adjust pointer. */
-
- /* 3rd unroll. */
- movq (%rsi), %rax /* Read double word (8 bytes). */
- addq $8, %rsi /* Adjust pointer for next word. */
- movq %rax, %r9 /* Save a copy for NUL finding. */
- addq %r8, %r9 /* add the magic value to the word. We get
- carry bits reported for each byte which
- is *not* 0 */
- jnc 23f /* highest byte is NUL => return pointer */
- xorq %rax, %r9 /* (word+magic)^word */
- orq %r8, %r9 /* set all non-carry bits */
- incq %r9 /* add 1: if one carry bit was *not* set
- the addition will not result in 0. */
-
- jnz 23f /* found NUL => return pointer */
-
- movq %rax, (%rdx) /* Write value to destination. */
- addq $8, %rdx /* Adjust pointer. */
-
- /* 4th unroll. */
- movq (%rsi), %rax /* Read double word (8 bytes). */
- addq $8, %rsi /* Adjust pointer for next word. */
- movq %rax, %r9 /* Save a copy for NUL finding. */
- addq %r8, %r9 /* add the magic value to the word. We get
- carry bits reported for each byte which
- is *not* 0 */
- jnc 23f /* highest byte is NUL => return pointer */
- xorq %rax, %r9 /* (word+magic)^word */
- orq %r8, %r9 /* set all non-carry bits */
- incq %r9 /* add 1: if one carry bit was *not* set
- the addition will not result in 0. */
-
- jnz 23f /* found NUL => return pointer */
-
- movq %rax, (%rdx) /* Write value to destination. */
- addq $8, %rdx /* Adjust pointer. */
- jmp 22b /* Next iteration. */
-
- /* Do the last few bytes. %rax contains the value to write.
- The loop is unrolled twice. */
- .p2align 4
-23:
- movb %al, (%rdx) /* 1st byte. */
- testb %al, %al /* Is it NUL. */
- jz 24f /* yes, finish. */
- incq %rdx /* Increment destination. */
- movb %ah, (%rdx) /* 2nd byte. */
- testb %ah, %ah /* Is it NUL?. */
- jz 24f /* yes, finish. */
- incq %rdx /* Increment destination. */
- shrq $16, %rax /* Shift... */
- jmp 23b /* and look at next two bytes in %rax. */
-
-
-24:
- movq %rdi, %rax /* Source is return value. */
- retq
-END (strcat)
+#define STRCAT strcat
+#include "multiarch/strcat-sse2.S"
libc_hidden_builtin_def (strcat)
--
2.34.1
^ permalink raw reply [flat|nested] 21+ messages in thread
* [PATCH v1] x86: Move wcschr SSE2 implementation to multiarch/wcschr-sse2.S
2022-07-12 19:29 [PATCH v1] x86: Move strlen SSE2 implementation to multiarch/strlen-sse2.S Noah Goldstein
` (4 preceding siblings ...)
2022-07-12 19:29 ` [PATCH v1] x86: Move strcat SSE2 implementation to multiarch/strcat-sse2.S Noah Goldstein
@ 2022-07-12 19:29 ` Noah Goldstein
2022-07-12 20:55 ` H.J. Lu
2022-07-12 19:29 ` [PATCH v1] x86: Move wcslen SSE2 implementation to multiarch/wcslen-sse2.S Noah Goldstein
` (3 subsequent siblings)
9 siblings, 1 reply; 21+ messages in thread
From: Noah Goldstein @ 2022-07-12 19:29 UTC (permalink / raw)
To: libc-alpha
This commit doesn't affect libc.so.6; it's just housekeeping to prepare
for adding explicit ISA level support.
Tested build on x86_64 and x86_32 with/without multiarch.
---
sysdeps/x86_64/multiarch/wcschr-sse2.S | 145 +++++++++++++++++++++++--
sysdeps/x86_64/wcschr.S | 135 +----------------------
2 files changed, 138 insertions(+), 142 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/wcschr-sse2.S b/sysdeps/x86_64/multiarch/wcschr-sse2.S
index 218ea609b9..c872926ba9 100644
--- a/sysdeps/x86_64/multiarch/wcschr-sse2.S
+++ b/sysdeps/x86_64/multiarch/wcschr-sse2.S
@@ -17,14 +17,141 @@
<https://www.gnu.org/licenses/>. */
#if IS_IN (libc)
-# define __wcschr __wcschr_sse2
-
-# undef weak_alias
-# define weak_alias(__wcschr, wcschr)
-# undef libc_hidden_def
-# define libc_hidden_def(__wcschr)
-# undef libc_hidden_weak
-# define libc_hidden_weak(wcschr)
+# ifndef WCSCHR
+# define WCSCHR __wcschr_sse2
+# endif
#endif
-#include "../wcschr.S"
+#include <sysdep.h>
+
+ .text
+ENTRY (WCSCHR)
+
+ movd %rsi, %xmm1
+ pxor %xmm2, %xmm2
+ mov %rdi, %rcx
+ punpckldq %xmm1, %xmm1
+ punpckldq %xmm1, %xmm1
+
+ and $63, %rcx
+ cmp $48, %rcx
+ ja L(cross_cache)
+
+ movdqu (%rdi), %xmm0
+ pcmpeqd %xmm0, %xmm2
+ add $16, %rdi
+ pcmpeqd %xmm1, %xmm0
+ pmovmskb %xmm2, %rdx
+ pmovmskb %xmm0, %rax
+ or %rax, %rdx
+ jnz L(matches)
+
+ and $-16, %rdi
+
+ movdqa (%rdi), %xmm0
+ pcmpeqd %xmm0, %xmm2
+ add $16, %rdi
+ pcmpeqd %xmm1, %xmm0
+ pmovmskb %xmm2, %rdx
+ pmovmskb %xmm0, %rax
+ or %rax, %rdx
+ jnz L(matches)
+
+ jmp L(loop)
+
+L(cross_cache):
+ and $15, %rcx
+ and $-16, %rdi
+ movdqa (%rdi), %xmm0
+ pcmpeqd %xmm0, %xmm2
+ pcmpeqd %xmm1, %xmm0
+ pmovmskb %xmm2, %rdx
+ pmovmskb %xmm0, %rax
+
+ sar %cl, %rdx
+ sar %cl, %rax
+ test %rax, %rax
+ je L(unaligned_no_match)
+
+ bsf %rax, %rax
+ test %rdx, %rdx
+ je L(unaligned_match)
+ bsf %rdx, %rdx
+ cmp %rdx, %rax
+ ja L(return_null)
+
+L(unaligned_match):
+ add %rdi, %rax
+ add %rcx, %rax
+ ret
+
+ .p2align 4
+L(unaligned_no_match):
+ test %rdx, %rdx
+ jne L(return_null)
+ pxor %xmm2, %xmm2
+
+ add $16, %rdi
+
+ .p2align 4
+/* Loop start on aligned string. */
+L(loop):
+ movdqa (%rdi), %xmm0
+ pcmpeqd %xmm0, %xmm2
+ add $16, %rdi
+ pcmpeqd %xmm1, %xmm0
+ pmovmskb %xmm2, %rdx
+ pmovmskb %xmm0, %rax
+ or %rax, %rdx
+ jnz L(matches)
+
+ movdqa (%rdi), %xmm0
+ pcmpeqd %xmm0, %xmm2
+ add $16, %rdi
+ pcmpeqd %xmm1, %xmm0
+ pmovmskb %xmm2, %rdx
+ pmovmskb %xmm0, %rax
+ or %rax, %rdx
+ jnz L(matches)
+
+ movdqa (%rdi), %xmm0
+ pcmpeqd %xmm0, %xmm2
+ add $16, %rdi
+ pcmpeqd %xmm1, %xmm0
+ pmovmskb %xmm2, %rdx
+ pmovmskb %xmm0, %rax
+ or %rax, %rdx
+ jnz L(matches)
+
+ movdqa (%rdi), %xmm0
+ pcmpeqd %xmm0, %xmm2
+ add $16, %rdi
+ pcmpeqd %xmm1, %xmm0
+ pmovmskb %xmm2, %rdx
+ pmovmskb %xmm0, %rax
+ or %rax, %rdx
+ jnz L(matches)
+ jmp L(loop)
+
+ .p2align 4
+L(matches):
+ pmovmskb %xmm2, %rdx
+ test %rax, %rax
+ jz L(return_null)
+ bsf %rax, %rax
+ test %rdx, %rdx
+ je L(match)
+ bsf %rdx, %rcx
+ cmp %rcx, %rax
+ ja L(return_null)
+L(match):
+ sub $16, %rdi
+ add %rdi, %rax
+ ret
+
+ .p2align 4
+L(return_null):
+ xor %rax, %rax
+ ret
+
+END (WCSCHR)
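(The `and $63; cmp $48` entry check and the `sar %cl` pair above,
sketched in C -- illustrative only, not glibc source.  The unaligned
16-byte load is done directly only when it cannot cross a 64-byte
boundary; since pages are 64-byte aligned, it then cannot fault past
the string's page either.  The cross path aligns down to 16 and
shifts the byte masks so bits for bytes before the start pointer drop
out:

#include <stdint.h>
#include <stddef.h>

static int
safe_to_read_16 (const wchar_t *s)
{
  return ((uintptr_t) s & 63) <= 48;	/* and $63; cmp $48; ja */
}

static unsigned int
drop_bytes_before (unsigned int pmovmskb_mask, const wchar_t *s)
{
  return pmovmskb_mask >> ((uintptr_t) s & 15);	/* the sar %cl pair */
})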
diff --git a/sysdeps/x86_64/wcschr.S b/sysdeps/x86_64/wcschr.S
index 2131220382..80b12c4286 100644
--- a/sysdeps/x86_64/wcschr.S
+++ b/sysdeps/x86_64/wcschr.S
@@ -16,140 +16,9 @@
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
-#include <sysdep.h>
-
- .text
-ENTRY (__wcschr)
-
- movd %rsi, %xmm1
- pxor %xmm2, %xmm2
- mov %rdi, %rcx
- punpckldq %xmm1, %xmm1
- punpckldq %xmm1, %xmm1
-
- and $63, %rcx
- cmp $48, %rcx
- ja L(cross_cache)
-
- movdqu (%rdi), %xmm0
- pcmpeqd %xmm0, %xmm2
- add $16, %rdi
- pcmpeqd %xmm1, %xmm0
- pmovmskb %xmm2, %rdx
- pmovmskb %xmm0, %rax
- or %rax, %rdx
- jnz L(matches)
-
- and $-16, %rdi
-
- movdqa (%rdi), %xmm0
- pcmpeqd %xmm0, %xmm2
- add $16, %rdi
- pcmpeqd %xmm1, %xmm0
- pmovmskb %xmm2, %rdx
- pmovmskb %xmm0, %rax
- or %rax, %rdx
- jnz L(matches)
-
- jmp L(loop)
-
-L(cross_cache):
- and $15, %rcx
- and $-16, %rdi
- movdqa (%rdi), %xmm0
- pcmpeqd %xmm0, %xmm2
- pcmpeqd %xmm1, %xmm0
- pmovmskb %xmm2, %rdx
- pmovmskb %xmm0, %rax
-
- sar %cl, %rdx
- sar %cl, %rax
- test %rax, %rax
- je L(unaligned_no_match)
-
- bsf %rax, %rax
- test %rdx, %rdx
- je L(unaligned_match)
- bsf %rdx, %rdx
- cmp %rdx, %rax
- ja L(return_null)
-
-L(unaligned_match):
- add %rdi, %rax
- add %rcx, %rax
- ret
-
- .p2align 4
-L(unaligned_no_match):
- test %rdx, %rdx
- jne L(return_null)
- pxor %xmm2, %xmm2
-
- add $16, %rdi
-
- .p2align 4
-/* Loop start on aligned string. */
-L(loop):
- movdqa (%rdi), %xmm0
- pcmpeqd %xmm0, %xmm2
- add $16, %rdi
- pcmpeqd %xmm1, %xmm0
- pmovmskb %xmm2, %rdx
- pmovmskb %xmm0, %rax
- or %rax, %rdx
- jnz L(matches)
-
- movdqa (%rdi), %xmm0
- pcmpeqd %xmm0, %xmm2
- add $16, %rdi
- pcmpeqd %xmm1, %xmm0
- pmovmskb %xmm2, %rdx
- pmovmskb %xmm0, %rax
- or %rax, %rdx
- jnz L(matches)
-
- movdqa (%rdi), %xmm0
- pcmpeqd %xmm0, %xmm2
- add $16, %rdi
- pcmpeqd %xmm1, %xmm0
- pmovmskb %xmm2, %rdx
- pmovmskb %xmm0, %rax
- or %rax, %rdx
- jnz L(matches)
-
- movdqa (%rdi), %xmm0
- pcmpeqd %xmm0, %xmm2
- add $16, %rdi
- pcmpeqd %xmm1, %xmm0
- pmovmskb %xmm2, %rdx
- pmovmskb %xmm0, %rax
- or %rax, %rdx
- jnz L(matches)
- jmp L(loop)
-
- .p2align 4
-L(matches):
- pmovmskb %xmm2, %rdx
- test %rax, %rax
- jz L(return_null)
- bsf %rax, %rax
- test %rdx, %rdx
- je L(match)
- bsf %rdx, %rcx
- cmp %rcx, %rax
- ja L(return_null)
-L(match):
- sub $16, %rdi
- add %rdi, %rax
- ret
-
- .p2align 4
-L(return_null):
- xor %rax, %rax
- ret
-
-END (__wcschr)
+#define WCSCHR __wcschr
+#include "multiarch/wcschr-sse2.S"
libc_hidden_def(__wcschr)
weak_alias (__wcschr, wcschr)
libc_hidden_weak (wcschr)
--
2.34.1
^ permalink raw reply [flat|nested] 21+ messages in thread
* [PATCH v1] x86: Move wcslen SSE2 implementation to multiarch/wcslen-sse2.S
2022-07-12 19:29 [PATCH v1] x86: Move strlen SSE2 implementation to multiarch/strlen-sse2.S Noah Goldstein
` (5 preceding siblings ...)
2022-07-12 19:29 ` [PATCH v1] x86: Move wcschr SSE2 implementation to multiarch/wcschr-sse2.S Noah Goldstein
@ 2022-07-12 19:29 ` Noah Goldstein
2022-07-12 20:26 ` H.J. Lu
2022-07-12 19:29 ` [PATCH v1] x86: Remove unneeded rtld-wmemcmp Noah Goldstein
` (2 subsequent siblings)
9 siblings, 1 reply; 21+ messages in thread
From: Noah Goldstein @ 2022-07-12 19:29 UTC (permalink / raw)
To: libc-alpha
This commit doesn't affect libc.so.6; it's just housekeeping to prepare
for adding explicit ISA level support.
Tested build on x86_64 and x86_32 with/without multiarch.
---
sysdeps/x86_64/multiarch/wcslen-sse2.S | 221 ++++++++++++++++++++++++-
sysdeps/x86_64/wcslen.S | 216 +-----------------------
2 files changed, 218 insertions(+), 219 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/wcslen-sse2.S b/sysdeps/x86_64/multiarch/wcslen-sse2.S
index 2b3a9efd64..944c3bd9c6 100644
--- a/sysdeps/x86_64/multiarch/wcslen-sse2.S
+++ b/sysdeps/x86_64/multiarch/wcslen-sse2.S
@@ -17,10 +17,221 @@
<https://www.gnu.org/licenses/>. */
#if IS_IN (libc)
-# define __wcslen __wcslen_sse2
-
-# undef weak_alias
-# define weak_alias(__wcslen, wcslen)
+# ifndef WCSLEN
+# define WCSLEN __wcslen_sse2
+# endif
#endif
-#include "../wcslen.S"
+#include <sysdep.h>
+
+ .text
+ENTRY (WCSLEN)
+ cmpl $0, (%rdi)
+ jz L(exit_tail0)
+ cmpl $0, 4(%rdi)
+ jz L(exit_tail1)
+ cmpl $0, 8(%rdi)
+ jz L(exit_tail2)
+ cmpl $0, 12(%rdi)
+ jz L(exit_tail3)
+ cmpl $0, 16(%rdi)
+ jz L(exit_tail4)
+ cmpl $0, 20(%rdi)
+ jz L(exit_tail5)
+ cmpl $0, 24(%rdi)
+ jz L(exit_tail6)
+ cmpl $0, 28(%rdi)
+ jz L(exit_tail7)
+
+ pxor %xmm0, %xmm0
+
+ lea 32(%rdi), %rax
+ addq $16, %rdi
+ and $-16, %rax
+
+ pcmpeqd (%rax), %xmm0
+ pmovmskb %xmm0, %edx
+ pxor %xmm1, %xmm1
+ addq $16, %rax
+ test %edx, %edx
+ jnz L(exit)
+
+ pcmpeqd (%rax), %xmm1
+ pmovmskb %xmm1, %edx
+ pxor %xmm2, %xmm2
+ addq $16, %rax
+ test %edx, %edx
+ jnz L(exit)
+
+ pcmpeqd (%rax), %xmm2
+ pmovmskb %xmm2, %edx
+ pxor %xmm3, %xmm3
+ addq $16, %rax
+ test %edx, %edx
+ jnz L(exit)
+
+ pcmpeqd (%rax), %xmm3
+ pmovmskb %xmm3, %edx
+ addq $16, %rax
+ test %edx, %edx
+ jnz L(exit)
+
+ pcmpeqd (%rax), %xmm0
+ pmovmskb %xmm0, %edx
+ addq $16, %rax
+ test %edx, %edx
+ jnz L(exit)
+
+ pcmpeqd (%rax), %xmm1
+ pmovmskb %xmm1, %edx
+ addq $16, %rax
+ test %edx, %edx
+ jnz L(exit)
+
+ pcmpeqd (%rax), %xmm2
+ pmovmskb %xmm2, %edx
+ addq $16, %rax
+ test %edx, %edx
+ jnz L(exit)
+
+ pcmpeqd (%rax), %xmm3
+ pmovmskb %xmm3, %edx
+ addq $16, %rax
+ test %edx, %edx
+ jnz L(exit)
+
+ pcmpeqd (%rax), %xmm0
+ pmovmskb %xmm0, %edx
+ addq $16, %rax
+ test %edx, %edx
+ jnz L(exit)
+
+ pcmpeqd (%rax), %xmm1
+ pmovmskb %xmm1, %edx
+ addq $16, %rax
+ test %edx, %edx
+ jnz L(exit)
+
+ pcmpeqd (%rax), %xmm2
+ pmovmskb %xmm2, %edx
+ addq $16, %rax
+ test %edx, %edx
+ jnz L(exit)
+
+ pcmpeqd (%rax), %xmm3
+ pmovmskb %xmm3, %edx
+ addq $16, %rax
+ test %edx, %edx
+ jnz L(exit)
+
+ and $-0x40, %rax
+
+ .p2align 4
+L(aligned_64_loop):
+ movaps (%rax), %xmm0
+ movaps 16(%rax), %xmm1
+ movaps 32(%rax), %xmm2
+ movaps 48(%rax), %xmm6
+
+ pminub %xmm1, %xmm0
+ pminub %xmm6, %xmm2
+ pminub %xmm0, %xmm2
+ pcmpeqd %xmm3, %xmm2
+ pmovmskb %xmm2, %edx
+ addq $64, %rax
+ test %edx, %edx
+ jz L(aligned_64_loop)
+
+ pcmpeqd -64(%rax), %xmm3
+ pmovmskb %xmm3, %edx
+ addq $48, %rdi
+ test %edx, %edx
+ jnz L(exit)
+
+ pcmpeqd %xmm1, %xmm3
+ pmovmskb %xmm3, %edx
+ addq $-16, %rdi
+ test %edx, %edx
+ jnz L(exit)
+
+ pcmpeqd -32(%rax), %xmm3
+ pmovmskb %xmm3, %edx
+ addq $-16, %rdi
+ test %edx, %edx
+ jnz L(exit)
+
+ pcmpeqd %xmm6, %xmm3
+ pmovmskb %xmm3, %edx
+ addq $-16, %rdi
+ test %edx, %edx
+ jz L(aligned_64_loop)
+
+ .p2align 4
+L(exit):
+ sub %rdi, %rax
+ shr $2, %rax
+ test %dl, %dl
+ jz L(exit_high)
+
+ andl $15, %edx
+ jz L(exit_1)
+ ret
+
+ /* No align here. Naturally aligned % 16 == 1. */
+L(exit_high):
+ andl $(15 << 8), %edx
+ jz L(exit_3)
+ add $2, %rax
+ ret
+
+ .p2align 3
+L(exit_1):
+ add $1, %rax
+ ret
+
+ .p2align 3
+L(exit_3):
+ add $3, %rax
+ ret
+
+ .p2align 3
+L(exit_tail0):
+ xorl %eax, %eax
+ ret
+
+ .p2align 3
+L(exit_tail1):
+ movl $1, %eax
+ ret
+
+ .p2align 3
+L(exit_tail2):
+ movl $2, %eax
+ ret
+
+ .p2align 3
+L(exit_tail3):
+ movl $3, %eax
+ ret
+
+ .p2align 3
+L(exit_tail4):
+ movl $4, %eax
+ ret
+
+ .p2align 3
+L(exit_tail5):
+ movl $5, %eax
+ ret
+
+ .p2align 3
+L(exit_tail6):
+ movl $6, %eax
+ ret
+
+ .p2align 3
+L(exit_tail7):
+ movl $7, %eax
+ ret
+
+END (WCSLEN)
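(In the exit paths above, EDX is the pmovmskb of a pcmpeqd result, so
every zero wchar_t in the 16-byte block sets a 4-bit nibble of the
mask.  The branchy exit_1/exit_3 ladder is therefore equivalent to
dividing the position of the first set bit by 4 -- a C sketch,
illustrative only, not glibc source:

#include <stdint.h>

/* MASK must be non-zero; returns the index (in wchar_t units) of the
   first zero wchar_t within the 16-byte block.  */
static unsigned int
first_zero_wchar (uint32_t mask)
{
  return (unsigned int) __builtin_ctz (mask) / 4;	/* bsf, then /4 */
})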
diff --git a/sysdeps/x86_64/wcslen.S b/sysdeps/x86_64/wcslen.S
index d641141d75..588a0fbe01 100644
--- a/sysdeps/x86_64/wcslen.S
+++ b/sysdeps/x86_64/wcslen.S
@@ -16,218 +16,6 @@
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
-#include <sysdep.h>
-
- .text
-ENTRY (__wcslen)
- cmpl $0, (%rdi)
- jz L(exit_tail0)
- cmpl $0, 4(%rdi)
- jz L(exit_tail1)
- cmpl $0, 8(%rdi)
- jz L(exit_tail2)
- cmpl $0, 12(%rdi)
- jz L(exit_tail3)
- cmpl $0, 16(%rdi)
- jz L(exit_tail4)
- cmpl $0, 20(%rdi)
- jz L(exit_tail5)
- cmpl $0, 24(%rdi)
- jz L(exit_tail6)
- cmpl $0, 28(%rdi)
- jz L(exit_tail7)
-
- pxor %xmm0, %xmm0
-
- lea 32(%rdi), %rax
- addq $16, %rdi
- and $-16, %rax
-
- pcmpeqd (%rax), %xmm0
- pmovmskb %xmm0, %edx
- pxor %xmm1, %xmm1
- addq $16, %rax
- test %edx, %edx
- jnz L(exit)
-
- pcmpeqd (%rax), %xmm1
- pmovmskb %xmm1, %edx
- pxor %xmm2, %xmm2
- addq $16, %rax
- test %edx, %edx
- jnz L(exit)
-
- pcmpeqd (%rax), %xmm2
- pmovmskb %xmm2, %edx
- pxor %xmm3, %xmm3
- addq $16, %rax
- test %edx, %edx
- jnz L(exit)
-
- pcmpeqd (%rax), %xmm3
- pmovmskb %xmm3, %edx
- addq $16, %rax
- test %edx, %edx
- jnz L(exit)
-
- pcmpeqd (%rax), %xmm0
- pmovmskb %xmm0, %edx
- addq $16, %rax
- test %edx, %edx
- jnz L(exit)
-
- pcmpeqd (%rax), %xmm1
- pmovmskb %xmm1, %edx
- addq $16, %rax
- test %edx, %edx
- jnz L(exit)
-
- pcmpeqd (%rax), %xmm2
- pmovmskb %xmm2, %edx
- addq $16, %rax
- test %edx, %edx
- jnz L(exit)
-
- pcmpeqd (%rax), %xmm3
- pmovmskb %xmm3, %edx
- addq $16, %rax
- test %edx, %edx
- jnz L(exit)
-
- pcmpeqd (%rax), %xmm0
- pmovmskb %xmm0, %edx
- addq $16, %rax
- test %edx, %edx
- jnz L(exit)
-
- pcmpeqd (%rax), %xmm1
- pmovmskb %xmm1, %edx
- addq $16, %rax
- test %edx, %edx
- jnz L(exit)
-
- pcmpeqd (%rax), %xmm2
- pmovmskb %xmm2, %edx
- addq $16, %rax
- test %edx, %edx
- jnz L(exit)
-
- pcmpeqd (%rax), %xmm3
- pmovmskb %xmm3, %edx
- addq $16, %rax
- test %edx, %edx
- jnz L(exit)
-
- and $-0x40, %rax
-
- .p2align 4
-L(aligned_64_loop):
- movaps (%rax), %xmm0
- movaps 16(%rax), %xmm1
- movaps 32(%rax), %xmm2
- movaps 48(%rax), %xmm6
-
- pminub %xmm1, %xmm0
- pminub %xmm6, %xmm2
- pminub %xmm0, %xmm2
- pcmpeqd %xmm3, %xmm2
- pmovmskb %xmm2, %edx
- addq $64, %rax
- test %edx, %edx
- jz L(aligned_64_loop)
-
- pcmpeqd -64(%rax), %xmm3
- pmovmskb %xmm3, %edx
- addq $48, %rdi
- test %edx, %edx
- jnz L(exit)
-
- pcmpeqd %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- addq $-16, %rdi
- test %edx, %edx
- jnz L(exit)
-
- pcmpeqd -32(%rax), %xmm3
- pmovmskb %xmm3, %edx
- addq $-16, %rdi
- test %edx, %edx
- jnz L(exit)
-
- pcmpeqd %xmm6, %xmm3
- pmovmskb %xmm3, %edx
- addq $-16, %rdi
- test %edx, %edx
- jz L(aligned_64_loop)
-
- .p2align 4
-L(exit):
- sub %rdi, %rax
- shr $2, %rax
- test %dl, %dl
- jz L(exit_high)
-
- andl $15, %edx
- jz L(exit_1)
- ret
-
- /* No align here. Naturally aligned % 16 == 1. */
-L(exit_high):
- andl $(15 << 8), %edx
- jz L(exit_3)
- add $2, %rax
- ret
-
- .p2align 3
-L(exit_1):
- add $1, %rax
- ret
-
- .p2align 3
-L(exit_3):
- add $3, %rax
- ret
-
- .p2align 3
-L(exit_tail0):
- xorl %eax, %eax
- ret
-
- .p2align 3
-L(exit_tail1):
- movl $1, %eax
- ret
-
- .p2align 3
-L(exit_tail2):
- movl $2, %eax
- ret
-
- .p2align 3
-L(exit_tail3):
- movl $3, %eax
- ret
-
- .p2align 3
-L(exit_tail4):
- movl $4, %eax
- ret
-
- .p2align 3
-L(exit_tail5):
- movl $5, %eax
- ret
-
- .p2align 3
-L(exit_tail6):
- movl $6, %eax
- ret
-
- .p2align 3
-L(exit_tail7):
- movl $7, %eax
- ret
-
-END (__wcslen)
-
+#define WCSLEN __wcslen
+#include "multiarch/wcslen-sse2.S"
weak_alias(__wcslen, wcslen)
--
2.34.1
^ permalink raw reply [flat|nested] 21+ messages in thread
* [PATCH v1] x86: Remove unneeded rtld-wmemcmp
2022-07-12 19:29 [PATCH v1] x86: Move strlen SSE2 implementation to multiarch/strlen-sse2.S Noah Goldstein
` (6 preceding siblings ...)
2022-07-12 19:29 ` [PATCH v1] x86: Move wcslen SSE2 implementation to multiarch/wcslen-sse2.S Noah Goldstein
@ 2022-07-12 19:29 ` Noah Goldstein
2022-07-12 19:44 ` H.J. Lu
2022-07-12 19:29 ` [PATCH v1] x86: Add missing rtm tests for strcmp family Noah Goldstein
2022-07-12 23:29 ` [PATCH v1] x86: Move strlen SSE2 implementation to multiarch/strlen-sse2.S H.J. Lu
9 siblings, 1 reply; 21+ messages in thread
From: Noah Goldstein @ 2022-07-12 19:29 UTC (permalink / raw)
To: libc-alpha
wmemcmp isn't used by the dynamic loader, so there's no need to add an
RTLD stub for it.
Tested with and without multiarch on x86_64 for ISA levels:
{generic, x86-64-v2, x86-64-v3, x86-64-v4}
And m32 with and without multiarch.
---
sysdeps/x86_64/multiarch/rtld-wmemcmp.S | 18 ------------------
1 file changed, 18 deletions(-)
delete mode 100644 sysdeps/x86_64/multiarch/rtld-wmemcmp.S
diff --git a/sysdeps/x86_64/multiarch/rtld-wmemcmp.S b/sysdeps/x86_64/multiarch/rtld-wmemcmp.S
deleted file mode 100644
index 71a6f0affa..0000000000
--- a/sysdeps/x86_64/multiarch/rtld-wmemcmp.S
+++ /dev/null
@@ -1,18 +0,0 @@
-/* Copyright (C) 2022 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <https://www.gnu.org/licenses/>. */
-
-#include "../wmemcmp.S"
--
2.34.1
^ permalink raw reply [flat|nested] 21+ messages in thread
* [PATCH v1] x86: Add missing rtm tests for strcmp family
2022-07-12 19:29 [PATCH v1] x86: Move strlen SSE2 implementation to multiarch/strlen-sse2.S Noah Goldstein
` (7 preceding siblings ...)
2022-07-12 19:29 ` [PATCH v1] x86: Remove unneeded rtld-wmemcmp Noah Goldstein
@ 2022-07-12 19:29 ` Noah Goldstein
2022-07-12 19:59 ` H.J. Lu
2022-07-12 23:29 ` [PATCH v1] x86: Move strlen SSE2 implementation to multiarch/strlen-sse2.S H.J. Lu
9 siblings, 1 reply; 21+ messages in thread
From: Noah Goldstein @ 2022-07-12 19:29 UTC (permalink / raw)
To: libc-alpha
Add new tests for:
strcasecmp
strncasecmp
strcmp
wcscmp
These functions all have avx2_rtm implementations, so they should be tested.
---
sysdeps/x86/Makefile | 8 ++++
sysdeps/x86/tst-strcasecmp-rtm.c | 23 ++++++++++
sysdeps/x86/tst-strcmp-rtm.c | 70 +++++++++++++++++++++++++++++++
sysdeps/x86/tst-strncasecmp-rtm.c | 23 ++++++++++
sysdeps/x86/tst-strncmp-rtm.c | 6 ++-
sysdeps/x86/tst-wcscmp-rtm.c | 22 ++++++++++
6 files changed, 150 insertions(+), 2 deletions(-)
create mode 100644 sysdeps/x86/tst-strcasecmp-rtm.c
create mode 100644 sysdeps/x86/tst-strcmp-rtm.c
create mode 100644 sysdeps/x86/tst-strncasecmp-rtm.c
create mode 100644 sysdeps/x86/tst-wcscmp-rtm.c
diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile
index c6bee981f8..56fd5fc805 100644
--- a/sysdeps/x86/Makefile
+++ b/sysdeps/x86/Makefile
@@ -95,11 +95,15 @@ tests += \
tst-memmove-rtm \
tst-memrchr-rtm \
tst-memset-rtm \
+ tst-strcasecmp-rtm \
tst-strchr-rtm \
+ tst-strcmp-rtm \
tst-strcpy-rtm \
tst-strlen-rtm \
+ tst-strncasecmp-rtm \
tst-strncmp-rtm \
tst-strrchr-rtm \
+ tst-wcscmp-rtm \
tst-wcsncmp-rtm \
# tests
@@ -108,11 +112,15 @@ CFLAGS-tst-memcmp-rtm.c += -mrtm
CFLAGS-tst-memmove-rtm.c += -mrtm
CFLAGS-tst-memrchr-rtm.c += -mrtm
CFLAGS-tst-memset-rtm.c += -mrtm
+CFLAGS-tst-strcasecmp-rtm.c += -mrtm
CFLAGS-tst-strchr-rtm.c += -mrtm
+CFLAGS-tst-strcmp-rtm.c += -mrtm
CFLAGS-tst-strcpy-rtm.c += -mrtm
CFLAGS-tst-strlen-rtm.c += -mrtm
+CFLAGS-tst-strncasecmp-rtm.c += -mrtm -Wno-error
CFLAGS-tst-strncmp-rtm.c += -mrtm -Wno-error
CFLAGS-tst-strrchr-rtm.c += -mrtm
+CFLAGS-tst-wcscmp-rtm.c += -mrtm
CFLAGS-tst-wcsncmp-rtm.c += -mrtm -Wno-error
endif
diff --git a/sysdeps/x86/tst-strcasecmp-rtm.c b/sysdeps/x86/tst-strcasecmp-rtm.c
new file mode 100644
index 0000000000..da460799ce
--- /dev/null
+++ b/sysdeps/x86/tst-strcasecmp-rtm.c
@@ -0,0 +1,23 @@
+/* Test case for strcasecmp inside a transactionally executing RTM
+ region.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#define STRCMP strcasecmp
+#define TEST_NAME "strcasecmp"
+
+#include "tst-strcmp-rtm.c"
diff --git a/sysdeps/x86/tst-strcmp-rtm.c b/sysdeps/x86/tst-strcmp-rtm.c
new file mode 100644
index 0000000000..371916a2f0
--- /dev/null
+++ b/sysdeps/x86/tst-strcmp-rtm.c
@@ -0,0 +1,70 @@
+/* Test case for strcmp inside a transactionally executing RTM
+ region.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <stdint.h>
+#include <tst-string-rtm.h>
+
+#ifdef WIDE
+# define CHAR wchar_t
+# define MEMSET wmemset
+# define STRCMP wcscmp
+# define TEST_NAME "wcscmp"
+#else /* !WIDE */
+# define CHAR char
+# define MEMSET memset
+
+# ifndef STRCMP
+# define STRCMP strcmp
+# define TEST_NAME "strcmp"
+# endif
+#endif
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+CHAR string1[STRING_SIZE];
+CHAR string2[STRING_SIZE];
+
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)
+{
+ MEMSET (string1, 'a', STRING_SIZE - 1);
+ MEMSET (string2, 'a', STRING_SIZE - 1);
+ if (STRCMP (string1, string2) == 0)
+ return EXIT_SUCCESS;
+ else
+ return EXIT_FAILURE;
+}
+
+__attribute__ ((noinline, noclone))
+static int
+function (void)
+{
+ if (STRCMP (string1, string2) == 0)
+ return 0;
+ else
+ return 1;
+}
+
+
+static int
+do_test (void)
+{
+ return do_test_1 (TEST_NAME, LOOP, prepare, function);
+}
diff --git a/sysdeps/x86/tst-strncasecmp-rtm.c b/sysdeps/x86/tst-strncasecmp-rtm.c
new file mode 100644
index 0000000000..4ebe58951b
--- /dev/null
+++ b/sysdeps/x86/tst-strncasecmp-rtm.c
@@ -0,0 +1,23 @@
+/* Test case for strncasecmp inside a transactionally executing RTM
+ region.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#define STRNCMP strncasecmp
+#define TEST_NAME "strncasecmp"
+
+#include "tst-strncmp-rtm.c"
diff --git a/sysdeps/x86/tst-strncmp-rtm.c b/sysdeps/x86/tst-strncmp-rtm.c
index a3b14e72ff..2d27b20a68 100644
--- a/sysdeps/x86/tst-strncmp-rtm.c
+++ b/sysdeps/x86/tst-strncmp-rtm.c
@@ -27,8 +27,10 @@
#else /* !WIDE */
# define CHAR char
# define MEMSET memset
-# define STRNCMP strncmp
-# define TEST_NAME "strncmp"
+# ifndef STRNCMP
+# define STRNCMP strncmp
+# define TEST_NAME "strncmp"
+# endif
#endif /* !WIDE */
diff --git a/sysdeps/x86/tst-wcscmp-rtm.c b/sysdeps/x86/tst-wcscmp-rtm.c
new file mode 100644
index 0000000000..28a5b4b82d
--- /dev/null
+++ b/sysdeps/x86/tst-wcscmp-rtm.c
@@ -0,0 +1,22 @@
+/* Test case for wcscmp inside a transactionally executing RTM
+ region.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#define WIDE 1
+#include <wchar.h>
+#include "tst-strcmp-rtm.c"
--
2.34.1
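For context, do_test_1 (from tst-string-rtm.h) runs `function' LOOP times inside a hardware transaction and fails when too many transactions abort; the point of these tests is that the avx2_rtm variants are written to avoid the aborts that a stray vzeroupper would otherwise cause. A minimal sketch of that pattern, assuming only the _xbegin/_xend intrinsics from <immintrin.h> (hence -mrtm in the Makefile hunk above); this is illustrative, not the actual harness:

#include <immintrin.h>

/* Sketch: run FN inside an RTM transaction LOOP times and count
   aborts.  count_aborts and FN are hypothetical names, not part of
   the real harness.  */
static int
count_aborts (int (*fn) (void), int loop)
{
  int aborts = 0;
  for (int i = 0; i < loop; i++)
    {
      unsigned int status = _xbegin ();
      if (status == _XBEGIN_STARTED)
        {
          fn ();      /* E.g. the STRCMP call in function () above.  */
          _xend ();   /* Commit the transaction.  */
        }
      else
        aborts++;     /* Transaction aborted; execution resumed at _xbegin.  */
    }
  return aborts;
}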
* Re: [PATCH v1] x86: Remove unneeded rtld-wmemcmp
2022-07-12 19:29 ` [PATCH v1] x86: Remove unneeded rtld-wmemcmp Noah Goldstein
@ 2022-07-12 19:44 ` H.J. Lu
0 siblings, 0 replies; 21+ messages in thread
From: H.J. Lu @ 2022-07-12 19:44 UTC (permalink / raw)
To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell
On Tue, Jul 12, 2022 at 12:29 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> wmemcmp isn't used by the dynamic loader, so there's no need to add an
> RTLD stub for it.
>
> Tested with and without multiarch on x86_64 for ISA levels:
> {generic, x86-64-v2, x86-64-v3, x86-64-v4}
>
> And m32 with and without multiarch.
> ---
> sysdeps/x86_64/multiarch/rtld-wmemcmp.S | 18 ------------------
> 1 file changed, 18 deletions(-)
> delete mode 100644 sysdeps/x86_64/multiarch/rtld-wmemcmp.S
>
> diff --git a/sysdeps/x86_64/multiarch/rtld-wmemcmp.S b/sysdeps/x86_64/multiarch/rtld-wmemcmp.S
> deleted file mode 100644
> index 71a6f0affa..0000000000
> --- a/sysdeps/x86_64/multiarch/rtld-wmemcmp.S
> +++ /dev/null
> @@ -1,18 +0,0 @@
> -/* Copyright (C) 2022 Free Software Foundation, Inc.
> - This file is part of the GNU C Library.
> -
> - The GNU C Library is free software; you can redistribute it and/or
> - modify it under the terms of the GNU Lesser General Public
> - License as published by the Free Software Foundation; either
> - version 2.1 of the License, or (at your option) any later version.
> -
> - The GNU C Library is distributed in the hope that it will be useful,
> - but WITHOUT ANY WARRANTY; without even the implied warranty of
> - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> - Lesser General Public License for more details.
> -
> - You should have received a copy of the GNU Lesser General Public
> - License along with the GNU C Library; if not, see
> - <https://www.gnu.org/licenses/>. */
> -
> -#include "../wmemcmp.S"
> --
> 2.34.1
>
LGTM.
Thanks.
--
H.J.
* Re: [PATCH v1] x86: Add missing rtm tests for strcmp family
2022-07-12 19:29 ` [PATCH v1] x86: Add missing rtm tests for strcmp family Noah Goldstein
@ 2022-07-12 19:59 ` H.J. Lu
0 siblings, 0 replies; 21+ messages in thread
From: H.J. Lu @ 2022-07-12 19:59 UTC (permalink / raw)
To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell
On Tue, Jul 12, 2022 at 12:29 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Add new tests for:
> strcasecmp
> strncasecmp
> strcmp
> wcscmp
>
> These functions all have avx2_rtm implementations, so they should be tested.
> ---
> sysdeps/x86/Makefile | 8 ++++
> sysdeps/x86/tst-strcasecmp-rtm.c | 23 ++++++++++
> sysdeps/x86/tst-strcmp-rtm.c | 70 +++++++++++++++++++++++++++++++
> sysdeps/x86/tst-strncasecmp-rtm.c | 23 ++++++++++
> sysdeps/x86/tst-strncmp-rtm.c | 6 ++-
> sysdeps/x86/tst-wcscmp-rtm.c | 22 ++++++++++
> 6 files changed, 150 insertions(+), 2 deletions(-)
> create mode 100644 sysdeps/x86/tst-strcasecmp-rtm.c
> create mode 100644 sysdeps/x86/tst-strcmp-rtm.c
> create mode 100644 sysdeps/x86/tst-strncasecmp-rtm.c
> create mode 100644 sysdeps/x86/tst-wcscmp-rtm.c
>
> diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile
> index c6bee981f8..56fd5fc805 100644
> --- a/sysdeps/x86/Makefile
> +++ b/sysdeps/x86/Makefile
> @@ -95,11 +95,15 @@ tests += \
> tst-memmove-rtm \
> tst-memrchr-rtm \
> tst-memset-rtm \
> + tst-strcasecmp-rtm \
> tst-strchr-rtm \
> + tst-strcmp-rtm \
> tst-strcpy-rtm \
> tst-strlen-rtm \
> + tst-strncasecmp-rtm \
> tst-strncmp-rtm \
> tst-strrchr-rtm \
> + tst-wcscmp-rtm \
> tst-wcsncmp-rtm \
> # tests
>
> @@ -108,11 +112,15 @@ CFLAGS-tst-memcmp-rtm.c += -mrtm
> CFLAGS-tst-memmove-rtm.c += -mrtm
> CFLAGS-tst-memrchr-rtm.c += -mrtm
> CFLAGS-tst-memset-rtm.c += -mrtm
> +CFLAGS-tst-strcasecmp-rtm.c += -mrtm
> CFLAGS-tst-strchr-rtm.c += -mrtm
> +CFLAGS-tst-strcmp-rtm.c += -mrtm
> CFLAGS-tst-strcpy-rtm.c += -mrtm
> CFLAGS-tst-strlen-rtm.c += -mrtm
> +CFLAGS-tst-strncasecmp-rtm.c += -mrtm -Wno-error
> CFLAGS-tst-strncmp-rtm.c += -mrtm -Wno-error
> CFLAGS-tst-strrchr-rtm.c += -mrtm
> +CFLAGS-tst-wcscmp-rtm.c += -mrtm
> CFLAGS-tst-wcsncmp-rtm.c += -mrtm -Wno-error
> endif
>
> diff --git a/sysdeps/x86/tst-strcasecmp-rtm.c b/sysdeps/x86/tst-strcasecmp-rtm.c
> new file mode 100644
> index 0000000000..da460799ce
> --- /dev/null
> +++ b/sysdeps/x86/tst-strcasecmp-rtm.c
> @@ -0,0 +1,23 @@
> +/* Test case for strcasecmp inside a transactionally executing RTM
> + region.
> + Copyright (C) 2022 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#define STRCMP strcasecmp
> +#define TEST_NAME "strcasecmp"
> +
> +#include "tst-strcmp-rtm.c"
> diff --git a/sysdeps/x86/tst-strcmp-rtm.c b/sysdeps/x86/tst-strcmp-rtm.c
> new file mode 100644
> index 0000000000..371916a2f0
> --- /dev/null
> +++ b/sysdeps/x86/tst-strcmp-rtm.c
> @@ -0,0 +1,70 @@
> +/* Test case for strcmp inside a transactionally executing RTM
> + region.
> + Copyright (C) 2022 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#include <stdint.h>
> +#include <tst-string-rtm.h>
> +
> +#ifdef WIDE
> +# define CHAR wchar_t
> +# define MEMSET wmemset
> +# define STRCMP wcscmp
> +# define TEST_NAME "wcscmp"
> +#else /* !WIDE */
> +# define CHAR char
> +# define MEMSET memset
> +
> +# ifndef STRCMP
> +# define STRCMP strcmp
> +# define TEST_NAME "strcmp"
> +# endif
> +#endif
> +
> +#define LOOP 3000
> +#define STRING_SIZE 1024
> +CHAR string1[STRING_SIZE];
> +CHAR string2[STRING_SIZE];
> +
> +__attribute__ ((noinline, noclone))
> +static int
> +prepare (void)
> +{
> + MEMSET (string1, 'a', STRING_SIZE - 1);
> + MEMSET (string2, 'a', STRING_SIZE - 1);
> + if (STRCMP (string1, string2) == 0)
> + return EXIT_SUCCESS;
> + else
> + return EXIT_FAILURE;
> +}
> +
> +__attribute__ ((noinline, noclone))
> +static int
> +function (void)
> +{
> + if (STRCMP (string1, string2) == 0)
> + return 0;
> + else
> + return 1;
> +}
> +
> +
> +static int
> +do_test (void)
> +{
> + return do_test_1 (TEST_NAME, LOOP, prepare, function);
> +}
> diff --git a/sysdeps/x86/tst-strncasecmp-rtm.c b/sysdeps/x86/tst-strncasecmp-rtm.c
> new file mode 100644
> index 0000000000..4ebe58951b
> --- /dev/null
> +++ b/sysdeps/x86/tst-strncasecmp-rtm.c
> @@ -0,0 +1,23 @@
> +/* Test case for strncasecmp inside a transactionally executing RTM
> + region.
> + Copyright (C) 2022 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#define STRNCMP strncasecmp
> +#define TEST_NAME "strncasecmp"
> +
> +#include "tst-strncmp-rtm.c"
> diff --git a/sysdeps/x86/tst-strncmp-rtm.c b/sysdeps/x86/tst-strncmp-rtm.c
> index a3b14e72ff..2d27b20a68 100644
> --- a/sysdeps/x86/tst-strncmp-rtm.c
> +++ b/sysdeps/x86/tst-strncmp-rtm.c
> @@ -27,8 +27,10 @@
> #else /* !WIDE */
> # define CHAR char
> # define MEMSET memset
> -# define STRNCMP strncmp
> -# define TEST_NAME "strncmp"
> +# ifndef STRNCMP
> +# define STRNCMP strncmp
> +# define TEST_NAME "strncmp"
> +# endif
> #endif /* !WIDE */
>
>
> diff --git a/sysdeps/x86/tst-wcscmp-rtm.c b/sysdeps/x86/tst-wcscmp-rtm.c
> new file mode 100644
> index 0000000000..28a5b4b82d
> --- /dev/null
> +++ b/sysdeps/x86/tst-wcscmp-rtm.c
> @@ -0,0 +1,22 @@
> +/* Test case for wcscmp inside a transactionally executing RTM
> + region.
> + Copyright (C) 2022 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#define WIDE 1
> +#include <wchar.h>
> +#include "tst-strcmp-rtm.c"
> --
> 2.34.1
>
LGTM.
Thanks.
--
H.J.
* Re: [PATCH v1] x86: Move wcslen SSE2 implementation to multiarch/wcslen-sse2.S
2022-07-12 19:29 ` [PATCH v1] x86: Move wcslen SSE2 implementation to multiarch/wcslen-sse2.S Noah Goldstein
@ 2022-07-12 20:26 ` H.J. Lu
0 siblings, 0 replies; 21+ messages in thread
From: H.J. Lu @ 2022-07-12 20:26 UTC (permalink / raw)
To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell
On Tue, Jul 12, 2022 at 12:29 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> This commit doesn't affect libc.so.6; it's just housekeeping to prepare
> for adding explicit ISA level support.
>
> Tested build on x86_64 and x86_32 with/without multiarch.
> ---
> sysdeps/x86_64/multiarch/wcslen-sse2.S | 221 ++++++++++++++++++++++++-
> sysdeps/x86_64/wcslen.S | 216 +-----------------------
> 2 files changed, 218 insertions(+), 219 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/wcslen-sse2.S b/sysdeps/x86_64/multiarch/wcslen-sse2.S
> index 2b3a9efd64..944c3bd9c6 100644
> --- a/sysdeps/x86_64/multiarch/wcslen-sse2.S
> +++ b/sysdeps/x86_64/multiarch/wcslen-sse2.S
> @@ -17,10 +17,221 @@
> <https://www.gnu.org/licenses/>. */
>
> #if IS_IN (libc)
> -# define __wcslen __wcslen_sse2
> -
> -# undef weak_alias
> -# define weak_alias(__wcslen, wcslen)
> +# ifndef WCSLEN
> +# define WCSLEN __wcslen_sse2
> +# endif
> #endif
>
> -#include "../wcslen.S"
> +#include <sysdep.h>
> +
> + .text
> +ENTRY (WCSLEN)
> + cmpl $0, (%rdi)
> + jz L(exit_tail0)
> + cmpl $0, 4(%rdi)
> + jz L(exit_tail1)
> + cmpl $0, 8(%rdi)
> + jz L(exit_tail2)
> + cmpl $0, 12(%rdi)
> + jz L(exit_tail3)
> + cmpl $0, 16(%rdi)
> + jz L(exit_tail4)
> + cmpl $0, 20(%rdi)
> + jz L(exit_tail5)
> + cmpl $0, 24(%rdi)
> + jz L(exit_tail6)
> + cmpl $0, 28(%rdi)
> + jz L(exit_tail7)
> +
> + pxor %xmm0, %xmm0
> +
> + lea 32(%rdi), %rax
> + addq $16, %rdi
> + and $-16, %rax
> +
> + pcmpeqd (%rax), %xmm0
> + pmovmskb %xmm0, %edx
> + pxor %xmm1, %xmm1
> + addq $16, %rax
> + test %edx, %edx
> + jnz L(exit)
> +
> + pcmpeqd (%rax), %xmm1
> + pmovmskb %xmm1, %edx
> + pxor %xmm2, %xmm2
> + addq $16, %rax
> + test %edx, %edx
> + jnz L(exit)
> +
> + pcmpeqd (%rax), %xmm2
> + pmovmskb %xmm2, %edx
> + pxor %xmm3, %xmm3
> + addq $16, %rax
> + test %edx, %edx
> + jnz L(exit)
> +
> + pcmpeqd (%rax), %xmm3
> + pmovmskb %xmm3, %edx
> + addq $16, %rax
> + test %edx, %edx
> + jnz L(exit)
> +
> + pcmpeqd (%rax), %xmm0
> + pmovmskb %xmm0, %edx
> + addq $16, %rax
> + test %edx, %edx
> + jnz L(exit)
> +
> + pcmpeqd (%rax), %xmm1
> + pmovmskb %xmm1, %edx
> + addq $16, %rax
> + test %edx, %edx
> + jnz L(exit)
> +
> + pcmpeqd (%rax), %xmm2
> + pmovmskb %xmm2, %edx
> + addq $16, %rax
> + test %edx, %edx
> + jnz L(exit)
> +
> + pcmpeqd (%rax), %xmm3
> + pmovmskb %xmm3, %edx
> + addq $16, %rax
> + test %edx, %edx
> + jnz L(exit)
> +
> + pcmpeqd (%rax), %xmm0
> + pmovmskb %xmm0, %edx
> + addq $16, %rax
> + test %edx, %edx
> + jnz L(exit)
> +
> + pcmpeqd (%rax), %xmm1
> + pmovmskb %xmm1, %edx
> + addq $16, %rax
> + test %edx, %edx
> + jnz L(exit)
> +
> + pcmpeqd (%rax), %xmm2
> + pmovmskb %xmm2, %edx
> + addq $16, %rax
> + test %edx, %edx
> + jnz L(exit)
> +
> + pcmpeqd (%rax), %xmm3
> + pmovmskb %xmm3, %edx
> + addq $16, %rax
> + test %edx, %edx
> + jnz L(exit)
> +
> + and $-0x40, %rax
> +
> + .p2align 4
> +L(aligned_64_loop):
> + movaps (%rax), %xmm0
> + movaps 16(%rax), %xmm1
> + movaps 32(%rax), %xmm2
> + movaps 48(%rax), %xmm6
> +
> + pminub %xmm1, %xmm0
> + pminub %xmm6, %xmm2
> + pminub %xmm0, %xmm2
> + pcmpeqd %xmm3, %xmm2
> + pmovmskb %xmm2, %edx
> + addq $64, %rax
> + test %edx, %edx
> + jz L(aligned_64_loop)
> +
> + pcmpeqd -64(%rax), %xmm3
> + pmovmskb %xmm3, %edx
> + addq $48, %rdi
> + test %edx, %edx
> + jnz L(exit)
> +
> + pcmpeqd %xmm1, %xmm3
> + pmovmskb %xmm3, %edx
> + addq $-16, %rdi
> + test %edx, %edx
> + jnz L(exit)
> +
> + pcmpeqd -32(%rax), %xmm3
> + pmovmskb %xmm3, %edx
> + addq $-16, %rdi
> + test %edx, %edx
> + jnz L(exit)
> +
> + pcmpeqd %xmm6, %xmm3
> + pmovmskb %xmm3, %edx
> + addq $-16, %rdi
> + test %edx, %edx
> + jz L(aligned_64_loop)
> +
> + .p2align 4
> +L(exit):
> + sub %rdi, %rax
> + shr $2, %rax
> + test %dl, %dl
> + jz L(exit_high)
> +
> + andl $15, %edx
> + jz L(exit_1)
> + ret
> +
> + /* No align here. Naturally aligned % 16 == 1. */
> +L(exit_high):
> + andl $(15 << 8), %edx
> + jz L(exit_3)
> + add $2, %rax
> + ret
> +
> + .p2align 3
> +L(exit_1):
> + add $1, %rax
> + ret
> +
> + .p2align 3
> +L(exit_3):
> + add $3, %rax
> + ret
> +
> + .p2align 3
> +L(exit_tail0):
> + xorl %eax, %eax
> + ret
> +
> + .p2align 3
> +L(exit_tail1):
> + movl $1, %eax
> + ret
> +
> + .p2align 3
> +L(exit_tail2):
> + movl $2, %eax
> + ret
> +
> + .p2align 3
> +L(exit_tail3):
> + movl $3, %eax
> + ret
> +
> + .p2align 3
> +L(exit_tail4):
> + movl $4, %eax
> + ret
> +
> + .p2align 3
> +L(exit_tail5):
> + movl $5, %eax
> + ret
> +
> + .p2align 3
> +L(exit_tail6):
> + movl $6, %eax
> + ret
> +
> + .p2align 3
> +L(exit_tail7):
> + movl $7, %eax
> + ret
> +
> +END (WCSLEN)
> diff --git a/sysdeps/x86_64/wcslen.S b/sysdeps/x86_64/wcslen.S
> index d641141d75..588a0fbe01 100644
> --- a/sysdeps/x86_64/wcslen.S
> +++ b/sysdeps/x86_64/wcslen.S
> @@ -16,218 +16,6 @@
> License along with the GNU C Library; if not, see
> <https://www.gnu.org/licenses/>. */
>
> -#include <sysdep.h>
> -
> - .text
> -ENTRY (__wcslen)
> - cmpl $0, (%rdi)
> - jz L(exit_tail0)
> - cmpl $0, 4(%rdi)
> - jz L(exit_tail1)
> - cmpl $0, 8(%rdi)
> - jz L(exit_tail2)
> - cmpl $0, 12(%rdi)
> - jz L(exit_tail3)
> - cmpl $0, 16(%rdi)
> - jz L(exit_tail4)
> - cmpl $0, 20(%rdi)
> - jz L(exit_tail5)
> - cmpl $0, 24(%rdi)
> - jz L(exit_tail6)
> - cmpl $0, 28(%rdi)
> - jz L(exit_tail7)
> -
> - pxor %xmm0, %xmm0
> -
> - lea 32(%rdi), %rax
> - addq $16, %rdi
> - and $-16, %rax
> -
> - pcmpeqd (%rax), %xmm0
> - pmovmskb %xmm0, %edx
> - pxor %xmm1, %xmm1
> - addq $16, %rax
> - test %edx, %edx
> - jnz L(exit)
> -
> - pcmpeqd (%rax), %xmm1
> - pmovmskb %xmm1, %edx
> - pxor %xmm2, %xmm2
> - addq $16, %rax
> - test %edx, %edx
> - jnz L(exit)
> -
> - pcmpeqd (%rax), %xmm2
> - pmovmskb %xmm2, %edx
> - pxor %xmm3, %xmm3
> - addq $16, %rax
> - test %edx, %edx
> - jnz L(exit)
> -
> - pcmpeqd (%rax), %xmm3
> - pmovmskb %xmm3, %edx
> - addq $16, %rax
> - test %edx, %edx
> - jnz L(exit)
> -
> - pcmpeqd (%rax), %xmm0
> - pmovmskb %xmm0, %edx
> - addq $16, %rax
> - test %edx, %edx
> - jnz L(exit)
> -
> - pcmpeqd (%rax), %xmm1
> - pmovmskb %xmm1, %edx
> - addq $16, %rax
> - test %edx, %edx
> - jnz L(exit)
> -
> - pcmpeqd (%rax), %xmm2
> - pmovmskb %xmm2, %edx
> - addq $16, %rax
> - test %edx, %edx
> - jnz L(exit)
> -
> - pcmpeqd (%rax), %xmm3
> - pmovmskb %xmm3, %edx
> - addq $16, %rax
> - test %edx, %edx
> - jnz L(exit)
> -
> - pcmpeqd (%rax), %xmm0
> - pmovmskb %xmm0, %edx
> - addq $16, %rax
> - test %edx, %edx
> - jnz L(exit)
> -
> - pcmpeqd (%rax), %xmm1
> - pmovmskb %xmm1, %edx
> - addq $16, %rax
> - test %edx, %edx
> - jnz L(exit)
> -
> - pcmpeqd (%rax), %xmm2
> - pmovmskb %xmm2, %edx
> - addq $16, %rax
> - test %edx, %edx
> - jnz L(exit)
> -
> - pcmpeqd (%rax), %xmm3
> - pmovmskb %xmm3, %edx
> - addq $16, %rax
> - test %edx, %edx
> - jnz L(exit)
> -
> - and $-0x40, %rax
> -
> - .p2align 4
> -L(aligned_64_loop):
> - movaps (%rax), %xmm0
> - movaps 16(%rax), %xmm1
> - movaps 32(%rax), %xmm2
> - movaps 48(%rax), %xmm6
> -
> - pminub %xmm1, %xmm0
> - pminub %xmm6, %xmm2
> - pminub %xmm0, %xmm2
> - pcmpeqd %xmm3, %xmm2
> - pmovmskb %xmm2, %edx
> - addq $64, %rax
> - test %edx, %edx
> - jz L(aligned_64_loop)
> -
> - pcmpeqd -64(%rax), %xmm3
> - pmovmskb %xmm3, %edx
> - addq $48, %rdi
> - test %edx, %edx
> - jnz L(exit)
> -
> - pcmpeqd %xmm1, %xmm3
> - pmovmskb %xmm3, %edx
> - addq $-16, %rdi
> - test %edx, %edx
> - jnz L(exit)
> -
> - pcmpeqd -32(%rax), %xmm3
> - pmovmskb %xmm3, %edx
> - addq $-16, %rdi
> - test %edx, %edx
> - jnz L(exit)
> -
> - pcmpeqd %xmm6, %xmm3
> - pmovmskb %xmm3, %edx
> - addq $-16, %rdi
> - test %edx, %edx
> - jz L(aligned_64_loop)
> -
> - .p2align 4
> -L(exit):
> - sub %rdi, %rax
> - shr $2, %rax
> - test %dl, %dl
> - jz L(exit_high)
> -
> - andl $15, %edx
> - jz L(exit_1)
> - ret
> -
> - /* No align here. Naturally aligned % 16 == 1. */
> -L(exit_high):
> - andl $(15 << 8), %edx
> - jz L(exit_3)
> - add $2, %rax
> - ret
> -
> - .p2align 3
> -L(exit_1):
> - add $1, %rax
> - ret
> -
> - .p2align 3
> -L(exit_3):
> - add $3, %rax
> - ret
> -
> - .p2align 3
> -L(exit_tail0):
> - xorl %eax, %eax
> - ret
> -
> - .p2align 3
> -L(exit_tail1):
> - movl $1, %eax
> - ret
> -
> - .p2align 3
> -L(exit_tail2):
> - movl $2, %eax
> - ret
> -
> - .p2align 3
> -L(exit_tail3):
> - movl $3, %eax
> - ret
> -
> - .p2align 3
> -L(exit_tail4):
> - movl $4, %eax
> - ret
> -
> - .p2align 3
> -L(exit_tail5):
> - movl $5, %eax
> - ret
> -
> - .p2align 3
> -L(exit_tail6):
> - movl $6, %eax
> - ret
> -
> - .p2align 3
> -L(exit_tail7):
> - movl $7, %eax
> - ret
> -
> -END (__wcslen)
> -
> +#define WCSLEN __wcslen
> +#include "multiarch/wcslen-sse2.S"
> weak_alias(__wcslen, wcslen)
> --
> 2.34.1
>
LGTM.
Thanks.
--
H.J.
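A note on the L(aligned_64_loop) hunk above: the bytewise pminub of four 16-byte chunks is zero at a byte position exactly when one of the chunks is zero there, so a single pcmpeqd/pmovmskb/branch covers 64 bytes per iteration. A rough C rendering of one iteration with SSE2 intrinsics, assuming 4-byte wchar_t (hypothetical helper, not glibc code):

#include <emmintrin.h>

/* Nonzero when the 64 bytes at 16-byte-aligned P may contain a zero
   wchar_t.  Like the assembly, this can over-trigger when zero bytes
   from different chunks line up to form a zero dword in the minimum,
   which is why the assembly rechecks each chunk after leaving the
   loop.  */
static inline int
chunk64_maybe_nul (const void *p)
{
  const __m128i *v = (const __m128i *) p;
  __m128i m01 = _mm_min_epu8 (_mm_load_si128 (v), _mm_load_si128 (v + 1));
  __m128i m23 = _mm_min_epu8 (_mm_load_si128 (v + 2), _mm_load_si128 (v + 3));
  __m128i m = _mm_min_epu8 (m01, m23);
  return _mm_movemask_epi8 (_mm_cmpeq_epi32 (m, _mm_setzero_si128 ()));
}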
* Re: [PATCH v1] x86: Move wcschr SSE2 implementation to multiarch/wcschr-sse2.S
2022-07-12 19:29 ` [PATCH v1] x86: Move wcschr SSE2 implementation to multiarch/wcschr-sse2.S Noah Goldstein
@ 2022-07-12 20:55 ` H.J. Lu
0 siblings, 0 replies; 21+ messages in thread
From: H.J. Lu @ 2022-07-12 20:55 UTC (permalink / raw)
To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell
On Tue, Jul 12, 2022 at 12:29 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> This commit doesn't affect libc.so.6; it's just housekeeping to prepare
> for adding explicit ISA level support.
>
> Tested build on x86_64 and x86_32 with/without multiarch.
> ---
> sysdeps/x86_64/multiarch/wcschr-sse2.S | 145 +++++++++++++++++++++++--
> sysdeps/x86_64/wcschr.S | 135 +----------------------
> 2 files changed, 138 insertions(+), 142 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/wcschr-sse2.S b/sysdeps/x86_64/multiarch/wcschr-sse2.S
> index 218ea609b9..c872926ba9 100644
> --- a/sysdeps/x86_64/multiarch/wcschr-sse2.S
> +++ b/sysdeps/x86_64/multiarch/wcschr-sse2.S
> @@ -17,14 +17,141 @@
> <https://www.gnu.org/licenses/>. */
>
> #if IS_IN (libc)
> -# define __wcschr __wcschr_sse2
> -
> -# undef weak_alias
> -# define weak_alias(__wcschr, wcschr)
> -# undef libc_hidden_def
> -# define libc_hidden_def(__wcschr)
> -# undef libc_hidden_weak
> -# define libc_hidden_weak(wcschr)
> +# ifndef WCSCHR
> +# define WCSCHR __wcschr_sse2
> +# endif
> #endif
>
> -#include "../wcschr.S"
> +#include <sysdep.h>
> +
> + .text
> +ENTRY (WCSCHR)
> +
> + movd %rsi, %xmm1
> + pxor %xmm2, %xmm2
> + mov %rdi, %rcx
> + punpckldq %xmm1, %xmm1
> + punpckldq %xmm1, %xmm1
> +
> + and $63, %rcx
> + cmp $48, %rcx
> + ja L(cross_cache)
> +
> + movdqu (%rdi), %xmm0
> + pcmpeqd %xmm0, %xmm2
> + add $16, %rdi
> + pcmpeqd %xmm1, %xmm0
> + pmovmskb %xmm2, %rdx
> + pmovmskb %xmm0, %rax
> + or %rax, %rdx
> + jnz L(matches)
> +
> + and $-16, %rdi
> +
> + movdqa (%rdi), %xmm0
> + pcmpeqd %xmm0, %xmm2
> + add $16, %rdi
> + pcmpeqd %xmm1, %xmm0
> + pmovmskb %xmm2, %rdx
> + pmovmskb %xmm0, %rax
> + or %rax, %rdx
> + jnz L(matches)
> +
> + jmp L(loop)
> +
> +L(cross_cache):
> + and $15, %rcx
> + and $-16, %rdi
> + movdqa (%rdi), %xmm0
> + pcmpeqd %xmm0, %xmm2
> + pcmpeqd %xmm1, %xmm0
> + pmovmskb %xmm2, %rdx
> + pmovmskb %xmm0, %rax
> +
> + sar %cl, %rdx
> + sar %cl, %rax
> + test %rax, %rax
> + je L(unaligned_no_match)
> +
> + bsf %rax, %rax
> + test %rdx, %rdx
> + je L(unaligned_match)
> + bsf %rdx, %rdx
> + cmp %rdx, %rax
> + ja L(return_null)
> +
> +L(unaligned_match):
> + add %rdi, %rax
> + add %rcx, %rax
> + ret
> +
> + .p2align 4
> +L(unaligned_no_match):
> + test %rdx, %rdx
> + jne L(return_null)
> + pxor %xmm2, %xmm2
> +
> + add $16, %rdi
> +
> + .p2align 4
> +/* Loop start on aligned string. */
> +L(loop):
> + movdqa (%rdi), %xmm0
> + pcmpeqd %xmm0, %xmm2
> + add $16, %rdi
> + pcmpeqd %xmm1, %xmm0
> + pmovmskb %xmm2, %rdx
> + pmovmskb %xmm0, %rax
> + or %rax, %rdx
> + jnz L(matches)
> +
> + movdqa (%rdi), %xmm0
> + pcmpeqd %xmm0, %xmm2
> + add $16, %rdi
> + pcmpeqd %xmm1, %xmm0
> + pmovmskb %xmm2, %rdx
> + pmovmskb %xmm0, %rax
> + or %rax, %rdx
> + jnz L(matches)
> +
> + movdqa (%rdi), %xmm0
> + pcmpeqd %xmm0, %xmm2
> + add $16, %rdi
> + pcmpeqd %xmm1, %xmm0
> + pmovmskb %xmm2, %rdx
> + pmovmskb %xmm0, %rax
> + or %rax, %rdx
> + jnz L(matches)
> +
> + movdqa (%rdi), %xmm0
> + pcmpeqd %xmm0, %xmm2
> + add $16, %rdi
> + pcmpeqd %xmm1, %xmm0
> + pmovmskb %xmm2, %rdx
> + pmovmskb %xmm0, %rax
> + or %rax, %rdx
> + jnz L(matches)
> + jmp L(loop)
> +
> + .p2align 4
> +L(matches):
> + pmovmskb %xmm2, %rdx
> + test %rax, %rax
> + jz L(return_null)
> + bsf %rax, %rax
> + test %rdx, %rdx
> + je L(match)
> + bsf %rdx, %rcx
> + cmp %rcx, %rax
> + ja L(return_null)
> +L(match):
> + sub $16, %rdi
> + add %rdi, %rax
> + ret
> +
> + .p2align 4
> +L(return_null):
> + xor %rax, %rax
> + ret
> +
> +END (WCSCHR)
> diff --git a/sysdeps/x86_64/wcschr.S b/sysdeps/x86_64/wcschr.S
> index 2131220382..80b12c4286 100644
> --- a/sysdeps/x86_64/wcschr.S
> +++ b/sysdeps/x86_64/wcschr.S
> @@ -16,140 +16,9 @@
> License along with the GNU C Library; if not, see
> <https://www.gnu.org/licenses/>. */
>
> -#include <sysdep.h>
> -
> - .text
> -ENTRY (__wcschr)
> -
> - movd %rsi, %xmm1
> - pxor %xmm2, %xmm2
> - mov %rdi, %rcx
> - punpckldq %xmm1, %xmm1
> - punpckldq %xmm1, %xmm1
> -
> - and $63, %rcx
> - cmp $48, %rcx
> - ja L(cross_cache)
> -
> - movdqu (%rdi), %xmm0
> - pcmpeqd %xmm0, %xmm2
> - add $16, %rdi
> - pcmpeqd %xmm1, %xmm0
> - pmovmskb %xmm2, %rdx
> - pmovmskb %xmm0, %rax
> - or %rax, %rdx
> - jnz L(matches)
> -
> - and $-16, %rdi
> -
> - movdqa (%rdi), %xmm0
> - pcmpeqd %xmm0, %xmm2
> - add $16, %rdi
> - pcmpeqd %xmm1, %xmm0
> - pmovmskb %xmm2, %rdx
> - pmovmskb %xmm0, %rax
> - or %rax, %rdx
> - jnz L(matches)
> -
> - jmp L(loop)
> -
> -L(cross_cache):
> - and $15, %rcx
> - and $-16, %rdi
> - movdqa (%rdi), %xmm0
> - pcmpeqd %xmm0, %xmm2
> - pcmpeqd %xmm1, %xmm0
> - pmovmskb %xmm2, %rdx
> - pmovmskb %xmm0, %rax
> -
> - sar %cl, %rdx
> - sar %cl, %rax
> - test %rax, %rax
> - je L(unaligned_no_match)
> -
> - bsf %rax, %rax
> - test %rdx, %rdx
> - je L(unaligned_match)
> - bsf %rdx, %rdx
> - cmp %rdx, %rax
> - ja L(return_null)
> -
> -L(unaligned_match):
> - add %rdi, %rax
> - add %rcx, %rax
> - ret
> -
> - .p2align 4
> -L(unaligned_no_match):
> - test %rdx, %rdx
> - jne L(return_null)
> - pxor %xmm2, %xmm2
> -
> - add $16, %rdi
> -
> - .p2align 4
> -/* Loop start on aligned string. */
> -L(loop):
> - movdqa (%rdi), %xmm0
> - pcmpeqd %xmm0, %xmm2
> - add $16, %rdi
> - pcmpeqd %xmm1, %xmm0
> - pmovmskb %xmm2, %rdx
> - pmovmskb %xmm0, %rax
> - or %rax, %rdx
> - jnz L(matches)
> -
> - movdqa (%rdi), %xmm0
> - pcmpeqd %xmm0, %xmm2
> - add $16, %rdi
> - pcmpeqd %xmm1, %xmm0
> - pmovmskb %xmm2, %rdx
> - pmovmskb %xmm0, %rax
> - or %rax, %rdx
> - jnz L(matches)
> -
> - movdqa (%rdi), %xmm0
> - pcmpeqd %xmm0, %xmm2
> - add $16, %rdi
> - pcmpeqd %xmm1, %xmm0
> - pmovmskb %xmm2, %rdx
> - pmovmskb %xmm0, %rax
> - or %rax, %rdx
> - jnz L(matches)
> -
> - movdqa (%rdi), %xmm0
> - pcmpeqd %xmm0, %xmm2
> - add $16, %rdi
> - pcmpeqd %xmm1, %xmm0
> - pmovmskb %xmm2, %rdx
> - pmovmskb %xmm0, %rax
> - or %rax, %rdx
> - jnz L(matches)
> - jmp L(loop)
> -
> - .p2align 4
> -L(matches):
> - pmovmskb %xmm2, %rdx
> - test %rax, %rax
> - jz L(return_null)
> - bsf %rax, %rax
> - test %rdx, %rdx
> - je L(match)
> - bsf %rdx, %rcx
> - cmp %rcx, %rax
> - ja L(return_null)
> -L(match):
> - sub $16, %rdi
> - add %rdi, %rax
> - ret
> -
> - .p2align 4
> -L(return_null):
> - xor %rax, %rax
> - ret
> -
> -END (__wcschr)
>
> +#define WCSCHR __wcschr
> +#include "multiarch/wcschr-sse2.S"
> libc_hidden_def(__wcschr)
> weak_alias (__wcschr, wcschr)
> libc_hidden_weak (wcschr)
> --
> 2.34.1
>
LGTM.
Thanks.
--
H.J.
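The core of the wcschr scan above is a 16-byte probe that compares the chunk against both the broadcast character (the punpckldq pair builds what _mm_set1_epi32 would) and against NUL, then reasons about the two byte masks: bsf of the character mask locates a match, and a NUL mask with a lower bsf means the character is absent, so NULL is returned. Roughly, in SSE2 intrinsics (hypothetical helper, 4-byte wchar_t assumed):

#include <emmintrin.h>
#include <wchar.h>

/* Fill *MASK_C and *MASK_NUL with pmovmskb-style byte masks for
   "element == C" and "element == L'\0'" over the 16 bytes at P; the
   caller compares the lowest set bits of the two masks.  */
static inline void
wcschr_probe (const wchar_t *p, wchar_t c, int *mask_c, int *mask_nul)
{
  __m128i needle = _mm_set1_epi32 ((int) c);
  __m128i chunk = _mm_loadu_si128 ((const __m128i *) p);
  *mask_c = _mm_movemask_epi8 (_mm_cmpeq_epi32 (chunk, needle));
  *mask_nul = _mm_movemask_epi8 (_mm_cmpeq_epi32 (chunk,
                                                  _mm_setzero_si128 ()));
}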
* Re: [PATCH v1] x86: Move strcat SSE2 implementation to multiarch/strcat-sse2.S
2022-07-12 19:29 ` [PATCH v1] x86: Move strcat SSE2 implementation to multiarch/strcat-sse2.S Noah Goldstein
@ 2022-07-12 21:16 ` H.J. Lu
0 siblings, 0 replies; 21+ messages in thread
From: H.J. Lu @ 2022-07-12 21:16 UTC (permalink / raw)
To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell
On Tue, Jul 12, 2022 at 12:29 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> This commit doesn't affect libc.so.6; it's just housekeeping to prepare
> for adding explicit ISA level support.
>
> Tested build on x86_64 and x86_32 with/without multiarch.
> ---
> sysdeps/x86_64/multiarch/strcat-sse2.S | 242 ++++++++++++++++++++++++-
> sysdeps/x86_64/strcat.S | 239 +-----------------------
> 2 files changed, 238 insertions(+), 243 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/strcat-sse2.S b/sysdeps/x86_64/multiarch/strcat-sse2.S
> index 449e102438..244c4a6d74 100644
> --- a/sysdeps/x86_64/multiarch/strcat-sse2.S
> +++ b/sysdeps/x86_64/multiarch/strcat-sse2.S
> @@ -17,12 +17,242 @@
> <https://www.gnu.org/licenses/>. */
>
> #if IS_IN (libc)
> +# ifndef STRCAT
> +# define STRCAT __strcat_sse2
> +# endif
> +#endif
>
> -# include <sysdep.h>
> -# define strcat __strcat_sse2
> +#include <sysdep.h>
> +
> + .text
> +ENTRY (STRCAT)
> + movq %rdi, %rcx /* Dest. register. */
> + andl $7, %ecx /* mask alignment bits */
> + movq %rdi, %rax /* Duplicate destination pointer. */
> + movq $0xfefefefefefefeff,%r8
> +
> + /* First step: Find end of destination. */
> + jz 4f /* aligned => start loop */
> +
> + neg %ecx /* We need to align to 8 bytes. */
> + addl $8,%ecx
> + /* Search the first bytes directly. */
> +0: cmpb $0x0,(%rax) /* is byte NUL? */
> + je 2f /* yes => start copy */
> + incq %rax /* increment pointer */
> + decl %ecx
> + jnz 0b
> +
> +
> +
> + /* Now the source is aligned. Scan for NUL byte. */
> + .p2align 4
> +4:
> + /* First unroll. */
> + movq (%rax), %rcx /* get double word (= 8 bytes) in question */
> + addq $8,%rax /* adjust pointer for next word */
> + movq %r8, %rdx /* magic value */
> + addq %rcx, %rdx /* add the magic value to the word. We get
> + carry bits reported for each byte which
> + is *not* 0 */
> + jnc 3f /* highest byte is NUL => return pointer */
> + xorq %rcx, %rdx /* (word+magic)^word */
> + orq %r8, %rdx /* set all non-carry bits */
> + incq %rdx /* add 1: if one carry bit was *not* set
> + the addition will not result in 0. */
> + jnz 3f /* found NUL => return pointer */
> +
> + /* Second unroll. */
> + movq (%rax), %rcx /* get double word (= 8 bytes) in question */
> + addq $8,%rax /* adjust pointer for next word */
> + movq %r8, %rdx /* magic value */
> + addq %rcx, %rdx /* add the magic value to the word. We get
> + carry bits reported for each byte which
> + is *not* 0 */
> + jnc 3f /* highest byte is NUL => return pointer */
> + xorq %rcx, %rdx /* (word+magic)^word */
> + orq %r8, %rdx /* set all non-carry bits */
> + incq %rdx /* add 1: if one carry bit was *not* set
> + the addition will not result in 0. */
> + jnz 3f /* found NUL => return pointer */
> +
> + /* Third unroll. */
> + movq (%rax), %rcx /* get double word (= 8 bytes) in question */
> + addq $8,%rax /* adjust pointer for next word */
> + movq %r8, %rdx /* magic value */
> + addq %rcx, %rdx /* add the magic value to the word. We get
> + carry bits reported for each byte which
> + is *not* 0 */
> + jnc 3f /* highest byte is NUL => return pointer */
> + xorq %rcx, %rdx /* (word+magic)^word */
> + orq %r8, %rdx /* set all non-carry bits */
> + incq %rdx /* add 1: if one carry bit was *not* set
> + the addition will not result in 0. */
> + jnz 3f /* found NUL => return pointer */
> +
> + /* Fourth unroll. */
> + movq (%rax), %rcx /* get double word (= 8 bytes) in question */
> + addq $8,%rax /* adjust pointer for next word */
> + movq %r8, %rdx /* magic value */
> + addq %rcx, %rdx /* add the magic value to the word. We get
> + carry bits reported for each byte which
> + is *not* 0 */
> + jnc 3f /* highest byte is NUL => return pointer */
> + xorq %rcx, %rdx /* (word+magic)^word */
> + orq %r8, %rdx /* set all non-carry bits */
> + incq %rdx /* add 1: if one carry bit was *not* set
> + the addition will not result in 0. */
> + jz 4b /* no NUL found => continue loop */
> +
> + .p2align 4 /* Align, it's a jump target. */
> +3: subq $8,%rax /* correct pointer increment. */
> +
> + testb %cl, %cl /* is first byte NUL? */
> + jz 2f /* yes => return */
> + incq %rax /* increment pointer */
> +
> + testb %ch, %ch /* is second byte NUL? */
> + jz 2f /* yes => return */
> + incq %rax /* increment pointer */
> +
> + testl $0x00ff0000, %ecx /* is third byte NUL? */
> + jz 2f /* yes => return pointer */
> + incq %rax /* increment pointer */
> +
> + testl $0xff000000, %ecx /* is fourth byte NUL? */
> + jz 2f /* yes => return pointer */
> + incq %rax /* increment pointer */
> +
> + shrq $32, %rcx /* look at other half. */
> +
> + testb %cl, %cl /* is first byte NUL? */
> + jz 2f /* yes => return */
> + incq %rax /* increment pointer */
> +
> + testb %ch, %ch /* is second byte NUL? */
> + jz 2f /* yes => return */
> + incq %rax /* increment pointer */
> +
> + testl $0xff0000, %ecx /* is third byte NUL? */
> + jz 2f /* yes => return pointer */
> + incq %rax /* increment pointer */
> +
> +2:
> + /* Second step: Copy source to destination. */
> +
> + movq %rsi, %rcx /* duplicate */
> + andl $7,%ecx /* mask alignment bits */
> + movq %rax, %rdx /* move around */
> + jz 22f /* aligned => start loop */
> +
> + neg %ecx /* align to 8 bytes. */
> + addl $8, %ecx
> + /* Align the source pointer. */
> +21:
> + movb (%rsi), %al /* Fetch a byte */
> + testb %al, %al /* Is it NUL? */
> + movb %al, (%rdx) /* Store it */
> + jz 24f /* If it was NUL, done! */
> + incq %rsi
> + incq %rdx
> + decl %ecx
> + jnz 21b
> +
> + /* Now the source is aligned. Unfortunately we cannot force
> + to have both source and destination aligned, so ignore the
> + alignment of the destination. */
> + .p2align 4
> +22:
> + /* 1st unroll. */
> + movq (%rsi), %rax /* Read double word (8 bytes). */
> + addq $8, %rsi /* Adjust pointer for next word. */
> + movq %rax, %r9 /* Save a copy for NUL finding. */
> + addq %r8, %r9 /* add the magic value to the word. We get
> + carry bits reported for each byte which
> + is *not* 0 */
> + jnc 23f /* highest byte is NUL => return pointer */
> + xorq %rax, %r9 /* (word+magic)^word */
> + orq %r8, %r9 /* set all non-carry bits */
> + incq %r9 /* add 1: if one carry bit was *not* set
> + the addition will not result in 0. */
> +
> + jnz 23f /* found NUL => return pointer */
> +
> + movq %rax, (%rdx) /* Write value to destination. */
> + addq $8, %rdx /* Adjust pointer. */
> +
> + /* 2nd unroll. */
> + movq (%rsi), %rax /* Read double word (8 bytes). */
> + addq $8, %rsi /* Adjust pointer for next word. */
> + movq %rax, %r9 /* Save a copy for NUL finding. */
> + addq %r8, %r9 /* add the magic value to the word. We get
> + carry bits reported for each byte which
> + is *not* 0 */
> + jnc 23f /* highest byte is NUL => return pointer */
> + xorq %rax, %r9 /* (word+magic)^word */
> + orq %r8, %r9 /* set all non-carry bits */
> + incq %r9 /* add 1: if one carry bit was *not* set
> + the addition will not result in 0. */
> +
> + jnz 23f /* found NUL => return pointer */
> +
> + movq %rax, (%rdx) /* Write value to destination. */
> + addq $8, %rdx /* Adjust pointer. */
> +
> + /* 3rd unroll. */
> + movq (%rsi), %rax /* Read double word (8 bytes). */
> + addq $8, %rsi /* Adjust pointer for next word. */
> + movq %rax, %r9 /* Save a copy for NUL finding. */
> + addq %r8, %r9 /* add the magic value to the word. We get
> + carry bits reported for each byte which
> + is *not* 0 */
> + jnc 23f /* highest byte is NUL => return pointer */
> + xorq %rax, %r9 /* (word+magic)^word */
> + orq %r8, %r9 /* set all non-carry bits */
> + incq %r9 /* add 1: if one carry bit was *not* set
> + the addition will not result in 0. */
> +
> + jnz 23f /* found NUL => return pointer */
> +
> + movq %rax, (%rdx) /* Write value to destination. */
> + addq $8, %rdx /* Adjust pointer. */
> +
> + /* 4th unroll. */
> + movq (%rsi), %rax /* Read double word (8 bytes). */
> + addq $8, %rsi /* Adjust pointer for next word. */
> + movq %rax, %r9 /* Save a copy for NUL finding. */
> + addq %r8, %r9 /* add the magic value to the word. We get
> + carry bits reported for each byte which
> + is *not* 0 */
> + jnc 23f /* highest byte is NUL => return pointer */
> + xorq %rax, %r9 /* (word+magic)^word */
> + orq %r8, %r9 /* set all non-carry bits */
> + incq %r9 /* add 1: if one carry bit was *not* set
> + the addition will not result in 0. */
> +
> + jnz 23f /* found NUL => return pointer */
> +
> + movq %rax, (%rdx) /* Write value to destination. */
> + addq $8, %rdx /* Adjust pointer. */
> + jmp 22b /* Next iteration. */
> +
> + /* Do the last few bytes. %rax contains the value to write.
> + The loop is unrolled twice. */
> + .p2align 4
> +23:
> + movb %al, (%rdx) /* 1st byte. */
> + testb %al, %al /* Is it NUL? */
> + jz 24f /* yes, finish. */
> + incq %rdx /* Increment destination. */
> + movb %ah, (%rdx) /* 2nd byte. */
> + testb %ah, %ah /* Is it NUL? */
> + jz 24f /* yes, finish. */
> + incq %rdx /* Increment destination. */
> + shrq $16, %rax /* Shift... */
> + jmp 23b /* and look at next two bytes in %rax. */
>
> -# undef libc_hidden_builtin_def
> -# define libc_hidden_builtin_def(strcat)
> -#endif
>
> -#include <sysdeps/x86_64/strcat.S>
> +24:
> + movq %rdi, %rax /* Source is return value. */
> + retq
> +END (STRCAT)
> diff --git a/sysdeps/x86_64/strcat.S b/sysdeps/x86_64/strcat.S
> index 565a9c785a..fc3e8a9bcf 100644
> --- a/sysdeps/x86_64/strcat.S
> +++ b/sysdeps/x86_64/strcat.S
> @@ -17,241 +17,6 @@
> License along with the GNU C Library; if not, see
> <https://www.gnu.org/licenses/>. */
>
> -#include <sysdep.h>
> -#include "asm-syntax.h"
> -
> -/* Will be removed when new strcpy implementation gets merged. */
> -
> - .text
> -ENTRY (strcat)
> - movq %rdi, %rcx /* Dest. register. */
> - andl $7, %ecx /* mask alignment bits */
> - movq %rdi, %rax /* Duplicate destination pointer. */
> - movq $0xfefefefefefefeff,%r8
> -
> - /* First step: Find end of destination. */
> - jz 4f /* aligned => start loop */
> -
> - neg %ecx /* We need to align to 8 bytes. */
> - addl $8,%ecx
> - /* Search the first bytes directly. */
> -0: cmpb $0x0,(%rax) /* is byte NUL? */
> - je 2f /* yes => start copy */
> - incq %rax /* increment pointer */
> - decl %ecx
> - jnz 0b
> -
> -
> -
> - /* Now the source is aligned. Scan for NUL byte. */
> - .p2align 4
> -4:
> - /* First unroll. */
> - movq (%rax), %rcx /* get double word (= 8 bytes) in question */
> - addq $8,%rax /* adjust pointer for next word */
> - movq %r8, %rdx /* magic value */
> - addq %rcx, %rdx /* add the magic value to the word. We get
> - carry bits reported for each byte which
> - is *not* 0 */
> - jnc 3f /* highest byte is NUL => return pointer */
> - xorq %rcx, %rdx /* (word+magic)^word */
> - orq %r8, %rdx /* set all non-carry bits */
> - incq %rdx /* add 1: if one carry bit was *not* set
> - the addition will not result in 0. */
> - jnz 3f /* found NUL => return pointer */
> -
> - /* Second unroll. */
> - movq (%rax), %rcx /* get double word (= 8 bytes) in question */
> - addq $8,%rax /* adjust pointer for next word */
> - movq %r8, %rdx /* magic value */
> - addq %rcx, %rdx /* add the magic value to the word. We get
> - carry bits reported for each byte which
> - is *not* 0 */
> - jnc 3f /* highest byte is NUL => return pointer */
> - xorq %rcx, %rdx /* (word+magic)^word */
> - orq %r8, %rdx /* set all non-carry bits */
> - incq %rdx /* add 1: if one carry bit was *not* set
> - the addition will not result in 0. */
> - jnz 3f /* found NUL => return pointer */
> -
> - /* Third unroll. */
> - movq (%rax), %rcx /* get double word (= 8 bytes) in question */
> - addq $8,%rax /* adjust pointer for next word */
> - movq %r8, %rdx /* magic value */
> - addq %rcx, %rdx /* add the magic value to the word. We get
> - carry bits reported for each byte which
> - is *not* 0 */
> - jnc 3f /* highest byte is NUL => return pointer */
> - xorq %rcx, %rdx /* (word+magic)^word */
> - orq %r8, %rdx /* set all non-carry bits */
> - incq %rdx /* add 1: if one carry bit was *not* set
> - the addition will not result in 0. */
> - jnz 3f /* found NUL => return pointer */
> -
> - /* Fourth unroll. */
> - movq (%rax), %rcx /* get double word (= 8 bytes) in question */
> - addq $8,%rax /* adjust pointer for next word */
> - movq %r8, %rdx /* magic value */
> - addq %rcx, %rdx /* add the magic value to the word. We get
> - carry bits reported for each byte which
> - is *not* 0 */
> - jnc 3f /* highest byte is NUL => return pointer */
> - xorq %rcx, %rdx /* (word+magic)^word */
> - orq %r8, %rdx /* set all non-carry bits */
> - incq %rdx /* add 1: if one carry bit was *not* set
> - the addition will not result in 0. */
> - jz 4b /* no NUL found => continue loop */
> -
> - .p2align 4 /* Align, it's a jump target. */
> -3: subq $8,%rax /* correct pointer increment. */
> -
> - testb %cl, %cl /* is first byte NUL? */
> - jz 2f /* yes => return */
> - incq %rax /* increment pointer */
> -
> - testb %ch, %ch /* is second byte NUL? */
> - jz 2f /* yes => return */
> - incq %rax /* increment pointer */
> -
> - testl $0x00ff0000, %ecx /* is third byte NUL? */
> - jz 2f /* yes => return pointer */
> - incq %rax /* increment pointer */
> -
> - testl $0xff000000, %ecx /* is fourth byte NUL? */
> - jz 2f /* yes => return pointer */
> - incq %rax /* increment pointer */
> -
> - shrq $32, %rcx /* look at other half. */
> -
> - testb %cl, %cl /* is first byte NUL? */
> - jz 2f /* yes => return */
> - incq %rax /* increment pointer */
> -
> - testb %ch, %ch /* is second byte NUL? */
> - jz 2f /* yes => return */
> - incq %rax /* increment pointer */
> -
> - testl $0xff0000, %ecx /* is third byte NUL? */
> - jz 2f /* yes => return pointer */
> - incq %rax /* increment pointer */
> -
> -2:
> - /* Second step: Copy source to destination. */
> -
> - movq %rsi, %rcx /* duplicate */
> - andl $7,%ecx /* mask alignment bits */
> - movq %rax, %rdx /* move around */
> - jz 22f /* aligned => start loop */
> -
> - neg %ecx /* align to 8 bytes. */
> - addl $8, %ecx
> - /* Align the source pointer. */
> -21:
> - movb (%rsi), %al /* Fetch a byte */
> - testb %al, %al /* Is it NUL? */
> - movb %al, (%rdx) /* Store it */
> - jz 24f /* If it was NUL, done! */
> - incq %rsi
> - incq %rdx
> - decl %ecx
> - jnz 21b
> -
> - /* Now the source is aligned. Unfortunately we cannot force
> - to have both source and destination aligned, so ignore the
> - alignment of the destination. */
> - .p2align 4
> -22:
> - /* 1st unroll. */
> - movq (%rsi), %rax /* Read double word (8 bytes). */
> - addq $8, %rsi /* Adjust pointer for next word. */
> - movq %rax, %r9 /* Save a copy for NUL finding. */
> - addq %r8, %r9 /* add the magic value to the word. We get
> - carry bits reported for each byte which
> - is *not* 0 */
> - jnc 23f /* highest byte is NUL => return pointer */
> - xorq %rax, %r9 /* (word+magic)^word */
> - orq %r8, %r9 /* set all non-carry bits */
> - incq %r9 /* add 1: if one carry bit was *not* set
> - the addition will not result in 0. */
> -
> - jnz 23f /* found NUL => return pointer */
> -
> - movq %rax, (%rdx) /* Write value to destination. */
> - addq $8, %rdx /* Adjust pointer. */
> -
> - /* 2nd unroll. */
> - movq (%rsi), %rax /* Read double word (8 bytes). */
> - addq $8, %rsi /* Adjust pointer for next word. */
> - movq %rax, %r9 /* Save a copy for NUL finding. */
> - addq %r8, %r9 /* add the magic value to the word. We get
> - carry bits reported for each byte which
> - is *not* 0 */
> - jnc 23f /* highest byte is NUL => return pointer */
> - xorq %rax, %r9 /* (word+magic)^word */
> - orq %r8, %r9 /* set all non-carry bits */
> - incq %r9 /* add 1: if one carry bit was *not* set
> - the addition will not result in 0. */
> -
> - jnz 23f /* found NUL => return pointer */
> -
> - movq %rax, (%rdx) /* Write value to destination. */
> - addq $8, %rdx /* Adjust pointer. */
> -
> - /* 3rd unroll. */
> - movq (%rsi), %rax /* Read double word (8 bytes). */
> - addq $8, %rsi /* Adjust pointer for next word. */
> - movq %rax, %r9 /* Save a copy for NUL finding. */
> - addq %r8, %r9 /* add the magic value to the word. We get
> - carry bits reported for each byte which
> - is *not* 0 */
> - jnc 23f /* highest byte is NUL => return pointer */
> - xorq %rax, %r9 /* (word+magic)^word */
> - orq %r8, %r9 /* set all non-carry bits */
> - incq %r9 /* add 1: if one carry bit was *not* set
> - the addition will not result in 0. */
> -
> - jnz 23f /* found NUL => return pointer */
> -
> - movq %rax, (%rdx) /* Write value to destination. */
> - addq $8, %rdx /* Adjust pointer. */
> -
> - /* 4th unroll. */
> - movq (%rsi), %rax /* Read double word (8 bytes). */
> - addq $8, %rsi /* Adjust pointer for next word. */
> - movq %rax, %r9 /* Save a copy for NUL finding. */
> - addq %r8, %r9 /* add the magic value to the word. We get
> - carry bits reported for each byte which
> - is *not* 0 */
> - jnc 23f /* highest byte is NUL => return pointer */
> - xorq %rax, %r9 /* (word+magic)^word */
> - orq %r8, %r9 /* set all non-carry bits */
> - incq %r9 /* add 1: if one carry bit was *not* set
> - the addition will not result in 0. */
> -
> - jnz 23f /* found NUL => return pointer */
> -
> - movq %rax, (%rdx) /* Write value to destination. */
> - addq $8, %rdx /* Adjust pointer. */
> - jmp 22b /* Next iteration. */
> -
> - /* Do the last few bytes. %rax contains the value to write.
> - The loop is unrolled twice. */
> - .p2align 4
> -23:
> - movb %al, (%rdx) /* 1st byte. */
> - testb %al, %al /* Is it NUL? */
> - jz 24f /* yes, finish. */
> - incq %rdx /* Increment destination. */
> - movb %ah, (%rdx) /* 2nd byte. */
> - testb %ah, %ah /* Is it NUL? */
> - jz 24f /* yes, finish. */
> - incq %rdx /* Increment destination. */
> - shrq $16, %rax /* Shift... */
> - jmp 23b /* and look at next two bytes in %rax. */
> -
> -
> -24:
> - movq %rdi, %rax /* Source is return value. */
> - retq
> -END (strcat)
> +#define STRCAT strcat
> +#include "multiarch/strcat-sse2.S"
> libc_hidden_builtin_def (strcat)
> --
> 2.34.1
>
LGTM.
Thanks.
--
H.J.
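About the 0xfefefefefefefeff constant in the strcat hunks above: it equals -0x0101010101010101, so the commented "add the magic value" step is the subtraction in the classic word-at-a-time zero-byte test. The same predicate in its usual branch-free C form (a sketch of the idea, not glibc code):

#include <stdint.h>

/* Nonzero exactly when the 64-bit word W contains a zero byte.  */
static inline int
word_has_nul (uint64_t w)
{
  const uint64_t ones = 0x0101010101010101ULL;   /* == -magic */
  const uint64_t highs = 0x8080808080808080ULL;
  return ((w - ones) & ~w & highs) != 0;
}

The assembly reaches the same answer through the carry flag and the xor/or/inc sequence, then recovers which byte was zero with the testb/shrq chain at label 3.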
* Re: [PATCH v1] x86: Move strchr SSE2 implementation to multiarch/strchr-sse2.S
2022-07-12 19:29 ` [PATCH v1] x86: Move strchr SSE2 implementation to multiarch/strchr-sse2.S Noah Goldstein
@ 2022-07-12 21:27 ` H.J. Lu
0 siblings, 0 replies; 21+ messages in thread
From: H.J. Lu @ 2022-07-12 21:27 UTC (permalink / raw)
To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell
On Tue, Jul 12, 2022 at 12:29 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> This commit doesn't affect libc.so.6; it's just housekeeping to prepare
> for adding explicit ISA level support.
>
> Tested build on x86_64 and x86_32 with/without multiarch.
> ---
> sysdeps/x86_64/multiarch/rtld-strchr.S | 18 +++
> sysdeps/x86_64/multiarch/rtld-strchrnul.S | 18 +++
> sysdeps/x86_64/multiarch/strchr-sse2.S | 175 +++++++++++++++++++++-
> sysdeps/x86_64/multiarch/strchrnul-sse2.S | 11 +-
> sysdeps/x86_64/strchr.S | 167 +--------------------
> sysdeps/x86_64/strchrnul.S | 7 +-
> 6 files changed, 213 insertions(+), 183 deletions(-)
> create mode 100644 sysdeps/x86_64/multiarch/rtld-strchr.S
> create mode 100644 sysdeps/x86_64/multiarch/rtld-strchrnul.S
>
> diff --git a/sysdeps/x86_64/multiarch/rtld-strchr.S b/sysdeps/x86_64/multiarch/rtld-strchr.S
> new file mode 100644
> index 0000000000..2b7b879e37
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/rtld-strchr.S
> @@ -0,0 +1,18 @@
> +/* Copyright (C) 2022 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#include "../strchr.S"
> diff --git a/sysdeps/x86_64/multiarch/rtld-strchrnul.S b/sysdeps/x86_64/multiarch/rtld-strchrnul.S
> new file mode 100644
> index 0000000000..0cc5becc88
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/rtld-strchrnul.S
> @@ -0,0 +1,18 @@
> +/* Copyright (C) 2022 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#include "../strchrnul.S"
> diff --git a/sysdeps/x86_64/multiarch/strchr-sse2.S b/sysdeps/x86_64/multiarch/strchr-sse2.S
> index 992f700077..f7767ca543 100644
> --- a/sysdeps/x86_64/multiarch/strchr-sse2.S
> +++ b/sysdeps/x86_64/multiarch/strchr-sse2.S
> @@ -16,13 +16,172 @@
> License along with the GNU C Library; if not, see
> <https://www.gnu.org/licenses/>. */
>
> -#if IS_IN (libc)
> -# define strchr __strchr_sse2
> +#if IS_IN (libc) || defined STRCHR
> +# ifndef STRCHR
> +# define STRCHR __strchr_sse2
> +# endif
>
> -# undef weak_alias
> -# define weak_alias(strchr, index)
> -# undef libc_hidden_builtin_def
> -# define libc_hidden_builtin_def(strchr)
> -#endif
> +# include <sysdep.h>
> +
> + .text
> +ENTRY (STRCHR)
> + movd %esi, %xmm1
> + movl %edi, %eax
> + andl $4095, %eax
> + punpcklbw %xmm1, %xmm1
> + cmpl $4032, %eax
> + punpcklwd %xmm1, %xmm1
> + pshufd $0, %xmm1, %xmm1
> + jg L(cross_page)
> + movdqu (%rdi), %xmm0
> + pxor %xmm3, %xmm3
> + movdqa %xmm0, %xmm4
> + pcmpeqb %xmm1, %xmm0
> + pcmpeqb %xmm3, %xmm4
> + por %xmm4, %xmm0
> + pmovmskb %xmm0, %eax
> + test %eax, %eax
> + je L(next_48_bytes)
> + bsf %eax, %eax
> +# ifdef AS_STRCHRNUL
> + leaq (%rdi,%rax), %rax
> +# else
> + movl $0, %edx
> + leaq (%rdi,%rax), %rax
> + cmpb %sil, (%rax)
> + cmovne %rdx, %rax
> +# endif
> + ret
> +
> + .p2align 3
> +L(next_48_bytes):
> + movdqu 16(%rdi), %xmm0
> + movdqa %xmm0, %xmm4
> + pcmpeqb %xmm1, %xmm0
> + pcmpeqb %xmm3, %xmm4
> + por %xmm4, %xmm0
> + pmovmskb %xmm0, %ecx
> + movdqu 32(%rdi), %xmm0
> + movdqa %xmm0, %xmm4
> + pcmpeqb %xmm1, %xmm0
> + salq $16, %rcx
> + pcmpeqb %xmm3, %xmm4
> + por %xmm4, %xmm0
> + pmovmskb %xmm0, %eax
> + movdqu 48(%rdi), %xmm0
> + pcmpeqb %xmm0, %xmm3
> + salq $32, %rax
> + pcmpeqb %xmm1, %xmm0
> + orq %rcx, %rax
> + por %xmm3, %xmm0
> + pmovmskb %xmm0, %ecx
> + salq $48, %rcx
> + orq %rcx, %rax
> + testq %rax, %rax
> + jne L(return)
> +L(loop_start):
> + /* We use this alignment to force the loop to be aligned to 8 but not
> + 16 bytes. This gives better scheduling on AMD processors. */
> + .p2align 4
> + pxor %xmm6, %xmm6
> + andq $-64, %rdi
> + .p2align 3
> +L(loop64):
> + addq $64, %rdi
> + movdqa (%rdi), %xmm5
> + movdqa 16(%rdi), %xmm2
> + movdqa 32(%rdi), %xmm3
> + pxor %xmm1, %xmm5
> + movdqa 48(%rdi), %xmm4
> + pxor %xmm1, %xmm2
> + pxor %xmm1, %xmm3
> + pminub (%rdi), %xmm5
> + pxor %xmm1, %xmm4
> + pminub 16(%rdi), %xmm2
> + pminub 32(%rdi), %xmm3
> + pminub %xmm2, %xmm5
> + pminub 48(%rdi), %xmm4
> + pminub %xmm3, %xmm5
> + pminub %xmm4, %xmm5
> + pcmpeqb %xmm6, %xmm5
> + pmovmskb %xmm5, %eax
> +
> + testl %eax, %eax
> + je L(loop64)
>
> -#include "../strchr.S"
> + movdqa (%rdi), %xmm5
> + movdqa %xmm5, %xmm0
> + pcmpeqb %xmm1, %xmm5
> + pcmpeqb %xmm6, %xmm0
> + por %xmm0, %xmm5
> + pcmpeqb %xmm6, %xmm2
> + pcmpeqb %xmm6, %xmm3
> + pcmpeqb %xmm6, %xmm4
> +
> + pmovmskb %xmm5, %ecx
> + pmovmskb %xmm2, %eax
> + salq $16, %rax
> + pmovmskb %xmm3, %r8d
> + pmovmskb %xmm4, %edx
> + salq $32, %r8
> + orq %r8, %rax
> + orq %rcx, %rax
> + salq $48, %rdx
> + orq %rdx, %rax
> + .p2align 3
> +L(return):
> + bsfq %rax, %rax
> +# ifdef AS_STRCHRNUL
> + leaq (%rdi,%rax), %rax
> +# else
> + movl $0, %edx
> + leaq (%rdi,%rax), %rax
> + cmpb %sil, (%rax)
> + cmovne %rdx, %rax
> +# endif
> + ret
> + .p2align 4
> +
> +L(cross_page):
> + movq %rdi, %rdx
> + pxor %xmm2, %xmm2
> + andq $-64, %rdx
> + movdqa %xmm1, %xmm0
> + movdqa (%rdx), %xmm3
> + movdqa %xmm3, %xmm4
> + pcmpeqb %xmm1, %xmm3
> + pcmpeqb %xmm2, %xmm4
> + por %xmm4, %xmm3
> + pmovmskb %xmm3, %r8d
> + movdqa 16(%rdx), %xmm3
> + movdqa %xmm3, %xmm4
> + pcmpeqb %xmm1, %xmm3
> + pcmpeqb %xmm2, %xmm4
> + por %xmm4, %xmm3
> + pmovmskb %xmm3, %eax
> + movdqa 32(%rdx), %xmm3
> + movdqa %xmm3, %xmm4
> + pcmpeqb %xmm1, %xmm3
> + salq $16, %rax
> + pcmpeqb %xmm2, %xmm4
> + por %xmm4, %xmm3
> + pmovmskb %xmm3, %r9d
> + movdqa 48(%rdx), %xmm3
> + pcmpeqb %xmm3, %xmm2
> + salq $32, %r9
> + pcmpeqb %xmm3, %xmm0
> + orq %r9, %rax
> + orq %r8, %rax
> + por %xmm2, %xmm0
> + pmovmskb %xmm0, %ecx
> + salq $48, %rcx
> + orq %rcx, %rax
> + movl %edi, %ecx
> + subb %dl, %cl
> + shrq %cl, %rax
> + testq %rax, %rax
> + jne L(return)
> + jmp L(loop_start)
> +
> +END (STRCHR)
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/strchrnul-sse2.S b/sysdeps/x86_64/multiarch/strchrnul-sse2.S
> index f91c670369..7238977a21 100644
> --- a/sysdeps/x86_64/multiarch/strchrnul-sse2.S
> +++ b/sysdeps/x86_64/multiarch/strchrnul-sse2.S
> @@ -17,10 +17,11 @@
> <https://www.gnu.org/licenses/>. */
>
> #if IS_IN (libc)
> -# define __strchrnul __strchrnul_sse2
> -
> -# undef weak_alias
> -# define weak_alias(__strchrnul, strchrnul)
> +# ifndef STRCHR
> +# define STRCHR __strchrnul_sse2
> +# endif
> #endif
>
> -#include "../strchrnul.S"
> +#define AS_STRCHRNUL
> +
> +#include "strchr-sse2.S"
> diff --git a/sysdeps/x86_64/strchr.S b/sysdeps/x86_64/strchr.S
> index dda7c0431d..77c956c92c 100644
> --- a/sysdeps/x86_64/strchr.S
> +++ b/sysdeps/x86_64/strchr.S
> @@ -17,171 +17,8 @@
> License along with the GNU C Library; if not, see
> <https://www.gnu.org/licenses/>. */
>
> -#include <sysdep.h>
>
> - .text
> -ENTRY (strchr)
> - movd %esi, %xmm1
> - movl %edi, %eax
> - andl $4095, %eax
> - punpcklbw %xmm1, %xmm1
> - cmpl $4032, %eax
> - punpcklwd %xmm1, %xmm1
> - pshufd $0, %xmm1, %xmm1
> - jg L(cross_page)
> - movdqu (%rdi), %xmm0
> - pxor %xmm3, %xmm3
> - movdqa %xmm0, %xmm4
> - pcmpeqb %xmm1, %xmm0
> - pcmpeqb %xmm3, %xmm4
> - por %xmm4, %xmm0
> - pmovmskb %xmm0, %eax
> - test %eax, %eax
> - je L(next_48_bytes)
> - bsf %eax, %eax
> -#ifdef AS_STRCHRNUL
> - leaq (%rdi,%rax), %rax
> -#else
> - movl $0, %edx
> - leaq (%rdi,%rax), %rax
> - cmpb %sil, (%rax)
> - cmovne %rdx, %rax
> -#endif
> - ret
> -
> - .p2align 3
> - L(next_48_bytes):
> - movdqu 16(%rdi), %xmm0
> - movdqa %xmm0, %xmm4
> - pcmpeqb %xmm1, %xmm0
> - pcmpeqb %xmm3, %xmm4
> - por %xmm4, %xmm0
> - pmovmskb %xmm0, %ecx
> - movdqu 32(%rdi), %xmm0
> - movdqa %xmm0, %xmm4
> - pcmpeqb %xmm1, %xmm0
> - salq $16, %rcx
> - pcmpeqb %xmm3, %xmm4
> - por %xmm4, %xmm0
> - pmovmskb %xmm0, %eax
> - movdqu 48(%rdi), %xmm0
> - pcmpeqb %xmm0, %xmm3
> - salq $32, %rax
> - pcmpeqb %xmm1, %xmm0
> - orq %rcx, %rax
> - por %xmm3, %xmm0
> - pmovmskb %xmm0, %ecx
> - salq $48, %rcx
> - orq %rcx, %rax
> - testq %rax, %rax
> - jne L(return)
> -L(loop_start):
> - /* We use this alignment to force loop be aligned to 8 but not
> - 16 bytes. This gives better sheduling on AMD processors. */
> - .p2align 4
> - pxor %xmm6, %xmm6
> - andq $-64, %rdi
> - .p2align 3
> -L(loop64):
> - addq $64, %rdi
> - movdqa (%rdi), %xmm5
> - movdqa 16(%rdi), %xmm2
> - movdqa 32(%rdi), %xmm3
> - pxor %xmm1, %xmm5
> - movdqa 48(%rdi), %xmm4
> - pxor %xmm1, %xmm2
> - pxor %xmm1, %xmm3
> - pminub (%rdi), %xmm5
> - pxor %xmm1, %xmm4
> - pminub 16(%rdi), %xmm2
> - pminub 32(%rdi), %xmm3
> - pminub %xmm2, %xmm5
> - pminub 48(%rdi), %xmm4
> - pminub %xmm3, %xmm5
> - pminub %xmm4, %xmm5
> - pcmpeqb %xmm6, %xmm5
> - pmovmskb %xmm5, %eax
> -
> - testl %eax, %eax
> - je L(loop64)
> -
> - movdqa (%rdi), %xmm5
> - movdqa %xmm5, %xmm0
> - pcmpeqb %xmm1, %xmm5
> - pcmpeqb %xmm6, %xmm0
> - por %xmm0, %xmm5
> - pcmpeqb %xmm6, %xmm2
> - pcmpeqb %xmm6, %xmm3
> - pcmpeqb %xmm6, %xmm4
> -
> - pmovmskb %xmm5, %ecx
> - pmovmskb %xmm2, %eax
> - salq $16, %rax
> - pmovmskb %xmm3, %r8d
> - pmovmskb %xmm4, %edx
> - salq $32, %r8
> - orq %r8, %rax
> - orq %rcx, %rax
> - salq $48, %rdx
> - orq %rdx, %rax
> - .p2align 3
> -L(return):
> - bsfq %rax, %rax
> -#ifdef AS_STRCHRNUL
> - leaq (%rdi,%rax), %rax
> -#else
> - movl $0, %edx
> - leaq (%rdi,%rax), %rax
> - cmpb %sil, (%rax)
> - cmovne %rdx, %rax
> -#endif
> - ret
> - .p2align 4
> -
> -L(cross_page):
> - movq %rdi, %rdx
> - pxor %xmm2, %xmm2
> - andq $-64, %rdx
> - movdqa %xmm1, %xmm0
> - movdqa (%rdx), %xmm3
> - movdqa %xmm3, %xmm4
> - pcmpeqb %xmm1, %xmm3
> - pcmpeqb %xmm2, %xmm4
> - por %xmm4, %xmm3
> - pmovmskb %xmm3, %r8d
> - movdqa 16(%rdx), %xmm3
> - movdqa %xmm3, %xmm4
> - pcmpeqb %xmm1, %xmm3
> - pcmpeqb %xmm2, %xmm4
> - por %xmm4, %xmm3
> - pmovmskb %xmm3, %eax
> - movdqa 32(%rdx), %xmm3
> - movdqa %xmm3, %xmm4
> - pcmpeqb %xmm1, %xmm3
> - salq $16, %rax
> - pcmpeqb %xmm2, %xmm4
> - por %xmm4, %xmm3
> - pmovmskb %xmm3, %r9d
> - movdqa 48(%rdx), %xmm3
> - pcmpeqb %xmm3, %xmm2
> - salq $32, %r9
> - pcmpeqb %xmm3, %xmm0
> - orq %r9, %rax
> - orq %r8, %rax
> - por %xmm2, %xmm0
> - pmovmskb %xmm0, %ecx
> - salq $48, %rcx
> - orq %rcx, %rax
> - movl %edi, %ecx
> - subb %dl, %cl
> - shrq %cl, %rax
> - testq %rax, %rax
> - jne L(return)
> - jmp L(loop_start)
> -
> -END (strchr)
> -
> -#ifndef AS_STRCHRNUL
> +#define STRCHR strchr
> +#include "multiarch/strchr-sse2.S"
> weak_alias (strchr, index)
> libc_hidden_builtin_def (strchr)
> -#endif
> diff --git a/sysdeps/x86_64/strchrnul.S b/sysdeps/x86_64/strchrnul.S
> index ec2e652e25..508e42db26 100644
> --- a/sysdeps/x86_64/strchrnul.S
> +++ b/sysdeps/x86_64/strchrnul.S
> @@ -18,10 +18,7 @@
> License along with the GNU C Library; if not, see
> <https://www.gnu.org/licenses/>. */
>
> -#include <sysdep.h>
> -
> -#define strchr __strchrnul
> -#define AS_STRCHRNUL
> -#include "strchr.S"
> +#define STRCHR __strchrnul
> +#include "multiarch/strchrnul-sse2.S"
>
> weak_alias (__strchrnul, strchrnul)
> --
> 2.34.1
>
LGTM.
Thanks.
--
H.J.
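The L(loop64) kernel above rests on a small SSE2 identity: for bytes x and c, min(x ^ c, x) is zero exactly when x == c or x == 0, so a single compare against zero catches both the search character and the terminating NUL in one pass. A minimal C sketch of that check, assuming SSE2 intrinsics from <emmintrin.h>; the helper name and the one-chunk framing are illustrative, not part of the patch:

#include <emmintrin.h>  /* SSE2 intrinsics */

/* Return a 16-bit mask with bit i set iff byte i of the aligned 16-byte
   chunk at P equals the search byte or is NUL.  BC is the search byte
   broadcast to all lanes, as the patch builds it in xmm1.  */
static inline int
chunk_match_or_nul (const void *p, __m128i bc)
{
  __m128i x = _mm_load_si128 ((const __m128i *) p);
  __m128i t = _mm_xor_si128 (x, bc);             /* 0 in lanes where x == c */
  t = _mm_min_epu8 (t, x);                       /* 0 where x == c or x == 0 */
  t = _mm_cmpeq_epi8 (t, _mm_setzero_si128 ());
  return _mm_movemask_epi8 (t);
}

A nonzero mask corresponds to the testl/jne exit from L(loop64); bsf on the assembled mask then picks the first hit, exactly as L(return) does.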
* Re: [PATCH v1] x86: Move strrchr SSE2 implementation to multiarch/strrchr-sse2.S
2022-07-12 19:29 ` [PATCH v1] x86: Move strrchr SSE2 implementation to multiarch/strrchr-sse2.S Noah Goldstein
@ 2022-07-12 22:28 ` H.J. Lu
0 siblings, 0 replies; 21+ messages in thread
From: H.J. Lu @ 2022-07-12 22:28 UTC (permalink / raw)
To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell
On Tue, Jul 12, 2022 at 12:29 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> This commit doesn't affect libc.so.6; it's just housekeeping to prepare
> for adding explicit ISA level support.
>
> Tested build on x86_64 and x86_32 with/without multiarch.
> ---
> sysdeps/x86_64/multiarch/strrchr-sse2.S | 358 ++++++++++++++++++++++-
> sysdeps/x86_64/multiarch/wcsrchr-sse2.S | 10 +-
> sysdeps/x86_64/strrchr.S | 364 +-----------------------
> sysdeps/x86_64/wcsrchr.S | 11 +-
> 4 files changed, 366 insertions(+), 377 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/strrchr-sse2.S b/sysdeps/x86_64/multiarch/strrchr-sse2.S
> index 866396e947..6ee7a5e33a 100644
> --- a/sysdeps/x86_64/multiarch/strrchr-sse2.S
> +++ b/sysdeps/x86_64/multiarch/strrchr-sse2.S
> @@ -17,12 +17,358 @@
> <https://www.gnu.org/licenses/>. */
>
> #if IS_IN (libc)
> -# define STRRCHR __strrchr_sse2
> +# ifndef STRRCHR
> +# define STRRCHR __strrchr_sse2
> +# endif
> +#endif
> +
> +#include <sysdep.h>
> +
> +#ifdef USE_AS_WCSRCHR
> +# define PCMPEQ pcmpeqd
> +# define CHAR_SIZE 4
> +# define PMINU pminud
> +#else
> +# define PCMPEQ pcmpeqb
> +# define CHAR_SIZE 1
> +# define PMINU pminub
> +#endif
> +
> +#define PAGE_SIZE 4096
> +#define VEC_SIZE 16
> +
> + .text
> +ENTRY(STRRCHR)
> + movd %esi, %xmm0
> + movq %rdi, %rax
> + andl $(PAGE_SIZE - 1), %eax
> +#ifndef USE_AS_WCSRCHR
> + punpcklbw %xmm0, %xmm0
> + punpcklwd %xmm0, %xmm0
> +#endif
> + pshufd $0, %xmm0, %xmm0
> + cmpl $(PAGE_SIZE - VEC_SIZE), %eax
> + ja L(cross_page)
> +
> +L(cross_page_continue):
> + movups (%rdi), %xmm1
> + pxor %xmm2, %xmm2
> + PCMPEQ %xmm1, %xmm2
> + pmovmskb %xmm2, %ecx
> + testl %ecx, %ecx
> + jz L(aligned_more)
> +
> + PCMPEQ %xmm0, %xmm1
> + pmovmskb %xmm1, %eax
> + leal -1(%rcx), %edx
> + xorl %edx, %ecx
> + andl %ecx, %eax
> + jz L(ret0)
> + bsrl %eax, %eax
> + addq %rdi, %rax
> + /* We are off by 3 for wcsrchr if the search CHAR is non-zero; if
> + the search CHAR is zero we are correct. Either way `andq
> + $-CHAR_SIZE, %rax` gets the correct result. */
> +#ifdef USE_AS_WCSRCHR
> + andq $-CHAR_SIZE, %rax
> +#endif
> +L(ret0):
> + ret
> +
> + /* Returns for first vec x1/x2 have hard coded backward search
> + paths for earlier matches. */
> + .p2align 4
> +L(first_vec_x0_test):
> + PCMPEQ %xmm0, %xmm1
> + pmovmskb %xmm1, %eax
> + testl %eax, %eax
> + jz L(ret0)
> + bsrl %eax, %eax
> + addq %r8, %rax
> +#ifdef USE_AS_WCSRCHR
> + andq $-CHAR_SIZE, %rax
> +#endif
> + ret
> +
> + .p2align 4
> +L(first_vec_x1):
> + PCMPEQ %xmm0, %xmm2
> + pmovmskb %xmm2, %eax
> + leal -1(%rcx), %edx
> + xorl %edx, %ecx
> + andl %ecx, %eax
> + jz L(first_vec_x0_test)
> + bsrl %eax, %eax
> + leaq (VEC_SIZE)(%rdi, %rax), %rax
> +#ifdef USE_AS_WCSRCHR
> + andq $-CHAR_SIZE, %rax
> +#endif
> + ret
> +
> + .p2align 4
> +L(first_vec_x1_test):
> + PCMPEQ %xmm0, %xmm2
> + pmovmskb %xmm2, %eax
> + testl %eax, %eax
> + jz L(first_vec_x0_test)
> + bsrl %eax, %eax
> + leaq (VEC_SIZE)(%rdi, %rax), %rax
> +#ifdef USE_AS_WCSRCHR
> + andq $-CHAR_SIZE, %rax
> +#endif
> + ret
> +
> + .p2align 4
> +L(first_vec_x2):
> + PCMPEQ %xmm0, %xmm3
> + pmovmskb %xmm3, %eax
> + leal -1(%rcx), %edx
> + xorl %edx, %ecx
> + andl %ecx, %eax
> + jz L(first_vec_x1_test)
> + bsrl %eax, %eax
> + leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
> +#ifdef USE_AS_WCSRCHR
> + andq $-CHAR_SIZE, %rax
> +#endif
> + ret
> +
> + .p2align 4
> +L(aligned_more):
> + /* Save original pointer if match was in VEC 0. */
> + movq %rdi, %r8
> + andq $-VEC_SIZE, %rdi
> +
> + movaps VEC_SIZE(%rdi), %xmm2
> + pxor %xmm3, %xmm3
> + PCMPEQ %xmm2, %xmm3
> + pmovmskb %xmm3, %ecx
> + testl %ecx, %ecx
> + jnz L(first_vec_x1)
> +
> + movaps (VEC_SIZE * 2)(%rdi), %xmm3
> + pxor %xmm4, %xmm4
> + PCMPEQ %xmm3, %xmm4
> + pmovmskb %xmm4, %ecx
> + testl %ecx, %ecx
> + jnz L(first_vec_x2)
> +
> + addq $VEC_SIZE, %rdi
> + /* Save pointer again before realigning. */
> + movq %rdi, %rsi
> + andq $-(VEC_SIZE * 2), %rdi
> + .p2align 4
> +L(first_loop):
> + /* Do 2x VEC at a time. */
> + movaps (VEC_SIZE * 2)(%rdi), %xmm4
> + movaps (VEC_SIZE * 3)(%rdi), %xmm5
> + /* Since SSE2 has no pminud, wcsrchr needs separate logic for
> + detecting zero. Note that if this is found to be a bottleneck
> + it may be worth adding an SSE4.1 wcsrchr implementation. */
> +#ifdef USE_AS_WCSRCHR
> + movaps %xmm5, %xmm6
> + pxor %xmm8, %xmm8
> +
> + PCMPEQ %xmm8, %xmm5
> + PCMPEQ %xmm4, %xmm8
> + por %xmm5, %xmm8
> +#else
> + movaps %xmm5, %xmm6
> + PMINU %xmm4, %xmm5
> +#endif
> +
> + movaps %xmm4, %xmm9
> + PCMPEQ %xmm0, %xmm4
> + PCMPEQ %xmm0, %xmm6
> + movaps %xmm6, %xmm7
> + por %xmm4, %xmm6
> +#ifndef USE_AS_WCSRCHR
> + pxor %xmm8, %xmm8
> + PCMPEQ %xmm5, %xmm8
> +#endif
> + pmovmskb %xmm8, %ecx
> + pmovmskb %xmm6, %eax
>
> -# undef weak_alias
> -# define weak_alias(strrchr, rindex)
> -# undef libc_hidden_builtin_def
> -# define libc_hidden_builtin_def(strrchr)
> + addq $(VEC_SIZE * 2), %rdi
> + /* Use `addl` 1) so we can undo it with `subl` and 2) it can
> + macro-fuse with `jz`. */
> + addl %ecx, %eax
> + jz L(first_loop)
> +
> + /* Check if there is zero match. */
> + testl %ecx, %ecx
> + jz L(second_loop_match)
> +
> + /* Check if there was a match in last iteration. */
> + subl %ecx, %eax
> + jnz L(new_match)
> +
> +L(first_loop_old_match):
> + PCMPEQ %xmm0, %xmm2
> + PCMPEQ %xmm0, %xmm3
> + pmovmskb %xmm2, %ecx
> + pmovmskb %xmm3, %eax
> + addl %eax, %ecx
> + jz L(first_vec_x0_test)
> + /* NB: We could move this shift to before the branch and save a
> + bit of code size / performance on the fall through. The
> + branch leads to the null case which generally seems hotter
> + than char in first 3x VEC. */
> + sall $16, %eax
> + orl %ecx, %eax
> +
> + bsrl %eax, %eax
> + addq %rsi, %rax
> +#ifdef USE_AS_WCSRCHR
> + andq $-CHAR_SIZE, %rax
> +#endif
> + ret
> +
> + .p2align 4
> +L(new_match):
> + pxor %xmm6, %xmm6
> + PCMPEQ %xmm9, %xmm6
> + pmovmskb %xmm6, %eax
> + sall $16, %ecx
> + orl %eax, %ecx
> +
> + /* We can't reuse either of the old comparisons: since we mask
> + off zeros after the first zero (instead of using the full
> + comparison) we can't guarantee no interference between a match
> + after the end of the string and a valid match. */
> + pmovmskb %xmm4, %eax
> + pmovmskb %xmm7, %edx
> + sall $16, %edx
> + orl %edx, %eax
> +
> + leal -1(%ecx), %edx
> + xorl %edx, %ecx
> + andl %ecx, %eax
> + jz L(first_loop_old_match)
> + bsrl %eax, %eax
> + addq %rdi, %rax
> +#ifdef USE_AS_WCSRCHR
> + andq $-CHAR_SIZE, %rax
> +#endif
> + ret
> +
> + /* Save minimum state for getting most recent match. We can
> + throw out all previous work. */
> + .p2align 4
> +L(second_loop_match):
> + movq %rdi, %rsi
> + movaps %xmm4, %xmm2
> + movaps %xmm7, %xmm3
> +
> + .p2align 4
> +L(second_loop):
> + movaps (VEC_SIZE * 2)(%rdi), %xmm4
> + movaps (VEC_SIZE * 3)(%rdi), %xmm5
> + /* Since SSE2 has no pminud, wcsrchr needs separate logic for
> + detecting zero. Note that if this is found to be a bottleneck
> + it may be worth adding an SSE4.1 wcsrchr implementation. */
> +#ifdef USE_AS_WCSRCHR
> + movaps %xmm5, %xmm6
> + pxor %xmm8, %xmm8
> +
> + PCMPEQ %xmm8, %xmm5
> + PCMPEQ %xmm4, %xmm8
> + por %xmm5, %xmm8
> +#else
> + movaps %xmm5, %xmm6
> + PMINU %xmm4, %xmm5
> +#endif
> +
> + movaps %xmm4, %xmm9
> + PCMPEQ %xmm0, %xmm4
> + PCMPEQ %xmm0, %xmm6
> + movaps %xmm6, %xmm7
> + por %xmm4, %xmm6
> +#ifndef USE_AS_WCSRCHR
> + pxor %xmm8, %xmm8
> + PCMPEQ %xmm5, %xmm8
> #endif
>
> -#include "../strrchr.S"
> + pmovmskb %xmm8, %ecx
> + pmovmskb %xmm6, %eax
> +
> + addq $(VEC_SIZE * 2), %rdi
> + /* Either null term or new occurrence of CHAR. */
> + addl %ecx, %eax
> + jz L(second_loop)
> +
> + /* No null term so it must be a new occurrence of CHAR. */
> + testl %ecx, %ecx
> + jz L(second_loop_match)
> +
> +
> + subl %ecx, %eax
> + jnz L(second_loop_new_match)
> +
> +L(second_loop_old_match):
> + pmovmskb %xmm2, %ecx
> + pmovmskb %xmm3, %eax
> + sall $16, %eax
> + orl %ecx, %eax
> + bsrl %eax, %eax
> + addq %rsi, %rax
> +#ifdef USE_AS_WCSRCHR
> + andq $-CHAR_SIZE, %rax
> +#endif
> + ret
> +
> + .p2align 4
> +L(second_loop_new_match):
> + pxor %xmm6, %xmm6
> + PCMPEQ %xmm9, %xmm6
> + pmovmskb %xmm6, %eax
> + sall $16, %ecx
> + orl %eax, %ecx
> +
> + /* We can't reuse either of the old comparisons: since we mask
> + off zeros after the first zero (instead of using the full
> + comparison) we can't guarantee no interference between a match
> + after the end of the string and a valid match. */
> + pmovmskb %xmm4, %eax
> + pmovmskb %xmm7, %edx
> + sall $16, %edx
> + orl %edx, %eax
> +
> + leal -1(%ecx), %edx
> + xorl %edx, %ecx
> + andl %ecx, %eax
> + jz L(second_loop_old_match)
> + bsrl %eax, %eax
> + addq %rdi, %rax
> +#ifdef USE_AS_WCSRCHR
> + andq $-CHAR_SIZE, %rax
> +#endif
> + ret
> +
> + .p2align 4,, 4
> +L(cross_page):
> + movq %rdi, %rsi
> + andq $-VEC_SIZE, %rsi
> + movaps (%rsi), %xmm1
> + pxor %xmm2, %xmm2
> + PCMPEQ %xmm1, %xmm2
> + pmovmskb %xmm2, %edx
> + movl %edi, %ecx
> + andl $(VEC_SIZE - 1), %ecx
> + sarl %cl, %edx
> + jz L(cross_page_continue)
> + PCMPEQ %xmm0, %xmm1
> + pmovmskb %xmm1, %eax
> + sarl %cl, %eax
> + leal -1(%rdx), %ecx
> + xorl %edx, %ecx
> + andl %ecx, %eax
> + jz L(ret1)
> + bsrl %eax, %eax
> + addq %rdi, %rax
> +#ifdef USE_AS_WCSRCHR
> + andq $-CHAR_SIZE, %rax
> +#endif
> +L(ret1):
> + ret
> +END(STRRCHR)
> diff --git a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
> index 69d2f3cdb1..d9259720f8 100644
> --- a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
> +++ b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
> @@ -17,6 +17,12 @@
> <https://www.gnu.org/licenses/>. */
>
> #if IS_IN (libc)
> -# define STRRCHR __wcsrchr_sse2
> +# ifndef STRRCHR
> +# define STRRCHR __wcsrchr_sse2
> +# endif
> #endif
> -#include "../wcsrchr.S"
> +
> +#define USE_AS_WCSRCHR 1
> +#define NO_PMINU 1
> +
> +#include "strrchr-sse2.S"
> diff --git a/sysdeps/x86_64/strrchr.S b/sysdeps/x86_64/strrchr.S
> index 4d7ba4ceb2..f39da60454 100644
> --- a/sysdeps/x86_64/strrchr.S
> +++ b/sysdeps/x86_64/strrchr.S
> @@ -16,363 +16,7 @@
> License along with the GNU C Library; if not, see
> <https://www.gnu.org/licenses/>. */
>
> -
> -#include <sysdep.h>
> -
> -#ifndef STRRCHR
> -# define STRRCHR strrchr
> -#endif
> -
> -#ifdef USE_AS_WCSRCHR
> -# define PCMPEQ pcmpeqd
> -# define CHAR_SIZE 4
> -# define PMINU pminud
> -#else
> -# define PCMPEQ pcmpeqb
> -# define CHAR_SIZE 1
> -# define PMINU pminub
> -#endif
> -
> -#define PAGE_SIZE 4096
> -#define VEC_SIZE 16
> -
> - .text
> -ENTRY(STRRCHR)
> - movd %esi, %xmm0
> - movq %rdi, %rax
> - andl $(PAGE_SIZE - 1), %eax
> -#ifndef USE_AS_WCSRCHR
> - punpcklbw %xmm0, %xmm0
> - punpcklwd %xmm0, %xmm0
> -#endif
> - pshufd $0, %xmm0, %xmm0
> - cmpl $(PAGE_SIZE - VEC_SIZE), %eax
> - ja L(cross_page)
> -
> -L(cross_page_continue):
> - movups (%rdi), %xmm1
> - pxor %xmm2, %xmm2
> - PCMPEQ %xmm1, %xmm2
> - pmovmskb %xmm2, %ecx
> - testl %ecx, %ecx
> - jz L(aligned_more)
> -
> - PCMPEQ %xmm0, %xmm1
> - pmovmskb %xmm1, %eax
> - leal -1(%rcx), %edx
> - xorl %edx, %ecx
> - andl %ecx, %eax
> - jz L(ret0)
> - bsrl %eax, %eax
> - addq %rdi, %rax
> - /* We are off by 3 for wcsrchr if search CHAR is non-zero. If
> - search CHAR is zero we are correct. Either way `andq
> - -CHAR_SIZE, %rax` gets the correct result. */
> -#ifdef USE_AS_WCSRCHR
> - andq $-CHAR_SIZE, %rax
> -#endif
> -L(ret0):
> - ret
> -
> - /* Returns for first vec x1/x2 have hard coded backward search
> - path for earlier matches. */
> - .p2align 4
> -L(first_vec_x0_test):
> - PCMPEQ %xmm0, %xmm1
> - pmovmskb %xmm1, %eax
> - testl %eax, %eax
> - jz L(ret0)
> - bsrl %eax, %eax
> - addq %r8, %rax
> -#ifdef USE_AS_WCSRCHR
> - andq $-CHAR_SIZE, %rax
> -#endif
> - ret
> -
> - .p2align 4
> -L(first_vec_x1):
> - PCMPEQ %xmm0, %xmm2
> - pmovmskb %xmm2, %eax
> - leal -1(%rcx), %edx
> - xorl %edx, %ecx
> - andl %ecx, %eax
> - jz L(first_vec_x0_test)
> - bsrl %eax, %eax
> - leaq (VEC_SIZE)(%rdi, %rax), %rax
> -#ifdef USE_AS_WCSRCHR
> - andq $-CHAR_SIZE, %rax
> -#endif
> - ret
> -
> - .p2align 4
> -L(first_vec_x1_test):
> - PCMPEQ %xmm0, %xmm2
> - pmovmskb %xmm2, %eax
> - testl %eax, %eax
> - jz L(first_vec_x0_test)
> - bsrl %eax, %eax
> - leaq (VEC_SIZE)(%rdi, %rax), %rax
> -#ifdef USE_AS_WCSRCHR
> - andq $-CHAR_SIZE, %rax
> -#endif
> - ret
> -
> - .p2align 4
> -L(first_vec_x2):
> - PCMPEQ %xmm0, %xmm3
> - pmovmskb %xmm3, %eax
> - leal -1(%rcx), %edx
> - xorl %edx, %ecx
> - andl %ecx, %eax
> - jz L(first_vec_x1_test)
> - bsrl %eax, %eax
> - leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
> -#ifdef USE_AS_WCSRCHR
> - andq $-CHAR_SIZE, %rax
> -#endif
> - ret
> -
> - .p2align 4
> -L(aligned_more):
> - /* Save original pointer if match was in VEC 0. */
> - movq %rdi, %r8
> - andq $-VEC_SIZE, %rdi
> -
> - movaps VEC_SIZE(%rdi), %xmm2
> - pxor %xmm3, %xmm3
> - PCMPEQ %xmm2, %xmm3
> - pmovmskb %xmm3, %ecx
> - testl %ecx, %ecx
> - jnz L(first_vec_x1)
> -
> - movaps (VEC_SIZE * 2)(%rdi), %xmm3
> - pxor %xmm4, %xmm4
> - PCMPEQ %xmm3, %xmm4
> - pmovmskb %xmm4, %ecx
> - testl %ecx, %ecx
> - jnz L(first_vec_x2)
> -
> - addq $VEC_SIZE, %rdi
> - /* Save pointer again before realigning. */
> - movq %rdi, %rsi
> - andq $-(VEC_SIZE * 2), %rdi
> - .p2align 4
> -L(first_loop):
> - /* Do 2x VEC at a time. */
> - movaps (VEC_SIZE * 2)(%rdi), %xmm4
> - movaps (VEC_SIZE * 3)(%rdi), %xmm5
> - /* Since SSE2 no pminud so wcsrchr needs seperate logic for
> - detecting zero. Note if this is found to be a bottleneck it
> - may be worth adding an SSE4.1 wcsrchr implementation. */
> -#ifdef USE_AS_WCSRCHR
> - movaps %xmm5, %xmm6
> - pxor %xmm8, %xmm8
> -
> - PCMPEQ %xmm8, %xmm5
> - PCMPEQ %xmm4, %xmm8
> - por %xmm5, %xmm8
> -#else
> - movaps %xmm5, %xmm6
> - PMINU %xmm4, %xmm5
> -#endif
> -
> - movaps %xmm4, %xmm9
> - PCMPEQ %xmm0, %xmm4
> - PCMPEQ %xmm0, %xmm6
> - movaps %xmm6, %xmm7
> - por %xmm4, %xmm6
> -#ifndef USE_AS_WCSRCHR
> - pxor %xmm8, %xmm8
> - PCMPEQ %xmm5, %xmm8
> -#endif
> - pmovmskb %xmm8, %ecx
> - pmovmskb %xmm6, %eax
> -
> - addq $(VEC_SIZE * 2), %rdi
> - /* Use `addl` 1) so we can undo it with `subl` and 2) it can
> - macro-fuse with `jz`. */
> - addl %ecx, %eax
> - jz L(first_loop)
> -
> - /* Check if there is zero match. */
> - testl %ecx, %ecx
> - jz L(second_loop_match)
> -
> - /* Check if there was a match in last iteration. */
> - subl %ecx, %eax
> - jnz L(new_match)
> -
> -L(first_loop_old_match):
> - PCMPEQ %xmm0, %xmm2
> - PCMPEQ %xmm0, %xmm3
> - pmovmskb %xmm2, %ecx
> - pmovmskb %xmm3, %eax
> - addl %eax, %ecx
> - jz L(first_vec_x0_test)
> - /* NB: We could move this shift to before the branch and save a
> - bit of code size / performance on the fall through. The
> - branch leads to the null case which generally seems hotter
> - than char in first 3x VEC. */
> - sall $16, %eax
> - orl %ecx, %eax
> -
> - bsrl %eax, %eax
> - addq %rsi, %rax
> -#ifdef USE_AS_WCSRCHR
> - andq $-CHAR_SIZE, %rax
> -#endif
> - ret
> -
> - .p2align 4
> -L(new_match):
> - pxor %xmm6, %xmm6
> - PCMPEQ %xmm9, %xmm6
> - pmovmskb %xmm6, %eax
> - sall $16, %ecx
> - orl %eax, %ecx
> -
> - /* We can't reuse either of the old comparisons as since we mask
> - of zeros after first zero (instead of using the full
> - comparison) we can't gurantee no interference between match
> - after end of string and valid match. */
> - pmovmskb %xmm4, %eax
> - pmovmskb %xmm7, %edx
> - sall $16, %edx
> - orl %edx, %eax
> -
> - leal -1(%ecx), %edx
> - xorl %edx, %ecx
> - andl %ecx, %eax
> - jz L(first_loop_old_match)
> - bsrl %eax, %eax
> - addq %rdi, %rax
> -#ifdef USE_AS_WCSRCHR
> - andq $-CHAR_SIZE, %rax
> -#endif
> - ret
> -
> - /* Save minimum state for getting most recent match. We can
> - throw out all previous work. */
> - .p2align 4
> -L(second_loop_match):
> - movq %rdi, %rsi
> - movaps %xmm4, %xmm2
> - movaps %xmm7, %xmm3
> -
> - .p2align 4
> -L(second_loop):
> - movaps (VEC_SIZE * 2)(%rdi), %xmm4
> - movaps (VEC_SIZE * 3)(%rdi), %xmm5
> - /* Since SSE2 no pminud so wcsrchr needs seperate logic for
> - detecting zero. Note if this is found to be a bottleneck it
> - may be worth adding an SSE4.1 wcsrchr implementation. */
> -#ifdef USE_AS_WCSRCHR
> - movaps %xmm5, %xmm6
> - pxor %xmm8, %xmm8
> -
> - PCMPEQ %xmm8, %xmm5
> - PCMPEQ %xmm4, %xmm8
> - por %xmm5, %xmm8
> -#else
> - movaps %xmm5, %xmm6
> - PMINU %xmm4, %xmm5
> -#endif
> -
> - movaps %xmm4, %xmm9
> - PCMPEQ %xmm0, %xmm4
> - PCMPEQ %xmm0, %xmm6
> - movaps %xmm6, %xmm7
> - por %xmm4, %xmm6
> -#ifndef USE_AS_WCSRCHR
> - pxor %xmm8, %xmm8
> - PCMPEQ %xmm5, %xmm8
> -#endif
> -
> - pmovmskb %xmm8, %ecx
> - pmovmskb %xmm6, %eax
> -
> - addq $(VEC_SIZE * 2), %rdi
> - /* Either null term or new occurence of CHAR. */
> - addl %ecx, %eax
> - jz L(second_loop)
> -
> - /* No null term so much be new occurence of CHAR. */
> - testl %ecx, %ecx
> - jz L(second_loop_match)
> -
> -
> - subl %ecx, %eax
> - jnz L(second_loop_new_match)
> -
> -L(second_loop_old_match):
> - pmovmskb %xmm2, %ecx
> - pmovmskb %xmm3, %eax
> - sall $16, %eax
> - orl %ecx, %eax
> - bsrl %eax, %eax
> - addq %rsi, %rax
> -#ifdef USE_AS_WCSRCHR
> - andq $-CHAR_SIZE, %rax
> -#endif
> - ret
> -
> - .p2align 4
> -L(second_loop_new_match):
> - pxor %xmm6, %xmm6
> - PCMPEQ %xmm9, %xmm6
> - pmovmskb %xmm6, %eax
> - sall $16, %ecx
> - orl %eax, %ecx
> -
> - /* We can't reuse either of the old comparisons as since we mask
> - of zeros after first zero (instead of using the full
> - comparison) we can't gurantee no interference between match
> - after end of string and valid match. */
> - pmovmskb %xmm4, %eax
> - pmovmskb %xmm7, %edx
> - sall $16, %edx
> - orl %edx, %eax
> -
> - leal -1(%ecx), %edx
> - xorl %edx, %ecx
> - andl %ecx, %eax
> - jz L(second_loop_old_match)
> - bsrl %eax, %eax
> - addq %rdi, %rax
> -#ifdef USE_AS_WCSRCHR
> - andq $-CHAR_SIZE, %rax
> -#endif
> - ret
> -
> - .p2align 4,, 4
> -L(cross_page):
> - movq %rdi, %rsi
> - andq $-VEC_SIZE, %rsi
> - movaps (%rsi), %xmm1
> - pxor %xmm2, %xmm2
> - PCMPEQ %xmm1, %xmm2
> - pmovmskb %xmm2, %edx
> - movl %edi, %ecx
> - andl $(VEC_SIZE - 1), %ecx
> - sarl %cl, %edx
> - jz L(cross_page_continue)
> - PCMPEQ %xmm0, %xmm1
> - pmovmskb %xmm1, %eax
> - sarl %cl, %eax
> - leal -1(%rdx), %ecx
> - xorl %edx, %ecx
> - andl %ecx, %eax
> - jz L(ret1)
> - bsrl %eax, %eax
> - addq %rdi, %rax
> -#ifdef USE_AS_WCSRCHR
> - andq $-CHAR_SIZE, %rax
> -#endif
> -L(ret1):
> - ret
> -END(STRRCHR)
> -
> -#ifndef USE_AS_WCSRCHR
> - weak_alias (STRRCHR, rindex)
> - libc_hidden_builtin_def (STRRCHR)
> -#endif
> +#define STRRCHR strrchr
> +#include "multiarch/strrchr-sse2.S"
> +weak_alias (strrchr, rindex)
> +libc_hidden_builtin_def (strrchr)
> diff --git a/sysdeps/x86_64/wcsrchr.S b/sysdeps/x86_64/wcsrchr.S
> index 2b80efc5ef..1d4b1eb21c 100644
> --- a/sysdeps/x86_64/wcsrchr.S
> +++ b/sysdeps/x86_64/wcsrchr.S
> @@ -16,12 +16,5 @@
> License along with the GNU C Library; if not, see
> <https://www.gnu.org/licenses/>. */
>
> -
> -#define USE_AS_WCSRCHR 1
> -#define NO_PMINU 1
> -
> -#ifndef STRRCHR
> -# define STRRCHR wcsrchr
> -#endif
> -
> -#include "../strrchr.S"
> +#define STRRCHR wcsrchr
> +#include "multiarch/wcsrchr-sse2.S"
> --
> 2.34.1
>
LGTM.
Thanks.
--
H.J.
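A recurring three-instruction pattern in this patch, `leal -1(%rcx), %edx; xorl %edx, %ecx; andl %ecx, %eax`, filters out CHAR matches that lie past the null terminator: zmask ^ (zmask - 1) is all ones up to and including the lowest set bit of zmask. A hedged C rendering of the idea; the helper name and the -1 fallback convention are illustrative:

#include <stdint.h>

/* CMASK has one bit per CHAR match in a chunk, ZMASK one bit per NUL
   byte (ZMASK is known non-zero at the call sites in the patch).  Keep
   only matches at or before the first NUL, then take the highest
   survivor, i.e. the rightmost occurrence in the chunk.  */
static inline int
last_match_before_nul (uint32_t cmask, uint32_t zmask)
{
  uint32_t upto_nul = zmask ^ (zmask - 1); /* ones through the first NUL bit */
  uint32_t valid = cmask & upto_nul;
  if (valid == 0)
    return -1;                    /* defer to matches kept from earlier vectors */
  return 31 - (int) __builtin_clz (valid); /* bsrl equivalent */
}

For wcsrchr the patch additionally rounds the byte-granular bsrl result down with `andq $-CHAR_SIZE, %rax`, since pmovmskb reports positions per byte rather than per wide character.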
* Re: [PATCH v1] x86: Move memrchr SSE2 implementation to multiarch/memrchr-sse2.S
2022-07-12 19:29 ` [PATCH v1] x86: Move memrchr SSE2 implementation to multiarch/memrchr-sse2.S Noah Goldstein
@ 2022-07-12 22:58 ` H.J. Lu
0 siblings, 0 replies; 21+ messages in thread
From: H.J. Lu @ 2022-07-12 22:58 UTC (permalink / raw)
To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell
On Tue, Jul 12, 2022 at 12:29 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> This commit doesn't affect libc.so.6; it's just housekeeping to prepare
> for adding explicit ISA level support.
>
> Tested build on x86_64 and x86_32 with/without multiarch.
> ---
> sysdeps/x86_64/memrchr.S | 332 +----------------------
> sysdeps/x86_64/multiarch/memrchr-sse2.S | 336 +++++++++++++++++++++++-
> 2 files changed, 334 insertions(+), 334 deletions(-)
>
> diff --git a/sysdeps/x86_64/memrchr.S b/sysdeps/x86_64/memrchr.S
> index b0dffd2ae2..385e2c5668 100644
> --- a/sysdeps/x86_64/memrchr.S
> +++ b/sysdeps/x86_64/memrchr.S
> @@ -17,334 +17,6 @@
> License along with the GNU C Library; if not, see
> <https://www.gnu.org/licenses/>. */
>
> -#include <sysdep.h>
> -#define VEC_SIZE 16
> -#define PAGE_SIZE 4096
> -
> - .text
> -ENTRY_P2ALIGN(__memrchr, 6)
> -#ifdef __ILP32__
> - /* Clear upper bits. */
> - mov %RDX_LP, %RDX_LP
> -#endif
> - movd %esi, %xmm0
> -
> - /* Get end pointer. */
> - leaq (%rdx, %rdi), %rcx
> -
> - punpcklbw %xmm0, %xmm0
> - punpcklwd %xmm0, %xmm0
> - pshufd $0, %xmm0, %xmm0
> -
> - /* Check if we can load 1x VEC without cross a page. */
> - testl $(PAGE_SIZE - VEC_SIZE), %ecx
> - jz L(page_cross)
> -
> - /* NB: This load happens regardless of whether rdx (len) is zero. Since
> - it doesn't cross a page and the standard gurantees any pointer have
> - at least one-valid byte this load must be safe. For the entire
> - history of the x86 memrchr implementation this has been possible so
> - no code "should" be relying on a zero-length check before this load.
> - The zero-length check is moved to the page cross case because it is
> - 1) pretty cold and including it pushes the hot case len <= VEC_SIZE
> - into 2-cache lines. */
> - movups -(VEC_SIZE)(%rcx), %xmm1
> - pcmpeqb %xmm0, %xmm1
> - pmovmskb %xmm1, %eax
> -
> - subq $VEC_SIZE, %rdx
> - ja L(more_1x_vec)
> -L(ret_vec_x0_test):
> - /* Zero-flag set if eax (src) is zero. Destination unchanged if src is
> - zero. */
> - bsrl %eax, %eax
> - jz L(ret_0)
> - /* Check if the CHAR match is in bounds. Need to truly zero `eax` here
> - if out of bounds. */
> - addl %edx, %eax
> - jl L(zero_0)
> - /* Since we subtracted VEC_SIZE from rdx earlier we can just add to base
> - ptr. */
> - addq %rdi, %rax
> -L(ret_0):
> - ret
> -
> - .p2align 4,, 5
> -L(ret_vec_x0):
> - bsrl %eax, %eax
> - leaq -(VEC_SIZE)(%rcx, %rax), %rax
> - ret
> -
> - .p2align 4,, 2
> -L(zero_0):
> - xorl %eax, %eax
> - ret
> -
> -
> - .p2align 4,, 8
> -L(more_1x_vec):
> - testl %eax, %eax
> - jnz L(ret_vec_x0)
> -
> - /* Align rcx (pointer to string). */
> - decq %rcx
> - andq $-VEC_SIZE, %rcx
> -
> - movq %rcx, %rdx
> - /* NB: We could consistenyl save 1-byte in this pattern with `movaps
> - %xmm0, %xmm1; pcmpeq IMM8(r), %xmm1; ...`. The reason against it is
> - it adds more frontend uops (even if the moves can be eliminated) and
> - some percentage of the time actual backend uops. */
> - movaps -(VEC_SIZE)(%rcx), %xmm1
> - pcmpeqb %xmm0, %xmm1
> - subq %rdi, %rdx
> - pmovmskb %xmm1, %eax
> -
> - cmpq $(VEC_SIZE * 2), %rdx
> - ja L(more_2x_vec)
> -L(last_2x_vec):
> - subl $VEC_SIZE, %edx
> - jbe L(ret_vec_x0_test)
> -
> - testl %eax, %eax
> - jnz L(ret_vec_x0)
> -
> - movaps -(VEC_SIZE * 2)(%rcx), %xmm1
> - pcmpeqb %xmm0, %xmm1
> - pmovmskb %xmm1, %eax
> -
> - subl $VEC_SIZE, %edx
> - bsrl %eax, %eax
> - jz L(ret_1)
> - addl %edx, %eax
> - jl L(zero_0)
> - addq %rdi, %rax
> -L(ret_1):
> - ret
> -
> - /* Don't align. Otherwise lose 2-byte encoding in jump to L(page_cross)
> - causes the hot pause (length <= VEC_SIZE) to span multiple cache
> - lines. Naturally aligned % 16 to 8-bytes. */
> -L(page_cross):
> - /* Zero length check. */
> - testq %rdx, %rdx
> - jz L(zero_0)
> -
> - leaq -1(%rcx), %r8
> - andq $-(VEC_SIZE), %r8
> -
> - movaps (%r8), %xmm1
> - pcmpeqb %xmm0, %xmm1
> - pmovmskb %xmm1, %esi
> - /* Shift out negative alignment (because we are starting from endptr and
> - working backwards). */
> - negl %ecx
> - /* 32-bit shift but VEC_SIZE=16 so need to mask the shift count
> - explicitly. */
> - andl $(VEC_SIZE - 1), %ecx
> - shl %cl, %esi
> - movzwl %si, %eax
> - leaq (%rdi, %rdx), %rcx
> - cmpq %rdi, %r8
> - ja L(more_1x_vec)
> - subl $VEC_SIZE, %edx
> - bsrl %eax, %eax
> - jz L(ret_2)
> - addl %edx, %eax
> - jl L(zero_1)
> - addq %rdi, %rax
> -L(ret_2):
> - ret
> -
> - /* Fits in aliging bytes. */
> -L(zero_1):
> - xorl %eax, %eax
> - ret
> -
> - .p2align 4,, 5
> -L(ret_vec_x1):
> - bsrl %eax, %eax
> - leaq -(VEC_SIZE * 2)(%rcx, %rax), %rax
> - ret
> -
> - .p2align 4,, 8
> -L(more_2x_vec):
> - testl %eax, %eax
> - jnz L(ret_vec_x0)
> -
> - movaps -(VEC_SIZE * 2)(%rcx), %xmm1
> - pcmpeqb %xmm0, %xmm1
> - pmovmskb %xmm1, %eax
> - testl %eax, %eax
> - jnz L(ret_vec_x1)
> -
> -
> - movaps -(VEC_SIZE * 3)(%rcx), %xmm1
> - pcmpeqb %xmm0, %xmm1
> - pmovmskb %xmm1, %eax
> -
> - subq $(VEC_SIZE * 4), %rdx
> - ja L(more_4x_vec)
> -
> - addl $(VEC_SIZE), %edx
> - jle L(ret_vec_x2_test)
> -
> -L(last_vec):
> - testl %eax, %eax
> - jnz L(ret_vec_x2)
> -
> - movaps -(VEC_SIZE * 4)(%rcx), %xmm1
> - pcmpeqb %xmm0, %xmm1
> - pmovmskb %xmm1, %eax
> -
> - subl $(VEC_SIZE), %edx
> - bsrl %eax, %eax
> - jz L(ret_3)
> - addl %edx, %eax
> - jl L(zero_2)
> - addq %rdi, %rax
> -L(ret_3):
> - ret
> -
> - .p2align 4,, 6
> -L(ret_vec_x2_test):
> - bsrl %eax, %eax
> - jz L(zero_2)
> - addl %edx, %eax
> - jl L(zero_2)
> - addq %rdi, %rax
> - ret
> -
> -L(zero_2):
> - xorl %eax, %eax
> - ret
> -
> -
> - .p2align 4,, 5
> -L(ret_vec_x2):
> - bsrl %eax, %eax
> - leaq -(VEC_SIZE * 3)(%rcx, %rax), %rax
> - ret
> -
> - .p2align 4,, 5
> -L(ret_vec_x3):
> - bsrl %eax, %eax
> - leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax
> - ret
> -
> - .p2align 4,, 8
> -L(more_4x_vec):
> - testl %eax, %eax
> - jnz L(ret_vec_x2)
> -
> - movaps -(VEC_SIZE * 4)(%rcx), %xmm1
> - pcmpeqb %xmm0, %xmm1
> - pmovmskb %xmm1, %eax
> -
> - testl %eax, %eax
> - jnz L(ret_vec_x3)
> -
> - addq $-(VEC_SIZE * 4), %rcx
> - cmpq $(VEC_SIZE * 4), %rdx
> - jbe L(last_4x_vec)
> -
> - /* Offset everything by 4x VEC_SIZE here to save a few bytes at the end
> - keeping the code from spilling to the next cache line. */
> - addq $(VEC_SIZE * 4 - 1), %rcx
> - andq $-(VEC_SIZE * 4), %rcx
> - leaq (VEC_SIZE * 4)(%rdi), %rdx
> - andq $-(VEC_SIZE * 4), %rdx
> -
> - .p2align 4,, 11
> -L(loop_4x_vec):
> - movaps (VEC_SIZE * -1)(%rcx), %xmm1
> - movaps (VEC_SIZE * -2)(%rcx), %xmm2
> - movaps (VEC_SIZE * -3)(%rcx), %xmm3
> - movaps (VEC_SIZE * -4)(%rcx), %xmm4
> - pcmpeqb %xmm0, %xmm1
> - pcmpeqb %xmm0, %xmm2
> - pcmpeqb %xmm0, %xmm3
> - pcmpeqb %xmm0, %xmm4
> -
> - por %xmm1, %xmm2
> - por %xmm3, %xmm4
> - por %xmm2, %xmm4
> -
> - pmovmskb %xmm4, %esi
> - testl %esi, %esi
> - jnz L(loop_end)
> -
> - addq $-(VEC_SIZE * 4), %rcx
> - cmpq %rdx, %rcx
> - jne L(loop_4x_vec)
> -
> - subl %edi, %edx
> -
> - /* Ends up being 1-byte nop. */
> - .p2align 4,, 2
> -L(last_4x_vec):
> - movaps -(VEC_SIZE)(%rcx), %xmm1
> - pcmpeqb %xmm0, %xmm1
> - pmovmskb %xmm1, %eax
> -
> - cmpl $(VEC_SIZE * 2), %edx
> - jbe L(last_2x_vec)
> -
> - testl %eax, %eax
> - jnz L(ret_vec_x0)
> -
> -
> - movaps -(VEC_SIZE * 2)(%rcx), %xmm1
> - pcmpeqb %xmm0, %xmm1
> - pmovmskb %xmm1, %eax
> -
> - testl %eax, %eax
> - jnz L(ret_vec_end)
> -
> - movaps -(VEC_SIZE * 3)(%rcx), %xmm1
> - pcmpeqb %xmm0, %xmm1
> - pmovmskb %xmm1, %eax
> -
> - subl $(VEC_SIZE * 3), %edx
> - ja L(last_vec)
> - bsrl %eax, %eax
> - jz L(ret_4)
> - addl %edx, %eax
> - jl L(zero_3)
> - addq %rdi, %rax
> -L(ret_4):
> - ret
> -
> - /* Ends up being 1-byte nop. */
> - .p2align 4,, 3
> -L(loop_end):
> - pmovmskb %xmm1, %eax
> - sall $16, %eax
> - jnz L(ret_vec_end)
> -
> - pmovmskb %xmm2, %eax
> - testl %eax, %eax
> - jnz L(ret_vec_end)
> -
> - pmovmskb %xmm3, %eax
> - /* Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3)
> - then it won't affect the result in esi (VEC4). If ecx is non-zero
> - then CHAR in VEC3 and bsrq will use that position. */
> - sall $16, %eax
> - orl %esi, %eax
> - bsrl %eax, %eax
> - leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax
> - ret
> -
> -L(ret_vec_end):
> - bsrl %eax, %eax
> - leaq (VEC_SIZE * -2)(%rax, %rcx), %rax
> - ret
> - /* Use in L(last_4x_vec). In the same cache line. This is just a spare
> - aligning bytes. */
> -L(zero_3):
> - xorl %eax, %eax
> - ret
> - /* 2-bytes from next cache line. */
> -END(__memrchr)
> +#define MEMRCHR __memrchr
> +#include "multiarch/memrchr-sse2.S"
> weak_alias (__memrchr, memrchr)
> diff --git a/sysdeps/x86_64/multiarch/memrchr-sse2.S b/sysdeps/x86_64/multiarch/memrchr-sse2.S
> index b04202e171..d92a4022dc 100644
> --- a/sysdeps/x86_64/multiarch/memrchr-sse2.S
> +++ b/sysdeps/x86_64/multiarch/memrchr-sse2.S
> @@ -17,10 +17,338 @@
> <https://www.gnu.org/licenses/>. */
>
> #if IS_IN (libc)
> -# define __memrchr __memrchr_sse2
> +# ifndef MEMRCHR
> +# define MEMRCHR __memrchr_sse2
> +# endif
> +#endif
> +
> +#include <sysdep.h>
> +#define VEC_SIZE 16
> +#define PAGE_SIZE 4096
>
> -# undef weak_alias
> -# define weak_alias(__memrchr, memrchr)
> + .text
> +ENTRY_P2ALIGN(MEMRCHR, 6)
> +#ifdef __ILP32__
> + /* Clear upper bits. */
> + mov %RDX_LP, %RDX_LP
> #endif
> + movd %esi, %xmm0
> +
> + /* Get end pointer. */
> + leaq (%rdx, %rdi), %rcx
> +
> + punpcklbw %xmm0, %xmm0
> + punpcklwd %xmm0, %xmm0
> + pshufd $0, %xmm0, %xmm0
> +
> + /* Check if we can load 1x VEC without cross a page. */
> + testl $(PAGE_SIZE - VEC_SIZE), %ecx
> + jz L(page_cross)
> +
> + /* NB: This load happens regardless of whether rdx (len) is zero. Since
> + it doesn't cross a page and the standard guarantees any pointer has
> + at least one valid byte this load must be safe. For the entire
> + history of the x86 memrchr implementation this has been possible so
> + no code "should" be relying on a zero-length check before this load.
> + The zero-length check is moved to the page cross case because it is
> + 1) pretty cold and including it pushes the hot case len <= VEC_SIZE
> + across 2 cache lines. */
> + movups -(VEC_SIZE)(%rcx), %xmm1
> + pcmpeqb %xmm0, %xmm1
> + pmovmskb %xmm1, %eax
> +
> + subq $VEC_SIZE, %rdx
> + ja L(more_1x_vec)
> +L(ret_vec_x0_test):
> + /* Zero-flag set if eax (src) is zero. Destination unchanged if src is
> + zero. */
> + bsrl %eax, %eax
> + jz L(ret_0)
> + /* Check if the CHAR match is in bounds. Need to truly zero `eax` here
> + if out of bounds. */
> + addl %edx, %eax
> + jl L(zero_0)
> + /* Since we subtracted VEC_SIZE from rdx earlier we can just add to base
> + ptr. */
> + addq %rdi, %rax
> +L(ret_0):
> + ret
> +
> + .p2align 4,, 5
> +L(ret_vec_x0):
> + bsrl %eax, %eax
> + leaq -(VEC_SIZE)(%rcx, %rax), %rax
> + ret
> +
> + .p2align 4,, 2
> +L(zero_0):
> + xorl %eax, %eax
> + ret
> +
> +
> + .p2align 4,, 8
> +L(more_1x_vec):
> + testl %eax, %eax
> + jnz L(ret_vec_x0)
> +
> + /* Align rcx (pointer to string). */
> + decq %rcx
> + andq $-VEC_SIZE, %rcx
> +
> + movq %rcx, %rdx
> + /* NB: We could consistently save 1 byte in this pattern with `movaps
> + %xmm0, %xmm1; pcmpeq IMM8(r), %xmm1; ...`. The reason against it is
> + it adds more frontend uops (even if the moves can be eliminated) and
> + some percentage of the time actual backend uops. */
> + movaps -(VEC_SIZE)(%rcx), %xmm1
> + pcmpeqb %xmm0, %xmm1
> + subq %rdi, %rdx
> + pmovmskb %xmm1, %eax
> +
> + cmpq $(VEC_SIZE * 2), %rdx
> + ja L(more_2x_vec)
> +L(last_2x_vec):
> + subl $VEC_SIZE, %edx
> + jbe L(ret_vec_x0_test)
> +
> + testl %eax, %eax
> + jnz L(ret_vec_x0)
> +
> + movaps -(VEC_SIZE * 2)(%rcx), %xmm1
> + pcmpeqb %xmm0, %xmm1
> + pmovmskb %xmm1, %eax
> +
> + subl $VEC_SIZE, %edx
> + bsrl %eax, %eax
> + jz L(ret_1)
> + addl %edx, %eax
> + jl L(zero_0)
> + addq %rdi, %rax
> +L(ret_1):
> + ret
> +
> + /* Don't align. Otherwise losing the 2-byte encoding of the jump to
> + L(page_cross) causes the hot path (length <= VEC_SIZE) to span multiple
> + cache lines. Naturally aligned % 16 to 8 bytes. */
> +L(page_cross):
> + /* Zero length check. */
> + testq %rdx, %rdx
> + jz L(zero_0)
> +
> + leaq -1(%rcx), %r8
> + andq $-(VEC_SIZE), %r8
> +
> + movaps (%r8), %xmm1
> + pcmpeqb %xmm0, %xmm1
> + pmovmskb %xmm1, %esi
> + /* Shift out negative alignment (because we are starting from endptr and
> + working backwards). */
> + negl %ecx
> + /* 32-bit shift but VEC_SIZE=16 so need to mask the shift count
> + explicitly. */
> + andl $(VEC_SIZE - 1), %ecx
> + shl %cl, %esi
> + movzwl %si, %eax
> + leaq (%rdi, %rdx), %rcx
> + cmpq %rdi, %r8
> + ja L(more_1x_vec)
> + subl $VEC_SIZE, %edx
> + bsrl %eax, %eax
> + jz L(ret_2)
> + addl %edx, %eax
> + jl L(zero_1)
> + addq %rdi, %rax
> +L(ret_2):
> + ret
> +
> + /* Fits in aligning bytes. */
> +L(zero_1):
> + xorl %eax, %eax
> + ret
> +
> + .p2align 4,, 5
> +L(ret_vec_x1):
> + bsrl %eax, %eax
> + leaq -(VEC_SIZE * 2)(%rcx, %rax), %rax
> + ret
> +
> + .p2align 4,, 8
> +L(more_2x_vec):
> + testl %eax, %eax
> + jnz L(ret_vec_x0)
> +
> + movaps -(VEC_SIZE * 2)(%rcx), %xmm1
> + pcmpeqb %xmm0, %xmm1
> + pmovmskb %xmm1, %eax
> + testl %eax, %eax
> + jnz L(ret_vec_x1)
> +
> +
> + movaps -(VEC_SIZE * 3)(%rcx), %xmm1
> + pcmpeqb %xmm0, %xmm1
> + pmovmskb %xmm1, %eax
> +
> + subq $(VEC_SIZE * 4), %rdx
> + ja L(more_4x_vec)
> +
> + addl $(VEC_SIZE), %edx
> + jle L(ret_vec_x2_test)
> +
> +L(last_vec):
> + testl %eax, %eax
> + jnz L(ret_vec_x2)
> +
> + movaps -(VEC_SIZE * 4)(%rcx), %xmm1
> + pcmpeqb %xmm0, %xmm1
> + pmovmskb %xmm1, %eax
> +
> + subl $(VEC_SIZE), %edx
> + bsrl %eax, %eax
> + jz L(ret_3)
> + addl %edx, %eax
> + jl L(zero_2)
> + addq %rdi, %rax
> +L(ret_3):
> + ret
> +
> + .p2align 4,, 6
> +L(ret_vec_x2_test):
> + bsrl %eax, %eax
> + jz L(zero_2)
> + addl %edx, %eax
> + jl L(zero_2)
> + addq %rdi, %rax
> + ret
> +
> +L(zero_2):
> + xorl %eax, %eax
> + ret
> +
> +
> + .p2align 4,, 5
> +L(ret_vec_x2):
> + bsrl %eax, %eax
> + leaq -(VEC_SIZE * 3)(%rcx, %rax), %rax
> + ret
> +
> + .p2align 4,, 5
> +L(ret_vec_x3):
> + bsrl %eax, %eax
> + leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax
> + ret
> +
> + .p2align 4,, 8
> +L(more_4x_vec):
> + testl %eax, %eax
> + jnz L(ret_vec_x2)
> +
> + movaps -(VEC_SIZE * 4)(%rcx), %xmm1
> + pcmpeqb %xmm0, %xmm1
> + pmovmskb %xmm1, %eax
> +
> + testl %eax, %eax
> + jnz L(ret_vec_x3)
> +
> + addq $-(VEC_SIZE * 4), %rcx
> + cmpq $(VEC_SIZE * 4), %rdx
> + jbe L(last_4x_vec)
> +
> + /* Offset everything by 4x VEC_SIZE here to save a few bytes at the end
> + keeping the code from spilling to the next cache line. */
> + addq $(VEC_SIZE * 4 - 1), %rcx
> + andq $-(VEC_SIZE * 4), %rcx
> + leaq (VEC_SIZE * 4)(%rdi), %rdx
> + andq $-(VEC_SIZE * 4), %rdx
> +
> + .p2align 4,, 11
> +L(loop_4x_vec):
> + movaps (VEC_SIZE * -1)(%rcx), %xmm1
> + movaps (VEC_SIZE * -2)(%rcx), %xmm2
> + movaps (VEC_SIZE * -3)(%rcx), %xmm3
> + movaps (VEC_SIZE * -4)(%rcx), %xmm4
> + pcmpeqb %xmm0, %xmm1
> + pcmpeqb %xmm0, %xmm2
> + pcmpeqb %xmm0, %xmm3
> + pcmpeqb %xmm0, %xmm4
> +
> + por %xmm1, %xmm2
> + por %xmm3, %xmm4
> + por %xmm2, %xmm4
> +
> + pmovmskb %xmm4, %esi
> + testl %esi, %esi
> + jnz L(loop_end)
> +
> + addq $-(VEC_SIZE * 4), %rcx
> + cmpq %rdx, %rcx
> + jne L(loop_4x_vec)
> +
> + subl %edi, %edx
> +
> + /* Ends up being 1-byte nop. */
> + .p2align 4,, 2
> +L(last_4x_vec):
> + movaps -(VEC_SIZE)(%rcx), %xmm1
> + pcmpeqb %xmm0, %xmm1
> + pmovmskb %xmm1, %eax
> +
> + cmpl $(VEC_SIZE * 2), %edx
> + jbe L(last_2x_vec)
> +
> + testl %eax, %eax
> + jnz L(ret_vec_x0)
> +
> +
> + movaps -(VEC_SIZE * 2)(%rcx), %xmm1
> + pcmpeqb %xmm0, %xmm1
> + pmovmskb %xmm1, %eax
> +
> + testl %eax, %eax
> + jnz L(ret_vec_end)
> +
> + movaps -(VEC_SIZE * 3)(%rcx), %xmm1
> + pcmpeqb %xmm0, %xmm1
> + pmovmskb %xmm1, %eax
> +
> + subl $(VEC_SIZE * 3), %edx
> + ja L(last_vec)
> + bsrl %eax, %eax
> + jz L(ret_4)
> + addl %edx, %eax
> + jl L(zero_3)
> + addq %rdi, %rax
> +L(ret_4):
> + ret
> +
> + /* Ends up being 1-byte nop. */
> + .p2align 4,, 3
> +L(loop_end):
> + pmovmskb %xmm1, %eax
> + sall $16, %eax
> + jnz L(ret_vec_end)
> +
> + pmovmskb %xmm2, %eax
> + testl %eax, %eax
> + jnz L(ret_vec_end)
> +
> + pmovmskb %xmm3, %eax
> + /* Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3)
> + then it won't affect the result in esi (VEC4). If ecx is non-zero
> + then CHAR is in VEC3 and bsrq will use that position. */
> + sall $16, %eax
> + orl %esi, %eax
> + bsrl %eax, %eax
> + leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax
> + ret
>
> -#include "../memrchr.S"
> +L(ret_vec_end):
> + bsrl %eax, %eax
> + leaq (VEC_SIZE * -2)(%rax, %rcx), %rax
> + ret
> + /* Use in L(last_4x_vec). In the same cache line. These are just spare
> + aligning bytes. */
> +L(zero_3):
> + xorl %eax, %eax
> + ret
> + /* 2-bytes from next cache line. */
> +END(MEMRCHR)
> --
> 2.34.1
>
LGTM.
Thanks.
--
H.J.
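The entry sequence above hinges on a page-granularity safety argument: a VEC_SIZE load cannot fault as long as it stays within one page, and the single `testl $(PAGE_SIZE - VEC_SIZE), %ecx` on the end pointer decides whether the 16-byte load ending there is page-safe. A small C sketch of that predicate, assuming 4 KiB pages; the helper name is illustrative:

#include <stdint.h>

#define PAGE_SIZE 4096
#define VEC_SIZE  16

/* A VEC_SIZE load ending at END stays inside one page whenever END's
   page offset is at least VEC_SIZE; the masked test checks exactly that
   (offsets below VEC_SIZE clear every masked bit and take the slow
   path).  */
static inline int
tail_load_is_page_safe (const char *end)
{
  return ((uintptr_t) end & (PAGE_SIZE - VEC_SIZE)) != 0;
}

When the test is zero the code branches to L(page_cross), which redoes the head with an aligned load plus a shift and also hosts the cold zero-length check, keeping the hot len <= VEC_SIZE path compact.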
* Re: [PATCH v1] x86: Move strcpy SSE2 implementation to multiarch/strcpy-sse2.S
2022-07-12 19:29 ` [PATCH v1] x86: Move strcpy SSE2 implementation to multiarch/strcpy-sse2.S Noah Goldstein
@ 2022-07-12 23:23 ` H.J. Lu
0 siblings, 0 replies; 21+ messages in thread
From: H.J. Lu @ 2022-07-12 23:23 UTC (permalink / raw)
To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell
On Tue, Jul 12, 2022 at 12:29 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> This commit doesn't affect libc.so.6; it's just housekeeping to prepare
> for adding explicit ISA level support.
>
> Tested build on x86_64 and x86_32 with/without multiarch.
> ---
> sysdeps/x86_64/multiarch/rtld-stpcpy.S | 18 ++++
> sysdeps/x86_64/multiarch/stpcpy-sse2.S | 15 +--
> sysdeps/x86_64/multiarch/strcpy-sse2.S | 137 ++++++++++++++++++++++--
> sysdeps/x86_64/stpcpy.S | 3 +-
> sysdeps/x86_64/strcpy.S | 138 +------------------------
> 5 files changed, 156 insertions(+), 155 deletions(-)
> create mode 100644 sysdeps/x86_64/multiarch/rtld-stpcpy.S
>
> diff --git a/sysdeps/x86_64/multiarch/rtld-stpcpy.S b/sysdeps/x86_64/multiarch/rtld-stpcpy.S
> new file mode 100644
> index 0000000000..914141f07f
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/rtld-stpcpy.S
> @@ -0,0 +1,18 @@
> +/* Copyright (C) 2022 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#include "../stpcpy.S"
> diff --git a/sysdeps/x86_64/multiarch/stpcpy-sse2.S b/sysdeps/x86_64/multiarch/stpcpy-sse2.S
> index 078504a44e..ea9f973af3 100644
> --- a/sysdeps/x86_64/multiarch/stpcpy-sse2.S
> +++ b/sysdeps/x86_64/multiarch/stpcpy-sse2.S
> @@ -17,17 +17,10 @@
> <https://www.gnu.org/licenses/>. */
>
> #if IS_IN (libc)
> -
> -# include <sysdep.h>
> -# define __stpcpy __stpcpy_sse2
> -
> -# undef weak_alias
> -# define weak_alias(ignored1, ignored2)
> -# undef libc_hidden_def
> -# define libc_hidden_def(__stpcpy)
> -# undef libc_hidden_builtin_def
> -# define libc_hidden_builtin_def(stpcpy)
> +# ifndef STRCPY
> +# define STRCPY __stpcpy_sse2
> +# endif
> #endif
>
> #define USE_AS_STPCPY
> -#include <sysdeps/x86_64/stpcpy.S>
> +#include "strcpy-sse2.S"
> diff --git a/sysdeps/x86_64/multiarch/strcpy-sse2.S b/sysdeps/x86_64/multiarch/strcpy-sse2.S
> index f37967c441..8b5db8b13d 100644
> --- a/sysdeps/x86_64/multiarch/strcpy-sse2.S
> +++ b/sysdeps/x86_64/multiarch/strcpy-sse2.S
> @@ -17,12 +17,137 @@
> <https://www.gnu.org/licenses/>. */
>
> #if IS_IN (libc)
> +# ifndef STRCPY
> +# define STRCPY __strcpy_sse2
> +# endif
> +#endif
>
> -# include <sysdep.h>
> -# define strcpy __strcpy_sse2
> +#include <sysdep.h>
>
> -# undef libc_hidden_builtin_def
> -# define libc_hidden_builtin_def(strcpy)
> -#endif
> + .text
> +ENTRY (STRCPY)
> + movq %rsi, %rcx /* Source register. */
> + andl $7, %ecx /* mask alignment bits */
> + movq %rdi, %rdx /* Duplicate destination pointer. */
> +
> + jz 5f /* aligned => start loop */
> +
> + neg %ecx /* We need to align to 8 bytes. */
> + addl $8,%ecx
> + /* Search the first bytes directly. */
> +0:
> + movb (%rsi), %al /* Fetch a byte */
> + testb %al, %al /* Is it NUL? */
> + movb %al, (%rdx) /* Store it */
> + jz 4f /* If it was NUL, done! */
> + incq %rsi
> + incq %rdx
> + decl %ecx
> + jnz 0b
> +
> +5:
> + movq $0xfefefefefefefeff,%r8
> +
> + /* Now the source is aligned. Unfortunately we cannot force
> + both source and destination to be aligned, so ignore the
> + alignment of the destination. */
> + .p2align 4
> +1:
> + /* 1st unroll. */
> + movq (%rsi), %rax /* Read double word (8 bytes). */
> + addq $8, %rsi /* Adjust pointer for next word. */
> + movq %rax, %r9 /* Save a copy for NUL finding. */
> + addq %r8, %r9 /* add the magic value to the word. We get
> + carry bits reported for each byte which
> + is *not* 0 */
> + jnc 3f /* highest byte is NUL => return pointer */
> + xorq %rax, %r9 /* (word+magic)^word */
> + orq %r8, %r9 /* set all non-carry bits */
> + incq %r9 /* add 1: if one carry bit was *not* set
> + the addition will not result in 0. */
> +
> + jnz 3f /* found NUL => return pointer */
> +
> + movq %rax, (%rdx) /* Write value to destination. */
> + addq $8, %rdx /* Adjust pointer. */
> +
> + /* 2nd unroll. */
> + movq (%rsi), %rax /* Read double word (8 bytes). */
> + addq $8, %rsi /* Adjust pointer for next word. */
> + movq %rax, %r9 /* Save a copy for NUL finding. */
> + addq %r8, %r9 /* add the magic value to the word. We get
> + carry bits reported for each byte which
> + is *not* 0 */
> + jnc 3f /* highest byte is NUL => return pointer */
> + xorq %rax, %r9 /* (word+magic)^word */
> + orq %r8, %r9 /* set all non-carry bits */
> + incq %r9 /* add 1: if one carry bit was *not* set
> + the addition will not result in 0. */
> +
> + jnz 3f /* found NUL => return pointer */
>
> -#include <sysdeps/x86_64/strcpy.S>
> + movq %rax, (%rdx) /* Write value to destination. */
> + addq $8, %rdx /* Adjust pointer. */
> +
> + /* 3rd unroll. */
> + movq (%rsi), %rax /* Read double word (8 bytes). */
> + addq $8, %rsi /* Adjust pointer for next word. */
> + movq %rax, %r9 /* Save a copy for NUL finding. */
> + addq %r8, %r9 /* add the magic value to the word. We get
> + carry bits reported for each byte which
> + is *not* 0 */
> + jnc 3f /* highest byte is NUL => return pointer */
> + xorq %rax, %r9 /* (word+magic)^word */
> + orq %r8, %r9 /* set all non-carry bits */
> + incq %r9 /* add 1: if one carry bit was *not* set
> + the addition will not result in 0. */
> +
> + jnz 3f /* found NUL => return pointer */
> +
> + movq %rax, (%rdx) /* Write value to destination. */
> + addq $8, %rdx /* Adjust pointer. */
> +
> + /* 4th unroll. */
> + movq (%rsi), %rax /* Read double word (8 bytes). */
> + addq $8, %rsi /* Adjust pointer for next word. */
> + movq %rax, %r9 /* Save a copy for NUL finding. */
> + addq %r8, %r9 /* add the magic value to the word. We get
> + carry bits reported for each byte which
> + is *not* 0 */
> + jnc 3f /* highest byte is NUL => return pointer */
> + xorq %rax, %r9 /* (word+magic)^word */
> + orq %r8, %r9 /* set all non-carry bits */
> + incq %r9 /* add 1: if one carry bit was *not* set
> + the addition will not result in 0. */
> +
> + jnz 3f /* found NUL => return pointer */
> +
> + movq %rax, (%rdx) /* Write value to destination. */
> + addq $8, %rdx /* Adjust pointer. */
> + jmp 1b /* Next iteration. */
> +
> + /* Do the last few bytes. %rax contains the value to write.
> + The loop is unrolled twice. */
> + .p2align 4
> +3:
> + /* Note that stpcpy needs to return with the value of the NUL
> + byte. */
> + movb %al, (%rdx) /* 1st byte. */
> + testb %al, %al /* Is it NUL? */
> + jz 4f /* yes, finish. */
> + incq %rdx /* Increment destination. */
> + movb %ah, (%rdx) /* 2nd byte. */
> + testb %ah, %ah /* Is it NUL? */
> + jz 4f /* yes, finish. */
> + incq %rdx /* Increment destination. */
> + shrq $16, %rax /* Shift... */
> + jmp 3b /* and look at next two bytes in %rax. */
> +
> +4:
> +#ifdef USE_AS_STPCPY
> + movq %rdx, %rax /* Destination is return value. */
> +#else
> + movq %rdi, %rax /* Source is return value. */
> +#endif
> + retq
> +END (STRCPY)
> diff --git a/sysdeps/x86_64/stpcpy.S b/sysdeps/x86_64/stpcpy.S
> index ec23de1416..b097c203dd 100644
> --- a/sysdeps/x86_64/stpcpy.S
> +++ b/sysdeps/x86_64/stpcpy.S
> @@ -1,7 +1,6 @@
> -#define USE_AS_STPCPY
> #define STRCPY __stpcpy
>
> -#include <sysdeps/x86_64/strcpy.S>
> +#include "multiarch/stpcpy-sse2.S"
>
> weak_alias (__stpcpy, stpcpy)
> libc_hidden_def (__stpcpy)
> diff --git a/sysdeps/x86_64/strcpy.S b/sysdeps/x86_64/strcpy.S
> index 17e8073550..05f19e6e94 100644
> --- a/sysdeps/x86_64/strcpy.S
> +++ b/sysdeps/x86_64/strcpy.S
> @@ -16,140 +16,6 @@
> License along with the GNU C Library; if not, see
> <https://www.gnu.org/licenses/>. */
>
> -#include <sysdep.h>
> -#include "asm-syntax.h"
> -
> -#ifndef USE_AS_STPCPY
> -# define STRCPY strcpy
> -#endif
> -
> - .text
> -ENTRY (STRCPY)
> - movq %rsi, %rcx /* Source register. */
> - andl $7, %ecx /* mask alignment bits */
> - movq %rdi, %rdx /* Duplicate destination pointer. */
> -
> - jz 5f /* aligned => start loop */
> -
> - neg %ecx /* We need to align to 8 bytes. */
> - addl $8,%ecx
> - /* Search the first bytes directly. */
> -0:
> - movb (%rsi), %al /* Fetch a byte */
> - testb %al, %al /* Is it NUL? */
> - movb %al, (%rdx) /* Store it */
> - jz 4f /* If it was NUL, done! */
> - incq %rsi
> - incq %rdx
> - decl %ecx
> - jnz 0b
> -
> -5:
> - movq $0xfefefefefefefeff,%r8
> -
> - /* Now the sources is aligned. Unfortunatly we cannot force
> - to have both source and destination aligned, so ignore the
> - alignment of the destination. */
> - .p2align 4
> -1:
> - /* 1st unroll. */
> - movq (%rsi), %rax /* Read double word (8 bytes). */
> - addq $8, %rsi /* Adjust pointer for next word. */
> - movq %rax, %r9 /* Save a copy for NUL finding. */
> - addq %r8, %r9 /* add the magic value to the word. We get
> - carry bits reported for each byte which
> - is *not* 0 */
> - jnc 3f /* highest byte is NUL => return pointer */
> - xorq %rax, %r9 /* (word+magic)^word */
> - orq %r8, %r9 /* set all non-carry bits */
> - incq %r9 /* add 1: if one carry bit was *not* set
> - the addition will not result in 0. */
> -
> - jnz 3f /* found NUL => return pointer */
> -
> - movq %rax, (%rdx) /* Write value to destination. */
> - addq $8, %rdx /* Adjust pointer. */
> -
> - /* 2nd unroll. */
> - movq (%rsi), %rax /* Read double word (8 bytes). */
> - addq $8, %rsi /* Adjust pointer for next word. */
> - movq %rax, %r9 /* Save a copy for NUL finding. */
> - addq %r8, %r9 /* add the magic value to the word. We get
> - carry bits reported for each byte which
> - is *not* 0 */
> - jnc 3f /* highest byte is NUL => return pointer */
> - xorq %rax, %r9 /* (word+magic)^word */
> - orq %r8, %r9 /* set all non-carry bits */
> - incq %r9 /* add 1: if one carry bit was *not* set
> - the addition will not result in 0. */
> -
> - jnz 3f /* found NUL => return pointer */
> -
> - movq %rax, (%rdx) /* Write value to destination. */
> - addq $8, %rdx /* Adjust pointer. */
> -
> - /* 3rd unroll. */
> - movq (%rsi), %rax /* Read double word (8 bytes). */
> - addq $8, %rsi /* Adjust pointer for next word. */
> - movq %rax, %r9 /* Save a copy for NUL finding. */
> - addq %r8, %r9 /* add the magic value to the word. We get
> - carry bits reported for each byte which
> - is *not* 0 */
> - jnc 3f /* highest byte is NUL => return pointer */
> - xorq %rax, %r9 /* (word+magic)^word */
> - orq %r8, %r9 /* set all non-carry bits */
> - incq %r9 /* add 1: if one carry bit was *not* set
> - the addition will not result in 0. */
> -
> - jnz 3f /* found NUL => return pointer */
> -
> - movq %rax, (%rdx) /* Write value to destination. */
> - addq $8, %rdx /* Adjust pointer. */
> -
> - /* 4th unroll. */
> - movq (%rsi), %rax /* Read double word (8 bytes). */
> - addq $8, %rsi /* Adjust pointer for next word. */
> - movq %rax, %r9 /* Save a copy for NUL finding. */
> - addq %r8, %r9 /* add the magic value to the word. We get
> - carry bits reported for each byte which
> - is *not* 0 */
> - jnc 3f /* highest byte is NUL => return pointer */
> - xorq %rax, %r9 /* (word+magic)^word */
> - orq %r8, %r9 /* set all non-carry bits */
> - incq %r9 /* add 1: if one carry bit was *not* set
> - the addition will not result in 0. */
> -
> - jnz 3f /* found NUL => return pointer */
> -
> - movq %rax, (%rdx) /* Write value to destination. */
> - addq $8, %rdx /* Adjust pointer. */
> - jmp 1b /* Next iteration. */
> -
> - /* Do the last few bytes. %rax contains the value to write.
> - The loop is unrolled twice. */
> - .p2align 4
> -3:
> - /* Note that stpcpy needs to return with the value of the NUL
> - byte. */
> - movb %al, (%rdx) /* 1st byte. */
> - testb %al, %al /* Is it NUL. */
> - jz 4f /* yes, finish. */
> - incq %rdx /* Increment destination. */
> - movb %ah, (%rdx) /* 2nd byte. */
> - testb %ah, %ah /* Is it NUL?. */
> - jz 4f /* yes, finish. */
> - incq %rdx /* Increment destination. */
> - shrq $16, %rax /* Shift... */
> - jmp 3b /* and look at next two bytes in %rax. */
> -
> -4:
> -#ifdef USE_AS_STPCPY
> - movq %rdx, %rax /* Destination is return value. */
> -#else
> - movq %rdi, %rax /* Source is return value. */
> -#endif
> - retq
> -END (STRCPY)
> -#ifndef USE_AS_STPCPY
> +#define STRCPY strcpy
> +#include "multiarch/strcpy-sse2.S"
> libc_hidden_builtin_def (strcpy)
> -#endif
> --
> 2.34.1
>
LGTM.
Thanks.
--
H.J.
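The unrolled copy loop above is the classic word-at-a-time NUL scan: adding the magic constant 0xfefefefefefefeff to an 8-byte word produces a carry out of every byte lane that is non-zero, so a broken carry chain betrays a word containing a NUL. A C rendering of the addq/jnc/xorq/orq/incq sequence, assuming 64-bit words; the helper name is illustrative:

#include <stdint.h>

/* Non-zero iff WORD contains a NUL byte.  Mirrors the patch: if the add
   does not carry out of the top bit, some lane broke the carry chain,
   i.e. a NUL; otherwise the xor/or/inc step exposes any lower lane
   whose carry went missing.  */
static inline int
word_has_nul (uint64_t word)
{
  const uint64_t magic = 0xfefefefefefefeffULL;
  uint64_t sum = word + magic;
  if (sum >= word)              /* no carry out, the jnc path: NUL present */
    return 1;
  uint64_t bits = (sum ^ word) | magic; /* zero bits mark carry-less lanes */
  return bits + 1 != 0;         /* wraps to 0 only if every lane carried */
}

On a hit the assembly falls into label 3 and finishes byte by byte; that tail loop is also what lets stpcpy return the address of the copied NUL byte.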
* Re: [PATCH v1] x86: Move strlen SSE2 implementation to multiarch/strlen-sse2.S
2022-07-12 19:29 [PATCH v1] x86: Move strlen SSE2 implementation to multiarch/strlen-sse2.S Noah Goldstein
` (8 preceding siblings ...)
2022-07-12 19:29 ` [PATCH v1] x86: Add missing rtm tests for strcmp family Noah Goldstein
@ 2022-07-12 23:29 ` H.J. Lu
2022-07-13 4:06 ` Noah Goldstein
9 siblings, 1 reply; 21+ messages in thread
From: H.J. Lu @ 2022-07-12 23:29 UTC (permalink / raw)
To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell
On Tue, Jul 12, 2022 at 12:29 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> This commit doesn't affect libc.so.6; it's just housekeeping to prepare
> for adding explicit ISA level support.
>
> Tested build on x86_64 and x86_32 with/without multiarch.
> ---
> sysdeps/x86_64/multiarch/rtld-strlen.S | 18 ++
> sysdeps/x86_64/multiarch/rtld-strnlen.S | 18 ++
> sysdeps/x86_64/multiarch/strlen-sse2.S | 260 ++++++++++++++++++++-
> sysdeps/x86_64/multiarch/strlen-vec.S | 267 ----------------------
> sysdeps/x86_64/multiarch/strnlen-sse2.S | 12 +-
> sysdeps/x86_64/multiarch/wcslen-sse4_1.S | 4 +-
> sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S | 4 +-
> sysdeps/x86_64/strlen.S | 3 +-
> sysdeps/x86_64/strnlen.S | 6 +-
> 9 files changed, 306 insertions(+), 286 deletions(-)
> create mode 100644 sysdeps/x86_64/multiarch/rtld-strlen.S
> create mode 100644 sysdeps/x86_64/multiarch/rtld-strnlen.S
> delete mode 100644 sysdeps/x86_64/multiarch/strlen-vec.S
>
> diff --git a/sysdeps/x86_64/multiarch/rtld-strlen.S b/sysdeps/x86_64/multiarch/rtld-strlen.S
> new file mode 100644
> index 0000000000..609d26256e
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/rtld-strlen.S
> @@ -0,0 +1,18 @@
> +/* Copyright (C) 2022 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#include "../strlen.S"
> diff --git a/sysdeps/x86_64/multiarch/rtld-strnlen.S b/sysdeps/x86_64/multiarch/rtld-strnlen.S
> new file mode 100644
> index 0000000000..ef2d64abc2
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/rtld-strnlen.S
> @@ -0,0 +1,18 @@
> +/* Copyright (C) 2022 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#include "../strnlen.S"
> diff --git a/sysdeps/x86_64/multiarch/strlen-sse2.S b/sysdeps/x86_64/multiarch/strlen-sse2.S
> index 660b327ed2..5be72267d5 100644
> --- a/sysdeps/x86_64/multiarch/strlen-sse2.S
> +++ b/sysdeps/x86_64/multiarch/strlen-sse2.S
> @@ -16,8 +16,260 @@
> License along with the GNU C Library; if not, see
> <https://www.gnu.org/licenses/>. */
>
> -#if IS_IN (libc)
> -# define strlen __strlen_sse2
> -#endif
> +#if IS_IN (libc) || defined STRLEN
> +
> +# ifndef STRLEN
> +# define STRLEN __strlen_sse2
> +# endif
> +
> +
> +# include <sysdep.h>
> +
> +# ifdef AS_WCSLEN
> +# define PMINU pminud
> +# define PCMPEQ pcmpeqd
> +# define SHIFT_RETURN shrq $2, %rax
> +# else
> +# define PMINU pminub
> +# define PCMPEQ pcmpeqb
> +# define SHIFT_RETURN
> +# endif
> +
> +# ifndef SECTION
> +# define SECTION(p) p
> +# endif
> +
> +/* Long lived registers in strlen (s) and strnlen (s, n) are:
> +
> +   %xmm3 - zero
> +   %rdi  - s
> +   %r10  - (s + n) & ~(64 - 1)
> +   %r11  - s + n
> +*/
> +
> +
> + .section SECTION(.text),"ax",@progbits
> +ENTRY(STRLEN)
> +
> +/* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx. */
> +# define FIND_ZERO \
> + PCMPEQ (%rax), %xmm0; \
> + PCMPEQ 16(%rax), %xmm1; \
> + PCMPEQ 32(%rax), %xmm2; \
> + PCMPEQ 48(%rax), %xmm3; \
> + pmovmskb %xmm0, %esi; \
> + pmovmskb %xmm1, %edx; \
> + pmovmskb %xmm2, %r8d; \
> + pmovmskb %xmm3, %ecx; \
> + salq $16, %rdx; \
> + salq $16, %rcx; \
> + orq %rsi, %rdx; \
> + orq %r8, %rcx; \
> + salq $32, %rcx; \
> + orq %rcx, %rdx;
> +
> +# ifdef AS_STRNLEN
> +/* Do not read anything when n==0. */
> + test %RSI_LP, %RSI_LP
> + jne L(n_nonzero)
> + xor %rax, %rax
> + ret
> +L(n_nonzero):
> +# ifdef AS_WCSLEN
> +/* Check for overflow from maxlen * sizeof(wchar_t).  If it would
> +   overflow, the only way the program avoids undefined behavior is if
> +   a null terminator exists in valid memory, in which case wcslen
> +   suffices.  */
> + mov %RSI_LP, %R10_LP
> + sar $62, %R10_LP
> + jnz __wcslen_sse4_1
> + sal $2, %RSI_LP
> +# endif
> +
> +/* Initialize long lived registers. */
> + add %RDI_LP, %RSI_LP
> + mov %RSI_LP, %R10_LP
> + and $-64, %R10_LP
> + mov %RSI_LP, %R11_LP
> +# endif
> +
> + pxor %xmm0, %xmm0
> + pxor %xmm1, %xmm1
> + pxor %xmm2, %xmm2
> + pxor %xmm3, %xmm3
> + movq %rdi, %rax
> + movq %rdi, %rcx
> + andq $4095, %rcx
> +/* Offsets 4032-4047 will be aligned down to 4032 and thus fit in the page. */
> + cmpq $4047, %rcx
> +/* We cannot unify this branching as it would be ~6 cycles slower. */
> + ja L(cross_page)
> +
> +# ifdef AS_STRNLEN
> +/* Test if end is among first 64 bytes. */
> +# define STRNLEN_PROLOG \
> + mov %r11, %rsi; \
> + subq %rax, %rsi; \
> + andq $-64, %rax; \
> + testq $-64, %rsi; \
> + je L(strnlen_ret)
> +# else
> +# define STRNLEN_PROLOG andq $-64, %rax;
> +# endif
> +
> +/* Ignore bits in mask that come before start of string. */
> +# define PROLOG(lab) \
> + movq %rdi, %rcx; \
> + xorq %rax, %rcx; \
> + STRNLEN_PROLOG; \
> + sarq %cl, %rdx; \
> + test %rdx, %rdx; \
> + je L(lab); \
> + bsfq %rdx, %rax; \
> + SHIFT_RETURN; \
> + ret
> +
> +# ifdef AS_STRNLEN
> + andq $-16, %rax
> + FIND_ZERO
> +# else
> + /* Test first 16 bytes unaligned. */
> + movdqu (%rax), %xmm4
> + PCMPEQ %xmm0, %xmm4
> + pmovmskb %xmm4, %edx
> + test %edx, %edx
> + je L(next48_bytes)
> + bsf %edx, %eax /* If eax is zeroed 16bit bsf can be used. */
> + SHIFT_RETURN
> + ret
> +
> +L(next48_bytes):
> +/* Same as FIND_ZERO except we do not check first 16 bytes. */
> + andq $-16, %rax
> + PCMPEQ 16(%rax), %xmm1
> + PCMPEQ 32(%rax), %xmm2
> + PCMPEQ 48(%rax), %xmm3
> + pmovmskb %xmm1, %edx
> + pmovmskb %xmm2, %r8d
> + pmovmskb %xmm3, %ecx
> + salq $16, %rdx
> + salq $16, %rcx
> + orq %r8, %rcx
> + salq $32, %rcx
> + orq %rcx, %rdx
> +# endif
>
> -#include "strlen-vec.S"
> + /* When no zero byte is found xmm1-3 are zero so we do not have to
> + zero them. */
> + PROLOG(loop)
> +
> + .p2align 4
> +L(cross_page):
> + andq $-64, %rax
> + FIND_ZERO
> + PROLOG(loop_init)
> +
> +# ifdef AS_STRNLEN
> +/* We must do this check to correctly handle strnlen (s, -1). */
> +L(strnlen_ret):
> + bts %rsi, %rdx
> + sarq %cl, %rdx
> + test %rdx, %rdx
> + je L(loop_init)
> + bsfq %rdx, %rax
> + SHIFT_RETURN
> + ret
> +# endif
> + .p2align 4
> +L(loop_init):
> + pxor %xmm1, %xmm1
> + pxor %xmm2, %xmm2
> + pxor %xmm3, %xmm3
> +# ifdef AS_STRNLEN
> + .p2align 4
> +L(loop):
> +
> + addq $64, %rax
> + cmpq %rax, %r10
> + je L(exit_end)
> +
> + movdqa (%rax), %xmm0
> + PMINU 16(%rax), %xmm0
> + PMINU 32(%rax), %xmm0
> + PMINU 48(%rax), %xmm0
> + PCMPEQ %xmm3, %xmm0
> + pmovmskb %xmm0, %edx
> + testl %edx, %edx
> + jne L(exit)
> + jmp L(loop)
> +
> + .p2align 4
> +L(exit_end):
> + cmp %rax, %r11
> + je L(first) /* Do not read when end is at page boundary. */
> + pxor %xmm0, %xmm0
> + FIND_ZERO
> +
> +L(first):
> + bts %r11, %rdx
> + bsfq %rdx, %rdx
> + addq %rdx, %rax
> + subq %rdi, %rax
> + SHIFT_RETURN
> + ret
> +
> + .p2align 4
> +L(exit):
> + pxor %xmm0, %xmm0
> + FIND_ZERO
> +
> + bsfq %rdx, %rdx
> + addq %rdx, %rax
> + subq %rdi, %rax
> + SHIFT_RETURN
> + ret
> +
> +# else
> +
> + /* Main loop. Unrolled twice to improve L2 cache performance on core2. */
> + .p2align 4
> +L(loop):
> +
> + movdqa 64(%rax), %xmm0
> + PMINU 80(%rax), %xmm0
> + PMINU 96(%rax), %xmm0
> + PMINU 112(%rax), %xmm0
> + PCMPEQ %xmm3, %xmm0
> + pmovmskb %xmm0, %edx
> + testl %edx, %edx
> + jne L(exit64)
> +
> + subq $-128, %rax
> +
> + movdqa (%rax), %xmm0
> + PMINU 16(%rax), %xmm0
> + PMINU 32(%rax), %xmm0
> + PMINU 48(%rax), %xmm0
> + PCMPEQ %xmm3, %xmm0
> + pmovmskb %xmm0, %edx
> + testl %edx, %edx
> + jne L(exit0)
> + jmp L(loop)
> +
> + .p2align 4
> +L(exit64):
> + addq $64, %rax
> +L(exit0):
> + pxor %xmm0, %xmm0
> + FIND_ZERO
> +
> + bsfq %rdx, %rdx
> + addq %rdx, %rax
> + subq %rdi, %rax
> + SHIFT_RETURN
> + ret
> +
> +# endif
> +
> +END(STRLEN)
> +#endif
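
A note for readers tracing the FIND_ZERO macro in the file above: the four
PCMPEQ/pmovmskb pairs plus the shift-and-or ladder condense 64 bytes into a
single 64-bit mask with bit i set exactly when byte i is NUL, so one bsfq then
yields the NUL's offset.  A rough C-with-intrinsics sketch of the same idea
(a minimal sketch, assuming SSE2 and a pointer that is at least 16-byte
aligned, as in the aligned paths; find_zero_mask is an illustrative name, not
a glibc symbol):

    #include <emmintrin.h>
    #include <stdint.h>

    /* Bit i of the result is set iff p[i] == '\0', for 0 <= i < 64.  */
    static uint64_t find_zero_mask(const char *p)
    {
        const __m128i zero = _mm_setzero_si128();
        uint64_t m0 = (uint16_t) _mm_movemask_epi8(
            _mm_cmpeq_epi8(_mm_load_si128((const __m128i *) (p + 0)), zero));
        uint64_t m1 = (uint16_t) _mm_movemask_epi8(
            _mm_cmpeq_epi8(_mm_load_si128((const __m128i *) (p + 16)), zero));
        uint64_t m2 = (uint16_t) _mm_movemask_epi8(
            _mm_cmpeq_epi8(_mm_load_si128((const __m128i *) (p + 32)), zero));
        uint64_t m3 = (uint16_t) _mm_movemask_epi8(
            _mm_cmpeq_epi8(_mm_load_si128((const __m128i *) (p + 48)), zero));
        return m0 | (m1 << 16) | (m2 << 32) | (m3 << 48);
    }

The PROLOG macro then shifts this mask right by the distance between the
aligned base and the true string start, discarding any NUL bits that precede
the string before the final bit scan.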
> diff --git a/sysdeps/x86_64/multiarch/strlen-vec.S b/sysdeps/x86_64/multiarch/strlen-vec.S
> deleted file mode 100644
> index 874123d604..0000000000
> --- a/sysdeps/x86_64/multiarch/strlen-vec.S
> +++ /dev/null
> @@ -1,267 +0,0 @@
> -/* SSE2 version of strlen and SSE4.1 version of wcslen.
> - Copyright (C) 2012-2022 Free Software Foundation, Inc.
> - This file is part of the GNU C Library.
> -
> - The GNU C Library is free software; you can redistribute it and/or
> - modify it under the terms of the GNU Lesser General Public
> - License as published by the Free Software Foundation; either
> - version 2.1 of the License, or (at your option) any later version.
> -
> - The GNU C Library is distributed in the hope that it will be useful,
> - but WITHOUT ANY WARRANTY; without even the implied warranty of
> - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> - Lesser General Public License for more details.
> -
> - You should have received a copy of the GNU Lesser General Public
> - License along with the GNU C Library; if not, see
> - <https://www.gnu.org/licenses/>. */
> -
> -#include <sysdep.h>
> -
> -#ifdef AS_WCSLEN
> -# define PMINU pminud
> -# define PCMPEQ pcmpeqd
> -# define SHIFT_RETURN shrq $2, %rax
> -#else
> -# define PMINU pminub
> -# define PCMPEQ pcmpeqb
> -# define SHIFT_RETURN
> -#endif
> -
> -#ifndef SECTION
> -# define SECTION(p) p
> -#endif
> -
> -/* Long lived register in strlen(s), strnlen(s, n) are:
> -
> - %xmm3 - zero
> - %rdi - s
> - %r10 (s+n) & (~(64-1))
> - %r11 s+n
> -*/
> -
> -
> - .section SECTION(.text),"ax",@progbits
> -ENTRY(strlen)
> -
> -/* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx. */
> -#define FIND_ZERO \
> - PCMPEQ (%rax), %xmm0; \
> - PCMPEQ 16(%rax), %xmm1; \
> - PCMPEQ 32(%rax), %xmm2; \
> - PCMPEQ 48(%rax), %xmm3; \
> - pmovmskb %xmm0, %esi; \
> - pmovmskb %xmm1, %edx; \
> - pmovmskb %xmm2, %r8d; \
> - pmovmskb %xmm3, %ecx; \
> - salq $16, %rdx; \
> - salq $16, %rcx; \
> - orq %rsi, %rdx; \
> - orq %r8, %rcx; \
> - salq $32, %rcx; \
> - orq %rcx, %rdx;
> -
> -#ifdef AS_STRNLEN
> -/* Do not read anything when n==0. */
> - test %RSI_LP, %RSI_LP
> - jne L(n_nonzero)
> - xor %rax, %rax
> - ret
> -L(n_nonzero):
> -# ifdef AS_WCSLEN
> -/* Check for overflow from maxlen * sizeof(wchar_t). If it would
> - overflow the only way this program doesn't have undefined behavior
> - is if there is a null terminator in valid memory so wcslen will
> - suffice. */
> - mov %RSI_LP, %R10_LP
> - sar $62, %R10_LP
> - jnz __wcslen_sse4_1
> - sal $2, %RSI_LP
> -# endif
> -
> -/* Initialize long lived registers. */
> - add %RDI_LP, %RSI_LP
> - mov %RSI_LP, %R10_LP
> - and $-64, %R10_LP
> - mov %RSI_LP, %R11_LP
> -#endif
> -
> - pxor %xmm0, %xmm0
> - pxor %xmm1, %xmm1
> - pxor %xmm2, %xmm2
> - pxor %xmm3, %xmm3
> - movq %rdi, %rax
> - movq %rdi, %rcx
> - andq $4095, %rcx
> -/* Offsets 4032-4047 will be aligned into 4032 thus fit into page. */
> - cmpq $4047, %rcx
> -/* We cannot unify this branching as it would be ~6 cycles slower. */
> - ja L(cross_page)
> -
> -#ifdef AS_STRNLEN
> -/* Test if end is among first 64 bytes. */
> -# define STRNLEN_PROLOG \
> - mov %r11, %rsi; \
> - subq %rax, %rsi; \
> - andq $-64, %rax; \
> - testq $-64, %rsi; \
> - je L(strnlen_ret)
> -#else
> -# define STRNLEN_PROLOG andq $-64, %rax;
> -#endif
> -
> -/* Ignore bits in mask that come before start of string. */
> -#define PROLOG(lab) \
> - movq %rdi, %rcx; \
> - xorq %rax, %rcx; \
> - STRNLEN_PROLOG; \
> - sarq %cl, %rdx; \
> - test %rdx, %rdx; \
> - je L(lab); \
> - bsfq %rdx, %rax; \
> - SHIFT_RETURN; \
> - ret
> -
> -#ifdef AS_STRNLEN
> - andq $-16, %rax
> - FIND_ZERO
> -#else
> - /* Test first 16 bytes unaligned. */
> - movdqu (%rax), %xmm4
> - PCMPEQ %xmm0, %xmm4
> - pmovmskb %xmm4, %edx
> - test %edx, %edx
> - je L(next48_bytes)
> - bsf %edx, %eax /* If eax is zeroed 16bit bsf can be used. */
> - SHIFT_RETURN
> - ret
> -
> -L(next48_bytes):
> -/* Same as FIND_ZERO except we do not check first 16 bytes. */
> - andq $-16, %rax
> - PCMPEQ 16(%rax), %xmm1
> - PCMPEQ 32(%rax), %xmm2
> - PCMPEQ 48(%rax), %xmm3
> - pmovmskb %xmm1, %edx
> - pmovmskb %xmm2, %r8d
> - pmovmskb %xmm3, %ecx
> - salq $16, %rdx
> - salq $16, %rcx
> - orq %r8, %rcx
> - salq $32, %rcx
> - orq %rcx, %rdx
> -#endif
> -
> - /* When no zero byte is found xmm1-3 are zero so we do not have to
> - zero them. */
> - PROLOG(loop)
> -
> - .p2align 4
> -L(cross_page):
> - andq $-64, %rax
> - FIND_ZERO
> - PROLOG(loop_init)
> -
> -#ifdef AS_STRNLEN
> -/* We must do this check to correctly handle strnlen (s, -1). */
> -L(strnlen_ret):
> - bts %rsi, %rdx
> - sarq %cl, %rdx
> - test %rdx, %rdx
> - je L(loop_init)
> - bsfq %rdx, %rax
> - SHIFT_RETURN
> - ret
> -#endif
> - .p2align 4
> -L(loop_init):
> - pxor %xmm1, %xmm1
> - pxor %xmm2, %xmm2
> - pxor %xmm3, %xmm3
> -#ifdef AS_STRNLEN
> - .p2align 4
> -L(loop):
> -
> - addq $64, %rax
> - cmpq %rax, %r10
> - je L(exit_end)
> -
> - movdqa (%rax), %xmm0
> - PMINU 16(%rax), %xmm0
> - PMINU 32(%rax), %xmm0
> - PMINU 48(%rax), %xmm0
> - PCMPEQ %xmm3, %xmm0
> - pmovmskb %xmm0, %edx
> - testl %edx, %edx
> - jne L(exit)
> - jmp L(loop)
> -
> - .p2align 4
> -L(exit_end):
> - cmp %rax, %r11
> - je L(first) /* Do not read when end is at page boundary. */
> - pxor %xmm0, %xmm0
> - FIND_ZERO
> -
> -L(first):
> - bts %r11, %rdx
> - bsfq %rdx, %rdx
> - addq %rdx, %rax
> - subq %rdi, %rax
> - SHIFT_RETURN
> - ret
> -
> - .p2align 4
> -L(exit):
> - pxor %xmm0, %xmm0
> - FIND_ZERO
> -
> - bsfq %rdx, %rdx
> - addq %rdx, %rax
> - subq %rdi, %rax
> - SHIFT_RETURN
> - ret
> -
> -#else
> -
> - /* Main loop. Unrolled twice to improve L2 cache performance on core2. */
> - .p2align 4
> -L(loop):
> -
> - movdqa 64(%rax), %xmm0
> - PMINU 80(%rax), %xmm0
> - PMINU 96(%rax), %xmm0
> - PMINU 112(%rax), %xmm0
> - PCMPEQ %xmm3, %xmm0
> - pmovmskb %xmm0, %edx
> - testl %edx, %edx
> - jne L(exit64)
> -
> - subq $-128, %rax
> -
> - movdqa (%rax), %xmm0
> - PMINU 16(%rax), %xmm0
> - PMINU 32(%rax), %xmm0
> - PMINU 48(%rax), %xmm0
> - PCMPEQ %xmm3, %xmm0
> - pmovmskb %xmm0, %edx
> - testl %edx, %edx
> - jne L(exit0)
> - jmp L(loop)
> -
> - .p2align 4
> -L(exit64):
> - addq $64, %rax
> -L(exit0):
> - pxor %xmm0, %xmm0
> - FIND_ZERO
> -
> - bsfq %rdx, %rdx
> - addq %rdx, %rax
> - subq %rdi, %rax
> - SHIFT_RETURN
> - ret
> -
> -#endif
> -
> -END(strlen)
> diff --git a/sysdeps/x86_64/multiarch/strnlen-sse2.S b/sysdeps/x86_64/multiarch/strnlen-sse2.S
> index c4f395c210..a50c7d6a28 100644
> --- a/sysdeps/x86_64/multiarch/strnlen-sse2.S
> +++ b/sysdeps/x86_64/multiarch/strnlen-sse2.S
> @@ -17,12 +17,10 @@
> <https://www.gnu.org/licenses/>. */
>
> #if IS_IN (libc)
> -# define __strnlen __strnlen_sse2
> -
> -# undef weak_alias
> -# define weak_alias(__strnlen, strnlen)
> -# undef libc_hidden_builtin_def
> -# define libc_hidden_builtin_def(strnlen)
> +# ifndef STRLEN
> +# define STRLEN __strnlen_sse2
> +# endif
> #endif
>
> -#include "../strnlen.S"
> +#define AS_STRNLEN
> +#include "strlen-sse2.S"
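
With strnlen-sse2.S reduced to a parameterized include, the strnlen-specific
logic lives behind AS_STRNLEN in strlen-sse2.S above, and its trickiest piece
is the bts sentinel (bts %rsi, %rdx and bts %r11, %rdx): a bit is planted at
the position of the length limit so the subsequent bit scan can never report
an offset past n, which is also what makes strnlen (s, -1) come out right.
A hedged C sketch of that trick (the helper name is illustrative):

    #include <stdint.h>

    /* zero_mask has bit i set iff byte i of a 64-byte block is NUL;
       limit (< 64) is how many bytes of the block still fall inside n.
       Returns the offset of the first NUL, capped at limit.  bts with a
       register operand reduces the bit index mod 64, matching this.  */
    static unsigned find_first_nul_or_limit(uint64_t zero_mask, unsigned limit)
    {
        zero_mask |= (uint64_t) 1 << limit;   /* plant the sentinel */
        return __builtin_ctzll(zero_mask);    /* earliest set bit wins */
    }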
> diff --git a/sysdeps/x86_64/multiarch/wcslen-sse4_1.S b/sysdeps/x86_64/multiarch/wcslen-sse4_1.S
> index e306a77f51..c88e8342a1 100644
> --- a/sysdeps/x86_64/multiarch/wcslen-sse4_1.S
> +++ b/sysdeps/x86_64/multiarch/wcslen-sse4_1.S
> @@ -1,5 +1,5 @@
> #define AS_WCSLEN
> -#define strlen __wcslen_sse4_1
> +#define STRLEN __wcslen_sse4_1
> #define SECTION(p) p##.sse4.1
>
> -#include "strlen-vec.S"
> +#include "strlen-sse2.S"
> diff --git a/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S b/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S
> index d2f7dd6e22..17cdedc2a9 100644
> --- a/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S
> +++ b/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S
> @@ -1,6 +1,6 @@
> #define AS_WCSLEN
> #define AS_STRNLEN
> -#define strlen __wcsnlen_sse4_1
> +#define STRLEN __wcsnlen_sse4_1
> #define SECTION(p) p##.sse4.1
>
> -#include "strlen-vec.S"
> +#include "strlen-sse2.S"
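
The jnz __wcslen_sse4_1 fallback that wcsnlen-sse4_1.S inherits from the
AS_WCSLEN/AS_STRNLEN path corresponds to the overflow comment above: once
maxlen * sizeof(wchar_t) cannot be represented, the bound is vacuous and an
unbounded scan is safe.  A scalar sketch of that guard (a minimal sketch
assuming 64-bit size_t and 4-byte wchar_t; my_wcsnlen is a hypothetical
stand-in, not the glibc routine):

    #include <stddef.h>
    #include <wchar.h>

    size_t my_wcsnlen(const wchar_t *s, size_t maxlen)
    {
        /* If maxlen * 4 overflows (either of the top two bits of maxlen
           is set), no real buffer can be that large, so a terminator
           must lie in valid memory and an unbounded wcslen suffices.  */
        if (maxlen >> 62)
            return wcslen(s);

        size_t i;
        for (i = 0; i < maxlen && s[i] != L'\0'; i++)
            ;
        return i;
    }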
> diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
> index e1f0b19f2f..c2f5674f8d 100644
> --- a/sysdeps/x86_64/strlen.S
> +++ b/sysdeps/x86_64/strlen.S
> @@ -16,6 +16,7 @@
> License along with the GNU C Library; if not, see
> <https://www.gnu.org/licenses/>. */
>
> -#include "multiarch/strlen-vec.S"
> +#define STRLEN strlen
> +#include "multiarch/strlen-sse2.S"
>
> libc_hidden_builtin_def (strlen)
> diff --git a/sysdeps/x86_64/strnlen.S b/sysdeps/x86_64/strnlen.S
> index d3c43ac482..174970d58f 100644
> --- a/sysdeps/x86_64/strnlen.S
> +++ b/sysdeps/x86_64/strnlen.S
> @@ -1,6 +1,6 @@
> -#define AS_STRNLEN
> -#define strlen __strnlen
> -#include "strlen.S"
> +#define STRLEN __strnlen
> +#include "multiarch/strnlen-sse2.S"
>
> +libc_hidden_def (__strnlen)
> weak_alias (__strnlen, strnlen);
> libc_hidden_builtin_def (strnlen)
> --
> 2.34.1
>
LGTM.
Thanks.
--
H.J.
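
One more detail of the reviewed file that deserves a gloss is the entry check
andq $4095, %rcx; cmpq $4047, %rcx; ja L(cross_page): it decides whether the
first 64-byte probe, taken from the pointer rounded down to 16 bytes, can be
issued without touching the next page.  A small sketch of the predicate
(assuming 4 KiB pages; the helper name is illustrative):

    #include <stdint.h>

    /* True when rounding p down to 16 bytes and reading 64 bytes stays
       inside p's 4 KiB page: offsets 4032-4047 round down to 4032, so
       the read ends exactly at the page boundary.  */
    static int first_probe_fits_page(const void *p)
    {
        return ((uintptr_t) p & 4095) <= 4047;
    }

Offsets above 4047 take L(cross_page), which instead aligns down to 64 bytes;
that read also cannot cross a page, and PROLOG masks off the bytes before the
string start.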
* Re: [PATCH v1] x86: Move strlen SSE2 implementation to multiarch/strlen-sse2.S
2022-07-12 23:29 ` [PATCH v1] x86: Move strlen SSE2 implementation to multiarch/strlen-sse2.S H.J. Lu
@ 2022-07-13 4:06 ` Noah Goldstein
0 siblings, 0 replies; 21+ messages in thread
From: Noah Goldstein @ 2022-07-13 4:06 UTC (permalink / raw)
To: H.J. Lu; +Cc: GNU C Library, Carlos O'Donell
Carlos,
Any issue with pushing the "move <func> SSE2 implementation to
multiarch/<func>-sse2.S" commits?
The follow-on patches on patchwork:
https://patchwork.sourceware.org/project/glibc/patch/20220712192910.351121-8-goldstein.w.n@gmail.com/
https://patchwork.sourceware.org/project/glibc/patch/20220712192910.351121-7-goldstein.w.n@gmail.com/
https://patchwork.sourceware.org/project/glibc/patch/20220712192910.351121-6-goldstein.w.n@gmail.com/
https://patchwork.sourceware.org/project/glibc/patch/20220712192910.351121-5-goldstein.w.n@gmail.com/
https://patchwork.sourceware.org/project/glibc/patch/20220712192910.351121-4-goldstein.w.n@gmail.com/
https://patchwork.sourceware.org/project/glibc/patch/20220712192910.351121-3-goldstein.w.n@gmail.com/
https://patchwork.sourceware.org/project/glibc/patch/20220712192910.351121-2-goldstein.w.n@gmail.com/
https://patchwork.sourceware.org/project/glibc/patch/20220712192910.351121-1-goldstein.w.n@gmail.com/
https://patchwork.sourceware.org/project/glibc/patch/20220712192808.335531-4-goldstein.w.n@gmail.com/
https://patchwork.sourceware.org/project/glibc/patch/20220712192808.335531-3-goldstein.w.n@gmail.com/
https://patchwork.sourceware.org/project/glibc/patch/20220712192808.335531-2-goldstein.w.n@gmail.com/
They are necessary for an upcoming patch that raises the explicit ISA
levels, which I hope to get into 2.36, though it may be too near the
release date for such large changes.
On Tue, Jul 12, 2022 at 4:30 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Tue, Jul 12, 2022 at 12:29 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > This commit doesn't affect libc.so.6; it's just housekeeping to prepare
> > for adding explicit ISA level support.
> >
> > [...]
>
> LGTM.
>
> Thanks.
>
> --
> H.J.
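
For context on the explicit ISA level support these housekeeping moves
prepare for: glibc chooses among variants such as __strlen_sse2 at load time
through IFUNC resolvers, and the follow-up series constrains that choice by a
build-time minimum ISA level.  A minimal, self-contained sketch of the
general IFUNC mechanism on x86 (GCC extensions; my_strlen and its resolver
are illustrative, not glibc's actual selector):

    #include <stddef.h>

    static size_t my_strlen_scalar(const char *s)
    {
        size_t n = 0;
        while (s[n] != '\0')
            n++;
        return n;
    }

    /* A real SSE2 candidate would use vector loads; the scalar body is
       reused here only to keep the sketch self-contained.  */
    static size_t my_strlen_sse2(const char *s)
    {
        return my_strlen_scalar(s);
    }

    /* Run once by the dynamic loader; its return value becomes the
       definition of my_strlen for the rest of the process.  */
    static size_t (*my_strlen_resolver(void))(const char *)
    {
        __builtin_cpu_init();
        return __builtin_cpu_supports("sse2")
               ? my_strlen_sse2 : my_strlen_scalar;
    }

    size_t my_strlen(const char *s)
        __attribute__((ifunc("my_strlen_resolver")));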
Thread overview: 21+ messages
2022-07-12 19:29 [PATCH v1] x86: Move strlen SSE2 implementation to multiarch/strlen-sse2.S Noah Goldstein
2022-07-12 19:29 ` [PATCH v1] x86: Move strcpy SSE2 implementation to multiarch/strcpy-sse2.S Noah Goldstein
2022-07-12 23:23 ` H.J. Lu
2022-07-12 19:29 ` [PATCH v1] x86: Move memrchr SSE2 implementation to multiarch/memrchr-sse2.S Noah Goldstein
2022-07-12 22:58 ` H.J. Lu
2022-07-12 19:29 ` [PATCH v1] x86: Move strrchr SSE2 implementation to multiarch/strrchr-sse2.S Noah Goldstein
2022-07-12 22:28 ` H.J. Lu
2022-07-12 19:29 ` [PATCH v1] x86: Move strchr SSE2 implementation to multiarch/strchr-sse2.S Noah Goldstein
2022-07-12 21:27 ` H.J. Lu
2022-07-12 19:29 ` [PATCH v1] x86: Move strcat SSE2 implementation to multiarch/strcat-sse2.S Noah Goldstein
2022-07-12 21:16 ` H.J. Lu
2022-07-12 19:29 ` [PATCH v1] x86: Move wcschr SSE2 implementation to multiarch/wcschr-sse2.S Noah Goldstein
2022-07-12 20:55 ` H.J. Lu
2022-07-12 19:29 ` [PATCH v1] x86: Move wcslen SSE2 implementation to multiarch/wcslen-sse2.S Noah Goldstein
2022-07-12 20:26 ` H.J. Lu
2022-07-12 19:29 ` [PATCH v1] x86: Remove unneeded rtld-wmemcmp Noah Goldstein
2022-07-12 19:44 ` H.J. Lu
2022-07-12 19:29 ` [PATCH v1] x86: Add missing rtm tests for strcmp family Noah Goldstein
2022-07-12 19:59 ` H.J. Lu
2022-07-12 23:29 ` [PATCH v1] x86: Move strlen SSE2 implementation to multiarch/strlen-sse2.S H.J. Lu
2022-07-13 4:06 ` Noah Goldstein