* [PATCH] x86-64: Optimize strchr/strchrnul/wcschr with AVX2
From: H.J. Lu @ 2017-06-01 18:13 UTC
To: GNU C Library
Optimize strchr/strchrnul/wcschr with AVX2 to search 32 bytes with vector
instructions. It is as fast as the SSE2 versions for size <= 16 bytes and up
to 1X faster for size > 16 bytes on Haswell. Select the AVX2 version on
AVX2 machines where vzeroupper is preferred and AVX unaligned loads are fast.
NB: It uses TZCNT instead of BSF since TZCNT produces the same result
as BSF for non-zero input. TZCNT is faster than BSF and is executed
as BSF if the machine doesn't support TZCNT.
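To illustrate the idea, here is a rough C intrinsics equivalent of one
32-byte probe (names are invented for illustration; the actual
implementation is the hand-written assembly in the patch):

#include <immintrin.h>

/* Illustration only: one unaligned 32-byte probe.  Requires AVX2 (and
   BMI for _tzcnt_u32).  Returns a pointer to the first byte that is
   either C or the null terminator, or NULL if neither occurs in these
   32 bytes.  strchr additionally checks whether the hit is CHAR or the
   terminator; strchrnul returns the hit either way.  */
static const char *
probe_32 (const char *p, char c)
{
  __m256i vc = _mm256_set1_epi8 (c);      /* vpbroadcastb: broadcast CHAR.  */
  __m256i vz = _mm256_setzero_si256 ();   /* Zero vector for the null byte.  */
  __m256i data = _mm256_loadu_si256 ((const __m256i *) p);
  __m256i mc = _mm256_cmpeq_epi8 (data, vc);  /* vpcmpeqb against CHAR.  */
  __m256i mz = _mm256_cmpeq_epi8 (data, vz);  /* vpcmpeqb against null.  */
  unsigned int mask
    = _mm256_movemask_epi8 (_mm256_or_si256 (mc, mz)); /* vpor + vpmovmskb.  */
  if (mask == 0)
    return NULL;                   /* No CHAR and no null byte here.  */
  return p + _tzcnt_u32 (mask);    /* tzcnt: index of the first hit.  */
}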
Any comments?
H.J.
* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
strchr-avx2, strchrnul-avx2 and wcschr-avx2.
* sysdeps/x86_64/multiarch/ifunc-impl-list.c
(__libc_ifunc_impl_list): Add tests for __strchr_avx2,
__strchrnul_avx2, __strchrnul_sse2, __wcschr_avx2 and
__wcschr_sse2.
* sysdeps/x86_64/multiarch/strchr-avx2.S: New file.
* sysdeps/x86_64/multiarch/strchrnul-avx2.S: Likewise.
* sysdeps/x86_64/multiarch/strchrnul.S: Likewise.
* sysdeps/x86_64/multiarch/wcschr-avx2.S: Likewise.
* sysdeps/x86_64/multiarch/wcschr.S: Likewise.
* sysdeps/x86_64/multiarch/strchr.S (strchr): Add support for
__strchr_avx2.
---
sysdeps/x86_64/multiarch/Makefile | 2 +
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 17 ++
sysdeps/x86_64/multiarch/strchr-avx2.S | 254 +++++++++++++++++++++++++++++
sysdeps/x86_64/multiarch/strchr.S | 9 +
sysdeps/x86_64/multiarch/strchrnul-avx2.S | 3 +
sysdeps/x86_64/multiarch/strchrnul.S | 55 +++++++
sysdeps/x86_64/multiarch/wcschr-avx2.S | 3 +
sysdeps/x86_64/multiarch/wcschr.S | 67 ++++++++
8 files changed, 410 insertions(+)
create mode 100644 sysdeps/x86_64/multiarch/strchr-avx2.S
create mode 100644 sysdeps/x86_64/multiarch/strchrnul-avx2.S
create mode 100644 sysdeps/x86_64/multiarch/strchrnul.S
create mode 100644 sysdeps/x86_64/multiarch/wcschr-avx2.S
create mode 100644 sysdeps/x86_64/multiarch/wcschr.S
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index f33f21a..2974495 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -13,6 +13,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
memcpy-ssse3-back \
memmove-ssse3-back \
memmove-avx512-no-vzeroupper strcasecmp_l-ssse3 \
+ strchr-avx2 strchrnul-avx2 \
strlen-avx2 strnlen-avx2 \
strncase_l-ssse3 strcat-ssse3 strncat-ssse3\
strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
@@ -37,5 +38,6 @@ sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \
wmemchr-avx2 \
wmemcmp-avx2 \
wcscpy-ssse3 wcscpy-c \
+ wcschr-avx2 \
wcslen-avx2 wcsnlen-avx2
endif
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index c2b07b3..d3fa98f 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -230,9 +230,19 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/strchr.S. */
IFUNC_IMPL (i, name, strchr,
+ IFUNC_IMPL_ADD (array, i, strchr,
+ HAS_ARCH_FEATURE (AVX2_Usable),
+ __strchr_avx2)
IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_sse2_no_bsf)
IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_sse2))
+ /* Support sysdeps/x86_64/multiarch/strchrnul.S. */
+ IFUNC_IMPL (i, name, strchrnul,
+ IFUNC_IMPL_ADD (array, i, strchrnul,
+ HAS_ARCH_FEATURE (AVX2_Usable),
+ __strchrnul_avx2)
+ IFUNC_IMPL_ADD (array, i, strchrnul, 1, __strchrnul_sse2))
+
/* Support sysdeps/x86_64/multiarch/strcmp.S. */
IFUNC_IMPL (i, name, strcmp,
IFUNC_IMPL_ADD (array, i, strcmp, HAS_CPU_FEATURE (SSE4_2),
@@ -317,6 +327,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2))
+ /* Support sysdeps/x86_64/multiarch/wcschr.S. */
+ IFUNC_IMPL (i, name, wcschr,
+ IFUNC_IMPL_ADD (array, i, wcschr,
+ HAS_ARCH_FEATURE (AVX2_Usable),
+ __wcschr_avx2)
+ IFUNC_IMPL_ADD (array, i, wcschr, 1, __wcschr_sse2))
+
/* Support sysdeps/x86_64/multiarch/wcscpy.S. */
IFUNC_IMPL (i, name, wcscpy,
IFUNC_IMPL_ADD (array, i, wcscpy, HAS_CPU_FEATURE (SSSE3),
diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S
new file mode 100644
index 0000000..e4292d3
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strchr-avx2.S
@@ -0,0 +1,254 @@
+/* strchr/strchrnul optimized with AVX2.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# ifndef STRCHR
+# define STRCHR __strchr_avx2
+# endif
+
+# ifdef USE_AS_WCSCHR
+# define VPBROADCAST vpbroadcastd
+# define VPCMPEQ vpcmpeqd
+# define CHAR_REG esi
+# else
+# define VPBROADCAST vpbroadcastb
+# define VPCMPEQ vpcmpeqb
+# define CHAR_REG sil
+# endif
+
+# ifndef VZEROUPPER
+# define VZEROUPPER vzeroupper
+# endif
+
+# define VEC_SIZE 32
+
+ .section .text.avx,"ax",@progbits
+ENTRY (STRCHR)
+ movl %edi, %ecx
+ /* Broadcast CHAR to YMM0. */
+ vmovd %esi, %xmm0
+ vpxor %xmm9, %xmm9, %xmm9
+ VPBROADCAST %xmm0, %ymm0
+ /* Check if we may cross page boundary with one vector load. */
+ andl $(2 * VEC_SIZE - 1), %ecx
+ cmpl $VEC_SIZE, %ecx
+ ja L(cros_page_boundary)
+
+ /* Check the first VEC_SIZE bytes. Search for both CHAR and the
+ null byte. */
+ vmovdqu (%rdi), %ymm8
+ VPCMPEQ %ymm8, %ymm0, %ymm1
+ VPCMPEQ %ymm8, %ymm9, %ymm2
+ vpor %ymm1, %ymm2, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x0)
+
+ /* Align data for aligned loads in the loop. */
+ addq $VEC_SIZE, %rdi
+ andl $(VEC_SIZE - 1), %ecx
+ andq $-VEC_SIZE, %rdi
+
+ jmp L(more_4x_vec)
+
+ .p2align 4
+L(cros_page_boundary):
+ andl $(VEC_SIZE - 1), %ecx
+ andq $-VEC_SIZE, %rdi
+ vmovdqu (%rdi), %ymm8
+ VPCMPEQ %ymm8, %ymm0, %ymm1
+ VPCMPEQ %ymm8, %ymm9, %ymm2
+ vpor %ymm1, %ymm2, %ymm1
+ vpmovmskb %ymm1, %eax
+ /* Remove the leading bytes. */
+ sarl %cl, %eax
+ testl %eax, %eax
+ jz L(aligned_more)
+ /* Found CHAR or the null byte. */
+ tzcntl %eax, %eax
+ addq %rcx, %rax
+# ifdef USE_AS_STRCHRNUL
+ addq %rdi, %rax
+# else
+ xorl %edx, %edx
+ leaq (%rdi, %rax), %rax
+ cmp (%rax), %CHAR_REG
+ cmovne %rdx, %rax
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(aligned_more):
+ addq $VEC_SIZE, %rdi
+
+L(more_4x_vec):
+ /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
+ since data is only aligned to VEC_SIZE. */
+ vmovdqa (%rdi), %ymm8
+ VPCMPEQ %ymm8, %ymm0, %ymm1
+ VPCMPEQ %ymm8, %ymm9, %ymm2
+ vpor %ymm1, %ymm2, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x0)
+
+ vmovdqa VEC_SIZE(%rdi), %ymm8
+ VPCMPEQ %ymm8, %ymm0, %ymm1
+ VPCMPEQ %ymm8, %ymm9, %ymm2
+ vpor %ymm1, %ymm2, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x1)
+
+ vmovdqa (VEC_SIZE * 2)(%rdi), %ymm8
+ VPCMPEQ %ymm8, %ymm0, %ymm1
+ VPCMPEQ %ymm8, %ymm9, %ymm2
+ vpor %ymm1, %ymm2, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x2)
+
+ vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
+ VPCMPEQ %ymm8, %ymm0, %ymm1
+ VPCMPEQ %ymm8, %ymm9, %ymm2
+ vpor %ymm1, %ymm2, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x3)
+
+ addq $(VEC_SIZE * 4), %rdi
+
+ /* Align data to 4 * VEC_SIZE. */
+ movq %rdi, %rcx
+ andl $(4 * VEC_SIZE - 1), %ecx
+ andq $-(4 * VEC_SIZE), %rdi
+
+ .p2align 4
+L(loop_4x_vec):
+ /* Compare 4 * VEC at a time forward. */
+ vmovdqa (%rdi), %ymm5
+ vmovdqa VEC_SIZE(%rdi), %ymm6
+ vmovdqa (VEC_SIZE * 2)(%rdi), %ymm7
+ vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
+
+ VPCMPEQ %ymm5, %ymm0, %ymm1
+ VPCMPEQ %ymm6, %ymm0, %ymm2
+ VPCMPEQ %ymm7, %ymm0, %ymm3
+ VPCMPEQ %ymm8, %ymm0, %ymm4
+
+ VPCMPEQ %ymm5, %ymm9, %ymm5
+ VPCMPEQ %ymm6, %ymm9, %ymm6
+ VPCMPEQ %ymm7, %ymm9, %ymm7
+ VPCMPEQ %ymm8, %ymm9, %ymm8
+
+ vpor %ymm1, %ymm5, %ymm1
+ vpor %ymm2, %ymm6, %ymm2
+ vpor %ymm3, %ymm7, %ymm3
+ vpor %ymm4, %ymm8, %ymm4
+
+ vpor %ymm1, %ymm2, %ymm5
+ vpor %ymm3, %ymm4, %ymm6
+
+ vpor %ymm5, %ymm6, %ymm5
+
+ vpmovmskb %ymm5, %eax
+ testl %eax, %eax
+ jnz L(4x_vec_end)
+
+ addq $(VEC_SIZE * 4), %rdi
+
+ jmp L(loop_4x_vec)
+
+ .p2align 4
+L(first_vec_x0):
+ /* Found CHAR or the null byte. */
+ tzcntl %eax, %eax
+# ifdef USE_AS_STRCHRNUL
+ addq %rdi, %rax
+# else
+ xorl %edx, %edx
+ leaq (%rdi, %rax), %rax
+ cmp (%rax), %CHAR_REG
+ cmovne %rdx, %rax
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(first_vec_x1):
+ tzcntl %eax, %eax
+# ifdef USE_AS_STRCHRNUL
+ addq $VEC_SIZE, %rax
+ addq %rdi, %rax
+# else
+ xorl %edx, %edx
+ leaq VEC_SIZE(%rdi, %rax), %rax
+ cmp (%rax), %CHAR_REG
+ cmovne %rdx, %rax
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(first_vec_x2):
+ tzcntl %eax, %eax
+# ifdef USE_AS_STRCHRNUL
+ addq $(VEC_SIZE * 2), %rax
+ addq %rdi, %rax
+# else
+ xorl %edx, %edx
+ leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
+ cmp (%rax), %CHAR_REG
+ cmovne %rdx, %rax
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(4x_vec_end):
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x0)
+ vpmovmskb %ymm2, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x1)
+ vpmovmskb %ymm3, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x2)
+ vpmovmskb %ymm4, %eax
+ testl %eax, %eax
+L(first_vec_x3):
+ tzcntl %eax, %eax
+# ifdef USE_AS_STRCHRNUL
+ addq $(VEC_SIZE * 3), %rax
+ addq %rdi, %rax
+# else
+ xorl %edx, %edx
+ leaq (VEC_SIZE * 3)(%rdi, %rax), %rax
+ cmp (%rax), %CHAR_REG
+ cmovne %rdx, %rax
+# endif
+ VZEROUPPER
+ ret
+
+END (STRCHR)
+#endif
diff --git a/sysdeps/x86_64/multiarch/strchr.S b/sysdeps/x86_64/multiarch/strchr.S
index c9f54ca..f83ba6e 100644
--- a/sysdeps/x86_64/multiarch/strchr.S
+++ b/sysdeps/x86_64/multiarch/strchr.S
@@ -26,6 +26,15 @@
ENTRY(strchr)
.type strchr, @gnu_indirect_function
LOAD_RTLD_GLOBAL_RO_RDX
+ HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
+ jnz 1f
+ HAS_ARCH_FEATURE (AVX2_Usable)
+ jz 1f
+ HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+ jz 1f
+ leaq __strchr_avx2(%rip), %rax
+ ret
+1:
leaq __strchr_sse2(%rip), %rax
2: HAS_ARCH_FEATURE (Slow_BSF)
jz 3f
diff --git a/sysdeps/x86_64/multiarch/strchrnul-avx2.S b/sysdeps/x86_64/multiarch/strchrnul-avx2.S
new file mode 100644
index 0000000..fa0cc09
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strchrnul-avx2.S
@@ -0,0 +1,3 @@
+#define STRCHR __strchrnul_avx2
+#define USE_AS_STRCHRNUL 1
+#include "strchr-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strchrnul.S b/sysdeps/x86_64/multiarch/strchrnul.S
new file mode 100644
index 0000000..4df9c39
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strchrnul.S
@@ -0,0 +1,55 @@
+/* Multiple versions of strchrnul
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+ .text
+ENTRY(__strchrnul)
+ .type __strchrnul, @gnu_indirect_function
+ LOAD_RTLD_GLOBAL_RO_RDX
+ HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
+ jnz 1f
+ HAS_ARCH_FEATURE (AVX2_Usable)
+ jz 1f
+ HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+ jz 1f
+ leaq __strchrnul_avx2(%rip), %rax
+ ret
+
+1: leaq __strchrnul_sse2(%rip), %rax
+ ret
+END(__strchrnul)
+
+# undef ENTRY
+# define ENTRY(name) \
+ .type __strchrnul_sse2, @function; \
+ .p2align 4; \
+ .globl __strchrnul_sse2; \
+ .hidden __strchrnul_sse2; \
+ __strchrnul_sse2: cfi_startproc; \
+ CALL_MCOUNT
+# undef END
+# define END(name) \
+ cfi_endproc; .size __strchrnul_sse2, .-__strchrnul_sse2
+#endif
+
+#include "../strchrnul.S"
diff --git a/sysdeps/x86_64/multiarch/wcschr-avx2.S b/sysdeps/x86_64/multiarch/wcschr-avx2.S
new file mode 100644
index 0000000..67726b6
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcschr-avx2.S
@@ -0,0 +1,3 @@
+#define STRCHR __wcschr_avx2
+#define USE_AS_WCSCHR 1
+#include "strchr-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wcschr.S b/sysdeps/x86_64/multiarch/wcschr.S
new file mode 100644
index 0000000..e132acc
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcschr.S
@@ -0,0 +1,67 @@
+/* Multiple versions of wcschr
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+#include <init-arch.h>
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+ .text
+ENTRY(__wcschr)
+ .type __wcschr, @gnu_indirect_function
+ LOAD_RTLD_GLOBAL_RO_RDX
+ HAS_ARCH_FEATURE (Prefer_No_VZEROUPPER)
+ jnz 1f
+ HAS_ARCH_FEATURE (AVX2_Usable)
+ jz 1f
+ HAS_ARCH_FEATURE (AVX_Fast_Unaligned_Load)
+ jz 1f
+ leaq __wcschr_avx2(%rip), %rax
+ ret
+
+1: leaq __wcschr_sse2(%rip), %rax
+ ret
+END(__wcschr)
+
+# undef ENTRY
+# define ENTRY(name) \
+ .type __wcschr_sse2, @function; \
+ .p2align 4; \
+ .globl __wcschr_sse2; \
+ .hidden __wcschr_sse2; \
+ __wcschr_sse2: cfi_startproc; \
+ CALL_MCOUNT
+# undef END
+# define END(name) \
+ cfi_endproc; .size __wcschr_sse2, .-__wcschr_sse2
+
+# ifdef SHARED
+/* It doesn't make sense to send libc-internal wcschr calls through a PLT.
+ The speedup we get from using AVX2 instructions is likely eaten away
+ by the indirect call in the PLT. */
+# undef libc_hidden_def
+# define libc_hidden_def(name) \
+ .globl __GI___wcschr; __GI___wcschr = __wcschr_sse2
+# undef libc_hidden_weak
+# define libc_hidden_weak(name) \
+ .weak __GI_wcschr; __GI_wcschr = __wcschr_sse2
+# endif
+#endif
+
+#include "../wcschr.S"
--
2.9.4
* Re: [PATCH] x86-64: Optimize strchr/strchrnul/wcschr with AVX2
From: H.J. Lu @ 2017-06-02 19:49 UTC
To: GNU C Library
[-- Attachment #1: Type: text/plain, Size: 1379 bytes --]
On Thu, Jun 1, 2017 at 11:13 AM, H.J. Lu <hongjiu.lu@intel.com> wrote:
> Optimize strchr/strchrnul/wcschr with AVX2 to search 32 bytes with vector
> instructions. It is as fast as the SSE2 versions for size <= 16 bytes and up
> to 1X faster for size > 16 bytes on Haswell. Select the AVX2 version on
> AVX2 machines where vzeroupper is preferred and AVX unaligned loads are fast.
>
> NB: It uses TZCNT instead of BSF since TZCNT produces the same result
> as BSF for non-zero input. TZCNT is faster than BSF and is executed
> as BSF if the machine doesn't support TZCNT.
>
> Any comments?
>
> H.J.
> * sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
> strchr-avx2, strchrnul-avx2 and wcschr-avx2.
> * sysdeps/x86_64/multiarch/ifunc-impl-list.c
> (__libc_ifunc_impl_list): Add tests for __strchr_avx2,
> __strchrnul_avx2, __strchrnul_sse2, __wcschr_avx2 and
> __wcschr_sse2.
> * sysdeps/x86_64/multiarch/strchr-avx2.S: New file.
> * sysdeps/x86_64/multiarch/strchrnul-avx2.S: Likewise.
> * sysdeps/x86_64/multiarch/strchrnul.S: Likewise.
> * sysdeps/x86_64/multiarch/wcschr-avx2.S: Likewise.
> * sysdeps/x86_64/multiarch/wcschr.S: Likewise.
> * sysdeps/x86_64/multiarch/strchr.S (strchr): Add support for
> __strchr_avx2.
>
Updated patch with IFUNC selector in C.
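Note that strchrnul.c and wcschr.c in the attached patch include
ifunc-sse2-avx2.h, which comes from an earlier patch in this series and
is not part of this diff. Modeled on the strchr.c selector in the patch,
it presumably provides something like the following (a sketch under that
assumption, not the file's actual contents):

/* Assumed shape of ifunc-sse2-avx2.h: a common two-way SSE2/AVX2
   selector shared by the C ifunc wrappers.  */
# include <init-arch.h>

extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;

static inline void *
IFUNC_SELECTOR (void)
{
  const struct cpu_features* cpu_features = __get_cpu_features ();

  /* Pick AVX2 only where vzeroupper is not penalized, AVX2 is usable
     and AVX unaligned loads are fast; otherwise fall back to SSE2.  */
  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
      && CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable)
      && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
    return OPTIMIZE (avx2);

  return OPTIMIZE (sse2);
}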
--
H.J.
[-- Attachment #2: 0006-x86-64-Optimize-strchr-strchrnul-wcschr-with-AVX2.patch --]
[-- Type: text/x-patch, Size: 21728 bytes --]
From 1d028bc47e34a6250d6216f128093099e07d2c5a Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Mon, 22 May 2017 15:09:50 -0700
Subject: [PATCH 6/8] x86-64: Optimize strchr/strchrnul/wcschr with AVX2
Optimize strchr/strchrnul/wcschr with AVX2 to search 32 bytes with vector
instructions. It is as fast as the SSE2 versions for size <= 16 bytes and up
to 1X faster for size > 16 bytes on Haswell. Select the AVX2 version on
AVX2 machines where vzeroupper is preferred and AVX unaligned loads are fast.
NB: It uses TZCNT instead of BSF since TZCNT produces the same result
as BSF for non-zero input. TZCNT is faster than BSF and is executed
as BSF if the machine doesn't support TZCNT.
* sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
strchr-sse2, strchrnul-sse2, strchr-avx2, strchrnul-avx2,
wcschr-sse2 and wcschr-avx2.
* sysdeps/x86_64/multiarch/ifunc-impl-list.c
(__libc_ifunc_impl_list): Add tests for __strchr_avx2,
__strchrnul_avx2, __strchrnul_sse2, __wcschr_avx2 and
__wcschr_sse2.
* sysdeps/x86_64/multiarch/strchr-avx2.S: New file.
* sysdeps/x86_64/multiarch/strchr-sse2.S: Likewise.
* sysdeps/x86_64/multiarch/strchr.c: New file.
* sysdeps/x86_64/multiarch/strchrnul-avx2.S: Likewise.
* sysdeps/x86_64/multiarch/strchrnul-sse2.S: Likewise.
* sysdeps/x86_64/multiarch/strchrnul.c: Likewise.
* sysdeps/x86_64/multiarch/wcschr-avx2.S: Likewise.
* sysdeps/x86_64/multiarch/wcschr-sse2.S: Likewise.
* sysdeps/x86_64/multiarch/wcschr.c: Likewise.
* sysdeps/x86_64/multiarch/strchr.S: Removed.
---
sysdeps/x86_64/multiarch/Makefile | 2 +
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 17 ++
sysdeps/x86_64/multiarch/strchr-avx2.S | 254 +++++++++++++++++++++
.../x86_64/multiarch/{strchr.S => strchr-sse2.S} | 40 +---
sysdeps/x86_64/multiarch/strchr.c | 51 +++++
sysdeps/x86_64/multiarch/strchrnul-avx2.S | 3 +
sysdeps/x86_64/multiarch/strchrnul-sse2.S | 26 +++
sysdeps/x86_64/multiarch/strchrnul.c | 34 +++
sysdeps/x86_64/multiarch/wcschr-avx2.S | 3 +
sysdeps/x86_64/multiarch/wcschr-sse2.S | 38 +++
sysdeps/x86_64/multiarch/wcschr.c | 33 +++
11 files changed, 470 insertions(+), 31 deletions(-)
create mode 100644 sysdeps/x86_64/multiarch/strchr-avx2.S
rename sysdeps/x86_64/multiarch/{strchr.S => strchr-sse2.S} (60%)
create mode 100644 sysdeps/x86_64/multiarch/strchr.c
create mode 100644 sysdeps/x86_64/multiarch/strchrnul-avx2.S
create mode 100644 sysdeps/x86_64/multiarch/strchrnul-sse2.S
create mode 100644 sysdeps/x86_64/multiarch/strchrnul.c
create mode 100644 sysdeps/x86_64/multiarch/wcschr-avx2.S
create mode 100644 sysdeps/x86_64/multiarch/wcschr-sse2.S
create mode 100644 sysdeps/x86_64/multiarch/wcschr.c
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 915b44f..fd4baf3 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -13,6 +13,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c strcmp-ssse3 \
memcpy-ssse3-back \
memmove-ssse3-back \
memmove-avx512-no-vzeroupper strcasecmp_l-ssse3 \
+ strchr-sse2 strchrnul-sse2 strchr-avx2 strchrnul-avx2 \
strlen-sse2 strnlen-sse2 strlen-avx2 strnlen-avx2 \
strncase_l-ssse3 strcat-ssse3 strncat-ssse3\
strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
@@ -37,6 +38,7 @@ sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \
wmemchr-sse2 wmemchr-avx2 \
wmemcmp-avx2-movbe \
wcscpy-ssse3 wcscpy-c \
+ wcschr-sse2 wcschr-avx2 \
wcslen-sse2 wcsnlen-sse2 wcslen-avx2 wcsnlen-avx2
endif
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index f139efc..36f14a8 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -231,9 +231,19 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/strchr.S. */
IFUNC_IMPL (i, name, strchr,
+ IFUNC_IMPL_ADD (array, i, strchr,
+ HAS_ARCH_FEATURE (AVX2_Usable),
+ __strchr_avx2)
IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_sse2_no_bsf)
IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_sse2))
+ /* Support sysdeps/x86_64/multiarch/strchrnul.S. */
+ IFUNC_IMPL (i, name, strchrnul,
+ IFUNC_IMPL_ADD (array, i, strchrnul,
+ HAS_ARCH_FEATURE (AVX2_Usable),
+ __strchrnul_avx2)
+ IFUNC_IMPL_ADD (array, i, strchrnul, 1, __strchrnul_sse2))
+
/* Support sysdeps/x86_64/multiarch/strcmp.S. */
IFUNC_IMPL (i, name, strcmp,
IFUNC_IMPL_ADD (array, i, strcmp, HAS_CPU_FEATURE (SSE4_2),
@@ -318,6 +328,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2))
+ /* Support sysdeps/x86_64/multiarch/wcschr.S. */
+ IFUNC_IMPL (i, name, wcschr,
+ IFUNC_IMPL_ADD (array, i, wcschr,
+ HAS_ARCH_FEATURE (AVX2_Usable),
+ __wcschr_avx2)
+ IFUNC_IMPL_ADD (array, i, wcschr, 1, __wcschr_sse2))
+
/* Support sysdeps/x86_64/multiarch/wcscpy.S. */
IFUNC_IMPL (i, name, wcscpy,
IFUNC_IMPL_ADD (array, i, wcscpy, HAS_CPU_FEATURE (SSSE3),
diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S
new file mode 100644
index 0000000..e4292d3
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strchr-avx2.S
@@ -0,0 +1,254 @@
+/* strchr/strchrnul optimized with AVX2.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# ifndef STRCHR
+# define STRCHR __strchr_avx2
+# endif
+
+# ifdef USE_AS_WCSCHR
+# define VPBROADCAST vpbroadcastd
+# define VPCMPEQ vpcmpeqd
+# define CHAR_REG esi
+# else
+# define VPBROADCAST vpbroadcastb
+# define VPCMPEQ vpcmpeqb
+# define CHAR_REG sil
+# endif
+
+# ifndef VZEROUPPER
+# define VZEROUPPER vzeroupper
+# endif
+
+# define VEC_SIZE 32
+
+ .section .text.avx,"ax",@progbits
+ENTRY (STRCHR)
+ movl %edi, %ecx
+ /* Broadcast CHAR to YMM0. */
+ vmovd %esi, %xmm0
+ vpxor %xmm9, %xmm9, %xmm9
+ VPBROADCAST %xmm0, %ymm0
+ /* Check if we may cross page boundary with one vector load. */
+ andl $(2 * VEC_SIZE - 1), %ecx
+ cmpl $VEC_SIZE, %ecx
+ ja L(cros_page_boundary)
+
+ /* Check the first VEC_SIZE bytes. Search for both CHAR and the
+ null byte. */
+ vmovdqu (%rdi), %ymm8
+ VPCMPEQ %ymm8, %ymm0, %ymm1
+ VPCMPEQ %ymm8, %ymm9, %ymm2
+ vpor %ymm1, %ymm2, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x0)
+
+ /* Align data for aligned loads in the loop. */
+ addq $VEC_SIZE, %rdi
+ andl $(VEC_SIZE - 1), %ecx
+ andq $-VEC_SIZE, %rdi
+
+ jmp L(more_4x_vec)
+
+ .p2align 4
+L(cros_page_boundary):
+ andl $(VEC_SIZE - 1), %ecx
+ andq $-VEC_SIZE, %rdi
+ vmovdqu (%rdi), %ymm8
+ VPCMPEQ %ymm8, %ymm0, %ymm1
+ VPCMPEQ %ymm8, %ymm9, %ymm2
+ vpor %ymm1, %ymm2, %ymm1
+ vpmovmskb %ymm1, %eax
+ /* Remove the leading bytes. */
+ sarl %cl, %eax
+ testl %eax, %eax
+ jz L(aligned_more)
+ /* Found CHAR or the null byte. */
+ tzcntl %eax, %eax
+ addq %rcx, %rax
+# ifdef USE_AS_STRCHRNUL
+ addq %rdi, %rax
+# else
+ xorl %edx, %edx
+ leaq (%rdi, %rax), %rax
+ cmp (%rax), %CHAR_REG
+ cmovne %rdx, %rax
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(aligned_more):
+ addq $VEC_SIZE, %rdi
+
+L(more_4x_vec):
+ /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
+ since data is only aligned to VEC_SIZE. */
+ vmovdqa (%rdi), %ymm8
+ VPCMPEQ %ymm8, %ymm0, %ymm1
+ VPCMPEQ %ymm8, %ymm9, %ymm2
+ vpor %ymm1, %ymm2, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x0)
+
+ vmovdqa VEC_SIZE(%rdi), %ymm8
+ VPCMPEQ %ymm8, %ymm0, %ymm1
+ VPCMPEQ %ymm8, %ymm9, %ymm2
+ vpor %ymm1, %ymm2, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x1)
+
+ vmovdqa (VEC_SIZE * 2)(%rdi), %ymm8
+ VPCMPEQ %ymm8, %ymm0, %ymm1
+ VPCMPEQ %ymm8, %ymm9, %ymm2
+ vpor %ymm1, %ymm2, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x2)
+
+ vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
+ VPCMPEQ %ymm8, %ymm0, %ymm1
+ VPCMPEQ %ymm8, %ymm9, %ymm2
+ vpor %ymm1, %ymm2, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x3)
+
+ addq $(VEC_SIZE * 4), %rdi
+
+ /* Align data to 4 * VEC_SIZE. */
+ movq %rdi, %rcx
+ andl $(4 * VEC_SIZE - 1), %ecx
+ andq $-(4 * VEC_SIZE), %rdi
+
+ .p2align 4
+L(loop_4x_vec):
+ /* Compare 4 * VEC at a time forward. */
+ vmovdqa (%rdi), %ymm5
+ vmovdqa VEC_SIZE(%rdi), %ymm6
+ vmovdqa (VEC_SIZE * 2)(%rdi), %ymm7
+ vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
+
+ VPCMPEQ %ymm5, %ymm0, %ymm1
+ VPCMPEQ %ymm6, %ymm0, %ymm2
+ VPCMPEQ %ymm7, %ymm0, %ymm3
+ VPCMPEQ %ymm8, %ymm0, %ymm4
+
+ VPCMPEQ %ymm5, %ymm9, %ymm5
+ VPCMPEQ %ymm6, %ymm9, %ymm6
+ VPCMPEQ %ymm7, %ymm9, %ymm7
+ VPCMPEQ %ymm8, %ymm9, %ymm8
+
+ vpor %ymm1, %ymm5, %ymm1
+ vpor %ymm2, %ymm6, %ymm2
+ vpor %ymm3, %ymm7, %ymm3
+ vpor %ymm4, %ymm8, %ymm4
+
+ vpor %ymm1, %ymm2, %ymm5
+ vpor %ymm3, %ymm4, %ymm6
+
+ vpor %ymm5, %ymm6, %ymm5
+
+ vpmovmskb %ymm5, %eax
+ testl %eax, %eax
+ jnz L(4x_vec_end)
+
+ addq $(VEC_SIZE * 4), %rdi
+
+ jmp L(loop_4x_vec)
+
+ .p2align 4
+L(first_vec_x0):
+ /* Found CHAR or the null byte. */
+ tzcntl %eax, %eax
+# ifdef USE_AS_STRCHRNUL
+ addq %rdi, %rax
+# else
+ xorl %edx, %edx
+ leaq (%rdi, %rax), %rax
+ cmp (%rax), %CHAR_REG
+ cmovne %rdx, %rax
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(first_vec_x1):
+ tzcntl %eax, %eax
+# ifdef USE_AS_STRCHRNUL
+ addq $VEC_SIZE, %rax
+ addq %rdi, %rax
+# else
+ xorl %edx, %edx
+ leaq VEC_SIZE(%rdi, %rax), %rax
+ cmp (%rax), %CHAR_REG
+ cmovne %rdx, %rax
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(first_vec_x2):
+ tzcntl %eax, %eax
+# ifdef USE_AS_STRCHRNUL
+ addq $(VEC_SIZE * 2), %rax
+ addq %rdi, %rax
+# else
+ xorl %edx, %edx
+ leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
+ cmp (%rax), %CHAR_REG
+ cmovne %rdx, %rax
+# endif
+ VZEROUPPER
+ ret
+
+ .p2align 4
+L(4x_vec_end):
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x0)
+ vpmovmskb %ymm2, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x1)
+ vpmovmskb %ymm3, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x2)
+ vpmovmskb %ymm4, %eax
+ testl %eax, %eax
+L(first_vec_x3):
+ tzcntl %eax, %eax
+# ifdef USE_AS_STRCHRNUL
+ addq $(VEC_SIZE * 3), %rax
+ addq %rdi, %rax
+# else
+ xorl %edx, %edx
+ leaq (VEC_SIZE * 3)(%rdi, %rax), %rax
+ cmp (%rax), %CHAR_REG
+ cmovne %rdx, %rax
+# endif
+ VZEROUPPER
+ ret
+
+END (STRCHR)
+#endif
diff --git a/sysdeps/x86_64/multiarch/strchr.S b/sysdeps/x86_64/multiarch/strchr-sse2.S
similarity index 60%
rename from sysdeps/x86_64/multiarch/strchr.S
rename to sysdeps/x86_64/multiarch/strchr-sse2.S
index c9f54ca..44eb07e 100644
--- a/sysdeps/x86_64/multiarch/strchr.S
+++ b/sysdeps/x86_64/multiarch/strchr-sse2.S
@@ -1,4 +1,4 @@
-/* Multiple versions of strchr
+/* strchr optimized with SSE2.
Copyright (C) 2009-2017 Free Software Foundation, Inc.
This file is part of the GNU C Library.
@@ -16,42 +16,20 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
-#include <sysdep.h>
-#include <init-arch.h>
-
-
-/* Define multiple versions only for the definition in libc. */
#if IS_IN (libc)
- .text
-ENTRY(strchr)
- .type strchr, @gnu_indirect_function
- LOAD_RTLD_GLOBAL_RO_RDX
- leaq __strchr_sse2(%rip), %rax
-2: HAS_ARCH_FEATURE (Slow_BSF)
- jz 3f
- leaq __strchr_sse2_no_bsf(%rip), %rax
-3: ret
-END(strchr)
-
+# define strchr __strchr_sse2
-
-# undef ENTRY
-# define ENTRY(name) \
- .type __strchr_sse2, @function; \
- .align 16; \
- .globl __strchr_sse2; \
- .hidden __strchr_sse2; \
- __strchr_sse2: cfi_startproc; \
- CALL_MCOUNT
-# undef END
-# define END(name) \
- cfi_endproc; .size __strchr_sse2, .-__strchr_sse2
-# undef libc_hidden_builtin_def
+# ifdef SHARED
+# undef libc_hidden_builtin_def
/* It doesn't make sense to send libc-internal strchr calls through a PLT.
The speedup we get from using SSE4.2 instruction is likely eaten away
by the indirect call in the PLT. */
-# define libc_hidden_builtin_def(name) \
+# define libc_hidden_builtin_def(name) \
.globl __GI_strchr; __GI_strchr = __strchr_sse2
+# endif
+
+# undef weak_alias
+# define weak_alias(strchr, index)
#endif
#include "../strchr.S"
diff --git a/sysdeps/x86_64/multiarch/strchr.c b/sysdeps/x86_64/multiarch/strchr.c
new file mode 100644
index 0000000..31dc583
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strchr.c
@@ -0,0 +1,51 @@
+/* Multiple versions of strchr.
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2009-2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+# define strchr __redirect_strchr
+# include <string.h>
+# undef strchr
+
+# define SYMBOL_NAME strchr
+# include <init-arch.h>
+
+extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_no_bsf) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
+
+static inline void *
+IFUNC_SELECTOR (void)
+{
+ const struct cpu_features* cpu_features = __get_cpu_features ();
+
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
+ && CPU_FEATURES_ARCH_P (cpu_features, AVX2_Usable)
+ && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+ return OPTIMIZE (avx2);
+
+ if (CPU_FEATURES_ARCH_P (cpu_features, Slow_BSF))
+ return OPTIMIZE (sse2_no_bsf);
+
+ return OPTIMIZE (sse2);
+}
+
+libc_ifunc_redirected (__redirect_strchr, strchr, IFUNC_SELECTOR ());
+weak_alias (strchr, index)
+#endif
diff --git a/sysdeps/x86_64/multiarch/strchrnul-avx2.S b/sysdeps/x86_64/multiarch/strchrnul-avx2.S
new file mode 100644
index 0000000..fa0cc09
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strchrnul-avx2.S
@@ -0,0 +1,3 @@
+#define STRCHR __strchrnul_avx2
+#define USE_AS_STRCHRNUL 1
+#include "strchr-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/strchrnul-sse2.S b/sysdeps/x86_64/multiarch/strchrnul-sse2.S
new file mode 100644
index 0000000..4d199b3
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strchrnul-sse2.S
@@ -0,0 +1,26 @@
+/* strchrnul optimized with SSE2.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+# define __strchrnul __strchrnul_sse2
+
+# undef weak_alias
+# define weak_alias(__strchrnul, strchrnul)
+#endif
+
+#include "../strchrnul.S"
diff --git a/sysdeps/x86_64/multiarch/strchrnul.c b/sysdeps/x86_64/multiarch/strchrnul.c
new file mode 100644
index 0000000..95b6222
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strchrnul.c
@@ -0,0 +1,34 @@
+/* Multiple versions of strchrnul.
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+# define strchrnul __redirect_strchrnul
+# define __strchrnul __redirect___strchrnul
+# include <string.h>
+# undef __strchrnul
+# undef strchrnul
+
+# define SYMBOL_NAME strchrnul
+# include "ifunc-sse2-avx2.h"
+
+libc_ifunc_redirected (__redirect_strchrnul, __strchrnul,
+ IFUNC_SELECTOR ());
+weak_alias (__strchrnul, strchrnul)
+#endif
diff --git a/sysdeps/x86_64/multiarch/wcschr-avx2.S b/sysdeps/x86_64/multiarch/wcschr-avx2.S
new file mode 100644
index 0000000..67726b6
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcschr-avx2.S
@@ -0,0 +1,3 @@
+#define STRCHR __wcschr_avx2
+#define USE_AS_WCSCHR 1
+#include "strchr-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wcschr-sse2.S b/sysdeps/x86_64/multiarch/wcschr-sse2.S
new file mode 100644
index 0000000..e5fa517
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcschr-sse2.S
@@ -0,0 +1,38 @@
+/* wcschr optimized with SSE2.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+# define __wcschr __wcschr_sse2
+
+# ifdef SHARED
+/* It doesn't make sense to send libc-internal wcschr calls through a PLT.
+ The speedup we get from using AVX2 instructions is likely eaten away
+ by the indirect call in the PLT. */
+# undef libc_hidden_def
+# define libc_hidden_def(name) \
+ .globl __GI___wcschr; __GI___wcschr = __wcschr_sse2
+# undef libc_hidden_weak
+# define libc_hidden_weak(name) \
+ .weak __GI_wcschr; __GI_wcschr = __wcschr_sse2
+# endif
+
+# undef weak_alias
+# define weak_alias(__wcschr, wcschr)
+#endif
+
+#include "../wcschr.S"
diff --git a/sysdeps/x86_64/multiarch/wcschr.c b/sysdeps/x86_64/multiarch/wcschr.c
new file mode 100644
index 0000000..910468f
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcschr.c
@@ -0,0 +1,33 @@
+/* Multiple versions of wcschr.
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2017 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+# define wcschr __redirect_wcschr
+# define __wcschr __redirect___wcschr
+# include <wchar.h>
+# undef wcschr
+# undef __wcschr
+
+# define SYMBOL_NAME wcschr
+# include "ifunc-sse2-avx2.h"
+
+libc_ifunc_redirected (__redirect_wcschr, __wcschr, IFUNC_SELECTOR ());
+weak_alias (__wcschr, wcschr);
+#endif
--
2.9.4
* Re: [PATCH] x86-64: Optimize strchr/strchrnul/wcschr with AVX2
From: H.J. Lu @ 2017-06-09 11:34 UTC
To: GNU C Library
On Fri, Jun 2, 2017 at 12:49 PM, H.J. Lu <hjl.tools@gmail.com> wrote:
> On Thu, Jun 1, 2017 at 11:13 AM, H.J. Lu <hongjiu.lu@intel.com> wrote:
>> Optimize strchr/strchrnul/wcschr with AVX2 to search 32 bytes with vector
>> instructions. It is as fast as the SSE2 versions for size <= 16 bytes and up
>> to 1X faster for size > 16 bytes on Haswell. Select the AVX2 version on
>> AVX2 machines where vzeroupper is preferred and AVX unaligned loads are fast.
>>
>> NB: It uses TZCNT instead of BSF since TZCNT produces the same result
>> as BSF for non-zero input. TZCNT is faster than BSF and is executed
>> as BSF if the machine doesn't support TZCNT.
>>
>> Any comments?
>>
>> H.J.
>> * sysdeps/x86_64/multiarch/Makefile (sysdep_routines): Add
>> strchr-avx2, strchrnul-avx2 and wcschr-avx2.
>> * sysdeps/x86_64/multiarch/ifunc-impl-list.c
>> (__libc_ifunc_impl_list): Add tests for __strchr_avx2,
>> __strchrnul_avx2, __strchrnul_sse2, __wcschr_avx2 and
>> __wcschr_sse2.
>> * sysdeps/x86_64/multiarch/strchr-avx2.S: New file.
>> * sysdeps/x86_64/multiarch/strchrnul-avx2.S: Likewise.
>> * sysdeps/x86_64/multiarch/strchrnul.S: Likewise.
>> * sysdeps/x86_64/multiarch/wcschr-avx2.S: Likewise.
>> * sysdeps/x86_64/multiarch/wcschr.S: Likewise.
>> * sysdeps/x86_64/multiarch/strchr.S (strchr): Add support for
>> __strchr_avx2.
>>
>
> Updated patch with IFUNC selector in C.
>
I will check it in.
--
H.J.