* [PATCH] x86_64: Implement evex512 version of strlen, strnlen, wcslen and wcsnlen
@ 2022-05-18 18:59 Sunil K Pandey
2022-05-18 20:29 ` Noah Goldstein
0 siblings, 1 reply; 12+ messages in thread
From: Sunil K Pandey @ 2022-05-18 18:59 UTC (permalink / raw)
To: libc-alpha
This patch implements the following evex512 versions of string functions.
Perf gain up to 50% as compared to evex, depending on length and
alignment.
- String length function using 512 bit vectors.
- String N length using 512 bit vectors.
- Wide string length using 512 bit vectors.
- Wide string N length using 512 bit vectors.
---
sysdeps/x86_64/multiarch/Makefile | 4 +
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 20 ++
sysdeps/x86_64/multiarch/strlen-evex512.S | 291 +++++++++++++++++++++
sysdeps/x86_64/multiarch/strnlen-evex512.S | 4 +
sysdeps/x86_64/multiarch/wcslen-evex512.S | 4 +
sysdeps/x86_64/multiarch/wcsnlen-evex512.S | 5 +
6 files changed, 328 insertions(+)
create mode 100644 sysdeps/x86_64/multiarch/strlen-evex512.S
create mode 100644 sysdeps/x86_64/multiarch/strnlen-evex512.S
create mode 100644 sysdeps/x86_64/multiarch/wcslen-evex512.S
create mode 100644 sysdeps/x86_64/multiarch/wcsnlen-evex512.S
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index f3ab5e0928..d0869c3ac3 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -81,6 +81,7 @@ sysdep_routines += \
strlen-avx2 \
strlen-avx2-rtm \
strlen-evex \
+ strlen-evex512 \
strlen-sse2 \
strncase_l-avx2 \
strncase_l-avx2-rtm \
@@ -105,6 +106,7 @@ sysdep_routines += \
strnlen-avx2 \
strnlen-avx2-rtm \
strnlen-evex \
+ strnlen-evex512 \
strnlen-sse2 \
strpbrk-c \
strpbrk-sse2 \
@@ -138,6 +140,7 @@ sysdep_routines += \
wcslen-avx2 \
wcslen-avx2-rtm \
wcslen-evex \
+ wcslen-evex512 \
wcslen-sse2 \
wcslen-sse4_1 \
wcsncmp-avx2 \
@@ -148,6 +151,7 @@ sysdep_routines += \
wcsnlen-avx2-rtm \
wcsnlen-c \
wcsnlen-evex \
+ wcsnlen-evex512 \
wcsnlen-sse4_1 \
wcsrchr-avx2 \
wcsrchr-avx2-rtm \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 7218095430..c5cd9466fe 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -328,6 +328,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)),
__strlen_evex)
+ IFUNC_IMPL_ADD (array, i, strlen,
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)
+ && CPU_FEATURE_USABLE (BMI2)),
+ __strlen_evex512)
IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2))
/* Support sysdeps/x86_64/multiarch/strnlen.c. */
@@ -346,6 +351,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)),
__strnlen_evex)
+ IFUNC_IMPL_ADD (array, i, strnlen,
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)
+ && CPU_FEATURE_USABLE (BMI2)),
+ __strnlen_evex512)
IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_sse2))
/* Support sysdeps/x86_64/multiarch/stpncpy.c. */
@@ -699,6 +709,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)),
__wcslen_evex)
+ IFUNC_IMPL_ADD (array, i, wcslen,
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)
+ && CPU_FEATURE_USABLE (BMI2)),
+ __wcslen_evex512)
IFUNC_IMPL_ADD (array, i, wcslen,
CPU_FEATURE_USABLE (SSE4_1),
__wcslen_sse4_1)
@@ -720,6 +735,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)),
__wcsnlen_evex)
+ IFUNC_IMPL_ADD (array, i, wcsnlen,
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)
+ && CPU_FEATURE_USABLE (BMI2)),
+ __wcsnlen_evex512)
IFUNC_IMPL_ADD (array, i, wcsnlen,
CPU_FEATURE_USABLE (SSE4_1),
__wcsnlen_sse4_1)
diff --git a/sysdeps/x86_64/multiarch/strlen-evex512.S b/sysdeps/x86_64/multiarch/strlen-evex512.S
new file mode 100644
index 0000000000..13a6b34615
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strlen-evex512.S
@@ -0,0 +1,291 @@
+/* Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# ifndef STRLEN
+# define STRLEN __strlen_evex512
+# endif
+
+# define VMOVA vmovdqa64
+# ifdef USE_AS_WCSLEN
+# define VPCMP vpcmpd
+# define VPMINU vpminud
+# define CHAR_SIZE 4
+# else
+# define VPCMP vpcmpb
+# define VPMINU vpminub
+# define CHAR_SIZE 1
+# endif
+
+# define XMM0 xmm16
+# define ZMM0 zmm16
+# define ZMM1 zmm17
+# define ZMM2 zmm18
+# define ZMM3 zmm19
+# define ZMM4 zmm20
+# define VEC_SIZE 64
+# define PAGE_SIZE 4096
+# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
+
+ .section .text.evex512, "ax", @progbits
+/* Aligning entry point to 64 byte, provides better performance for
+ one vector length string. */
+ENTRY_P2ALIGN (STRLEN, 6)
+# ifdef USE_AS_STRNLEN
+ /* Check zero length. */
+ test %RSI_LP, %RSI_LP
+ jz L(zero)
+# ifdef __ILP32__
+ /* Clear the upper 32 bits. */
+ movl %esi, %esi
+# endif
+# endif
+
+ movl %edi, %ecx
+ vpxorq %XMM0, %XMM0, %XMM0
+ andl $(PAGE_SIZE - 1), %ecx
+ cmpl $(PAGE_SIZE - VEC_SIZE), %ecx
+ ja L(page_cross)
+
+ /* Compare [w]char for null, mask bit will be set for match. */
+ VPCMP $0, (%rdi), %ZMM0, %k0
+ kmovq %k0, %rax
+ testq %rax, %rax
+ jz L(align_more)
+
+ tzcntq %rax, %rax
+# ifdef USE_AS_STRNLEN
+ cmpq %rsi, %rax
+ jae L(ret_max)
+# endif
+ ret
+
+# ifdef USE_AS_STRNLEN
+ /* eax instead of rax used to save encoding space. */
+L(zero):
+ xorl %eax, %eax
+ ret
+# endif
+
+ /* At this point vector max length reached. */
+# ifdef USE_AS_STRNLEN
+L(ret_max):
+ movq %rsi, %rax
+ ret
+# endif
+
+L(page_cross):
+ andl $(VEC_SIZE - 1), %ecx
+# ifdef USE_AS_WCSLEN
+ sarl $2, %ecx
+# endif
+ /* ecx contains number of [w]char to be skipped as a result
+ of address alignment. */
+ movq %rdi, %rax
+ andq $-VEC_SIZE, %rax
+ VPCMP $0, (%rax), %ZMM0, %k0
+ kmovq %k0, %rax
+ /* Ignore number of character for alignment adjustment. */
+ shrq %cl, %rax
+ jz L(align_more)
+
+ tzcntq %rax, %rax
+# ifdef USE_AS_STRNLEN
+ cmpq %rsi, %rax
+ jae L(ret_max)
+# endif
+ ret
+
+L(align_more):
+ leaq VEC_SIZE(%rdi), %rax
+ /* Align rax to VEC_SIZE. */
+ andq $-VEC_SIZE, %rax
+# ifdef USE_AS_STRNLEN
+ movq %rax, %rdx
+ subq %rdi, %rdx
+# ifdef USE_AS_WCSLEN
+ shrq $2, %rdx
+# endif
+ /* At this point rdx contains [w]chars already compared. */
+ cmpq %rsi, %rdx
+ jae L(ret_max)
+ subq %rsi, %rdx
+ negq %rdx
+ /* At this point rdx contains the number of [w]chars left to go.
+ Now onwards rdx will keep decrementing with each compare. */
+# endif
+
+ /* Loop unroll 4 times for 4 vector loop. */
+ VPCMP $0, (%rax), %ZMM0, %k0
+ kmovq %k0, %rcx
+ testq %rcx, %rcx
+ jnz L(first_vector)
+
+# ifdef USE_AS_STRNLEN
+ subq $CHAR_PER_VEC, %rdx
+ jbe L(ret_max)
+# endif
+
+ VPCMP $0, VEC_SIZE(%rax), %ZMM0, %k0
+ kmovq %k0, %rcx
+ testq %rcx, %rcx
+ jnz L(second_vector)
+
+# ifdef USE_AS_STRNLEN
+ subq $CHAR_PER_VEC, %rdx
+ jbe L(ret_max)
+# endif
+
+ VPCMP $0, (2 * VEC_SIZE)(%rax), %ZMM0, %k0
+ kmovq %k0, %rcx
+ testq %rcx, %rcx
+ jnz L(third_vector)
+
+# ifdef USE_AS_STRNLEN
+ subq $CHAR_PER_VEC, %rdx
+ jbe L(ret_max)
+# endif
+
+ VPCMP $0, (3 * VEC_SIZE)(%rax), %ZMM0, %k0
+ kmovq %k0, %rcx
+ testq %rcx, %rcx
+ jnz L(fourth_vector)
+
+ addq $(4 * VEC_SIZE), %rax
+
+# ifdef USE_AS_STRNLEN
+ /* Instead of decreasing, rdx increased to prepare for loop
+ first iteration. Incremented 3 times because one increment
+ cancelled by previous decrement. */
+ addq $(3 * CHAR_PER_VEC), %rdx
+# endif
+
+ /* Test if address is already 4 * VEC_SIZE byte aligned goto
+ loop. */
+ testq $(3 * VEC_SIZE), %rax
+ jz L(loop)
+
+ movq %rax, %rcx
+
+ /* Align address to 4 * VEC_SIZE for loop. */
+ andq $-(4 * VEC_SIZE), %rax
+
+# ifdef USE_AS_STRNLEN
+ subq %rax, %rcx
+# ifdef USE_AS_WCSLEN
+ sarq $2, %rcx
+# endif
+ /* rcx contains number of [w]char will be recompared due to
+ alignment fixes. rdx must be incremented by rcx to offset
+ alignment adjustment. */
+ addq %rcx, %rdx
+# endif
+
+L(loop):
+# ifdef USE_AS_STRNLEN
+ subq $(CHAR_PER_VEC * 4), %rdx
+ jbe L(ret_max)
+# endif
+ /* VPMINU and VPCMP combination provides better performance as
+ compared to alternative combinations. */
+ VMOVA (%rax), %ZMM1
+ VPMINU (VEC_SIZE)(%rax), %ZMM1, %ZMM2
+ VMOVA (2 * VEC_SIZE)(%rax), %ZMM3
+ VPMINU (3 * VEC_SIZE)(%rax), %ZMM3, %ZMM4
+
+ VPCMP $0, %ZMM2, %ZMM0, %k0
+ VPCMP $0, %ZMM4, %ZMM0, %k1
+
+ addq $(4 * VEC_SIZE), %rax
+ kortestq %k0, %k1
+ jz L(loop)
+
+ /* Need 4 vector subtraction because address incremented in
+ the loop before terminating condition check. Also want to
+ reuse code for exit condition before and after the loop. */
+ subq $(4 * VEC_SIZE), %rax
+
+ VPCMP $0, %ZMM1, %ZMM0, %k2
+ kmovq %k2, %rcx
+ testq %rcx, %rcx
+ jnz L(first_vector)
+
+ kmovq %k0, %rcx
+ /* At this point, if k0 is non zero, null char must be in the
+ second vector. */
+ testq %rcx, %rcx
+ jnz L(second_vector)
+
+ VPCMP $0, %ZMM3, %ZMM0, %k3
+ kmovq %k3, %rcx
+ testq %rcx, %rcx
+ jnz L(third_vector)
+ /* At this point null [w]char must be in the fourth vector so no
+ need to check. */
+ kmovq %k1, %rcx
+
+ /* Termination fourth, third, second vector are pretty much
+ same, implemented this way to avoid branching and reuse code
+ from pre loop exit condition. */
+L(fourth_vector):
+ addq $(3 * VEC_SIZE), %rax
+ tzcntq %rcx, %rcx
+ subq %rdi, %rax
+# ifdef USE_AS_WCSLEN
+ sarq $2, %rax
+# endif
+ addq %rcx, %rax
+# ifdef USE_AS_STRNLEN
+ cmpq %rsi, %rax
+ jae L(ret_max)
+# endif
+ ret
+
+L(third_vector):
+ addq $(2 * VEC_SIZE), %rax
+ tzcntq %rcx, %rcx
+ subq %rdi, %rax
+# ifdef USE_AS_WCSLEN
+ sarq $2, %rax
+# endif
+ addq %rcx, %rax
+# ifdef USE_AS_STRNLEN
+ cmpq %rsi, %rax
+ jae L(ret_max)
+# endif
+ ret
+
+L(second_vector):
+ addq $VEC_SIZE, %rax
+L(first_vector):
+ tzcntq %rcx, %rcx
+ subq %rdi, %rax
+# ifdef USE_AS_WCSLEN
+ sarq $2, %rax
+# endif
+ addq %rcx, %rax
+# ifdef USE_AS_STRNLEN
+ cmpq %rsi, %rax
+ jae L(ret_max)
+# endif
+ ret
+
+END (STRLEN)
+#endif
diff --git a/sysdeps/x86_64/multiarch/strnlen-evex512.S b/sysdeps/x86_64/multiarch/strnlen-evex512.S
new file mode 100644
index 0000000000..0b7f220214
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strnlen-evex512.S
@@ -0,0 +1,4 @@
+#define STRLEN __strnlen_evex512
+#define USE_AS_STRNLEN 1
+
+#include "strlen-evex512.S"
diff --git a/sysdeps/x86_64/multiarch/wcslen-evex512.S b/sysdeps/x86_64/multiarch/wcslen-evex512.S
new file mode 100644
index 0000000000..f59c372b78
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcslen-evex512.S
@@ -0,0 +1,4 @@
+#define STRLEN __wcslen_evex512
+#define USE_AS_WCSLEN 1
+
+#include "strlen-evex512.S"
diff --git a/sysdeps/x86_64/multiarch/wcsnlen-evex512.S b/sysdeps/x86_64/multiarch/wcsnlen-evex512.S
new file mode 100644
index 0000000000..73dcf2f210
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsnlen-evex512.S
@@ -0,0 +1,5 @@
+#define STRLEN __wcsnlen_evex512
+#define USE_AS_WCSLEN 1
+#define USE_AS_STRNLEN 1
+
+#include "strlen-evex512.S"
--
2.35.3
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH] x86_64: Implement evex512 version of strlen, strnlen, wcslen and wcsnlen
2022-05-18 18:59 [PATCH] x86_64: Implement evex512 version of strlen, strnlen, wcslen and wcsnlen Sunil K Pandey
@ 2022-05-18 20:29 ` Noah Goldstein
2022-05-19 3:33 ` Sunil Pandey
0 siblings, 1 reply; 12+ messages in thread
From: Noah Goldstein @ 2022-05-18 20:29 UTC (permalink / raw)
To: Sunil K Pandey; +Cc: GNU C Library
On Wed, May 18, 2022 at 1:59 PM Sunil K Pandey via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> This patch implements following evex512 version of string functions.
> Perf gain up to 50% as compared to evex, depending on length and
> alignment.
Can you include a csv (or any consistent fmt really) somewhere of all
the benchmarks
and results of ~10-20 runs and the hardware you're benchmarking on?
>
> - String length function using 512 bit vectors.
> - String N length using 512 bit vectors.
> - Wide string length using 512 bit vectors.
> - Wide string N length using 512 bit vectors.
> ---
> sysdeps/x86_64/multiarch/Makefile | 4 +
> sysdeps/x86_64/multiarch/ifunc-impl-list.c | 20 ++
> sysdeps/x86_64/multiarch/strlen-evex512.S | 291 +++++++++++++++++++++
> sysdeps/x86_64/multiarch/strnlen-evex512.S | 4 +
> sysdeps/x86_64/multiarch/wcslen-evex512.S | 4 +
> sysdeps/x86_64/multiarch/wcsnlen-evex512.S | 5 +
> 6 files changed, 328 insertions(+)
> create mode 100644 sysdeps/x86_64/multiarch/strlen-evex512.S
> create mode 100644 sysdeps/x86_64/multiarch/strnlen-evex512.S
> create mode 100644 sysdeps/x86_64/multiarch/wcslen-evex512.S
> create mode 100644 sysdeps/x86_64/multiarch/wcsnlen-evex512.S
>
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index f3ab5e0928..d0869c3ac3 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -81,6 +81,7 @@ sysdep_routines += \
> strlen-avx2 \
> strlen-avx2-rtm \
> strlen-evex \
> + strlen-evex512 \
> strlen-sse2 \
> strncase_l-avx2 \
> strncase_l-avx2-rtm \
> @@ -105,6 +106,7 @@ sysdep_routines += \
> strnlen-avx2 \
> strnlen-avx2-rtm \
> strnlen-evex \
> + strnlen-evex512 \
> strnlen-sse2 \
> strpbrk-c \
> strpbrk-sse2 \
> @@ -138,6 +140,7 @@ sysdep_routines += \
> wcslen-avx2 \
> wcslen-avx2-rtm \
> wcslen-evex \
> + wcslen-evex512 \
> wcslen-sse2 \
> wcslen-sse4_1 \
> wcsncmp-avx2 \
> @@ -148,6 +151,7 @@ sysdep_routines += \
> wcsnlen-avx2-rtm \
> wcsnlen-c \
> wcsnlen-evex \
> + wcsnlen-evex512 \
> wcsnlen-sse4_1 \
> wcsrchr-avx2 \
> wcsrchr-avx2-rtm \
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index 7218095430..c5cd9466fe 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -328,6 +328,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> && CPU_FEATURE_USABLE (AVX512BW)
> && CPU_FEATURE_USABLE (BMI2)),
> __strlen_evex)
> + IFUNC_IMPL_ADD (array, i, strlen,
> + (CPU_FEATURE_USABLE (AVX512VL)
> + && CPU_FEATURE_USABLE (AVX512BW)
> + && CPU_FEATURE_USABLE (BMI2)),
> + __strlen_evex512)
> IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2))
>
> /* Support sysdeps/x86_64/multiarch/strnlen.c. */
> @@ -346,6 +351,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> && CPU_FEATURE_USABLE (AVX512BW)
> && CPU_FEATURE_USABLE (BMI2)),
> __strnlen_evex)
> + IFUNC_IMPL_ADD (array, i, strnlen,
> + (CPU_FEATURE_USABLE (AVX512VL)
> + && CPU_FEATURE_USABLE (AVX512BW)
> + && CPU_FEATURE_USABLE (BMI2)),
> + __strnlen_evex512)
> IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_sse2))
>
> /* Support sysdeps/x86_64/multiarch/stpncpy.c. */
> @@ -699,6 +709,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> && CPU_FEATURE_USABLE (AVX512BW)
> && CPU_FEATURE_USABLE (BMI2)),
> __wcslen_evex)
> + IFUNC_IMPL_ADD (array, i, wcslen,
> + (CPU_FEATURE_USABLE (AVX512VL)
> + && CPU_FEATURE_USABLE (AVX512BW)
> + && CPU_FEATURE_USABLE (BMI2)),
> + __wcslen_evex512)
> IFUNC_IMPL_ADD (array, i, wcslen,
> CPU_FEATURE_USABLE (SSE4_1),
> __wcslen_sse4_1)
> @@ -720,6 +735,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> && CPU_FEATURE_USABLE (AVX512BW)
> && CPU_FEATURE_USABLE (BMI2)),
> __wcsnlen_evex)
> + IFUNC_IMPL_ADD (array, i, wcsnlen,
> + (CPU_FEATURE_USABLE (AVX512VL)
> + && CPU_FEATURE_USABLE (AVX512BW)
> + && CPU_FEATURE_USABLE (BMI2)),
> + __wcsnlen_evex512)
> IFUNC_IMPL_ADD (array, i, wcsnlen,
> CPU_FEATURE_USABLE (SSE4_1),
> __wcsnlen_sse4_1)
> diff --git a/sysdeps/x86_64/multiarch/strlen-evex512.S b/sysdeps/x86_64/multiarch/strlen-evex512.S
> new file mode 100644
> index 0000000000..13a6b34615
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strlen-evex512.S
> @@ -0,0 +1,291 @@
> +/* Copyright (C) 2022 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#if IS_IN (libc)
> +
> +# include <sysdep.h>
> +
> +# ifndef STRLEN
> +# define STRLEN __strlen_evex512
> +# endif
> +
> +# define VMOVA vmovdqa64
> +# ifdef USE_AS_WCSLEN
> +# define VPCMP vpcmpd
> +# define VPMINU vpminud
> +# define CHAR_SIZE 4
> +# else
> +# define VPCMP vpcmpb
> +# define VPMINU vpminub
> +# define CHAR_SIZE 1
> +# endif
> +
> +# define XMM0 xmm16
> +# define ZMM0 zmm16
> +# define ZMM1 zmm17
> +# define ZMM2 zmm18
> +# define ZMM3 zmm19
> +# define ZMM4 zmm20
> +# define VEC_SIZE 64
> +# define PAGE_SIZE 4096
> +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
Is it possible to integrate this file cleanly with the evex256 version?
Something similar to what we do for memset/memmove.
> +
> + .section .text.evex512, "ax", @progbits
> +/* Aligning entry point to 64 byte, provides better performance for
> + one vector length string. */
> +ENTRY_P2ALIGN (STRLEN, 6)
> +# ifdef USE_AS_STRNLEN
> + /* Check zero length. */
> + test %RSI_LP, %RSI_LP
> + jz L(zero)
> +# ifdef __ILP32__
> + /* Clear the upper 32 bits. */
> + movl %esi, %esi
> +# endif
> +# endif
> +
> + movl %edi, %ecx
> + vpxorq %XMM0, %XMM0, %XMM0
> + andl $(PAGE_SIZE - 1), %ecx
> + cmpl $(PAGE_SIZE - VEC_SIZE), %ecx
> + ja L(page_cross)
> +
> + /* Compare [w]char for null, mask bit will be set for match. */
> + VPCMP $0, (%rdi), %ZMM0, %k0
> + kmovq %k0, %rax
> + testq %rax, %rax
> + jz L(align_more)
> +
> + tzcntq %rax, %rax
> +# ifdef USE_AS_STRNLEN
> + cmpq %rsi, %rax
cmpl
> + jae L(ret_max)
> +# endif
> + ret
> +
> +# ifdef USE_AS_STRNLEN
> + /* eax instead of rax used to save encoding space. */
> +L(zero):
> + xorl %eax, %eax
> + ret
> +# endif
> +
> + /* At this point vector max length reached. */
> +# ifdef USE_AS_STRNLEN
> +L(ret_max):
> + movq %rsi, %rax
> + ret
> +# endif
> +
> +L(page_cross):
Imo unless you need the 2-byte encoding on the jump this should be at
the end of the
file as it's expected to not be hot.
> + andl $(VEC_SIZE - 1), %ecx
> +# ifdef USE_AS_WCSLEN
> + sarl $2, %ecx
> +# endif
> + /* ecx contains number of w[char] to be skipped as a result
> + of address alignment. */
> + movq %rdi, %rax
> + andq $-VEC_SIZE, %rax
> + VPCMP $0, (%rax), %ZMM0, %k0
> + kmovq %k0, %rax
> + /* Ignore number of character for alignment adjustment. */
> + shrq %cl, %rax
> + jz L(align_more)
> +
> + tzcntq %rax, %rax
> +# ifdef USE_AS_STRNLEN
> + cmpq %rsi, %rax
> + jae L(ret_max)
> +# endif
> + ret
> +
> +L(align_more):
> + leaq VEC_SIZE(%rdi), %rax
> + /* Align rax to VEC_SIZE. */
> + andq $-VEC_SIZE, %rax
> +# ifdef USE_AS_STRNLEN
> + movq %rax, %rdx
> + subq %rdi, %rdx
> +# ifdef USE_AS_WCSLEN
> + shrq $2, %rdx
> +# endif
> + /* At this point rdx contains [w]chars already compared. */
> + cmpq %rsi, %rdx
> + jae L(ret_max)
> + subq %rsi, %rdx
> + negq %rdx
> + /* At this point rdx contains number of w[char] needs to go.
> + Now onwards rdx will keep decrementing with each compare. */
> +# endif
> +
> + /* Loop unroll 4 times for 4 vector loop. */
> + VPCMP $0, (%rax), %ZMM0, %k0
> + kmovq %k0, %rcx
> + testq %rcx, %rcx
> + jnz L(first_vector)
Just to keep consistent with the other files can you
rename first_vector/second_vector... to ret_vec_x{N}
or something like that.
> +
> +# ifdef USE_AS_STRNLEN
> + subq $CHAR_PER_VEC, %rdx
> + jbe L(ret_max)
> +# endif
> +
> + VPCMP $0, VEC_SIZE(%rax), %ZMM0, %k0
> + kmovq %k0, %rcx
> + testq %rcx, %rcx
> + jnz L(second_vector)
> +
> +# ifdef USE_AS_STRNLEN
> + subq $CHAR_PER_VEC, %rdx
> + jbe L(ret_max)
> +# endif
The evex256 / avx2 versions do a simple check if we will be able
to do all 4 aligning compares w.o a branch. This saves total
branches. Why not do something similar here?
> +
> + VPCMP $0, (2 * VEC_SIZE)(%rax), %ZMM0, %k0
> + kmovq %k0, %rcx
> + testq %rcx, %rcx
> + jnz L(third_vector)
> +
> +# ifdef USE_AS_STRNLEN
> + subq $CHAR_PER_VEC, %rdx
> + jbe L(ret_max)
> +# endif
> +
> + VPCMP $0, (3 * VEC_SIZE)(%rax), %ZMM0, %k0
> + kmovq %k0, %rcx
> + testq %rcx, %rcx
> + jnz L(fourth_vector)
> +
> + addq $(4 * VEC_SIZE), %rax
> +
> +# ifdef USE_AS_STRNLEN
> + /* Instead of decreasing, rdx increased to prepare for loop
> + first iteration. Incremented 3 times because one increment
> + cancelled by previous decrement. */
> + addq $(3 * CHAR_PER_VEC), %rdx
> +# endif
> +
> + /* Test if address is already 4 * VEC_SIZE byte aligned goto
> + loop. */
> + testq $(3 * VEC_SIZE), %rax
> + jz L(loop)
> +
> + movq %rax, %rcx
> +
> + /* Align address to 4 * VEC_SIZE for loop. */
> + andq $-(4 * VEC_SIZE), %rax
> +
> +# ifdef USE_AS_STRNLEN
> + subq %rax, %rcx
> +# ifdef USE_AS_WCSLEN
> + sarq $2, %rcx
> +# endif
> + /* rcx contains number of [w]char will be recompared due to
> + alignment fixes. rdx must be incremented by rcx to offset
> + alignment adjustmentment. */
> + addq %rcx, %rdx
> +# endif
> +
> +L(loop):
> +# ifdef USE_AS_STRNLEN
> + subq $(CHAR_PER_VEC * 4), %rdx
> + jbe L(ret_max)
we have potential to overread by 255 bytes. Not correctness issue because
we are page aligned by seems like a possible perf issue.
> +# endif
> + /* VPMINU and VPCMP combination provide better perfomance as
> + compared to alternative combinations. */
> + VMOVA (%rax), %ZMM1
> + VPMINU (VEC_SIZE)(%rax), %ZMM1, %ZMM2
> + VMOVA (2 * VEC_SIZE)(%rax), %ZMM3
> + VPMINU (3 * VEC_SIZE)(%rax), %ZMM3, %ZMM4
I think doing 4x in the main loop is probably overkill no?
Aligning to 256 is pretty extreme.
Also I don't think the 4x zmm loads can even keep up with
2x / cycle so seems like it may not be worth wasting up to
255 bytes to get it.
> +
> + VPCMP $0, %ZMM2, %ZMM0, %k0
> + VPCMP $0, %ZMM4, %ZMM0, %k1
> +
> + addq $(4 * VEC_SIZE), %rax
> + kortestq %k0, %k1
> + jz L(loop)
> +
> + /* Need 4 vector subtraction because address incremented in
> + the loop before terminating condition check. Also want to
> + reuse code for exit condition before and after the loop. */
> + subq $(4 * VEC_SIZE), %rax
> +
> + VPCMP $0, %ZMM1, %ZMM0, %k2
> + kmovq %k2, %rcx
> + testq %rcx, %rcx
> + jnz L(first_vector)
> +
> + kmovq %k0, %rcx
> + /* At this point, if k0 is non zero, null char must be in the
> + second vector. */
> + testq %rcx, %rcx
> + jnz L(second_vector)
> +
> + VPCMP $0, %ZMM3, %ZMM0, %k3
> + kmovq %k3, %rcx
> + testq %rcx, %rcx
> + jnz L(third_vector)
> + /* At this point null [w]char must be in the fourth vector so no
> + need to check. */
> + kmovq %k1, %rcx
> +
> + /* Termination fourth, third, second vector are pretty much
> + same, implemented this way to avoid branching and reuse code
> + from pre loop exit condition. */
> +L(fourth_vector):
> + addq $(3 * VEC_SIZE), %rax
> + tzcntq %rcx, %rcx
> + subq %rdi, %rax
Can this be hoisted out to the beginning of L(align_more).
It seems every return path uses it.
> +# ifdef USE_AS_WCSLEN
> + sarq $2, %rax
> +# endif
> + addq %rcx, %rax
if not wcslen probably faster to use lea instead of 2x add
> +# ifdef USE_AS_STRNLEN
> + cmpq %rsi, %rax
> + jae L(ret_max)
> +# endif
> + ret
> +
> +L(third_vector):
> + addq $(2 * VEC_SIZE), %rax
> + tzcntq %rcx, %rcx
> + subq %rdi, %rax
> +# ifdef USE_AS_WCSLEN
> + sarq $2, %rax
> +# endif
> + addq %rcx, %rax
> +# ifdef USE_AS_STRNLEN
> + cmpq %rsi, %rax
> + jae L(ret_max)
> +# endif
> + ret
> +
> +L(second_vector):
> + addq $VEC_SIZE, %rax
> +L(first_vector):
> + tzcntq %rcx, %rcx
> + subq %rdi, %rax
> +# ifdef USE_AS_WCSLEN
> + sarq $2, %rax
> +# endif
> + addq %rcx, %rax
> +# ifdef USE_AS_STRNLEN
> + cmpq %rsi, %rax
> + jae L(ret_max)
> +# endif
> + ret
> +
> +END (STRLEN)
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/strnlen-evex512.S b/sysdeps/x86_64/multiarch/strnlen-evex512.S
> new file mode 100644
> index 0000000000..0b7f220214
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strnlen-evex512.S
> @@ -0,0 +1,4 @@
> +#define STRLEN __strnlen_evex512
> +#define USE_AS_STRNLEN 1
> +
> +#include "strlen-evex512.S"
> diff --git a/sysdeps/x86_64/multiarch/wcslen-evex512.S b/sysdeps/x86_64/multiarch/wcslen-evex512.S
> new file mode 100644
> index 0000000000..f59c372b78
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/wcslen-evex512.S
> @@ -0,0 +1,4 @@
> +#define STRLEN __wcslen_evex512
> +#define USE_AS_WCSLEN 1
> +
> +#include "strlen-evex512.S"
> diff --git a/sysdeps/x86_64/multiarch/wcsnlen-evex512.S b/sysdeps/x86_64/multiarch/wcsnlen-evex512.S
> new file mode 100644
> index 0000000000..73dcf2f210
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/wcsnlen-evex512.S
> @@ -0,0 +1,5 @@
> +#define STRLEN __wcsnlen_evex512
> +#define USE_AS_WCSLEN 1
> +#define USE_AS_STRNLEN 1
> +
> +#include "strlen-evex512.S"
> --
> 2.35.3
>
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH] x86_64: Implement evex512 version of strlen, strnlen, wcslen and wcsnlen
2022-05-18 20:29 ` Noah Goldstein
@ 2022-05-19 3:33 ` Sunil Pandey
2022-05-19 3:48 ` [PATCH v2] " Sunil K Pandey
2022-05-19 4:41 ` [PATCH] " Noah Goldstein
0 siblings, 2 replies; 12+ messages in thread
From: Sunil Pandey @ 2022-05-19 3:33 UTC (permalink / raw)
To: Noah Goldstein; +Cc: GNU C Library
[-- Attachment #1: Type: text/plain, Size: 18531 bytes --]
On Wed, May 18, 2022 at 1:29 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Wed, May 18, 2022 at 1:59 PM Sunil K Pandey via Libc-alpha
> <libc-alpha@sourceware.org> wrote:
> >
> > This patch implements following evex512 version of string functions.
> > Perf gain up to 50% as compared to evex, depending on length and
> > alignment.
>
> Can you include a csv (or any consistent fmt really) somewhere of all
> the benchmarks
> and results of ~10-20 runs and the hardware your benchmarking on?
Machine:
Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz
Fedora 35
Glibc master
Data for 20 iterations of each function is attached; please use any text
editor (e.g., vi) to
access it.
> >
> > - String length function using 512 bit vectors.
> > - String N length using 512 bit vectors.
> > - Wide string length using 512 bit vectors.
> > - Wide string N length using 512 bit vectors.
> > ---
> > sysdeps/x86_64/multiarch/Makefile | 4 +
> > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 20 ++
> > sysdeps/x86_64/multiarch/strlen-evex512.S | 291 +++++++++++++++++++++
> > sysdeps/x86_64/multiarch/strnlen-evex512.S | 4 +
> > sysdeps/x86_64/multiarch/wcslen-evex512.S | 4 +
> > sysdeps/x86_64/multiarch/wcsnlen-evex512.S | 5 +
> > 6 files changed, 328 insertions(+)
> > create mode 100644 sysdeps/x86_64/multiarch/strlen-evex512.S
> > create mode 100644 sysdeps/x86_64/multiarch/strnlen-evex512.S
> > create mode 100644 sysdeps/x86_64/multiarch/wcslen-evex512.S
> > create mode 100644 sysdeps/x86_64/multiarch/wcsnlen-evex512.S
> >
> > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> > index f3ab5e0928..d0869c3ac3 100644
> > --- a/sysdeps/x86_64/multiarch/Makefile
> > +++ b/sysdeps/x86_64/multiarch/Makefile
> > @@ -81,6 +81,7 @@ sysdep_routines += \
> > strlen-avx2 \
> > strlen-avx2-rtm \
> > strlen-evex \
> > + strlen-evex512 \
> > strlen-sse2 \
> > strncase_l-avx2 \
> > strncase_l-avx2-rtm \
> > @@ -105,6 +106,7 @@ sysdep_routines += \
> > strnlen-avx2 \
> > strnlen-avx2-rtm \
> > strnlen-evex \
> > + strnlen-evex512 \
> > strnlen-sse2 \
> > strpbrk-c \
> > strpbrk-sse2 \
> > @@ -138,6 +140,7 @@ sysdep_routines += \
> > wcslen-avx2 \
> > wcslen-avx2-rtm \
> > wcslen-evex \
> > + wcslen-evex512 \
> > wcslen-sse2 \
> > wcslen-sse4_1 \
> > wcsncmp-avx2 \
> > @@ -148,6 +151,7 @@ sysdep_routines += \
> > wcsnlen-avx2-rtm \
> > wcsnlen-c \
> > wcsnlen-evex \
> > + wcsnlen-evex512 \
> > wcsnlen-sse4_1 \
> > wcsrchr-avx2 \
> > wcsrchr-avx2-rtm \
> > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > index 7218095430..c5cd9466fe 100644
> > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > @@ -328,6 +328,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > && CPU_FEATURE_USABLE (AVX512BW)
> > && CPU_FEATURE_USABLE (BMI2)),
> > __strlen_evex)
> > + IFUNC_IMPL_ADD (array, i, strlen,
> > + (CPU_FEATURE_USABLE (AVX512VL)
> > + && CPU_FEATURE_USABLE (AVX512BW)
> > + && CPU_FEATURE_USABLE (BMI2)),
> > + __strlen_evex512)
> > IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2))
> >
> > /* Support sysdeps/x86_64/multiarch/strnlen.c. */
> > @@ -346,6 +351,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > && CPU_FEATURE_USABLE (AVX512BW)
> > && CPU_FEATURE_USABLE (BMI2)),
> > __strnlen_evex)
> > + IFUNC_IMPL_ADD (array, i, strnlen,
> > + (CPU_FEATURE_USABLE (AVX512VL)
> > + && CPU_FEATURE_USABLE (AVX512BW)
> > + && CPU_FEATURE_USABLE (BMI2)),
> > + __strnlen_evex512)
> > IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_sse2))
> >
> > /* Support sysdeps/x86_64/multiarch/stpncpy.c. */
> > @@ -699,6 +709,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > && CPU_FEATURE_USABLE (AVX512BW)
> > && CPU_FEATURE_USABLE (BMI2)),
> > __wcslen_evex)
> > + IFUNC_IMPL_ADD (array, i, wcslen,
> > + (CPU_FEATURE_USABLE (AVX512VL)
> > + && CPU_FEATURE_USABLE (AVX512BW)
> > + && CPU_FEATURE_USABLE (BMI2)),
> > + __wcslen_evex512)
> > IFUNC_IMPL_ADD (array, i, wcslen,
> > CPU_FEATURE_USABLE (SSE4_1),
> > __wcslen_sse4_1)
> > @@ -720,6 +735,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > && CPU_FEATURE_USABLE (AVX512BW)
> > && CPU_FEATURE_USABLE (BMI2)),
> > __wcsnlen_evex)
> > + IFUNC_IMPL_ADD (array, i, wcsnlen,
> > + (CPU_FEATURE_USABLE (AVX512VL)
> > + && CPU_FEATURE_USABLE (AVX512BW)
> > + && CPU_FEATURE_USABLE (BMI2)),
> > + __wcsnlen_evex512)
> > IFUNC_IMPL_ADD (array, i, wcsnlen,
> > CPU_FEATURE_USABLE (SSE4_1),
> > __wcsnlen_sse4_1)
> > diff --git a/sysdeps/x86_64/multiarch/strlen-evex512.S b/sysdeps/x86_64/multiarch/strlen-evex512.S
> > new file mode 100644
> > index 0000000000..13a6b34615
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/strlen-evex512.S
> > @@ -0,0 +1,291 @@
> > +/* Copyright (C) 2022 Free Software Foundation, Inc.
> > + This file is part of the GNU C Library.
> > +
> > + The GNU C Library is free software; you can redistribute it and/or
> > + modify it under the terms of the GNU Lesser General Public
> > + License as published by the Free Software Foundation; either
> > + version 2.1 of the License, or (at your option) any later version.
> > +
> > + The GNU C Library is distributed in the hope that it will be useful,
> > + but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> > + Lesser General Public License for more details.
> > +
> > + You should have received a copy of the GNU Lesser General Public
> > + License along with the GNU C Library; if not, see
> > + <https://www.gnu.org/licenses/>. */
> > +
> > +#if IS_IN (libc)
> > +
> > +# include <sysdep.h>
> > +
> > +# ifndef STRLEN
> > +# define STRLEN __strlen_evex512
> > +# endif
> > +
> > +# define VMOVA vmovdqa64
> > +# ifdef USE_AS_WCSLEN
> > +# define VPCMP vpcmpd
> > +# define VPMINU vpminud
> > +# define CHAR_SIZE 4
> > +# else
> > +# define VPCMP vpcmpb
> > +# define VPMINU vpminub
> > +# define CHAR_SIZE 1
> > +# endif
> > +
> > +# define XMM0 xmm16
> > +# define ZMM0 zmm16
> > +# define ZMM1 zmm17
> > +# define ZMM2 zmm18
> > +# define ZMM3 zmm19
> > +# define ZMM4 zmm20
> > +# define VEC_SIZE 64
> > +# define PAGE_SIZE 4096
> > +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
>
> Is it possible to integrate this file cleanly with the evex256 version?
> Something similar to what we do for memset/memmove.
Good suggestion, I will look into it. For the first iteration, let's
keep it standalone
for now.
> > +
> > + .section .text.evex512, "ax", @progbits
> > +/* Aligning entry point to 64 byte, provides better performance for
> > + one vector length string. */
> > +ENTRY_P2ALIGN (STRLEN, 6)
> > +# ifdef USE_AS_STRNLEN
> > + /* Check zero length. */
> > + test %RSI_LP, %RSI_LP
> > + jz L(zero)
> > +# ifdef __ILP32__
> > + /* Clear the upper 32 bits. */
> > + movl %esi, %esi
> > +# endif
> > +# endif
> > +
> > + movl %edi, %ecx
> > + vpxorq %XMM0, %XMM0, %XMM0
> > + andl $(PAGE_SIZE - 1), %ecx
> > + cmpl $(PAGE_SIZE - VEC_SIZE), %ecx
> > + ja L(page_cross)
> > +
> > + /* Compare [w]char for null, mask bit will be set for match. */
> > + VPCMP $0, (%rdi), %ZMM0, %k0
> > + kmovq %k0, %rax
> > + testq %rax, %rax
> > + jz L(align_more)
> > +
> > + tzcntq %rax, %rax
> > +# ifdef USE_AS_STRNLEN
> > + cmpq %rsi, %rax
> cmpl
>
> > + jae L(ret_max)
> > +# endif
> > + ret
> > +
> > +# ifdef USE_AS_STRNLEN
> > + /* eax instead of rax used to save encoding space. */
> > +L(zero):
> > + xorl %eax, %eax
> > + ret
> > +# endif
> > +
> > + /* At this point vector max length reached. */
> > +# ifdef USE_AS_STRNLEN
> > +L(ret_max):
> > + movq %rsi, %rax
> > + ret
> > +# endif
> > +
> > +L(page_cross):
>
> Imo unless you need the 2-byte encoding on the jump this should be at
> the end of the
> file as its expected to not be hot.
One of my goals is to reduce size as much as possible, as long as it
doesn't hurt performance. Keeping the jump target nearby reduces
the size by a few bytes without hurting performance.
> > + andl $(VEC_SIZE - 1), %ecx
> > +# ifdef USE_AS_WCSLEN
> > + sarl $2, %ecx
> > +# endif
> > + /* ecx contains number of w[char] to be skipped as a result
> > + of address alignment. */
> > + movq %rdi, %rax
> > + andq $-VEC_SIZE, %rax
> > + VPCMP $0, (%rax), %ZMM0, %k0
> > + kmovq %k0, %rax
> > + /* Ignore number of character for alignment adjustment. */
> > + shrq %cl, %rax
> > + jz L(align_more)
> > +
> > + tzcntq %rax, %rax
> > +# ifdef USE_AS_STRNLEN
> > + cmpq %rsi, %rax
> > + jae L(ret_max)
> > +# endif
> > + ret
> > +
> > +L(align_more):
> > + leaq VEC_SIZE(%rdi), %rax
> > + /* Align rax to VEC_SIZE. */
> > + andq $-VEC_SIZE, %rax
> > +# ifdef USE_AS_STRNLEN
> > + movq %rax, %rdx
> > + subq %rdi, %rdx
> > +# ifdef USE_AS_WCSLEN
> > + shrq $2, %rdx
> > +# endif
> > + /* At this point rdx contains [w]chars already compared. */
> > + cmpq %rsi, %rdx
> > + jae L(ret_max)
> > + subq %rsi, %rdx
> > + negq %rdx
> > + /* At this point rdx contains number of w[char] needs to go.
> > + Now onwards rdx will keep decrementing with each compare. */
> > +# endif
> > +
> > + /* Loop unroll 4 times for 4 vector loop. */
> > + VPCMP $0, (%rax), %ZMM0, %k0
> > + kmovq %k0, %rcx
> > + testq %rcx, %rcx
> > + jnz L(first_vector)
>
> Just to keep consistent with the other files can you
> rename first_vector/second_vector... to ret_vec_x{N}
> or something like that.
Agree, will be fixed in v1.
> > +
> > +# ifdef USE_AS_STRNLEN
> > + subq $CHAR_PER_VEC, %rdx
> > + jbe L(ret_max)
> > +# endif
> > +
> > + VPCMP $0, VEC_SIZE(%rax), %ZMM0, %k0
> > + kmovq %k0, %rcx
> > + testq %rcx, %rcx
> > + jnz L(second_vector)
> > +
> > +# ifdef USE_AS_STRNLEN
> > + subq $CHAR_PER_VEC, %rdx
> > + jbe L(ret_max)
> > +# endif
>
> The evex256 / avx2 versions do a simple check if we will be able
> to do all 4 aligning compares w.o a branch. This saves total
> branches. Why not do something similar here?
It is done this way to reduce size and complexity. If the branch is taken, it
jumps to the terminating condition; if the branch is not taken, there is no impact on perf.
> > +
> > + VPCMP $0, (2 * VEC_SIZE)(%rax), %ZMM0, %k0
> > + kmovq %k0, %rcx
> > + testq %rcx, %rcx
> > + jnz L(third_vector)
> > +
> > +# ifdef USE_AS_STRNLEN
> > + subq $CHAR_PER_VEC, %rdx
> > + jbe L(ret_max)
> > +# endif
> > +
> > + VPCMP $0, (3 * VEC_SIZE)(%rax), %ZMM0, %k0
> > + kmovq %k0, %rcx
> > + testq %rcx, %rcx
> > + jnz L(fourth_vector)
> > +
> > + addq $(4 * VEC_SIZE), %rax
> > +
> > +# ifdef USE_AS_STRNLEN
> > + /* Instead of decreasing, rdx increased to prepare for loop
> > + first iteration. Incremented 3 times because one increment
> > + cancelled by previous decrement. */
> > + addq $(3 * CHAR_PER_VEC), %rdx
> > +# endif
> > +
> > + /* Test if address is already 4 * VEC_SIZE byte aligned goto
> > + loop. */
> > + testq $(3 * VEC_SIZE), %rax
> > + jz L(loop)
> > +
> > + movq %rax, %rcx
> > +
> > + /* Align address to 4 * VEC_SIZE for loop. */
> > + andq $-(4 * VEC_SIZE), %rax
> > +
> > +# ifdef USE_AS_STRNLEN
> > + subq %rax, %rcx
> > +# ifdef USE_AS_WCSLEN
> > + sarq $2, %rcx
> > +# endif
> > + /* rcx contains number of [w]char will be recompared due to
> > + alignment fixes. rdx must be incremented by rcx to offset
> > + alignment adjustmentment. */
> > + addq %rcx, %rdx
> > +# endif
> > +
> > +L(loop):
> > +# ifdef USE_AS_STRNLEN
> > + subq $(CHAR_PER_VEC * 4), %rdx
> > + jbe L(ret_max)
>
> we have potential to overread by 255 bytes. Not correctness issue because
> we are page aligned by seems like a possible perf issue.
Correct, but the overread data will be read from cache rather than memory, so it is not a
significant impact; this is the cost we have to pay for 4-vector alignment.
> > +# endif
> > + /* VPMINU and VPCMP combination provide better perfomance as
> > + compared to alternative combinations. */
> > + VMOVA (%rax), %ZMM1
> > + VPMINU (VEC_SIZE)(%rax), %ZMM1, %ZMM2
> > + VMOVA (2 * VEC_SIZE)(%rax), %ZMM3
> > + VPMINU (3 * VEC_SIZE)(%rax), %ZMM3, %ZMM4
>
> I think doing 4x in the main loop is probably overkill no?
> Aligning to 256 is pretty extreme.
>
> Also I don't think the 4x zmm loads can even keep up with
> 2x / cycle so seems like it may not be worth wasting up to
> 255 bytes to get it.
Perf number looks good, so for now it should be ok.
> > +
> > + VPCMP $0, %ZMM2, %ZMM0, %k0
> > + VPCMP $0, %ZMM4, %ZMM0, %k1
> > +
> > + addq $(4 * VEC_SIZE), %rax
> > + kortestq %k0, %k1
> > + jz L(loop)
> > +
> > + /* Need 4 vector subtraction because address incremented in
> > + the loop before terminating condition check. Also want to
> > + reuse code for exit condition before and after the loop. */
> > + subq $(4 * VEC_SIZE), %rax
> > +
> > + VPCMP $0, %ZMM1, %ZMM0, %k2
> > + kmovq %k2, %rcx
> > + testq %rcx, %rcx
> > + jnz L(first_vector)
> > +
> > + kmovq %k0, %rcx
> > + /* At this point, if k0 is non zero, null char must be in the
> > + second vector. */
> > + testq %rcx, %rcx
> > + jnz L(second_vector)
> > +
> > + VPCMP $0, %ZMM3, %ZMM0, %k3
> > + kmovq %k3, %rcx
> > + testq %rcx, %rcx
> > + jnz L(third_vector)
> > + /* At this point null [w]char must be in the fourth vector so no
> > + need to check. */
> > + kmovq %k1, %rcx
> > +
> > + /* Termination fourth, third, second vector are pretty much
> > + same, implemented this way to avoid branching and reuse code
> > + from pre loop exit condition. */
> > +L(fourth_vector):
> > + addq $(3 * VEC_SIZE), %rax
> > + tzcntq %rcx, %rcx
> > + subq %rdi, %rax
> Can this be hoisted out to the begining of L(aligned_more).
> It seems every return path uses it.
>
It really depends on where the control is coming from. So moving before
align_more will not be correct, or I may be missing something here.
> > +# ifdef USE_AS_WCSLEN
> > + sarq $2, %rax
> > +# endif
> > + addq %rcx, %rax
>
> if not wcslen probably faster to use lea instead of 2x add
I'm not sure whether there will be any significant gain from lea vs. add. I used add
because it's readily available on all ports.
>
> > +# ifdef USE_AS_STRNLEN
> > + cmpq %rsi, %rax
> > + jae L(ret_max)
> > +# endif
> > + ret
> > +
> > +L(third_vector):
> > + addq $(2 * VEC_SIZE), %rax
> > + tzcntq %rcx, %rcx
> > + subq %rdi, %rax
> > +# ifdef USE_AS_WCSLEN
> > + sarq $2, %rax
> > +# endif
> > + addq %rcx, %rax
> > +# ifdef USE_AS_STRNLEN
> > + cmpq %rsi, %rax
> > + jae L(ret_max)
> > +# endif
> > + ret
> > +
> > +L(second_vector):
> > + addq $VEC_SIZE, %rax
> > +L(first_vector):
> > + tzcntq %rcx, %rcx
> > + subq %rdi, %rax
> > +# ifdef USE_AS_WCSLEN
> > + sarq $2, %rax
> > +# endif
> > + addq %rcx, %rax
> > +# ifdef USE_AS_STRNLEN
> > + cmpq %rsi, %rax
> > + jae L(ret_max)
> > +# endif
> > + ret
> > +
> > +END (STRLEN)
> > +#endif
> > diff --git a/sysdeps/x86_64/multiarch/strnlen-evex512.S b/sysdeps/x86_64/multiarch/strnlen-evex512.S
> > new file mode 100644
> > index 0000000000..0b7f220214
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/strnlen-evex512.S
> > @@ -0,0 +1,4 @@
> > +#define STRLEN __strnlen_evex512
> > +#define USE_AS_STRNLEN 1
> > +
> > +#include "strlen-evex512.S"
> > diff --git a/sysdeps/x86_64/multiarch/wcslen-evex512.S b/sysdeps/x86_64/multiarch/wcslen-evex512.S
> > new file mode 100644
> > index 0000000000..f59c372b78
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/wcslen-evex512.S
> > @@ -0,0 +1,4 @@
> > +#define STRLEN __wcslen_evex512
> > +#define USE_AS_WCSLEN 1
> > +
> > +#include "strlen-evex512.S"
> > diff --git a/sysdeps/x86_64/multiarch/wcsnlen-evex512.S b/sysdeps/x86_64/multiarch/wcsnlen-evex512.S
> > new file mode 100644
> > index 0000000000..73dcf2f210
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/wcsnlen-evex512.S
> > @@ -0,0 +1,5 @@
> > +#define STRLEN __wcsnlen_evex512
> > +#define USE_AS_WCSLEN 1
> > +#define USE_AS_STRNLEN 1
> > +
> > +#include "strlen-evex512.S"
> > --
> > 2.35.3
> >
[-- Attachment #2: skxdata.tar.gz --]
[-- Type: application/gzip, Size: 57289 bytes --]
^ permalink raw reply [flat|nested] 12+ messages in thread
* [PATCH v2] x86_64: Implement evex512 version of strlen, strnlen, wcslen and wcsnlen
2022-05-19 3:33 ` Sunil Pandey
@ 2022-05-19 3:48 ` Sunil K Pandey
2022-05-19 15:03 ` Noah Goldstein
2022-05-19 4:41 ` [PATCH] " Noah Goldstein
1 sibling, 1 reply; 12+ messages in thread
From: Sunil K Pandey @ 2022-05-19 3:48 UTC (permalink / raw)
To: libc-alpha
This patch implements following evex512 version of string functions.
Perf gain up to 50% as compared to evex, depending on length and
alignment.
- String length function using 512 bit vectors.
- String N length using 512 bit vectors.
- Wide string length using 512 bit vectors.
- Wide string N length using 512 bit vectors.
---
sysdeps/x86_64/multiarch/Makefile | 4 +
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 20 ++
sysdeps/x86_64/multiarch/strlen-evex512.S | 291 +++++++++++++++++++++
sysdeps/x86_64/multiarch/strnlen-evex512.S | 4 +
sysdeps/x86_64/multiarch/wcslen-evex512.S | 4 +
sysdeps/x86_64/multiarch/wcsnlen-evex512.S | 5 +
6 files changed, 328 insertions(+)
create mode 100644 sysdeps/x86_64/multiarch/strlen-evex512.S
create mode 100644 sysdeps/x86_64/multiarch/strnlen-evex512.S
create mode 100644 sysdeps/x86_64/multiarch/wcslen-evex512.S
create mode 100644 sysdeps/x86_64/multiarch/wcsnlen-evex512.S
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index f3ab5e0928..d0869c3ac3 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -81,6 +81,7 @@ sysdep_routines += \
strlen-avx2 \
strlen-avx2-rtm \
strlen-evex \
+ strlen-evex512 \
strlen-sse2 \
strncase_l-avx2 \
strncase_l-avx2-rtm \
@@ -105,6 +106,7 @@ sysdep_routines += \
strnlen-avx2 \
strnlen-avx2-rtm \
strnlen-evex \
+ strnlen-evex512 \
strnlen-sse2 \
strpbrk-c \
strpbrk-sse2 \
@@ -138,6 +140,7 @@ sysdep_routines += \
wcslen-avx2 \
wcslen-avx2-rtm \
wcslen-evex \
+ wcslen-evex512 \
wcslen-sse2 \
wcslen-sse4_1 \
wcsncmp-avx2 \
@@ -148,6 +151,7 @@ sysdep_routines += \
wcsnlen-avx2-rtm \
wcsnlen-c \
wcsnlen-evex \
+ wcsnlen-evex512 \
wcsnlen-sse4_1 \
wcsrchr-avx2 \
wcsrchr-avx2-rtm \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 7218095430..c5cd9466fe 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -328,6 +328,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)),
__strlen_evex)
+ IFUNC_IMPL_ADD (array, i, strlen,
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)
+ && CPU_FEATURE_USABLE (BMI2)),
+ __strlen_evex512)
IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2))
/* Support sysdeps/x86_64/multiarch/strnlen.c. */
@@ -346,6 +351,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)),
__strnlen_evex)
+ IFUNC_IMPL_ADD (array, i, strnlen,
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)
+ && CPU_FEATURE_USABLE (BMI2)),
+ __strnlen_evex512)
IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_sse2))
/* Support sysdeps/x86_64/multiarch/stpncpy.c. */
@@ -699,6 +709,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)),
__wcslen_evex)
+ IFUNC_IMPL_ADD (array, i, wcslen,
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)
+ && CPU_FEATURE_USABLE (BMI2)),
+ __wcslen_evex512)
IFUNC_IMPL_ADD (array, i, wcslen,
CPU_FEATURE_USABLE (SSE4_1),
__wcslen_sse4_1)
@@ -720,6 +735,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)),
__wcsnlen_evex)
+ IFUNC_IMPL_ADD (array, i, wcsnlen,
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)
+ && CPU_FEATURE_USABLE (BMI2)),
+ __wcsnlen_evex512)
IFUNC_IMPL_ADD (array, i, wcsnlen,
CPU_FEATURE_USABLE (SSE4_1),
__wcsnlen_sse4_1)
diff --git a/sysdeps/x86_64/multiarch/strlen-evex512.S b/sysdeps/x86_64/multiarch/strlen-evex512.S
new file mode 100644
index 0000000000..0a2d7bbb1a
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strlen-evex512.S
@@ -0,0 +1,291 @@
+/* Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# ifndef STRLEN
+# define STRLEN __strlen_evex512
+# endif
+
+# define VMOVA vmovdqa64
+# ifdef USE_AS_WCSLEN
+# define VPCMP vpcmpd
+# define VPMINU vpminud
+# define CHAR_SIZE 4
+# else
+# define VPCMP vpcmpb
+# define VPMINU vpminub
+# define CHAR_SIZE 1
+# endif
+
+# define XMM0 xmm16
+# define ZMM0 zmm16
+# define ZMM1 zmm17
+# define ZMM2 zmm18
+# define ZMM3 zmm19
+# define ZMM4 zmm20
+# define VEC_SIZE 64
+# define PAGE_SIZE 4096
+# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
+
+ .section .text.evex512, "ax", @progbits
+/* Aligning entry point to 64 byte, provides better performance for
+ one vector length string. */
+ENTRY_P2ALIGN (STRLEN, 6)
+# ifdef USE_AS_STRNLEN
+ /* Check zero length. */
+ test %RSI_LP, %RSI_LP
+ jz L(zero)
+# ifdef __ILP32__
+ /* Clear the upper 32 bits. */
+ movl %esi, %esi
+# endif
+# endif
+
+ movl %edi, %ecx
+ vpxorq %XMM0, %XMM0, %XMM0
+ andl $(PAGE_SIZE - 1), %ecx
+ cmpl $(PAGE_SIZE - VEC_SIZE), %ecx
+ ja L(page_cross)
+
+ /* Compare [w]char for null, mask bit will be set for match. */
+ VPCMP $0, (%rdi), %ZMM0, %k0
+ kmovq %k0, %rax
+ testq %rax, %rax
+ jz L(align_more)
+
+ tzcntq %rax, %rax
+# ifdef USE_AS_STRNLEN
+ cmpq %rsi, %rax
+ jae L(ret_max)
+# endif
+ ret
+
+# ifdef USE_AS_STRNLEN
+ /* eax instead of rax used to save encoding space. */
+L(zero):
+ xorl %eax, %eax
+ ret
+# endif
+
+ /* At this point vector max length reached. */
+# ifdef USE_AS_STRNLEN
+L(ret_max):
+ movq %rsi, %rax
+ ret
+# endif
+
+L(page_cross):
+ andl $(VEC_SIZE - 1), %ecx
+# ifdef USE_AS_WCSLEN
+ sarl $2, %ecx
+# endif
+ /* ecx contains number of w[char] to be skipped as a result
+ of address alignment. */
+ movq %rdi, %rax
+ andq $-VEC_SIZE, %rax
+ VPCMP $0, (%rax), %ZMM0, %k0
+ kmovq %k0, %rax
+ /* Ignore number of character for alignment adjustment. */
+ shrq %cl, %rax
+ jz L(align_more)
+
+ tzcntq %rax, %rax
+# ifdef USE_AS_STRNLEN
+ cmpq %rsi, %rax
+ jae L(ret_max)
+# endif
+ ret
+
+L(align_more):
+ leaq VEC_SIZE(%rdi), %rax
+ /* Align rax to VEC_SIZE. */
+ andq $-VEC_SIZE, %rax
+# ifdef USE_AS_STRNLEN
+ movq %rax, %rdx
+ subq %rdi, %rdx
+# ifdef USE_AS_WCSLEN
+ shrq $2, %rdx
+# endif
+ /* At this point rdx contains [w]chars already compared. */
+ cmpq %rsi, %rdx
+ jae L(ret_max)
+ subq %rsi, %rdx
+ negq %rdx
+ /* At this point rdx contains number of w[char] needs to go.
+ Now onwards rdx will keep decrementing with each compare. */
+# endif
+
+ /* Loop unroll 4 times for 4 vector loop. */
+ VPCMP $0, (%rax), %ZMM0, %k0
+ kmovq %k0, %rcx
+ testq %rcx, %rcx
+ jnz L(ret_vec_x1)
+
+# ifdef USE_AS_STRNLEN
+ subq $CHAR_PER_VEC, %rdx
+ jbe L(ret_max)
+# endif
+
+ VPCMP $0, VEC_SIZE(%rax), %ZMM0, %k0
+ kmovq %k0, %rcx
+ testq %rcx, %rcx
+ jnz L(ret_vec_x2)
+
+# ifdef USE_AS_STRNLEN
+ subq $CHAR_PER_VEC, %rdx
+ jbe L(ret_max)
+# endif
+
+ VPCMP $0, (2 * VEC_SIZE)(%rax), %ZMM0, %k0
+ kmovq %k0, %rcx
+ testq %rcx, %rcx
+ jnz L(ret_vec_x3)
+
+# ifdef USE_AS_STRNLEN
+ subq $CHAR_PER_VEC, %rdx
+ jbe L(ret_max)
+# endif
+
+ VPCMP $0, (3 * VEC_SIZE)(%rax), %ZMM0, %k0
+ kmovq %k0, %rcx
+ testq %rcx, %rcx
+ jnz L(ret_vec_x4)
+
+ addq $(4 * VEC_SIZE), %rax
+
+# ifdef USE_AS_STRNLEN
+ /* Instead of decreasing, rdx increased to prepare for loop
+ first iteration. Incremented 3 times because one increment
+ cancelled by previous decrement. */
+ addq $(3 * CHAR_PER_VEC), %rdx
+# endif
+
+ /* Test if address is already 4 * VEC_SIZE byte aligned goto
+ loop. */
+ testq $(3 * VEC_SIZE), %rax
+ jz L(loop)
+
+ movq %rax, %rcx
+
+ /* Align address to 4 * VEC_SIZE for loop. */
+ andq $-(4 * VEC_SIZE), %rax
+
+# ifdef USE_AS_STRNLEN
+ subq %rax, %rcx
+# ifdef USE_AS_WCSLEN
+ sarq $2, %rcx
+# endif
+ /* rcx contains number of [w]char will be recompared due to
+ alignment fixes. rdx must be incremented by rcx to offset
+ alignment adjustmentment. */
+ addq %rcx, %rdx
+# endif
+
+L(loop):
+# ifdef USE_AS_STRNLEN
+ subq $(CHAR_PER_VEC * 4), %rdx
+ jbe L(ret_max)
+# endif
+ /* VPMINU and VPCMP combination provide better perfomance as
+ compared to alternative combinations. */
+ VMOVA (%rax), %ZMM1
+ VPMINU (VEC_SIZE)(%rax), %ZMM1, %ZMM2
+ VMOVA (2 * VEC_SIZE)(%rax), %ZMM3
+ VPMINU (3 * VEC_SIZE)(%rax), %ZMM3, %ZMM4
+
+ VPCMP $0, %ZMM2, %ZMM0, %k0
+ VPCMP $0, %ZMM4, %ZMM0, %k1
+
+ addq $(4 * VEC_SIZE), %rax
+ kortestq %k0, %k1
+ jz L(loop)
+
+ /* Need 4 vector subtraction because address incremented in
+ the loop before terminating condition check. Also want to
+ reuse code for exit condition before and after the loop. */
+ subq $(4 * VEC_SIZE), %rax
+
+ VPCMP $0, %ZMM1, %ZMM0, %k2
+ kmovq %k2, %rcx
+ testq %rcx, %rcx
+ jnz L(ret_vec_x1)
+
+ kmovq %k0, %rcx
+ /* At this point, if k0 is non zero, null char must be in the
+ second vector. */
+ testq %rcx, %rcx
+ jnz L(ret_vec_x2)
+
+ VPCMP $0, %ZMM3, %ZMM0, %k3
+ kmovq %k3, %rcx
+ testq %rcx, %rcx
+ jnz L(ret_vec_x3)
+ /* At this point null [w]char must be in the fourth vector so no
+ need to check. */
+ kmovq %k1, %rcx
+
+ /* Termination fourth, third, second vector are pretty much
+ same, implemented this way to avoid branching and reuse code
+ from pre loop exit condition. */
+L(ret_vec_x4):
+ addq $(3 * VEC_SIZE), %rax
+ tzcntq %rcx, %rcx
+ subq %rdi, %rax
+# ifdef USE_AS_WCSLEN
+ sarq $2, %rax
+# endif
+ addq %rcx, %rax
+# ifdef USE_AS_STRNLEN
+ cmpq %rsi, %rax
+ jae L(ret_max)
+# endif
+ ret
+
+L(ret_vec_x3):
+ addq $(2 * VEC_SIZE), %rax
+ tzcntq %rcx, %rcx
+ subq %rdi, %rax
+# ifdef USE_AS_WCSLEN
+ sarq $2, %rax
+# endif
+ addq %rcx, %rax
+# ifdef USE_AS_STRNLEN
+ cmpq %rsi, %rax
+ jae L(ret_max)
+# endif
+ ret
+
+L(ret_vec_x2):
+ addq $VEC_SIZE, %rax
+L(ret_vec_x1):
+ tzcntq %rcx, %rcx
+ subq %rdi, %rax
+# ifdef USE_AS_WCSLEN
+ sarq $2, %rax
+# endif
+ addq %rcx, %rax
+# ifdef USE_AS_STRNLEN
+ cmpq %rsi, %rax
+ jae L(ret_max)
+# endif
+ ret
+
+END (STRLEN)
+#endif
diff --git a/sysdeps/x86_64/multiarch/strnlen-evex512.S b/sysdeps/x86_64/multiarch/strnlen-evex512.S
new file mode 100644
index 0000000000..0b7f220214
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strnlen-evex512.S
@@ -0,0 +1,4 @@
+#define STRLEN __strnlen_evex512
+#define USE_AS_STRNLEN 1
+
+#include "strlen-evex512.S"
diff --git a/sysdeps/x86_64/multiarch/wcslen-evex512.S b/sysdeps/x86_64/multiarch/wcslen-evex512.S
new file mode 100644
index 0000000000..f59c372b78
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcslen-evex512.S
@@ -0,0 +1,4 @@
+#define STRLEN __wcslen_evex512
+#define USE_AS_WCSLEN 1
+
+#include "strlen-evex512.S"
diff --git a/sysdeps/x86_64/multiarch/wcsnlen-evex512.S b/sysdeps/x86_64/multiarch/wcsnlen-evex512.S
new file mode 100644
index 0000000000..73dcf2f210
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsnlen-evex512.S
@@ -0,0 +1,5 @@
+#define STRLEN __wcsnlen_evex512
+#define USE_AS_WCSLEN 1
+#define USE_AS_STRNLEN 1
+
+#include "strlen-evex512.S"
--
2.35.3
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH] x86_64: Implement evex512 version of strlen, strnlen, wcslen and wcsnlen
2022-05-19 3:33 ` Sunil Pandey
2022-05-19 3:48 ` [PATCH v2] " Sunil K Pandey
@ 2022-05-19 4:41 ` Noah Goldstein
1 sibling, 0 replies; 12+ messages in thread
From: Noah Goldstein @ 2022-05-19 4:41 UTC (permalink / raw)
To: Sunil Pandey; +Cc: GNU C Library
On Wed, May 18, 2022 at 10:33 PM Sunil Pandey <skpgkp2@gmail.com> wrote:
>
> On Wed, May 18, 2022 at 1:29 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > On Wed, May 18, 2022 at 1:59 PM Sunil K Pandey via Libc-alpha
> > <libc-alpha@sourceware.org> wrote:
> > >
> > > This patch implements following evex512 version of string functions.
> > > Perf gain up to 50% as compared to evex, depending on length and
> > > alignment.
> >
> > Can you include a csv (or any consistent fmt really) somewhere of all
> > the benchmarks
> > and results of ~10-20 runs and the hardware your benchmarking on?
>
> Machine:
> Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz
> Fedora 35
> Glibc master
>
> 20 iteration data for each function is attached, please use any text
> editor(vi) to
> access it.
Any chance you can aggregate it?
Also can you add collective geometric mean of evex vs evex512 and
cpu info to the commit message.
>
> > >
> > > - String length function using 512 bit vectors.
> > > - String N length using 512 bit vectors.
> > > - Wide string length using 512 bit vectors.
> > > - Wide string N length using 512 bit vectors.
> > > ---
> > > sysdeps/x86_64/multiarch/Makefile | 4 +
> > > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 20 ++
> > > sysdeps/x86_64/multiarch/strlen-evex512.S | 291 +++++++++++++++++++++
> > > sysdeps/x86_64/multiarch/strnlen-evex512.S | 4 +
> > > sysdeps/x86_64/multiarch/wcslen-evex512.S | 4 +
> > > sysdeps/x86_64/multiarch/wcsnlen-evex512.S | 5 +
> > > 6 files changed, 328 insertions(+)
> > > create mode 100644 sysdeps/x86_64/multiarch/strlen-evex512.S
> > > create mode 100644 sysdeps/x86_64/multiarch/strnlen-evex512.S
> > > create mode 100644 sysdeps/x86_64/multiarch/wcslen-evex512.S
> > > create mode 100644 sysdeps/x86_64/multiarch/wcsnlen-evex512.S
> > >
> > > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> > > index f3ab5e0928..d0869c3ac3 100644
> > > --- a/sysdeps/x86_64/multiarch/Makefile
> > > +++ b/sysdeps/x86_64/multiarch/Makefile
> > > @@ -81,6 +81,7 @@ sysdep_routines += \
> > > strlen-avx2 \
> > > strlen-avx2-rtm \
> > > strlen-evex \
> > > + strlen-evex512 \
> > > strlen-sse2 \
> > > strncase_l-avx2 \
> > > strncase_l-avx2-rtm \
> > > @@ -105,6 +106,7 @@ sysdep_routines += \
> > > strnlen-avx2 \
> > > strnlen-avx2-rtm \
> > > strnlen-evex \
> > > + strnlen-evex512 \
> > > strnlen-sse2 \
> > > strpbrk-c \
> > > strpbrk-sse2 \
> > > @@ -138,6 +140,7 @@ sysdep_routines += \
> > > wcslen-avx2 \
> > > wcslen-avx2-rtm \
> > > wcslen-evex \
> > > + wcslen-evex512 \
> > > wcslen-sse2 \
> > > wcslen-sse4_1 \
> > > wcsncmp-avx2 \
> > > @@ -148,6 +151,7 @@ sysdep_routines += \
> > > wcsnlen-avx2-rtm \
> > > wcsnlen-c \
> > > wcsnlen-evex \
> > > + wcsnlen-evex512 \
> > > wcsnlen-sse4_1 \
> > > wcsrchr-avx2 \
> > > wcsrchr-avx2-rtm \
> > > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > index 7218095430..c5cd9466fe 100644
> > > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > @@ -328,6 +328,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > > && CPU_FEATURE_USABLE (AVX512BW)
> > > && CPU_FEATURE_USABLE (BMI2)),
> > > __strlen_evex)
> > > + IFUNC_IMPL_ADD (array, i, strlen,
> > > + (CPU_FEATURE_USABLE (AVX512VL)
> > > + && CPU_FEATURE_USABLE (AVX512BW)
> > > + && CPU_FEATURE_USABLE (BMI2)),
> > > + __strlen_evex512)
> > > IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2))
> > >
> > > /* Support sysdeps/x86_64/multiarch/strnlen.c. */
> > > @@ -346,6 +351,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > > && CPU_FEATURE_USABLE (AVX512BW)
> > > && CPU_FEATURE_USABLE (BMI2)),
> > > __strnlen_evex)
> > > + IFUNC_IMPL_ADD (array, i, strnlen,
> > > + (CPU_FEATURE_USABLE (AVX512VL)
> > > + && CPU_FEATURE_USABLE (AVX512BW)
> > > + && CPU_FEATURE_USABLE (BMI2)),
> > > + __strnlen_evex512)
> > > IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_sse2))
> > >
> > > /* Support sysdeps/x86_64/multiarch/stpncpy.c. */
> > > @@ -699,6 +709,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > > && CPU_FEATURE_USABLE (AVX512BW)
> > > && CPU_FEATURE_USABLE (BMI2)),
> > > __wcslen_evex)
> > > + IFUNC_IMPL_ADD (array, i, wcslen,
> > > + (CPU_FEATURE_USABLE (AVX512VL)
> > > + && CPU_FEATURE_USABLE (AVX512BW)
> > > + && CPU_FEATURE_USABLE (BMI2)),
> > > + __wcslen_evex512)
> > > IFUNC_IMPL_ADD (array, i, wcslen,
> > > CPU_FEATURE_USABLE (SSE4_1),
> > > __wcslen_sse4_1)
> > > @@ -720,6 +735,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > > && CPU_FEATURE_USABLE (AVX512BW)
> > > && CPU_FEATURE_USABLE (BMI2)),
> > > __wcsnlen_evex)
> > > + IFUNC_IMPL_ADD (array, i, wcsnlen,
> > > + (CPU_FEATURE_USABLE (AVX512VL)
> > > + && CPU_FEATURE_USABLE (AVX512BW)
> > > + && CPU_FEATURE_USABLE (BMI2)),
> > > + __wcsnlen_evex512)
> > > IFUNC_IMPL_ADD (array, i, wcsnlen,
> > > CPU_FEATURE_USABLE (SSE4_1),
> > > __wcsnlen_sse4_1)
> > > diff --git a/sysdeps/x86_64/multiarch/strlen-evex512.S b/sysdeps/x86_64/multiarch/strlen-evex512.S
> > > new file mode 100644
> > > index 0000000000..13a6b34615
> > > --- /dev/null
> > > +++ b/sysdeps/x86_64/multiarch/strlen-evex512.S
> > > @@ -0,0 +1,291 @@
> > > +/* Copyright (C) 2022 Free Software Foundation, Inc.
> > > + This file is part of the GNU C Library.
> > > +
> > > + The GNU C Library is free software; you can redistribute it and/or
> > > + modify it under the terms of the GNU Lesser General Public
> > > + License as published by the Free Software Foundation; either
> > > + version 2.1 of the License, or (at your option) any later version.
> > > +
> > > + The GNU C Library is distributed in the hope that it will be useful,
> > > + but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> > > + Lesser General Public License for more details.
> > > +
> > > + You should have received a copy of the GNU Lesser General Public
> > > + License along with the GNU C Library; if not, see
> > > + <https://www.gnu.org/licenses/>. */
> > > +
> > > +#if IS_IN (libc)
> > > +
> > > +# include <sysdep.h>
> > > +
> > > +# ifndef STRLEN
> > > +# define STRLEN __strlen_evex512
> > > +# endif
> > > +
> > > +# define VMOVA vmovdqa64
> > > +# ifdef USE_AS_WCSLEN
> > > +# define VPCMP vpcmpd
> > > +# define VPMINU vpminud
> > > +# define CHAR_SIZE 4
> > > +# else
> > > +# define VPCMP vpcmpb
> > > +# define VPMINU vpminub
> > > +# define CHAR_SIZE 1
> > > +# endif
> > > +
> > > +# define XMM0 xmm16
> > > +# define ZMM0 zmm16
> > > +# define ZMM1 zmm17
> > > +# define ZMM2 zmm18
> > > +# define ZMM3 zmm19
> > > +# define ZMM4 zmm20
> > > +# define VEC_SIZE 64
> > > +# define PAGE_SIZE 4096
> > > +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
> >
> > Is it possible to integrate this file cleanly with the evex256 version?
> > Something similar to what we do for memset/memmove.
>
> Good suggestion, I will look into it. For the first iteration, let's
> keep it standalone
> for now.
Why? There are a fair amount of functions. Given that evex/evex512
are just about 1-1 except VEC_SIZE we should try to integrate.
>
> > > +
> > > + .section .text.evex512, "ax", @progbits
> > > +/* Aligning entry point to 64 byte, provides better performance for
> > > + one vector length string. */
> > > +ENTRY_P2ALIGN (STRLEN, 6)
> > > +# ifdef USE_AS_STRNLEN
> > > + /* Check zero length. */
> > > + test %RSI_LP, %RSI_LP
> > > + jz L(zero)
> > > +# ifdef __ILP32__
> > > + /* Clear the upper 32 bits. */
> > > + movl %esi, %esi
> > > +# endif
> > > +# endif
> > > +
> > > + movl %edi, %ecx
> > > + vpxorq %XMM0, %XMM0, %XMM0
> > > + andl $(PAGE_SIZE - 1), %ecx
> > > + cmpl $(PAGE_SIZE - VEC_SIZE), %ecx
> > > + ja L(page_cross)
> > > +
> > > + /* Compare [w]char for null, mask bit will be set for match. */
> > > + VPCMP $0, (%rdi), %ZMM0, %k0
> > > + kmovq %k0, %rax
> > > + testq %rax, %rax
> > > + jz L(align_more)
> > > +
> > > + tzcntq %rax, %rax
> > > +# ifdef USE_AS_STRNLEN
> > > + cmpq %rsi, %rax
> > cmpl
> >
> > > + jae L(ret_max)
> > > +# endif
> > > + ret
> > > +
> > > +# ifdef USE_AS_STRNLEN
> > > + /* eax instead of rax used to save encoding space. */
> > > +L(zero):
> > > + xorl %eax, %eax
> > > + ret
> > > +# endif
> > > +
> > > + /* At this point vector max length reached. */
> > > +# ifdef USE_AS_STRNLEN
> > > +L(ret_max):
> > > + movq %rsi, %rax
> > > + ret
> > > +# endif
> > > +
> > > +L(page_cross):
> >
> > Imo unless you need the 2-byte encoding on the jump this should be at
> > the end of the
> > file as its expected to not be hot.
>
> One of my goal, to reduce size as much as possible, as long as it
> doesn't hurt performance. Keeping the jump target nearby reduces
> size by a few bytes, without hurting performance.
Fair enough, although the page cross cases are pretty cold. Putting cold
code in a hot region is a waste in a sense too.
>
> > > + andl $(VEC_SIZE - 1), %ecx
> > > +# ifdef USE_AS_WCSLEN
> > > + sarl $2, %ecx
> > > +# endif
> > > + /* ecx contains number of w[char] to be skipped as a result
> > > + of address alignment. */
> > > + movq %rdi, %rax
> > > + andq $-VEC_SIZE, %rax
> > > + VPCMP $0, (%rax), %ZMM0, %k0
> > > + kmovq %k0, %rax
> > > + /* Ignore number of character for alignment adjustment. */
> > > + shrq %cl, %rax
> > > + jz L(align_more)
> > > +
> > > + tzcntq %rax, %rax
> > > +# ifdef USE_AS_STRNLEN
> > > + cmpq %rsi, %rax
> > > + jae L(ret_max)
> > > +# endif
> > > + ret
> > > +
> > > +L(align_more):
> > > + leaq VEC_SIZE(%rdi), %rax
> > > + /* Align rax to VEC_SIZE. */
> > > + andq $-VEC_SIZE, %rax
> > > +# ifdef USE_AS_STRNLEN
> > > + movq %rax, %rdx
> > > + subq %rdi, %rdx
> > > +# ifdef USE_AS_WCSLEN
> > > + shrq $2, %rdx
> > > +# endif
> > > + /* At this point rdx contains [w]chars already compared. */
> > > + cmpq %rsi, %rdx
> > > + jae L(ret_max)
> > > + subq %rsi, %rdx
> > > + negq %rdx
> > > + /* At this point rdx contains number of w[char] needs to go.
> > > + Now onwards rdx will keep decrementing with each compare. */
> > > +# endif
> > > +
> > > + /* Loop unroll 4 times for 4 vector loop. */
> > > + VPCMP $0, (%rax), %ZMM0, %k0
> > > + kmovq %k0, %rcx
> > > + testq %rcx, %rcx
> > > + jnz L(first_vector)
> >
> > Just to keep consistent with the other files can you
> > rename first_vector/second_vector... to ret_vec_x{N}
> > or something like that.
>
> Agree, will be fixed in v1.
>
> > > +
> > > +# ifdef USE_AS_STRNLEN
> > > + subq $CHAR_PER_VEC, %rdx
> > > + jbe L(ret_max)
> > > +# endif
> > > +
> > > + VPCMP $0, VEC_SIZE(%rax), %ZMM0, %k0
> > > + kmovq %k0, %rcx
> > > + testq %rcx, %rcx
> > > + jnz L(second_vector)
> > > +
> > > +# ifdef USE_AS_STRNLEN
> > > + subq $CHAR_PER_VEC, %rdx
> > > + jbe L(ret_max)
> > > +# endif
> >
> > The evex256 / avx2 versions do a simple check if we will be able
> > to do all 4 aligning compares w.o a branch. This saves total
> > branches. Why not do something similar here?
>
> Done this way to reduce size and complexity. Branch taken, will
> jump to terminating condition. Branch not taken has no impact on perf.
Don't think that's quite true...
>
> > > +
> > > + VPCMP $0, (2 * VEC_SIZE)(%rax), %ZMM0, %k0
> > > + kmovq %k0, %rcx
> > > + testq %rcx, %rcx
> > > + jnz L(third_vector)
> > > +
> > > +# ifdef USE_AS_STRNLEN
> > > + subq $CHAR_PER_VEC, %rdx
> > > + jbe L(ret_max)
> > > +# endif
> > > +
> > > + VPCMP $0, (3 * VEC_SIZE)(%rax), %ZMM0, %k0
> > > + kmovq %k0, %rcx
> > > + testq %rcx, %rcx
> > > + jnz L(fourth_vector)
> > > +
> > > + addq $(4 * VEC_SIZE), %rax
> > > +
> > > +# ifdef USE_AS_STRNLEN
> > > + /* Instead of decreasing, rdx increased to prepare for loop
> > > + first iteration. Incremented 3 times because one increment
> > > + cancelled by previous decrement. */
> > > + addq $(3 * CHAR_PER_VEC), %rdx
> > > +# endif
> > > +
> > > + /* Test if address is already 4 * VEC_SIZE byte aligned goto
> > > + loop. */
> > > + testq $(3 * VEC_SIZE), %rax
> > > + jz L(loop)
> > > +
> > > + movq %rax, %rcx
> > > +
> > > + /* Align address to 4 * VEC_SIZE for loop. */
> > > + andq $-(4 * VEC_SIZE), %rax
> > > +
> > > +# ifdef USE_AS_STRNLEN
> > > + subq %rax, %rcx
> > > +# ifdef USE_AS_WCSLEN
> > > + sarq $2, %rcx
> > > +# endif
> > > + /* rcx contains number of [w]chars that will be recompared due to
> > > + alignment fixes. rdx must be incremented by rcx to offset
> > > + alignment adjustment. */
> > > + addq %rcx, %rdx
> > > +# endif
> > > +
> > > +L(loop):
> > > +# ifdef USE_AS_STRNLEN
> > > + subq $(CHAR_PER_VEC * 4), %rdx
> > > + jbe L(ret_max)
> >
> > we have potential to overread by 255 bytes. Not correctness issue because
> > we are page aligned by seems like a possible perf issue.
>
> Correct, but overread data will be read from cache not memory, not a
> significant impact, but this is the cost we have to pay for 4 vector alignments.
You can implement it so it has a last 4x case instead.
>
> > > +# endif
> > > + /* VPMINU and VPCMP combination provide better performance as
> > > + compared to alternative combinations. */
> > > + VMOVA (%rax), %ZMM1
> > > + VPMINU (VEC_SIZE)(%rax), %ZMM1, %ZMM2
> > > + VMOVA (2 * VEC_SIZE)(%rax), %ZMM3
> > > + VPMINU (3 * VEC_SIZE)(%rax), %ZMM3, %ZMM4
> >
> > I think doing 4x in the main loop is probably overkill no?
> > Aligning to 256 is pretty extreme.
> >
> > Also I don't think the 4x zmm loads can even keep up with
> > 2x / cycle so seems like it may not be worth wasting up to
> > 255 bytes to get it.
>
> Perf number looks good, so for now it should be ok.
Would prefer having a good final version.
>
> > > +
> > > + VPCMP $0, %ZMM2, %ZMM0, %k0
> > > + VPCMP $0, %ZMM4, %ZMM0, %k1
> > > +
> > > + addq $(4 * VEC_SIZE), %rax
> > > + kortestq %k0, %k1
> > > + jz L(loop)
> > > +
> > > + /* Need 4 vector subtraction because address incremented in
> > > + the loop before terminating condition check. Also want to
> > > + reuse code for exit condition before and after the loop. */
> > > + subq $(4 * VEC_SIZE), %rax
> > > +
> > > + VPCMP $0, %ZMM1, %ZMM0, %k2
> > > + kmovq %k2, %rcx
> > > + testq %rcx, %rcx
> > > + jnz L(first_vector)
> > > +
> > > + kmovq %k0, %rcx
> > > + /* At this point, if k0 is non zero, null char must be in the
> > > + second vector. */
> > > + testq %rcx, %rcx
> > > + jnz L(second_vector)
> > > +
> > > + VPCMP $0, %ZMM3, %ZMM0, %k3
> > > + kmovq %k3, %rcx
> > > + testq %rcx, %rcx
> > > + jnz L(third_vector)
> > > + /* At this point null [w]char must be in the fourth vector so no
> > > + need to check. */
> > > + kmovq %k1, %rcx
> > > +
> > > + /* Termination fourth, third, second vector are pretty much
> > > + same, implemented this way to avoid branching and reuse code
> > > + from pre loop exit condition. */
> > > +L(fourth_vector):
> > > + addq $(3 * VEC_SIZE), %rax
> > > + tzcntq %rcx, %rcx
> > > + subq %rdi, %rax
> > Can this be hoisted out to the begining of L(aligned_more).
> > It seems every return path uses it.
> >
>
> It really depends on where the control is coming from. So moving before
> align_more will not be correct, or I may be missing something here.
Is there any path from the *begining* of L(aligned_more) that
doesn't go to either L(ret_max) or one of the 4 return statements?
>
> > > +# ifdef USE_AS_WCSLEN
> > > + sarq $2, %rax
> > > +# endif
> > > + addq %rcx, %rax
> >
> > if not wcslen probably faster to use lea instead of 2x add
>
> I'm not sure whether there will be any significant gain. lea vs add. Used add
> because it's readily available on all ports.
AFAIK all machines we would enable evex512 on have fast LEA.
>
>
> >
> > > +# ifdef USE_AS_STRNLEN
> > > + cmpq %rsi, %rax
> > > + jae L(ret_max)
> > > +# endif
> > > + ret
> > > +
> > > +L(third_vector):
> > > + addq $(2 * VEC_SIZE), %rax
> > > + tzcntq %rcx, %rcx
> > > + subq %rdi, %rax
> > > +# ifdef USE_AS_WCSLEN
> > > + sarq $2, %rax
> > > +# endif
> > > + addq %rcx, %rax
> > > +# ifdef USE_AS_STRNLEN
> > > + cmpq %rsi, %rax
> > > + jae L(ret_max)
> > > +# endif
> > > + ret
> > > +
> > > +L(second_vector):
> > > + addq $VEC_SIZE, %rax
> > > +L(first_vector):
> > > + tzcntq %rcx, %rcx
> > > + subq %rdi, %rax
> > > +# ifdef USE_AS_WCSLEN
> > > + sarq $2, %rax
> > > +# endif
> > > + addq %rcx, %rax
> > > +# ifdef USE_AS_STRNLEN
> > > + cmpq %rsi, %rax
> > > + jae L(ret_max)
> > > +# endif
> > > + ret
> > > +
> > > +END (STRLEN)
> > > +#endif
> > > diff --git a/sysdeps/x86_64/multiarch/strnlen-evex512.S b/sysdeps/x86_64/multiarch/strnlen-evex512.S
> > > new file mode 100644
> > > index 0000000000..0b7f220214
> > > --- /dev/null
> > > +++ b/sysdeps/x86_64/multiarch/strnlen-evex512.S
> > > @@ -0,0 +1,4 @@
> > > +#define STRLEN __strnlen_evex512
> > > +#define USE_AS_STRNLEN 1
> > > +
> > > +#include "strlen-evex512.S"
> > > diff --git a/sysdeps/x86_64/multiarch/wcslen-evex512.S b/sysdeps/x86_64/multiarch/wcslen-evex512.S
> > > new file mode 100644
> > > index 0000000000..f59c372b78
> > > --- /dev/null
> > > +++ b/sysdeps/x86_64/multiarch/wcslen-evex512.S
> > > @@ -0,0 +1,4 @@
> > > +#define STRLEN __wcslen_evex512
> > > +#define USE_AS_WCSLEN 1
> > > +
> > > +#include "strlen-evex512.S"
> > > diff --git a/sysdeps/x86_64/multiarch/wcsnlen-evex512.S b/sysdeps/x86_64/multiarch/wcsnlen-evex512.S
> > > new file mode 100644
> > > index 0000000000..73dcf2f210
> > > --- /dev/null
> > > +++ b/sysdeps/x86_64/multiarch/wcsnlen-evex512.S
> > > @@ -0,0 +1,5 @@
> > > +#define STRLEN __wcsnlen_evex512
> > > +#define USE_AS_WCSLEN 1
> > > +#define USE_AS_STRNLEN 1
> > > +
> > > +#include "strlen-evex512.S"
> > > --
> > > 2.35.3
> > >
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH v2] x86_64: Implement evex512 version of strlen, strnlen, wcslen and wcsnlen
2022-05-19 3:48 ` [PATCH v2] " Sunil K Pandey
@ 2022-05-19 15:03 ` Noah Goldstein
2022-05-25 13:43 ` [PATCH v3] " Sunil K Pandey
0 siblings, 1 reply; 12+ messages in thread
From: Noah Goldstein @ 2022-05-19 15:03 UTC (permalink / raw)
To: Sunil K Pandey; +Cc: GNU C Library
On Wed, May 18, 2022 at 10:48 PM Sunil K Pandey via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> This patch implements following evex512 version of string functions.
> Perf gain up to 50% as compared to evex, depending on length and
> alignment.
>
> - String length function using 512 bit vectors.
> - String N length using 512 bit vectors.
> - Wide string length using 512 bit vectors.
> - Wide string N length using 512 bit vectors.
> ---
> sysdeps/x86_64/multiarch/Makefile | 4 +
> sysdeps/x86_64/multiarch/ifunc-impl-list.c | 20 ++
> sysdeps/x86_64/multiarch/strlen-evex512.S | 291 +++++++++++++++++++++
> sysdeps/x86_64/multiarch/strnlen-evex512.S | 4 +
> sysdeps/x86_64/multiarch/wcslen-evex512.S | 4 +
> sysdeps/x86_64/multiarch/wcsnlen-evex512.S | 5 +
> 6 files changed, 328 insertions(+)
> create mode 100644 sysdeps/x86_64/multiarch/strlen-evex512.S
> create mode 100644 sysdeps/x86_64/multiarch/strnlen-evex512.S
> create mode 100644 sysdeps/x86_64/multiarch/wcslen-evex512.S
> create mode 100644 sysdeps/x86_64/multiarch/wcsnlen-evex512.S
>
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index f3ab5e0928..d0869c3ac3 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -81,6 +81,7 @@ sysdep_routines += \
> strlen-avx2 \
> strlen-avx2-rtm \
> strlen-evex \
> + strlen-evex512 \
> strlen-sse2 \
> strncase_l-avx2 \
> strncase_l-avx2-rtm \
> @@ -105,6 +106,7 @@ sysdep_routines += \
> strnlen-avx2 \
> strnlen-avx2-rtm \
> strnlen-evex \
> + strnlen-evex512 \
> strnlen-sse2 \
> strpbrk-c \
> strpbrk-sse2 \
> @@ -138,6 +140,7 @@ sysdep_routines += \
> wcslen-avx2 \
> wcslen-avx2-rtm \
> wcslen-evex \
> + wcslen-evex512 \
> wcslen-sse2 \
> wcslen-sse4_1 \
> wcsncmp-avx2 \
> @@ -148,6 +151,7 @@ sysdep_routines += \
> wcsnlen-avx2-rtm \
> wcsnlen-c \
> wcsnlen-evex \
> + wcsnlen-evex512 \
> wcsnlen-sse4_1 \
> wcsrchr-avx2 \
> wcsrchr-avx2-rtm \
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index 7218095430..c5cd9466fe 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -328,6 +328,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> && CPU_FEATURE_USABLE (AVX512BW)
> && CPU_FEATURE_USABLE (BMI2)),
> __strlen_evex)
> + IFUNC_IMPL_ADD (array, i, strlen,
> + (CPU_FEATURE_USABLE (AVX512VL)
> + && CPU_FEATURE_USABLE (AVX512BW)
> + && CPU_FEATURE_USABLE (BMI2)),
> + __strlen_evex512)
> IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2))
>
> /* Support sysdeps/x86_64/multiarch/strnlen.c. */
> @@ -346,6 +351,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> && CPU_FEATURE_USABLE (AVX512BW)
> && CPU_FEATURE_USABLE (BMI2)),
> __strnlen_evex)
> + IFUNC_IMPL_ADD (array, i, strnlen,
> + (CPU_FEATURE_USABLE (AVX512VL)
> + && CPU_FEATURE_USABLE (AVX512BW)
> + && CPU_FEATURE_USABLE (BMI2)),
> + __strnlen_evex512)
> IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_sse2))
>
> /* Support sysdeps/x86_64/multiarch/stpncpy.c. */
> @@ -699,6 +709,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> && CPU_FEATURE_USABLE (AVX512BW)
> && CPU_FEATURE_USABLE (BMI2)),
> __wcslen_evex)
> + IFUNC_IMPL_ADD (array, i, wcslen,
> + (CPU_FEATURE_USABLE (AVX512VL)
> + && CPU_FEATURE_USABLE (AVX512BW)
> + && CPU_FEATURE_USABLE (BMI2)),
> + __wcslen_evex512)
> IFUNC_IMPL_ADD (array, i, wcslen,
> CPU_FEATURE_USABLE (SSE4_1),
> __wcslen_sse4_1)
> @@ -720,6 +735,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> && CPU_FEATURE_USABLE (AVX512BW)
> && CPU_FEATURE_USABLE (BMI2)),
> __wcsnlen_evex)
> + IFUNC_IMPL_ADD (array, i, wcsnlen,
> + (CPU_FEATURE_USABLE (AVX512VL)
> + && CPU_FEATURE_USABLE (AVX512BW)
> + && CPU_FEATURE_USABLE (BMI2)),
> + __wcsnlen_evex512)
> IFUNC_IMPL_ADD (array, i, wcsnlen,
> CPU_FEATURE_USABLE (SSE4_1),
> __wcsnlen_sse4_1)
> diff --git a/sysdeps/x86_64/multiarch/strlen-evex512.S b/sysdeps/x86_64/multiarch/strlen-evex512.S
> new file mode 100644
> index 0000000000..0a2d7bbb1a
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strlen-evex512.S
> @@ -0,0 +1,291 @@
> +/* Copyright (C) 2022 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#if IS_IN (libc)
> +
> +# include <sysdep.h>
> +
> +# ifndef STRLEN
> +# define STRLEN __strlen_evex512
> +# endif
> +
> +# define VMOVA vmovdqa64
> +# ifdef USE_AS_WCSLEN
> +# define VPCMP vpcmpd
> +# define VPMINU vpminud
> +# define CHAR_SIZE 4
> +# else
> +# define VPCMP vpcmpb
> +# define VPMINU vpminub
> +# define CHAR_SIZE 1
> +# endif
> +
> +# define XMM0 xmm16
> +# define ZMM0 zmm16
> +# define ZMM1 zmm17
> +# define ZMM2 zmm18
> +# define ZMM3 zmm19
> +# define ZMM4 zmm20
> +# define VEC_SIZE 64
> +# define PAGE_SIZE 4096
> +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
> +
> + .section .text.evex512, "ax", @progbits
> +/* Aligning entry point to 64 byte, provides better performance for
> + one vector length string. */
> +ENTRY_P2ALIGN (STRLEN, 6)
> +# ifdef USE_AS_STRNLEN
> + /* Check zero length. */
> + test %RSI_LP, %RSI_LP
> + jz L(zero)
> +# ifdef __ILP32__
> + /* Clear the upper 32 bits. */
> + movl %esi, %esi
> +# endif
> +# endif
> +
> + movl %edi, %ecx
> + vpxorq %XMM0, %XMM0, %XMM0
> + andl $(PAGE_SIZE - 1), %ecx
> + cmpl $(PAGE_SIZE - VEC_SIZE), %ecx
Use eax instead of ecx here to save more code size.
> + ja L(page_cross)
> +
> + /* Compare [w]char for null, mask bit will be set for match. */
> + VPCMP $0, (%rdi), %ZMM0, %k0
> + kmovq %k0, %rax
> + testq %rax, %rax
> + jz L(align_more)
> +
> + tzcntq %rax, %rax
Replace tzcnt with bsf to save code size
> +# ifdef USE_AS_STRNLEN
> + cmpq %rsi, %rax
> + jae L(ret_max)
> +# endif
> + ret
> +
> +# ifdef USE_AS_STRNLEN
> + /* eax instead of rax used to save encoding space. */
> +L(zero):
> + xorl %eax, %eax
> + ret
> +# endif
> +
> + /* At this point vector max length reached. */
> +# ifdef USE_AS_STRNLEN
> +L(ret_max):
> + movq %rsi, %rax
> + ret
> +# endif
> +
> +L(page_cross):
> + andl $(VEC_SIZE - 1), %ecx
not needed — shifts automatically use only the bits in range
> +# ifdef USE_AS_WCSLEN
> + sarl $2, %ecx
> +# endif
> + /* ecx contains number of w[char] to be skipped as a result
> + of address alignment. */
> + movq %rdi, %rax
> + andq $-VEC_SIZE, %rax
You can save further code size doing
`xorq %rdi, %rcx`; VPCMP $0, (PAGE_SIZE - VEC_SIZE)(%rcx)...`
then use `rdi` for the shift.
> + VPCMP $0, (%rax), %ZMM0, %k0
> + kmovq %k0, %rax
> + /* Ignore number of character for alignment adjustment. */
> + shrq %cl, %rax
> + jz L(align_more)
> +
> + tzcntq %rax, %rax
> +# ifdef USE_AS_STRNLEN
> + cmpq %rsi, %rax
> + jae L(ret_max)
> +# endif
> + ret
> +
> +L(align_more):
> + leaq VEC_SIZE(%rdi), %rax
> + /* Align rax to VEC_SIZE. */
> + andq $-VEC_SIZE, %rax
> +# ifdef USE_AS_STRNLEN
> + movq %rax, %rdx
> + subq %rdi, %rdx
> +# ifdef USE_AS_WCSLEN
> + shrq $2, %rdx
> +# endif
> + /* At this point rdx contains [w]chars already compared. */
> + cmpq %rsi, %rdx
You `subq` next inst so just do the comparison with `subq`.
> + jae L(ret_max)
> + subq %rsi, %rdx
> + negq %rdx
> + /* At this point rdx contains number of w[char] needs to go.
> + Now onwards rdx will keep decrementing with each compare. */
> +# endif
> +
> + /* Loop unroll 4 times for 4 vector loop. */
> + VPCMP $0, (%rax), %ZMM0, %k0
> + kmovq %k0, %rcx
> + testq %rcx, %rcx
> + jnz L(ret_vec_x1)
> +
> +# ifdef USE_AS_STRNLEN
> + subq $CHAR_PER_VEC, %rdx
> + jbe L(ret_max)
> +# endif
> +
> + VPCMP $0, VEC_SIZE(%rax), %ZMM0, %k0
> + kmovq %k0, %rcx
> + testq %rcx, %rcx
> + jnz L(ret_vec_x2)
> +
> +# ifdef USE_AS_STRNLEN
> + subq $CHAR_PER_VEC, %rdx
> + jbe L(ret_max)
> +# endif
> +
> + VPCMP $0, (2 * VEC_SIZE)(%rax), %ZMM0, %k0
> + kmovq %k0, %rcx
> + testq %rcx, %rcx
> + jnz L(ret_vec_x3)
> +
> +# ifdef USE_AS_STRNLEN
> + subq $CHAR_PER_VEC, %rdx
> + jbe L(ret_max)
> +# endif
> +
> + VPCMP $0, (3 * VEC_SIZE)(%rax), %ZMM0, %k0
> + kmovq %k0, %rcx
> + testq %rcx, %rcx
> + jnz L(ret_vec_x4)
> +
> + addq $(4 * VEC_SIZE), %rax
> +
> +# ifdef USE_AS_STRNLEN
> + /* Instead of decreasing, rdx increased to prepare for loop
> + first iteration. Incremented 3 times because one increment
> + cancelled by previous decrement. */
> + addq $(3 * CHAR_PER_VEC), %rdx
> +# endif
> +
> + /* Test if address is already 4 * VEC_SIZE byte aligned goto
> + loop. */
> + testq $(3 * VEC_SIZE), %rax
Can only imagine this is is possibly worth it for STRNLEN.
> + jz L(loop)
> +
> + movq %rax, %rcx
> +
> + /* Align address to 4 * VEC_SIZE for loop. */
> + andq $-(4 * VEC_SIZE), %rax
A smaller-code-size way to align is
`orq $(VEC_SIZE * 4 - 1), %rax; incq %rax`
> +
> +# ifdef USE_AS_STRNLEN
> + subq %rax, %rcx
> +# ifdef USE_AS_WCSLEN
> + sarq $2, %rcx
> +# endif
> + /* rcx contains number of [w]chars that will be recompared due to
> + alignment fixes. rdx must be incremented by rcx to offset
> + alignment adjustment. */
> + addq %rcx, %rdx
> +# endif
> +
> +L(loop):
> +# ifdef USE_AS_STRNLEN
> + subq $(CHAR_PER_VEC * 4), %rdx
> + jbe L(ret_max)
> +# endif
> + /* VPMINU and VPCMP combination provide better performance as
> + compared to alternative combinations. */
> + VMOVA (%rax), %ZMM1
> + VPMINU (VEC_SIZE)(%rax), %ZMM1, %ZMM2
> + VMOVA (2 * VEC_SIZE)(%rax), %ZMM3
> + VPMINU (3 * VEC_SIZE)(%rax), %ZMM3, %ZMM4
> +
> + VPCMP $0, %ZMM2, %ZMM0, %k0
> + VPCMP $0, %ZMM4, %ZMM0, %k1
> +
> + addq $(4 * VEC_SIZE), %rax
> + kortestq %k0, %k1
> + jz L(loop)
> +
> + /* Need 4 vector subtraction because address incremented in
> + the loop before terminating condition check. Also want to
> + reuse code for exit condition before and after the loop. */
> + subq $(4 * VEC_SIZE), %rax
Is it possible to just add 4x offset to the loop? No penalty of imm32 encoding
with evex encoding.
> +
> + VPCMP $0, %ZMM1, %ZMM0, %k2
> + kmovq %k2, %rcx
> + testq %rcx, %rcx
> + jnz L(ret_vec_x1)
> +
> + kmovq %k0, %rcx
> + /* At this point, if k0 is non zero, null char must be in the
> + second vector. */
> + testq %rcx, %rcx
> + jnz L(ret_vec_x2)
> +
> + VPCMP $0, %ZMM3, %ZMM0, %k3
> + kmovq %k3, %rcx
> + testq %rcx, %rcx
> + jnz L(ret_vec_x3)
> + /* At this point null [w]char must be in the fourth vector so no
> + need to check. */
> + kmovq %k1, %rcx
> +
> + /* Termination fourth, third, second vector are pretty much
> + same, implemented this way to avoid branching and reuse code
> + from pre loop exit condition. */
> +L(ret_vec_x4):
> + addq $(3 * VEC_SIZE), %rax
> + tzcntq %rcx, %rcx
> + subq %rdi, %rax
> +# ifdef USE_AS_WCSLEN
> + sarq $2, %rax
> +# endif
> + addq %rcx, %rax
> +# ifdef USE_AS_STRNLEN
> + cmpq %rsi, %rax
> + jae L(ret_max)
> +# endif
> + ret
> +
> +L(ret_vec_x3):
> + addq $(2 * VEC_SIZE), %rax
> + tzcntq %rcx, %rcx
> + subq %rdi, %rax
> +# ifdef USE_AS_WCSLEN
> + sarq $2, %rax
> +# endif
> + addq %rcx, %rax
> +# ifdef USE_AS_STRNLEN
> + cmpq %rsi, %rax
> + jae L(ret_max)
> +# endif
> + ret
> +
> +L(ret_vec_x2):
> + addq $VEC_SIZE, %rax
> +L(ret_vec_x1):
> + tzcntq %rcx, %rcx
> + subq %rdi, %rax
> +# ifdef USE_AS_WCSLEN
> + sarq $2, %rax
> +# endif
> + addq %rcx, %rax
> +# ifdef USE_AS_STRNLEN
> + cmpq %rsi, %rax
> + jae L(ret_max)
> +# endif
> + ret
> +
> +END (STRLEN)
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/strnlen-evex512.S b/sysdeps/x86_64/multiarch/strnlen-evex512.S
> new file mode 100644
> index 0000000000..0b7f220214
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strnlen-evex512.S
> @@ -0,0 +1,4 @@
> +#define STRLEN __strnlen_evex512
> +#define USE_AS_STRNLEN 1
> +
> +#include "strlen-evex512.S"
> diff --git a/sysdeps/x86_64/multiarch/wcslen-evex512.S b/sysdeps/x86_64/multiarch/wcslen-evex512.S
> new file mode 100644
> index 0000000000..f59c372b78
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/wcslen-evex512.S
> @@ -0,0 +1,4 @@
> +#define STRLEN __wcslen_evex512
> +#define USE_AS_WCSLEN 1
> +
> +#include "strlen-evex512.S"
> diff --git a/sysdeps/x86_64/multiarch/wcsnlen-evex512.S b/sysdeps/x86_64/multiarch/wcsnlen-evex512.S
> new file mode 100644
> index 0000000000..73dcf2f210
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/wcsnlen-evex512.S
> @@ -0,0 +1,5 @@
> +#define STRLEN __wcsnlen_evex512
> +#define USE_AS_WCSLEN 1
> +#define USE_AS_STRNLEN 1
> +
> +#include "strlen-evex512.S"
> --
> 2.35.3
>
^ permalink raw reply [flat|nested] 12+ messages in thread
* [PATCH v3] x86_64: Implement evex512 version of strlen, strnlen, wcslen and wcsnlen
2022-05-19 15:03 ` Noah Goldstein
@ 2022-05-25 13:43 ` Sunil K Pandey
2022-05-25 17:10 ` Noah Goldstein
0 siblings, 1 reply; 12+ messages in thread
From: Sunil K Pandey @ 2022-05-25 13:43 UTC (permalink / raw)
To: libc-alpha
This patch implements the following evex512 versions of string functions.
Perf gain for evex512 version is up to 50% as compared to evex,
depending on length and alignment.
These functions are currently just for benchmarking/reference.
- String length function using 512 bit vectors.
- String N length using 512 bit vectors.
- Wide string length using 512 bit vectors.
- Wide string N length using 512 bit vectors.
---
sysdeps/x86_64/multiarch/Makefile | 4 +
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 20 ++
sysdeps/x86_64/multiarch/strlen-evex-base.S | 299 ++++++++++++++++++++
sysdeps/x86_64/multiarch/strlen-evex512.S | 7 +
sysdeps/x86_64/multiarch/strnlen-evex512.S | 4 +
sysdeps/x86_64/multiarch/wcslen-evex512.S | 4 +
sysdeps/x86_64/multiarch/wcsnlen-evex512.S | 5 +
7 files changed, 343 insertions(+)
create mode 100644 sysdeps/x86_64/multiarch/strlen-evex-base.S
create mode 100644 sysdeps/x86_64/multiarch/strlen-evex512.S
create mode 100644 sysdeps/x86_64/multiarch/strnlen-evex512.S
create mode 100644 sysdeps/x86_64/multiarch/wcslen-evex512.S
create mode 100644 sysdeps/x86_64/multiarch/wcsnlen-evex512.S
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index f3ab5e0928..d0869c3ac3 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -81,6 +81,7 @@ sysdep_routines += \
strlen-avx2 \
strlen-avx2-rtm \
strlen-evex \
+ strlen-evex512 \
strlen-sse2 \
strncase_l-avx2 \
strncase_l-avx2-rtm \
@@ -105,6 +106,7 @@ sysdep_routines += \
strnlen-avx2 \
strnlen-avx2-rtm \
strnlen-evex \
+ strnlen-evex512 \
strnlen-sse2 \
strpbrk-c \
strpbrk-sse2 \
@@ -138,6 +140,7 @@ sysdep_routines += \
wcslen-avx2 \
wcslen-avx2-rtm \
wcslen-evex \
+ wcslen-evex512 \
wcslen-sse2 \
wcslen-sse4_1 \
wcsncmp-avx2 \
@@ -148,6 +151,7 @@ sysdep_routines += \
wcsnlen-avx2-rtm \
wcsnlen-c \
wcsnlen-evex \
+ wcsnlen-evex512 \
wcsnlen-sse4_1 \
wcsrchr-avx2 \
wcsrchr-avx2-rtm \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 7218095430..c5cd9466fe 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -328,6 +328,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)),
__strlen_evex)
+ IFUNC_IMPL_ADD (array, i, strlen,
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)
+ && CPU_FEATURE_USABLE (BMI2)),
+ __strlen_evex512)
IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2))
/* Support sysdeps/x86_64/multiarch/strnlen.c. */
@@ -346,6 +351,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)),
__strnlen_evex)
+ IFUNC_IMPL_ADD (array, i, strnlen,
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)
+ && CPU_FEATURE_USABLE (BMI2)),
+ __strnlen_evex512)
IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_sse2))
/* Support sysdeps/x86_64/multiarch/stpncpy.c. */
@@ -699,6 +709,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)),
__wcslen_evex)
+ IFUNC_IMPL_ADD (array, i, wcslen,
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)
+ && CPU_FEATURE_USABLE (BMI2)),
+ __wcslen_evex512)
IFUNC_IMPL_ADD (array, i, wcslen,
CPU_FEATURE_USABLE (SSE4_1),
__wcslen_sse4_1)
@@ -720,6 +735,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)),
__wcsnlen_evex)
+ IFUNC_IMPL_ADD (array, i, wcsnlen,
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)
+ && CPU_FEATURE_USABLE (BMI2)),
+ __wcsnlen_evex512)
IFUNC_IMPL_ADD (array, i, wcsnlen,
CPU_FEATURE_USABLE (SSE4_1),
__wcsnlen_sse4_1)
diff --git a/sysdeps/x86_64/multiarch/strlen-evex-base.S b/sysdeps/x86_64/multiarch/strlen-evex-base.S
new file mode 100644
index 0000000000..bd09967f76
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strlen-evex-base.S
@@ -0,0 +1,299 @@
+/* Placeholder function, not used by any processor at the moment.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# ifdef USE_AS_WCSLEN
+# define VPCMP vpcmpd
+# define VPTESTN vptestnmd
+# define VPMINU vpminud
+# define CHAR_SIZE 4
+# else
+# define VPCMP vpcmpb
+# define VPTESTN vptestnmb
+# define VPMINU vpminub
+# define CHAR_SIZE 1
+# endif
+
+# define XMM0 xmm16
+# define PAGE_SIZE 4096
+# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
+
+# if VEC_SIZE == 64
+# define KMOV kmovq
+# define KORTEST kortestq
+# define RAX rax
+# define RCX rcx
+# define RDX rdx
+# define SHR shrq
+# define TEXTSUFFIX evex512
+# define VMM0 zmm16
+# define VMM1 zmm17
+# define VMM2 zmm18
+# define VMM3 zmm19
+# define VMM4 zmm20
+# define VMOVA vmovdqa64
+# elif VEC_SIZE == 32
+/* Currently Unused. */
+# define KMOV kmovd
+# define KORTEST kortestd
+# define RAX eax
+# define RCX ecx
+# define RDX edx
+# define SHR shrl
+# define TEXTSUFFIX evex256
+# define VMM0 ymm16
+# define VMM1 ymm17
+# define VMM2 ymm18
+# define VMM3 ymm19
+# define VMM4 ymm20
+# define VMOVA vmovdqa32
+# endif
+
+ .section .text.TEXTSUFFIX, "ax", @progbits
+/* Aligning entry point to 64 byte, provides better performance for
+ one vector length string. */
+ENTRY_P2ALIGN (STRLEN, 6)
+# ifdef USE_AS_STRNLEN
+ /* Check zero length. */
+ test %RSI_LP, %RSI_LP
+ jz L(ret_max)
+# ifdef __ILP32__
+ /* Clear the upper 32 bits. */
+ movl %esi, %esi
+# endif
+# endif
+
+ movl %edi, %eax
+ vpxorq %XMM0, %XMM0, %XMM0
+ andl $(PAGE_SIZE - 1), %eax
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
+ ja L(page_cross)
+
+ /* Compare [w]char for null, mask bit will be set for match. */
+ VPCMP $0, (%rdi), %VMM0, %k0
+ KMOV %k0, %RAX
+ test %RAX, %RAX
+ jz L(align_more)
+
+ bsf %RAX, %RAX
+# ifdef USE_AS_STRNLEN
+ cmpq %rsi, %rax
+ cmovnb %rsi, %rax
+# endif
+ ret
+
+ /* At this point vector max length reached. */
+# ifdef USE_AS_STRNLEN
+ .p2align 4,,3
+L(ret_max):
+ movq %rsi, %rax
+ ret
+# endif
+
+L(align_more):
+ leaq VEC_SIZE(%rdi), %rax
+ /* Align rax to VEC_SIZE. */
+ andq $-VEC_SIZE, %rax
+# ifdef USE_AS_STRNLEN
+ movq %rax, %rdx
+ subq %rdi, %rdx
+# ifdef USE_AS_WCSLEN
+ SHR $2, %RDX
+# endif
+ /* At this point rdx contains [w]chars already compared. */
+ subq %rsi, %rdx
+ jae L(ret_max)
+ negq %rdx
+ /* At this point rdx contains number of w[char] needs to go.
+ Now onwards rdx will keep decrementing with each compare. */
+# endif
+
+ /* Loop unroll 4 times for 4 vector loop. */
+ VPCMP $0, (%rax), %VMM0, %k0
+ KMOV %k0, %RCX
+ test %RCX, %RCX
+ jnz L(ret_vec_x1)
+
+# ifdef USE_AS_STRNLEN
+ subq $CHAR_PER_VEC, %rdx
+ jbe L(ret_max)
+# endif
+
+ VPCMP $0, VEC_SIZE(%rax), %VMM0, %k0
+ KMOV %k0, %RCX
+ test %RCX, %RCX
+ jnz L(ret_vec_x2)
+
+# ifdef USE_AS_STRNLEN
+ subq $CHAR_PER_VEC, %rdx
+ jbe L(ret_max)
+# endif
+
+ VPCMP $0, (VEC_SIZE * 2)(%rax), %VMM0, %k0
+ KMOV %k0, %RCX
+ test %RCX, %RCX
+ jnz L(ret_vec_x3)
+
+# ifdef USE_AS_STRNLEN
+ subq $CHAR_PER_VEC, %rdx
+ jbe L(ret_max)
+# endif
+
+ VPCMP $0, (VEC_SIZE * 3)(%rax), %VMM0, %k0
+ KMOV %k0, %RCX
+ test %RCX, %RCX
+ jnz L(ret_vec_x4)
+
+# ifdef USE_AS_STRNLEN
+ /* Instead of decreasing, rdx increased to prepare for loop
+ first iteration. Incremented 3 times because one increment
+ cancelled by previous decrement. */
+ subq $-(CHAR_PER_VEC * 3), %rdx
+ movq %rax, %rcx
+# endif
+
+ /* Align address to VEC_SIZE * 4 for loop. */
+ andq $-(VEC_SIZE * 4), %rax
+
+# ifdef USE_AS_STRNLEN
+ subq %rax, %rcx
+# ifdef USE_AS_WCSLEN
+ SHR $2, %RCX
+# endif
+ /* rcx contains number of [w]char will be recompared due to
+ alignment fixes. rdx must be incremented by rcx to offset
+ alignment adjustment. */
+ addq %rcx, %rdx
+# endif
+
+ .p2align 4,,11
+L(loop):
+# ifdef USE_AS_STRNLEN
+ subq $(CHAR_PER_VEC * 4), %rdx
+ jbe L(ret_max)
+# endif
+ /* VPMINU and VPCMP combination provide better performance as
+ compared to alternative combinations. */
+ VMOVA (VEC_SIZE * 4)(%rax), %VMM1
+ VPMINU (VEC_SIZE * 5)(%rax), %VMM1, %VMM2
+ VMOVA (VEC_SIZE * 6)(%rax), %VMM3
+ VPMINU (VEC_SIZE * 7)(%rax), %VMM3, %VMM4
+
+ VPTESTN %VMM2, %VMM2, %k0
+ VPTESTN %VMM4, %VMM4, %k1
+
+ subq $-(VEC_SIZE * 4), %rax
+ KORTEST %k0, %k1
+ jz L(loop)
+
+ VPTESTN %VMM1, %VMM1, %k2
+ KMOV %k2, %RCX
+ test %RCX, %RCX
+ jnz L(ret_vec_x1)
+
+ KMOV %k0, %RCX
+ /* At this point, if k0 is non zero, null char must be in the
+ second vector. */
+ test %RCX, %RCX
+ jnz L(ret_vec_x2)
+
+ VPTESTN %VMM3, %VMM3, %k3
+ KMOV %k3, %RCX
+ test %RCX, %RCX
+ jnz L(ret_vec_x3)
+ /* At this point null [w]char must be in the fourth vector so no
+ need to check. */
+ KMOV %k1, %RCX
+
+ /* Fourth, third, second vector terminating are pretty much
+ same, implemented this way to avoid branching and reuse code
+ from pre loop exit condition. */
+L(ret_vec_x4):
+ bsf %RCX, %RCX
+ subq %rdi, %rax
+# ifdef USE_AS_WCSLEN
+ subq $-(VEC_SIZE * 3), %rax
+ shrq $2, %rax
+ addq %rcx, %rax
+# else
+ leaq (VEC_SIZE * 3)(%rcx, %rax), %rax
+# endif
+# ifdef USE_AS_STRNLEN
+ cmpq %rsi, %rax
+ cmovnb %rsi, %rax
+# endif
+ ret
+
+L(ret_vec_x3):
+ bsf %RCX, %RCX
+ subq %rdi, %rax
+# ifdef USE_AS_WCSLEN
+ subq $-(VEC_SIZE * 2), %rax
+ shrq $2, %rax
+ addq %rcx, %rax
+# else
+ leaq (VEC_SIZE * 2)(%rcx, %rax), %rax
+# endif
+# ifdef USE_AS_STRNLEN
+ cmpq %rsi, %rax
+ cmovnb %rsi, %rax
+# endif
+ ret
+
+L(ret_vec_x2):
+ subq $-VEC_SIZE, %rax
+L(ret_vec_x1):
+ bsf %RCX, %RCX
+ subq %rdi, %rax
+# ifdef USE_AS_WCSLEN
+ shrq $2, %rax
+# endif
+ addq %rcx, %rax
+# ifdef USE_AS_STRNLEN
+ cmpq %rsi, %rax
+ cmovnb %rsi, %rax
+# endif
+ ret
+
+L(page_cross):
+ movl %eax, %ecx
+# ifdef USE_AS_WCSLEN
+ andl $(VEC_SIZE - 1), %ecx
+ sarl $2, %ecx
+# endif
+ /* ecx contains number of w[char] to be skipped as a result
+ of address alignment. */
+ xorq %rdi, %rax
+ VPCMP $0, (PAGE_SIZE - VEC_SIZE)(%rax), %VMM0, %k0
+ KMOV %k0, %RAX
+ /* Ignore number of character for alignment adjustment. */
+ SHR %cl, %RAX
+ jz L(align_more)
+
+ bsf %RAX, %RAX
+# ifdef USE_AS_STRNLEN
+ cmpq %rsi, %rax
+ cmovnb %rsi, %rax
+# endif
+ ret
+
+END (STRLEN)
+#endif
diff --git a/sysdeps/x86_64/multiarch/strlen-evex512.S b/sysdeps/x86_64/multiarch/strlen-evex512.S
new file mode 100644
index 0000000000..116f8981c8
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strlen-evex512.S
@@ -0,0 +1,7 @@
+#ifndef STRLEN
+# define STRLEN __strlen_evex512
+#endif
+
+#define VEC_SIZE 64
+
+#include "strlen-evex-base.S"
diff --git a/sysdeps/x86_64/multiarch/strnlen-evex512.S b/sysdeps/x86_64/multiarch/strnlen-evex512.S
new file mode 100644
index 0000000000..0b7f220214
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strnlen-evex512.S
@@ -0,0 +1,4 @@
+#define STRLEN __strnlen_evex512
+#define USE_AS_STRNLEN 1
+
+#include "strlen-evex512.S"
diff --git a/sysdeps/x86_64/multiarch/wcslen-evex512.S b/sysdeps/x86_64/multiarch/wcslen-evex512.S
new file mode 100644
index 0000000000..f59c372b78
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcslen-evex512.S
@@ -0,0 +1,4 @@
+#define STRLEN __wcslen_evex512
+#define USE_AS_WCSLEN 1
+
+#include "strlen-evex512.S"
diff --git a/sysdeps/x86_64/multiarch/wcsnlen-evex512.S b/sysdeps/x86_64/multiarch/wcsnlen-evex512.S
new file mode 100644
index 0000000000..73dcf2f210
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsnlen-evex512.S
@@ -0,0 +1,5 @@
+#define STRLEN __wcsnlen_evex512
+#define USE_AS_WCSLEN 1
+#define USE_AS_STRNLEN 1
+
+#include "strlen-evex512.S"
--
2.35.3
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH v3] x86_64: Implement evex512 version of strlen, strnlen, wcslen and wcsnlen
2022-05-25 13:43 ` [PATCH v3] " Sunil K Pandey
@ 2022-05-25 17:10 ` Noah Goldstein
2022-05-25 18:20 ` Sunil Pandey
0 siblings, 1 reply; 12+ messages in thread
From: Noah Goldstein @ 2022-05-25 17:10 UTC (permalink / raw)
To: Sunil K Pandey; +Cc: GNU C Library
On Wed, May 25, 2022 at 8:44 AM Sunil K Pandey via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> This patch implements following evex512 version of string functions.
> Perf gain for evex512 version is up to 50% as compared to evex,
> depending on length and alignment.
>
> These functions are currently just for benchmarking/reference.
>
> - String length function using 512 bit vectors.
> - String N length using 512 bit vectors.
> - Wide string length using 512 bit vectors.
> - Wide string N length using 512 bit vectors.
> ---
> sysdeps/x86_64/multiarch/Makefile | 4 +
> sysdeps/x86_64/multiarch/ifunc-impl-list.c | 20 ++
> sysdeps/x86_64/multiarch/strlen-evex-base.S | 299 ++++++++++++++++++++
> sysdeps/x86_64/multiarch/strlen-evex512.S | 7 +
> sysdeps/x86_64/multiarch/strnlen-evex512.S | 4 +
> sysdeps/x86_64/multiarch/wcslen-evex512.S | 4 +
> sysdeps/x86_64/multiarch/wcsnlen-evex512.S | 5 +
> 7 files changed, 343 insertions(+)
> create mode 100644 sysdeps/x86_64/multiarch/strlen-evex-base.S
> create mode 100644 sysdeps/x86_64/multiarch/strlen-evex512.S
> create mode 100644 sysdeps/x86_64/multiarch/strnlen-evex512.S
> create mode 100644 sysdeps/x86_64/multiarch/wcslen-evex512.S
> create mode 100644 sysdeps/x86_64/multiarch/wcsnlen-evex512.S
>
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index f3ab5e0928..d0869c3ac3 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -81,6 +81,7 @@ sysdep_routines += \
> strlen-avx2 \
> strlen-avx2-rtm \
> strlen-evex \
> + strlen-evex512 \
> strlen-sse2 \
> strncase_l-avx2 \
> strncase_l-avx2-rtm \
> @@ -105,6 +106,7 @@ sysdep_routines += \
> strnlen-avx2 \
> strnlen-avx2-rtm \
> strnlen-evex \
> + strnlen-evex512 \
> strnlen-sse2 \
> strpbrk-c \
> strpbrk-sse2 \
> @@ -138,6 +140,7 @@ sysdep_routines += \
> wcslen-avx2 \
> wcslen-avx2-rtm \
> wcslen-evex \
> + wcslen-evex512 \
> wcslen-sse2 \
> wcslen-sse4_1 \
> wcsncmp-avx2 \
> @@ -148,6 +151,7 @@ sysdep_routines += \
> wcsnlen-avx2-rtm \
> wcsnlen-c \
> wcsnlen-evex \
> + wcsnlen-evex512 \
> wcsnlen-sse4_1 \
> wcsrchr-avx2 \
> wcsrchr-avx2-rtm \
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index 7218095430..c5cd9466fe 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -328,6 +328,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> && CPU_FEATURE_USABLE (AVX512BW)
> && CPU_FEATURE_USABLE (BMI2)),
> __strlen_evex)
> + IFUNC_IMPL_ADD (array, i, strlen,
> + (CPU_FEATURE_USABLE (AVX512VL)
> + && CPU_FEATURE_USABLE (AVX512BW)
> + && CPU_FEATURE_USABLE (BMI2)),
> + __strlen_evex512)
> IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2))
>
> /* Support sysdeps/x86_64/multiarch/strnlen.c. */
> @@ -346,6 +351,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> && CPU_FEATURE_USABLE (AVX512BW)
> && CPU_FEATURE_USABLE (BMI2)),
> __strnlen_evex)
> + IFUNC_IMPL_ADD (array, i, strnlen,
> + (CPU_FEATURE_USABLE (AVX512VL)
> + && CPU_FEATURE_USABLE (AVX512BW)
> + && CPU_FEATURE_USABLE (BMI2)),
> + __strnlen_evex512)
> IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_sse2))
>
> /* Support sysdeps/x86_64/multiarch/stpncpy.c. */
> @@ -699,6 +709,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> && CPU_FEATURE_USABLE (AVX512BW)
> && CPU_FEATURE_USABLE (BMI2)),
> __wcslen_evex)
> + IFUNC_IMPL_ADD (array, i, wcslen,
> + (CPU_FEATURE_USABLE (AVX512VL)
> + && CPU_FEATURE_USABLE (AVX512BW)
> + && CPU_FEATURE_USABLE (BMI2)),
> + __wcslen_evex512)
> IFUNC_IMPL_ADD (array, i, wcslen,
> CPU_FEATURE_USABLE (SSE4_1),
> __wcslen_sse4_1)
> @@ -720,6 +735,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> && CPU_FEATURE_USABLE (AVX512BW)
> && CPU_FEATURE_USABLE (BMI2)),
> __wcsnlen_evex)
> + IFUNC_IMPL_ADD (array, i, wcsnlen,
> + (CPU_FEATURE_USABLE (AVX512VL)
> + && CPU_FEATURE_USABLE (AVX512BW)
> + && CPU_FEATURE_USABLE (BMI2)),
> + __wcsnlen_evex512)
> IFUNC_IMPL_ADD (array, i, wcsnlen,
> CPU_FEATURE_USABLE (SSE4_1),
> __wcsnlen_sse4_1)
> diff --git a/sysdeps/x86_64/multiarch/strlen-evex-base.S b/sysdeps/x86_64/multiarch/strlen-evex-base.S
> new file mode 100644
> index 0000000000..bd09967f76
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strlen-evex-base.S
> @@ -0,0 +1,299 @@
> +/* Placeholder function, not used by any processor at the moment.
> + Copyright (C) 2022 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#if IS_IN (libc)
> +
> +# include <sysdep.h>
> +
> +# ifdef USE_AS_WCSLEN
> +# define VPCMP vpcmpd
> +# define VPTESTN vptestnmd
> +# define VPMINU vpminud
> +# define CHAR_SIZE 4
> +# else
> +# define VPCMP vpcmpb
> +# define VPTESTN vptestnmb
> +# define VPMINU vpminub
> +# define CHAR_SIZE 1
> +# endif
> +
> +# define XMM0 xmm16
> +# define PAGE_SIZE 4096
> +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
> +
> +# if VEC_SIZE == 64
> +# define KMOV kmovq
> +# define KORTEST kortestq
> +# define RAX rax
> +# define RCX rcx
> +# define RDX rdx
> +# define SHR shrq
> +# define TEXTSUFFIX evex512
> +# define VMM0 zmm16
> +# define VMM1 zmm17
> +# define VMM2 zmm18
> +# define VMM3 zmm19
> +# define VMM4 zmm20
> +# define VMOVA vmovdqa64
> +# elif VEC_SIZE == 32
> +/* Currently Unused. */
> +# define KMOV kmovd
> +# define KORTEST kortestd
> +# define RAX eax
> +# define RCX ecx
> +# define RDX edx
> +# define SHR shrl
> +# define TEXTSUFFIX evex256
> +# define VMM0 ymm16
> +# define VMM1 ymm17
> +# define VMM2 ymm18
> +# define VMM3 ymm19
> +# define VMM4 ymm20
> +# define VMOVA vmovdqa32
> +# endif
> +
> + .section .text.TEXTSUFFIX, "ax", @progbits
> +/* Aligning entry point to 64 byte, provides better performance for
> + one vector length string. */
> +ENTRY_P2ALIGN (STRLEN, 6)
> +# ifdef USE_AS_STRNLEN
> + /* Check zero length. */
> + test %RSI_LP, %RSI_LP
> + jz L(ret_max)
> +# ifdef __ILP32__
> + /* Clear the upper 32 bits. */
> + movl %esi, %esi
> +# endif
> +# endif
> +
> + movl %edi, %eax
> + vpxorq %XMM0, %XMM0, %XMM0
> + andl $(PAGE_SIZE - 1), %eax
> + cmpl $(PAGE_SIZE - VEC_SIZE), %eax
> + ja L(page_cross)
> +
> + /* Compare [w]char for null, mask bit will be set for match. */
> + VPCMP $0, (%rdi), %VMM0, %k0
> + KMOV %k0, %RAX
> + test %RAX, %RAX
> + jz L(align_more)
> +
> + bsf %RAX, %RAX
> +# ifdef USE_AS_STRNLEN
> + cmpq %rsi, %rax
> + cmovnb %rsi, %rax
> +# endif
> + ret
> +
> + /* At this point vector max length reached. */
> +# ifdef USE_AS_STRNLEN
> + .p2align 4,,3
> +L(ret_max):
> + movq %rsi, %rax
> + ret
> +# endif
> +
> +L(align_more):
> + leaq VEC_SIZE(%rdi), %rax
> + /* Align rax to VEC_SIZE. */
> + andq $-VEC_SIZE, %rax
> +# ifdef USE_AS_STRNLEN
> + movq %rax, %rdx
> + subq %rdi, %rdx
> +# ifdef USE_AS_WCSLEN
> + SHR $2, %RDX
> +# endif
> + /* At this point rdx contains [w]chars already compared. */
> + subq %rsi, %rdx
> + jae L(ret_max)
> + negq %rdx
> + /* At this point rdx contains number of w[char] needs to go.
> + Now onwards rdx will keep decrementing with each compare. */
> +# endif
> +
> + /* Loop unroll 4 times for 4 vector loop. */
> + VPCMP $0, (%rax), %VMM0, %k0
> + KMOV %k0, %RCX
> + test %RCX, %RCX
> + jnz L(ret_vec_x1)
> +
> +# ifdef USE_AS_STRNLEN
> + subq $CHAR_PER_VEC, %rdx
> + jbe L(ret_max)
> +# endif
> +
> + VPCMP $0, VEC_SIZE(%rax), %VMM0, %k0
> + KMOV %k0, %RCX
> + test %RCX, %RCX
> + jnz L(ret_vec_x2)
> +
> +# ifdef USE_AS_STRNLEN
> + subq $CHAR_PER_VEC, %rdx
> + jbe L(ret_max)
> +# endif
> +
> + VPCMP $0, (VEC_SIZE * 2)(%rax), %VMM0, %k0
> + KMOV %k0, %RCX
> + test %RCX, %RCX
> + jnz L(ret_vec_x3)
> +
> +# ifdef USE_AS_STRNLEN
> + subq $CHAR_PER_VEC, %rdx
> + jbe L(ret_max)
> +# endif
> +
> + VPCMP $0, (VEC_SIZE * 3)(%rax), %VMM0, %k0
> + KMOV %k0, %RCX
> + test %RCX, %RCX
> + jnz L(ret_vec_x4)
> +
> +# ifdef USE_AS_STRNLEN
> + /* Instead of decreasing, rdx increased to prepare for loop
> + first iteration. Incremented 3 times because one increment
> + cancelled by previous decrement. */
> + subq $-(CHAR_PER_VEC * 3), %rdx
> + movq %rax, %rcx
> +# endif
> +
> + /* Align address to VEC_SIZE * 4 for loop. */
> + andq $-(VEC_SIZE * 4), %rax
> +
> +# ifdef USE_AS_STRNLEN
> + subq %rax, %rcx
> +# ifdef USE_AS_WCSLEN
> + SHR $2, %RCX
> +# endif
> + /* rcx contains number of [w]char will be recompared due to
> + alignment fixes. rdx must be incremented by rcx to offset
> + alignment adjustment. */
> + addq %rcx, %rdx
This is buggy for strnlen / wcsnlen. You are adding back more than
the original size, so you can overflow.
See strnlen_evex512 for:
strlen=319
align%4096 = 1
maxlen = -1UL
expec = 319
result = 18446744073709551615
> +# endif
> +
> + .p2align 4,,11
> +L(loop):
> +# ifdef USE_AS_STRNLEN
> + subq $(CHAR_PER_VEC * 4), %rdx
> + jbe L(ret_max)
> +# endif
> + /* VPMINU and VPCMP combination provide better performance as
> + compared to alternative combinations. */
> + VMOVA (VEC_SIZE * 4)(%rax), %VMM1
> + VPMINU (VEC_SIZE * 5)(%rax), %VMM1, %VMM2
> + VMOVA (VEC_SIZE * 6)(%rax), %VMM3
> + VPMINU (VEC_SIZE * 7)(%rax), %VMM3, %VMM4
> +
> + VPTESTN %VMM2, %VMM2, %k0
> + VPTESTN %VMM4, %VMM4, %k1
> +
> + subq $-(VEC_SIZE * 4), %rax
> + KORTEST %k0, %k1
> + jz L(loop)
> +
> + VPTESTN %VMM1, %VMM1, %k2
> + KMOV %k2, %RCX
> + test %RCX, %RCX
> + jnz L(ret_vec_x1)
> +
> + KMOV %k0, %RCX
> + /* At this point, if k0 is non zero, null char must be in the
> + second vector. */
> + test %RCX, %RCX
> + jnz L(ret_vec_x2)
> +
> + VPTESTN %VMM3, %VMM3, %k3
> + KMOV %k3, %RCX
> + test %RCX, %RCX
> + jnz L(ret_vec_x3)
> + /* At this point null [w]char must be in the fourth vector so no
> + need to check. */
> + KMOV %k1, %RCX
> +
> + /* Fourth, third, second vector terminating are pretty much
> + same, implemented this way to avoid branching and reuse code
> + from pre loop exit condition. */
> +L(ret_vec_x4):
> + bsf %RCX, %RCX
> + subq %rdi, %rax
> +# ifdef USE_AS_WCSLEN
> + subq $-(VEC_SIZE * 3), %rax
> + shrq $2, %rax
> + addq %rcx, %rax
> +# else
> + leaq (VEC_SIZE * 3)(%rcx, %rax), %rax
> +# endif
> +# ifdef USE_AS_STRNLEN
> + cmpq %rsi, %rax
> + cmovnb %rsi, %rax
> +# endif
> + ret
> +
> +L(ret_vec_x3):
> + bsf %RCX, %RCX
> + subq %rdi, %rax
> +# ifdef USE_AS_WCSLEN
> + subq $-(VEC_SIZE * 2), %rax
> + shrq $2, %rax
> + addq %rcx, %rax
> +# else
> + leaq (VEC_SIZE * 2)(%rcx, %rax), %rax
> +# endif
> +# ifdef USE_AS_STRNLEN
> + cmpq %rsi, %rax
> + cmovnb %rsi, %rax
> +# endif
> + ret
> +
> +L(ret_vec_x2):
> + subq $-VEC_SIZE, %rax
> +L(ret_vec_x1):
> + bsf %RCX, %RCX
> + subq %rdi, %rax
> +# ifdef USE_AS_WCSLEN
> + shrq $2, %rax
> +# endif
> + addq %rcx, %rax
> +# ifdef USE_AS_STRNLEN
> + cmpq %rsi, %rax
> + cmovnb %rsi, %rax
> +# endif
> + ret
> +
> +L(page_cross):
> + movl %eax, %ecx
> +# ifdef USE_AS_WCSLEN
> + andl $(VEC_SIZE - 1), %ecx
> + sarl $2, %ecx
> +# endif
> + /* ecx contains number of w[char] to be skipped as a result
> + of address alignment. */
> + xorq %rdi, %rax
> + VPCMP $0, (PAGE_SIZE - VEC_SIZE)(%rax), %VMM0, %k0
> + KMOV %k0, %RAX
> + /* Ignore number of character for alignment adjustment. */
> + SHR %cl, %RAX
> + jz L(align_more)
> +
> + bsf %RAX, %RAX
> +# ifdef USE_AS_STRNLEN
> + cmpq %rsi, %rax
> + cmovnb %rsi, %rax
> +# endif
> + ret
> +
> +END (STRLEN)
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/strlen-evex512.S b/sysdeps/x86_64/multiarch/strlen-evex512.S
> new file mode 100644
> index 0000000000..116f8981c8
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strlen-evex512.S
> @@ -0,0 +1,7 @@
> +#ifndef STRLEN
> +# define STRLEN __strlen_evex512
> +#endif
> +
> +#define VEC_SIZE 64
> +
> +#include "strlen-evex-base.S"
> diff --git a/sysdeps/x86_64/multiarch/strnlen-evex512.S b/sysdeps/x86_64/multiarch/strnlen-evex512.S
> new file mode 100644
> index 0000000000..0b7f220214
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strnlen-evex512.S
> @@ -0,0 +1,4 @@
> +#define STRLEN __strnlen_evex512
> +#define USE_AS_STRNLEN 1
> +
> +#include "strlen-evex512.S"
> diff --git a/sysdeps/x86_64/multiarch/wcslen-evex512.S b/sysdeps/x86_64/multiarch/wcslen-evex512.S
> new file mode 100644
> index 0000000000..f59c372b78
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/wcslen-evex512.S
> @@ -0,0 +1,4 @@
> +#define STRLEN __wcslen_evex512
> +#define USE_AS_WCSLEN 1
> +
> +#include "strlen-evex512.S"
> diff --git a/sysdeps/x86_64/multiarch/wcsnlen-evex512.S b/sysdeps/x86_64/multiarch/wcsnlen-evex512.S
> new file mode 100644
> index 0000000000..73dcf2f210
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/wcsnlen-evex512.S
> @@ -0,0 +1,5 @@
> +#define STRLEN __wcsnlen_evex512
> +#define USE_AS_WCSLEN 1
> +#define USE_AS_STRNLEN 1
> +
> +#include "strlen-evex512.S"
> --
> 2.35.3
>
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH v3] x86_64: Implement evex512 version of strlen, strnlen, wcslen and wcsnlen
2022-05-25 17:10 ` Noah Goldstein
@ 2022-05-25 18:20 ` Sunil Pandey
2022-05-26 18:35 ` [PATCH v4] " Sunil K Pandey
0 siblings, 1 reply; 12+ messages in thread
From: Sunil Pandey @ 2022-05-25 18:20 UTC (permalink / raw)
To: Noah Goldstein; +Cc: GNU C Library
On Wed, May 25, 2022 at 10:10 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Wed, May 25, 2022 at 8:44 AM Sunil K Pandey via Libc-alpha
> <libc-alpha@sourceware.org> wrote:
> >
> > This patch implements following evex512 version of string functions.
> > Perf gain for evex512 version is up to 50% as compared to evex,
> > depending on length and alignment.
> >
> > These functions are currently just for benchmarking/reference.
> >
> > - String length function using 512 bit vectors.
> > - String N length using 512 bit vectors.
> > - Wide string length using 512 bit vectors.
> > - Wide string N length using 512 bit vectors.
> > ---
> > sysdeps/x86_64/multiarch/Makefile | 4 +
> > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 20 ++
> > sysdeps/x86_64/multiarch/strlen-evex-base.S | 299 ++++++++++++++++++++
> > sysdeps/x86_64/multiarch/strlen-evex512.S | 7 +
> > sysdeps/x86_64/multiarch/strnlen-evex512.S | 4 +
> > sysdeps/x86_64/multiarch/wcslen-evex512.S | 4 +
> > sysdeps/x86_64/multiarch/wcsnlen-evex512.S | 5 +
> > 7 files changed, 343 insertions(+)
> > create mode 100644 sysdeps/x86_64/multiarch/strlen-evex-base.S
> > create mode 100644 sysdeps/x86_64/multiarch/strlen-evex512.S
> > create mode 100644 sysdeps/x86_64/multiarch/strnlen-evex512.S
> > create mode 100644 sysdeps/x86_64/multiarch/wcslen-evex512.S
> > create mode 100644 sysdeps/x86_64/multiarch/wcsnlen-evex512.S
> >
> > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> > index f3ab5e0928..d0869c3ac3 100644
> > --- a/sysdeps/x86_64/multiarch/Makefile
> > +++ b/sysdeps/x86_64/multiarch/Makefile
> > @@ -81,6 +81,7 @@ sysdep_routines += \
> > strlen-avx2 \
> > strlen-avx2-rtm \
> > strlen-evex \
> > + strlen-evex512 \
> > strlen-sse2 \
> > strncase_l-avx2 \
> > strncase_l-avx2-rtm \
> > @@ -105,6 +106,7 @@ sysdep_routines += \
> > strnlen-avx2 \
> > strnlen-avx2-rtm \
> > strnlen-evex \
> > + strnlen-evex512 \
> > strnlen-sse2 \
> > strpbrk-c \
> > strpbrk-sse2 \
> > @@ -138,6 +140,7 @@ sysdep_routines += \
> > wcslen-avx2 \
> > wcslen-avx2-rtm \
> > wcslen-evex \
> > + wcslen-evex512 \
> > wcslen-sse2 \
> > wcslen-sse4_1 \
> > wcsncmp-avx2 \
> > @@ -148,6 +151,7 @@ sysdep_routines += \
> > wcsnlen-avx2-rtm \
> > wcsnlen-c \
> > wcsnlen-evex \
> > + wcsnlen-evex512 \
> > wcsnlen-sse4_1 \
> > wcsrchr-avx2 \
> > wcsrchr-avx2-rtm \
> > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > index 7218095430..c5cd9466fe 100644
> > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > @@ -328,6 +328,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > && CPU_FEATURE_USABLE (AVX512BW)
> > && CPU_FEATURE_USABLE (BMI2)),
> > __strlen_evex)
> > + IFUNC_IMPL_ADD (array, i, strlen,
> > + (CPU_FEATURE_USABLE (AVX512VL)
> > + && CPU_FEATURE_USABLE (AVX512BW)
> > + && CPU_FEATURE_USABLE (BMI2)),
> > + __strlen_evex512)
> > IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2))
> >
> > /* Support sysdeps/x86_64/multiarch/strnlen.c. */
> > @@ -346,6 +351,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > && CPU_FEATURE_USABLE (AVX512BW)
> > && CPU_FEATURE_USABLE (BMI2)),
> > __strnlen_evex)
> > + IFUNC_IMPL_ADD (array, i, strnlen,
> > + (CPU_FEATURE_USABLE (AVX512VL)
> > + && CPU_FEATURE_USABLE (AVX512BW)
> > + && CPU_FEATURE_USABLE (BMI2)),
> > + __strnlen_evex512)
> > IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_sse2))
> >
> > /* Support sysdeps/x86_64/multiarch/stpncpy.c. */
> > @@ -699,6 +709,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > && CPU_FEATURE_USABLE (AVX512BW)
> > && CPU_FEATURE_USABLE (BMI2)),
> > __wcslen_evex)
> > + IFUNC_IMPL_ADD (array, i, wcslen,
> > + (CPU_FEATURE_USABLE (AVX512VL)
> > + && CPU_FEATURE_USABLE (AVX512BW)
> > + && CPU_FEATURE_USABLE (BMI2)),
> > + __wcslen_evex512)
> > IFUNC_IMPL_ADD (array, i, wcslen,
> > CPU_FEATURE_USABLE (SSE4_1),
> > __wcslen_sse4_1)
> > @@ -720,6 +735,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > && CPU_FEATURE_USABLE (AVX512BW)
> > && CPU_FEATURE_USABLE (BMI2)),
> > __wcsnlen_evex)
> > + IFUNC_IMPL_ADD (array, i, wcsnlen,
> > + (CPU_FEATURE_USABLE (AVX512VL)
> > + && CPU_FEATURE_USABLE (AVX512BW)
> > + && CPU_FEATURE_USABLE (BMI2)),
> > + __wcsnlen_evex512)
> > IFUNC_IMPL_ADD (array, i, wcsnlen,
> > CPU_FEATURE_USABLE (SSE4_1),
> > __wcsnlen_sse4_1)
> > diff --git a/sysdeps/x86_64/multiarch/strlen-evex-base.S b/sysdeps/x86_64/multiarch/strlen-evex-base.S
> > new file mode 100644
> > index 0000000000..bd09967f76
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/strlen-evex-base.S
> > @@ -0,0 +1,299 @@
> > +/* Placeholder function, not used by any processor at the moment.
> > + Copyright (C) 2022 Free Software Foundation, Inc.
> > + This file is part of the GNU C Library.
> > +
> > + The GNU C Library is free software; you can redistribute it and/or
> > + modify it under the terms of the GNU Lesser General Public
> > + License as published by the Free Software Foundation; either
> > + version 2.1 of the License, or (at your option) any later version.
> > +
> > + The GNU C Library is distributed in the hope that it will be useful,
> > + but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> > + Lesser General Public License for more details.
> > +
> > + You should have received a copy of the GNU Lesser General Public
> > + License along with the GNU C Library; if not, see
> > + <https://www.gnu.org/licenses/>. */
> > +
> > +#if IS_IN (libc)
> > +
> > +# include <sysdep.h>
> > +
> > +# ifdef USE_AS_WCSLEN
> > +# define VPCMP vpcmpd
> > +# define VPTESTN vptestnmd
> > +# define VPMINU vpminud
> > +# define CHAR_SIZE 4
> > +# else
> > +# define VPCMP vpcmpb
> > +# define VPTESTN vptestnmb
> > +# define VPMINU vpminub
> > +# define CHAR_SIZE 1
> > +# endif
> > +
> > +# define XMM0 xmm16
> > +# define PAGE_SIZE 4096
> > +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
> > +
> > +# if VEC_SIZE == 64
> > +# define KMOV kmovq
> > +# define KORTEST kortestq
> > +# define RAX rax
> > +# define RCX rcx
> > +# define RDX rdx
> > +# define SHR shrq
> > +# define TEXTSUFFIX evex512
> > +# define VMM0 zmm16
> > +# define VMM1 zmm17
> > +# define VMM2 zmm18
> > +# define VMM3 zmm19
> > +# define VMM4 zmm20
> > +# define VMOVA vmovdqa64
> > +# elif VEC_SIZE == 32
> > +/* Currently Unused. */
> > +# define KMOV kmovd
> > +# define KORTEST kortestd
> > +# define RAX eax
> > +# define RCX ecx
> > +# define RDX edx
> > +# define SHR shrl
> > +# define TEXTSUFFIX evex256
> > +# define VMM0 ymm16
> > +# define VMM1 ymm17
> > +# define VMM2 ymm18
> > +# define VMM3 ymm19
> > +# define VMM4 ymm20
> > +# define VMOVA vmovdqa32
> > +# endif
> > +
> > + .section .text.TEXTSUFFIX, "ax", @progbits
> > +/* Aligning entry point to 64 byte, provides better performance for
> > + one vector length string. */
> > +ENTRY_P2ALIGN (STRLEN, 6)
> > +# ifdef USE_AS_STRNLEN
> > + /* Check zero length. */
> > + test %RSI_LP, %RSI_LP
> > + jz L(ret_max)
> > +# ifdef __ILP32__
> > + /* Clear the upper 32 bits. */
> > + movl %esi, %esi
> > +# endif
> > +# endif
> > +
> > + movl %edi, %eax
> > + vpxorq %XMM0, %XMM0, %XMM0
> > + andl $(PAGE_SIZE - 1), %eax
> > + cmpl $(PAGE_SIZE - VEC_SIZE), %eax
> > + ja L(page_cross)
> > +
> > + /* Compare [w]char for null, mask bit will be set for match. */
> > + VPCMP $0, (%rdi), %VMM0, %k0
> > + KMOV %k0, %RAX
> > + test %RAX, %RAX
> > + jz L(align_more)
> > +
> > + bsf %RAX, %RAX
> > +# ifdef USE_AS_STRNLEN
> > + cmpq %rsi, %rax
> > + cmovnb %rsi, %rax
> > +# endif
> > + ret
> > +
> > + /* At this point vector max length reached. */
> > +# ifdef USE_AS_STRNLEN
> > + .p2align 4,,3
> > +L(ret_max):
> > + movq %rsi, %rax
> > + ret
> > +# endif
> > +
> > +L(align_more):
> > + leaq VEC_SIZE(%rdi), %rax
> > + /* Align rax to VEC_SIZE. */
> > + andq $-VEC_SIZE, %rax
> > +# ifdef USE_AS_STRNLEN
> > + movq %rax, %rdx
> > + subq %rdi, %rdx
> > +# ifdef USE_AS_WCSLEN
> > + SHR $2, %RDX
> > +# endif
> > + /* At this point rdx contains [w]chars already compared. */
> > + subq %rsi, %rdx
> > + jae L(ret_max)
> > + negq %rdx
> > + /* At this point rdx contains number of w[char] needs to go.
> > + Now onwards rdx will keep decrementing with each compare. */
> > +# endif
> > +
> > + /* Loop unroll 4 times for 4 vector loop. */
> > + VPCMP $0, (%rax), %VMM0, %k0
> > + KMOV %k0, %RCX
> > + test %RCX, %RCX
> > + jnz L(ret_vec_x1)
> > +
> > +# ifdef USE_AS_STRNLEN
> > + subq $CHAR_PER_VEC, %rdx
> > + jbe L(ret_max)
> > +# endif
> > +
> > + VPCMP $0, VEC_SIZE(%rax), %VMM0, %k0
> > + KMOV %k0, %RCX
> > + test %RCX, %RCX
> > + jnz L(ret_vec_x2)
> > +
> > +# ifdef USE_AS_STRNLEN
> > + subq $CHAR_PER_VEC, %rdx
> > + jbe L(ret_max)
> > +# endif
> > +
> > + VPCMP $0, (VEC_SIZE * 2)(%rax), %VMM0, %k0
> > + KMOV %k0, %RCX
> > + test %RCX, %RCX
> > + jnz L(ret_vec_x3)
> > +
> > +# ifdef USE_AS_STRNLEN
> > + subq $CHAR_PER_VEC, %rdx
> > + jbe L(ret_max)
> > +# endif
> > +
> > + VPCMP $0, (VEC_SIZE * 3)(%rax), %VMM0, %k0
> > + KMOV %k0, %RCX
> > + test %RCX, %RCX
> > + jnz L(ret_vec_x4)
> > +
> > +# ifdef USE_AS_STRNLEN
> > + /* Instead of decreasing, rdx increased to prepare for loop
> > + first iteration. Incremented 3 times because one increment
> > + cancelled by previous decrement. */
> > + subq $-(CHAR_PER_VEC * 3), %rdx
> > + movq %rax, %rcx
> > +# endif
> > +
> > + /* Align address to VEC_SIZE * 4 for loop. */
> > + andq $-(VEC_SIZE * 4), %rax
> > +
> > +# ifdef USE_AS_STRNLEN
> > + subq %rax, %rcx
> > +# ifdef USE_AS_WCSLEN
> > + SHR $2, %RCX
> > +# endif
> > + /* rcx contains number of [w]char will be recompared due to
> > + alignment fixes. rdx must be incremented by rcx to offset
> > + alignment adjustment. */
> > + addq %rcx, %rdx
>
> This is buggy for strnlen / wcslen. You are adding back more than
> the original size
> so you can overflow.
>
> See strnlen_evex512 for:
>
> strlen=319
> align%4096 = 1
> maxlen = -1UL
>
> expec = 319
> result = 18446744073709551615
>
Good catch. Will fix it in v4.
> > +# endif
> > +
> > + .p2align 4,,11
> > +L(loop):
> > +# ifdef USE_AS_STRNLEN
> > + subq $(CHAR_PER_VEC * 4), %rdx
> > + jbe L(ret_max)
> > +# endif
> > + /* VPMINU and VPCMP combination provide better performance as
> > + compared to alternative combinations. */
> > + VMOVA (VEC_SIZE * 4)(%rax), %VMM1
> > + VPMINU (VEC_SIZE * 5)(%rax), %VMM1, %VMM2
> > + VMOVA (VEC_SIZE * 6)(%rax), %VMM3
> > + VPMINU (VEC_SIZE * 7)(%rax), %VMM3, %VMM4
> > +
> > + VPTESTN %VMM2, %VMM2, %k0
> > + VPTESTN %VMM4, %VMM4, %k1
> > +
> > + subq $-(VEC_SIZE * 4), %rax
> > + KORTEST %k0, %k1
> > + jz L(loop)
> > +
> > + VPTESTN %VMM1, %VMM1, %k2
> > + KMOV %k2, %RCX
> > + test %RCX, %RCX
> > + jnz L(ret_vec_x1)
> > +
> > + KMOV %k0, %RCX
> > + /* At this point, if k0 is non zero, null char must be in the
> > + second vector. */
> > + test %RCX, %RCX
> > + jnz L(ret_vec_x2)
> > +
> > + VPTESTN %VMM3, %VMM3, %k3
> > + KMOV %k3, %RCX
> > + test %RCX, %RCX
> > + jnz L(ret_vec_x3)
> > + /* At this point null [w]char must be in the fourth vector so no
> > + need to check. */
> > + KMOV %k1, %RCX
> > +
> > + /* Fourth, third, second vector terminating are pretty much
> > + same, implemented this way to avoid branching and reuse code
> > + from pre loop exit condition. */
> > +L(ret_vec_x4):
> > + bsf %RCX, %RCX
> > + subq %rdi, %rax
> > +# ifdef USE_AS_WCSLEN
> > + subq $-(VEC_SIZE * 3), %rax
> > + shrq $2, %rax
> > + addq %rcx, %rax
> > +# else
> > + leaq (VEC_SIZE * 3)(%rcx, %rax), %rax
> > +# endif
> > +# ifdef USE_AS_STRNLEN
> > + cmpq %rsi, %rax
> > + cmovnb %rsi, %rax
> > +# endif
> > + ret
> > +
> > +L(ret_vec_x3):
> > + bsf %RCX, %RCX
> > + subq %rdi, %rax
> > +# ifdef USE_AS_WCSLEN
> > + subq $-(VEC_SIZE * 2), %rax
> > + shrq $2, %rax
> > + addq %rcx, %rax
> > +# else
> > + leaq (VEC_SIZE * 2)(%rcx, %rax), %rax
> > +# endif
> > +# ifdef USE_AS_STRNLEN
> > + cmpq %rsi, %rax
> > + cmovnb %rsi, %rax
> > +# endif
> > + ret
> > +
> > +L(ret_vec_x2):
> > + subq $-VEC_SIZE, %rax
> > +L(ret_vec_x1):
> > + bsf %RCX, %RCX
> > + subq %rdi, %rax
> > +# ifdef USE_AS_WCSLEN
> > + shrq $2, %rax
> > +# endif
> > + addq %rcx, %rax
> > +# ifdef USE_AS_STRNLEN
> > + cmpq %rsi, %rax
> > + cmovnb %rsi, %rax
> > +# endif
> > + ret
> > +
> > +L(page_cross):
> > + movl %eax, %ecx
> > +# ifdef USE_AS_WCSLEN
> > + andl $(VEC_SIZE - 1), %ecx
> > + sarl $2, %ecx
> > +# endif
> > + /* ecx contains number of w[char] to be skipped as a result
> > + of address alignment. */
> > + xorq %rdi, %rax
> > + VPCMP $0, (PAGE_SIZE - VEC_SIZE)(%rax), %VMM0, %k0
> > + KMOV %k0, %RAX
> > + /* Ignore number of character for alignment adjustment. */
> > + SHR %cl, %RAX
> > + jz L(align_more)
> > +
> > + bsf %RAX, %RAX
> > +# ifdef USE_AS_STRNLEN
> > + cmpq %rsi, %rax
> > + cmovnb %rsi, %rax
> > +# endif
> > + ret
> > +
> > +END (STRLEN)
> > +#endif
> > diff --git a/sysdeps/x86_64/multiarch/strlen-evex512.S b/sysdeps/x86_64/multiarch/strlen-evex512.S
> > new file mode 100644
> > index 0000000000..116f8981c8
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/strlen-evex512.S
> > @@ -0,0 +1,7 @@
> > +#ifndef STRLEN
> > +# define STRLEN __strlen_evex512
> > +#endif
> > +
> > +#define VEC_SIZE 64
> > +
> > +#include "strlen-evex-base.S"
> > diff --git a/sysdeps/x86_64/multiarch/strnlen-evex512.S b/sysdeps/x86_64/multiarch/strnlen-evex512.S
> > new file mode 100644
> > index 0000000000..0b7f220214
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/strnlen-evex512.S
> > @@ -0,0 +1,4 @@
> > +#define STRLEN __strnlen_evex512
> > +#define USE_AS_STRNLEN 1
> > +
> > +#include "strlen-evex512.S"
> > diff --git a/sysdeps/x86_64/multiarch/wcslen-evex512.S b/sysdeps/x86_64/multiarch/wcslen-evex512.S
> > new file mode 100644
> > index 0000000000..f59c372b78
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/wcslen-evex512.S
> > @@ -0,0 +1,4 @@
> > +#define STRLEN __wcslen_evex512
> > +#define USE_AS_WCSLEN 1
> > +
> > +#include "strlen-evex512.S"
> > diff --git a/sysdeps/x86_64/multiarch/wcsnlen-evex512.S b/sysdeps/x86_64/multiarch/wcsnlen-evex512.S
> > new file mode 100644
> > index 0000000000..73dcf2f210
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/wcsnlen-evex512.S
> > @@ -0,0 +1,5 @@
> > +#define STRLEN __wcsnlen_evex512
> > +#define USE_AS_WCSLEN 1
> > +#define USE_AS_STRNLEN 1
> > +
> > +#include "strlen-evex512.S"
> > --
> > 2.35.3
> >
^ permalink raw reply [flat|nested] 12+ messages in thread
* [PATCH v4] x86_64: Implement evex512 version of strlen, strnlen, wcslen and wcsnlen
2022-05-25 18:20 ` Sunil Pandey
@ 2022-05-26 18:35 ` Sunil K Pandey
2022-05-26 20:07 ` Noah Goldstein
0 siblings, 1 reply; 12+ messages in thread
From: Sunil K Pandey @ 2022-05-26 18:35 UTC (permalink / raw)
To: libc-alpha
This patch implements following evex512 version of string functions.
Perf gain for evex512 version is up to 50% as compared to evex,
depending on length and alignment.
Placeholder function, not used by any processor at the moment.
- String length function using 512 bit vectors.
- String N length using 512 bit vectors.
- Wide string length using 512 bit vectors.
- Wide string N length using 512 bit vectors.
---
sysdeps/x86_64/multiarch/Makefile | 4 +
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 20 ++
sysdeps/x86_64/multiarch/strlen-evex-base.S | 302 ++++++++++++++++++++
sysdeps/x86_64/multiarch/strlen-evex512.S | 7 +
sysdeps/x86_64/multiarch/strnlen-evex512.S | 4 +
sysdeps/x86_64/multiarch/wcslen-evex512.S | 4 +
sysdeps/x86_64/multiarch/wcsnlen-evex512.S | 5 +
7 files changed, 346 insertions(+)
create mode 100644 sysdeps/x86_64/multiarch/strlen-evex-base.S
create mode 100644 sysdeps/x86_64/multiarch/strlen-evex512.S
create mode 100644 sysdeps/x86_64/multiarch/strnlen-evex512.S
create mode 100644 sysdeps/x86_64/multiarch/wcslen-evex512.S
create mode 100644 sysdeps/x86_64/multiarch/wcsnlen-evex512.S
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index f3ab5e0928..d0869c3ac3 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -81,6 +81,7 @@ sysdep_routines += \
strlen-avx2 \
strlen-avx2-rtm \
strlen-evex \
+ strlen-evex512 \
strlen-sse2 \
strncase_l-avx2 \
strncase_l-avx2-rtm \
@@ -105,6 +106,7 @@ sysdep_routines += \
strnlen-avx2 \
strnlen-avx2-rtm \
strnlen-evex \
+ strnlen-evex512 \
strnlen-sse2 \
strpbrk-c \
strpbrk-sse2 \
@@ -138,6 +140,7 @@ sysdep_routines += \
wcslen-avx2 \
wcslen-avx2-rtm \
wcslen-evex \
+ wcslen-evex512 \
wcslen-sse2 \
wcslen-sse4_1 \
wcsncmp-avx2 \
@@ -148,6 +151,7 @@ sysdep_routines += \
wcsnlen-avx2-rtm \
wcsnlen-c \
wcsnlen-evex \
+ wcsnlen-evex512 \
wcsnlen-sse4_1 \
wcsrchr-avx2 \
wcsrchr-avx2-rtm \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 7218095430..c5cd9466fe 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -328,6 +328,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)),
__strlen_evex)
+ IFUNC_IMPL_ADD (array, i, strlen,
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)
+ && CPU_FEATURE_USABLE (BMI2)),
+ __strlen_evex512)
IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2))
/* Support sysdeps/x86_64/multiarch/strnlen.c. */
@@ -346,6 +351,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)),
__strnlen_evex)
+ IFUNC_IMPL_ADD (array, i, strnlen,
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)
+ && CPU_FEATURE_USABLE (BMI2)),
+ __strnlen_evex512)
IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_sse2))
/* Support sysdeps/x86_64/multiarch/stpncpy.c. */
@@ -699,6 +709,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)),
__wcslen_evex)
+ IFUNC_IMPL_ADD (array, i, wcslen,
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)
+ && CPU_FEATURE_USABLE (BMI2)),
+ __wcslen_evex512)
IFUNC_IMPL_ADD (array, i, wcslen,
CPU_FEATURE_USABLE (SSE4_1),
__wcslen_sse4_1)
@@ -720,6 +735,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)),
__wcsnlen_evex)
+ IFUNC_IMPL_ADD (array, i, wcsnlen,
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)
+ && CPU_FEATURE_USABLE (BMI2)),
+ __wcsnlen_evex512)
IFUNC_IMPL_ADD (array, i, wcsnlen,
CPU_FEATURE_USABLE (SSE4_1),
__wcsnlen_sse4_1)
diff --git a/sysdeps/x86_64/multiarch/strlen-evex-base.S b/sysdeps/x86_64/multiarch/strlen-evex-base.S
new file mode 100644
index 0000000000..278c899691
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strlen-evex-base.S
@@ -0,0 +1,302 @@
+/* Placeholder function, not used by any processor at the moment.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# ifdef USE_AS_WCSLEN
+# define VPCMP vpcmpd
+# define VPTESTN vptestnmd
+# define VPMINU vpminud
+# define CHAR_SIZE 4
+# else
+# define VPCMP vpcmpb
+# define VPTESTN vptestnmb
+# define VPMINU vpminub
+# define CHAR_SIZE 1
+# endif
+
+# define XMM0 xmm16
+# define PAGE_SIZE 4096
+# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
+
+# if VEC_SIZE == 64
+# define KMOV kmovq
+# define KORTEST kortestq
+# define RAX rax
+# define RCX rcx
+# define RDX rdx
+# define SHR shrq
+# define TEXTSUFFIX evex512
+# define VMM0 zmm16
+# define VMM1 zmm17
+# define VMM2 zmm18
+# define VMM3 zmm19
+# define VMM4 zmm20
+# define VMOVA vmovdqa64
+# elif VEC_SIZE == 32
+/* Currently Unused. */
+# define KMOV kmovd
+# define KORTEST kortestd
+# define RAX eax
+# define RCX ecx
+# define RDX edx
+# define SHR shrl
+# define TEXTSUFFIX evex256
+# define VMM0 ymm16
+# define VMM1 ymm17
+# define VMM2 ymm18
+# define VMM3 ymm19
+# define VMM4 ymm20
+# define VMOVA vmovdqa32
+# endif
+
+ .section .text.TEXTSUFFIX, "ax", @progbits
+/* Aligning entry point to 64 byte, provides better performance for
+ one vector length string. */
+ENTRY_P2ALIGN (STRLEN, 6)
+# ifdef USE_AS_STRNLEN
+ /* Check zero length. */
+ test %RSI_LP, %RSI_LP
+ jz L(ret_max)
+# ifdef __ILP32__
+ /* Clear the upper 32 bits. */
+ movl %esi, %esi
+# endif
+# endif
+
+ movl %edi, %eax
+ vpxorq %XMM0, %XMM0, %XMM0
+ andl $(PAGE_SIZE - 1), %eax
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
+ ja L(page_cross)
+
+ /* Compare [w]char for null, mask bit will be set for match. */
+ VPCMP $0, (%rdi), %VMM0, %k0
+ KMOV %k0, %RAX
+ test %RAX, %RAX
+ jz L(align_more)
+
+ bsf %RAX, %RAX
+# ifdef USE_AS_STRNLEN
+ cmpq %rsi, %rax
+ cmovnb %rsi, %rax
+# endif
+ ret
+
+ /* At this point vector max length reached. */
+# ifdef USE_AS_STRNLEN
+ .p2align 4,,3
+L(ret_max):
+ movq %rsi, %rax
+ ret
+# endif
+
+L(align_more):
+ leaq VEC_SIZE(%rdi), %rax
+ /* Align rax to VEC_SIZE. */
+ andq $-VEC_SIZE, %rax
+# ifdef USE_AS_STRNLEN
+ movq %rax, %rdx
+ subq %rdi, %rdx
+# ifdef USE_AS_WCSLEN
+ SHR $2, %RDX
+# endif
+ /* At this point rdx contains [w]chars already compared. */
+ subq %rsi, %rdx
+ jae L(ret_max)
+ negq %rdx
+ /* At this point rdx contains number of w[char] needs to go.
+ Now onwards rdx will keep decrementing with each compare. */
+# endif
+
+ /* Loop unroll 4 times for 4 vector loop. */
+ VPCMP $0, (%rax), %VMM0, %k0
+ KMOV %k0, %RCX
+ test %RCX, %RCX
+ jnz L(ret_vec_x1)
+
+# ifdef USE_AS_STRNLEN
+ subq $CHAR_PER_VEC, %rdx
+ jbe L(ret_max)
+# endif
+
+ VPCMP $0, VEC_SIZE(%rax), %VMM0, %k0
+ KMOV %k0, %RCX
+ test %RCX, %RCX
+ jnz L(ret_vec_x2)
+
+# ifdef USE_AS_STRNLEN
+ subq $CHAR_PER_VEC, %rdx
+ jbe L(ret_max)
+# endif
+
+ VPCMP $0, (VEC_SIZE * 2)(%rax), %VMM0, %k0
+ KMOV %k0, %RCX
+ test %RCX, %RCX
+ jnz L(ret_vec_x3)
+
+# ifdef USE_AS_STRNLEN
+ subq $CHAR_PER_VEC, %rdx
+ jbe L(ret_max)
+# endif
+
+ VPCMP $0, (VEC_SIZE * 3)(%rax), %VMM0, %k0
+ KMOV %k0, %RCX
+ test %RCX, %RCX
+ jnz L(ret_vec_x4)
+
+# ifdef USE_AS_STRNLEN
+ subq $CHAR_PER_VEC, %rdx
+ jbe L(ret_max)
+ /* Save pointer before 4 x VEC_SIZE alignment. */
+ movq %rax, %rcx
+# endif
+
+ /* Align address to VEC_SIZE * 4 for loop. */
+ andq $-(VEC_SIZE * 4), %rax
+
+# ifdef USE_AS_STRNLEN
+ subq %rax, %rcx
+# ifdef USE_AS_WCSLEN
+ SHR $2, %RCX
+# endif
+ /* rcx contains number of [w]char will be recompared due to
+ alignment fixes. rdx must be incremented by rcx to offset
+ alignment adjustment. */
+ addq %rcx, %rdx
+ /* Need jump as we don't want to add/subtract rdx for first
+ iteration of 4 x VEC_SIZE aligned loop. */
+ jmp L(loop_entry)
+# endif
+
+ .p2align 4,,11
+L(loop):
+# ifdef USE_AS_STRNLEN
+ subq $(CHAR_PER_VEC * 4), %rdx
+ jbe L(ret_max)
+L(loop_entry):
+# endif
+ /* VPMINU and VPCMP combination provide better performance as
+ compared to alternative combinations. */
+ VMOVA (VEC_SIZE * 4)(%rax), %VMM1
+ VPMINU (VEC_SIZE * 5)(%rax), %VMM1, %VMM2
+ VMOVA (VEC_SIZE * 6)(%rax), %VMM3
+ VPMINU (VEC_SIZE * 7)(%rax), %VMM3, %VMM4
+
+ VPTESTN %VMM2, %VMM2, %k0
+ VPTESTN %VMM4, %VMM4, %k1
+
+ subq $-(VEC_SIZE * 4), %rax
+ KORTEST %k0, %k1
+ jz L(loop)
+
+ VPTESTN %VMM1, %VMM1, %k2
+ KMOV %k2, %RCX
+ test %RCX, %RCX
+ jnz L(ret_vec_x1)
+
+ KMOV %k0, %RCX
+ /* At this point, if k0 is non zero, null char must be in the
+ second vector. */
+ test %RCX, %RCX
+ jnz L(ret_vec_x2)
+
+ VPTESTN %VMM3, %VMM3, %k3
+ KMOV %k3, %RCX
+ test %RCX, %RCX
+ jnz L(ret_vec_x3)
+ /* At this point null [w]char must be in the fourth vector so no
+ need to check. */
+ KMOV %k1, %RCX
+
+ /* Fourth, third, second vector terminating are pretty much
+ same, implemented this way to avoid branching and reuse code
+ from pre loop exit condition. */
+L(ret_vec_x4):
+ bsf %RCX, %RCX
+ subq %rdi, %rax
+# ifdef USE_AS_WCSLEN
+ subq $-(VEC_SIZE * 3), %rax
+ shrq $2, %rax
+ addq %rcx, %rax
+# else
+ leaq (VEC_SIZE * 3)(%rcx, %rax), %rax
+# endif
+# ifdef USE_AS_STRNLEN
+ cmpq %rsi, %rax
+ cmovnb %rsi, %rax
+# endif
+ ret
+
+L(ret_vec_x3):
+ bsf %RCX, %RCX
+ subq %rdi, %rax
+# ifdef USE_AS_WCSLEN
+ subq $-(VEC_SIZE * 2), %rax
+ shrq $2, %rax
+ addq %rcx, %rax
+# else
+ leaq (VEC_SIZE * 2)(%rcx, %rax), %rax
+# endif
+# ifdef USE_AS_STRNLEN
+ cmpq %rsi, %rax
+ cmovnb %rsi, %rax
+# endif
+ ret
+
+L(ret_vec_x2):
+ subq $-VEC_SIZE, %rax
+L(ret_vec_x1):
+ bsf %RCX, %RCX
+ subq %rdi, %rax
+# ifdef USE_AS_WCSLEN
+ shrq $2, %rax
+# endif
+ addq %rcx, %rax
+# ifdef USE_AS_STRNLEN
+ cmpq %rsi, %rax
+ cmovnb %rsi, %rax
+# endif
+ ret
+
+L(page_cross):
+ movl %eax, %ecx
+# ifdef USE_AS_WCSLEN
+ andl $(VEC_SIZE - 1), %ecx
+ sarl $2, %ecx
+# endif
+ /* ecx contains number of w[char] to be skipped as a result
+ of address alignment. */
+ xorq %rdi, %rax
+ VPCMP $0, (PAGE_SIZE - VEC_SIZE)(%rax), %VMM0, %k0
+ KMOV %k0, %RAX
+ /* Ignore number of character for alignment adjustment. */
+ SHR %cl, %RAX
+ jz L(align_more)
+
+ bsf %RAX, %RAX
+# ifdef USE_AS_STRNLEN
+ cmpq %rsi, %rax
+ cmovnb %rsi, %rax
+# endif
+ ret
+
+END (STRLEN)
+#endif
diff --git a/sysdeps/x86_64/multiarch/strlen-evex512.S b/sysdeps/x86_64/multiarch/strlen-evex512.S
new file mode 100644
index 0000000000..116f8981c8
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strlen-evex512.S
@@ -0,0 +1,7 @@
+#ifndef STRLEN
+# define STRLEN __strlen_evex512
+#endif
+
+#define VEC_SIZE 64
+
+#include "strlen-evex-base.S"
diff --git a/sysdeps/x86_64/multiarch/strnlen-evex512.S b/sysdeps/x86_64/multiarch/strnlen-evex512.S
new file mode 100644
index 0000000000..0b7f220214
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strnlen-evex512.S
@@ -0,0 +1,4 @@
+#define STRLEN __strnlen_evex512
+#define USE_AS_STRNLEN 1
+
+#include "strlen-evex512.S"
diff --git a/sysdeps/x86_64/multiarch/wcslen-evex512.S b/sysdeps/x86_64/multiarch/wcslen-evex512.S
new file mode 100644
index 0000000000..f59c372b78
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcslen-evex512.S
@@ -0,0 +1,4 @@
+#define STRLEN __wcslen_evex512
+#define USE_AS_WCSLEN 1
+
+#include "strlen-evex512.S"
diff --git a/sysdeps/x86_64/multiarch/wcsnlen-evex512.S b/sysdeps/x86_64/multiarch/wcsnlen-evex512.S
new file mode 100644
index 0000000000..73dcf2f210
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsnlen-evex512.S
@@ -0,0 +1,5 @@
+#define STRLEN __wcsnlen_evex512
+#define USE_AS_WCSLEN 1
+#define USE_AS_STRNLEN 1
+
+#include "strlen-evex512.S"
--
2.35.3
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH v4] x86_64: Implement evex512 version of strlen, strnlen, wcslen and wcsnlen
2022-05-26 18:35 ` [PATCH v4] " Sunil K Pandey
@ 2022-05-26 20:07 ` Noah Goldstein
2022-07-14 0:03 ` Sunil Pandey
0 siblings, 1 reply; 12+ messages in thread
From: Noah Goldstein @ 2022-05-26 20:07 UTC (permalink / raw)
To: Sunil K Pandey; +Cc: GNU C Library
On Thu, May 26, 2022 at 1:36 PM Sunil K Pandey via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> This patch implements following evex512 version of string functions.
> Perf gain for evex512 version is up to 50% as compared to evex,
> depending on length and alignment.
>
> Placeholder function, not used by any processor at the moment.
>
> - String length function using 512 bit vectors.
> - String N length using 512 bit vectors.
> - Wide string length using 512 bit vectors.
> - Wide string N length using 512 bit vectors.
> ---
> sysdeps/x86_64/multiarch/Makefile | 4 +
> sysdeps/x86_64/multiarch/ifunc-impl-list.c | 20 ++
> sysdeps/x86_64/multiarch/strlen-evex-base.S | 302 ++++++++++++++++++++
> sysdeps/x86_64/multiarch/strlen-evex512.S | 7 +
> sysdeps/x86_64/multiarch/strnlen-evex512.S | 4 +
> sysdeps/x86_64/multiarch/wcslen-evex512.S | 4 +
> sysdeps/x86_64/multiarch/wcsnlen-evex512.S | 5 +
> 7 files changed, 346 insertions(+)
> create mode 100644 sysdeps/x86_64/multiarch/strlen-evex-base.S
> create mode 100644 sysdeps/x86_64/multiarch/strlen-evex512.S
> create mode 100644 sysdeps/x86_64/multiarch/strnlen-evex512.S
> create mode 100644 sysdeps/x86_64/multiarch/wcslen-evex512.S
> create mode 100644 sysdeps/x86_64/multiarch/wcsnlen-evex512.S
>
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index f3ab5e0928..d0869c3ac3 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -81,6 +81,7 @@ sysdep_routines += \
> strlen-avx2 \
> strlen-avx2-rtm \
> strlen-evex \
> + strlen-evex512 \
> strlen-sse2 \
> strncase_l-avx2 \
> strncase_l-avx2-rtm \
> @@ -105,6 +106,7 @@ sysdep_routines += \
> strnlen-avx2 \
> strnlen-avx2-rtm \
> strnlen-evex \
> + strnlen-evex512 \
> strnlen-sse2 \
> strpbrk-c \
> strpbrk-sse2 \
> @@ -138,6 +140,7 @@ sysdep_routines += \
> wcslen-avx2 \
> wcslen-avx2-rtm \
> wcslen-evex \
> + wcslen-evex512 \
> wcslen-sse2 \
> wcslen-sse4_1 \
> wcsncmp-avx2 \
> @@ -148,6 +151,7 @@ sysdep_routines += \
> wcsnlen-avx2-rtm \
> wcsnlen-c \
> wcsnlen-evex \
> + wcsnlen-evex512 \
> wcsnlen-sse4_1 \
> wcsrchr-avx2 \
> wcsrchr-avx2-rtm \
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index 7218095430..c5cd9466fe 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -328,6 +328,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> && CPU_FEATURE_USABLE (AVX512BW)
> && CPU_FEATURE_USABLE (BMI2)),
> __strlen_evex)
> + IFUNC_IMPL_ADD (array, i, strlen,
> + (CPU_FEATURE_USABLE (AVX512VL)
> + && CPU_FEATURE_USABLE (AVX512BW)
> + && CPU_FEATURE_USABLE (BMI2)),
> + __strlen_evex512)
> IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2))
>
> /* Support sysdeps/x86_64/multiarch/strnlen.c. */
> @@ -346,6 +351,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> && CPU_FEATURE_USABLE (AVX512BW)
> && CPU_FEATURE_USABLE (BMI2)),
> __strnlen_evex)
> + IFUNC_IMPL_ADD (array, i, strnlen,
> + (CPU_FEATURE_USABLE (AVX512VL)
> + && CPU_FEATURE_USABLE (AVX512BW)
> + && CPU_FEATURE_USABLE (BMI2)),
> + __strnlen_evex512)
> IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_sse2))
>
> /* Support sysdeps/x86_64/multiarch/stpncpy.c. */
> @@ -699,6 +709,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> && CPU_FEATURE_USABLE (AVX512BW)
> && CPU_FEATURE_USABLE (BMI2)),
> __wcslen_evex)
> + IFUNC_IMPL_ADD (array, i, wcslen,
> + (CPU_FEATURE_USABLE (AVX512VL)
> + && CPU_FEATURE_USABLE (AVX512BW)
> + && CPU_FEATURE_USABLE (BMI2)),
> + __wcslen_evex512)
> IFUNC_IMPL_ADD (array, i, wcslen,
> CPU_FEATURE_USABLE (SSE4_1),
> __wcslen_sse4_1)
> @@ -720,6 +735,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> && CPU_FEATURE_USABLE (AVX512BW)
> && CPU_FEATURE_USABLE (BMI2)),
> __wcsnlen_evex)
> + IFUNC_IMPL_ADD (array, i, wcsnlen,
> + (CPU_FEATURE_USABLE (AVX512VL)
> + && CPU_FEATURE_USABLE (AVX512BW)
> + && CPU_FEATURE_USABLE (BMI2)),
> + __wcsnlen_evex512)
> IFUNC_IMPL_ADD (array, i, wcsnlen,
> CPU_FEATURE_USABLE (SSE4_1),
> __wcsnlen_sse4_1)
> diff --git a/sysdeps/x86_64/multiarch/strlen-evex-base.S b/sysdeps/x86_64/multiarch/strlen-evex-base.S
> new file mode 100644
> index 0000000000..278c899691
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strlen-evex-base.S
> @@ -0,0 +1,302 @@
> +/* Placeholder function, not used by any processor at the moment.
> + Copyright (C) 2022 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#if IS_IN (libc)
> +
> +# include <sysdep.h>
> +
> +# ifdef USE_AS_WCSLEN
> +# define VPCMP vpcmpd
> +# define VPTESTN vptestnmd
> +# define VPMINU vpminud
> +# define CHAR_SIZE 4
> +# else
> +# define VPCMP vpcmpb
> +# define VPTESTN vptestnmb
> +# define VPMINU vpminub
> +# define CHAR_SIZE 1
> +# endif
> +
> +# define XMM0 xmm16
> +# define PAGE_SIZE 4096
> +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
> +
> +# if VEC_SIZE == 64
> +# define KMOV kmovq
> +# define KORTEST kortestq
> +# define RAX rax
> +# define RCX rcx
> +# define RDX rdx
> +# define SHR shrq
> +# define TEXTSUFFIX evex512
> +# define VMM0 zmm16
> +# define VMM1 zmm17
> +# define VMM2 zmm18
> +# define VMM3 zmm19
> +# define VMM4 zmm20
> +# define VMOVA vmovdqa64
> +# elif VEC_SIZE == 32
> +/* Currently Unused. */
> +# define KMOV kmovd
> +# define KORTEST kortestd
> +# define RAX eax
> +# define RCX ecx
> +# define RDX edx
> +# define SHR shrl
> +# define TEXTSUFFIX evex256
> +# define VMM0 ymm16
> +# define VMM1 ymm17
> +# define VMM2 ymm18
> +# define VMM3 ymm19
> +# define VMM4 ymm20
> +# define VMOVA vmovdqa32
> +# endif
> +
> + .section .text.TEXTSUFFIX, "ax", @progbits
> +/* Aligning entry point to 64 byte, provides better performance for
> + one vector length string. */
> +ENTRY_P2ALIGN (STRLEN, 6)
> +# ifdef USE_AS_STRNLEN
> + /* Check zero length. */
> + test %RSI_LP, %RSI_LP
> + jz L(ret_max)
> +# ifdef __ILP32__
> + /* Clear the upper 32 bits. */
> + movl %esi, %esi
> +# endif
> +# endif
> +
> + movl %edi, %eax
> + vpxorq %XMM0, %XMM0, %XMM0
> + andl $(PAGE_SIZE - 1), %eax
> + cmpl $(PAGE_SIZE - VEC_SIZE), %eax
> + ja L(page_cross)
> +
> + /* Compare [w]char for null, mask bit will be set for match. */
> + VPCMP $0, (%rdi), %VMM0, %k0
> + KMOV %k0, %RAX
> + test %RAX, %RAX
> + jz L(align_more)
> +
> + bsf %RAX, %RAX
> +# ifdef USE_AS_STRNLEN
> + cmpq %rsi, %rax
> + cmovnb %rsi, %rax
> +# endif
> + ret
> +
> + /* At this point vector max length reached. */
> +# ifdef USE_AS_STRNLEN
> + .p2align 4,,3
> +L(ret_max):
> + movq %rsi, %rax
> + ret
> +# endif
> +
> +L(align_more):
> + leaq VEC_SIZE(%rdi), %rax
> + /* Align rax to VEC_SIZE. */
> + andq $-VEC_SIZE, %rax
> +# ifdef USE_AS_STRNLEN
> + movq %rax, %rdx
> + subq %rdi, %rdx
> +# ifdef USE_AS_WCSLEN
> + SHR $2, %RDX
> +# endif
> + /* At this point rdx contains [w]chars already compared. */
> + subq %rsi, %rdx
> + jae L(ret_max)
> + negq %rdx
> + /* At this point rdx contains number of w[char] needs to go.
> + Now onwards rdx will keep decrementing with each compare. */
> +# endif
> +
> + /* Loop unroll 4 times for 4 vector loop. */
> + VPCMP $0, (%rax), %VMM0, %k0
> + KMOV %k0, %RCX
> + test %RCX, %RCX
> + jnz L(ret_vec_x1)
> +
> +# ifdef USE_AS_STRNLEN
> + subq $CHAR_PER_VEC, %rdx
> + jbe L(ret_max)
> +# endif
> +
> + VPCMP $0, VEC_SIZE(%rax), %VMM0, %k0
> + KMOV %k0, %RCX
> + test %RCX, %RCX
> + jnz L(ret_vec_x2)
> +
> +# ifdef USE_AS_STRNLEN
> + subq $CHAR_PER_VEC, %rdx
> + jbe L(ret_max)
> +# endif
> +
> + VPCMP $0, (VEC_SIZE * 2)(%rax), %VMM0, %k0
> + KMOV %k0, %RCX
> + test %RCX, %RCX
> + jnz L(ret_vec_x3)
> +
> +# ifdef USE_AS_STRNLEN
> + subq $CHAR_PER_VEC, %rdx
> + jbe L(ret_max)
> +# endif
> +
> + VPCMP $0, (VEC_SIZE * 3)(%rax), %VMM0, %k0
> + KMOV %k0, %RCX
> + test %RCX, %RCX
> + jnz L(ret_vec_x4)
> +
> +# ifdef USE_AS_STRNLEN
> + subq $CHAR_PER_VEC, %rdx
> + jbe L(ret_max)
> + /* Save pointer before 4 x VEC_SIZE alignment. */
> + movq %rax, %rcx
> +# endif
> +
> + /* Align address to VEC_SIZE * 4 for loop. */
> + andq $-(VEC_SIZE * 4), %rax
> +
> +# ifdef USE_AS_STRNLEN
> + subq %rax, %rcx
> +# ifdef USE_AS_WCSLEN
> + SHR $2, %RCX
> +# endif
> + /* rcx contains number of [w]char will be recompared due to
> + alignment fixes. rdx must be incremented by rcx to offset
> + alignment adjustment. */
> + addq %rcx, %rdx
> + /* Need jump as we don't want to add/subtract rdx for first
> + iteration of 4 x VEC_SIZE aligned loop. */
> + jmp L(loop_entry)
> +# endif
> +
> + .p2align 4,,11
> +L(loop):
> +# ifdef USE_AS_STRNLEN
> + subq $(CHAR_PER_VEC * 4), %rdx
> + jbe L(ret_max)
> +L(loop_entry):
> +# endif
> + /* VPMINU and VPCMP combination provide better performance as
> + compared to alternative combinations. */
> + VMOVA (VEC_SIZE * 4)(%rax), %VMM1
> + VPMINU (VEC_SIZE * 5)(%rax), %VMM1, %VMM2
> + VMOVA (VEC_SIZE * 6)(%rax), %VMM3
> + VPMINU (VEC_SIZE * 7)(%rax), %VMM3, %VMM4
> +
> + VPTESTN %VMM2, %VMM2, %k0
> + VPTESTN %VMM4, %VMM4, %k1
> +
> + subq $-(VEC_SIZE * 4), %rax
> + KORTEST %k0, %k1
> + jz L(loop)
> +
> + VPTESTN %VMM1, %VMM1, %k2
> + KMOV %k2, %RCX
> + test %RCX, %RCX
> + jnz L(ret_vec_x1)
> +
> + KMOV %k0, %RCX
> + /* At this point, if k0 is non zero, null char must be in the
> + second vector. */
> + test %RCX, %RCX
> + jnz L(ret_vec_x2)
> +
> + VPTESTN %VMM3, %VMM3, %k3
> + KMOV %k3, %RCX
> + test %RCX, %RCX
> + jnz L(ret_vec_x3)
> + /* At this point null [w]char must be in the fourth vector so no
> + need to check. */
> + KMOV %k1, %RCX
> +
> + /* Fourth, third, second vector terminating are pretty much
> + same, implemented this way to avoid branching and reuse code
> + from pre loop exit condition. */
> +L(ret_vec_x4):
> + bsf %RCX, %RCX
> + subq %rdi, %rax
> +# ifdef USE_AS_WCSLEN
> + subq $-(VEC_SIZE * 3), %rax
> + shrq $2, %rax
> + addq %rcx, %rax
> +# else
> + leaq (VEC_SIZE * 3)(%rcx, %rax), %rax
> +# endif
> +# ifdef USE_AS_STRNLEN
> + cmpq %rsi, %rax
> + cmovnb %rsi, %rax
> +# endif
> + ret
> +
> +L(ret_vec_x3):
> + bsf %RCX, %RCX
> + subq %rdi, %rax
> +# ifdef USE_AS_WCSLEN
> + subq $-(VEC_SIZE * 2), %rax
> + shrq $2, %rax
> + addq %rcx, %rax
> +# else
> + leaq (VEC_SIZE * 2)(%rcx, %rax), %rax
> +# endif
> +# ifdef USE_AS_STRNLEN
> + cmpq %rsi, %rax
> + cmovnb %rsi, %rax
> +# endif
> + ret
> +
> +L(ret_vec_x2):
> + subq $-VEC_SIZE, %rax
> +L(ret_vec_x1):
> + bsf %RCX, %RCX
> + subq %rdi, %rax
> +# ifdef USE_AS_WCSLEN
> + shrq $2, %rax
> +# endif
> + addq %rcx, %rax
> +# ifdef USE_AS_STRNLEN
> + cmpq %rsi, %rax
> + cmovnb %rsi, %rax
> +# endif
> + ret
> +
> +L(page_cross):
> + movl %eax, %ecx
> +# ifdef USE_AS_WCSLEN
> + andl $(VEC_SIZE - 1), %ecx
> + sarl $2, %ecx
> +# endif
> + /* ecx contains number of w[char] to be skipped as a result
> + of address alignment. */
> + xorq %rdi, %rax
> + VPCMP $0, (PAGE_SIZE - VEC_SIZE)(%rax), %VMM0, %k0
> + KMOV %k0, %RAX
> + /* Ignore number of character for alignment adjustment. */
> + SHR %cl, %RAX
> + jz L(align_more)
> +
> + bsf %RAX, %RAX
> +# ifdef USE_AS_STRNLEN
> + cmpq %rsi, %rax
> + cmovnb %rsi, %rax
> +# endif
> + ret
> +
> +END (STRLEN)
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/strlen-evex512.S b/sysdeps/x86_64/multiarch/strlen-evex512.S
> new file mode 100644
> index 0000000000..116f8981c8
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strlen-evex512.S
> @@ -0,0 +1,7 @@
> +#ifndef STRLEN
> +# define STRLEN __strlen_evex512
> +#endif
> +
> +#define VEC_SIZE 64
> +
> +#include "strlen-evex-base.S"
> diff --git a/sysdeps/x86_64/multiarch/strnlen-evex512.S b/sysdeps/x86_64/multiarch/strnlen-evex512.S
> new file mode 100644
> index 0000000000..0b7f220214
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strnlen-evex512.S
> @@ -0,0 +1,4 @@
> +#define STRLEN __strnlen_evex512
> +#define USE_AS_STRNLEN 1
> +
> +#include "strlen-evex512.S"
> diff --git a/sysdeps/x86_64/multiarch/wcslen-evex512.S b/sysdeps/x86_64/multiarch/wcslen-evex512.S
> new file mode 100644
> index 0000000000..f59c372b78
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/wcslen-evex512.S
> @@ -0,0 +1,4 @@
> +#define STRLEN __wcslen_evex512
> +#define USE_AS_WCSLEN 1
> +
> +#include "strlen-evex512.S"
> diff --git a/sysdeps/x86_64/multiarch/wcsnlen-evex512.S b/sysdeps/x86_64/multiarch/wcsnlen-evex512.S
> new file mode 100644
> index 0000000000..73dcf2f210
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/wcsnlen-evex512.S
> @@ -0,0 +1,5 @@
> +#define STRLEN __wcsnlen_evex512
> +#define USE_AS_WCSLEN 1
> +#define USE_AS_STRNLEN 1
> +
> +#include "strlen-evex512.S"
> --
> 2.35.3
>
LGTM.
Reviewed-by: Noah Goldstein <goldstein.w.n@gmail.com>
^ permalink raw reply [flat|nested] 12+ messages in thread
* Re: [PATCH v4] x86_64: Implement evex512 version of strlen, strnlen, wcslen and wcsnlen
2022-05-26 20:07 ` Noah Goldstein
@ 2022-07-14 0:03 ` Sunil Pandey
0 siblings, 0 replies; 12+ messages in thread
From: Sunil Pandey @ 2022-07-14 0:03 UTC (permalink / raw)
To: Noah Goldstein; +Cc: GNU C Library
On Thu, May 26, 2022 at 1:07 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Thu, May 26, 2022 at 1:36 PM Sunil K Pandey via Libc-alpha
> <libc-alpha@sourceware.org> wrote:
> >
> > This patch implements following evex512 version of string functions.
> > Perf gain for evex512 version is up to 50% as compared to evex,
> > depending on length and alignment.
> >
> > Placeholder function, not used by any processor at the moment.
> >
> > - String length function using 512 bit vectors.
> > - String N length using 512 bit vectors.
> > - Wide string length using 512 bit vectors.
> > - Wide string N length using 512 bit vectors.
> > ---
> > sysdeps/x86_64/multiarch/Makefile | 4 +
> > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 20 ++
> > sysdeps/x86_64/multiarch/strlen-evex-base.S | 302 ++++++++++++++++++++
> > sysdeps/x86_64/multiarch/strlen-evex512.S | 7 +
> > sysdeps/x86_64/multiarch/strnlen-evex512.S | 4 +
> > sysdeps/x86_64/multiarch/wcslen-evex512.S | 4 +
> > sysdeps/x86_64/multiarch/wcsnlen-evex512.S | 5 +
> > 7 files changed, 346 insertions(+)
> > create mode 100644 sysdeps/x86_64/multiarch/strlen-evex-base.S
> > create mode 100644 sysdeps/x86_64/multiarch/strlen-evex512.S
> > create mode 100644 sysdeps/x86_64/multiarch/strnlen-evex512.S
> > create mode 100644 sysdeps/x86_64/multiarch/wcslen-evex512.S
> > create mode 100644 sysdeps/x86_64/multiarch/wcsnlen-evex512.S
> >
> > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> > index f3ab5e0928..d0869c3ac3 100644
> > --- a/sysdeps/x86_64/multiarch/Makefile
> > +++ b/sysdeps/x86_64/multiarch/Makefile
> > @@ -81,6 +81,7 @@ sysdep_routines += \
> > strlen-avx2 \
> > strlen-avx2-rtm \
> > strlen-evex \
> > + strlen-evex512 \
> > strlen-sse2 \
> > strncase_l-avx2 \
> > strncase_l-avx2-rtm \
> > @@ -105,6 +106,7 @@ sysdep_routines += \
> > strnlen-avx2 \
> > strnlen-avx2-rtm \
> > strnlen-evex \
> > + strnlen-evex512 \
> > strnlen-sse2 \
> > strpbrk-c \
> > strpbrk-sse2 \
> > @@ -138,6 +140,7 @@ sysdep_routines += \
> > wcslen-avx2 \
> > wcslen-avx2-rtm \
> > wcslen-evex \
> > + wcslen-evex512 \
> > wcslen-sse2 \
> > wcslen-sse4_1 \
> > wcsncmp-avx2 \
> > @@ -148,6 +151,7 @@ sysdep_routines += \
> > wcsnlen-avx2-rtm \
> > wcsnlen-c \
> > wcsnlen-evex \
> > + wcsnlen-evex512 \
> > wcsnlen-sse4_1 \
> > wcsrchr-avx2 \
> > wcsrchr-avx2-rtm \
> > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > index 7218095430..c5cd9466fe 100644
> > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > @@ -328,6 +328,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > && CPU_FEATURE_USABLE (AVX512BW)
> > && CPU_FEATURE_USABLE (BMI2)),
> > __strlen_evex)
> > + IFUNC_IMPL_ADD (array, i, strlen,
> > + (CPU_FEATURE_USABLE (AVX512VL)
> > + && CPU_FEATURE_USABLE (AVX512BW)
> > + && CPU_FEATURE_USABLE (BMI2)),
> > + __strlen_evex512)
> > IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2))
> >
> > /* Support sysdeps/x86_64/multiarch/strnlen.c. */
> > @@ -346,6 +351,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > && CPU_FEATURE_USABLE (AVX512BW)
> > && CPU_FEATURE_USABLE (BMI2)),
> > __strnlen_evex)
> > + IFUNC_IMPL_ADD (array, i, strnlen,
> > + (CPU_FEATURE_USABLE (AVX512VL)
> > + && CPU_FEATURE_USABLE (AVX512BW)
> > + && CPU_FEATURE_USABLE (BMI2)),
> > + __strnlen_evex512)
> > IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_sse2))
> >
> > /* Support sysdeps/x86_64/multiarch/stpncpy.c. */
> > @@ -699,6 +709,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > && CPU_FEATURE_USABLE (AVX512BW)
> > && CPU_FEATURE_USABLE (BMI2)),
> > __wcslen_evex)
> > + IFUNC_IMPL_ADD (array, i, wcslen,
> > + (CPU_FEATURE_USABLE (AVX512VL)
> > + && CPU_FEATURE_USABLE (AVX512BW)
> > + && CPU_FEATURE_USABLE (BMI2)),
> > + __wcslen_evex512)
> > IFUNC_IMPL_ADD (array, i, wcslen,
> > CPU_FEATURE_USABLE (SSE4_1),
> > __wcslen_sse4_1)
> > @@ -720,6 +735,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > && CPU_FEATURE_USABLE (AVX512BW)
> > && CPU_FEATURE_USABLE (BMI2)),
> > __wcsnlen_evex)
> > + IFUNC_IMPL_ADD (array, i, wcsnlen,
> > + (CPU_FEATURE_USABLE (AVX512VL)
> > + && CPU_FEATURE_USABLE (AVX512BW)
> > + && CPU_FEATURE_USABLE (BMI2)),
> > + __wcsnlen_evex512)
> > IFUNC_IMPL_ADD (array, i, wcsnlen,
> > CPU_FEATURE_USABLE (SSE4_1),
> > __wcsnlen_sse4_1)
> > diff --git a/sysdeps/x86_64/multiarch/strlen-evex-base.S b/sysdeps/x86_64/multiarch/strlen-evex-base.S
> > new file mode 100644
> > index 0000000000..278c899691
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/strlen-evex-base.S
> > @@ -0,0 +1,302 @@
> > +/* Placeholder function, not used by any processor at the moment.
> > + Copyright (C) 2022 Free Software Foundation, Inc.
> > + This file is part of the GNU C Library.
> > +
> > + The GNU C Library is free software; you can redistribute it and/or
> > + modify it under the terms of the GNU Lesser General Public
> > + License as published by the Free Software Foundation; either
> > + version 2.1 of the License, or (at your option) any later version.
> > +
> > + The GNU C Library is distributed in the hope that it will be useful,
> > + but WITHOUT ANY WARRANTY; without even the implied warranty of
> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> > + Lesser General Public License for more details.
> > +
> > + You should have received a copy of the GNU Lesser General Public
> > + License along with the GNU C Library; if not, see
> > + <https://www.gnu.org/licenses/>. */
> > +
> > +#if IS_IN (libc)
> > +
> > +# include <sysdep.h>
> > +
> > +# ifdef USE_AS_WCSLEN
> > +# define VPCMP vpcmpd
> > +# define VPTESTN vptestnmd
> > +# define VPMINU vpminud
> > +# define CHAR_SIZE 4
> > +# else
> > +# define VPCMP vpcmpb
> > +# define VPTESTN vptestnmb
> > +# define VPMINU vpminub
> > +# define CHAR_SIZE 1
> > +# endif
> > +
> > +# define XMM0 xmm16
> > +# define PAGE_SIZE 4096
> > +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
> > +
> > +# if VEC_SIZE == 64
> > +# define KMOV kmovq
> > +# define KORTEST kortestq
> > +# define RAX rax
> > +# define RCX rcx
> > +# define RDX rdx
> > +# define SHR shrq
> > +# define TEXTSUFFIX evex512
> > +# define VMM0 zmm16
> > +# define VMM1 zmm17
> > +# define VMM2 zmm18
> > +# define VMM3 zmm19
> > +# define VMM4 zmm20
> > +# define VMOVA vmovdqa64
> > +# elif VEC_SIZE == 32
> > +/* Currently Unused. */
> > +# define KMOV kmovd
> > +# define KORTEST kortestd
> > +# define RAX eax
> > +# define RCX ecx
> > +# define RDX edx
> > +# define SHR shrl
> > +# define TEXTSUFFIX evex256
> > +# define VMM0 ymm16
> > +# define VMM1 ymm17
> > +# define VMM2 ymm18
> > +# define VMM3 ymm19
> > +# define VMM4 ymm20
> > +# define VMOVA vmovdqa32
> > +# endif
> > +
> > + .section .text.TEXTSUFFIX, "ax", @progbits
> > +/* Aligning entry point to 64 byte, provides better performance for
> > + one vector length string. */
> > +ENTRY_P2ALIGN (STRLEN, 6)
> > +# ifdef USE_AS_STRNLEN
> > + /* Check zero length. */
> > + test %RSI_LP, %RSI_LP
> > + jz L(ret_max)
> > +# ifdef __ILP32__
> > + /* Clear the upper 32 bits. */
> > + movl %esi, %esi
> > +# endif
> > +# endif
> > +
> > + movl %edi, %eax
> > + vpxorq %XMM0, %XMM0, %XMM0
> > + andl $(PAGE_SIZE - 1), %eax
> > + cmpl $(PAGE_SIZE - VEC_SIZE), %eax
> > + ja L(page_cross)
> > +
> > + /* Compare [w]char for null, mask bit will be set for match. */
> > + VPCMP $0, (%rdi), %VMM0, %k0
> > + KMOV %k0, %RAX
> > + test %RAX, %RAX
> > + jz L(align_more)
> > +
> > + bsf %RAX, %RAX
> > +# ifdef USE_AS_STRNLEN
> > + cmpq %rsi, %rax
> > + cmovnb %rsi, %rax
> > +# endif
> > + ret
> > +
> > + /* At this point vector max length reached. */
> > +# ifdef USE_AS_STRNLEN
> > + .p2align 4,,3
> > +L(ret_max):
> > + movq %rsi, %rax
> > + ret
> > +# endif
> > +
> > +L(align_more):
> > + leaq VEC_SIZE(%rdi), %rax
> > + /* Align rax to VEC_SIZE. */
> > + andq $-VEC_SIZE, %rax
> > +# ifdef USE_AS_STRNLEN
> > + movq %rax, %rdx
> > + subq %rdi, %rdx
> > +# ifdef USE_AS_WCSLEN
> > + SHR $2, %RDX
> > +# endif
> > + /* At this point rdx contains [w]chars already compared. */
> > + subq %rsi, %rdx
> > + jae L(ret_max)
> > + negq %rdx
> > + /* At this point rdx contains number of w[char] needs to go.
> > + Now onwards rdx will keep decrementing with each compare. */
> > +# endif
> > +
> > + /* Loop unroll 4 times for 4 vector loop. */
> > + VPCMP $0, (%rax), %VMM0, %k0
> > + KMOV %k0, %RCX
> > + test %RCX, %RCX
> > + jnz L(ret_vec_x1)
> > +
> > +# ifdef USE_AS_STRNLEN
> > + subq $CHAR_PER_VEC, %rdx
> > + jbe L(ret_max)
> > +# endif
> > +
> > + VPCMP $0, VEC_SIZE(%rax), %VMM0, %k0
> > + KMOV %k0, %RCX
> > + test %RCX, %RCX
> > + jnz L(ret_vec_x2)
> > +
> > +# ifdef USE_AS_STRNLEN
> > + subq $CHAR_PER_VEC, %rdx
> > + jbe L(ret_max)
> > +# endif
> > +
> > + VPCMP $0, (VEC_SIZE * 2)(%rax), %VMM0, %k0
> > + KMOV %k0, %RCX
> > + test %RCX, %RCX
> > + jnz L(ret_vec_x3)
> > +
> > +# ifdef USE_AS_STRNLEN
> > + subq $CHAR_PER_VEC, %rdx
> > + jbe L(ret_max)
> > +# endif
> > +
> > + VPCMP $0, (VEC_SIZE * 3)(%rax), %VMM0, %k0
> > + KMOV %k0, %RCX
> > + test %RCX, %RCX
> > + jnz L(ret_vec_x4)
> > +
> > +# ifdef USE_AS_STRNLEN
> > + subq $CHAR_PER_VEC, %rdx
> > + jbe L(ret_max)
> > + /* Save pointer before 4 x VEC_SIZE alignment. */
> > + movq %rax, %rcx
> > +# endif
> > +
> > + /* Align address to VEC_SIZE * 4 for loop. */
> > + andq $-(VEC_SIZE * 4), %rax
> > +
> > +# ifdef USE_AS_STRNLEN
> > + subq %rax, %rcx
> > +# ifdef USE_AS_WCSLEN
> > + SHR $2, %RCX
> > +# endif
> > + /* rcx contains number of [w]char will be recompared due to
> > + alignment fixes. rdx must be incremented by rcx to offset
> > + alignment adjustment. */
> > + addq %rcx, %rdx
> > + /* Need jump as we don't want to add/subtract rdx for first
> > + iteration of 4 x VEC_SIZE aligned loop. */
> > + jmp L(loop_entry)
> > +# endif
> > +
> > + .p2align 4,,11
> > +L(loop):
> > +# ifdef USE_AS_STRNLEN
> > + subq $(CHAR_PER_VEC * 4), %rdx
> > + jbe L(ret_max)
> > +L(loop_entry):
> > +# endif
> > + /* VPMINU and VPCMP combination provide better performance as
> > + compared to alternative combinations. */
> > + VMOVA (VEC_SIZE * 4)(%rax), %VMM1
> > + VPMINU (VEC_SIZE * 5)(%rax), %VMM1, %VMM2
> > + VMOVA (VEC_SIZE * 6)(%rax), %VMM3
> > + VPMINU (VEC_SIZE * 7)(%rax), %VMM3, %VMM4
> > +
> > + VPTESTN %VMM2, %VMM2, %k0
> > + VPTESTN %VMM4, %VMM4, %k1
> > +
> > + subq $-(VEC_SIZE * 4), %rax
> > + KORTEST %k0, %k1
> > + jz L(loop)
> > +
> > + VPTESTN %VMM1, %VMM1, %k2
> > + KMOV %k2, %RCX
> > + test %RCX, %RCX
> > + jnz L(ret_vec_x1)
> > +
> > + KMOV %k0, %RCX
> > + /* At this point, if k0 is non zero, null char must be in the
> > + second vector. */
> > + test %RCX, %RCX
> > + jnz L(ret_vec_x2)
> > +
> > + VPTESTN %VMM3, %VMM3, %k3
> > + KMOV %k3, %RCX
> > + test %RCX, %RCX
> > + jnz L(ret_vec_x3)
> > + /* At this point null [w]char must be in the fourth vector so no
> > + need to check. */
> > + KMOV %k1, %RCX
> > +
> > + /* Fourth, third, second vector terminating are pretty much
> > + same, implemented this way to avoid branching and reuse code
> > + from pre loop exit condition. */
> > +L(ret_vec_x4):
> > + bsf %RCX, %RCX
> > + subq %rdi, %rax
> > +# ifdef USE_AS_WCSLEN
> > + subq $-(VEC_SIZE * 3), %rax
> > + shrq $2, %rax
> > + addq %rcx, %rax
> > +# else
> > + leaq (VEC_SIZE * 3)(%rcx, %rax), %rax
> > +# endif
> > +# ifdef USE_AS_STRNLEN
> > + cmpq %rsi, %rax
> > + cmovnb %rsi, %rax
> > +# endif
> > + ret
> > +
> > +L(ret_vec_x3):
> > + bsf %RCX, %RCX
> > + subq %rdi, %rax
> > +# ifdef USE_AS_WCSLEN
> > + subq $-(VEC_SIZE * 2), %rax
> > + shrq $2, %rax
> > + addq %rcx, %rax
> > +# else
> > + leaq (VEC_SIZE * 2)(%rcx, %rax), %rax
> > +# endif
> > +# ifdef USE_AS_STRNLEN
> > + cmpq %rsi, %rax
> > + cmovnb %rsi, %rax
> > +# endif
> > + ret
> > +
> > +L(ret_vec_x2):
> > + subq $-VEC_SIZE, %rax
> > +L(ret_vec_x1):
> > + bsf %RCX, %RCX
> > + subq %rdi, %rax
> > +# ifdef USE_AS_WCSLEN
> > + shrq $2, %rax
> > +# endif
> > + addq %rcx, %rax
> > +# ifdef USE_AS_STRNLEN
> > + cmpq %rsi, %rax
> > + cmovnb %rsi, %rax
> > +# endif
> > + ret
> > +
> > +L(page_cross):
> > + movl %eax, %ecx
> > +# ifdef USE_AS_WCSLEN
> > + andl $(VEC_SIZE - 1), %ecx
> > + sarl $2, %ecx
> > +# endif
> > + /* ecx contains number of w[char] to be skipped as a result
> > + of address alignment. */
> > + xorq %rdi, %rax
> > + VPCMP $0, (PAGE_SIZE - VEC_SIZE)(%rax), %VMM0, %k0
> > + KMOV %k0, %RAX
> > + /* Ignore number of character for alignment adjustment. */
> > + SHR %cl, %RAX
> > + jz L(align_more)
> > +
> > + bsf %RAX, %RAX
> > +# ifdef USE_AS_STRNLEN
> > + cmpq %rsi, %rax
> > + cmovnb %rsi, %rax
> > +# endif
> > + ret
> > +
> > +END (STRLEN)
> > +#endif
> > diff --git a/sysdeps/x86_64/multiarch/strlen-evex512.S b/sysdeps/x86_64/multiarch/strlen-evex512.S
> > new file mode 100644
> > index 0000000000..116f8981c8
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/strlen-evex512.S
> > @@ -0,0 +1,7 @@
> > +#ifndef STRLEN
> > +# define STRLEN __strlen_evex512
> > +#endif
> > +
> > +#define VEC_SIZE 64
> > +
> > +#include "strlen-evex-base.S"
> > diff --git a/sysdeps/x86_64/multiarch/strnlen-evex512.S b/sysdeps/x86_64/multiarch/strnlen-evex512.S
> > new file mode 100644
> > index 0000000000..0b7f220214
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/strnlen-evex512.S
> > @@ -0,0 +1,4 @@
> > +#define STRLEN __strnlen_evex512
> > +#define USE_AS_STRNLEN 1
> > +
> > +#include "strlen-evex512.S"
> > diff --git a/sysdeps/x86_64/multiarch/wcslen-evex512.S b/sysdeps/x86_64/multiarch/wcslen-evex512.S
> > new file mode 100644
> > index 0000000000..f59c372b78
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/wcslen-evex512.S
> > @@ -0,0 +1,4 @@
> > +#define STRLEN __wcslen_evex512
> > +#define USE_AS_WCSLEN 1
> > +
> > +#include "strlen-evex512.S"
> > diff --git a/sysdeps/x86_64/multiarch/wcsnlen-evex512.S b/sysdeps/x86_64/multiarch/wcsnlen-evex512.S
> > new file mode 100644
> > index 0000000000..73dcf2f210
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/wcsnlen-evex512.S
> > @@ -0,0 +1,5 @@
> > +#define STRLEN __wcsnlen_evex512
> > +#define USE_AS_WCSLEN 1
> > +#define USE_AS_STRNLEN 1
> > +
> > +#include "strlen-evex512.S"
> > --
> > 2.35.3
> >
>
> LGTM.
>
> Reviewed-by: Noah Goldstein <goldstein.w.n@gmail.com>
I would like to backport this patch to release branches.
Any comments or objections?
--Sunil
^ permalink raw reply [flat|nested] 12+ messages in thread
end of thread, other threads:[~2022-07-14 0:04 UTC | newest]
Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-05-18 18:59 [PATCH] x86_64: Implement evex512 version of strlen, strnlen, wcslen and wcsnlen Sunil K Pandey
2022-05-18 20:29 ` Noah Goldstein
2022-05-19 3:33 ` Sunil Pandey
2022-05-19 3:48 ` [PATCH v2] " Sunil K Pandey
2022-05-19 15:03 ` Noah Goldstein
2022-05-25 13:43 ` [PATCH v3] " Sunil K Pandey
2022-05-25 17:10 ` Noah Goldstein
2022-05-25 18:20 ` Sunil Pandey
2022-05-26 18:35 ` [PATCH v4] " Sunil K Pandey
2022-05-26 20:07 ` Noah Goldstein
2022-07-14 0:03 ` Sunil Pandey
2022-05-19 4:41 ` [PATCH] " Noah Goldstein
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).