public inbox for libc-alpha@sourceware.org
* [PATCH] x86_64: Implement evex512 version of strlen, strnlen, wcslen and wcsnlen
@ 2022-05-18 18:59 Sunil K Pandey
  2022-05-18 20:29 ` Noah Goldstein
From: Sunil K Pandey @ 2022-05-18 18:59 UTC (permalink / raw)
  To: libc-alpha

This patch implements the following evex512 versions of string functions.
Performance gain is up to 50% compared to evex, depending on length and
alignment.

- String length function using 512-bit vectors.
- String N length using 512-bit vectors.
- Wide string length using 512-bit vectors.
- Wide string N length using 512-bit vectors.
---
 sysdeps/x86_64/multiarch/Makefile          |   4 +
 sysdeps/x86_64/multiarch/ifunc-impl-list.c |  20 ++
 sysdeps/x86_64/multiarch/strlen-evex512.S  | 291 +++++++++++++++++++++
 sysdeps/x86_64/multiarch/strnlen-evex512.S |   4 +
 sysdeps/x86_64/multiarch/wcslen-evex512.S  |   4 +
 sysdeps/x86_64/multiarch/wcsnlen-evex512.S |   5 +
 6 files changed, 328 insertions(+)
 create mode 100644 sysdeps/x86_64/multiarch/strlen-evex512.S
 create mode 100644 sysdeps/x86_64/multiarch/strnlen-evex512.S
 create mode 100644 sysdeps/x86_64/multiarch/wcslen-evex512.S
 create mode 100644 sysdeps/x86_64/multiarch/wcsnlen-evex512.S

diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index f3ab5e0928..d0869c3ac3 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -81,6 +81,7 @@ sysdep_routines += \
   strlen-avx2 \
   strlen-avx2-rtm \
   strlen-evex \
+  strlen-evex512 \
   strlen-sse2 \
   strncase_l-avx2 \
   strncase_l-avx2-rtm \
@@ -105,6 +106,7 @@ sysdep_routines += \
   strnlen-avx2 \
   strnlen-avx2-rtm \
   strnlen-evex \
+  strnlen-evex512 \
   strnlen-sse2 \
   strpbrk-c \
   strpbrk-sse2 \
@@ -138,6 +140,7 @@ sysdep_routines += \
   wcslen-avx2 \
   wcslen-avx2-rtm \
   wcslen-evex \
+  wcslen-evex512 \
   wcslen-sse2 \
   wcslen-sse4_1 \
   wcsncmp-avx2 \
@@ -148,6 +151,7 @@ sysdep_routines += \
   wcsnlen-avx2-rtm \
   wcsnlen-c \
   wcsnlen-evex \
+  wcsnlen-evex512 \
   wcsnlen-sse4_1 \
   wcsrchr-avx2 \
   wcsrchr-avx2-rtm \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 7218095430..c5cd9466fe 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -328,6 +328,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			       && CPU_FEATURE_USABLE (AVX512BW)
 			       && CPU_FEATURE_USABLE (BMI2)),
 			      __strlen_evex)
+	      IFUNC_IMPL_ADD (array, i, strlen,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+			      __strlen_evex512)
 	      IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2))
 
   /* Support sysdeps/x86_64/multiarch/strnlen.c.  */
@@ -346,6 +351,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			       && CPU_FEATURE_USABLE (AVX512BW)
 			       && CPU_FEATURE_USABLE (BMI2)),
 			      __strnlen_evex)
+	      IFUNC_IMPL_ADD (array, i, strnlen,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+			      __strnlen_evex512)
 	      IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_sse2))
 
   /* Support sysdeps/x86_64/multiarch/stpncpy.c.  */
@@ -699,6 +709,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			       && CPU_FEATURE_USABLE (AVX512BW)
 			       && CPU_FEATURE_USABLE (BMI2)),
 			      __wcslen_evex)
+	      IFUNC_IMPL_ADD (array, i, wcslen,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+			      __wcslen_evex512)
 	      IFUNC_IMPL_ADD (array, i, wcslen,
 			      CPU_FEATURE_USABLE (SSE4_1),
 			      __wcslen_sse4_1)
@@ -720,6 +735,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			       && CPU_FEATURE_USABLE (AVX512BW)
 			       && CPU_FEATURE_USABLE (BMI2)),
 			      __wcsnlen_evex)
+	      IFUNC_IMPL_ADD (array, i, wcsnlen,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+			      __wcsnlen_evex512)
 	      IFUNC_IMPL_ADD (array, i, wcsnlen,
 			      CPU_FEATURE_USABLE (SSE4_1),
 			      __wcsnlen_sse4_1)
diff --git a/sysdeps/x86_64/multiarch/strlen-evex512.S b/sysdeps/x86_64/multiarch/strlen-evex512.S
new file mode 100644
index 0000000000..13a6b34615
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strlen-evex512.S
@@ -0,0 +1,291 @@
+/* Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# ifndef STRLEN
+#  define STRLEN	__strlen_evex512
+# endif
+
+# define VMOVA		vmovdqa64
+# ifdef USE_AS_WCSLEN
+#  define VPCMP		vpcmpd
+#  define VPMINU	vpminud
+#  define CHAR_SIZE	4
+# else
+#  define VPCMP		vpcmpb
+#  define VPMINU	vpminub
+#  define CHAR_SIZE	1
+# endif
+
+# define XMM0		xmm16
+# define ZMM0		zmm16
+# define ZMM1		zmm17
+# define ZMM2		zmm18
+# define ZMM3		zmm19
+# define ZMM4		zmm20
+# define VEC_SIZE	64
+# define PAGE_SIZE	4096
+# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
+
+	.section .text.evex512, "ax", @progbits
+/* Aligning the entry point to 64 bytes provides better performance for
+   one-vector-length strings.  */
+ENTRY_P2ALIGN (STRLEN, 6)
+# ifdef USE_AS_STRNLEN
+	/* Check zero length.  */
+	test	%RSI_LP, %RSI_LP
+	jz	L(zero)
+#  ifdef __ILP32__
+	/* Clear the upper 32 bits.  */
+	movl	%esi, %esi
+#  endif
+# endif
+
+	movl	%edi, %ecx
+	vpxorq	%XMM0, %XMM0, %XMM0
+	andl	$(PAGE_SIZE - 1), %ecx
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %ecx
+	ja	L(page_cross)
+
+	/* Compare [w]char for null, mask bit will be set for match.  */
+	VPCMP	$0, (%rdi), %ZMM0, %k0
+	kmovq	%k0, %rax
+	testq	%rax, %rax
+	jz	L(align_more)
+
+	tzcntq	%rax, %rax
+# ifdef USE_AS_STRNLEN
+	cmpq	%rsi, %rax
+	jae	L(ret_max)
+# endif
+	ret
+
+# ifdef USE_AS_STRNLEN
+	/* eax instead of rax used to save encoding space.  */
+L(zero):
+	xorl	%eax, %eax
+	ret
+# endif
+
+	/* At this point vector max length reached.  */
+# ifdef USE_AS_STRNLEN
+L(ret_max):
+	movq	%rsi, %rax
+	ret
+# endif
+
+L(page_cross):
+	andl	$(VEC_SIZE - 1), %ecx
+# ifdef USE_AS_WCSLEN
+	sarl	$2, %ecx
+# endif
+	/* ecx contains the number of [w]chars to be skipped as a result
+	   of address alignment.  */
+	movq	%rdi, %rax
+	andq	$-VEC_SIZE, %rax
+	VPCMP	$0, (%rax), %ZMM0, %k0
+	kmovq	%k0, %rax
+	/* Ignore number of character for alignment adjustment.  */
+	shrq	%cl, %rax
+	jz	L(align_more)
+
+	tzcntq	%rax, %rax
+# ifdef USE_AS_STRNLEN
+	cmpq	%rsi, %rax
+	jae	L(ret_max)
+# endif
+	ret
+
+L(align_more):
+	leaq	VEC_SIZE(%rdi), %rax
+	/* Align rax to VEC_SIZE.  */
+	andq	$-VEC_SIZE, %rax
+# ifdef USE_AS_STRNLEN
+	movq	%rax, %rdx
+	subq	%rdi, %rdx
+#  ifdef USE_AS_WCSLEN
+	shrq	$2, %rdx
+#  endif
+	/* At this point rdx contains [w]chars already compared.  */
+	cmpq	%rsi, %rdx
+	jae	L(ret_max)
+	subq	%rsi, %rdx
+	negq	%rdx
+	/* At this point rdx contains the number of [w]chars left to go.
+	   From now on rdx keeps decrementing with each compare.  */
+# endif
+
+	/* Loop unroll 4 times for 4 vector loop.  */
+	VPCMP	$0, (%rax), %ZMM0, %k0
+	kmovq	%k0, %rcx
+	testq	%rcx, %rcx
+	jnz	L(first_vector)
+
+# ifdef USE_AS_STRNLEN
+	subq    $CHAR_PER_VEC, %rdx
+	jbe	L(ret_max)
+# endif
+
+	VPCMP	$0, VEC_SIZE(%rax), %ZMM0, %k0
+	kmovq	%k0, %rcx
+	testq	%rcx, %rcx
+	jnz	L(second_vector)
+
+# ifdef USE_AS_STRNLEN
+	subq    $CHAR_PER_VEC, %rdx
+	jbe	L(ret_max)
+# endif
+
+	VPCMP	$0, (2 * VEC_SIZE)(%rax), %ZMM0, %k0
+	kmovq	%k0, %rcx
+	testq	%rcx, %rcx
+	jnz	L(third_vector)
+
+# ifdef USE_AS_STRNLEN
+	subq    $CHAR_PER_VEC, %rdx
+	jbe	L(ret_max)
+# endif
+
+	VPCMP	$0, (3 * VEC_SIZE)(%rax), %ZMM0, %k0
+	kmovq	%k0, %rcx
+	testq	%rcx, %rcx
+	jnz	L(fourth_vector)
+
+	addq    $(4 * VEC_SIZE), %rax
+
+# ifdef USE_AS_STRNLEN
+	/* Instead of decreasing, rdx is increased here to prepare for the
+	   first loop iteration.  It is incremented 3 times because one
+	   increment is cancelled by the previous decrement.  */
+	addq    $(3 * CHAR_PER_VEC), %rdx
+# endif
+
+	/* If the address is already 4 * VEC_SIZE byte aligned, go straight
+	   to the loop.  */
+	testq   $(3 * VEC_SIZE), %rax
+	jz      L(loop)
+
+	movq	%rax, %rcx
+
+	/* Align address to 4 * VEC_SIZE for loop.  */
+	andq	$-(4 * VEC_SIZE), %rax
+
+# ifdef USE_AS_STRNLEN
+	subq	%rax, %rcx
+#  ifdef USE_AS_WCSLEN
+	sarq	$2, %rcx
+#  endif
+	/* rcx contains the number of [w]chars that will be recompared due
+	   to alignment fixes.  rdx must be incremented by rcx to offset the
+	   alignment adjustment.  */
+	addq	%rcx, %rdx
+# endif
+
+L(loop):
+# ifdef USE_AS_STRNLEN
+	subq	$(CHAR_PER_VEC * 4), %rdx
+	jbe	L(ret_max)
+# endif
+	/* The VPMINU and VPCMP combination provides better performance
+	   than alternative combinations.  */
+	VMOVA	(%rax), %ZMM1
+	VPMINU	(VEC_SIZE)(%rax), %ZMM1, %ZMM2
+	VMOVA   (2 * VEC_SIZE)(%rax), %ZMM3
+	VPMINU	(3 * VEC_SIZE)(%rax), %ZMM3, %ZMM4
+
+	VPCMP	$0, %ZMM2, %ZMM0, %k0
+	VPCMP	$0, %ZMM4, %ZMM0, %k1
+
+	addq	$(4 * VEC_SIZE), %rax
+	kortestq %k0, %k1
+	jz	L(loop)
+
+	/* Subtract 4 vectors because the address was incremented in the
+	   loop before the terminating condition check.  This also lets the
+	   exit code before and after the loop be shared.  */
+	subq	$(4 * VEC_SIZE), %rax
+
+	VPCMP	$0, %ZMM1, %ZMM0, %k2
+	kmovq	%k2, %rcx
+	testq	%rcx, %rcx
+	jnz	L(first_vector)
+
+	kmovq	%k0, %rcx
+	/* At this point, if k0 is non zero, null char must be in the
+	   second vector.  */
+	testq	%rcx, %rcx
+	jnz	L(second_vector)
+
+	VPCMP	$0, %ZMM3, %ZMM0, %k3
+	kmovq	%k3, %rcx
+	testq	%rcx, %rcx
+	jnz	L(third_vector)
+	/* At this point null [w]char must be in the fourth vector so no
+	   need to check.  */
+	kmovq	%k1, %rcx
+
+	/* Termination for the fourth, third and second vectors is pretty
+	   much the same; it is implemented this way to avoid branching and
+	   to reuse the exit code from before the loop.  */
+L(fourth_vector):
+	addq	$(3 * VEC_SIZE), %rax
+	tzcntq	%rcx, %rcx
+	subq	%rdi, %rax
+# ifdef USE_AS_WCSLEN
+	sarq	$2, %rax
+# endif
+	addq	%rcx, %rax
+# ifdef USE_AS_STRNLEN
+	cmpq	%rsi, %rax
+	jae	L(ret_max)
+# endif
+	ret
+
+L(third_vector):
+	addq	$(2 * VEC_SIZE), %rax
+	tzcntq	%rcx, %rcx
+	subq	%rdi, %rax
+# ifdef USE_AS_WCSLEN
+	sarq	$2, %rax
+# endif
+	addq	%rcx, %rax
+# ifdef USE_AS_STRNLEN
+	cmpq	%rsi, %rax
+	jae	L(ret_max)
+# endif
+	ret
+
+L(second_vector):
+	addq	$VEC_SIZE, %rax
+L(first_vector):
+	tzcntq	%rcx, %rcx
+	subq	%rdi, %rax
+# ifdef USE_AS_WCSLEN
+	sarq	$2, %rax
+# endif
+	addq	%rcx, %rax
+# ifdef USE_AS_STRNLEN
+	cmpq	%rsi, %rax
+	jae	L(ret_max)
+# endif
+	ret
+
+END (STRLEN)
+#endif
diff --git a/sysdeps/x86_64/multiarch/strnlen-evex512.S b/sysdeps/x86_64/multiarch/strnlen-evex512.S
new file mode 100644
index 0000000000..0b7f220214
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strnlen-evex512.S
@@ -0,0 +1,4 @@
+#define STRLEN __strnlen_evex512
+#define USE_AS_STRNLEN 1
+
+#include "strlen-evex512.S"
diff --git a/sysdeps/x86_64/multiarch/wcslen-evex512.S b/sysdeps/x86_64/multiarch/wcslen-evex512.S
new file mode 100644
index 0000000000..f59c372b78
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcslen-evex512.S
@@ -0,0 +1,4 @@
+#define STRLEN __wcslen_evex512
+#define USE_AS_WCSLEN 1
+
+#include "strlen-evex512.S"
diff --git a/sysdeps/x86_64/multiarch/wcsnlen-evex512.S b/sysdeps/x86_64/multiarch/wcsnlen-evex512.S
new file mode 100644
index 0000000000..73dcf2f210
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsnlen-evex512.S
@@ -0,0 +1,5 @@
+#define STRLEN __wcsnlen_evex512
+#define USE_AS_WCSLEN 1
+#define USE_AS_STRNLEN 1
+
+#include "strlen-evex512.S"
-- 
2.35.3



* Re: [PATCH] x86_64: Implement evex512 version of strlen, strnlen, wcslen and wcsnlen
  2022-05-18 18:59 [PATCH] x86_64: Implement evex512 version of strlen, strnlen, wcslen and wcsnlen Sunil K Pandey
@ 2022-05-18 20:29 ` Noah Goldstein
  2022-05-19  3:33   ` Sunil Pandey
From: Noah Goldstein @ 2022-05-18 20:29 UTC (permalink / raw)
  To: Sunil K Pandey; +Cc: GNU C Library

On Wed, May 18, 2022 at 1:59 PM Sunil K Pandey via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> This patch implements following evex512 version of string functions.
> Perf gain up to 50% as compared to evex, depending on length and
> alignment.

Can you include a CSV (or any consistent format, really) somewhere with
all the benchmarks, the results of ~10-20 runs, and the hardware you're
benchmarking on?
>
> - String length function using 512 bit vectors.
> - String N length using 512 bit vectors.
> - Wide string length using 512 bit vectors.
> - Wide string N length using 512 bit vectors.
> ---
>  sysdeps/x86_64/multiarch/Makefile          |   4 +
>  sysdeps/x86_64/multiarch/ifunc-impl-list.c |  20 ++
>  sysdeps/x86_64/multiarch/strlen-evex512.S  | 291 +++++++++++++++++++++
>  sysdeps/x86_64/multiarch/strnlen-evex512.S |   4 +
>  sysdeps/x86_64/multiarch/wcslen-evex512.S  |   4 +
>  sysdeps/x86_64/multiarch/wcsnlen-evex512.S |   5 +
>  6 files changed, 328 insertions(+)
>  create mode 100644 sysdeps/x86_64/multiarch/strlen-evex512.S
>  create mode 100644 sysdeps/x86_64/multiarch/strnlen-evex512.S
>  create mode 100644 sysdeps/x86_64/multiarch/wcslen-evex512.S
>  create mode 100644 sysdeps/x86_64/multiarch/wcsnlen-evex512.S
>
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index f3ab5e0928..d0869c3ac3 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -81,6 +81,7 @@ sysdep_routines += \
>    strlen-avx2 \
>    strlen-avx2-rtm \
>    strlen-evex \
> +  strlen-evex512 \
>    strlen-sse2 \
>    strncase_l-avx2 \
>    strncase_l-avx2-rtm \
> @@ -105,6 +106,7 @@ sysdep_routines += \
>    strnlen-avx2 \
>    strnlen-avx2-rtm \
>    strnlen-evex \
> +  strnlen-evex512 \
>    strnlen-sse2 \
>    strpbrk-c \
>    strpbrk-sse2 \
> @@ -138,6 +140,7 @@ sysdep_routines += \
>    wcslen-avx2 \
>    wcslen-avx2-rtm \
>    wcslen-evex \
> +  wcslen-evex512 \
>    wcslen-sse2 \
>    wcslen-sse4_1 \
>    wcsncmp-avx2 \
> @@ -148,6 +151,7 @@ sysdep_routines += \
>    wcsnlen-avx2-rtm \
>    wcsnlen-c \
>    wcsnlen-evex \
> +  wcsnlen-evex512 \
>    wcsnlen-sse4_1 \
>    wcsrchr-avx2 \
>    wcsrchr-avx2-rtm \
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index 7218095430..c5cd9466fe 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -328,6 +328,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>                                && CPU_FEATURE_USABLE (AVX512BW)
>                                && CPU_FEATURE_USABLE (BMI2)),
>                               __strlen_evex)
> +             IFUNC_IMPL_ADD (array, i, strlen,
> +                             (CPU_FEATURE_USABLE (AVX512VL)
> +                              && CPU_FEATURE_USABLE (AVX512BW)
> +                              && CPU_FEATURE_USABLE (BMI2)),
> +                             __strlen_evex512)
>               IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2))
>
>    /* Support sysdeps/x86_64/multiarch/strnlen.c.  */
> @@ -346,6 +351,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>                                && CPU_FEATURE_USABLE (AVX512BW)
>                                && CPU_FEATURE_USABLE (BMI2)),
>                               __strnlen_evex)
> +             IFUNC_IMPL_ADD (array, i, strnlen,
> +                             (CPU_FEATURE_USABLE (AVX512VL)
> +                              && CPU_FEATURE_USABLE (AVX512BW)
> +                              && CPU_FEATURE_USABLE (BMI2)),
> +                             __strnlen_evex512)
>               IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_sse2))
>
>    /* Support sysdeps/x86_64/multiarch/stpncpy.c.  */
> @@ -699,6 +709,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>                                && CPU_FEATURE_USABLE (AVX512BW)
>                                && CPU_FEATURE_USABLE (BMI2)),
>                               __wcslen_evex)
> +             IFUNC_IMPL_ADD (array, i, wcslen,
> +                             (CPU_FEATURE_USABLE (AVX512VL)
> +                              && CPU_FEATURE_USABLE (AVX512BW)
> +                              && CPU_FEATURE_USABLE (BMI2)),
> +                             __wcslen_evex512)
>               IFUNC_IMPL_ADD (array, i, wcslen,
>                               CPU_FEATURE_USABLE (SSE4_1),
>                               __wcslen_sse4_1)
> @@ -720,6 +735,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>                                && CPU_FEATURE_USABLE (AVX512BW)
>                                && CPU_FEATURE_USABLE (BMI2)),
>                               __wcsnlen_evex)
> +             IFUNC_IMPL_ADD (array, i, wcsnlen,
> +                             (CPU_FEATURE_USABLE (AVX512VL)
> +                              && CPU_FEATURE_USABLE (AVX512BW)
> +                              && CPU_FEATURE_USABLE (BMI2)),
> +                             __wcsnlen_evex512)
>               IFUNC_IMPL_ADD (array, i, wcsnlen,
>                               CPU_FEATURE_USABLE (SSE4_1),
>                               __wcsnlen_sse4_1)
> diff --git a/sysdeps/x86_64/multiarch/strlen-evex512.S b/sysdeps/x86_64/multiarch/strlen-evex512.S
> new file mode 100644
> index 0000000000..13a6b34615
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strlen-evex512.S
> @@ -0,0 +1,291 @@
> +/* Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#if IS_IN (libc)
> +
> +# include <sysdep.h>
> +
> +# ifndef STRLEN
> +#  define STRLEN       __strlen_evex512
> +# endif
> +
> +# define VMOVA         vmovdqa64
> +# ifdef USE_AS_WCSLEN
> +#  define VPCMP                vpcmpd
> +#  define VPMINU       vpminud
> +#  define CHAR_SIZE    4
> +# else
> +#  define VPCMP                vpcmpb
> +#  define VPMINU       vpminub
> +#  define CHAR_SIZE    1
> +# endif
> +
> +# define XMM0          xmm16
> +# define ZMM0          zmm16
> +# define ZMM1          zmm17
> +# define ZMM2          zmm18
> +# define ZMM3          zmm19
> +# define ZMM4          zmm20
> +# define VEC_SIZE      64
> +# define PAGE_SIZE     4096
> +# define CHAR_PER_VEC  (VEC_SIZE / CHAR_SIZE)

Is it possible to integrate this file cleanly with the evex256 version?
Something similar to what we do for memset/memmove.
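For reference, the memset/memmove layout being suggested is a shared body
file parameterized by vector-width macros, included from thin per-ISA
wrappers.  A rough sketch of how that could look here; the file name
"strlen-evex-base.S" and the exact macro split are illustrative only, not
files that exist in the tree:

	/* strlen-evex512.S reduced to a thin 64-byte-vector wrapper.  */
	#ifndef STRLEN
	# define STRLEN		__strlen_evex512
	#endif
	#define VEC_SIZE	64
	#define VMOVA		vmovdqa64
	#define KMOV		kmovq
	#define KORTEST		kortestq
	#include "strlen-evex-base.S"

The evex256 flavor would then be another small wrapper defining VEC_SIZE as
32 (with kmovd/kortestd) over the same shared body.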
> +
> +       .section .text.evex512, "ax", @progbits
> +/* Aligning entry point to 64 byte, provides better performance for
> +   one vector length string.  */
> +ENTRY_P2ALIGN (STRLEN, 6)
> +# ifdef USE_AS_STRNLEN
> +        /* Check zero length.  */
> +       test    %RSI_LP, %RSI_LP
> +       jz      L(zero)
> +#  ifdef __ILP32__
> +       /* Clear the upper 32 bits.  */
> +       movl    %esi, %esi
> +#  endif
> +# endif
> +
> +       movl    %edi, %ecx
> +       vpxorq  %XMM0, %XMM0, %XMM0
> +       andl    $(PAGE_SIZE - 1), %ecx
> +       cmpl    $(PAGE_SIZE - VEC_SIZE), %ecx
> +       ja      L(page_cross)
> +
> +       /* Compare [w]char for null, mask bit will be set for match.  */
> +       VPCMP   $0, (%rdi), %ZMM0, %k0
> +       kmovq   %k0, %rax
> +       testq   %rax, %rax
> +       jz      L(align_more)
> +
> +       tzcntq  %rax, %rax
> +# ifdef USE_AS_STRNLEN
> +       cmpq    %rsi, %rax
cmpl

> +       jae     L(ret_max)
> +# endif
> +       ret
> +
> +# ifdef USE_AS_STRNLEN
> +       /* eax instead of rax used to save encoding space.  */
> +L(zero):
> +       xorl    %eax, %eax
> +       ret
> +# endif
> +
> +       /* At this point vector max length reached.  */
> +# ifdef USE_AS_STRNLEN
> +L(ret_max):
> +       movq    %rsi, %rax
> +       ret
> +# endif
> +
> +L(page_cross):

IMO, unless you need the 2-byte encoding on the jump, this should be at
the end of the file, as it's not expected to be hot.
> +       andl    $(VEC_SIZE - 1), %ecx
> +# ifdef USE_AS_WCSLEN
> +       sarl    $2, %ecx
> +# endif
> +       /* ecx contains number of w[char] to be skipped as a result
> +          of address alignment.  */
> +       movq    %rdi, %rax
> +       andq    $-VEC_SIZE, %rax
> +       VPCMP   $0, (%rax), %ZMM0, %k0
> +       kmovq   %k0, %rax
> +       /* Ignore number of character for alignment adjustment.  */
> +       shrq    %cl, %rax
> +       jz      L(align_more)
> +
> +       tzcntq  %rax, %rax
> +# ifdef USE_AS_STRNLEN
> +       cmpq    %rsi, %rax
> +       jae     L(ret_max)
> +# endif
> +       ret
> +
> +L(align_more):
> +       leaq    VEC_SIZE(%rdi), %rax
> +       /* Align rax to VEC_SIZE.  */
> +       andq    $-VEC_SIZE, %rax
> +# ifdef USE_AS_STRNLEN
> +       movq    %rax, %rdx
> +       subq    %rdi, %rdx
> +#  ifdef USE_AS_WCSLEN
> +       shrq    $2, %rdx
> +#  endif
> +       /* At this point rdx contains [w]chars already compared.  */
> +       cmpq    %rsi, %rdx
> +       jae     L(ret_max)
> +       subq    %rsi, %rdx
> +       negq    %rdx
> +       /* At this point rdx contains number of w[char] needs to go.
> +          Now onwards rdx will keep decrementing with each compare.  */
> +# endif
> +
> +       /* Loop unroll 4 times for 4 vector loop.  */
> +       VPCMP   $0, (%rax), %ZMM0, %k0
> +       kmovq   %k0, %rcx
> +       testq   %rcx, %rcx
> +       jnz     L(first_vector)

Just to keep consistent with the other files, can you rename
first_vector/second_vector/... to ret_vec_x{N} or something like that?
> +
> +# ifdef USE_AS_STRNLEN
> +       subq    $CHAR_PER_VEC, %rdx
> +       jbe     L(ret_max)
> +# endif
> +
> +       VPCMP   $0, VEC_SIZE(%rax), %ZMM0, %k0
> +       kmovq   %k0, %rcx
> +       testq   %rcx, %rcx
> +       jnz     L(second_vector)
> +
> +# ifdef USE_AS_STRNLEN
> +       subq    $CHAR_PER_VEC, %rdx
> +       jbe     L(ret_max)
> +# endif

The evex256 / avx2 versions do a simple check of whether we will be able
to do all 4 aligning compares without a branch.  This saves total
branches.  Why not do something similar here?
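The shape of that up-front check, roughly (a sketch of the idea only; the
register use and the L(last_4x_vec) label are illustrative, not code taken
from the evex256 file):

	/* Once the remaining length is in rdx, decide a single time whether
	   all four aligning compares fit within the limit.  */
	cmpq	$(CHAR_PER_VEC * 4), %rdx
	jbe	L(last_4x_vec)	/* hypothetical slow path that keeps the
				   per-vector length checks.  */
	/* Fast path: the four VPCMP/kmovq/testq/jnz blocks follow with no
	   subq/jbe pairs between them.  */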
> +
> +       VPCMP   $0, (2 * VEC_SIZE)(%rax), %ZMM0, %k0
> +       kmovq   %k0, %rcx
> +       testq   %rcx, %rcx
> +       jnz     L(third_vector)
> +
> +# ifdef USE_AS_STRNLEN
> +       subq    $CHAR_PER_VEC, %rdx
> +       jbe     L(ret_max)
> +# endif
> +
> +       VPCMP   $0, (3 * VEC_SIZE)(%rax), %ZMM0, %k0
> +       kmovq   %k0, %rcx
> +       testq   %rcx, %rcx
> +       jnz     L(fourth_vector)
> +
> +       addq    $(4 * VEC_SIZE), %rax
> +
> +# ifdef USE_AS_STRNLEN
> +       /* Instead of decreasing, rdx increased to prepare for loop
> +          first iteration.  Incremented 3 times because one increment
> +          cancelled by previous decrement.  */
> +       addq    $(3 * CHAR_PER_VEC), %rdx
> +# endif
> +
> +       /* Test if address is already 4 * VEC_SIZE byte aligned goto
> +          loop.  */
> +       testq   $(3 * VEC_SIZE), %rax
> +       jz      L(loop)
> +
> +       movq    %rax, %rcx
> +
> +       /* Align address to 4 * VEC_SIZE for loop.  */
> +       andq    $-(4 * VEC_SIZE), %rax
> +
> +# ifdef USE_AS_STRNLEN
> +       subq    %rax, %rcx
> +#  ifdef USE_AS_WCSLEN
> +       sarq    $2, %rcx
> +#  endif
> +       /* rcx contains number of [w]char will be recompared due to
> +          alignment fixes.  rdx must be incremented by rcx to offset
> +          alignment adjustmentment.  */
> +       addq    %rcx, %rdx
> +# endif
> +
> +L(loop):
> +# ifdef USE_AS_STRNLEN
> +       subq    $(CHAR_PER_VEC * 4), %rdx
> +       jbe     L(ret_max)

We have the potential to overread by 255 bytes.  It's not a correctness
issue because we are page aligned, but it seems like a possible perf issue.
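Spelling out the arithmetic behind the 255-byte figure (derived from the
loop structure above, not from any measurement):

	/* Worst case in the strnlen loop with VEC_SIZE = 64, CHAR_SIZE = 1:
	     after the subq, rdx = 1 char of budget left,
	     yet the iteration still loads 4 * VEC_SIZE = 256 bytes,
	   so up to 255 bytes past the n limit may be read.  The 256-byte
	   block is 256-byte aligned, so the read never crosses a page.  */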
> +# endif
> +       /* VPMINU and VPCMP combination provide better perfomance as
> +          compared to alternative combinations.  */
> +       VMOVA   (%rax), %ZMM1
> +       VPMINU  (VEC_SIZE)(%rax), %ZMM1, %ZMM2
> +       VMOVA   (2 * VEC_SIZE)(%rax), %ZMM3
> +       VPMINU  (3 * VEC_SIZE)(%rax), %ZMM3, %ZMM4

I think doing 4x in the main loop is probably overkill, no?
Aligning to 256 is pretty extreme.

Also, I don't think the 4x zmm loads can even keep up at 2x / cycle, so it
seems like it may not be worth wasting up to 255 bytes to get it.
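For comparison, a 2x variant of the same loop would look roughly like this
(an untested sketch, not proposed code; the pre-loop alignment would then
be to 2 * VEC_SIZE, halving the worst-case re-read):

	L(loop_2x):
		VMOVA	(%rax), %ZMM1
		VPMINU	VEC_SIZE(%rax), %ZMM1, %ZMM2
		VPCMP	$0, %ZMM2, %ZMM0, %k0
		addq	$(2 * VEC_SIZE), %rax
		kortestq %k0, %k0
		jz	L(loop_2x)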
> +
> +       VPCMP   $0, %ZMM2, %ZMM0, %k0
> +       VPCMP   $0, %ZMM4, %ZMM0, %k1
> +
> +       addq    $(4 * VEC_SIZE), %rax
> +       kortestq %k0, %k1
> +       jz      L(loop)
> +
> +       /* Need 4 vector subtraction because address incremented in
> +          the loop before terminating condition check.  Also want to
> +          reuse code for exit condition before and after the loop.  */
> +       subq    $(4 * VEC_SIZE), %rax
> +
> +       VPCMP   $0, %ZMM1, %ZMM0, %k2
> +       kmovq   %k2, %rcx
> +       testq   %rcx, %rcx
> +       jnz     L(first_vector)
> +
> +       kmovq   %k0, %rcx
> +       /* At this point, if k0 is non zero, null char must be in the
> +          second vector.  */
> +       testq   %rcx, %rcx
> +       jnz     L(second_vector)
> +
> +       VPCMP   $0, %ZMM3, %ZMM0, %k3
> +       kmovq   %k3, %rcx
> +       testq   %rcx, %rcx
> +       jnz     L(third_vector)
> +       /* At this point null [w]char must be in the fourth vector so no
> +          need to check.  */
> +       kmovq   %k1, %rcx
> +
> +       /* Termination fourth, third, second vector are pretty much
> +          same, implemented this way to avoid branching and reuse code
> +          from pre loop exit condition.  */
> +L(fourth_vector):
> +       addq    $(3 * VEC_SIZE), %rax
> +       tzcntq  %rcx, %rcx
> +       subq    %rdi, %rax
Can this be hoisted out to the beginning of L(align_more)?
It seems every return path uses it.

> +# ifdef USE_AS_WCSLEN
> +       sarq    $2, %rax
> +# endif
> +       addq    %rcx, %rax

If not wcslen, it's probably faster to use lea instead of 2x add.
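Roughly what that folding looks like on the byte-count (non-wcslen) path,
shown for the L(fourth_vector) case only as an illustration (the strnlen
limit check is omitted):

	tzcntq	%rcx, %rcx
	/* Folds the 3 * VEC_SIZE and rcx additions into one lea.  */
	leaq	(3 * VEC_SIZE)(%rax, %rcx), %rax
	subq	%rdi, %rax
	ret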

> +# ifdef USE_AS_STRNLEN
> +       cmpq    %rsi, %rax
> +       jae     L(ret_max)
> +# endif
> +       ret
> +
> +L(third_vector):
> +       addq    $(2 * VEC_SIZE), %rax
> +       tzcntq  %rcx, %rcx
> +       subq    %rdi, %rax
> +# ifdef USE_AS_WCSLEN
> +       sarq    $2, %rax
> +# endif
> +       addq    %rcx, %rax
> +# ifdef USE_AS_STRNLEN
> +       cmpq    %rsi, %rax
> +       jae     L(ret_max)
> +# endif
> +       ret
> +
> +L(second_vector):
> +       addq    $VEC_SIZE, %rax
> +L(first_vector):
> +       tzcntq  %rcx, %rcx
> +       subq    %rdi, %rax
> +# ifdef USE_AS_WCSLEN
> +       sarq    $2, %rax
> +# endif
> +       addq    %rcx, %rax
> +# ifdef USE_AS_STRNLEN
> +       cmpq    %rsi, %rax
> +       jae     L(ret_max)
> +# endif
> +       ret
> +
> +END (STRLEN)
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/strnlen-evex512.S b/sysdeps/x86_64/multiarch/strnlen-evex512.S
> new file mode 100644
> index 0000000000..0b7f220214
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strnlen-evex512.S
> @@ -0,0 +1,4 @@
> +#define STRLEN __strnlen_evex512
> +#define USE_AS_STRNLEN 1
> +
> +#include "strlen-evex512.S"
> diff --git a/sysdeps/x86_64/multiarch/wcslen-evex512.S b/sysdeps/x86_64/multiarch/wcslen-evex512.S
> new file mode 100644
> index 0000000000..f59c372b78
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/wcslen-evex512.S
> @@ -0,0 +1,4 @@
> +#define STRLEN __wcslen_evex512
> +#define USE_AS_WCSLEN 1
> +
> +#include "strlen-evex512.S"
> diff --git a/sysdeps/x86_64/multiarch/wcsnlen-evex512.S b/sysdeps/x86_64/multiarch/wcsnlen-evex512.S
> new file mode 100644
> index 0000000000..73dcf2f210
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/wcsnlen-evex512.S
> @@ -0,0 +1,5 @@
> +#define STRLEN __wcsnlen_evex512
> +#define USE_AS_WCSLEN 1
> +#define USE_AS_STRNLEN 1
> +
> +#include "strlen-evex512.S"
> --
> 2.35.3
>


* Re: [PATCH] x86_64: Implement evex512 version of strlen, strnlen, wcslen and wcsnlen
  2022-05-18 20:29 ` Noah Goldstein
@ 2022-05-19  3:33   ` Sunil Pandey
  2022-05-19  3:48     ` [PATCH v2] " Sunil K Pandey
  2022-05-19  4:41     ` [PATCH] " Noah Goldstein
From: Sunil Pandey @ 2022-05-19  3:33 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library


On Wed, May 18, 2022 at 1:29 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Wed, May 18, 2022 at 1:59 PM Sunil K Pandey via Libc-alpha
> <libc-alpha@sourceware.org> wrote:
> >
> > This patch implements following evex512 version of string functions.
> > Perf gain up to 50% as compared to evex, depending on length and
> > alignment.
>
> Can you include a csv (or any consistent fmt really) somewhere of all
> the benchmarks
> and results of ~10-20 runs and the hardware your benchmarking on?

Machine:
Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz
Fedora 35
Glibc master

Data from 20 iterations of each function is attached; please use any text
editor (e.g. vi) to access it.

> >
> > - String length function using 512 bit vectors.
> > - String N length using 512 bit vectors.
> > - Wide string length using 512 bit vectors.
> > - Wide string N length using 512 bit vectors.
> > ---
> >  sysdeps/x86_64/multiarch/Makefile          |   4 +
> >  sysdeps/x86_64/multiarch/ifunc-impl-list.c |  20 ++
> >  sysdeps/x86_64/multiarch/strlen-evex512.S  | 291 +++++++++++++++++++++
> >  sysdeps/x86_64/multiarch/strnlen-evex512.S |   4 +
> >  sysdeps/x86_64/multiarch/wcslen-evex512.S  |   4 +
> >  sysdeps/x86_64/multiarch/wcsnlen-evex512.S |   5 +
> >  6 files changed, 328 insertions(+)
> >  create mode 100644 sysdeps/x86_64/multiarch/strlen-evex512.S
> >  create mode 100644 sysdeps/x86_64/multiarch/strnlen-evex512.S
> >  create mode 100644 sysdeps/x86_64/multiarch/wcslen-evex512.S
> >  create mode 100644 sysdeps/x86_64/multiarch/wcsnlen-evex512.S
> >
> > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> > index f3ab5e0928..d0869c3ac3 100644
> > --- a/sysdeps/x86_64/multiarch/Makefile
> > +++ b/sysdeps/x86_64/multiarch/Makefile
> > @@ -81,6 +81,7 @@ sysdep_routines += \
> >    strlen-avx2 \
> >    strlen-avx2-rtm \
> >    strlen-evex \
> > +  strlen-evex512 \
> >    strlen-sse2 \
> >    strncase_l-avx2 \
> >    strncase_l-avx2-rtm \
> > @@ -105,6 +106,7 @@ sysdep_routines += \
> >    strnlen-avx2 \
> >    strnlen-avx2-rtm \
> >    strnlen-evex \
> > +  strnlen-evex512 \
> >    strnlen-sse2 \
> >    strpbrk-c \
> >    strpbrk-sse2 \
> > @@ -138,6 +140,7 @@ sysdep_routines += \
> >    wcslen-avx2 \
> >    wcslen-avx2-rtm \
> >    wcslen-evex \
> > +  wcslen-evex512 \
> >    wcslen-sse2 \
> >    wcslen-sse4_1 \
> >    wcsncmp-avx2 \
> > @@ -148,6 +151,7 @@ sysdep_routines += \
> >    wcsnlen-avx2-rtm \
> >    wcsnlen-c \
> >    wcsnlen-evex \
> > +  wcsnlen-evex512 \
> >    wcsnlen-sse4_1 \
> >    wcsrchr-avx2 \
> >    wcsrchr-avx2-rtm \
> > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > index 7218095430..c5cd9466fe 100644
> > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > @@ -328,6 +328,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> >                                && CPU_FEATURE_USABLE (AVX512BW)
> >                                && CPU_FEATURE_USABLE (BMI2)),
> >                               __strlen_evex)
> > +             IFUNC_IMPL_ADD (array, i, strlen,
> > +                             (CPU_FEATURE_USABLE (AVX512VL)
> > +                              && CPU_FEATURE_USABLE (AVX512BW)
> > +                              && CPU_FEATURE_USABLE (BMI2)),
> > +                             __strlen_evex512)
> >               IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2))
> >
> >    /* Support sysdeps/x86_64/multiarch/strnlen.c.  */
> > @@ -346,6 +351,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> >                                && CPU_FEATURE_USABLE (AVX512BW)
> >                                && CPU_FEATURE_USABLE (BMI2)),
> >                               __strnlen_evex)
> > +             IFUNC_IMPL_ADD (array, i, strnlen,
> > +                             (CPU_FEATURE_USABLE (AVX512VL)
> > +                              && CPU_FEATURE_USABLE (AVX512BW)
> > +                              && CPU_FEATURE_USABLE (BMI2)),
> > +                             __strnlen_evex512)
> >               IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_sse2))
> >
> >    /* Support sysdeps/x86_64/multiarch/stpncpy.c.  */
> > @@ -699,6 +709,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> >                                && CPU_FEATURE_USABLE (AVX512BW)
> >                                && CPU_FEATURE_USABLE (BMI2)),
> >                               __wcslen_evex)
> > +             IFUNC_IMPL_ADD (array, i, wcslen,
> > +                             (CPU_FEATURE_USABLE (AVX512VL)
> > +                              && CPU_FEATURE_USABLE (AVX512BW)
> > +                              && CPU_FEATURE_USABLE (BMI2)),
> > +                             __wcslen_evex512)
> >               IFUNC_IMPL_ADD (array, i, wcslen,
> >                               CPU_FEATURE_USABLE (SSE4_1),
> >                               __wcslen_sse4_1)
> > @@ -720,6 +735,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> >                                && CPU_FEATURE_USABLE (AVX512BW)
> >                                && CPU_FEATURE_USABLE (BMI2)),
> >                               __wcsnlen_evex)
> > +             IFUNC_IMPL_ADD (array, i, wcsnlen,
> > +                             (CPU_FEATURE_USABLE (AVX512VL)
> > +                              && CPU_FEATURE_USABLE (AVX512BW)
> > +                              && CPU_FEATURE_USABLE (BMI2)),
> > +                             __wcsnlen_evex512)
> >               IFUNC_IMPL_ADD (array, i, wcsnlen,
> >                               CPU_FEATURE_USABLE (SSE4_1),
> >                               __wcsnlen_sse4_1)
> > diff --git a/sysdeps/x86_64/multiarch/strlen-evex512.S b/sysdeps/x86_64/multiarch/strlen-evex512.S
> > new file mode 100644
> > index 0000000000..13a6b34615
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/strlen-evex512.S
> > @@ -0,0 +1,291 @@
> > +/* Copyright (C) 2022 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#if IS_IN (libc)
> > +
> > +# include <sysdep.h>
> > +
> > +# ifndef STRLEN
> > +#  define STRLEN       __strlen_evex512
> > +# endif
> > +
> > +# define VMOVA         vmovdqa64
> > +# ifdef USE_AS_WCSLEN
> > +#  define VPCMP                vpcmpd
> > +#  define VPMINU       vpminud
> > +#  define CHAR_SIZE    4
> > +# else
> > +#  define VPCMP                vpcmpb
> > +#  define VPMINU       vpminub
> > +#  define CHAR_SIZE    1
> > +# endif
> > +
> > +# define XMM0          xmm16
> > +# define ZMM0          zmm16
> > +# define ZMM1          zmm17
> > +# define ZMM2          zmm18
> > +# define ZMM3          zmm19
> > +# define ZMM4          zmm20
> > +# define VEC_SIZE      64
> > +# define PAGE_SIZE     4096
> > +# define CHAR_PER_VEC  (VEC_SIZE / CHAR_SIZE)
>
> Is it possible to integrate this file cleanly with the evex256 version?
> Something similar to what we do for memset/memmove.

Good suggestion, I will look into it. For the first iteration, let's
keep it standalone
for now.

> > +
> > +       .section .text.evex512, "ax", @progbits
> > +/* Aligning entry point to 64 byte, provides better performance for
> > +   one vector length string.  */
> > +ENTRY_P2ALIGN (STRLEN, 6)
> > +# ifdef USE_AS_STRNLEN
> > +        /* Check zero length.  */
> > +       test    %RSI_LP, %RSI_LP
> > +       jz      L(zero)
> > +#  ifdef __ILP32__
> > +       /* Clear the upper 32 bits.  */
> > +       movl    %esi, %esi
> > +#  endif
> > +# endif
> > +
> > +       movl    %edi, %ecx
> > +       vpxorq  %XMM0, %XMM0, %XMM0
> > +       andl    $(PAGE_SIZE - 1), %ecx
> > +       cmpl    $(PAGE_SIZE - VEC_SIZE), %ecx
> > +       ja      L(page_cross)
> > +
> > +       /* Compare [w]char for null, mask bit will be set for match.  */
> > +       VPCMP   $0, (%rdi), %ZMM0, %k0
> > +       kmovq   %k0, %rax
> > +       testq   %rax, %rax
> > +       jz      L(align_more)
> > +
> > +       tzcntq  %rax, %rax
> > +# ifdef USE_AS_STRNLEN
> > +       cmpq    %rsi, %rax
> cmpl
>
> > +       jae     L(ret_max)
> > +# endif
> > +       ret
> > +
> > +# ifdef USE_AS_STRNLEN
> > +       /* eax instead of rax used to save encoding space.  */
> > +L(zero):
> > +       xorl    %eax, %eax
> > +       ret
> > +# endif
> > +
> > +       /* At this point vector max length reached.  */
> > +# ifdef USE_AS_STRNLEN
> > +L(ret_max):
> > +       movq    %rsi, %rax
> > +       ret
> > +# endif
> > +
> > +L(page_cross):
>
> Imo unless you need the 2-byte encoding on the jump this should be at
> the end of the
> file as its expected to not be hot.

One of my goals is to reduce size as much as possible, as long as it
doesn't hurt performance.  Keeping the jump target nearby saves a few
bytes of code size without hurting performance.
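For the record, the encoding sizes behind the size argument (standard
x86-64 Jcc encodings, not specific to this patch):

	jae	L(near)		/* rel8 form: 2 bytes (target within -128..+127).  */
	jae	L(far)		/* rel32 form: 6 bytes (0F 8x + 4-byte offset).  */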

> > +       andl    $(VEC_SIZE - 1), %ecx
> > +# ifdef USE_AS_WCSLEN
> > +       sarl    $2, %ecx
> > +# endif
> > +       /* ecx contains number of w[char] to be skipped as a result
> > +          of address alignment.  */
> > +       movq    %rdi, %rax
> > +       andq    $-VEC_SIZE, %rax
> > +       VPCMP   $0, (%rax), %ZMM0, %k0
> > +       kmovq   %k0, %rax
> > +       /* Ignore number of character for alignment adjustment.  */
> > +       shrq    %cl, %rax
> > +       jz      L(align_more)
> > +
> > +       tzcntq  %rax, %rax
> > +# ifdef USE_AS_STRNLEN
> > +       cmpq    %rsi, %rax
> > +       jae     L(ret_max)
> > +# endif
> > +       ret
> > +
> > +L(align_more):
> > +       leaq    VEC_SIZE(%rdi), %rax
> > +       /* Align rax to VEC_SIZE.  */
> > +       andq    $-VEC_SIZE, %rax
> > +# ifdef USE_AS_STRNLEN
> > +       movq    %rax, %rdx
> > +       subq    %rdi, %rdx
> > +#  ifdef USE_AS_WCSLEN
> > +       shrq    $2, %rdx
> > +#  endif
> > +       /* At this point rdx contains [w]chars already compared.  */
> > +       cmpq    %rsi, %rdx
> > +       jae     L(ret_max)
> > +       subq    %rsi, %rdx
> > +       negq    %rdx
> > +       /* At this point rdx contains number of w[char] needs to go.
> > +          Now onwards rdx will keep decrementing with each compare.  */
> > +# endif
> > +
> > +       /* Loop unroll 4 times for 4 vector loop.  */
> > +       VPCMP   $0, (%rax), %ZMM0, %k0
> > +       kmovq   %k0, %rcx
> > +       testq   %rcx, %rcx
> > +       jnz     L(first_vector)
>
> Just to keep consistent with the other files can you
> rename first_vector/second_vector... to ret_vec_x{N}
> or something like that.

Agreed, this will be fixed in v2.

> > +
> > +# ifdef USE_AS_STRNLEN
> > +       subq    $CHAR_PER_VEC, %rdx
> > +       jbe     L(ret_max)
> > +# endif
> > +
> > +       VPCMP   $0, VEC_SIZE(%rax), %ZMM0, %k0
> > +       kmovq   %k0, %rcx
> > +       testq   %rcx, %rcx
> > +       jnz     L(second_vector)
> > +
> > +# ifdef USE_AS_STRNLEN
> > +       subq    $CHAR_PER_VEC, %rdx
> > +       jbe     L(ret_max)
> > +# endif
>
> The evex256 / avx2 versions do a simple check if we will be able
> to do all 4 aligning compares w.o a branch. This saves total
> branches. Why not do something similar here?

It is done this way to reduce size and complexity.  If the branch is
taken, it jumps to the terminating condition; if not taken, there is no
impact on perf.

> > +
> > +       VPCMP   $0, (2 * VEC_SIZE)(%rax), %ZMM0, %k0
> > +       kmovq   %k0, %rcx
> > +       testq   %rcx, %rcx
> > +       jnz     L(third_vector)
> > +
> > +# ifdef USE_AS_STRNLEN
> > +       subq    $CHAR_PER_VEC, %rdx
> > +       jbe     L(ret_max)
> > +# endif
> > +
> > +       VPCMP   $0, (3 * VEC_SIZE)(%rax), %ZMM0, %k0
> > +       kmovq   %k0, %rcx
> > +       testq   %rcx, %rcx
> > +       jnz     L(fourth_vector)
> > +
> > +       addq    $(4 * VEC_SIZE), %rax
> > +
> > +# ifdef USE_AS_STRNLEN
> > +       /* Instead of decreasing, rdx increased to prepare for loop
> > +          first iteration.  Incremented 3 times because one increment
> > +          cancelled by previous decrement.  */
> > +       addq    $(3 * CHAR_PER_VEC), %rdx
> > +# endif
> > +
> > +       /* Test if address is already 4 * VEC_SIZE byte aligned goto
> > +          loop.  */
> > +       testq   $(3 * VEC_SIZE), %rax
> > +       jz      L(loop)
> > +
> > +       movq    %rax, %rcx
> > +
> > +       /* Align address to 4 * VEC_SIZE for loop.  */
> > +       andq    $-(4 * VEC_SIZE), %rax
> > +
> > +# ifdef USE_AS_STRNLEN
> > +       subq    %rax, %rcx
> > +#  ifdef USE_AS_WCSLEN
> > +       sarq    $2, %rcx
> > +#  endif
> > +       /* rcx contains number of [w]char will be recompared due to
> > +          alignment fixes.  rdx must be incremented by rcx to offset
> > +          alignment adjustmentment.  */
> > +       addq    %rcx, %rdx
> > +# endif
> > +
> > +L(loop):
> > +# ifdef USE_AS_STRNLEN
> > +       subq    $(CHAR_PER_VEC * 4), %rdx
> > +       jbe     L(ret_max)
>
> we have potential to overread by 255 bytes. Not correctness issue because
> we are page aligned by seems like a possible perf issue.

Correct, but the overread data will be read from cache, not memory, so the
impact is not significant; it is the cost we pay for 4-vector alignment.

> > +# endif
> > +       /* VPMINU and VPCMP combination provide better perfomance as
> > +          compared to alternative combinations.  */
> > +       VMOVA   (%rax), %ZMM1
> > +       VPMINU  (VEC_SIZE)(%rax), %ZMM1, %ZMM2
> > +       VMOVA   (2 * VEC_SIZE)(%rax), %ZMM3
> > +       VPMINU  (3 * VEC_SIZE)(%rax), %ZMM3, %ZMM4
>
> I think doing 4x in the main loop is probably overkill no?
> Aligning to 256 is pretty extreme.
>
> Also I don't think the 4x zmm loads can even keep up with
> 2x / cycle so seems like it may not be worth wasting up to
> 255 bytes to get it.

The perf numbers look good, so for now it should be OK.

> > +
> > +       VPCMP   $0, %ZMM2, %ZMM0, %k0
> > +       VPCMP   $0, %ZMM4, %ZMM0, %k1
> > +
> > +       addq    $(4 * VEC_SIZE), %rax
> > +       kortestq %k0, %k1
> > +       jz      L(loop)
> > +
> > +       /* Need 4 vector subtraction because address incremented in
> > +          the loop before terminating condition check.  Also want to
> > +          reuse code for exit condition before and after the loop.  */
> > +       subq    $(4 * VEC_SIZE), %rax
> > +
> > +       VPCMP   $0, %ZMM1, %ZMM0, %k2
> > +       kmovq   %k2, %rcx
> > +       testq   %rcx, %rcx
> > +       jnz     L(first_vector)
> > +
> > +       kmovq   %k0, %rcx
> > +       /* At this point, if k0 is non zero, null char must be in the
> > +          second vector.  */
> > +       testq   %rcx, %rcx
> > +       jnz     L(second_vector)
> > +
> > +       VPCMP   $0, %ZMM3, %ZMM0, %k3
> > +       kmovq   %k3, %rcx
> > +       testq   %rcx, %rcx
> > +       jnz     L(third_vector)
> > +       /* At this point null [w]char must be in the fourth vector so no
> > +          need to check.  */
> > +       kmovq   %k1, %rcx
> > +
> > +       /* Termination fourth, third, second vector are pretty much
> > +          same, implemented this way to avoid branching and reuse code
> > +          from pre loop exit condition.  */
> > +L(fourth_vector):
> > +       addq    $(3 * VEC_SIZE), %rax
> > +       tzcntq  %rcx, %rcx
> > +       subq    %rdi, %rax
> Can this be hoisted out to the begining of L(aligned_more).
> It seems every return path uses it.
>

It really depends on where control is coming from, so moving it before
align_more would not be correct, unless I am missing something here.

> > +# ifdef USE_AS_WCSLEN
> > +       sarq    $2, %rax
> > +# endif
> > +       addq    %rcx, %rax
>
> if not wcslen probably faster to use lea instead of 2x add

I'm not sure whether there would be any significant gain from lea vs. add.
I used add because it's readily available on all ports.


>
> > +# ifdef USE_AS_STRNLEN
> > +       cmpq    %rsi, %rax
> > +       jae     L(ret_max)
> > +# endif
> > +       ret
> > +
> > +L(third_vector):
> > +       addq    $(2 * VEC_SIZE), %rax
> > +       tzcntq  %rcx, %rcx
> > +       subq    %rdi, %rax
> > +# ifdef USE_AS_WCSLEN
> > +       sarq    $2, %rax
> > +# endif
> > +       addq    %rcx, %rax
> > +# ifdef USE_AS_STRNLEN
> > +       cmpq    %rsi, %rax
> > +       jae     L(ret_max)
> > +# endif
> > +       ret
> > +
> > +L(second_vector):
> > +       addq    $VEC_SIZE, %rax
> > +L(first_vector):
> > +       tzcntq  %rcx, %rcx
> > +       subq    %rdi, %rax
> > +# ifdef USE_AS_WCSLEN
> > +       sarq    $2, %rax
> > +# endif
> > +       addq    %rcx, %rax
> > +# ifdef USE_AS_STRNLEN
> > +       cmpq    %rsi, %rax
> > +       jae     L(ret_max)
> > +# endif
> > +       ret
> > +
> > +END (STRLEN)
> > +#endif
> > diff --git a/sysdeps/x86_64/multiarch/strnlen-evex512.S b/sysdeps/x86_64/multiarch/strnlen-evex512.S
> > new file mode 100644
> > index 0000000000..0b7f220214
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/strnlen-evex512.S
> > @@ -0,0 +1,4 @@
> > +#define STRLEN __strnlen_evex512
> > +#define USE_AS_STRNLEN 1
> > +
> > +#include "strlen-evex512.S"
> > diff --git a/sysdeps/x86_64/multiarch/wcslen-evex512.S b/sysdeps/x86_64/multiarch/wcslen-evex512.S
> > new file mode 100644
> > index 0000000000..f59c372b78
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/wcslen-evex512.S
> > @@ -0,0 +1,4 @@
> > +#define STRLEN __wcslen_evex512
> > +#define USE_AS_WCSLEN 1
> > +
> > +#include "strlen-evex512.S"
> > diff --git a/sysdeps/x86_64/multiarch/wcsnlen-evex512.S b/sysdeps/x86_64/multiarch/wcsnlen-evex512.S
> > new file mode 100644
> > index 0000000000..73dcf2f210
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/wcsnlen-evex512.S
> > @@ -0,0 +1,5 @@
> > +#define STRLEN __wcsnlen_evex512
> > +#define USE_AS_WCSLEN 1
> > +#define USE_AS_STRNLEN 1
> > +
> > +#include "strlen-evex512.S"
> > --
> > 2.35.3
> >

[-- Attachment #2: skxdata.tar.gz --]
[-- Type: application/gzip, Size: 57289 bytes --]


* [PATCH v2] x86_64: Implement evex512 version of strlen, strnlen, wcslen and wcsnlen
  2022-05-19  3:33   ` Sunil Pandey
@ 2022-05-19  3:48     ` Sunil K Pandey
  2022-05-19 15:03       ` Noah Goldstein
  2022-05-19  4:41     ` [PATCH] " Noah Goldstein
From: Sunil K Pandey @ 2022-05-19  3:48 UTC (permalink / raw)
  To: libc-alpha

This patch implements the following evex512 versions of string functions.
Performance gain is up to 50% compared to evex, depending on length and
alignment.

- String length function using 512-bit vectors.
- String N length using 512-bit vectors.
- Wide string length using 512-bit vectors.
- Wide string N length using 512-bit vectors.
---
 sysdeps/x86_64/multiarch/Makefile          |   4 +
 sysdeps/x86_64/multiarch/ifunc-impl-list.c |  20 ++
 sysdeps/x86_64/multiarch/strlen-evex512.S  | 291 +++++++++++++++++++++
 sysdeps/x86_64/multiarch/strnlen-evex512.S |   4 +
 sysdeps/x86_64/multiarch/wcslen-evex512.S  |   4 +
 sysdeps/x86_64/multiarch/wcsnlen-evex512.S |   5 +
 6 files changed, 328 insertions(+)
 create mode 100644 sysdeps/x86_64/multiarch/strlen-evex512.S
 create mode 100644 sysdeps/x86_64/multiarch/strnlen-evex512.S
 create mode 100644 sysdeps/x86_64/multiarch/wcslen-evex512.S
 create mode 100644 sysdeps/x86_64/multiarch/wcsnlen-evex512.S

diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index f3ab5e0928..d0869c3ac3 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -81,6 +81,7 @@ sysdep_routines += \
   strlen-avx2 \
   strlen-avx2-rtm \
   strlen-evex \
+  strlen-evex512 \
   strlen-sse2 \
   strncase_l-avx2 \
   strncase_l-avx2-rtm \
@@ -105,6 +106,7 @@ sysdep_routines += \
   strnlen-avx2 \
   strnlen-avx2-rtm \
   strnlen-evex \
+  strnlen-evex512 \
   strnlen-sse2 \
   strpbrk-c \
   strpbrk-sse2 \
@@ -138,6 +140,7 @@ sysdep_routines += \
   wcslen-avx2 \
   wcslen-avx2-rtm \
   wcslen-evex \
+  wcslen-evex512 \
   wcslen-sse2 \
   wcslen-sse4_1 \
   wcsncmp-avx2 \
@@ -148,6 +151,7 @@ sysdep_routines += \
   wcsnlen-avx2-rtm \
   wcsnlen-c \
   wcsnlen-evex \
+  wcsnlen-evex512 \
   wcsnlen-sse4_1 \
   wcsrchr-avx2 \
   wcsrchr-avx2-rtm \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 7218095430..c5cd9466fe 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -328,6 +328,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			       && CPU_FEATURE_USABLE (AVX512BW)
 			       && CPU_FEATURE_USABLE (BMI2)),
 			      __strlen_evex)
+	      IFUNC_IMPL_ADD (array, i, strlen,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+			      __strlen_evex512)
 	      IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2))
 
   /* Support sysdeps/x86_64/multiarch/strnlen.c.  */
@@ -346,6 +351,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			       && CPU_FEATURE_USABLE (AVX512BW)
 			       && CPU_FEATURE_USABLE (BMI2)),
 			      __strnlen_evex)
+	      IFUNC_IMPL_ADD (array, i, strnlen,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+			      __strnlen_evex512)
 	      IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_sse2))
 
   /* Support sysdeps/x86_64/multiarch/stpncpy.c.  */
@@ -699,6 +709,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			       && CPU_FEATURE_USABLE (AVX512BW)
 			       && CPU_FEATURE_USABLE (BMI2)),
 			      __wcslen_evex)
+	      IFUNC_IMPL_ADD (array, i, wcslen,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+			      __wcslen_evex512)
 	      IFUNC_IMPL_ADD (array, i, wcslen,
 			      CPU_FEATURE_USABLE (SSE4_1),
 			      __wcslen_sse4_1)
@@ -720,6 +735,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			       && CPU_FEATURE_USABLE (AVX512BW)
 			       && CPU_FEATURE_USABLE (BMI2)),
 			      __wcsnlen_evex)
+	      IFUNC_IMPL_ADD (array, i, wcsnlen,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+			      __wcsnlen_evex512)
 	      IFUNC_IMPL_ADD (array, i, wcsnlen,
 			      CPU_FEATURE_USABLE (SSE4_1),
 			      __wcsnlen_sse4_1)
diff --git a/sysdeps/x86_64/multiarch/strlen-evex512.S b/sysdeps/x86_64/multiarch/strlen-evex512.S
new file mode 100644
index 0000000000..0a2d7bbb1a
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strlen-evex512.S
@@ -0,0 +1,291 @@
+/* Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# ifndef STRLEN
+#  define STRLEN	__strlen_evex512
+# endif
+
+# define VMOVA		vmovdqa64
+# ifdef USE_AS_WCSLEN
+#  define VPCMP		vpcmpd
+#  define VPMINU	vpminud
+#  define CHAR_SIZE	4
+# else
+#  define VPCMP		vpcmpb
+#  define VPMINU	vpminub
+#  define CHAR_SIZE	1
+# endif
+
+# define XMM0		xmm16
+# define ZMM0		zmm16
+# define ZMM1		zmm17
+# define ZMM2		zmm18
+# define ZMM3		zmm19
+# define ZMM4		zmm20
+# define VEC_SIZE	64
+# define PAGE_SIZE	4096
+# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
+
+	.section .text.evex512, "ax", @progbits
+/* Aligning the entry point to 64 bytes provides better performance for
+   one-vector-length strings.  */
+ENTRY_P2ALIGN (STRLEN, 6)
+# ifdef USE_AS_STRNLEN
+	/* Check zero length.  */
+	test	%RSI_LP, %RSI_LP
+	jz	L(zero)
+#  ifdef __ILP32__
+	/* Clear the upper 32 bits.  */
+	movl	%esi, %esi
+#  endif
+# endif
+
+	movl	%edi, %ecx
+	vpxorq	%XMM0, %XMM0, %XMM0
+	andl	$(PAGE_SIZE - 1), %ecx
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %ecx
+	ja	L(page_cross)
+
+	/* Compare [w]char for null, mask bit will be set for match.  */
+	VPCMP	$0, (%rdi), %ZMM0, %k0
+	kmovq	%k0, %rax
+	testq	%rax, %rax
+	jz	L(align_more)
+
+	tzcntq	%rax, %rax
+# ifdef USE_AS_STRNLEN
+	cmpq	%rsi, %rax
+	jae	L(ret_max)
+# endif
+	ret
+
+# ifdef USE_AS_STRNLEN
+	/* eax instead of rax used to save encoding space.  */
+L(zero):
+	xorl	%eax, %eax
+	ret
+# endif
+
+	/* At this point vector max length reached.  */
+# ifdef USE_AS_STRNLEN
+L(ret_max):
+	movq	%rsi, %rax
+	ret
+# endif
+
+L(page_cross):
+	andl	$(VEC_SIZE - 1), %ecx
+# ifdef USE_AS_WCSLEN
+	sarl	$2, %ecx
+# endif
+	/* ecx contains the number of [w]chars to be skipped as a result
+	   of address alignment.  */
+	movq	%rdi, %rax
+	andq	$-VEC_SIZE, %rax
+	VPCMP	$0, (%rax), %ZMM0, %k0
+	kmovq	%k0, %rax
+	/* Ignore number of character for alignment adjustment.  */
+	shrq	%cl, %rax
+	jz	L(align_more)
+
+	tzcntq	%rax, %rax
+# ifdef USE_AS_STRNLEN
+	cmpq	%rsi, %rax
+	jae	L(ret_max)
+# endif
+	ret
+
+L(align_more):
+	leaq	VEC_SIZE(%rdi), %rax
+	/* Align rax to VEC_SIZE.  */
+	andq	$-VEC_SIZE, %rax
+# ifdef USE_AS_STRNLEN
+	movq	%rax, %rdx
+	subq	%rdi, %rdx
+#  ifdef USE_AS_WCSLEN
+	shrq	$2, %rdx
+#  endif
+	/* At this point rdx contains [w]chars already compared.  */
+	cmpq	%rsi, %rdx
+	jae	L(ret_max)
+	subq	%rsi, %rdx
+	negq	%rdx
+	/* At this point rdx contains the number of [w]chars left to go.
+	   From now on rdx keeps decrementing with each compare.  */
+# endif
+
+	/* Loop unroll 4 times for 4 vector loop.  */
+	VPCMP	$0, (%rax), %ZMM0, %k0
+	kmovq	%k0, %rcx
+	testq	%rcx, %rcx
+	jnz	L(ret_vec_x1)
+
+# ifdef USE_AS_STRNLEN
+	subq    $CHAR_PER_VEC, %rdx
+	jbe	L(ret_max)
+# endif
+
+	VPCMP	$0, VEC_SIZE(%rax), %ZMM0, %k0
+	kmovq	%k0, %rcx
+	testq	%rcx, %rcx
+	jnz	L(ret_vec_x2)
+
+# ifdef USE_AS_STRNLEN
+	subq    $CHAR_PER_VEC, %rdx
+	jbe	L(ret_max)
+# endif
+
+	VPCMP	$0, (2 * VEC_SIZE)(%rax), %ZMM0, %k0
+	kmovq	%k0, %rcx
+	testq	%rcx, %rcx
+	jnz	L(ret_vec_x3)
+
+# ifdef USE_AS_STRNLEN
+	subq    $CHAR_PER_VEC, %rdx
+	jbe	L(ret_max)
+# endif
+
+	VPCMP	$0, (3 * VEC_SIZE)(%rax), %ZMM0, %k0
+	kmovq	%k0, %rcx
+	testq	%rcx, %rcx
+	jnz	L(ret_vec_x4)
+
+	addq    $(4 * VEC_SIZE), %rax
+
+# ifdef USE_AS_STRNLEN
+	/* Instead of decreasing, rdx is increased here to prepare for the
+	   first loop iteration.  It is incremented by 3 vectors because one
+	   increment is cancelled by the previous decrement.  */
+	addq    $(3 * CHAR_PER_VEC), %rdx
+# endif
+
+	/* If the address is already 4 * VEC_SIZE byte aligned, go straight
+	   to the loop.  */
+	testq   $(3 * VEC_SIZE), %rax
+	jz      L(loop)
+
+	movq	%rax, %rcx
+
+	/* Align address to 4 * VEC_SIZE for loop.  */
+	andq	$-(4 * VEC_SIZE), %rax
+
+# ifdef USE_AS_STRNLEN
+	subq	%rax, %rcx
+#  ifdef USE_AS_WCSLEN
+	sarq	$2, %rcx
+#  endif
+	/* rcx contains the number of [w]chars that will be recompared due
+	   to the alignment fix.  rdx must be incremented by rcx to offset
+	   the alignment adjustment.  */
+	addq	%rcx, %rdx
+# endif
+
+L(loop):
+# ifdef USE_AS_STRNLEN
+	subq	$(CHAR_PER_VEC * 4), %rdx
+	jbe	L(ret_max)
+# endif
+	/* The VPMINU and VPCMP combination provides better performance
+	   than alternative combinations.  */
+	VMOVA	(%rax), %ZMM1
+	VPMINU	(VEC_SIZE)(%rax), %ZMM1, %ZMM2
+	VMOVA   (2 * VEC_SIZE)(%rax), %ZMM3
+	VPMINU	(3 * VEC_SIZE)(%rax), %ZMM3, %ZMM4
+
+	VPCMP	$0, %ZMM2, %ZMM0, %k0
+	VPCMP	$0, %ZMM4, %ZMM0, %k1
+
+	addq	$(4 * VEC_SIZE), %rax
+	kortestq %k0, %k1
+	jz	L(loop)
+
+	/* Subtract 4 vectors because the address was incremented in the
+	   loop before the terminating condition check.  This also allows
+	   reusing the exit code before and after the loop.  */
+	subq	$(4 * VEC_SIZE), %rax
+
+	VPCMP	$0, %ZMM1, %ZMM0, %k2
+	kmovq	%k2, %rcx
+	testq	%rcx, %rcx
+	jnz	L(ret_vec_x1)
+
+	kmovq	%k0, %rcx
+	/* At this point, if k0 is non zero, null char must be in the
+	   second vector.  */
+	testq	%rcx, %rcx
+	jnz	L(ret_vec_x2)
+
+	VPCMP	$0, %ZMM3, %ZMM0, %k3
+	kmovq	%k3, %rcx
+	testq	%rcx, %rcx
+	jnz	L(ret_vec_x3)
+	/* At this point null [w]char must be in the fourth vector so no
+	   need to check.  */
+	kmovq	%k1, %rcx
+
+	/* The termination paths for the fourth, third and second vectors
+	   are essentially the same; they are laid out this way to avoid
+	   branching and to reuse the pre-loop exit code.  */
+L(ret_vec_x4):
+	addq	$(3 * VEC_SIZE), %rax
+	tzcntq	%rcx, %rcx
+	subq	%rdi, %rax
+# ifdef USE_AS_WCSLEN
+	sarq	$2, %rax
+# endif
+	addq	%rcx, %rax
+# ifdef USE_AS_STRNLEN
+	cmpq	%rsi, %rax
+	jae	L(ret_max)
+# endif
+	ret
+
+L(ret_vec_x3):
+	addq	$(2 * VEC_SIZE), %rax
+	tzcntq	%rcx, %rcx
+	subq	%rdi, %rax
+# ifdef USE_AS_WCSLEN
+	sarq	$2, %rax
+# endif
+	addq	%rcx, %rax
+# ifdef USE_AS_STRNLEN
+	cmpq	%rsi, %rax
+	jae	L(ret_max)
+# endif
+	ret
+
+L(ret_vec_x2):
+	addq	$VEC_SIZE, %rax
+L(ret_vec_x1):
+	tzcntq	%rcx, %rcx
+	subq	%rdi, %rax
+# ifdef USE_AS_WCSLEN
+	sarq	$2, %rax
+# endif
+	addq	%rcx, %rax
+# ifdef USE_AS_STRNLEN
+	cmpq	%rsi, %rax
+	jae	L(ret_max)
+# endif
+	ret
+
+END (STRLEN)
+#endif
diff --git a/sysdeps/x86_64/multiarch/strnlen-evex512.S b/sysdeps/x86_64/multiarch/strnlen-evex512.S
new file mode 100644
index 0000000000..0b7f220214
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strnlen-evex512.S
@@ -0,0 +1,4 @@
+#define STRLEN __strnlen_evex512
+#define USE_AS_STRNLEN 1
+
+#include "strlen-evex512.S"
diff --git a/sysdeps/x86_64/multiarch/wcslen-evex512.S b/sysdeps/x86_64/multiarch/wcslen-evex512.S
new file mode 100644
index 0000000000..f59c372b78
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcslen-evex512.S
@@ -0,0 +1,4 @@
+#define STRLEN __wcslen_evex512
+#define USE_AS_WCSLEN 1
+
+#include "strlen-evex512.S"
diff --git a/sysdeps/x86_64/multiarch/wcsnlen-evex512.S b/sysdeps/x86_64/multiarch/wcsnlen-evex512.S
new file mode 100644
index 0000000000..73dcf2f210
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsnlen-evex512.S
@@ -0,0 +1,5 @@
+#define STRLEN __wcsnlen_evex512
+#define USE_AS_WCSLEN 1
+#define USE_AS_STRNLEN 1
+
+#include "strlen-evex512.S"
-- 
2.35.3


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH] x86_64: Implement evex512 version of strlen, strnlen, wcslen and wcsnlen
  2022-05-19  3:33   ` Sunil Pandey
  2022-05-19  3:48     ` [PATCH v2] " Sunil K Pandey
@ 2022-05-19  4:41     ` Noah Goldstein
  1 sibling, 0 replies; 12+ messages in thread
From: Noah Goldstein @ 2022-05-19  4:41 UTC (permalink / raw)
  To: Sunil Pandey; +Cc: GNU C Library

On Wed, May 18, 2022 at 10:33 PM Sunil Pandey <skpgkp2@gmail.com> wrote:
>
> On Wed, May 18, 2022 at 1:29 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > On Wed, May 18, 2022 at 1:59 PM Sunil K Pandey via Libc-alpha
> > <libc-alpha@sourceware.org> wrote:
> > >
> > > This patch implements following evex512 version of string functions.
> > > Perf gain up to 50% as compared to evex, depending on length and
> > > alignment.
> >
> > Can you include a csv (or any consistent fmt really) somewhere of all
> > the benchmarks
> > and results of ~10-20 runs and the hardware your benchmarking on?
>
> Machine:
> Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz
> Fedora 35
> Glibc master
>
> Data for 20 iterations of each function is attached; please use any
> text editor (e.g. vi) to view it.

Any chance you can aggregate it?

Also, can you add the collective geometric mean of evex vs evex512 and
the CPU info to the commit message?
>
> > >
> > > - String length function using 512 bit vectors.
> > > - String N length using 512 bit vectors.
> > > - Wide string length using 512 bit vectors.
> > > - Wide string N length using 512 bit vectors.
> > > ---
> > >  sysdeps/x86_64/multiarch/Makefile          |   4 +
> > >  sysdeps/x86_64/multiarch/ifunc-impl-list.c |  20 ++
> > >  sysdeps/x86_64/multiarch/strlen-evex512.S  | 291 +++++++++++++++++++++
> > >  sysdeps/x86_64/multiarch/strnlen-evex512.S |   4 +
> > >  sysdeps/x86_64/multiarch/wcslen-evex512.S  |   4 +
> > >  sysdeps/x86_64/multiarch/wcsnlen-evex512.S |   5 +
> > >  6 files changed, 328 insertions(+)
> > >  create mode 100644 sysdeps/x86_64/multiarch/strlen-evex512.S
> > >  create mode 100644 sysdeps/x86_64/multiarch/strnlen-evex512.S
> > >  create mode 100644 sysdeps/x86_64/multiarch/wcslen-evex512.S
> > >  create mode 100644 sysdeps/x86_64/multiarch/wcsnlen-evex512.S
> > >
> > > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> > > index f3ab5e0928..d0869c3ac3 100644
> > > --- a/sysdeps/x86_64/multiarch/Makefile
> > > +++ b/sysdeps/x86_64/multiarch/Makefile
> > > @@ -81,6 +81,7 @@ sysdep_routines += \
> > >    strlen-avx2 \
> > >    strlen-avx2-rtm \
> > >    strlen-evex \
> > > +  strlen-evex512 \
> > >    strlen-sse2 \
> > >    strncase_l-avx2 \
> > >    strncase_l-avx2-rtm \
> > > @@ -105,6 +106,7 @@ sysdep_routines += \
> > >    strnlen-avx2 \
> > >    strnlen-avx2-rtm \
> > >    strnlen-evex \
> > > +  strnlen-evex512 \
> > >    strnlen-sse2 \
> > >    strpbrk-c \
> > >    strpbrk-sse2 \
> > > @@ -138,6 +140,7 @@ sysdep_routines += \
> > >    wcslen-avx2 \
> > >    wcslen-avx2-rtm \
> > >    wcslen-evex \
> > > +  wcslen-evex512 \
> > >    wcslen-sse2 \
> > >    wcslen-sse4_1 \
> > >    wcsncmp-avx2 \
> > > @@ -148,6 +151,7 @@ sysdep_routines += \
> > >    wcsnlen-avx2-rtm \
> > >    wcsnlen-c \
> > >    wcsnlen-evex \
> > > +  wcsnlen-evex512 \
> > >    wcsnlen-sse4_1 \
> > >    wcsrchr-avx2 \
> > >    wcsrchr-avx2-rtm \
> > > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > index 7218095430..c5cd9466fe 100644
> > > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > @@ -328,6 +328,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > >                                && CPU_FEATURE_USABLE (AVX512BW)
> > >                                && CPU_FEATURE_USABLE (BMI2)),
> > >                               __strlen_evex)
> > > +             IFUNC_IMPL_ADD (array, i, strlen,
> > > +                             (CPU_FEATURE_USABLE (AVX512VL)
> > > +                              && CPU_FEATURE_USABLE (AVX512BW)
> > > +                              && CPU_FEATURE_USABLE (BMI2)),
> > > +                             __strlen_evex512)
> > >               IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2))
> > >
> > >    /* Support sysdeps/x86_64/multiarch/strnlen.c.  */
> > > @@ -346,6 +351,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > >                                && CPU_FEATURE_USABLE (AVX512BW)
> > >                                && CPU_FEATURE_USABLE (BMI2)),
> > >                               __strnlen_evex)
> > > +             IFUNC_IMPL_ADD (array, i, strnlen,
> > > +                             (CPU_FEATURE_USABLE (AVX512VL)
> > > +                              && CPU_FEATURE_USABLE (AVX512BW)
> > > +                              && CPU_FEATURE_USABLE (BMI2)),
> > > +                             __strnlen_evex512)
> > >               IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_sse2))
> > >
> > >    /* Support sysdeps/x86_64/multiarch/stpncpy.c.  */
> > > @@ -699,6 +709,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > >                                && CPU_FEATURE_USABLE (AVX512BW)
> > >                                && CPU_FEATURE_USABLE (BMI2)),
> > >                               __wcslen_evex)
> > > +             IFUNC_IMPL_ADD (array, i, wcslen,
> > > +                             (CPU_FEATURE_USABLE (AVX512VL)
> > > +                              && CPU_FEATURE_USABLE (AVX512BW)
> > > +                              && CPU_FEATURE_USABLE (BMI2)),
> > > +                             __wcslen_evex512)
> > >               IFUNC_IMPL_ADD (array, i, wcslen,
> > >                               CPU_FEATURE_USABLE (SSE4_1),
> > >                               __wcslen_sse4_1)
> > > @@ -720,6 +735,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > >                                && CPU_FEATURE_USABLE (AVX512BW)
> > >                                && CPU_FEATURE_USABLE (BMI2)),
> > >                               __wcsnlen_evex)
> > > +             IFUNC_IMPL_ADD (array, i, wcsnlen,
> > > +                             (CPU_FEATURE_USABLE (AVX512VL)
> > > +                              && CPU_FEATURE_USABLE (AVX512BW)
> > > +                              && CPU_FEATURE_USABLE (BMI2)),
> > > +                             __wcsnlen_evex512)
> > >               IFUNC_IMPL_ADD (array, i, wcsnlen,
> > >                               CPU_FEATURE_USABLE (SSE4_1),
> > >                               __wcsnlen_sse4_1)
> > > diff --git a/sysdeps/x86_64/multiarch/strlen-evex512.S b/sysdeps/x86_64/multiarch/strlen-evex512.S
> > > new file mode 100644
> > > index 0000000000..13a6b34615
> > > --- /dev/null
> > > +++ b/sysdeps/x86_64/multiarch/strlen-evex512.S
> > > @@ -0,0 +1,291 @@
> > > +/* Copyright (C) 2022 Free Software Foundation, Inc.
> > > +   This file is part of the GNU C Library.
> > > +
> > > +   The GNU C Library is free software; you can redistribute it and/or
> > > +   modify it under the terms of the GNU Lesser General Public
> > > +   License as published by the Free Software Foundation; either
> > > +   version 2.1 of the License, or (at your option) any later version.
> > > +
> > > +   The GNU C Library is distributed in the hope that it will be useful,
> > > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > +   Lesser General Public License for more details.
> > > +
> > > +   You should have received a copy of the GNU Lesser General Public
> > > +   License along with the GNU C Library; if not, see
> > > +   <https://www.gnu.org/licenses/>.  */
> > > +
> > > +#if IS_IN (libc)
> > > +
> > > +# include <sysdep.h>
> > > +
> > > +# ifndef STRLEN
> > > +#  define STRLEN       __strlen_evex512
> > > +# endif
> > > +
> > > +# define VMOVA         vmovdqa64
> > > +# ifdef USE_AS_WCSLEN
> > > +#  define VPCMP                vpcmpd
> > > +#  define VPMINU       vpminud
> > > +#  define CHAR_SIZE    4
> > > +# else
> > > +#  define VPCMP                vpcmpb
> > > +#  define VPMINU       vpminub
> > > +#  define CHAR_SIZE    1
> > > +# endif
> > > +
> > > +# define XMM0          xmm16
> > > +# define ZMM0          zmm16
> > > +# define ZMM1          zmm17
> > > +# define ZMM2          zmm18
> > > +# define ZMM3          zmm19
> > > +# define ZMM4          zmm20
> > > +# define VEC_SIZE      64
> > > +# define PAGE_SIZE     4096
> > > +# define CHAR_PER_VEC  (VEC_SIZE / CHAR_SIZE)
> >
> > Is it possible to integrate this file cleanly with the evex256 version?
> > Something similar to what we do for memset/memmove.
>
> Good suggestion, I will look into it. For the first iteration, let's
> keep it standalone
> for now.

Why? There are a fair number of functions. Given that evex/evex512
are just about 1-1 except for VEC_SIZE, we should try to integrate them.
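Roughly what I have in mind (untested sketch, macro names are just
placeholders) is a shared strlen-evex-base.S parameterized on VEC_SIZE,
with thin wrappers on top:

    /* strlen-evex-base.S: shared body.  */
    #if VEC_SIZE == 64
    # define VMM0   zmm16
    # define KMOV   kmovq
    #elif VEC_SIZE == 32
    # define VMM0   ymm16
    # define KMOV   kmovd
    #endif

    /* strlen-evex512.S: thin wrapper.  */
    #define STRLEN   __strlen_evex512
    #define VEC_SIZE 64
    #include "strlen-evex-base.S"

so an evex256 variant only has to flip VEC_SIZE.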
>
> > > +
> > > +       .section .text.evex512, "ax", @progbits
> > > +/* Aligning entry point to 64 byte, provides better performance for
> > > +   one vector length string.  */
> > > +ENTRY_P2ALIGN (STRLEN, 6)
> > > +# ifdef USE_AS_STRNLEN
> > > +        /* Check zero length.  */
> > > +       test    %RSI_LP, %RSI_LP
> > > +       jz      L(zero)
> > > +#  ifdef __ILP32__
> > > +       /* Clear the upper 32 bits.  */
> > > +       movl    %esi, %esi
> > > +#  endif
> > > +# endif
> > > +
> > > +       movl    %edi, %ecx
> > > +       vpxorq  %XMM0, %XMM0, %XMM0
> > > +       andl    $(PAGE_SIZE - 1), %ecx
> > > +       cmpl    $(PAGE_SIZE - VEC_SIZE), %ecx
> > > +       ja      L(page_cross)
> > > +
> > > +       /* Compare [w]char for null, mask bit will be set for match.  */
> > > +       VPCMP   $0, (%rdi), %ZMM0, %k0
> > > +       kmovq   %k0, %rax
> > > +       testq   %rax, %rax
> > > +       jz      L(align_more)
> > > +
> > > +       tzcntq  %rax, %rax
> > > +# ifdef USE_AS_STRNLEN
> > > +       cmpq    %rsi, %rax
> > cmpl
> >
> > > +       jae     L(ret_max)
> > > +# endif
> > > +       ret
> > > +
> > > +# ifdef USE_AS_STRNLEN
> > > +       /* eax instead of rax used to save encoding space.  */
> > > +L(zero):
> > > +       xorl    %eax, %eax
> > > +       ret
> > > +# endif
> > > +
> > > +       /* At this point vector max length reached.  */
> > > +# ifdef USE_AS_STRNLEN
> > > +L(ret_max):
> > > +       movq    %rsi, %rax
> > > +       ret
> > > +# endif
> > > +
> > > +L(page_cross):
> >
> > Imo unless you need the 2-byte encoding on the jump this should be at
> > the end of the
> > file as its expected to not be hot.
>
> One of my goals is to reduce size as much as possible, as long as it
> doesn't hurt performance.  Keeping the jump target nearby reduces
> size by a few bytes without hurting performance.

Fair enough, although the page cross cases are pretty cold. Putting cold
code in a hot region is a waste in a sense too.
>
> > > +       andl    $(VEC_SIZE - 1), %ecx
> > > +# ifdef USE_AS_WCSLEN
> > > +       sarl    $2, %ecx
> > > +# endif
> > > +       /* ecx contains number of w[char] to be skipped as a result
> > > +          of address alignment.  */
> > > +       movq    %rdi, %rax
> > > +       andq    $-VEC_SIZE, %rax
> > > +       VPCMP   $0, (%rax), %ZMM0, %k0
> > > +       kmovq   %k0, %rax
> > > +       /* Ignore number of character for alignment adjustment.  */
> > > +       shrq    %cl, %rax
> > > +       jz      L(align_more)
> > > +
> > > +       tzcntq  %rax, %rax
> > > +# ifdef USE_AS_STRNLEN
> > > +       cmpq    %rsi, %rax
> > > +       jae     L(ret_max)
> > > +# endif
> > > +       ret
> > > +
> > > +L(align_more):
> > > +       leaq    VEC_SIZE(%rdi), %rax
> > > +       /* Align rax to VEC_SIZE.  */
> > > +       andq    $-VEC_SIZE, %rax
> > > +# ifdef USE_AS_STRNLEN
> > > +       movq    %rax, %rdx
> > > +       subq    %rdi, %rdx
> > > +#  ifdef USE_AS_WCSLEN
> > > +       shrq    $2, %rdx
> > > +#  endif
> > > +       /* At this point rdx contains [w]chars already compared.  */
> > > +       cmpq    %rsi, %rdx
> > > +       jae     L(ret_max)
> > > +       subq    %rsi, %rdx
> > > +       negq    %rdx
> > > +       /* At this point rdx contains number of w[char] needs to go.
> > > +          Now onwards rdx will keep decrementing with each compare.  */
> > > +# endif
> > > +
> > > +       /* Loop unroll 4 times for 4 vector loop.  */
> > > +       VPCMP   $0, (%rax), %ZMM0, %k0
> > > +       kmovq   %k0, %rcx
> > > +       testq   %rcx, %rcx
> > > +       jnz     L(first_vector)
> >
> > Just to keep consistent with the other files can you
> > rename first_vector/second_vector... to ret_vec_x{N}
> > or something like that.
>
> Agree, will be fixed in v1.
>
> > > +
> > > +# ifdef USE_AS_STRNLEN
> > > +       subq    $CHAR_PER_VEC, %rdx
> > > +       jbe     L(ret_max)
> > > +# endif
> > > +
> > > +       VPCMP   $0, VEC_SIZE(%rax), %ZMM0, %k0
> > > +       kmovq   %k0, %rcx
> > > +       testq   %rcx, %rcx
> > > +       jnz     L(second_vector)
> > > +
> > > +# ifdef USE_AS_STRNLEN
> > > +       subq    $CHAR_PER_VEC, %rdx
> > > +       jbe     L(ret_max)
> > > +# endif
> >
> > The evex256 / avx2 versions do a simple check of whether we will be able
> > to do all 4 aligning compares without a branch.  This saves total
> > branches.  Why not do something similar here?
>
> Done this way to reduce size and complexity. If the branch is taken, it
> jumps to the terminating condition; if not taken, it has no impact on perf.

Don't think that's quite true...
>
> > > +
> > > +       VPCMP   $0, (2 * VEC_SIZE)(%rax), %ZMM0, %k0
> > > +       kmovq   %k0, %rcx
> > > +       testq   %rcx, %rcx
> > > +       jnz     L(third_vector)
> > > +
> > > +# ifdef USE_AS_STRNLEN
> > > +       subq    $CHAR_PER_VEC, %rdx
> > > +       jbe     L(ret_max)
> > > +# endif
> > > +
> > > +       VPCMP   $0, (3 * VEC_SIZE)(%rax), %ZMM0, %k0
> > > +       kmovq   %k0, %rcx
> > > +       testq   %rcx, %rcx
> > > +       jnz     L(fourth_vector)
> > > +
> > > +       addq    $(4 * VEC_SIZE), %rax
> > > +
> > > +# ifdef USE_AS_STRNLEN
> > > +       /* Instead of decreasing, rdx increased to prepare for loop
> > > +          first iteration.  Incremented 3 times because one increment
> > > +          cancelled by previous decrement.  */
> > > +       addq    $(3 * CHAR_PER_VEC), %rdx
> > > +# endif
> > > +
> > > +       /* Test if address is already 4 * VEC_SIZE byte aligned goto
> > > +          loop.  */
> > > +       testq   $(3 * VEC_SIZE), %rax
> > > +       jz      L(loop)
> > > +
> > > +       movq    %rax, %rcx
> > > +
> > > +       /* Align address to 4 * VEC_SIZE for loop.  */
> > > +       andq    $-(4 * VEC_SIZE), %rax
> > > +
> > > +# ifdef USE_AS_STRNLEN
> > > +       subq    %rax, %rcx
> > > +#  ifdef USE_AS_WCSLEN
> > > +       sarq    $2, %rcx
> > > +#  endif
> > > +       /* rcx contains number of [w]char will be recompared due to
> > > +          alignment fixes.  rdx must be incremented by rcx to offset
> > > +          alignment adjustmentment.  */
> > > +       addq    %rcx, %rdx
> > > +# endif
> > > +
> > > +L(loop):
> > > +# ifdef USE_AS_STRNLEN
> > > +       subq    $(CHAR_PER_VEC * 4), %rdx
> > > +       jbe     L(ret_max)
> >
> > We have the potential to overread by 255 bytes. Not a correctness issue
> > because we are page aligned, but it seems like a possible perf issue.
>
> Correct, but the overread data will be read from cache, not memory, so the
> impact is not significant; this is the cost we have to pay for 4-vector alignment.

You can implement it so it has a last 4x case instead.
>
> > > +# endif
> > > +       /* VPMINU and VPCMP combination provide better perfomance as
> > > +          compared to alternative combinations.  */
> > > +       VMOVA   (%rax), %ZMM1
> > > +       VPMINU  (VEC_SIZE)(%rax), %ZMM1, %ZMM2
> > > +       VMOVA   (2 * VEC_SIZE)(%rax), %ZMM3
> > > +       VPMINU  (3 * VEC_SIZE)(%rax), %ZMM3, %ZMM4
> >
> > I think doing 4x in the main loop is probably overkill no?
> > Aligning to 256 is pretty extreme.
> >
> > Also I don't think the 4x zmm loads can even keep up with
> > 2x / cycle so seems like it may not be worth wasting up to
> > 255 bytes to get it.
>
> Perf number looks good, so for now it should be ok.

Would prefer having a good final version.
>
> > > +
> > > +       VPCMP   $0, %ZMM2, %ZMM0, %k0
> > > +       VPCMP   $0, %ZMM4, %ZMM0, %k1
> > > +
> > > +       addq    $(4 * VEC_SIZE), %rax
> > > +       kortestq %k0, %k1
> > > +       jz      L(loop)
> > > +
> > > +       /* Need 4 vector subtraction because address incremented in
> > > +          the loop before terminating condition check.  Also want to
> > > +          reuse code for exit condition before and after the loop.  */
> > > +       subq    $(4 * VEC_SIZE), %rax
> > > +
> > > +       VPCMP   $0, %ZMM1, %ZMM0, %k2
> > > +       kmovq   %k2, %rcx
> > > +       testq   %rcx, %rcx
> > > +       jnz     L(first_vector)
> > > +
> > > +       kmovq   %k0, %rcx
> > > +       /* At this point, if k0 is non zero, null char must be in the
> > > +          second vector.  */
> > > +       testq   %rcx, %rcx
> > > +       jnz     L(second_vector)
> > > +
> > > +       VPCMP   $0, %ZMM3, %ZMM0, %k3
> > > +       kmovq   %k3, %rcx
> > > +       testq   %rcx, %rcx
> > > +       jnz     L(third_vector)
> > > +       /* At this point null [w]char must be in the fourth vector so no
> > > +          need to check.  */
> > > +       kmovq   %k1, %rcx
> > > +
> > > +       /* Termination fourth, third, second vector are pretty much
> > > +          same, implemented this way to avoid branching and reuse code
> > > +          from pre loop exit condition.  */
> > > +L(fourth_vector):
> > > +       addq    $(3 * VEC_SIZE), %rax
> > > +       tzcntq  %rcx, %rcx
> > > +       subq    %rdi, %rax
> > Can this be hoisted out to the beginning of L(aligned_more)?
> > It seems every return path uses it.
> >
>
> It really depends on where control is coming from, so moving it before
> align_more will not be correct, unless I am missing something here.

Is there any path from the *beginning* of L(aligned_more) that
doesn't go to either L(ret_max) or one of the 4 return statements?
>
> > > +# ifdef USE_AS_WCSLEN
> > > +       sarq    $2, %rax
> > > +# endif
> > > +       addq    %rcx, %rax
> >
> > If not wcslen, it's probably faster to use lea instead of 2x add.
>
> I'm not sure whether there will be any significant gain from lea vs add. I used
> add because it's readily available on all ports.

AFAIK all machines we would enable evex512 on have fast LEA.
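E.g. for the byte version something like (untested sketch):

    tzcntq  %rcx, %rcx
    subq    %rdi, %rax
    leaq    (3 * VEC_SIZE)(%rax, %rcx), %rax

folding the leading `addq $(3 * VEC_SIZE)` and the final `addq %rcx`
into the lea.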
>
>
> >
> > > +# ifdef USE_AS_STRNLEN
> > > +       cmpq    %rsi, %rax
> > > +       jae     L(ret_max)
> > > +# endif
> > > +       ret
> > > +
> > > +L(third_vector):
> > > +       addq    $(2 * VEC_SIZE), %rax
> > > +       tzcntq  %rcx, %rcx
> > > +       subq    %rdi, %rax
> > > +# ifdef USE_AS_WCSLEN
> > > +       sarq    $2, %rax
> > > +# endif
> > > +       addq    %rcx, %rax
> > > +# ifdef USE_AS_STRNLEN
> > > +       cmpq    %rsi, %rax
> > > +       jae     L(ret_max)
> > > +# endif
> > > +       ret
> > > +
> > > +L(second_vector):
> > > +       addq    $VEC_SIZE, %rax
> > > +L(first_vector):
> > > +       tzcntq  %rcx, %rcx
> > > +       subq    %rdi, %rax
> > > +# ifdef USE_AS_WCSLEN
> > > +       sarq    $2, %rax
> > > +# endif
> > > +       addq    %rcx, %rax
> > > +# ifdef USE_AS_STRNLEN
> > > +       cmpq    %rsi, %rax
> > > +       jae     L(ret_max)
> > > +# endif
> > > +       ret
> > > +
> > > +END (STRLEN)
> > > +#endif
> > > diff --git a/sysdeps/x86_64/multiarch/strnlen-evex512.S b/sysdeps/x86_64/multiarch/strnlen-evex512.S
> > > new file mode 100644
> > > index 0000000000..0b7f220214
> > > --- /dev/null
> > > +++ b/sysdeps/x86_64/multiarch/strnlen-evex512.S
> > > @@ -0,0 +1,4 @@
> > > +#define STRLEN __strnlen_evex512
> > > +#define USE_AS_STRNLEN 1
> > > +
> > > +#include "strlen-evex512.S"
> > > diff --git a/sysdeps/x86_64/multiarch/wcslen-evex512.S b/sysdeps/x86_64/multiarch/wcslen-evex512.S
> > > new file mode 100644
> > > index 0000000000..f59c372b78
> > > --- /dev/null
> > > +++ b/sysdeps/x86_64/multiarch/wcslen-evex512.S
> > > @@ -0,0 +1,4 @@
> > > +#define STRLEN __wcslen_evex512
> > > +#define USE_AS_WCSLEN 1
> > > +
> > > +#include "strlen-evex512.S"
> > > diff --git a/sysdeps/x86_64/multiarch/wcsnlen-evex512.S b/sysdeps/x86_64/multiarch/wcsnlen-evex512.S
> > > new file mode 100644
> > > index 0000000000..73dcf2f210
> > > --- /dev/null
> > > +++ b/sysdeps/x86_64/multiarch/wcsnlen-evex512.S
> > > @@ -0,0 +1,5 @@
> > > +#define STRLEN __wcsnlen_evex512
> > > +#define USE_AS_WCSLEN 1
> > > +#define USE_AS_STRNLEN 1
> > > +
> > > +#include "strlen-evex512.S"
> > > --
> > > 2.35.3
> > >

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v2] x86_64: Implement evex512 version of strlen, strnlen,  wcslen and wcsnlen
  2022-05-19  3:48     ` [PATCH v2] " Sunil K Pandey
@ 2022-05-19 15:03       ` Noah Goldstein
  2022-05-25 13:43         ` [PATCH v3] " Sunil K Pandey
  0 siblings, 1 reply; 12+ messages in thread
From: Noah Goldstein @ 2022-05-19 15:03 UTC (permalink / raw)
  To: Sunil K Pandey; +Cc: GNU C Library

On Wed, May 18, 2022 at 10:48 PM Sunil K Pandey via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> This patch implements following evex512 version of string functions.
> Perf gain up to 50% as compared to evex, depending on length and
> alignment.
>
> - String length function using 512 bit vectors.
> - String N length using 512 bit vectors.
> - Wide string length using 512 bit vectors.
> - Wide string N length using 512 bit vectors.
> ---
>  sysdeps/x86_64/multiarch/Makefile          |   4 +
>  sysdeps/x86_64/multiarch/ifunc-impl-list.c |  20 ++
>  sysdeps/x86_64/multiarch/strlen-evex512.S  | 291 +++++++++++++++++++++
>  sysdeps/x86_64/multiarch/strnlen-evex512.S |   4 +
>  sysdeps/x86_64/multiarch/wcslen-evex512.S  |   4 +
>  sysdeps/x86_64/multiarch/wcsnlen-evex512.S |   5 +
>  6 files changed, 328 insertions(+)
>  create mode 100644 sysdeps/x86_64/multiarch/strlen-evex512.S
>  create mode 100644 sysdeps/x86_64/multiarch/strnlen-evex512.S
>  create mode 100644 sysdeps/x86_64/multiarch/wcslen-evex512.S
>  create mode 100644 sysdeps/x86_64/multiarch/wcsnlen-evex512.S
>
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index f3ab5e0928..d0869c3ac3 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -81,6 +81,7 @@ sysdep_routines += \
>    strlen-avx2 \
>    strlen-avx2-rtm \
>    strlen-evex \
> +  strlen-evex512 \
>    strlen-sse2 \
>    strncase_l-avx2 \
>    strncase_l-avx2-rtm \
> @@ -105,6 +106,7 @@ sysdep_routines += \
>    strnlen-avx2 \
>    strnlen-avx2-rtm \
>    strnlen-evex \
> +  strnlen-evex512 \
>    strnlen-sse2 \
>    strpbrk-c \
>    strpbrk-sse2 \
> @@ -138,6 +140,7 @@ sysdep_routines += \
>    wcslen-avx2 \
>    wcslen-avx2-rtm \
>    wcslen-evex \
> +  wcslen-evex512 \
>    wcslen-sse2 \
>    wcslen-sse4_1 \
>    wcsncmp-avx2 \
> @@ -148,6 +151,7 @@ sysdep_routines += \
>    wcsnlen-avx2-rtm \
>    wcsnlen-c \
>    wcsnlen-evex \
> +  wcsnlen-evex512 \
>    wcsnlen-sse4_1 \
>    wcsrchr-avx2 \
>    wcsrchr-avx2-rtm \
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index 7218095430..c5cd9466fe 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -328,6 +328,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>                                && CPU_FEATURE_USABLE (AVX512BW)
>                                && CPU_FEATURE_USABLE (BMI2)),
>                               __strlen_evex)
> +             IFUNC_IMPL_ADD (array, i, strlen,
> +                             (CPU_FEATURE_USABLE (AVX512VL)
> +                              && CPU_FEATURE_USABLE (AVX512BW)
> +                              && CPU_FEATURE_USABLE (BMI2)),
> +                             __strlen_evex512)
>               IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2))
>
>    /* Support sysdeps/x86_64/multiarch/strnlen.c.  */
> @@ -346,6 +351,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>                                && CPU_FEATURE_USABLE (AVX512BW)
>                                && CPU_FEATURE_USABLE (BMI2)),
>                               __strnlen_evex)
> +             IFUNC_IMPL_ADD (array, i, strnlen,
> +                             (CPU_FEATURE_USABLE (AVX512VL)
> +                              && CPU_FEATURE_USABLE (AVX512BW)
> +                              && CPU_FEATURE_USABLE (BMI2)),
> +                             __strnlen_evex512)
>               IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_sse2))
>
>    /* Support sysdeps/x86_64/multiarch/stpncpy.c.  */
> @@ -699,6 +709,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>                                && CPU_FEATURE_USABLE (AVX512BW)
>                                && CPU_FEATURE_USABLE (BMI2)),
>                               __wcslen_evex)
> +             IFUNC_IMPL_ADD (array, i, wcslen,
> +                             (CPU_FEATURE_USABLE (AVX512VL)
> +                              && CPU_FEATURE_USABLE (AVX512BW)
> +                              && CPU_FEATURE_USABLE (BMI2)),
> +                             __wcslen_evex512)
>               IFUNC_IMPL_ADD (array, i, wcslen,
>                               CPU_FEATURE_USABLE (SSE4_1),
>                               __wcslen_sse4_1)
> @@ -720,6 +735,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>                                && CPU_FEATURE_USABLE (AVX512BW)
>                                && CPU_FEATURE_USABLE (BMI2)),
>                               __wcsnlen_evex)
> +             IFUNC_IMPL_ADD (array, i, wcsnlen,
> +                             (CPU_FEATURE_USABLE (AVX512VL)
> +                              && CPU_FEATURE_USABLE (AVX512BW)
> +                              && CPU_FEATURE_USABLE (BMI2)),
> +                             __wcsnlen_evex512)
>               IFUNC_IMPL_ADD (array, i, wcsnlen,
>                               CPU_FEATURE_USABLE (SSE4_1),
>                               __wcsnlen_sse4_1)
> diff --git a/sysdeps/x86_64/multiarch/strlen-evex512.S b/sysdeps/x86_64/multiarch/strlen-evex512.S
> new file mode 100644
> index 0000000000..0a2d7bbb1a
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strlen-evex512.S
> @@ -0,0 +1,291 @@
> +/* Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#if IS_IN (libc)
> +
> +# include <sysdep.h>
> +
> +# ifndef STRLEN
> +#  define STRLEN       __strlen_evex512
> +# endif
> +
> +# define VMOVA         vmovdqa64
> +# ifdef USE_AS_WCSLEN
> +#  define VPCMP                vpcmpd
> +#  define VPMINU       vpminud
> +#  define CHAR_SIZE    4
> +# else
> +#  define VPCMP                vpcmpb
> +#  define VPMINU       vpminub
> +#  define CHAR_SIZE    1
> +# endif
> +
> +# define XMM0          xmm16
> +# define ZMM0          zmm16
> +# define ZMM1          zmm17
> +# define ZMM2          zmm18
> +# define ZMM3          zmm19
> +# define ZMM4          zmm20
> +# define VEC_SIZE      64
> +# define PAGE_SIZE     4096
> +# define CHAR_PER_VEC  (VEC_SIZE / CHAR_SIZE)
> +
> +       .section .text.evex512, "ax", @progbits
> +/* Aligning entry point to 64 byte, provides better performance for
> +   one vector length string.  */
> +ENTRY_P2ALIGN (STRLEN, 6)
> +# ifdef USE_AS_STRNLEN
> +        /* Check zero length.  */
> +       test    %RSI_LP, %RSI_LP
> +       jz      L(zero)
> +#  ifdef __ILP32__
> +       /* Clear the upper 32 bits.  */
> +       movl    %esi, %esi
> +#  endif
> +# endif
> +
> +       movl    %edi, %ecx
> +       vpxorq  %XMM0, %XMM0, %XMM0
> +       andl    $(PAGE_SIZE - 1), %ecx
> +       cmpl    $(PAGE_SIZE - VEC_SIZE), %ecx

Use eax instead of ecx here to save more code size.
> +       ja      L(page_cross)
> +
> +       /* Compare [w]char for null, mask bit will be set for match.  */
> +       VPCMP   $0, (%rdi), %ZMM0, %k0
> +       kmovq   %k0, %rax
> +       testq   %rax, %rax
> +       jz      L(align_more)
> +
> +       tzcntq  %rax, %rax

Replace tzcnt with bsf to save code size (bsf has a shorter encoding and
gives the same result for the non-zero masks we have here).
> +# ifdef USE_AS_STRNLEN
> +       cmpq    %rsi, %rax
> +       jae     L(ret_max)
> +# endif
> +       ret
> +
> +# ifdef USE_AS_STRNLEN
> +       /* eax instead of rax used to save encoding space.  */
> +L(zero):
> +       xorl    %eax, %eax
> +       ret
> +# endif
> +
> +       /* At this point vector max length reached.  */
> +# ifdef USE_AS_STRNLEN
> +L(ret_max):
> +       movq    %rsi, %rax
> +       ret
> +# endif
> +
> +L(page_cross):
> +       andl    $(VEC_SIZE - 1), %ecx

Not needed; shifts automatically use only the in-range bits of the count.
> +# ifdef USE_AS_WCSLEN
> +       sarl    $2, %ecx
> +# endif
> +       /* ecx contains number of w[char] to be skipped as a result
> +          of address alignment.  */
> +       movq    %rdi, %rax
> +       andq    $-VEC_SIZE, %rax

You can save further code size by doing
`xorq %rdi, %rcx; VPCMP $0, (PAGE_SIZE - VEC_SIZE)(%rcx)...`
and then using `rdi` for the shift.
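I.e. roughly (untested sketch, byte version only; wcslen would still need
the shift count scaled down by CHAR_SIZE, and this relies on ecx still
holding `edi & (PAGE_SIZE - 1)` from the entry check):

    L(page_cross):
        xorq    %rdi, %rcx      /* rcx = rdi rounded down to the page.  */
        VPCMP   $0, (PAGE_SIZE - VEC_SIZE)(%rcx), %ZMM0, %k0
        movl    %edi, %ecx      /* Shift count from the low bits of rdi.  */
        kmovq   %k0, %rax
        shrq    %cl, %rax       /* 64-bit shift masks cl to 6 bits itself.  */
        jz      L(align_more)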
> +       VPCMP   $0, (%rax), %ZMM0, %k0
> +       kmovq   %k0, %rax
> +       /* Ignore number of character for alignment adjustment.  */
> +       shrq    %cl, %rax
> +       jz      L(align_more)
> +
> +       tzcntq  %rax, %rax
> +# ifdef USE_AS_STRNLEN
> +       cmpq    %rsi, %rax
> +       jae     L(ret_max)
> +# endif
> +       ret
> +
> +L(align_more):
> +       leaq    VEC_SIZE(%rdi), %rax
> +       /* Align rax to VEC_SIZE.  */
> +       andq    $-VEC_SIZE, %rax
> +# ifdef USE_AS_STRNLEN
> +       movq    %rax, %rdx
> +       subq    %rdi, %rdx
> +#  ifdef USE_AS_WCSLEN
> +       shrq    $2, %rdx
> +#  endif
> +       /* At this point rdx contains [w]chars already compared.  */
> +       cmpq    %rsi, %rdx
You `subq` in the next instruction, so just do the comparison with `subq`.
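I.e. (sketch):

    subq    %rsi, %rdx      /* Sets the same flags as the cmpq.  */
    jae     L(ret_max)
    negq    %rdx

which drops the separate cmpq.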
> +       jae     L(ret_max)
> +       subq    %rsi, %rdx
> +       negq    %rdx
> +       /* At this point rdx contains number of w[char] needs to go.
> +          Now onwards rdx will keep decrementing with each compare.  */
> +# endif
> +
> +       /* Loop unroll 4 times for 4 vector loop.  */
> +       VPCMP   $0, (%rax), %ZMM0, %k0
> +       kmovq   %k0, %rcx
> +       testq   %rcx, %rcx
> +       jnz     L(ret_vec_x1)
> +
> +# ifdef USE_AS_STRNLEN
> +       subq    $CHAR_PER_VEC, %rdx
> +       jbe     L(ret_max)
> +# endif
> +
> +       VPCMP   $0, VEC_SIZE(%rax), %ZMM0, %k0
> +       kmovq   %k0, %rcx
> +       testq   %rcx, %rcx
> +       jnz     L(ret_vec_x2)
> +
> +# ifdef USE_AS_STRNLEN
> +       subq    $CHAR_PER_VEC, %rdx
> +       jbe     L(ret_max)
> +# endif
> +
> +       VPCMP   $0, (2 * VEC_SIZE)(%rax), %ZMM0, %k0
> +       kmovq   %k0, %rcx
> +       testq   %rcx, %rcx
> +       jnz     L(ret_vec_x3)
> +
> +# ifdef USE_AS_STRNLEN
> +       subq    $CHAR_PER_VEC, %rdx
> +       jbe     L(ret_max)
> +# endif
> +
> +       VPCMP   $0, (3 * VEC_SIZE)(%rax), %ZMM0, %k0
> +       kmovq   %k0, %rcx
> +       testq   %rcx, %rcx
> +       jnz     L(ret_vec_x4)
> +
> +       addq    $(4 * VEC_SIZE), %rax
> +
> +# ifdef USE_AS_STRNLEN
> +       /* Instead of decreasing, rdx increased to prepare for loop
> +          first iteration.  Incremented 3 times because one increment
> +          cancelled by previous decrement.  */
> +       addq    $(3 * CHAR_PER_VEC), %rdx
> +# endif
> +
> +       /* Test if address is already 4 * VEC_SIZE byte aligned goto
> +          loop.  */
> +       testq   $(3 * VEC_SIZE), %rax

Can only imagine this is possibly worth it for STRNLEN.
> +       jz      L(loop)
> +
> +       movq    %rax, %rcx
> +
> +       /* Align address to 4 * VEC_SIZE for loop.  */
> +       andq    $-(4 * VEC_SIZE), %rax

A smaller-code-size way to align is

`orq $(VEC_SIZE * 4 - 1), %rax; incq %rax`

> +
> +# ifdef USE_AS_STRNLEN
> +       subq    %rax, %rcx
> +#  ifdef USE_AS_WCSLEN
> +       sarq    $2, %rcx
> +#  endif
> +       /* rcx contains number of [w]char will be recompared due to
> +          alignment fixes.  rdx must be incremented by rcx to offset
> +          alignment adjustment.  */
> +       addq    %rcx, %rdx
> +# endif
> +
> +L(loop):
> +# ifdef USE_AS_STRNLEN
> +       subq    $(CHAR_PER_VEC * 4), %rdx
> +       jbe     L(ret_max)
> +# endif
> +       /* VPMINU and VPCMP combination provide better performance as
> +          compared to alternative combinations.  */
> +       VMOVA   (%rax), %ZMM1
> +       VPMINU  (VEC_SIZE)(%rax), %ZMM1, %ZMM2
> +       VMOVA   (2 * VEC_SIZE)(%rax), %ZMM3
> +       VPMINU  (3 * VEC_SIZE)(%rax), %ZMM3, %ZMM4
> +
> +       VPCMP   $0, %ZMM2, %ZMM0, %k0
> +       VPCMP   $0, %ZMM4, %ZMM0, %k1
> +
> +       addq    $(4 * VEC_SIZE), %rax
> +       kortestq %k0, %k1
> +       jz      L(loop)
> +
> +       /* Need 4 vector subtraction because address incremented in
> +          the loop before terminating condition check.  Also want to
> +          reuse code for exit condition before and after the loop.  */
> +       subq    $(4 * VEC_SIZE), %rax
Is it possible to just add the 4x offset in the loop? There is no penalty for
imm32 encoding with EVEX encoding.
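Something like (sketch):

    VMOVA   (VEC_SIZE * 4)(%rax), %ZMM1
    VPMINU  (VEC_SIZE * 5)(%rax), %ZMM1, %ZMM2
    VMOVA   (VEC_SIZE * 6)(%rax), %ZMM3
    VPMINU  (VEC_SIZE * 7)(%rax), %ZMM3, %ZMM4
    ...
    addq    $(4 * VEC_SIZE), %rax
    kortestq %k0, %k1
    jz      L(loop)

so the post-loop `subq $(4 * VEC_SIZE), %rax` goes away, and the
pre-loop `addq $(4 * VEC_SIZE), %rax` can be dropped as well.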
> +
> +       VPCMP   $0, %ZMM1, %ZMM0, %k2
> +       kmovq   %k2, %rcx
> +       testq   %rcx, %rcx
> +       jnz     L(ret_vec_x1)
> +
> +       kmovq   %k0, %rcx
> +       /* At this point, if k0 is non zero, null char must be in the
> +          second vector.  */
> +       testq   %rcx, %rcx
> +       jnz     L(ret_vec_x2)
> +
> +       VPCMP   $0, %ZMM3, %ZMM0, %k3
> +       kmovq   %k3, %rcx
> +       testq   %rcx, %rcx
> +       jnz     L(ret_vec_x3)
> +       /* At this point null [w]char must be in the fourth vector so no
> +          need to check.  */
> +       kmovq   %k1, %rcx
> +
> +       /* Termination fourth, third, second vector are pretty much
> +          same, implemented this way to avoid branching and reuse code
> +          from pre loop exit condition.  */
> +L(ret_vec_x4):
> +       addq    $(3 * VEC_SIZE), %rax
> +       tzcntq  %rcx, %rcx
> +       subq    %rdi, %rax
> +# ifdef USE_AS_WCSLEN
> +       sarq    $2, %rax
> +# endif
> +       addq    %rcx, %rax
> +# ifdef USE_AS_STRNLEN
> +       cmpq    %rsi, %rax
> +       jae     L(ret_max)
> +# endif
> +       ret
> +
> +L(ret_vec_x3):
> +       addq    $(2 * VEC_SIZE), %rax
> +       tzcntq  %rcx, %rcx
> +       subq    %rdi, %rax
> +# ifdef USE_AS_WCSLEN
> +       sarq    $2, %rax
> +# endif
> +       addq    %rcx, %rax
> +# ifdef USE_AS_STRNLEN
> +       cmpq    %rsi, %rax
> +       jae     L(ret_max)
> +# endif
> +       ret
> +
> +L(ret_vec_x2):
> +       addq    $VEC_SIZE, %rax
> +L(ret_vec_x1):
> +       tzcntq  %rcx, %rcx
> +       subq    %rdi, %rax
> +# ifdef USE_AS_WCSLEN
> +       sarq    $2, %rax
> +# endif
> +       addq    %rcx, %rax
> +# ifdef USE_AS_STRNLEN
> +       cmpq    %rsi, %rax
> +       jae     L(ret_max)
> +# endif
> +       ret
> +
> +END (STRLEN)
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/strnlen-evex512.S b/sysdeps/x86_64/multiarch/strnlen-evex512.S
> new file mode 100644
> index 0000000000..0b7f220214
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strnlen-evex512.S
> @@ -0,0 +1,4 @@
> +#define STRLEN __strnlen_evex512
> +#define USE_AS_STRNLEN 1
> +
> +#include "strlen-evex512.S"
> diff --git a/sysdeps/x86_64/multiarch/wcslen-evex512.S b/sysdeps/x86_64/multiarch/wcslen-evex512.S
> new file mode 100644
> index 0000000000..f59c372b78
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/wcslen-evex512.S
> @@ -0,0 +1,4 @@
> +#define STRLEN __wcslen_evex512
> +#define USE_AS_WCSLEN 1
> +
> +#include "strlen-evex512.S"
> diff --git a/sysdeps/x86_64/multiarch/wcsnlen-evex512.S b/sysdeps/x86_64/multiarch/wcsnlen-evex512.S
> new file mode 100644
> index 0000000000..73dcf2f210
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/wcsnlen-evex512.S
> @@ -0,0 +1,5 @@
> +#define STRLEN __wcsnlen_evex512
> +#define USE_AS_WCSLEN 1
> +#define USE_AS_STRNLEN 1
> +
> +#include "strlen-evex512.S"
> --
> 2.35.3
>

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH v3] x86_64: Implement evex512 version of strlen, strnlen, wcslen and wcsnlen
  2022-05-19 15:03       ` Noah Goldstein
@ 2022-05-25 13:43         ` Sunil K Pandey
  2022-05-25 17:10           ` Noah Goldstein
  0 siblings, 1 reply; 12+ messages in thread
From: Sunil K Pandey @ 2022-05-25 13:43 UTC (permalink / raw)
  To: libc-alpha

This patch implements following evex512 version of string functions.
Perf gain for evex512 version is up to 50% as compared to evex,
depending on length and alignment.

These functions are currently just for benchmarking/reference.

- String length function using 512 bit vectors.
- String N length using 512 bit vectors.
- Wide string length using 512 bit vectors.
- Wide string N length using 512 bit vectors.
---
 sysdeps/x86_64/multiarch/Makefile           |   4 +
 sysdeps/x86_64/multiarch/ifunc-impl-list.c  |  20 ++
 sysdeps/x86_64/multiarch/strlen-evex-base.S | 299 ++++++++++++++++++++
 sysdeps/x86_64/multiarch/strlen-evex512.S   |   7 +
 sysdeps/x86_64/multiarch/strnlen-evex512.S  |   4 +
 sysdeps/x86_64/multiarch/wcslen-evex512.S   |   4 +
 sysdeps/x86_64/multiarch/wcsnlen-evex512.S  |   5 +
 7 files changed, 343 insertions(+)
 create mode 100644 sysdeps/x86_64/multiarch/strlen-evex-base.S
 create mode 100644 sysdeps/x86_64/multiarch/strlen-evex512.S
 create mode 100644 sysdeps/x86_64/multiarch/strnlen-evex512.S
 create mode 100644 sysdeps/x86_64/multiarch/wcslen-evex512.S
 create mode 100644 sysdeps/x86_64/multiarch/wcsnlen-evex512.S

diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index f3ab5e0928..d0869c3ac3 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -81,6 +81,7 @@ sysdep_routines += \
   strlen-avx2 \
   strlen-avx2-rtm \
   strlen-evex \
+  strlen-evex512 \
   strlen-sse2 \
   strncase_l-avx2 \
   strncase_l-avx2-rtm \
@@ -105,6 +106,7 @@ sysdep_routines += \
   strnlen-avx2 \
   strnlen-avx2-rtm \
   strnlen-evex \
+  strnlen-evex512 \
   strnlen-sse2 \
   strpbrk-c \
   strpbrk-sse2 \
@@ -138,6 +140,7 @@ sysdep_routines += \
   wcslen-avx2 \
   wcslen-avx2-rtm \
   wcslen-evex \
+  wcslen-evex512 \
   wcslen-sse2 \
   wcslen-sse4_1 \
   wcsncmp-avx2 \
@@ -148,6 +151,7 @@ sysdep_routines += \
   wcsnlen-avx2-rtm \
   wcsnlen-c \
   wcsnlen-evex \
+  wcsnlen-evex512 \
   wcsnlen-sse4_1 \
   wcsrchr-avx2 \
   wcsrchr-avx2-rtm \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 7218095430..c5cd9466fe 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -328,6 +328,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			       && CPU_FEATURE_USABLE (AVX512BW)
 			       && CPU_FEATURE_USABLE (BMI2)),
 			      __strlen_evex)
+	      IFUNC_IMPL_ADD (array, i, strlen,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+			      __strlen_evex512)
 	      IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2))
 
   /* Support sysdeps/x86_64/multiarch/strnlen.c.  */
@@ -346,6 +351,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			       && CPU_FEATURE_USABLE (AVX512BW)
 			       && CPU_FEATURE_USABLE (BMI2)),
 			      __strnlen_evex)
+	      IFUNC_IMPL_ADD (array, i, strnlen,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+			      __strnlen_evex512)
 	      IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_sse2))
 
   /* Support sysdeps/x86_64/multiarch/stpncpy.c.  */
@@ -699,6 +709,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			       && CPU_FEATURE_USABLE (AVX512BW)
 			       && CPU_FEATURE_USABLE (BMI2)),
 			      __wcslen_evex)
+	      IFUNC_IMPL_ADD (array, i, wcslen,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+			      __wcslen_evex512)
 	      IFUNC_IMPL_ADD (array, i, wcslen,
 			      CPU_FEATURE_USABLE (SSE4_1),
 			      __wcslen_sse4_1)
@@ -720,6 +735,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			       && CPU_FEATURE_USABLE (AVX512BW)
 			       && CPU_FEATURE_USABLE (BMI2)),
 			      __wcsnlen_evex)
+	      IFUNC_IMPL_ADD (array, i, wcsnlen,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+			      __wcsnlen_evex512)
 	      IFUNC_IMPL_ADD (array, i, wcsnlen,
 			      CPU_FEATURE_USABLE (SSE4_1),
 			      __wcsnlen_sse4_1)
diff --git a/sysdeps/x86_64/multiarch/strlen-evex-base.S b/sysdeps/x86_64/multiarch/strlen-evex-base.S
new file mode 100644
index 0000000000..bd09967f76
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strlen-evex-base.S
@@ -0,0 +1,299 @@
+/* Placeholder function, not used by any processor at the moment.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# ifdef USE_AS_WCSLEN
+#  define VPCMP		vpcmpd
+#  define VPTESTN	vptestnmd
+#  define VPMINU	vpminud
+#  define CHAR_SIZE	4
+# else
+#  define VPCMP		vpcmpb
+#  define VPTESTN	vptestnmb
+#  define VPMINU	vpminub
+#  define CHAR_SIZE	1
+# endif
+
+# define XMM0		xmm16
+# define PAGE_SIZE	4096
+# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
+
+# if VEC_SIZE == 64
+#  define KMOV		kmovq
+#  define KORTEST	kortestq
+#  define RAX		rax
+#  define RCX		rcx
+#  define RDX		rdx
+#  define SHR		shrq
+#  define TEXTSUFFIX	evex512
+#  define VMM0		zmm16
+#  define VMM1		zmm17
+#  define VMM2		zmm18
+#  define VMM3		zmm19
+#  define VMM4		zmm20
+#  define VMOVA		vmovdqa64
+# elif VEC_SIZE == 32
+/* Currently Unused.  */
+#  define KMOV		kmovd
+#  define KORTEST	kortestd
+#  define RAX		eax
+#  define RCX		ecx
+#  define RDX		edx
+#  define SHR		shrl
+#  define TEXTSUFFIX	evex256
+#  define VMM0		ymm16
+#  define VMM1		ymm17
+#  define VMM2		ymm18
+#  define VMM3		ymm19
+#  define VMM4		ymm20
+#  define VMOVA		vmovdqa32
+# endif
+
+	.section .text.TEXTSUFFIX, "ax", @progbits
+/* Aligning the entry point to 64 bytes provides better performance for
+   one-vector-length strings.  */
+ENTRY_P2ALIGN (STRLEN, 6)
+# ifdef USE_AS_STRNLEN
+	/* Check zero length.  */
+	test	%RSI_LP, %RSI_LP
+	jz	L(ret_max)
+#  ifdef __ILP32__
+	/* Clear the upper 32 bits.  */
+	movl	%esi, %esi
+#  endif
+# endif
+
+	movl	%edi, %eax
+	vpxorq	%XMM0, %XMM0, %XMM0
+	andl	$(PAGE_SIZE - 1), %eax
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	ja	L(page_cross)
+
+	/* Compare [w]char for null, mask bit will be set for match.  */
+	VPCMP	$0, (%rdi), %VMM0, %k0
+	KMOV	%k0, %RAX
+	test	%RAX, %RAX
+	jz	L(align_more)
+
+	bsf	%RAX, %RAX
+# ifdef USE_AS_STRNLEN
+	cmpq	%rsi, %rax
+	cmovnb	%rsi, %rax
+# endif
+	ret
+
+	/* At this point the maximum length has been reached.  */
+# ifdef USE_AS_STRNLEN
+	.p2align 4,,3
+L(ret_max):
+	movq	%rsi, %rax
+	ret
+# endif
+
+L(align_more):
+	leaq	VEC_SIZE(%rdi), %rax
+	/* Align rax to VEC_SIZE.  */
+	andq	$-VEC_SIZE, %rax
+# ifdef USE_AS_STRNLEN
+	movq	%rax, %rdx
+	subq	%rdi, %rdx
+#  ifdef USE_AS_WCSLEN
+	SHR	$2, %RDX
+#  endif
+	/* At this point rdx contains [w]chars already compared.  */
+	subq	%rsi, %rdx
+	jae	L(ret_max)
+	negq	%rdx
+	/* At this point rdx contains the number of [w]chars left to check.
+	   From now on rdx is decremented with each compare.  */
+# endif
+
+	/* Check four vectors, unrolled, before the 4-vector loop.  */
+	VPCMP	$0, (%rax), %VMM0, %k0
+	KMOV	%k0, %RCX
+	test	%RCX, %RCX
+	jnz	L(ret_vec_x1)
+
+# ifdef USE_AS_STRNLEN
+	subq	$CHAR_PER_VEC, %rdx
+	jbe	L(ret_max)
+# endif
+
+	VPCMP	$0, VEC_SIZE(%rax), %VMM0, %k0
+	KMOV	%k0, %RCX
+	test	%RCX, %RCX
+	jnz	L(ret_vec_x2)
+
+# ifdef USE_AS_STRNLEN
+	subq	$CHAR_PER_VEC, %rdx
+	jbe	L(ret_max)
+# endif
+
+	VPCMP	$0, (VEC_SIZE * 2)(%rax), %VMM0, %k0
+	KMOV	%k0, %RCX
+	test	%RCX, %RCX
+	jnz	L(ret_vec_x3)
+
+# ifdef USE_AS_STRNLEN
+	subq	$CHAR_PER_VEC, %rdx
+	jbe	L(ret_max)
+# endif
+
+	VPCMP	$0, (VEC_SIZE * 3)(%rax), %VMM0, %k0
+	KMOV	%k0, %RCX
+	test	%RCX, %RCX
+	jnz	L(ret_vec_x4)
+
+# ifdef USE_AS_STRNLEN
+	/* Instead of decreasing, rdx is increased here to prepare for the
+	   first loop iteration.  It is incremented by 3 vectors because one
+	   increment is cancelled by the previous decrement.  */
+	subq    $-(CHAR_PER_VEC * 3), %rdx
+	movq	%rax, %rcx
+# endif
+
+	/* Align address to VEC_SIZE * 4 for loop.  */
+	andq	$-(VEC_SIZE * 4), %rax
+
+# ifdef USE_AS_STRNLEN
+	subq	%rax, %rcx
+#  ifdef USE_AS_WCSLEN
+	SHR	$2, %RCX
+#  endif
+	/* rcx contains the number of [w]chars that will be recompared due
+	   to the alignment fix.  rdx must be incremented by rcx to offset
+	   the alignment adjustment.  */
+	addq	%rcx, %rdx
+# endif
+
+	.p2align 4,,11
+L(loop):
+# ifdef USE_AS_STRNLEN
+	subq	$(CHAR_PER_VEC * 4), %rdx
+	jbe	L(ret_max)
+# endif
+	/* The VPMINU and VPCMP combination provides better performance
+	   than alternative combinations.  */
+	VMOVA	(VEC_SIZE * 4)(%rax), %VMM1
+	VPMINU	(VEC_SIZE * 5)(%rax), %VMM1, %VMM2
+	VMOVA	(VEC_SIZE * 6)(%rax), %VMM3
+	VPMINU	(VEC_SIZE * 7)(%rax), %VMM3, %VMM4
+
+	VPTESTN	%VMM2, %VMM2, %k0
+	VPTESTN	%VMM4, %VMM4, %k1
+
+	subq	$-(VEC_SIZE * 4), %rax
+	KORTEST	%k0, %k1
+	jz	L(loop)
+
+	VPTESTN	%VMM1, %VMM1, %k2
+	KMOV	%k2, %RCX
+	test	%RCX, %RCX
+	jnz	L(ret_vec_x1)
+
+	KMOV	%k0, %RCX
+	/* At this point, if k0 is non zero, null char must be in the
+	   second vector.  */
+	test	%RCX, %RCX
+	jnz	L(ret_vec_x2)
+
+	VPTESTN	%VMM3, %VMM3, %k3
+	KMOV	%k3, %RCX
+	test	%RCX, %RCX
+	jnz	L(ret_vec_x3)
+	/* At this point null [w]char must be in the fourth vector so no
+	   need to check.  */
+	KMOV	%k1, %RCX
+
+	/* The termination paths for the fourth, third and second vectors
+	   are essentially the same; they are laid out this way to avoid
+	   branching and to reuse the pre-loop exit code.  */
+L(ret_vec_x4):
+	bsf	%RCX, %RCX
+	subq	%rdi, %rax
+# ifdef USE_AS_WCSLEN
+	subq	$-(VEC_SIZE * 3), %rax
+	shrq	$2, %rax
+	addq	%rcx, %rax
+# else
+	leaq	(VEC_SIZE * 3)(%rcx, %rax), %rax
+# endif
+# ifdef USE_AS_STRNLEN
+	cmpq	%rsi, %rax
+	cmovnb	%rsi, %rax
+# endif
+	ret
+
+L(ret_vec_x3):
+	bsf	%RCX, %RCX
+	subq	%rdi, %rax
+# ifdef USE_AS_WCSLEN
+	subq	$-(VEC_SIZE * 2), %rax
+	shrq	$2, %rax
+	addq	%rcx, %rax
+# else
+	leaq	(VEC_SIZE * 2)(%rcx, %rax), %rax
+# endif
+# ifdef USE_AS_STRNLEN
+	cmpq	%rsi, %rax
+	cmovnb	%rsi, %rax
+# endif
+	ret
+
+L(ret_vec_x2):
+	subq	$-VEC_SIZE, %rax
+L(ret_vec_x1):
+	bsf	%RCX, %RCX
+	subq	%rdi, %rax
+# ifdef USE_AS_WCSLEN
+	shrq	$2, %rax
+# endif
+	addq	%rcx, %rax
+# ifdef USE_AS_STRNLEN
+	cmpq	%rsi, %rax
+	cmovnb	%rsi, %rax
+# endif
+	ret
+
+L(page_cross):
+	movl	%eax, %ecx
+# ifdef USE_AS_WCSLEN
+	andl	$(VEC_SIZE - 1), %ecx
+	sarl	$2, %ecx
+# endif
+	/* ecx contains the number of [w]chars to be skipped as a result
+	   of address alignment.  */
+	xorq	%rdi, %rax
+	VPCMP	$0, (PAGE_SIZE - VEC_SIZE)(%rax), %VMM0, %k0
+	KMOV	%k0, %RAX
+	/* Ignore match bits for characters skipped due to alignment.  */
+	SHR	%cl, %RAX
+	jz	L(align_more)
+
+	bsf	%RAX, %RAX
+# ifdef USE_AS_STRNLEN
+	cmpq	%rsi, %rax
+	cmovnb	%rsi, %rax
+# endif
+	ret
+
+END (STRLEN)
+#endif
diff --git a/sysdeps/x86_64/multiarch/strlen-evex512.S b/sysdeps/x86_64/multiarch/strlen-evex512.S
new file mode 100644
index 0000000000..116f8981c8
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strlen-evex512.S
@@ -0,0 +1,7 @@
+#ifndef STRLEN
+# define STRLEN		__strlen_evex512
+#endif
+
+#define VEC_SIZE	64
+
+#include "strlen-evex-base.S"
diff --git a/sysdeps/x86_64/multiarch/strnlen-evex512.S b/sysdeps/x86_64/multiarch/strnlen-evex512.S
new file mode 100644
index 0000000000..0b7f220214
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strnlen-evex512.S
@@ -0,0 +1,4 @@
+#define STRLEN __strnlen_evex512
+#define USE_AS_STRNLEN 1
+
+#include "strlen-evex512.S"
diff --git a/sysdeps/x86_64/multiarch/wcslen-evex512.S b/sysdeps/x86_64/multiarch/wcslen-evex512.S
new file mode 100644
index 0000000000..f59c372b78
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcslen-evex512.S
@@ -0,0 +1,4 @@
+#define STRLEN __wcslen_evex512
+#define USE_AS_WCSLEN 1
+
+#include "strlen-evex512.S"
diff --git a/sysdeps/x86_64/multiarch/wcsnlen-evex512.S b/sysdeps/x86_64/multiarch/wcsnlen-evex512.S
new file mode 100644
index 0000000000..73dcf2f210
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsnlen-evex512.S
@@ -0,0 +1,5 @@
+#define STRLEN __wcsnlen_evex512
+#define USE_AS_WCSLEN 1
+#define USE_AS_STRNLEN 1
+
+#include "strlen-evex512.S"
-- 
2.35.3


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v3] x86_64: Implement evex512 version of strlen, strnlen,  wcslen and wcsnlen
  2022-05-25 13:43         ` [PATCH v3] " Sunil K Pandey
@ 2022-05-25 17:10           ` Noah Goldstein
  2022-05-25 18:20             ` Sunil Pandey
  0 siblings, 1 reply; 12+ messages in thread
From: Noah Goldstein @ 2022-05-25 17:10 UTC (permalink / raw)
  To: Sunil K Pandey; +Cc: GNU C Library

On Wed, May 25, 2022 at 8:44 AM Sunil K Pandey via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> This patch implements following evex512 version of string functions.
> Perf gain for evex512 version is up to 50% as compared to evex,
> depending on length and alignment.
>
> These functions are currently just for benchmarking/reference.
>
> - String length function using 512 bit vectors.
> - String N length using 512 bit vectors.
> - Wide string length using 512 bit vectors.
> - Wide string N length using 512 bit vectors.
> ---
>  sysdeps/x86_64/multiarch/Makefile           |   4 +
>  sysdeps/x86_64/multiarch/ifunc-impl-list.c  |  20 ++
>  sysdeps/x86_64/multiarch/strlen-evex-base.S | 299 ++++++++++++++++++++
>  sysdeps/x86_64/multiarch/strlen-evex512.S   |   7 +
>  sysdeps/x86_64/multiarch/strnlen-evex512.S  |   4 +
>  sysdeps/x86_64/multiarch/wcslen-evex512.S   |   4 +
>  sysdeps/x86_64/multiarch/wcsnlen-evex512.S  |   5 +
>  7 files changed, 343 insertions(+)
>  create mode 100644 sysdeps/x86_64/multiarch/strlen-evex-base.S
>  create mode 100644 sysdeps/x86_64/multiarch/strlen-evex512.S
>  create mode 100644 sysdeps/x86_64/multiarch/strnlen-evex512.S
>  create mode 100644 sysdeps/x86_64/multiarch/wcslen-evex512.S
>  create mode 100644 sysdeps/x86_64/multiarch/wcsnlen-evex512.S
>
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index f3ab5e0928..d0869c3ac3 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -81,6 +81,7 @@ sysdep_routines += \
>    strlen-avx2 \
>    strlen-avx2-rtm \
>    strlen-evex \
> +  strlen-evex512 \
>    strlen-sse2 \
>    strncase_l-avx2 \
>    strncase_l-avx2-rtm \
> @@ -105,6 +106,7 @@ sysdep_routines += \
>    strnlen-avx2 \
>    strnlen-avx2-rtm \
>    strnlen-evex \
> +  strnlen-evex512 \
>    strnlen-sse2 \
>    strpbrk-c \
>    strpbrk-sse2 \
> @@ -138,6 +140,7 @@ sysdep_routines += \
>    wcslen-avx2 \
>    wcslen-avx2-rtm \
>    wcslen-evex \
> +  wcslen-evex512 \
>    wcslen-sse2 \
>    wcslen-sse4_1 \
>    wcsncmp-avx2 \
> @@ -148,6 +151,7 @@ sysdep_routines += \
>    wcsnlen-avx2-rtm \
>    wcsnlen-c \
>    wcsnlen-evex \
> +  wcsnlen-evex512 \
>    wcsnlen-sse4_1 \
>    wcsrchr-avx2 \
>    wcsrchr-avx2-rtm \
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index 7218095430..c5cd9466fe 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -328,6 +328,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>                                && CPU_FEATURE_USABLE (AVX512BW)
>                                && CPU_FEATURE_USABLE (BMI2)),
>                               __strlen_evex)
> +             IFUNC_IMPL_ADD (array, i, strlen,
> +                             (CPU_FEATURE_USABLE (AVX512VL)
> +                              && CPU_FEATURE_USABLE (AVX512BW)
> +                              && CPU_FEATURE_USABLE (BMI2)),
> +                             __strlen_evex512)
>               IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2))
>
>    /* Support sysdeps/x86_64/multiarch/strnlen.c.  */
> @@ -346,6 +351,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>                                && CPU_FEATURE_USABLE (AVX512BW)
>                                && CPU_FEATURE_USABLE (BMI2)),
>                               __strnlen_evex)
> +             IFUNC_IMPL_ADD (array, i, strnlen,
> +                             (CPU_FEATURE_USABLE (AVX512VL)
> +                              && CPU_FEATURE_USABLE (AVX512BW)
> +                              && CPU_FEATURE_USABLE (BMI2)),
> +                             __strnlen_evex512)
>               IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_sse2))
>
>    /* Support sysdeps/x86_64/multiarch/stpncpy.c.  */
> @@ -699,6 +709,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>                                && CPU_FEATURE_USABLE (AVX512BW)
>                                && CPU_FEATURE_USABLE (BMI2)),
>                               __wcslen_evex)
> +             IFUNC_IMPL_ADD (array, i, wcslen,
> +                             (CPU_FEATURE_USABLE (AVX512VL)
> +                              && CPU_FEATURE_USABLE (AVX512BW)
> +                              && CPU_FEATURE_USABLE (BMI2)),
> +                             __wcslen_evex512)
>               IFUNC_IMPL_ADD (array, i, wcslen,
>                               CPU_FEATURE_USABLE (SSE4_1),
>                               __wcslen_sse4_1)
> @@ -720,6 +735,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>                                && CPU_FEATURE_USABLE (AVX512BW)
>                                && CPU_FEATURE_USABLE (BMI2)),
>                               __wcsnlen_evex)
> +             IFUNC_IMPL_ADD (array, i, wcsnlen,
> +                             (CPU_FEATURE_USABLE (AVX512VL)
> +                              && CPU_FEATURE_USABLE (AVX512BW)
> +                              && CPU_FEATURE_USABLE (BMI2)),
> +                             __wcsnlen_evex512)
>               IFUNC_IMPL_ADD (array, i, wcsnlen,
>                               CPU_FEATURE_USABLE (SSE4_1),
>                               __wcsnlen_sse4_1)
> diff --git a/sysdeps/x86_64/multiarch/strlen-evex-base.S b/sysdeps/x86_64/multiarch/strlen-evex-base.S
> new file mode 100644
> index 0000000000..bd09967f76
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strlen-evex-base.S
> @@ -0,0 +1,299 @@
> +/* Placeholder function, not used by any processor at the moment.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#if IS_IN (libc)
> +
> +# include <sysdep.h>
> +
> +# ifdef USE_AS_WCSLEN
> +#  define VPCMP                vpcmpd
> +#  define VPTESTN      vptestnmd
> +#  define VPMINU       vpminud
> +#  define CHAR_SIZE    4
> +# else
> +#  define VPCMP                vpcmpb
> +#  define VPTESTN      vptestnmb
> +#  define VPMINU       vpminub
> +#  define CHAR_SIZE    1
> +# endif
> +
> +# define XMM0          xmm16
> +# define PAGE_SIZE     4096
> +# define CHAR_PER_VEC  (VEC_SIZE / CHAR_SIZE)
> +
> +# if VEC_SIZE == 64
> +#  define KMOV         kmovq
> +#  define KORTEST      kortestq
> +#  define RAX          rax
> +#  define RCX          rcx
> +#  define RDX          rdx
> +#  define SHR          shrq
> +#  define TEXTSUFFIX   evex512
> +#  define VMM0         zmm16
> +#  define VMM1         zmm17
> +#  define VMM2         zmm18
> +#  define VMM3         zmm19
> +#  define VMM4         zmm20
> +#  define VMOVA                vmovdqa64
> +# elif VEC_SIZE == 32
> +/* Currently Unused.  */
> +#  define KMOV         kmovd
> +#  define KORTEST      kortestd
> +#  define RAX          eax
> +#  define RCX          ecx
> +#  define RDX          edx
> +#  define SHR          shrl
> +#  define TEXTSUFFIX   evex256
> +#  define VMM0         ymm16
> +#  define VMM1         ymm17
> +#  define VMM2         ymm18
> +#  define VMM3         ymm19
> +#  define VMM4         ymm20
> +#  define VMOVA                vmovdqa32
> +# endif
> +
> +       .section .text.TEXTSUFFIX, "ax", @progbits
> +/* Aligning entry point to 64 byte, provides better performance for
> +   one vector length string.  */
> +ENTRY_P2ALIGN (STRLEN, 6)
> +# ifdef USE_AS_STRNLEN
> +       /* Check zero length.  */
> +       test    %RSI_LP, %RSI_LP
> +       jz      L(ret_max)
> +#  ifdef __ILP32__
> +       /* Clear the upper 32 bits.  */
> +       movl    %esi, %esi
> +#  endif
> +# endif
> +
> +       movl    %edi, %eax
> +       vpxorq  %XMM0, %XMM0, %XMM0
> +       andl    $(PAGE_SIZE - 1), %eax
> +       cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
> +       ja      L(page_cross)
> +
> +       /* Compare [w]char for null, mask bit will be set for match.  */
> +       VPCMP   $0, (%rdi), %VMM0, %k0
> +       KMOV    %k0, %RAX
> +       test    %RAX, %RAX
> +       jz      L(align_more)
> +
> +       bsf     %RAX, %RAX
> +# ifdef USE_AS_STRNLEN
> +       cmpq    %rsi, %rax
> +       cmovnb  %rsi, %rax
> +# endif
> +       ret
> +
> +       /* At this point vector max length reached.  */
> +# ifdef USE_AS_STRNLEN
> +       .p2align 4,,3
> +L(ret_max):
> +       movq    %rsi, %rax
> +       ret
> +# endif
> +
> +L(align_more):
> +       leaq    VEC_SIZE(%rdi), %rax
> +       /* Align rax to VEC_SIZE.  */
> +       andq    $-VEC_SIZE, %rax
> +# ifdef USE_AS_STRNLEN
> +       movq    %rax, %rdx
> +       subq    %rdi, %rdx
> +#  ifdef USE_AS_WCSLEN
> +       SHR     $2, %RDX
> +#  endif
> +       /* At this point rdx contains [w]chars already compared.  */
> +       subq    %rsi, %rdx
> +       jae     L(ret_max)
> +       negq    %rdx
> +       /* At this point rdx contains number of w[char] needs to go.
> +          Now onwards rdx will keep decrementing with each compare.  */
> +# endif
> +
> +       /* Loop unroll 4 times for 4 vector loop.  */
> +       VPCMP   $0, (%rax), %VMM0, %k0
> +       KMOV    %k0, %RCX
> +       test    %RCX, %RCX
> +       jnz     L(ret_vec_x1)
> +
> +# ifdef USE_AS_STRNLEN
> +       subq    $CHAR_PER_VEC, %rdx
> +       jbe     L(ret_max)
> +# endif
> +
> +       VPCMP   $0, VEC_SIZE(%rax), %VMM0, %k0
> +       KMOV    %k0, %RCX
> +       test    %RCX, %RCX
> +       jnz     L(ret_vec_x2)
> +
> +# ifdef USE_AS_STRNLEN
> +       subq    $CHAR_PER_VEC, %rdx
> +       jbe     L(ret_max)
> +# endif
> +
> +       VPCMP   $0, (VEC_SIZE * 2)(%rax), %VMM0, %k0
> +       KMOV    %k0, %RCX
> +       test    %RCX, %RCX
> +       jnz     L(ret_vec_x3)
> +
> +# ifdef USE_AS_STRNLEN
> +       subq    $CHAR_PER_VEC, %rdx
> +       jbe     L(ret_max)
> +# endif
> +
> +       VPCMP   $0, (VEC_SIZE * 3)(%rax), %VMM0, %k0
> +       KMOV    %k0, %RCX
> +       test    %RCX, %RCX
> +       jnz     L(ret_vec_x4)
> +
> +# ifdef USE_AS_STRNLEN
> +       /* Instead of decreasing, rdx increased to prepare for loop
> +          first iteration.  Incremented 3 times because one increment
> +          cancelled by previous decrement.  */
> +       subq    $-(CHAR_PER_VEC * 3), %rdx
> +       movq    %rax, %rcx
> +# endif
> +
> +       /* Align address to VEC_SIZE * 4 for loop.  */
> +       andq    $-(VEC_SIZE * 4), %rax
> +
> +# ifdef USE_AS_STRNLEN
> +       subq    %rax, %rcx
> +#  ifdef USE_AS_WCSLEN
> +       SHR     $2, %RCX
> +#  endif
> +       /* rcx contains number of [w]char will be recompared due to
> +          alignment fixes.  rdx must be incremented by rcx to offset
> +          alignment adjustment.  */
> +       addq    %rcx, %rdx

This is buggy for strnlen / wcsnlen. You are adding back more than
the original size, so you can overflow.

See strnlen_evex512 for:

strlen=319
align%4096 = 1
maxlen = -1UL

expec = 319
result = 18446744073709551615
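
A quick C model of the wrap (just a sketch using the values above; the
variable names are mine, not from the patch): with maxlen = -1UL the
remaining count is already at the top of the 64-bit range, so adding back
the re-compared chars wraps it to 0, and the loop's first subq/jbe then
falls straight through to L(ret_max), returning maxlen instead of 319.

#include <inttypes.h>
#include <stdio.h>

int
main (void)
{
  uint64_t maxlen = UINT64_MAX;            /* -1UL from the repro above.  */
  uint64_t compared = 63;                  /* first vector, align % 4096 == 1.  */
  uint64_t remaining = maxlen - compared;  /* rdx after subq %rsi, %rdx; negq %rdx.  */
  uint64_t recompare = 64;                 /* chars re-scanned after 4 * VEC alignment.  */

  remaining += recompare;                  /* addq %rcx, %rdx -- wraps to 0.  */
  printf ("remaining = %" PRIu64 "\n", remaining);
  return 0;
}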

> +# endif
> +
> +       .p2align 4,,11
> +L(loop):
> +# ifdef USE_AS_STRNLEN
> +       subq    $(CHAR_PER_VEC * 4), %rdx
> +       jbe     L(ret_max)
> +# endif
> +       /* VPMINU and VPCMP combination provide better performance as
> +          compared to alternative combinations.  */
> +       VMOVA   (VEC_SIZE * 4)(%rax), %VMM1
> +       VPMINU  (VEC_SIZE * 5)(%rax), %VMM1, %VMM2
> +       VMOVA   (VEC_SIZE * 6)(%rax), %VMM3
> +       VPMINU  (VEC_SIZE * 7)(%rax), %VMM3, %VMM4
> +
> +       VPTESTN %VMM2, %VMM2, %k0
> +       VPTESTN %VMM4, %VMM4, %k1
> +
> +       subq    $-(VEC_SIZE * 4), %rax
> +       KORTEST %k0, %k1
> +       jz      L(loop)
> +
> +       VPTESTN %VMM1, %VMM1, %k2
> +       KMOV    %k2, %RCX
> +       test    %RCX, %RCX
> +       jnz     L(ret_vec_x1)
> +
> +       KMOV    %k0, %RCX
> +       /* At this point, if k0 is non zero, null char must be in the
> +          second vector.  */
> +       test    %RCX, %RCX
> +       jnz     L(ret_vec_x2)
> +
> +       VPTESTN %VMM3, %VMM3, %k3
> +       KMOV    %k3, %RCX
> +       test    %RCX, %RCX
> +       jnz     L(ret_vec_x3)
> +       /* At this point null [w]char must be in the fourth vector so no
> +          need to check.  */
> +       KMOV    %k1, %RCX
> +
> +       /* Fourth, third, second vector terminating are pretty much
> +          same, implemented this way to avoid branching and reuse code
> +          from pre loop exit condition.  */
> +L(ret_vec_x4):
> +       bsf     %RCX, %RCX
> +       subq    %rdi, %rax
> +# ifdef USE_AS_WCSLEN
> +       subq    $-(VEC_SIZE * 3), %rax
> +       shrq    $2, %rax
> +       addq    %rcx, %rax
> +# else
> +       leaq    (VEC_SIZE * 3)(%rcx, %rax), %rax
> +# endif
> +# ifdef USE_AS_STRNLEN
> +       cmpq    %rsi, %rax
> +       cmovnb  %rsi, %rax
> +# endif
> +       ret
> +
> +L(ret_vec_x3):
> +       bsf     %RCX, %RCX
> +       subq    %rdi, %rax
> +# ifdef USE_AS_WCSLEN
> +       subq    $-(VEC_SIZE * 2), %rax
> +       shrq    $2, %rax
> +       addq    %rcx, %rax
> +# else
> +       leaq    (VEC_SIZE * 2)(%rcx, %rax), %rax
> +# endif
> +# ifdef USE_AS_STRNLEN
> +       cmpq    %rsi, %rax
> +       cmovnb  %rsi, %rax
> +# endif
> +       ret
> +
> +L(ret_vec_x2):
> +       subq    $-VEC_SIZE, %rax
> +L(ret_vec_x1):
> +       bsf     %RCX, %RCX
> +       subq    %rdi, %rax
> +# ifdef USE_AS_WCSLEN
> +       shrq    $2, %rax
> +# endif
> +       addq    %rcx, %rax
> +# ifdef USE_AS_STRNLEN
> +       cmpq    %rsi, %rax
> +       cmovnb  %rsi, %rax
> +# endif
> +       ret
> +
> +L(page_cross):
> +       movl    %eax, %ecx
> +# ifdef USE_AS_WCSLEN
> +       andl    $(VEC_SIZE - 1), %ecx
> +       sarl    $2, %ecx
> +# endif
> +       /* ecx contains number of w[char] to be skipped as a result
> +          of address alignment.  */
> +       xorq    %rdi, %rax
> +       VPCMP   $0, (PAGE_SIZE - VEC_SIZE)(%rax), %VMM0, %k0
> +       KMOV    %k0, %RAX
> +       /* Ignore number of character for alignment adjustment.  */
> +       SHR     %cl, %RAX
> +       jz      L(align_more)
> +
> +       bsf     %RAX, %RAX
> +# ifdef USE_AS_STRNLEN
> +       cmpq    %rsi, %rax
> +       cmovnb  %rsi, %rax
> +# endif
> +       ret
> +
> +END (STRLEN)
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/strlen-evex512.S b/sysdeps/x86_64/multiarch/strlen-evex512.S
> new file mode 100644
> index 0000000000..116f8981c8
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strlen-evex512.S
> @@ -0,0 +1,7 @@
> +#ifndef STRLEN
> +# define STRLEN                __strlen_evex512
> +#endif
> +
> +#define VEC_SIZE       64
> +
> +#include "strlen-evex-base.S"
> diff --git a/sysdeps/x86_64/multiarch/strnlen-evex512.S b/sysdeps/x86_64/multiarch/strnlen-evex512.S
> new file mode 100644
> index 0000000000..0b7f220214
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strnlen-evex512.S
> @@ -0,0 +1,4 @@
> +#define STRLEN __strnlen_evex512
> +#define USE_AS_STRNLEN 1
> +
> +#include "strlen-evex512.S"
> diff --git a/sysdeps/x86_64/multiarch/wcslen-evex512.S b/sysdeps/x86_64/multiarch/wcslen-evex512.S
> new file mode 100644
> index 0000000000..f59c372b78
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/wcslen-evex512.S
> @@ -0,0 +1,4 @@
> +#define STRLEN __wcslen_evex512
> +#define USE_AS_WCSLEN 1
> +
> +#include "strlen-evex512.S"
> diff --git a/sysdeps/x86_64/multiarch/wcsnlen-evex512.S b/sysdeps/x86_64/multiarch/wcsnlen-evex512.S
> new file mode 100644
> index 0000000000..73dcf2f210
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/wcsnlen-evex512.S
> @@ -0,0 +1,5 @@
> +#define STRLEN __wcsnlen_evex512
> +#define USE_AS_WCSLEN 1
> +#define USE_AS_STRNLEN 1
> +
> +#include "strlen-evex512.S"
> --
> 2.35.3
>

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v3] x86_64: Implement evex512 version of strlen, strnlen,  wcslen and wcsnlen
  2022-05-25 17:10           ` Noah Goldstein
@ 2022-05-25 18:20             ` Sunil Pandey
  2022-05-26 18:35               ` [PATCH v4] " Sunil K Pandey
  0 siblings, 1 reply; 12+ messages in thread
From: Sunil Pandey @ 2022-05-25 18:20 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library

On Wed, May 25, 2022 at 10:10 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Wed, May 25, 2022 at 8:44 AM Sunil K Pandey via Libc-alpha
> <libc-alpha@sourceware.org> wrote:
> >
> > This patch implements following evex512 version of string functions.
> > Perf gain for evex512 version is up to 50% as compared to evex,
> > depending on length and alignment.
> >
> > These functions are currently just for benchmarking/reference.
> >
> > - String length function using 512 bit vectors.
> > - String N length using 512 bit vectors.
> > - Wide string length using 512 bit vectors.
> > - Wide string N length using 512 bit vectors.
> > ---
> >  sysdeps/x86_64/multiarch/Makefile           |   4 +
> >  sysdeps/x86_64/multiarch/ifunc-impl-list.c  |  20 ++
> >  sysdeps/x86_64/multiarch/strlen-evex-base.S | 299 ++++++++++++++++++++
> >  sysdeps/x86_64/multiarch/strlen-evex512.S   |   7 +
> >  sysdeps/x86_64/multiarch/strnlen-evex512.S  |   4 +
> >  sysdeps/x86_64/multiarch/wcslen-evex512.S   |   4 +
> >  sysdeps/x86_64/multiarch/wcsnlen-evex512.S  |   5 +
> >  7 files changed, 343 insertions(+)
> >  create mode 100644 sysdeps/x86_64/multiarch/strlen-evex-base.S
> >  create mode 100644 sysdeps/x86_64/multiarch/strlen-evex512.S
> >  create mode 100644 sysdeps/x86_64/multiarch/strnlen-evex512.S
> >  create mode 100644 sysdeps/x86_64/multiarch/wcslen-evex512.S
> >  create mode 100644 sysdeps/x86_64/multiarch/wcsnlen-evex512.S
> >
> > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> > index f3ab5e0928..d0869c3ac3 100644
> > --- a/sysdeps/x86_64/multiarch/Makefile
> > +++ b/sysdeps/x86_64/multiarch/Makefile
> > @@ -81,6 +81,7 @@ sysdep_routines += \
> >    strlen-avx2 \
> >    strlen-avx2-rtm \
> >    strlen-evex \
> > +  strlen-evex512 \
> >    strlen-sse2 \
> >    strncase_l-avx2 \
> >    strncase_l-avx2-rtm \
> > @@ -105,6 +106,7 @@ sysdep_routines += \
> >    strnlen-avx2 \
> >    strnlen-avx2-rtm \
> >    strnlen-evex \
> > +  strnlen-evex512 \
> >    strnlen-sse2 \
> >    strpbrk-c \
> >    strpbrk-sse2 \
> > @@ -138,6 +140,7 @@ sysdep_routines += \
> >    wcslen-avx2 \
> >    wcslen-avx2-rtm \
> >    wcslen-evex \
> > +  wcslen-evex512 \
> >    wcslen-sse2 \
> >    wcslen-sse4_1 \
> >    wcsncmp-avx2 \
> > @@ -148,6 +151,7 @@ sysdep_routines += \
> >    wcsnlen-avx2-rtm \
> >    wcsnlen-c \
> >    wcsnlen-evex \
> > +  wcsnlen-evex512 \
> >    wcsnlen-sse4_1 \
> >    wcsrchr-avx2 \
> >    wcsrchr-avx2-rtm \
> > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > index 7218095430..c5cd9466fe 100644
> > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > @@ -328,6 +328,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> >                                && CPU_FEATURE_USABLE (AVX512BW)
> >                                && CPU_FEATURE_USABLE (BMI2)),
> >                               __strlen_evex)
> > +             IFUNC_IMPL_ADD (array, i, strlen,
> > +                             (CPU_FEATURE_USABLE (AVX512VL)
> > +                              && CPU_FEATURE_USABLE (AVX512BW)
> > +                              && CPU_FEATURE_USABLE (BMI2)),
> > +                             __strlen_evex512)
> >               IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2))
> >
> >    /* Support sysdeps/x86_64/multiarch/strnlen.c.  */
> > @@ -346,6 +351,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> >                                && CPU_FEATURE_USABLE (AVX512BW)
> >                                && CPU_FEATURE_USABLE (BMI2)),
> >                               __strnlen_evex)
> > +             IFUNC_IMPL_ADD (array, i, strnlen,
> > +                             (CPU_FEATURE_USABLE (AVX512VL)
> > +                              && CPU_FEATURE_USABLE (AVX512BW)
> > +                              && CPU_FEATURE_USABLE (BMI2)),
> > +                             __strnlen_evex512)
> >               IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_sse2))
> >
> >    /* Support sysdeps/x86_64/multiarch/stpncpy.c.  */
> > @@ -699,6 +709,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> >                                && CPU_FEATURE_USABLE (AVX512BW)
> >                                && CPU_FEATURE_USABLE (BMI2)),
> >                               __wcslen_evex)
> > +             IFUNC_IMPL_ADD (array, i, wcslen,
> > +                             (CPU_FEATURE_USABLE (AVX512VL)
> > +                              && CPU_FEATURE_USABLE (AVX512BW)
> > +                              && CPU_FEATURE_USABLE (BMI2)),
> > +                             __wcslen_evex512)
> >               IFUNC_IMPL_ADD (array, i, wcslen,
> >                               CPU_FEATURE_USABLE (SSE4_1),
> >                               __wcslen_sse4_1)
> > @@ -720,6 +735,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> >                                && CPU_FEATURE_USABLE (AVX512BW)
> >                                && CPU_FEATURE_USABLE (BMI2)),
> >                               __wcsnlen_evex)
> > +             IFUNC_IMPL_ADD (array, i, wcsnlen,
> > +                             (CPU_FEATURE_USABLE (AVX512VL)
> > +                              && CPU_FEATURE_USABLE (AVX512BW)
> > +                              && CPU_FEATURE_USABLE (BMI2)),
> > +                             __wcsnlen_evex512)
> >               IFUNC_IMPL_ADD (array, i, wcsnlen,
> >                               CPU_FEATURE_USABLE (SSE4_1),
> >                               __wcsnlen_sse4_1)
> > diff --git a/sysdeps/x86_64/multiarch/strlen-evex-base.S b/sysdeps/x86_64/multiarch/strlen-evex-base.S
> > new file mode 100644
> > index 0000000000..bd09967f76
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/strlen-evex-base.S
> > @@ -0,0 +1,299 @@
> > +/* Placeholder function, not used by any processor at the moment.
> > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#if IS_IN (libc)
> > +
> > +# include <sysdep.h>
> > +
> > +# ifdef USE_AS_WCSLEN
> > +#  define VPCMP                vpcmpd
> > +#  define VPTESTN      vptestnmd
> > +#  define VPMINU       vpminud
> > +#  define CHAR_SIZE    4
> > +# else
> > +#  define VPCMP                vpcmpb
> > +#  define VPTESTN      vptestnmb
> > +#  define VPMINU       vpminub
> > +#  define CHAR_SIZE    1
> > +# endif
> > +
> > +# define XMM0          xmm16
> > +# define PAGE_SIZE     4096
> > +# define CHAR_PER_VEC  (VEC_SIZE / CHAR_SIZE)
> > +
> > +# if VEC_SIZE == 64
> > +#  define KMOV         kmovq
> > +#  define KORTEST      kortestq
> > +#  define RAX          rax
> > +#  define RCX          rcx
> > +#  define RDX          rdx
> > +#  define SHR          shrq
> > +#  define TEXTSUFFIX   evex512
> > +#  define VMM0         zmm16
> > +#  define VMM1         zmm17
> > +#  define VMM2         zmm18
> > +#  define VMM3         zmm19
> > +#  define VMM4         zmm20
> > +#  define VMOVA                vmovdqa64
> > +# elif VEC_SIZE == 32
> > +/* Currently Unused.  */
> > +#  define KMOV         kmovd
> > +#  define KORTEST      kortestd
> > +#  define RAX          eax
> > +#  define RCX          ecx
> > +#  define RDX          edx
> > +#  define SHR          shrl
> > +#  define TEXTSUFFIX   evex256
> > +#  define VMM0         ymm16
> > +#  define VMM1         ymm17
> > +#  define VMM2         ymm18
> > +#  define VMM3         ymm19
> > +#  define VMM4         ymm20
> > +#  define VMOVA                vmovdqa32
> > +# endif
> > +
> > +       .section .text.TEXTSUFFIX, "ax", @progbits
> > +/* Aligning entry point to 64 byte, provides better performance for
> > +   one vector length string.  */
> > +ENTRY_P2ALIGN (STRLEN, 6)
> > +# ifdef USE_AS_STRNLEN
> > +       /* Check zero length.  */
> > +       test    %RSI_LP, %RSI_LP
> > +       jz      L(ret_max)
> > +#  ifdef __ILP32__
> > +       /* Clear the upper 32 bits.  */
> > +       movl    %esi, %esi
> > +#  endif
> > +# endif
> > +
> > +       movl    %edi, %eax
> > +       vpxorq  %XMM0, %XMM0, %XMM0
> > +       andl    $(PAGE_SIZE - 1), %eax
> > +       cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
> > +       ja      L(page_cross)
> > +
> > +       /* Compare [w]char for null, mask bit will be set for match.  */
> > +       VPCMP   $0, (%rdi), %VMM0, %k0
> > +       KMOV    %k0, %RAX
> > +       test    %RAX, %RAX
> > +       jz      L(align_more)
> > +
> > +       bsf     %RAX, %RAX
> > +# ifdef USE_AS_STRNLEN
> > +       cmpq    %rsi, %rax
> > +       cmovnb  %rsi, %rax
> > +# endif
> > +       ret
> > +
> > +       /* At this point vector max length reached.  */
> > +# ifdef USE_AS_STRNLEN
> > +       .p2align 4,,3
> > +L(ret_max):
> > +       movq    %rsi, %rax
> > +       ret
> > +# endif
> > +
> > +L(align_more):
> > +       leaq    VEC_SIZE(%rdi), %rax
> > +       /* Align rax to VEC_SIZE.  */
> > +       andq    $-VEC_SIZE, %rax
> > +# ifdef USE_AS_STRNLEN
> > +       movq    %rax, %rdx
> > +       subq    %rdi, %rdx
> > +#  ifdef USE_AS_WCSLEN
> > +       SHR     $2, %RDX
> > +#  endif
> > +       /* At this point rdx contains [w]chars already compared.  */
> > +       subq    %rsi, %rdx
> > +       jae     L(ret_max)
> > +       negq    %rdx
> > +       /* At this point rdx contains number of w[char] needs to go.
> > +          Now onwards rdx will keep decrementing with each compare.  */
> > +# endif
> > +
> > +       /* Loop unroll 4 times for 4 vector loop.  */
> > +       VPCMP   $0, (%rax), %VMM0, %k0
> > +       KMOV    %k0, %RCX
> > +       test    %RCX, %RCX
> > +       jnz     L(ret_vec_x1)
> > +
> > +# ifdef USE_AS_STRNLEN
> > +       subq    $CHAR_PER_VEC, %rdx
> > +       jbe     L(ret_max)
> > +# endif
> > +
> > +       VPCMP   $0, VEC_SIZE(%rax), %VMM0, %k0
> > +       KMOV    %k0, %RCX
> > +       test    %RCX, %RCX
> > +       jnz     L(ret_vec_x2)
> > +
> > +# ifdef USE_AS_STRNLEN
> > +       subq    $CHAR_PER_VEC, %rdx
> > +       jbe     L(ret_max)
> > +# endif
> > +
> > +       VPCMP   $0, (VEC_SIZE * 2)(%rax), %VMM0, %k0
> > +       KMOV    %k0, %RCX
> > +       test    %RCX, %RCX
> > +       jnz     L(ret_vec_x3)
> > +
> > +# ifdef USE_AS_STRNLEN
> > +       subq    $CHAR_PER_VEC, %rdx
> > +       jbe     L(ret_max)
> > +# endif
> > +
> > +       VPCMP   $0, (VEC_SIZE * 3)(%rax), %VMM0, %k0
> > +       KMOV    %k0, %RCX
> > +       test    %RCX, %RCX
> > +       jnz     L(ret_vec_x4)
> > +
> > +# ifdef USE_AS_STRNLEN
> > +       /* Instead of decreasing, rdx increased to prepare for loop
> > +          first iteration.  Incremented 3 times because one increment
> > +          cancelled by previous decrement.  */
> > +       subq    $-(CHAR_PER_VEC * 3), %rdx
> > +       movq    %rax, %rcx
> > +# endif
> > +
> > +       /* Align address to VEC_SIZE * 4 for loop.  */
> > +       andq    $-(VEC_SIZE * 4), %rax
> > +
> > +# ifdef USE_AS_STRNLEN
> > +       subq    %rax, %rcx
> > +#  ifdef USE_AS_WCSLEN
> > +       SHR     $2, %RCX
> > +#  endif
> > +       /* rcx contains number of [w]char will be recompared due to
> > +          alignment fixes.  rdx must be incremented by rcx to offset
> > +          alignment adjustment.  */
> > +       addq    %rcx, %rdx
>
> This is buggy for strnlen / wcsnlen. You are adding back more than
> the original size, so you can overflow.
>
> See strnlen_evex512 for:
>
> strlen=319
> align%4096 = 1
> maxlen = -1UL
>
> expec = 319
> result = 18446744073709551615
>

Good catch. Will fix it in v4.
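
Rough sketch of the plan in C (the names here are mine and only model the
arithmetic, not the final asm): keep decrementing rdx for all four
pre-loop vector checks, add back the at most 3 * CHAR_PER_VEC re-compared
chars after the 4 * VEC_SIZE alignment, and jump past the loop's first
subq so nothing is double counted.  The add-back is then always smaller
than what was already subtracted, so the count can no longer wrap above
maxlen:

#include <assert.h>
#include <stdint.h>

int
main (void)
{
  const uint64_t cpv = 64;                 /* CHAR_PER_VEC for strnlen.  */
  uint64_t maxlen = UINT64_MAX;            /* values from the repro above.  */
  uint64_t compared = 63;                  /* align % 4096 == 1.  */
  uint64_t remaining = maxlen - compared;

  remaining -= 4 * cpv;                    /* four pre-loop subq/jbe checks.  */
  uint64_t recompare = 64;                 /* <= 3 * cpv by construction.  */
  remaining += recompare;                  /* addq %rcx, %rdx.  */

  /* Jumping past the loop's first subq means this value is used as is;
     it cannot exceed maxlen any more.  */
  assert (remaining <= maxlen);
  return 0;
}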

> > +# endif
> > +
> > +       .p2align 4,,11
> > +L(loop):
> > +# ifdef USE_AS_STRNLEN
> > +       subq    $(CHAR_PER_VEC * 4), %rdx
> > +       jbe     L(ret_max)
> > +# endif
> > +       /* VPMINU and VPCMP combination provide better performance as
> > +          compared to alternative combinations.  */
> > +       VMOVA   (VEC_SIZE * 4)(%rax), %VMM1
> > +       VPMINU  (VEC_SIZE * 5)(%rax), %VMM1, %VMM2
> > +       VMOVA   (VEC_SIZE * 6)(%rax), %VMM3
> > +       VPMINU  (VEC_SIZE * 7)(%rax), %VMM3, %VMM4
> > +
> > +       VPTESTN %VMM2, %VMM2, %k0
> > +       VPTESTN %VMM4, %VMM4, %k1
> > +
> > +       subq    $-(VEC_SIZE * 4), %rax
> > +       KORTEST %k0, %k1
> > +       jz      L(loop)
> > +
> > +       VPTESTN %VMM1, %VMM1, %k2
> > +       KMOV    %k2, %RCX
> > +       test    %RCX, %RCX
> > +       jnz     L(ret_vec_x1)
> > +
> > +       KMOV    %k0, %RCX
> > +       /* At this point, if k0 is non zero, null char must be in the
> > +          second vector.  */
> > +       test    %RCX, %RCX
> > +       jnz     L(ret_vec_x2)
> > +
> > +       VPTESTN %VMM3, %VMM3, %k3
> > +       KMOV    %k3, %RCX
> > +       test    %RCX, %RCX
> > +       jnz     L(ret_vec_x3)
> > +       /* At this point null [w]char must be in the fourth vector so no
> > +          need to check.  */
> > +       KMOV    %k1, %RCX
> > +
> > +       /* Fourth, third, second vector terminating are pretty much
> > +          same, implemented this way to avoid branching and reuse code
> > +          from pre loop exit condition.  */
> > +L(ret_vec_x4):
> > +       bsf     %RCX, %RCX
> > +       subq    %rdi, %rax
> > +# ifdef USE_AS_WCSLEN
> > +       subq    $-(VEC_SIZE * 3), %rax
> > +       shrq    $2, %rax
> > +       addq    %rcx, %rax
> > +# else
> > +       leaq    (VEC_SIZE * 3)(%rcx, %rax), %rax
> > +# endif
> > +# ifdef USE_AS_STRNLEN
> > +       cmpq    %rsi, %rax
> > +       cmovnb  %rsi, %rax
> > +# endif
> > +       ret
> > +
> > +L(ret_vec_x3):
> > +       bsf     %RCX, %RCX
> > +       subq    %rdi, %rax
> > +# ifdef USE_AS_WCSLEN
> > +       subq    $-(VEC_SIZE * 2), %rax
> > +       shrq    $2, %rax
> > +       addq    %rcx, %rax
> > +# else
> > +       leaq    (VEC_SIZE * 2)(%rcx, %rax), %rax
> > +# endif
> > +# ifdef USE_AS_STRNLEN
> > +       cmpq    %rsi, %rax
> > +       cmovnb  %rsi, %rax
> > +# endif
> > +       ret
> > +
> > +L(ret_vec_x2):
> > +       subq    $-VEC_SIZE, %rax
> > +L(ret_vec_x1):
> > +       bsf     %RCX, %RCX
> > +       subq    %rdi, %rax
> > +# ifdef USE_AS_WCSLEN
> > +       shrq    $2, %rax
> > +# endif
> > +       addq    %rcx, %rax
> > +# ifdef USE_AS_STRNLEN
> > +       cmpq    %rsi, %rax
> > +       cmovnb  %rsi, %rax
> > +# endif
> > +       ret
> > +
> > +L(page_cross):
> > +       movl    %eax, %ecx
> > +# ifdef USE_AS_WCSLEN
> > +       andl    $(VEC_SIZE - 1), %ecx
> > +       sarl    $2, %ecx
> > +# endif
> > +       /* ecx contains number of w[char] to be skipped as a result
> > +          of address alignment.  */
> > +       xorq    %rdi, %rax
> > +       VPCMP   $0, (PAGE_SIZE - VEC_SIZE)(%rax), %VMM0, %k0
> > +       KMOV    %k0, %RAX
> > +       /* Ignore number of character for alignment adjustment.  */
> > +       SHR     %cl, %RAX
> > +       jz      L(align_more)
> > +
> > +       bsf     %RAX, %RAX
> > +# ifdef USE_AS_STRNLEN
> > +       cmpq    %rsi, %rax
> > +       cmovnb  %rsi, %rax
> > +# endif
> > +       ret
> > +
> > +END (STRLEN)
> > +#endif
> > diff --git a/sysdeps/x86_64/multiarch/strlen-evex512.S b/sysdeps/x86_64/multiarch/strlen-evex512.S
> > new file mode 100644
> > index 0000000000..116f8981c8
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/strlen-evex512.S
> > @@ -0,0 +1,7 @@
> > +#ifndef STRLEN
> > +# define STRLEN                __strlen_evex512
> > +#endif
> > +
> > +#define VEC_SIZE       64
> > +
> > +#include "strlen-evex-base.S"
> > diff --git a/sysdeps/x86_64/multiarch/strnlen-evex512.S b/sysdeps/x86_64/multiarch/strnlen-evex512.S
> > new file mode 100644
> > index 0000000000..0b7f220214
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/strnlen-evex512.S
> > @@ -0,0 +1,4 @@
> > +#define STRLEN __strnlen_evex512
> > +#define USE_AS_STRNLEN 1
> > +
> > +#include "strlen-evex512.S"
> > diff --git a/sysdeps/x86_64/multiarch/wcslen-evex512.S b/sysdeps/x86_64/multiarch/wcslen-evex512.S
> > new file mode 100644
> > index 0000000000..f59c372b78
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/wcslen-evex512.S
> > @@ -0,0 +1,4 @@
> > +#define STRLEN __wcslen_evex512
> > +#define USE_AS_WCSLEN 1
> > +
> > +#include "strlen-evex512.S"
> > diff --git a/sysdeps/x86_64/multiarch/wcsnlen-evex512.S b/sysdeps/x86_64/multiarch/wcsnlen-evex512.S
> > new file mode 100644
> > index 0000000000..73dcf2f210
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/wcsnlen-evex512.S
> > @@ -0,0 +1,5 @@
> > +#define STRLEN __wcsnlen_evex512
> > +#define USE_AS_WCSLEN 1
> > +#define USE_AS_STRNLEN 1
> > +
> > +#include "strlen-evex512.S"
> > --
> > 2.35.3
> >

^ permalink raw reply	[flat|nested] 12+ messages in thread

* [PATCH v4] x86_64: Implement evex512 version of strlen, strnlen, wcslen and wcsnlen
  2022-05-25 18:20             ` Sunil Pandey
@ 2022-05-26 18:35               ` Sunil K Pandey
  2022-05-26 20:07                 ` Noah Goldstein
  0 siblings, 1 reply; 12+ messages in thread
From: Sunil K Pandey @ 2022-05-26 18:35 UTC (permalink / raw)
  To: libc-alpha

This patch implements the following evex512 versions of string functions.
Perf gain for the evex512 version is up to 50% compared to evex,
depending on length and alignment.

Placeholder function, not used by any processor at the moment.

- String length function using 512 bit vectors.
- String N length using 512 bit vectors.
- Wide string length using 512 bit vectors.
- Wide string N length using 512 bit vectors.
---
 sysdeps/x86_64/multiarch/Makefile           |   4 +
 sysdeps/x86_64/multiarch/ifunc-impl-list.c  |  20 ++
 sysdeps/x86_64/multiarch/strlen-evex-base.S | 302 ++++++++++++++++++++
 sysdeps/x86_64/multiarch/strlen-evex512.S   |   7 +
 sysdeps/x86_64/multiarch/strnlen-evex512.S  |   4 +
 sysdeps/x86_64/multiarch/wcslen-evex512.S   |   4 +
 sysdeps/x86_64/multiarch/wcsnlen-evex512.S  |   5 +
 7 files changed, 346 insertions(+)
 create mode 100644 sysdeps/x86_64/multiarch/strlen-evex-base.S
 create mode 100644 sysdeps/x86_64/multiarch/strlen-evex512.S
 create mode 100644 sysdeps/x86_64/multiarch/strnlen-evex512.S
 create mode 100644 sysdeps/x86_64/multiarch/wcslen-evex512.S
 create mode 100644 sysdeps/x86_64/multiarch/wcsnlen-evex512.S

diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index f3ab5e0928..d0869c3ac3 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -81,6 +81,7 @@ sysdep_routines += \
   strlen-avx2 \
   strlen-avx2-rtm \
   strlen-evex \
+  strlen-evex512 \
   strlen-sse2 \
   strncase_l-avx2 \
   strncase_l-avx2-rtm \
@@ -105,6 +106,7 @@ sysdep_routines += \
   strnlen-avx2 \
   strnlen-avx2-rtm \
   strnlen-evex \
+  strnlen-evex512 \
   strnlen-sse2 \
   strpbrk-c \
   strpbrk-sse2 \
@@ -138,6 +140,7 @@ sysdep_routines += \
   wcslen-avx2 \
   wcslen-avx2-rtm \
   wcslen-evex \
+  wcslen-evex512 \
   wcslen-sse2 \
   wcslen-sse4_1 \
   wcsncmp-avx2 \
@@ -148,6 +151,7 @@ sysdep_routines += \
   wcsnlen-avx2-rtm \
   wcsnlen-c \
   wcsnlen-evex \
+  wcsnlen-evex512 \
   wcsnlen-sse4_1 \
   wcsrchr-avx2 \
   wcsrchr-avx2-rtm \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 7218095430..c5cd9466fe 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -328,6 +328,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			       && CPU_FEATURE_USABLE (AVX512BW)
 			       && CPU_FEATURE_USABLE (BMI2)),
 			      __strlen_evex)
+	      IFUNC_IMPL_ADD (array, i, strlen,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+			      __strlen_evex512)
 	      IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2))
 
   /* Support sysdeps/x86_64/multiarch/strnlen.c.  */
@@ -346,6 +351,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			       && CPU_FEATURE_USABLE (AVX512BW)
 			       && CPU_FEATURE_USABLE (BMI2)),
 			      __strnlen_evex)
+	      IFUNC_IMPL_ADD (array, i, strnlen,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+			      __strnlen_evex512)
 	      IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_sse2))
 
   /* Support sysdeps/x86_64/multiarch/stpncpy.c.  */
@@ -699,6 +709,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			       && CPU_FEATURE_USABLE (AVX512BW)
 			       && CPU_FEATURE_USABLE (BMI2)),
 			      __wcslen_evex)
+	      IFUNC_IMPL_ADD (array, i, wcslen,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+			      __wcslen_evex512)
 	      IFUNC_IMPL_ADD (array, i, wcslen,
 			      CPU_FEATURE_USABLE (SSE4_1),
 			      __wcslen_sse4_1)
@@ -720,6 +735,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			       && CPU_FEATURE_USABLE (AVX512BW)
 			       && CPU_FEATURE_USABLE (BMI2)),
 			      __wcsnlen_evex)
+	      IFUNC_IMPL_ADD (array, i, wcsnlen,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+			      __wcsnlen_evex512)
 	      IFUNC_IMPL_ADD (array, i, wcsnlen,
 			      CPU_FEATURE_USABLE (SSE4_1),
 			      __wcsnlen_sse4_1)
diff --git a/sysdeps/x86_64/multiarch/strlen-evex-base.S b/sysdeps/x86_64/multiarch/strlen-evex-base.S
new file mode 100644
index 0000000000..278c899691
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strlen-evex-base.S
@@ -0,0 +1,302 @@
+/* Placeholder function, not used by any processor at the moment.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# ifdef USE_AS_WCSLEN
+#  define VPCMP		vpcmpd
+#  define VPTESTN	vptestnmd
+#  define VPMINU	vpminud
+#  define CHAR_SIZE	4
+# else
+#  define VPCMP		vpcmpb
+#  define VPTESTN	vptestnmb
+#  define VPMINU	vpminub
+#  define CHAR_SIZE	1
+# endif
+
+# define XMM0		xmm16
+# define PAGE_SIZE	4096
+# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
+
+# if VEC_SIZE == 64
+#  define KMOV		kmovq
+#  define KORTEST	kortestq
+#  define RAX		rax
+#  define RCX		rcx
+#  define RDX		rdx
+#  define SHR		shrq
+#  define TEXTSUFFIX	evex512
+#  define VMM0		zmm16
+#  define VMM1		zmm17
+#  define VMM2		zmm18
+#  define VMM3		zmm19
+#  define VMM4		zmm20
+#  define VMOVA		vmovdqa64
+# elif VEC_SIZE == 32
+/* Currently Unused.  */
+#  define KMOV		kmovd
+#  define KORTEST	kortestd
+#  define RAX		eax
+#  define RCX		ecx
+#  define RDX		edx
+#  define SHR		shrl
+#  define TEXTSUFFIX	evex256
+#  define VMM0		ymm16
+#  define VMM1		ymm17
+#  define VMM2		ymm18
+#  define VMM3		ymm19
+#  define VMM4		ymm20
+#  define VMOVA		vmovdqa32
+# endif
+
+	.section .text.TEXTSUFFIX, "ax", @progbits
+/* Aligning the entry point to 64 bytes provides better performance for
+   one-vector-length strings.  */
+ENTRY_P2ALIGN (STRLEN, 6)
+# ifdef USE_AS_STRNLEN
+	/* Check zero length.  */
+	test	%RSI_LP, %RSI_LP
+	jz	L(ret_max)
+#  ifdef __ILP32__
+	/* Clear the upper 32 bits.  */
+	movl	%esi, %esi
+#  endif
+# endif
+
+	movl	%edi, %eax
+	vpxorq	%XMM0, %XMM0, %XMM0
+	andl	$(PAGE_SIZE - 1), %eax
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	ja	L(page_cross)
+
+	/* Compare [w]char for null, mask bit will be set for match.  */
+	VPCMP	$0, (%rdi), %VMM0, %k0
+	KMOV	%k0, %RAX
+	test	%RAX, %RAX
+	jz	L(align_more)
+
+	bsf	%RAX, %RAX
+# ifdef USE_AS_STRNLEN
+	cmpq	%rsi, %rax
+	cmovnb	%rsi, %rax
+# endif
+	ret
+
+	/* At this point vector max length reached.  */
+# ifdef USE_AS_STRNLEN
+	.p2align 4,,3
+L(ret_max):
+	movq	%rsi, %rax
+	ret
+# endif
+
+L(align_more):
+	leaq	VEC_SIZE(%rdi), %rax
+	/* Align rax to VEC_SIZE.  */
+	andq	$-VEC_SIZE, %rax
+# ifdef USE_AS_STRNLEN
+	movq	%rax, %rdx
+	subq	%rdi, %rdx
+#  ifdef USE_AS_WCSLEN
+	SHR	$2, %RDX
+#  endif
+	/* At this point rdx contains the number of [w]chars already compared.  */
+	subq	%rsi, %rdx
+	jae	L(ret_max)
+	negq	%rdx
+	/* At this point rdx contains the number of [w]chars left to check.
+	   From here on rdx keeps decrementing with each compare.  */
+# endif
+
+	/* Loop unroll 4 times for 4 vector loop.  */
+	VPCMP	$0, (%rax), %VMM0, %k0
+	KMOV	%k0, %RCX
+	test	%RCX, %RCX
+	jnz	L(ret_vec_x1)
+
+# ifdef USE_AS_STRNLEN
+	subq	$CHAR_PER_VEC, %rdx
+	jbe	L(ret_max)
+# endif
+
+	VPCMP	$0, VEC_SIZE(%rax), %VMM0, %k0
+	KMOV	%k0, %RCX
+	test	%RCX, %RCX
+	jnz	L(ret_vec_x2)
+
+# ifdef USE_AS_STRNLEN
+	subq	$CHAR_PER_VEC, %rdx
+	jbe	L(ret_max)
+# endif
+
+	VPCMP	$0, (VEC_SIZE * 2)(%rax), %VMM0, %k0
+	KMOV	%k0, %RCX
+	test	%RCX, %RCX
+	jnz	L(ret_vec_x3)
+
+# ifdef USE_AS_STRNLEN
+	subq	$CHAR_PER_VEC, %rdx
+	jbe	L(ret_max)
+# endif
+
+	VPCMP	$0, (VEC_SIZE * 3)(%rax), %VMM0, %k0
+	KMOV	%k0, %RCX
+	test	%RCX, %RCX
+	jnz	L(ret_vec_x4)
+
+# ifdef USE_AS_STRNLEN
+	subq	$CHAR_PER_VEC, %rdx
+	jbe	L(ret_max)
+	/* Save pointer before 4 x VEC_SIZE alignment.  */
+	movq	%rax, %rcx
+# endif
+
+	/* Align address to VEC_SIZE * 4 for loop.  */
+	andq	$-(VEC_SIZE * 4), %rax
+
+# ifdef USE_AS_STRNLEN
+	subq	%rax, %rcx
+#  ifdef USE_AS_WCSLEN
+	SHR	$2, %RCX
+#  endif
+	/* rcx contains the number of [w]chars that will be re-compared due
+	   to the alignment fix.  rdx must be incremented by rcx to offset
+	   the alignment adjustment.  */
+	addq	%rcx, %rdx
+	/* Jump is needed because rdx must not be adjusted for the first
+	   iteration of the 4 x VEC_SIZE aligned loop.  */
+	jmp	L(loop_entry)
+# endif
+
+	.p2align 4,,11
+L(loop):
+# ifdef USE_AS_STRNLEN
+	subq	$(CHAR_PER_VEC * 4), %rdx
+	jbe	L(ret_max)
+L(loop_entry):
+# endif
+	/* The VPMINU and VPCMP combination provides better performance
+	   than alternative combinations.  */
+	VMOVA	(VEC_SIZE * 4)(%rax), %VMM1
+	VPMINU	(VEC_SIZE * 5)(%rax), %VMM1, %VMM2
+	VMOVA	(VEC_SIZE * 6)(%rax), %VMM3
+	VPMINU	(VEC_SIZE * 7)(%rax), %VMM3, %VMM4
+
+	VPTESTN	%VMM2, %VMM2, %k0
+	VPTESTN	%VMM4, %VMM4, %k1
+
+	subq	$-(VEC_SIZE * 4), %rax
+	KORTEST	%k0, %k1
+	jz	L(loop)
+
+	VPTESTN	%VMM1, %VMM1, %k2
+	KMOV	%k2, %RCX
+	test	%RCX, %RCX
+	jnz	L(ret_vec_x1)
+
+	KMOV	%k0, %RCX
+	/* At this point, if k0 is non zero, null char must be in the
+	   second vector.  */
+	test	%RCX, %RCX
+	jnz	L(ret_vec_x2)
+
+	VPTESTN	%VMM3, %VMM3, %k3
+	KMOV	%k3, %RCX
+	test	%RCX, %RCX
+	jnz	L(ret_vec_x3)
+	/* At this point null [w]char must be in the fourth vector so no
+	   need to check.  */
+	KMOV	%k1, %RCX
+
+	/* The fourth, third and second vector terminations are essentially
+	   the same, implemented this way to avoid branching and to reuse
+	   code from the pre-loop exit condition.  */
+L(ret_vec_x4):
+	bsf	%RCX, %RCX
+	subq	%rdi, %rax
+# ifdef USE_AS_WCSLEN
+	subq	$-(VEC_SIZE * 3), %rax
+	shrq	$2, %rax
+	addq	%rcx, %rax
+# else
+	leaq	(VEC_SIZE * 3)(%rcx, %rax), %rax
+# endif
+# ifdef USE_AS_STRNLEN
+	cmpq	%rsi, %rax
+	cmovnb	%rsi, %rax
+# endif
+	ret
+
+L(ret_vec_x3):
+	bsf	%RCX, %RCX
+	subq	%rdi, %rax
+# ifdef USE_AS_WCSLEN
+	subq	$-(VEC_SIZE * 2), %rax
+	shrq	$2, %rax
+	addq	%rcx, %rax
+# else
+	leaq	(VEC_SIZE * 2)(%rcx, %rax), %rax
+# endif
+# ifdef USE_AS_STRNLEN
+	cmpq	%rsi, %rax
+	cmovnb	%rsi, %rax
+# endif
+	ret
+
+L(ret_vec_x2):
+	subq	$-VEC_SIZE, %rax
+L(ret_vec_x1):
+	bsf	%RCX, %RCX
+	subq	%rdi, %rax
+# ifdef USE_AS_WCSLEN
+	shrq	$2, %rax
+# endif
+	addq	%rcx, %rax
+# ifdef USE_AS_STRNLEN
+	cmpq	%rsi, %rax
+	cmovnb	%rsi, %rax
+# endif
+	ret
+
+L(page_cross):
+	movl	%eax, %ecx
+# ifdef USE_AS_WCSLEN
+	andl	$(VEC_SIZE - 1), %ecx
+	sarl	$2, %ecx
+# endif
+	/* ecx contains the number of [w]chars to be skipped as a result
+	   of address alignment.  */
+	xorq	%rdi, %rax
+	VPCMP	$0, (PAGE_SIZE - VEC_SIZE)(%rax), %VMM0, %k0
+	KMOV	%k0, %RAX
+	/* Ignore the characters skipped for the alignment adjustment.  */
+	SHR	%cl, %RAX
+	jz	L(align_more)
+
+	bsf	%RAX, %RAX
+# ifdef USE_AS_STRNLEN
+	cmpq	%rsi, %rax
+	cmovnb	%rsi, %rax
+# endif
+	ret
+
+END (STRLEN)
+#endif
diff --git a/sysdeps/x86_64/multiarch/strlen-evex512.S b/sysdeps/x86_64/multiarch/strlen-evex512.S
new file mode 100644
index 0000000000..116f8981c8
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strlen-evex512.S
@@ -0,0 +1,7 @@
+#ifndef STRLEN
+# define STRLEN		__strlen_evex512
+#endif
+
+#define VEC_SIZE	64
+
+#include "strlen-evex-base.S"
diff --git a/sysdeps/x86_64/multiarch/strnlen-evex512.S b/sysdeps/x86_64/multiarch/strnlen-evex512.S
new file mode 100644
index 0000000000..0b7f220214
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strnlen-evex512.S
@@ -0,0 +1,4 @@
+#define STRLEN __strnlen_evex512
+#define USE_AS_STRNLEN 1
+
+#include "strlen-evex512.S"
diff --git a/sysdeps/x86_64/multiarch/wcslen-evex512.S b/sysdeps/x86_64/multiarch/wcslen-evex512.S
new file mode 100644
index 0000000000..f59c372b78
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcslen-evex512.S
@@ -0,0 +1,4 @@
+#define STRLEN __wcslen_evex512
+#define USE_AS_WCSLEN 1
+
+#include "strlen-evex512.S"
diff --git a/sysdeps/x86_64/multiarch/wcsnlen-evex512.S b/sysdeps/x86_64/multiarch/wcsnlen-evex512.S
new file mode 100644
index 0000000000..73dcf2f210
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsnlen-evex512.S
@@ -0,0 +1,5 @@
+#define STRLEN __wcsnlen_evex512
+#define USE_AS_WCSLEN 1
+#define USE_AS_STRNLEN 1
+
+#include "strlen-evex512.S"
-- 
2.35.3


^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v4] x86_64: Implement evex512 version of strlen, strnlen,  wcslen and wcsnlen
  2022-05-26 18:35               ` [PATCH v4] " Sunil K Pandey
@ 2022-05-26 20:07                 ` Noah Goldstein
  2022-07-14  0:03                   ` Sunil Pandey
  0 siblings, 1 reply; 12+ messages in thread
From: Noah Goldstein @ 2022-05-26 20:07 UTC (permalink / raw)
  To: Sunil K Pandey; +Cc: GNU C Library

On Thu, May 26, 2022 at 1:36 PM Sunil K Pandey via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> This patch implements the following evex512 versions of string functions.
> Perf gain for the evex512 version is up to 50% compared to evex,
> depending on length and alignment.
>
> Placeholder function, not used by any processor at the moment.
>
> - String length function using 512 bit vectors.
> - String N length using 512 bit vectors.
> - Wide string length using 512 bit vectors.
> - Wide string N length using 512 bit vectors.
> ---
>  sysdeps/x86_64/multiarch/Makefile           |   4 +
>  sysdeps/x86_64/multiarch/ifunc-impl-list.c  |  20 ++
>  sysdeps/x86_64/multiarch/strlen-evex-base.S | 302 ++++++++++++++++++++
>  sysdeps/x86_64/multiarch/strlen-evex512.S   |   7 +
>  sysdeps/x86_64/multiarch/strnlen-evex512.S  |   4 +
>  sysdeps/x86_64/multiarch/wcslen-evex512.S   |   4 +
>  sysdeps/x86_64/multiarch/wcsnlen-evex512.S  |   5 +
>  7 files changed, 346 insertions(+)
>  create mode 100644 sysdeps/x86_64/multiarch/strlen-evex-base.S
>  create mode 100644 sysdeps/x86_64/multiarch/strlen-evex512.S
>  create mode 100644 sysdeps/x86_64/multiarch/strnlen-evex512.S
>  create mode 100644 sysdeps/x86_64/multiarch/wcslen-evex512.S
>  create mode 100644 sysdeps/x86_64/multiarch/wcsnlen-evex512.S
>
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index f3ab5e0928..d0869c3ac3 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -81,6 +81,7 @@ sysdep_routines += \
>    strlen-avx2 \
>    strlen-avx2-rtm \
>    strlen-evex \
> +  strlen-evex512 \
>    strlen-sse2 \
>    strncase_l-avx2 \
>    strncase_l-avx2-rtm \
> @@ -105,6 +106,7 @@ sysdep_routines += \
>    strnlen-avx2 \
>    strnlen-avx2-rtm \
>    strnlen-evex \
> +  strnlen-evex512 \
>    strnlen-sse2 \
>    strpbrk-c \
>    strpbrk-sse2 \
> @@ -138,6 +140,7 @@ sysdep_routines += \
>    wcslen-avx2 \
>    wcslen-avx2-rtm \
>    wcslen-evex \
> +  wcslen-evex512 \
>    wcslen-sse2 \
>    wcslen-sse4_1 \
>    wcsncmp-avx2 \
> @@ -148,6 +151,7 @@ sysdep_routines += \
>    wcsnlen-avx2-rtm \
>    wcsnlen-c \
>    wcsnlen-evex \
> +  wcsnlen-evex512 \
>    wcsnlen-sse4_1 \
>    wcsrchr-avx2 \
>    wcsrchr-avx2-rtm \
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index 7218095430..c5cd9466fe 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -328,6 +328,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>                                && CPU_FEATURE_USABLE (AVX512BW)
>                                && CPU_FEATURE_USABLE (BMI2)),
>                               __strlen_evex)
> +             IFUNC_IMPL_ADD (array, i, strlen,
> +                             (CPU_FEATURE_USABLE (AVX512VL)
> +                              && CPU_FEATURE_USABLE (AVX512BW)
> +                              && CPU_FEATURE_USABLE (BMI2)),
> +                             __strlen_evex512)
>               IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2))
>
>    /* Support sysdeps/x86_64/multiarch/strnlen.c.  */
> @@ -346,6 +351,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>                                && CPU_FEATURE_USABLE (AVX512BW)
>                                && CPU_FEATURE_USABLE (BMI2)),
>                               __strnlen_evex)
> +             IFUNC_IMPL_ADD (array, i, strnlen,
> +                             (CPU_FEATURE_USABLE (AVX512VL)
> +                              && CPU_FEATURE_USABLE (AVX512BW)
> +                              && CPU_FEATURE_USABLE (BMI2)),
> +                             __strnlen_evex512)
>               IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_sse2))
>
>    /* Support sysdeps/x86_64/multiarch/stpncpy.c.  */
> @@ -699,6 +709,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>                                && CPU_FEATURE_USABLE (AVX512BW)
>                                && CPU_FEATURE_USABLE (BMI2)),
>                               __wcslen_evex)
> +             IFUNC_IMPL_ADD (array, i, wcslen,
> +                             (CPU_FEATURE_USABLE (AVX512VL)
> +                              && CPU_FEATURE_USABLE (AVX512BW)
> +                              && CPU_FEATURE_USABLE (BMI2)),
> +                             __wcslen_evex512)
>               IFUNC_IMPL_ADD (array, i, wcslen,
>                               CPU_FEATURE_USABLE (SSE4_1),
>                               __wcslen_sse4_1)
> @@ -720,6 +735,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>                                && CPU_FEATURE_USABLE (AVX512BW)
>                                && CPU_FEATURE_USABLE (BMI2)),
>                               __wcsnlen_evex)
> +             IFUNC_IMPL_ADD (array, i, wcsnlen,
> +                             (CPU_FEATURE_USABLE (AVX512VL)
> +                              && CPU_FEATURE_USABLE (AVX512BW)
> +                              && CPU_FEATURE_USABLE (BMI2)),
> +                             __wcsnlen_evex512)
>               IFUNC_IMPL_ADD (array, i, wcsnlen,
>                               CPU_FEATURE_USABLE (SSE4_1),
>                               __wcsnlen_sse4_1)
> diff --git a/sysdeps/x86_64/multiarch/strlen-evex-base.S b/sysdeps/x86_64/multiarch/strlen-evex-base.S
> new file mode 100644
> index 0000000000..278c899691
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strlen-evex-base.S
> @@ -0,0 +1,302 @@
> +/* Placeholder function, not used by any processor at the moment.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#if IS_IN (libc)
> +
> +# include <sysdep.h>
> +
> +# ifdef USE_AS_WCSLEN
> +#  define VPCMP                vpcmpd
> +#  define VPTESTN      vptestnmd
> +#  define VPMINU       vpminud
> +#  define CHAR_SIZE    4
> +# else
> +#  define VPCMP                vpcmpb
> +#  define VPTESTN      vptestnmb
> +#  define VPMINU       vpminub
> +#  define CHAR_SIZE    1
> +# endif
> +
> +# define XMM0          xmm16
> +# define PAGE_SIZE     4096
> +# define CHAR_PER_VEC  (VEC_SIZE / CHAR_SIZE)
> +
> +# if VEC_SIZE == 64
> +#  define KMOV         kmovq
> +#  define KORTEST      kortestq
> +#  define RAX          rax
> +#  define RCX          rcx
> +#  define RDX          rdx
> +#  define SHR          shrq
> +#  define TEXTSUFFIX   evex512
> +#  define VMM0         zmm16
> +#  define VMM1         zmm17
> +#  define VMM2         zmm18
> +#  define VMM3         zmm19
> +#  define VMM4         zmm20
> +#  define VMOVA                vmovdqa64
> +# elif VEC_SIZE == 32
> +/* Currently Unused.  */
> +#  define KMOV         kmovd
> +#  define KORTEST      kortestd
> +#  define RAX          eax
> +#  define RCX          ecx
> +#  define RDX          edx
> +#  define SHR          shrl
> +#  define TEXTSUFFIX   evex256
> +#  define VMM0         ymm16
> +#  define VMM1         ymm17
> +#  define VMM2         ymm18
> +#  define VMM3         ymm19
> +#  define VMM4         ymm20
> +#  define VMOVA                vmovdqa32
> +# endif
> +
> +       .section .text.TEXTSUFFIX, "ax", @progbits
> +/* Aligning entry point to 64 byte, provides better performance for
> +   one vector length string.  */
> +ENTRY_P2ALIGN (STRLEN, 6)
> +# ifdef USE_AS_STRNLEN
> +       /* Check zero length.  */
> +       test    %RSI_LP, %RSI_LP
> +       jz      L(ret_max)
> +#  ifdef __ILP32__
> +       /* Clear the upper 32 bits.  */
> +       movl    %esi, %esi
> +#  endif
> +# endif
> +
> +       movl    %edi, %eax
> +       vpxorq  %XMM0, %XMM0, %XMM0
> +       andl    $(PAGE_SIZE - 1), %eax
> +       cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
> +       ja      L(page_cross)
> +
> +       /* Compare [w]char for null, mask bit will be set for match.  */
> +       VPCMP   $0, (%rdi), %VMM0, %k0
> +       KMOV    %k0, %RAX
> +       test    %RAX, %RAX
> +       jz      L(align_more)
> +
> +       bsf     %RAX, %RAX
> +# ifdef USE_AS_STRNLEN
> +       cmpq    %rsi, %rax
> +       cmovnb  %rsi, %rax
> +# endif
> +       ret
> +
> +       /* At this point vector max length reached.  */
> +# ifdef USE_AS_STRNLEN
> +       .p2align 4,,3
> +L(ret_max):
> +       movq    %rsi, %rax
> +       ret
> +# endif
> +
> +L(align_more):
> +       leaq    VEC_SIZE(%rdi), %rax
> +       /* Align rax to VEC_SIZE.  */
> +       andq    $-VEC_SIZE, %rax
> +# ifdef USE_AS_STRNLEN
> +       movq    %rax, %rdx
> +       subq    %rdi, %rdx
> +#  ifdef USE_AS_WCSLEN
> +       SHR     $2, %RDX
> +#  endif
> +       /* At this point rdx contains [w]chars already compared.  */
> +       subq    %rsi, %rdx
> +       jae     L(ret_max)
> +       negq    %rdx
> +       /* At this point rdx contains number of w[char] needs to go.
> +          Now onwards rdx will keep decrementing with each compare.  */
> +# endif
> +
> +       /* Loop unroll 4 times for 4 vector loop.  */
> +       VPCMP   $0, (%rax), %VMM0, %k0
> +       KMOV    %k0, %RCX
> +       test    %RCX, %RCX
> +       jnz     L(ret_vec_x1)
> +
> +# ifdef USE_AS_STRNLEN
> +       subq    $CHAR_PER_VEC, %rdx
> +       jbe     L(ret_max)
> +# endif
> +
> +       VPCMP   $0, VEC_SIZE(%rax), %VMM0, %k0
> +       KMOV    %k0, %RCX
> +       test    %RCX, %RCX
> +       jnz     L(ret_vec_x2)
> +
> +# ifdef USE_AS_STRNLEN
> +       subq    $CHAR_PER_VEC, %rdx
> +       jbe     L(ret_max)
> +# endif
> +
> +       VPCMP   $0, (VEC_SIZE * 2)(%rax), %VMM0, %k0
> +       KMOV    %k0, %RCX
> +       test    %RCX, %RCX
> +       jnz     L(ret_vec_x3)
> +
> +# ifdef USE_AS_STRNLEN
> +       subq    $CHAR_PER_VEC, %rdx
> +       jbe     L(ret_max)
> +# endif
> +
> +       VPCMP   $0, (VEC_SIZE * 3)(%rax), %VMM0, %k0
> +       KMOV    %k0, %RCX
> +       test    %RCX, %RCX
> +       jnz     L(ret_vec_x4)
> +
> +# ifdef USE_AS_STRNLEN
> +       subq    $CHAR_PER_VEC, %rdx
> +       jbe     L(ret_max)
> +       /* Save pointer before 4 x VEC_SIZE alignment.  */
> +       movq    %rax, %rcx
> +# endif
> +
> +       /* Align address to VEC_SIZE * 4 for loop.  */
> +       andq    $-(VEC_SIZE * 4), %rax
> +
> +# ifdef USE_AS_STRNLEN
> +       subq    %rax, %rcx
> +#  ifdef USE_AS_WCSLEN
> +       SHR     $2, %RCX
> +#  endif
> +       /* rcx contains the number of [w]chars that will be recompared
> +          due to the alignment fix-up.  rdx must be incremented by rcx
> +          to offset the alignment adjustment.  */
> +       addq    %rcx, %rdx
> +       /* Jump is needed because rdx must not be adjusted on the first
> +          iteration of the 4 x VEC_SIZE aligned loop.  */
> +       jmp     L(loop_entry)
> +# endif
> +
> +       .p2align 4,,11
> +L(loop):
> +# ifdef USE_AS_STRNLEN
> +       subq    $(CHAR_PER_VEC * 4), %rdx
> +       jbe     L(ret_max)
> +L(loop_entry):
> +# endif
> +       /* The VPMINU and VPCMP combination provides better performance
> +          than the alternative combinations.  */
> +       VMOVA   (VEC_SIZE * 4)(%rax), %VMM1
> +       VPMINU  (VEC_SIZE * 5)(%rax), %VMM1, %VMM2
> +       VMOVA   (VEC_SIZE * 6)(%rax), %VMM3
> +       VPMINU  (VEC_SIZE * 7)(%rax), %VMM3, %VMM4
> +
> +       VPTESTN %VMM2, %VMM2, %k0
> +       VPTESTN %VMM4, %VMM4, %k1
> +
> +       subq    $-(VEC_SIZE * 4), %rax
> +       KORTEST %k0, %k1
> +       jz      L(loop)
> +
> +       VPTESTN %VMM1, %VMM1, %k2
> +       KMOV    %k2, %RCX
> +       test    %RCX, %RCX
> +       jnz     L(ret_vec_x1)
> +
> +       KMOV    %k0, %RCX
> +       /* At this point, if k0 is non-zero, the null char must be in
> +          the second vector.  */
> +       test    %RCX, %RCX
> +       jnz     L(ret_vec_x2)
> +
> +       VPTESTN %VMM3, %VMM3, %k3
> +       KMOV    %k3, %RCX
> +       test    %RCX, %RCX
> +       jnz     L(ret_vec_x3)
> +       /* At this point the null [w]char must be in the fourth vector,
> +          so there is no need to check.  */
> +       KMOV    %k1, %RCX
> +
> +       /* Terminating in the fourth, third and second vector is handled
> +          almost identically; it is implemented this way to avoid
> +          branching and to reuse code from the pre-loop exit paths.  */
> +L(ret_vec_x4):
> +       bsf     %RCX, %RCX
> +       subq    %rdi, %rax
> +# ifdef USE_AS_WCSLEN
> +       subq    $-(VEC_SIZE * 3), %rax
> +       shrq    $2, %rax
> +       addq    %rcx, %rax
> +# else
> +       leaq    (VEC_SIZE * 3)(%rcx, %rax), %rax
> +# endif
> +# ifdef USE_AS_STRNLEN
> +       cmpq    %rsi, %rax
> +       cmovnb  %rsi, %rax
> +# endif
> +       ret
> +
> +L(ret_vec_x3):
> +       bsf     %RCX, %RCX
> +       subq    %rdi, %rax
> +# ifdef USE_AS_WCSLEN
> +       subq    $-(VEC_SIZE * 2), %rax
> +       shrq    $2, %rax
> +       addq    %rcx, %rax
> +# else
> +       leaq    (VEC_SIZE * 2)(%rcx, %rax), %rax
> +# endif
> +# ifdef USE_AS_STRNLEN
> +       cmpq    %rsi, %rax
> +       cmovnb  %rsi, %rax
> +# endif
> +       ret
> +
> +L(ret_vec_x2):
> +       subq    $-VEC_SIZE, %rax
> +L(ret_vec_x1):
> +       bsf     %RCX, %RCX
> +       subq    %rdi, %rax
> +# ifdef USE_AS_WCSLEN
> +       shrq    $2, %rax
> +# endif
> +       addq    %rcx, %rax
> +# ifdef USE_AS_STRNLEN
> +       cmpq    %rsi, %rax
> +       cmovnb  %rsi, %rax
> +# endif
> +       ret
> +
> +L(page_cross):
> +       movl    %eax, %ecx
> +# ifdef USE_AS_WCSLEN
> +       andl    $(VEC_SIZE - 1), %ecx
> +       sarl    $2, %ecx
> +# endif
> +       /* ecx contains the number of [w]chars to be skipped as a
> +          result of address alignment.  */
> +       xorq    %rdi, %rax
> +       VPCMP   $0, (PAGE_SIZE - VEC_SIZE)(%rax), %VMM0, %k0
> +       KMOV    %k0, %RAX
> +       /* Discard the match bits for characters before the string start
> +          that were included by the alignment adjustment.  */
> +       SHR     %cl, %RAX
> +       jz      L(align_more)
> +
> +       bsf     %RAX, %RAX
> +# ifdef USE_AS_STRNLEN
> +       cmpq    %rsi, %rax
> +       cmovnb  %rsi, %rax
> +# endif
> +       ret
> +
> +END (STRLEN)
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/strlen-evex512.S b/sysdeps/x86_64/multiarch/strlen-evex512.S
> new file mode 100644
> index 0000000000..116f8981c8
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strlen-evex512.S
> @@ -0,0 +1,7 @@
> +#ifndef STRLEN
> +# define STRLEN                __strlen_evex512
> +#endif
> +
> +#define VEC_SIZE       64
> +
> +#include "strlen-evex-base.S"
> diff --git a/sysdeps/x86_64/multiarch/strnlen-evex512.S b/sysdeps/x86_64/multiarch/strnlen-evex512.S
> new file mode 100644
> index 0000000000..0b7f220214
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strnlen-evex512.S
> @@ -0,0 +1,4 @@
> +#define STRLEN __strnlen_evex512
> +#define USE_AS_STRNLEN 1
> +
> +#include "strlen-evex512.S"
> diff --git a/sysdeps/x86_64/multiarch/wcslen-evex512.S b/sysdeps/x86_64/multiarch/wcslen-evex512.S
> new file mode 100644
> index 0000000000..f59c372b78
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/wcslen-evex512.S
> @@ -0,0 +1,4 @@
> +#define STRLEN __wcslen_evex512
> +#define USE_AS_WCSLEN 1
> +
> +#include "strlen-evex512.S"
> diff --git a/sysdeps/x86_64/multiarch/wcsnlen-evex512.S b/sysdeps/x86_64/multiarch/wcsnlen-evex512.S
> new file mode 100644
> index 0000000000..73dcf2f210
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/wcsnlen-evex512.S
> @@ -0,0 +1,5 @@
> +#define STRLEN __wcsnlen_evex512
> +#define USE_AS_WCSLEN 1
> +#define USE_AS_STRNLEN 1
> +
> +#include "strlen-evex512.S"
> --
> 2.35.3
>

LGTM.

Reviewed-by: Noah Goldstein <goldstein.w.n@gmail.com>
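
For readers following the assembly above, the sketch below is a rough,
scalar C model of the same control flow: scan the first vector at the
(possibly unaligned) start, round up to a vector boundary, then scan four
vectors per iteration.  It is only an illustration under simplifying
assumptions -- VEC_BYTES, block_scan and strlen_model are names invented
here, the byte-by-byte reads make the page-cross special case and the
EVEX mask registers unnecessary, and the strnlen/wcslen variants and the
VEC_SIZE * 4 re-alignment are omitted.  It is not the glibc code.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define VEC_BYTES 64   /* Stands in for VEC_SIZE in the evex512 build.  */

/* Index of the first NUL in the VEC_BYTES bytes at P, or VEC_BYTES if
   there is none.  Stands in for VPCMP/VPTESTN + KMOV + bsf, but reads
   byte by byte and so never touches memory past the terminator.  */
static size_t
block_scan (const char *p)
{
  for (size_t i = 0; i < VEC_BYTES; i++)
    if (p[i] == '\0')
      return i;
  return VEC_BYTES;
}

static size_t
strlen_model (const char *s)
{
  /* First vector at the possibly unaligned start.  */
  size_t i = block_scan (s);
  if (i < VEC_BYTES)
    return i;

  /* L(align_more): round up to the next VEC_BYTES boundary.  The bytes
     skipped over were already covered by the first scan.  */
  const char *p = (const char *) (((uintptr_t) s + VEC_BYTES)
                                  & ~(uintptr_t) (VEC_BYTES - 1));

  /* Main loop: four vectors per iteration, mirroring the unrolled
     VMOVA/VPMINU/KORTEST loop.  */
  for (;;)
    for (int v = 0; v < 4; v++)
      {
        i = block_scan (p);
        if (i < VEC_BYTES)
          return (size_t) (p - s) + i;
        p += VEC_BYTES;
      }
}

int
main (void)
{
  const char buf[] = "a moderately long string to exercise the block scanner";
  printf ("%zu == %zu\n", strlen_model (buf), strlen (buf));
  return 0;
}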

^ permalink raw reply	[flat|nested] 12+ messages in thread

* Re: [PATCH v4] x86_64: Implement evex512 version of strlen, strnlen,  wcslen and wcsnlen
  2022-05-26 20:07                 ` Noah Goldstein
@ 2022-07-14  0:03                   ` Sunil Pandey
  0 siblings, 0 replies; 12+ messages in thread
From: Sunil Pandey @ 2022-07-14  0:03 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library

On Thu, May 26, 2022 at 1:07 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Thu, May 26, 2022 at 1:36 PM Sunil K Pandey via Libc-alpha
> <libc-alpha@sourceware.org> wrote:
> >
> > This patch implements the following evex512 versions of string functions.
> > The perf gain for the evex512 version is up to 50% compared to evex,
> > depending on length and alignment.
> >
> > Placeholder function, not used by any processor at the moment.
> >
> > - String length function using 512 bit vectors.
> > - String N length using 512 bit vectors.
> > - Wide string length using 512 bit vectors.
> > - Wide string N length using 512 bit vectors.
> > ---
> >  sysdeps/x86_64/multiarch/Makefile           |   4 +
> >  sysdeps/x86_64/multiarch/ifunc-impl-list.c  |  20 ++
> >  sysdeps/x86_64/multiarch/strlen-evex-base.S | 302 ++++++++++++++++++++
> >  sysdeps/x86_64/multiarch/strlen-evex512.S   |   7 +
> >  sysdeps/x86_64/multiarch/strnlen-evex512.S  |   4 +
> >  sysdeps/x86_64/multiarch/wcslen-evex512.S   |   4 +
> >  sysdeps/x86_64/multiarch/wcsnlen-evex512.S  |   5 +
> >  7 files changed, 346 insertions(+)
> >  create mode 100644 sysdeps/x86_64/multiarch/strlen-evex-base.S
> >  create mode 100644 sysdeps/x86_64/multiarch/strlen-evex512.S
> >  create mode 100644 sysdeps/x86_64/multiarch/strnlen-evex512.S
> >  create mode 100644 sysdeps/x86_64/multiarch/wcslen-evex512.S
> >  create mode 100644 sysdeps/x86_64/multiarch/wcsnlen-evex512.S
> >
> > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> > index f3ab5e0928..d0869c3ac3 100644
> > --- a/sysdeps/x86_64/multiarch/Makefile
> > +++ b/sysdeps/x86_64/multiarch/Makefile
> > @@ -81,6 +81,7 @@ sysdep_routines += \
> >    strlen-avx2 \
> >    strlen-avx2-rtm \
> >    strlen-evex \
> > +  strlen-evex512 \
> >    strlen-sse2 \
> >    strncase_l-avx2 \
> >    strncase_l-avx2-rtm \
> > @@ -105,6 +106,7 @@ sysdep_routines += \
> >    strnlen-avx2 \
> >    strnlen-avx2-rtm \
> >    strnlen-evex \
> > +  strnlen-evex512 \
> >    strnlen-sse2 \
> >    strpbrk-c \
> >    strpbrk-sse2 \
> > @@ -138,6 +140,7 @@ sysdep_routines += \
> >    wcslen-avx2 \
> >    wcslen-avx2-rtm \
> >    wcslen-evex \
> > +  wcslen-evex512 \
> >    wcslen-sse2 \
> >    wcslen-sse4_1 \
> >    wcsncmp-avx2 \
> > @@ -148,6 +151,7 @@ sysdep_routines += \
> >    wcsnlen-avx2-rtm \
> >    wcsnlen-c \
> >    wcsnlen-evex \
> > +  wcsnlen-evex512 \
> >    wcsnlen-sse4_1 \
> >    wcsrchr-avx2 \
> >    wcsrchr-avx2-rtm \
> > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > index 7218095430..c5cd9466fe 100644
> > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > @@ -328,6 +328,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> >                                && CPU_FEATURE_USABLE (AVX512BW)
> >                                && CPU_FEATURE_USABLE (BMI2)),
> >                               __strlen_evex)
> > +             IFUNC_IMPL_ADD (array, i, strlen,
> > +                             (CPU_FEATURE_USABLE (AVX512VL)
> > +                              && CPU_FEATURE_USABLE (AVX512BW)
> > +                              && CPU_FEATURE_USABLE (BMI2)),
> > +                             __strlen_evex512)
> >               IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2))
> >
> >    /* Support sysdeps/x86_64/multiarch/strnlen.c.  */
> > @@ -346,6 +351,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> >                                && CPU_FEATURE_USABLE (AVX512BW)
> >                                && CPU_FEATURE_USABLE (BMI2)),
> >                               __strnlen_evex)
> > +             IFUNC_IMPL_ADD (array, i, strnlen,
> > +                             (CPU_FEATURE_USABLE (AVX512VL)
> > +                              && CPU_FEATURE_USABLE (AVX512BW)
> > +                              && CPU_FEATURE_USABLE (BMI2)),
> > +                             __strnlen_evex512)
> >               IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_sse2))
> >
> >    /* Support sysdeps/x86_64/multiarch/stpncpy.c.  */
> > @@ -699,6 +709,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> >                                && CPU_FEATURE_USABLE (AVX512BW)
> >                                && CPU_FEATURE_USABLE (BMI2)),
> >                               __wcslen_evex)
> > +             IFUNC_IMPL_ADD (array, i, wcslen,
> > +                             (CPU_FEATURE_USABLE (AVX512VL)
> > +                              && CPU_FEATURE_USABLE (AVX512BW)
> > +                              && CPU_FEATURE_USABLE (BMI2)),
> > +                             __wcslen_evex512)
> >               IFUNC_IMPL_ADD (array, i, wcslen,
> >                               CPU_FEATURE_USABLE (SSE4_1),
> >                               __wcslen_sse4_1)
> > @@ -720,6 +735,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> >                                && CPU_FEATURE_USABLE (AVX512BW)
> >                                && CPU_FEATURE_USABLE (BMI2)),
> >                               __wcsnlen_evex)
> > +             IFUNC_IMPL_ADD (array, i, wcsnlen,
> > +                             (CPU_FEATURE_USABLE (AVX512VL)
> > +                              && CPU_FEATURE_USABLE (AVX512BW)
> > +                              && CPU_FEATURE_USABLE (BMI2)),
> > +                             __wcsnlen_evex512)
> >               IFUNC_IMPL_ADD (array, i, wcsnlen,
> >                               CPU_FEATURE_USABLE (SSE4_1),
> >                               __wcsnlen_sse4_1)
> > diff --git a/sysdeps/x86_64/multiarch/strlen-evex-base.S b/sysdeps/x86_64/multiarch/strlen-evex-base.S
> > new file mode 100644
> > index 0000000000..278c899691
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/strlen-evex-base.S
> > @@ -0,0 +1,302 @@
> > +/* Placeholder function, not used by any processor at the moment.
> > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#if IS_IN (libc)
> > +
> > +# include <sysdep.h>
> > +
> > +# ifdef USE_AS_WCSLEN
> > +#  define VPCMP                vpcmpd
> > +#  define VPTESTN      vptestnmd
> > +#  define VPMINU       vpminud
> > +#  define CHAR_SIZE    4
> > +# else
> > +#  define VPCMP                vpcmpb
> > +#  define VPTESTN      vptestnmb
> > +#  define VPMINU       vpminub
> > +#  define CHAR_SIZE    1
> > +# endif
> > +
> > +# define XMM0          xmm16
> > +# define PAGE_SIZE     4096
> > +# define CHAR_PER_VEC  (VEC_SIZE / CHAR_SIZE)
> > +
> > +# if VEC_SIZE == 64
> > +#  define KMOV         kmovq
> > +#  define KORTEST      kortestq
> > +#  define RAX          rax
> > +#  define RCX          rcx
> > +#  define RDX          rdx
> > +#  define SHR          shrq
> > +#  define TEXTSUFFIX   evex512
> > +#  define VMM0         zmm16
> > +#  define VMM1         zmm17
> > +#  define VMM2         zmm18
> > +#  define VMM3         zmm19
> > +#  define VMM4         zmm20
> > +#  define VMOVA                vmovdqa64
> > +# elif VEC_SIZE == 32
> > +/* Currently Unused.  */
> > +#  define KMOV         kmovd
> > +#  define KORTEST      kortestd
> > +#  define RAX          eax
> > +#  define RCX          ecx
> > +#  define RDX          edx
> > +#  define SHR          shrl
> > +#  define TEXTSUFFIX   evex256
> > +#  define VMM0         ymm16
> > +#  define VMM1         ymm17
> > +#  define VMM2         ymm18
> > +#  define VMM3         ymm19
> > +#  define VMM4         ymm20
> > +#  define VMOVA                vmovdqa32
> > +# endif
> > +
> > +       .section .text.TEXTSUFFIX, "ax", @progbits
> > +/* Aligning the entry point to 64 bytes provides better performance
> > +   for strings of up to one vector length.  */
> > +ENTRY_P2ALIGN (STRLEN, 6)
> > +# ifdef USE_AS_STRNLEN
> > +       /* Check zero length.  */
> > +       test    %RSI_LP, %RSI_LP
> > +       jz      L(ret_max)
> > +#  ifdef __ILP32__
> > +       /* Clear the upper 32 bits.  */
> > +       movl    %esi, %esi
> > +#  endif
> > +# endif
> > +
> > +       movl    %edi, %eax
> > +       vpxorq  %XMM0, %XMM0, %XMM0
> > +       andl    $(PAGE_SIZE - 1), %eax
> > +       cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
> > +       ja      L(page_cross)
> > +
> > +       /* Compare [w]char for null, mask bit will be set for match.  */
> > +       VPCMP   $0, (%rdi), %VMM0, %k0
> > +       KMOV    %k0, %RAX
> > +       test    %RAX, %RAX
> > +       jz      L(align_more)
> > +
> > +       bsf     %RAX, %RAX
> > +# ifdef USE_AS_STRNLEN
> > +       cmpq    %rsi, %rax
> > +       cmovnb  %rsi, %rax
> > +# endif
> > +       ret
> > +
> > +       /* Reached when the maximum length is exhausted.  */
> > +# ifdef USE_AS_STRNLEN
> > +       .p2align 4,,3
> > +L(ret_max):
> > +       movq    %rsi, %rax
> > +       ret
> > +# endif
> > +
> > +L(align_more):
> > +       leaq    VEC_SIZE(%rdi), %rax
> > +       /* Align rax to VEC_SIZE.  */
> > +       andq    $-VEC_SIZE, %rax
> > +# ifdef USE_AS_STRNLEN
> > +       movq    %rax, %rdx
> > +       subq    %rdi, %rdx
> > +#  ifdef USE_AS_WCSLEN
> > +       SHR     $2, %RDX
> > +#  endif
> > +       /* At this point rdx contains the number of [w]chars already
> > +          compared.  */
> > +       subq    %rsi, %rdx
> > +       jae     L(ret_max)
> > +       negq    %rdx
> > +       /* At this point rdx contains the number of [w]chars left to
> > +          compare.  From here on rdx is decremented with each compare.  */
> > +# endif
> > +
> > +       /* Loop unroll 4 times for 4 vector loop.  */
> > +       VPCMP   $0, (%rax), %VMM0, %k0
> > +       KMOV    %k0, %RCX
> > +       test    %RCX, %RCX
> > +       jnz     L(ret_vec_x1)
> > +
> > +# ifdef USE_AS_STRNLEN
> > +       subq    $CHAR_PER_VEC, %rdx
> > +       jbe     L(ret_max)
> > +# endif
> > +
> > +       VPCMP   $0, VEC_SIZE(%rax), %VMM0, %k0
> > +       KMOV    %k0, %RCX
> > +       test    %RCX, %RCX
> > +       jnz     L(ret_vec_x2)
> > +
> > +# ifdef USE_AS_STRNLEN
> > +       subq    $CHAR_PER_VEC, %rdx
> > +       jbe     L(ret_max)
> > +# endif
> > +
> > +       VPCMP   $0, (VEC_SIZE * 2)(%rax), %VMM0, %k0
> > +       KMOV    %k0, %RCX
> > +       test    %RCX, %RCX
> > +       jnz     L(ret_vec_x3)
> > +
> > +# ifdef USE_AS_STRNLEN
> > +       subq    $CHAR_PER_VEC, %rdx
> > +       jbe     L(ret_max)
> > +# endif
> > +
> > +       VPCMP   $0, (VEC_SIZE * 3)(%rax), %VMM0, %k0
> > +       KMOV    %k0, %RCX
> > +       test    %RCX, %RCX
> > +       jnz     L(ret_vec_x4)
> > +
> > +# ifdef USE_AS_STRNLEN
> > +       subq    $CHAR_PER_VEC, %rdx
> > +       jbe     L(ret_max)
> > +       /* Save pointer before 4 x VEC_SIZE alignment.  */
> > +       movq    %rax, %rcx
> > +# endif
> > +
> > +       /* Align address to VEC_SIZE * 4 for loop.  */
> > +       andq    $-(VEC_SIZE * 4), %rax
> > +
> > +# ifdef USE_AS_STRNLEN
> > +       subq    %rax, %rcx
> > +#  ifdef USE_AS_WCSLEN
> > +       SHR     $2, %RCX
> > +#  endif
> > +       /* rcx contains the number of [w]chars that will be recompared
> > +          due to the alignment fix-up.  rdx must be incremented by rcx
> > +          to offset the alignment adjustment.  */
> > +       addq    %rcx, %rdx
> > +       /* Jump is needed because rdx must not be adjusted on the first
> > +          iteration of the 4 x VEC_SIZE aligned loop.  */
> > +       jmp     L(loop_entry)
> > +# endif
> > +
> > +       .p2align 4,,11
> > +L(loop):
> > +# ifdef USE_AS_STRNLEN
> > +       subq    $(CHAR_PER_VEC * 4), %rdx
> > +       jbe     L(ret_max)
> > +L(loop_entry):
> > +# endif
> > +       /* The VPMINU and VPCMP combination provides better performance
> > +          than the alternative combinations.  */
> > +       VMOVA   (VEC_SIZE * 4)(%rax), %VMM1
> > +       VPMINU  (VEC_SIZE * 5)(%rax), %VMM1, %VMM2
> > +       VMOVA   (VEC_SIZE * 6)(%rax), %VMM3
> > +       VPMINU  (VEC_SIZE * 7)(%rax), %VMM3, %VMM4
> > +
> > +       VPTESTN %VMM2, %VMM2, %k0
> > +       VPTESTN %VMM4, %VMM4, %k1
> > +
> > +       subq    $-(VEC_SIZE * 4), %rax
> > +       KORTEST %k0, %k1
> > +       jz      L(loop)
> > +
> > +       VPTESTN %VMM1, %VMM1, %k2
> > +       KMOV    %k2, %RCX
> > +       test    %RCX, %RCX
> > +       jnz     L(ret_vec_x1)
> > +
> > +       KMOV    %k0, %RCX
> > +       /* At this point, if k0 is non-zero, the null char must be in
> > +          the second vector.  */
> > +       test    %RCX, %RCX
> > +       jnz     L(ret_vec_x2)
> > +
> > +       VPTESTN %VMM3, %VMM3, %k3
> > +       KMOV    %k3, %RCX
> > +       test    %RCX, %RCX
> > +       jnz     L(ret_vec_x3)
> > +       /* At this point the null [w]char must be in the fourth vector,
> > +          so there is no need to check.  */
> > +       KMOV    %k1, %RCX
> > +
> > +       /* Terminating in the fourth, third and second vector is handled
> > +          almost identically; it is implemented this way to avoid
> > +          branching and to reuse code from the pre-loop exit paths.  */
> > +L(ret_vec_x4):
> > +       bsf     %RCX, %RCX
> > +       subq    %rdi, %rax
> > +# ifdef USE_AS_WCSLEN
> > +       subq    $-(VEC_SIZE * 3), %rax
> > +       shrq    $2, %rax
> > +       addq    %rcx, %rax
> > +# else
> > +       leaq    (VEC_SIZE * 3)(%rcx, %rax), %rax
> > +# endif
> > +# ifdef USE_AS_STRNLEN
> > +       cmpq    %rsi, %rax
> > +       cmovnb  %rsi, %rax
> > +# endif
> > +       ret
> > +
> > +L(ret_vec_x3):
> > +       bsf     %RCX, %RCX
> > +       subq    %rdi, %rax
> > +# ifdef USE_AS_WCSLEN
> > +       subq    $-(VEC_SIZE * 2), %rax
> > +       shrq    $2, %rax
> > +       addq    %rcx, %rax
> > +# else
> > +       leaq    (VEC_SIZE * 2)(%rcx, %rax), %rax
> > +# endif
> > +# ifdef USE_AS_STRNLEN
> > +       cmpq    %rsi, %rax
> > +       cmovnb  %rsi, %rax
> > +# endif
> > +       ret
> > +
> > +L(ret_vec_x2):
> > +       subq    $-VEC_SIZE, %rax
> > +L(ret_vec_x1):
> > +       bsf     %RCX, %RCX
> > +       subq    %rdi, %rax
> > +# ifdef USE_AS_WCSLEN
> > +       shrq    $2, %rax
> > +# endif
> > +       addq    %rcx, %rax
> > +# ifdef USE_AS_STRNLEN
> > +       cmpq    %rsi, %rax
> > +       cmovnb  %rsi, %rax
> > +# endif
> > +       ret
> > +
> > +L(page_cross):
> > +       movl    %eax, %ecx
> > +# ifdef USE_AS_WCSLEN
> > +       andl    $(VEC_SIZE - 1), %ecx
> > +       sarl    $2, %ecx
> > +# endif
> > +       /* ecx contains the number of [w]chars to be skipped as a
> > +          result of address alignment.  */
> > +       xorq    %rdi, %rax
> > +       VPCMP   $0, (PAGE_SIZE - VEC_SIZE)(%rax), %VMM0, %k0
> > +       KMOV    %k0, %RAX
> > +       /* Discard the match bits for characters before the string start
> > +          that were included by the alignment adjustment.  */
> > +       SHR     %cl, %RAX
> > +       jz      L(align_more)
> > +
> > +       bsf     %RAX, %RAX
> > +# ifdef USE_AS_STRNLEN
> > +       cmpq    %rsi, %rax
> > +       cmovnb  %rsi, %rax
> > +# endif
> > +       ret
> > +
> > +END (STRLEN)
> > +#endif
> > diff --git a/sysdeps/x86_64/multiarch/strlen-evex512.S b/sysdeps/x86_64/multiarch/strlen-evex512.S
> > new file mode 100644
> > index 0000000000..116f8981c8
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/strlen-evex512.S
> > @@ -0,0 +1,7 @@
> > +#ifndef STRLEN
> > +# define STRLEN                __strlen_evex512
> > +#endif
> > +
> > +#define VEC_SIZE       64
> > +
> > +#include "strlen-evex-base.S"
> > diff --git a/sysdeps/x86_64/multiarch/strnlen-evex512.S b/sysdeps/x86_64/multiarch/strnlen-evex512.S
> > new file mode 100644
> > index 0000000000..0b7f220214
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/strnlen-evex512.S
> > @@ -0,0 +1,4 @@
> > +#define STRLEN __strnlen_evex512
> > +#define USE_AS_STRNLEN 1
> > +
> > +#include "strlen-evex512.S"
> > diff --git a/sysdeps/x86_64/multiarch/wcslen-evex512.S b/sysdeps/x86_64/multiarch/wcslen-evex512.S
> > new file mode 100644
> > index 0000000000..f59c372b78
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/wcslen-evex512.S
> > @@ -0,0 +1,4 @@
> > +#define STRLEN __wcslen_evex512
> > +#define USE_AS_WCSLEN 1
> > +
> > +#include "strlen-evex512.S"
> > diff --git a/sysdeps/x86_64/multiarch/wcsnlen-evex512.S b/sysdeps/x86_64/multiarch/wcsnlen-evex512.S
> > new file mode 100644
> > index 0000000000..73dcf2f210
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/wcsnlen-evex512.S
> > @@ -0,0 +1,5 @@
> > +#define STRLEN __wcsnlen_evex512
> > +#define USE_AS_WCSLEN 1
> > +#define USE_AS_STRNLEN 1
> > +
> > +#include "strlen-evex512.S"
> > --
> > 2.35.3
> >
>
> LGTM.
>
> Reviewed-by: Noah Goldstein <goldstein.w.n@gmail.com>

I would like to backport this patch to release branches.
Any comments or objections?

--Sunil
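
A quick way to sanity-check a backport at the public-API level is a
standalone sweep over start alignments and lengths around the 64-byte
vector size, as sketched below.  This is only a rough smoke test written
for illustration, not part of glibc's test suite; and since the new
variants are placeholders that the resolver does not yet select,
exercising them directly still goes through glibc's own ifunc-aware
string/ and wcsmbs/ tests, which iterate over every implementation
reported by the impl list.

#include <assert.h>
#include <stdlib.h>
#include <string.h>
#include <wchar.h>

int
main (void)
{
  enum { BUF = 8192 };
  char *cbuf = malloc (BUF);
  wchar_t *wbuf = malloc (BUF * sizeof (wchar_t));
  assert (cbuf != NULL && wbuf != NULL);

  /* Sweep start alignments and lengths around the 64-byte vector size
     so the first-vector checks, the alignment code and the 4 x VEC loop
     all get exercised (whether the page-cross path is hit depends on
     where malloc places the buffer).  */
  for (size_t align = 0; align < 128; align++)
    for (size_t len = 0; len < 1024; len += 7)
      {
        memset (cbuf, 'x', BUF);
        cbuf[align + len] = '\0';
        assert (strlen (cbuf + align) == len);
        assert (strnlen (cbuf + align, len + 32) == len);
        assert (strnlen (cbuf + align, len / 2) == len / 2);

        wmemset (wbuf, L'x', BUF);
        wbuf[align + len] = L'\0';
        assert (wcslen (wbuf + align) == len);
        assert (wcsnlen (wbuf + align, len + 32) == len);
        assert (wcsnlen (wbuf + align, len / 2) == len / 2);
      }

  free (cbuf);
  free (wbuf);
  return 0;
}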

^ permalink raw reply	[flat|nested] 12+ messages in thread

end of thread, other threads:[~2022-07-14  0:04 UTC | newest]

Thread overview: 12+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-05-18 18:59 [PATCH] x86_64: Implement evex512 version of strlen, strnlen, wcslen and wcsnlen Sunil K Pandey
2022-05-18 20:29 ` Noah Goldstein
2022-05-19  3:33   ` Sunil Pandey
2022-05-19  3:48     ` [PATCH v2] " Sunil K Pandey
2022-05-19 15:03       ` Noah Goldstein
2022-05-25 13:43         ` [PATCH v3] " Sunil K Pandey
2022-05-25 17:10           ` Noah Goldstein
2022-05-25 18:20             ` Sunil Pandey
2022-05-26 18:35               ` [PATCH v4] " Sunil K Pandey
2022-05-26 20:07                 ` Noah Goldstein
2022-07-14  0:03                   ` Sunil Pandey
2022-05-19  4:41     ` [PATCH] " Noah Goldstein

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).