public inbox for libc-alpha@sourceware.org
* [PATCH] x86_64: Implement evex512 version of strchrnul, strchr and wcschr
@ 2022-09-22  0:16 Sunil K Pandey
  2022-09-22  0:50 ` Noah Goldstein
  0 siblings, 1 reply; 10+ messages in thread
From: Sunil K Pandey @ 2022-09-22  0:16 UTC (permalink / raw)
  To: libc-alpha; +Cc: hjl.tools

This patch implements the following evex512 versions of string functions.
The evex512 version takes up to 30% fewer cycles than the evex version,
depending on length and alignment.

- strchrnul function using 512-bit vectors.
- strchr function using 512-bit vectors.
- wcschr function using 512-bit vectors.

Code size data:

strchrnul-evex.o	615 bytes
strchrnul-evex512.o	573 bytes (-7%)

strchr-evex.o		670 bytes
strchr-evex512.o	616 bytes (-8%)

wcschr-evex.o		678 bytes
wcschr-evex512.o	620 bytes (-9%)

Placeholder function, not used by any processor at the moment.
---
 sysdeps/x86_64/multiarch/Makefile            |   3 +
 sysdeps/x86_64/multiarch/ifunc-impl-list.c   |  12 +
 sysdeps/x86_64/multiarch/strchr-evex-base.S  | 294 +++++++++++++++++++
 sysdeps/x86_64/multiarch/strchr-evex512.S    |   7 +
 sysdeps/x86_64/multiarch/strchrnul-evex512.S |   8 +
 sysdeps/x86_64/multiarch/wcschr-evex512.S    |   8 +
 6 files changed, 332 insertions(+)
 create mode 100644 sysdeps/x86_64/multiarch/strchr-evex-base.S
 create mode 100644 sysdeps/x86_64/multiarch/strchr-evex512.S
 create mode 100644 sysdeps/x86_64/multiarch/strchrnul-evex512.S
 create mode 100644 sysdeps/x86_64/multiarch/wcschr-evex512.S

diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index df4601c294..89b58fa557 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -60,11 +60,13 @@ sysdep_routines += \
   strchr-avx2 \
   strchr-avx2-rtm \
   strchr-evex \
+  strchr-evex512 \
   strchr-sse2 \
   strchr-sse2-no-bsf \
   strchrnul-avx2 \
   strchrnul-avx2-rtm \
   strchrnul-evex \
+  strchrnul-evex512 \
   strchrnul-sse2 \
   strcmp-avx2 \
   strcmp-avx2-rtm \
@@ -129,6 +131,7 @@ sysdep_routines += \
   wcschr-avx2 \
   wcschr-avx2-rtm \
   wcschr-evex \
+  wcschr-evex512 \
   wcschr-sse2 \
   wcscmp-avx2 \
   wcscmp-avx2-rtm \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index a71444eccb..bce1d15171 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -518,6 +518,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 				      && CPU_FEATURE_USABLE (AVX512BW)
 				      && CPU_FEATURE_USABLE (BMI2)),
 				     __strchr_evex)
+	      X86_IFUNC_IMPL_ADD_V4 (array, i, strchr,
+				     (CPU_FEATURE_USABLE (AVX512VL)
+				      && CPU_FEATURE_USABLE (AVX512BW)),
+				     __strchr_evex512)
 	      X86_IFUNC_IMPL_ADD_V3 (array, i, strchr,
 				     (CPU_FEATURE_USABLE (AVX2)
 				      && CPU_FEATURE_USABLE (BMI2)),
@@ -543,6 +547,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 				      && CPU_FEATURE_USABLE (AVX512BW)
 				      && CPU_FEATURE_USABLE (BMI2)),
 				     __strchrnul_evex)
+	      X86_IFUNC_IMPL_ADD_V4 (array, i, strchrnul,
+				     (CPU_FEATURE_USABLE (AVX512VL)
+				      && CPU_FEATURE_USABLE (AVX512BW)),
+				     __strchrnul_evex512)
 	      X86_IFUNC_IMPL_ADD_V3 (array, i, strchrnul,
 				     (CPU_FEATURE_USABLE (AVX2)
 				      && CPU_FEATURE_USABLE (BMI2)),
@@ -753,6 +761,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 				      && CPU_FEATURE_USABLE (AVX512BW)
 				      && CPU_FEATURE_USABLE (BMI2)),
 				     __wcschr_evex)
+	      X86_IFUNC_IMPL_ADD_V4 (array, i, wcschr,
+				     (CPU_FEATURE_USABLE (AVX512VL)
+				      && CPU_FEATURE_USABLE (AVX512BW)),
+				     __wcschr_evex512)
 	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcschr,
 				     (CPU_FEATURE_USABLE (AVX2)
 				      && CPU_FEATURE_USABLE (BMI2)),
diff --git a/sysdeps/x86_64/multiarch/strchr-evex-base.S b/sysdeps/x86_64/multiarch/strchr-evex-base.S
new file mode 100644
index 0000000000..919dafc8b6
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strchr-evex-base.S
@@ -0,0 +1,294 @@
+/* Placeholder function, not used by any processor at the moment.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* UNUSED. Exists purely as reference implementation.  */
+
+#include <isa-level.h>
+
+#if ISA_SHOULD_BUILD (4)
+
+# include <sysdep.h>
+
+# ifdef USE_AS_WCSCHR
+#  define CHAR_REG	esi
+#  define CHAR_SIZE	4
+#  define VPBROADCAST   vpbroadcastd
+#  define VPCMP		vpcmpd
+#  define VPMINU	vpminud
+#  define VPTESTN	vptestnmd
+# else
+#  define CHAR_REG	sil
+#  define CHAR_SIZE	1
+#  define VPBROADCAST   vpbroadcastb
+#  define VPCMP		vpcmpb
+#  define VPMINU	vpminub
+#  define VPTESTN	vptestnmb
+# endif
+
+# define PAGE_SIZE	4096
+# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
+# define XMM1           xmm17
+
+# if VEC_SIZE == 64
+#  define KMOV		kmovq
+#  define KORTEST	kortestq
+#  define RAX		rax
+#  define RCX		rcx
+#  define RDX		rdx
+#  define SHR		shrq
+#  define TEXTSUFFIX	evex512
+#  define VMM0		zmm16
+#  define VMM1		zmm17
+#  define VMM2		zmm18
+#  define VMM3		zmm19
+#  define VMM4		zmm20
+#  define VMM5		zmm21
+#  define VMOVA		vmovdqa64
+#  define VMOVU		vmovdqu64
+
+# elif VEC_SIZE == 32
+/* Currently Unused.  */
+#  define KMOV		kmovd
+#  define KORTEST	kortestd
+#  define RAX		eax
+#  define RCX		ecx
+#  define RDX		edx
+#  define SHR		shrl
+#  define TEXTSUFFIX	evex256
+#  define VMM0		ymm16
+#  define VMM1		ymm17
+#  define VMM2		ymm18
+#  define VMM3		ymm19
+#  define VMM4		ymm20
+#  define VMM5		ymm21
+#  define VMOVA		vmovdqa32
+#  define VMOVU		vmovdqu32
+# endif
+
+	.section .text.TEXTSUFFIX, "ax", @progbits
+/* Aligning the entry point to 64 bytes provides better performance
+   for strings of up to one vector length.  */
+ENTRY_P2ALIGN (STRCHR, 6)
+
+	/* Broadcast CHAR to VMM0.  */
+	VPBROADCAST %esi, %VMM0
+	movl	%edi, %eax
+	andl	$(PAGE_SIZE - 1), %eax
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	ja	L(page_cross)
+
+	/* Check each [w]char for CHAR and null; mask bits mark matches.  */
+	VMOVU	(%rdi), %VMM1
+
+	vpxorq	%VMM1, %VMM0, %VMM2
+	VPMINU	%VMM2, %VMM1, %VMM2
+	VPTESTN	%VMM2, %VMM2, %k0
+
+	KMOV	%k0, %RAX
+# ifndef USE_AS_STRCHRNUL
+	test	%RAX, %RAX
+	jz	L(align_more)
+	bsf	%RAX, %RAX
+# else
+	/* For strchrnul, using bsf here keeps the entire logic for
+	   strings shorter than 64 bytes within a 64 byte cache line,
+	   offsetting the perf gap relative to the evex version.
+	   Although using bsf as the condition would save code size,
+	   it is not preferred for the conditional jump for 2 reasons:
+	   1) its latency is 3; 2) unlike test, it cannot be
+	   macro-fused with the jump.  */
+	bsf	%RAX, %RAX
+	jz	L(align_more)
+# endif
+
+# ifdef USE_AS_WCSCHR
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+# else
+	add	%rdi, %rax
+# endif
+# ifndef USE_AS_STRCHRNUL
+	cmp	(%rax), %CHAR_REG
+	jne	L(zero)
+# endif
+	ret
+
+# ifndef USE_AS_STRCHRNUL
+L(zero):
+	xorl	%eax, %eax
+	ret
+# endif
+
+L(ret_vec_x2):
+	subq	$-VEC_SIZE, %rax
+L(ret_vec_x1):
+	bsf     %RCX, %RCX
+# ifdef USE_AS_WCSCHR
+	leaq	(%rax, %rcx, CHAR_SIZE), %rax
+# else
+	add	%rcx, %rax
+# endif
+
+# ifndef USE_AS_STRCHRNUL
+	cmp	(%rax), %CHAR_REG
+	jne	L(zero)
+# endif
+	ret
+
+L(align_more):
+	leaq	VEC_SIZE(%rdi), %rax
+	/* Align rax to VEC_SIZE.  */
+	andq	$-VEC_SIZE, %rax
+
+	/* Unrolled 4 times for the 4-vector loop.  */
+	VMOVA	(%rax), %VMM1
+	vpxorq	%VMM1, %VMM0, %VMM2
+	VPMINU	%VMM2, %VMM1, %VMM2
+	VPTESTN	%VMM2, %VMM2, %k0
+
+	KMOV	%k0, %RCX
+	test	%RCX, %RCX
+	jnz	L(ret_vec_x1)
+
+	VMOVA	VEC_SIZE(%rax), %VMM1
+	vpxorq	%VMM1, %VMM0, %VMM2
+	VPMINU	%VMM2, %VMM1, %VMM2
+	VPTESTN	%VMM2, %VMM2, %k0
+
+	KMOV	%k0, %RCX
+	test	%RCX, %RCX
+	jnz	L(ret_vec_x2)
+
+	VMOVA	(VEC_SIZE * 2)(%rax), %VMM1
+	vpxorq	%VMM1, %VMM0, %VMM2
+	VPMINU	%VMM2, %VMM1, %VMM2
+	VPTESTN	%VMM2, %VMM2, %k0
+	KMOV	%k0, %RCX
+	test	%RCX, %RCX
+	jnz	L(ret_vec_x3)
+
+	VMOVA	(VEC_SIZE * 3)(%rax), %VMM1
+	vpxorq	%VMM1, %VMM0, %VMM2
+	VPMINU	%VMM2, %VMM1, %VMM2
+	VPTESTN	%VMM2, %VMM2, %k0
+	KMOV	%k0, %RCX
+	test	%RCX, %RCX
+	jnz	L(ret_vec_x4)
+
+	/* Align address to VEC_SIZE * 4 for loop.  */
+	andq	$-(VEC_SIZE * 4), %rax
+
+	.p2align 4,,11
+L(loop):
+	/* The VPMINU and VPCMP combination provides better performance
+	   than alternative combinations.  */
+	VMOVA	(VEC_SIZE * 4)(%rax), %VMM1
+	VMOVA	(VEC_SIZE * 5)(%rax), %VMM2
+	VMOVA	(VEC_SIZE * 6)(%rax), %VMM3
+	VMOVA	(VEC_SIZE * 7)(%rax), %VMM4
+
+	vpxorq	%VMM1, %VMM0, %VMM5
+	VPMINU	%VMM5, %VMM1, %VMM1
+
+	VPCMP   $4, %VMM0, %VMM2, %k1
+	VPMINU	%VMM1, %VMM2, %VMM2{%k1}{z}
+
+	VPCMP   $4, %VMM0, %VMM3, %k2
+	VPMINU	%VMM2, %VMM3, %VMM3{%k2}{z}
+
+	VPCMP   $4, %VMM0, %VMM4, %k3
+	VPMINU	%VMM3, %VMM4, %VMM4{%k3}{z}
+
+	VPTESTN	%VMM4, %VMM4, %k3
+
+	subq	$-(VEC_SIZE * 4), %rax
+	KORTEST	%k3, %k3
+	jz	L(loop)
+
+	VPTESTN	%VMM1, %VMM1, %k0
+	KMOV	%k0, %RCX
+	test	%RCX, %RCX
+	jnz	L(ret_vec_x1)
+
+	VPTESTN	%VMM2, %VMM2, %k0
+	KMOV	%k0, %RCX
+	/* At this point, if the mask is non-zero, the CHAR or null
+	   must be in the second vector.  */
+	test	%RCX, %RCX
+	jnz	L(ret_vec_x2)
+
+	VPTESTN	%VMM3, %VMM3, %k0
+	KMOV	%k0, %RCX
+	test	%RCX, %RCX
+	jnz	L(ret_vec_x3)
+	/* At this point the CHAR or null [w]char must be in the fourth
+	   vector, so there is no need to check.  */
+	KMOV	%k3, %RCX
+
+L(ret_vec_x4):
+	bsf	%RCX, %RCX
+	leaq	(VEC_SIZE * 3)(%rax, %rcx, CHAR_SIZE), %rax
+# ifndef USE_AS_STRCHRNUL
+	cmp	(%rax), %CHAR_REG
+	jne	L(zero)
+# endif
+	ret
+
+L(ret_vec_x3):
+	bsf     %RCX, %RCX
+	leaq	(VEC_SIZE * 2)(%rax, %rcx, CHAR_SIZE), %rax
+# ifndef USE_AS_STRCHRNUL
+	cmp	(%rax), %CHAR_REG
+	jne	L(zero)
+# endif
+	ret
+
+L(page_cross):
+	movl	%eax, %ecx
+# ifdef USE_AS_WCSCHR
+	/* Calculate the number of compare result bits to skip for the
+	   wide string alignment adjustment.  */
+	andl	$(VEC_SIZE - 1), %ecx
+	sarl	$2, %ecx
+# endif
+	/* ecx contains the number of [w]chars to skip as a result
+	   of the address alignment.  */
+	xorq	%rdi, %rax
+	VMOVA	(PAGE_SIZE - VEC_SIZE)(%rax), %VMM1
+	vpxorq	%VMM1, %VMM0, %VMM2
+	VPMINU	%VMM2, %VMM1, %VMM2
+	VPTESTN	%VMM2, %VMM2, %k0
+	KMOV	%k0, %RAX
+	/* Ignore the mask bits for characters before the string start.  */
+	SHR	%cl, %RAX
+	jz	L(align_more)
+
+	bsf	%RAX, %RAX
+# ifdef USE_AS_WCSCHR
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+# else
+	addq	%rdi, %rax
+# endif
+
+# ifndef USE_AS_STRCHRNUL
+	cmp	(%rax), %CHAR_REG
+	jne	L(zero)
+# endif
+	ret
+
+END (STRCHR)
+#endif
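
The detection step above relies on a small arithmetic trick: XORing each
element with the broadcast CHAR gives 0 at match positions, and the unsigned
minimum of that result with the original data is 0 wherever either CHAR or
the terminating null occurs, so VPTESTN can flag both cases with one mask.
A minimal scalar C sketch of the same computation (VEC_LEN, the helper name
match_or_null_mask, and the sample buffer are illustrative, not part of the
patch):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

#define VEC_LEN 64	/* Stands in for CHAR_PER_VEC of the byte variant.  */

/* Scalar model of the vpxorq + VPMINU + VPTESTN sequence: bit i of the
   result is set when data[i] is either CH or the null terminator, i.e.
   when min (data[i] ^ CH, data[i]) == 0.  */
static uint64_t
match_or_null_mask (const unsigned char *data, unsigned char ch)
{
  uint64_t mask = 0;
  for (size_t i = 0; i < VEC_LEN; i++)
    {
      unsigned char x = data[i] ^ ch;                  /* 0 iff data[i] == ch.  */
      unsigned char m = x < data[i] ? x : data[i];     /* 0 iff match or null.  */
      if (m == 0)
        mask |= (uint64_t) 1 << i;
    }
  return mask;
}

int
main (void)
{
  unsigned char buf[VEC_LEN] = "find the x in this buffer";
  uint64_t mask = match_or_null_mask (buf, 'x');
  /* __builtin_ctzll plays the role of bsf: index of the first CH or null.  */
  printf ("first hit at offset %d\n", __builtin_ctzll (mask));
  return 0;
}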
diff --git a/sysdeps/x86_64/multiarch/strchr-evex512.S b/sysdeps/x86_64/multiarch/strchr-evex512.S
new file mode 100644
index 0000000000..4079bf387d
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strchr-evex512.S
@@ -0,0 +1,7 @@
+# ifndef STRCHR
+#  define STRCHR	__strchr_evex512
+# endif
+
+#define VEC_SIZE	64
+
+#include "strchr-evex-base.S"
diff --git a/sysdeps/x86_64/multiarch/strchrnul-evex512.S b/sysdeps/x86_64/multiarch/strchrnul-evex512.S
new file mode 100644
index 0000000000..1be0b12f38
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strchrnul-evex512.S
@@ -0,0 +1,8 @@
+#ifndef STRCHRNUL
+# define STRCHRNUL	__strchrnul_evex512
+#endif
+
+#define STRCHR	STRCHRNUL
+#define USE_AS_STRCHRNUL 1
+
+#include "strchr-evex512.S"
diff --git a/sysdeps/x86_64/multiarch/wcschr-evex512.S b/sysdeps/x86_64/multiarch/wcschr-evex512.S
new file mode 100644
index 0000000000..50c87ab1e5
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcschr-evex512.S
@@ -0,0 +1,8 @@
+#ifndef WCSCHR
+# define WCSCHR	__wcschr_evex512
+#endif
+
+#define STRCHR	WCSCHR
+#define USE_AS_WCSCHR 1
+
+#include "strchr-evex512.S"
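
The 4x unrolled L(loop) above avoids testing each of the four vectors
separately on every iteration: per lane it accumulates an unsigned minimum
across the vectors, zeroing a lane (via the {k}{z} zero-masked VPMINU) as
soon as any vector holds CHAR or a null there, so a single KORTEST per
iteration decides whether to keep looping.  A hedged scalar C model of that
accumulation (block_has_hit and VEC_LEN are hypothetical names; byte variant
only):

#include <stdbool.h>
#include <stddef.h>
#include <string.h>

#define VEC_LEN 64	/* Stands in for CHAR_PER_VEC of the byte variant.  */

/* Scalar model of one iteration of the 4x unrolled loop: lane i of the
   accumulator becomes 0 as soon as any of the four vectors contains CH or
   a null in lane i, so one test of the final accumulator (VPTESTN +
   KORTEST in the assembly) covers all four vectors at once.  */
static bool
block_has_hit (unsigned char vec[4][VEC_LEN], unsigned char ch)
{
  unsigned char acc[VEC_LEN];

  /* First vector: min (v ^ ch, v) is 0 on a match or a null byte.  */
  for (size_t i = 0; i < VEC_LEN; i++)
    {
      unsigned char x = vec[0][i] ^ ch;
      acc[i] = x < vec[0][i] ? x : vec[0][i];
    }

  /* Remaining vectors: VPCMP $4 (not-equal) builds the write mask, and the
     zero-masked VPMINU zeroes matching lanes outright while folding null
     bytes and zeros from earlier vectors in via the minimum.  */
  for (size_t v = 1; v < 4; v++)
    for (size_t i = 0; i < VEC_LEN; i++)
      {
        if (vec[v][i] == ch)
          acc[i] = 0;
        else
          acc[i] = acc[i] < vec[v][i] ? acc[i] : vec[v][i];
      }

  for (size_t i = 0; i < VEC_LEN; i++)
    if (acc[i] == 0)
      return true;
  return false;
}

int
main (void)
{
  unsigned char block[4][VEC_LEN];
  memset (block, 'a', sizeof block);
  block[2][17] = 'x';		/* Hit in the third vector.  */
  return block_has_hit (block, 'x') ? 0 : 1;
}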
-- 
2.36.1
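
The L(page_cross) path above never reads past a page boundary: when an
unaligned VEC_SIZE load from the string start could touch the next page, it
instead does an aligned load of the last VEC_SIZE bytes of the current page
and shifts the resulting mask right to discard the lanes that precede the
string.  A rough C sketch of that bookkeeping for the byte variant
(page_cross_probe and load_mask are hypothetical helpers, not glibc code):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define PAGE_SIZE 4096
#define VEC_SIZE  64

/* Scalar stand-in for the aligned VMOVA + vpxorq + VPMINU + VPTESTN
   sequence: one mask bit per byte that is either CH or 0.  */
static uint64_t
load_mask (const unsigned char *p, unsigned char ch)
{
  uint64_t mask = 0;
  for (size_t i = 0; i < VEC_SIZE; i++)
    if (p[i] == ch || p[i] == 0)
      mask |= (uint64_t) 1 << i;
  return mask;
}

/* Model of L(page_cross) for the byte variant.  Returns 1 and the offset
   of the first CH/null relative to S, or 0 when the caller should continue
   with the aligned code (the L(align_more) path).  */
static int
page_cross_probe (const unsigned char *s, unsigned char ch, long *offset)
{
  uintptr_t off_in_page = (uintptr_t) s & (PAGE_SIZE - 1);
  if (off_in_page <= PAGE_SIZE - VEC_SIZE)
    return 0;			/* An unaligned VEC_SIZE load is safe.  */

  /* Aligned load of the last VEC_SIZE bytes of the current page.  */
  const unsigned char *page
    = (const unsigned char *) ((uintptr_t) s & ~(uintptr_t) (PAGE_SIZE - 1));
  uint64_t mask = load_mask (page + PAGE_SIZE - VEC_SIZE, ch);

  /* Drop the lanes before S (SHR %cl, %RAX; the count is taken mod 64).  */
  mask >>= off_in_page & (VEC_SIZE - 1);
  if (mask == 0)
    return 0;			/* Nothing before the page end.  */

  *offset = __builtin_ctzll (mask);	/* bsf, relative to S.  */
  return 1;
}

int
main (void)
{
  /* Place a short string 10 bytes before a page boundary.  */
  unsigned char *buf = aligned_alloc (PAGE_SIZE, 2 * PAGE_SIZE);
  if (buf == NULL)
    return 1;
  memset (buf, 'a', 2 * PAGE_SIZE);
  unsigned char *s = buf + PAGE_SIZE - 10;
  s[4] = 'x';
  long off;
  if (page_cross_probe (s, 'x', &off))
    printf ("hit at offset %ld\n", off);	/* Prints 4.  */
  free (buf);
  return 0;
}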



* Re: [PATCH] x86_64: Implement evex512 version of strchrnul, strchr and wcschr
  2022-09-22  0:16 [PATCH] x86_64: Implement evex512 version of strchrnul, strchr and wcschr Sunil K Pandey
@ 2022-09-22  0:50 ` Noah Goldstein
  2022-09-23  3:57   ` Sunil Pandey
  0 siblings, 1 reply; 10+ messages in thread
From: Noah Goldstein @ 2022-09-22  0:50 UTC (permalink / raw)
  To: Sunil K Pandey; +Cc: GNU C Library

On Wed, Sep 21, 2022 at 5:17 PM Sunil K Pandey via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> This patch implements the following evex512 versions of string functions.
> The evex512 version takes up to 30% fewer cycles than the evex version,
> depending on length and alignment.

Please attach benchmark numbers.


* Re: [PATCH] x86_64: Implement evex512 version of strchrnul, strchr and wcschr
  2022-09-22  0:50 ` Noah Goldstein
@ 2022-09-23  3:57   ` Sunil Pandey
  2022-09-29  3:41     ` Sunil Pandey
  0 siblings, 1 reply; 10+ messages in thread
From: Sunil Pandey @ 2022-09-23  3:57 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library

[-- Attachment #1: Type: text/plain, Size: 16162 bytes --]

Microbenchmark data collected on: Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz



On Wed, Sep 21, 2022 at 5:50 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Wed, Sep 21, 2022 at 5:17 PM Sunil K Pandey via Libc-alpha
> <libc-alpha@sourceware.org> wrote:
> >
> > This patch implements the following evex512 versions of string functions.
> > The evex512 version takes up to 30% fewer cycles than the evex version,
> > depending on length and alignment.
>
> Please attach benchmark numbers.
[-- Attachment #2: strchrnul.txt --]
[-- Type: text/plain, Size: 18524 bytes --]

Function: strchrnul
Variant: 
                                    __strchrnul_evex	__strchrnul_evex512
========================================================================================================================
                 alignment=0, pos=32:         9.21	        7.26 ( 21.22%)	
                 alignment=1, pos=32:         9.19	        7.29 ( 20.76%)	
                 alignment=0, pos=64:        12.33	        9.31 ( 24.48%)	
                 alignment=2, pos=64:         9.43	        7.27 ( 22.91%)	
                alignment=0, pos=128:         8.68	        9.29 ( -6.99%)	
                alignment=3, pos=128:         7.65	        9.04 (-18.16%)	
                alignment=0, pos=256:        14.07	        9.80 ( 30.36%)	
                alignment=4, pos=256:        14.27	        9.75 ( 31.68%)	
                alignment=0, pos=512:        20.16	       16.58 ( 17.80%)	
                alignment=5, pos=512:        21.46	       17.57 ( 18.12%)	
               alignment=0, pos=1024:        31.16	       23.03 ( 26.07%)	
               alignment=6, pos=1024:        31.09	       23.12 ( 25.64%)	
               alignment=0, pos=2048:        53.15	       36.37 ( 31.58%)	
               alignment=7, pos=2048:        53.16	       36.43 ( 31.47%)	
                 alignment=0, pos=32:         6.08	        5.08 ( 16.44%)	
                 alignment=1, pos=32:         6.07	        5.09 ( 16.21%)	
                 alignment=0, pos=64:         7.91	        6.41 ( 18.95%)	
                 alignment=2, pos=64:         8.02	        6.41 ( 20.00%)	
                alignment=0, pos=128:         8.26	        9.15 (-10.73%)	
                alignment=3, pos=128:         7.64	        9.04 (-18.33%)	
                alignment=0, pos=256:        14.05	        9.80 ( 30.24%)	
                alignment=4, pos=256:        14.26	        9.77 ( 31.53%)	
                alignment=0, pos=512:        20.10	       16.58 ( 17.52%)	
                alignment=5, pos=512:        20.20	       16.61 ( 17.76%)	
               alignment=0, pos=1024:        31.18	       23.03 ( 26.15%)	
               alignment=6, pos=1024:        31.09	       23.12 ( 25.65%)	
               alignment=0, pos=2048:        56.32	       38.52 ( 31.60%)	
               alignment=7, pos=2048:        53.42	       40.10 ( 24.94%)	
                 alignment=1, pos=64:         8.04	        6.40 ( 20.38%)	
                 alignment=1, pos=64:         7.53	        6.40 ( 15.04%)	
                 alignment=2, pos=64:         7.48	        6.39 ( 14.55%)	
                 alignment=2, pos=64:         7.99	        6.43 ( 19.49%)	
                 alignment=3, pos=64:         7.99	        6.45 ( 19.28%)	
                 alignment=3, pos=64:         8.00	        6.45 ( 19.31%)	
                 alignment=4, pos=64:         8.00	        6.45 ( 19.39%)	
                 alignment=4, pos=64:         7.98	        6.46 ( 19.06%)	
                 alignment=5, pos=64:         7.94	        6.40 ( 19.36%)	
                 alignment=5, pos=64:         8.37	        6.76 ( 19.22%)	
                 alignment=6, pos=64:         8.03	        6.41 ( 20.13%)	
                 alignment=6, pos=64:         8.05	        6.42 ( 20.26%)	
                 alignment=7, pos=64:         8.04	        6.41 ( 20.23%)	
                 alignment=7, pos=64:         8.03	        6.40 ( 20.30%)	
                alignment=0, pos=256:        14.24	        9.79 ( 31.29%)	
                alignment=0, pos=256:        14.22	        9.75 ( 31.40%)	
               alignment=16, pos=256:        14.11	        9.79 ( 30.63%)	
               alignment=16, pos=256:        14.24	        9.77 ( 31.38%)	
               alignment=32, pos=256:        14.76	        9.77 ( 33.86%)	
               alignment=32, pos=256:        14.70	        9.78 ( 33.48%)	
               alignment=48, pos=256:        14.65	        9.75 ( 33.46%)	
               alignment=48, pos=256:        14.46	        9.80 ( 32.21%)	
               alignment=64, pos=256:        15.52	        9.81 ( 36.78%)	
               alignment=64, pos=256:        15.50	        9.76 ( 37.01%)	
               alignment=80, pos=256:        15.50	        9.79 ( 36.83%)	
               alignment=80, pos=256:        15.40	        9.75 ( 36.69%)	
               alignment=96, pos=256:        13.08	        9.79 ( 25.14%)	
               alignment=96, pos=256:        13.03	        9.81 ( 24.74%)	
              alignment=112, pos=256:        13.20	        9.76 ( 26.07%)	
              alignment=112, pos=256:        13.23	        9.75 ( 26.27%)	
                  alignment=0, pos=0:         5.19	        5.09 (  1.92%)	
                  alignment=0, pos=0:         5.37	        4.67 ( 13.16%)	
                  alignment=0, pos=1:         5.33	        4.71 ( 11.69%)	
                  alignment=0, pos=1:         5.33	        4.67 ( 12.50%)	
                  alignment=0, pos=2:         5.37	        4.67 ( 13.17%)	
                  alignment=0, pos=2:         5.33	        4.67 ( 12.50%)	
                  alignment=0, pos=3:         5.33	        4.67 ( 12.50%)	
                  alignment=0, pos=3:         5.33	        4.67 ( 12.50%)	
                  alignment=0, pos=4:         5.45	        5.10 (  6.44%)	
                  alignment=0, pos=4:         5.45	        5.09 (  6.62%)	
                  alignment=0, pos=5:         5.47	        5.10 (  6.76%)	
                  alignment=0, pos=5:         5.46	        5.08 (  6.98%)	
                  alignment=0, pos=6:         5.42	        5.09 (  6.03%)	
                  alignment=0, pos=6:         5.41	        5.07 (  6.31%)	
                  alignment=0, pos=7:         5.70	        5.35 (  6.04%)	
                  alignment=0, pos=7:         5.33	        4.71 ( 11.76%)	
                  alignment=0, pos=8:         5.44	        5.08 (  6.61%)	
                  alignment=0, pos=8:         5.44	        5.07 (  6.79%)	
                  alignment=0, pos=9:         5.42	        5.07 (  6.31%)	
                  alignment=0, pos=9:         5.54	        5.11 (  7.72%)	
                 alignment=0, pos=10:         5.42	        5.07 (  6.42%)	
                 alignment=0, pos=10:         5.42	        5.14 (  5.21%)	
                 alignment=0, pos=11:         5.45	        5.08 (  6.73%)	
                 alignment=0, pos=11:         5.39	        5.07 (  5.92%)	
                 alignment=0, pos=12:         5.46	        5.08 (  6.89%)	
                 alignment=0, pos=12:         5.40	        5.11 (  5.32%)	
                 alignment=0, pos=13:         5.42	        5.07 (  6.30%)	
                 alignment=0, pos=13:         5.41	        5.13 (  5.11%)	
                 alignment=0, pos=14:         5.39	        5.08 (  5.73%)	
                 alignment=0, pos=14:         5.43	        5.08 (  6.54%)	
                 alignment=0, pos=15:         5.50	        5.08 (  7.68%)	
                 alignment=0, pos=15:         5.40	        5.12 (  5.24%)	
                 alignment=0, pos=16:         5.41	        5.09 (  5.85%)	
                 alignment=0, pos=16:         5.44	        5.07 (  6.90%)	
                 alignment=0, pos=17:         5.42	        5.09 (  6.06%)	
                 alignment=0, pos=17:         5.40	        5.08 (  5.92%)	
                 alignment=0, pos=18:         5.41	        5.12 (  5.34%)	
                 alignment=0, pos=18:         5.45	        5.09 (  6.55%)	
                 alignment=0, pos=19:         5.45	        5.09 (  6.57%)	
                 alignment=0, pos=19:         5.41	        5.10 (  5.77%)	
                 alignment=0, pos=20:         5.41	        5.09 (  5.91%)	
                 alignment=0, pos=20:         5.41	        5.12 (  5.25%)	
                 alignment=0, pos=21:         5.44	        5.08 (  6.69%)	
                 alignment=0, pos=21:         5.31	        5.10 (  4.03%)	
                 alignment=0, pos=22:         5.45	        5.10 (  6.40%)	
                 alignment=0, pos=22:         5.41	        5.11 (  5.53%)	
                 alignment=0, pos=23:         5.39	        5.11 (  5.22%)	
                 alignment=0, pos=23:         5.42	        5.09 (  6.11%)	
                 alignment=0, pos=24:         5.39	        5.11 (  5.23%)	
                 alignment=0, pos=24:         5.41	        5.08 (  6.11%)	
                 alignment=0, pos=25:         5.43	        5.09 (  6.33%)	
                 alignment=0, pos=25:         5.41	        5.08 (  6.11%)	
                 alignment=0, pos=26:         5.42	        5.09 (  6.02%)	
                 alignment=0, pos=26:         5.45	        5.08 (  6.74%)	
                 alignment=0, pos=27:         5.40	        5.12 (  5.09%)	
                 alignment=0, pos=27:         5.42	        5.08 (  6.25%)	
                 alignment=0, pos=28:         5.42	        5.07 (  6.45%)	
                 alignment=0, pos=28:         5.72	        5.39 (  5.63%)	
                 alignment=0, pos=29:         5.41	        5.10 (  5.74%)	
                 alignment=0, pos=29:         5.42	        5.09 (  5.94%)	
                 alignment=0, pos=30:         5.41	        5.08 (  6.05%)	
                 alignment=0, pos=30:         5.40	        5.08 (  5.88%)	
                 alignment=0, pos=31:         5.41	        5.09 (  6.06%)	
                 alignment=0, pos=31:         5.41	        5.08 (  6.12%)	
                 alignment=0, pos=32:         6.09	        5.11 ( 16.03%)	
                 alignment=1, pos=32:         6.10	        5.08 ( 16.67%)	
                 alignment=0, pos=64:         8.00	        6.42 ( 19.77%)	
                 alignment=2, pos=64:         7.99	        6.41 ( 19.78%)	
                alignment=0, pos=128:         7.51	        9.10 (-21.15%)	
                alignment=3, pos=128:         7.67	        9.04 (-17.87%)	
                alignment=0, pos=256:        14.23	        9.76 ( 31.42%)	
                alignment=4, pos=256:        14.26	        9.76 ( 31.53%)	
                alignment=0, pos=512:        20.05	       16.61 ( 17.19%)	
                alignment=5, pos=512:        20.10	       16.62 ( 17.35%)	
               alignment=0, pos=1024:        30.81	       22.74 ( 26.20%)	
               alignment=6, pos=1024:        31.33	       23.15 ( 26.11%)	
               alignment=0, pos=2048:        53.30	       36.36 ( 31.79%)	
               alignment=7, pos=2048:        53.38	       36.37 ( 31.87%)	
                 alignment=0, pos=32:         6.05	        5.07 ( 16.08%)	
                 alignment=1, pos=32:         6.10	        5.09 ( 16.62%)	
                 alignment=0, pos=64:         7.98	        6.80 ( 14.84%)	
                 alignment=2, pos=64:         8.00	        6.48 ( 18.98%)	
                alignment=0, pos=128:         7.60	        9.10 (-19.74%)	
                alignment=3, pos=128:         7.70	        9.04 (-17.44%)	
                alignment=0, pos=256:        14.26	        9.80 ( 31.31%)	
                alignment=4, pos=256:        14.10	        9.74 ( 30.89%)	
                alignment=0, pos=512:        20.88	       16.59 ( 20.52%)	
                alignment=5, pos=512:        20.24	       16.59 ( 18.02%)	
               alignment=0, pos=1024:        31.25	       23.08 ( 26.16%)	
               alignment=6, pos=1024:        30.76	       23.01 ( 25.20%)	
               alignment=0, pos=2048:        53.53	       36.36 ( 32.07%)	
               alignment=7, pos=2048:        53.20	       36.38 ( 31.61%)	
                 alignment=1, pos=64:         7.62	        6.40 ( 16.07%)	
                 alignment=1, pos=64:         7.68	        6.43 ( 16.30%)	
                 alignment=2, pos=64:         7.84	        6.41 ( 18.30%)	
                 alignment=2, pos=64:         8.04	        6.41 ( 20.26%)	
                 alignment=3, pos=64:         8.03	        6.40 ( 20.26%)	
                 alignment=3, pos=64:         7.85	        6.40 ( 18.43%)	
                 alignment=4, pos=64:         8.00	        6.42 ( 19.76%)	
                 alignment=4, pos=64:         7.99	        6.46 ( 19.22%)	
                 alignment=5, pos=64:         8.04	        6.40 ( 20.37%)	
                 alignment=5, pos=64:         8.19	        6.77 ( 17.36%)	
                 alignment=6, pos=64:         8.43	        6.76 ( 19.77%)	
                 alignment=6, pos=64:         8.42	        6.84 ( 18.84%)	
                 alignment=7, pos=64:         7.98	        6.40 ( 19.78%)	
                 alignment=7, pos=64:         8.00	        6.43 ( 19.64%)	
                alignment=0, pos=256:        14.27	        9.81 ( 31.26%)	
                alignment=0, pos=256:        14.00	        9.75 ( 30.36%)	
               alignment=16, pos=256:        14.25	        9.76 ( 31.50%)	
               alignment=16, pos=256:        14.06	        9.78 ( 30.44%)	
               alignment=32, pos=256:        14.80	        9.81 ( 33.74%)	
               alignment=32, pos=256:        14.77	        9.79 ( 33.74%)	
               alignment=48, pos=256:        14.99	        9.79 ( 34.67%)	
               alignment=48, pos=256:        14.67	        9.81 ( 33.17%)	
               alignment=64, pos=256:        15.49	        9.80 ( 36.72%)	
               alignment=64, pos=256:        15.50	        9.79 ( 36.86%)	
               alignment=80, pos=256:        15.51	       10.21 ( 34.22%)	
               alignment=80, pos=256:        15.38	        9.79 ( 36.39%)	
               alignment=96, pos=256:        13.09	        9.78 ( 25.25%)	
               alignment=96, pos=256:        13.06	        9.89 ( 24.31%)	
              alignment=112, pos=256:        13.17	        9.79 ( 25.69%)	
              alignment=112, pos=256:        13.10	        9.81 ( 25.15%)	
                  alignment=0, pos=0:         5.19	        5.12 (  1.41%)	
                  alignment=0, pos=0:         5.33	        4.67 ( 12.50%)	
                  alignment=0, pos=1:         5.33	        4.81 (  9.79%)	
                  alignment=0, pos=1:         5.33	        4.67 ( 12.50%)	
                  alignment=0, pos=2:         5.33	        4.70 ( 11.87%)	
                  alignment=0, pos=2:         5.33	        4.67 ( 12.50%)	
                  alignment=0, pos=3:         5.33	        4.67 ( 12.50%)	
                  alignment=0, pos=3:         5.37	        4.67 ( 13.15%)	
                  alignment=0, pos=4:         5.33	        4.71 ( 11.75%)	
                  alignment=0, pos=4:         5.33	        4.67 ( 12.50%)	
                  alignment=0, pos=5:         5.33	        4.67 ( 12.50%)	
                  alignment=0, pos=5:         5.33	        4.67 ( 12.50%)	
                  alignment=0, pos=6:         5.41	        5.08 (  6.13%)	
                  alignment=0, pos=6:         5.45	        5.08 (  6.79%)	
                  alignment=0, pos=7:         5.43	        5.08 (  6.50%)	
                  alignment=0, pos=7:         5.46	        5.08 (  7.04%)	
                  alignment=0, pos=8:         5.40	        5.08 (  5.99%)	
                  alignment=0, pos=8:         5.41	        5.08 (  6.16%)	
                  alignment=0, pos=9:         5.33	        4.67 ( 12.50%)	
                  alignment=0, pos=9:         5.33	        4.71 ( 11.76%)	
                 alignment=0, pos=10:         5.45	        5.10 (  6.39%)	
                 alignment=0, pos=10:         5.43	        5.09 (  6.28%)	
                 alignment=0, pos=11:         5.40	        5.09 (  5.76%)	
                 alignment=0, pos=11:         5.33	        4.71 ( 11.75%)	
                 alignment=0, pos=12:         5.39	        5.07 (  5.86%)	
                 alignment=0, pos=12:         5.40	        5.09 (  5.61%)	
                 alignment=0, pos=13:         5.46	        5.08 (  6.82%)	
                 alignment=0, pos=13:         5.42	        5.10 (  5.90%)	
                 alignment=0, pos=14:         5.45	        5.08 (  6.70%)	
                 alignment=0, pos=14:         5.42	        5.08 (  6.24%)	
                 alignment=0, pos=15:         5.42	        5.07 (  6.38%)	
                 alignment=0, pos=15:         5.42	        5.14 (  5.12%)	
                 alignment=0, pos=16:         5.40	        5.08 (  5.83%)	
                 alignment=0, pos=16:         5.42	        5.09 (  6.14%)	
                 alignment=0, pos=17:         5.40	        5.10 (  5.48%)	
                 alignment=0, pos=17:         5.41	        5.07 (  6.40%)	
                 alignment=0, pos=18:         5.44	        5.08 (  6.56%)	
                 alignment=0, pos=18:         5.39	        5.07 (  5.86%)	
                 alignment=0, pos=19:         5.41	        5.08 (  6.22%)	
                 alignment=0, pos=19:         5.42	        5.09 (  6.09%)	
                 alignment=0, pos=20:         5.41	        5.09 (  5.83%)	
                 alignment=0, pos=20:         5.39	        5.14 (  4.76%)	
                 alignment=0, pos=21:         5.42	        5.11 (  5.69%)	
                 alignment=0, pos=21:         5.43	        5.09 (  6.20%)	
                 alignment=0, pos=22:         5.69	        5.41 (  4.96%)	
                 alignment=0, pos=22:         5.40	        5.15 (  4.71%)	
                 alignment=0, pos=23:         5.44	        5.09 (  6.52%)	
                 alignment=0, pos=23:         5.42	        5.08 (  6.31%)	
                 alignment=0, pos=24:         5.40	        5.12 (  5.12%)	
                 alignment=0, pos=24:         5.43	        5.08 (  6.38%)	
                 alignment=0, pos=25:         5.41	        5.09 (  5.98%)	
                 alignment=0, pos=25:         5.46	        5.08 (  6.93%)	
                 alignment=0, pos=26:         5.41	        5.08 (  6.19%)	
                 alignment=0, pos=26:         5.45	        5.09 (  6.62%)	
                 alignment=0, pos=27:         5.41	        5.12 (  5.40%)	
                 alignment=0, pos=27:         5.40	        5.08 (  5.95%)	
                 alignment=0, pos=28:         5.44	        5.07 (  6.80%)	
                 alignment=0, pos=28:         5.39	        5.13 (  4.75%)	
                 alignment=0, pos=29:         5.39	        5.08 (  5.84%)	
                 alignment=0, pos=29:         5.39	        5.09 (  5.66%)	
                 alignment=0, pos=30:         5.50	        5.07 (  7.86%)	
                 alignment=0, pos=30:         5.41	        5.10 (  5.84%)	
                 alignment=0, pos=31:         5.39	        5.13 (  4.94%)	
                 alignment=0, pos=31:         5.39	        5.13 (  4.80%)	

[-- Attachment #3: wcschr.txt --]
[-- Type: text/plain, Size: 15665 bytes --]

Function: wcschr
Variant: 
                                    __wcschr_evex	__wcschr_evex512
========================================================================================================================
                 alignment=1, pos=64:        15.96	       10.14 ( 36.48%)	
                 alignment=1, pos=64:        15.89	       10.17 ( 35.95%)	
                 alignment=2, pos=64:        15.23	       10.09 ( 33.70%)	
                 alignment=2, pos=64:        15.82	       10.13 ( 35.97%)	
                 alignment=3, pos=64:        14.97	        9.97 ( 33.38%)	
                 alignment=3, pos=64:        15.02	        9.59 ( 36.15%)	
                 alignment=4, pos=64:        14.87	        9.52 ( 35.96%)	
                 alignment=4, pos=64:        14.98	        9.59 ( 35.97%)	
                 alignment=5, pos=64:        15.02	        9.59 ( 36.19%)	
                 alignment=5, pos=64:        14.87	        9.57 ( 35.67%)	
                 alignment=6, pos=64:        14.96	        9.59 ( 35.92%)	
                 alignment=6, pos=64:        14.97	        9.33 ( 37.64%)	
                 alignment=7, pos=64:        15.03	        9.53 ( 36.60%)	
                 alignment=7, pos=64:        14.99	        9.58 ( 36.07%)	
                alignment=0, pos=256:        31.47	       22.50 ( 28.51%)	
                alignment=0, pos=256:        31.44	       22.50 ( 28.44%)	
               alignment=16, pos=256:        33.48	       23.18 ( 30.77%)	
               alignment=16, pos=256:        33.47	       23.14 ( 30.86%)	
               alignment=32, pos=256:        31.08	       23.79 ( 23.47%)	
               alignment=32, pos=256:        31.43	       23.67 ( 24.68%)	
               alignment=48, pos=256:        32.88	       20.82 ( 36.68%)	
               alignment=48, pos=256:        32.84	       20.84 ( 36.54%)	
               alignment=64, pos=256:        30.94	       23.31 ( 24.66%)	
               alignment=64, pos=256:        33.00	       23.82 ( 27.81%)	
               alignment=80, pos=256:        32.86	       23.15 ( 29.56%)	
               alignment=80, pos=256:        33.01	       23.20 ( 29.73%)	
               alignment=96, pos=256:        30.87	       23.65 ( 23.38%)	
               alignment=96, pos=256:        30.91	       23.66 ( 23.44%)	
              alignment=112, pos=256:        32.86	       20.83 ( 36.62%)	
              alignment=112, pos=256:        33.08	       20.07 ( 39.32%)	
                  alignment=0, pos=0:         5.84	        5.11 ( 12.42%)	
                  alignment=0, pos=0:         6.04	        4.67 ( 22.75%)	
                  alignment=0, pos=1:         6.00	        4.71 ( 21.53%)	
                  alignment=0, pos=1:         6.00	        4.71 ( 21.55%)	
                  alignment=0, pos=2:         6.00	        4.71 ( 21.56%)	
                  alignment=0, pos=2:         6.00	        4.67 ( 22.22%)	
                  alignment=0, pos=3:         6.00	        4.71 ( 21.56%)	
                  alignment=0, pos=3:         6.00	        4.71 ( 21.56%)	
                  alignment=0, pos=4:         6.06	        5.12 ( 15.57%)	
                  alignment=0, pos=4:         6.09	        5.09 ( 16.45%)	
                  alignment=0, pos=5:         6.00	        4.67 ( 22.22%)	
                  alignment=0, pos=5:         6.00	        4.67 ( 22.22%)	
                  alignment=0, pos=6:         6.22	        5.09 ( 18.11%)	
                  alignment=0, pos=6:         6.11	        5.11 ( 16.38%)	
                  alignment=0, pos=7:         6.00	        4.67 ( 22.22%)	
                  alignment=0, pos=7:         6.00	        4.70 ( 21.66%)	
                  alignment=0, pos=8:         6.12	        5.11 ( 16.49%)	
                  alignment=0, pos=8:         6.00	        4.71 ( 21.57%)	
                  alignment=0, pos=9:         6.07	        5.11 ( 15.69%)	
                  alignment=0, pos=9:         5.84	        5.13 ( 12.07%)	
                 alignment=0, pos=10:         6.08	        5.13 ( 15.71%)	
                 alignment=0, pos=10:         5.84	        5.17 ( 11.48%)	
                 alignment=0, pos=11:         6.08	        5.10 ( 16.01%)	
                 alignment=0, pos=11:         6.02	        5.09 ( 15.53%)	
                 alignment=0, pos=12:         6.00	        4.67 ( 22.22%)	
                 alignment=0, pos=12:         6.11	        5.09 ( 16.66%)	
                 alignment=0, pos=13:         5.84	        5.13 ( 12.12%)	
                 alignment=0, pos=13:         6.00	        4.67 ( 22.22%)	
                 alignment=0, pos=14:         6.11	        5.09 ( 16.67%)	
                 alignment=0, pos=14:         6.20	        5.09 ( 17.89%)	
                 alignment=0, pos=15:         6.00	        4.71 ( 21.56%)	
                 alignment=0, pos=15:         6.04	        5.14 ( 14.97%)	
                 alignment=0, pos=16:         7.41	        7.75 ( -4.59%)	
                 alignment=0, pos=16:         6.72	        7.75 (-15.30%)	
                 alignment=0, pos=17:         6.71	        7.75 (-15.41%)	
                 alignment=0, pos=17:         6.71	        7.79 (-16.10%)	
                 alignment=0, pos=18:         6.67	        7.38 (-10.65%)	
                 alignment=0, pos=18:         6.67	        7.38 (-10.65%)	
                 alignment=0, pos=19:         6.78	        7.80 (-15.14%)	
                 alignment=0, pos=19:         6.78	        7.74 (-14.14%)	
                 alignment=0, pos=20:         6.71	        7.76 (-15.62%)	
                 alignment=0, pos=20:         6.72	        7.79 (-15.91%)	
                 alignment=0, pos=21:         6.78	        7.74 (-14.07%)	
                 alignment=0, pos=21:         6.72	        7.73 (-15.01%)	
                 alignment=0, pos=22:         6.71	        7.78 (-15.85%)	
                 alignment=0, pos=22:         6.88	        7.80 (-13.40%)	
                 alignment=0, pos=23:         6.74	        7.74 (-14.91%)	
                 alignment=0, pos=23:         6.71	        7.77 (-15.71%)	
                 alignment=0, pos=24:         7.62	        7.74 ( -1.52%)	
                 alignment=0, pos=24:         7.39	        7.78 ( -5.26%)	
                 alignment=0, pos=25:         7.37	        7.74 ( -5.06%)	
                 alignment=0, pos=25:         7.42	        7.75 ( -4.51%)	
                 alignment=0, pos=26:         7.38	        7.79 ( -5.49%)	
                 alignment=0, pos=26:         7.37	        7.79 ( -5.71%)	
                 alignment=0, pos=27:         7.38	        7.72 ( -4.61%)	
                 alignment=0, pos=27:         7.44	        7.73 ( -3.93%)	
                 alignment=0, pos=28:         7.40	        7.72 ( -4.40%)	
                 alignment=0, pos=28:         7.43	        7.73 ( -4.02%)	
                 alignment=0, pos=29:         7.40	        7.73 ( -4.36%)	
                 alignment=0, pos=29:         7.43	        7.73 ( -4.07%)	
                 alignment=0, pos=30:         7.45	        7.73 ( -3.86%)	
                 alignment=0, pos=30:         7.36	        7.79 ( -5.79%)	
                 alignment=0, pos=31:         7.36	        7.79 ( -5.82%)	
                 alignment=0, pos=31:         7.36	        7.78 ( -5.73%)	
                 alignment=1, pos=64:        15.02	        9.54 ( 36.48%)	
                 alignment=1, pos=64:        14.80	        9.32 ( 37.02%)	
                 alignment=2, pos=64:        15.04	        9.57 ( 36.37%)	
                 alignment=2, pos=64:        14.86	        9.27 ( 37.59%)	
                 alignment=3, pos=64:        15.03	        9.58 ( 36.23%)	
                 alignment=3, pos=64:        15.01	        9.57 ( 36.25%)	
                 alignment=4, pos=64:        14.85	        9.52 ( 35.92%)	
                 alignment=4, pos=64:        14.98	        9.57 ( 36.12%)	
                 alignment=5, pos=64:        15.00	        9.57 ( 36.20%)	
                 alignment=5, pos=64:        15.03	        9.56 ( 36.38%)	
                 alignment=6, pos=64:        14.91	        9.29 ( 37.69%)	
                 alignment=6, pos=64:        14.97	        9.57 ( 36.06%)	
                 alignment=7, pos=64:        15.03	        9.32 ( 38.03%)	
                 alignment=7, pos=64:        14.88	        9.52 ( 35.99%)	
                alignment=0, pos=256:        31.49	       22.53 ( 28.46%)	
                alignment=0, pos=256:        31.44	       22.49 ( 28.47%)	
               alignment=16, pos=256:        35.25	       24.55 ( 30.36%)	
               alignment=16, pos=256:        33.41	       23.18 ( 30.61%)	
               alignment=32, pos=256:        32.87	       23.63 ( 28.11%)	
               alignment=32, pos=256:        32.53	       23.96 ( 26.34%)	
               alignment=48, pos=256:        32.74	       21.50 ( 34.34%)	
               alignment=48, pos=256:        33.19	       20.86 ( 37.15%)	
               alignment=64, pos=256:        31.01	       22.47 ( 27.53%)	
               alignment=64, pos=256:        30.98	       22.50 ( 27.38%)	
               alignment=80, pos=256:        33.02	       23.21 ( 29.72%)	
               alignment=80, pos=256:        32.96	       23.14 ( 29.79%)	
               alignment=96, pos=256:        30.93	       23.62 ( 23.64%)	
               alignment=96, pos=256:        30.89	       23.65 ( 23.43%)	
              alignment=112, pos=256:        32.78	       20.83 ( 36.46%)	
              alignment=112, pos=256:        32.82	       20.83 ( 36.53%)	
                  alignment=0, pos=0:         5.84	        5.11 ( 12.44%)	
                  alignment=0, pos=0:         6.00	        4.67 ( 22.22%)	
                  alignment=0, pos=1:         6.00	        4.67 ( 22.22%)	
                  alignment=0, pos=1:         6.04	        4.67 ( 22.73%)	
                  alignment=0, pos=2:         6.04	        4.67 ( 22.74%)	
                  alignment=0, pos=2:         6.04	        4.67 ( 22.75%)	
                  alignment=0, pos=3:         6.05	        5.11 ( 15.49%)	
                  alignment=0, pos=3:         6.00	        4.67 ( 22.22%)	
                  alignment=0, pos=4:         6.00	        4.67 ( 22.22%)	
                  alignment=0, pos=4:         6.00	        4.71 ( 21.56%)	
                  alignment=0, pos=5:         6.00	        4.71 ( 21.56%)	
                  alignment=0, pos=5:         6.05	        4.67 ( 22.80%)	
                  alignment=0, pos=6:         6.00	        4.67 ( 22.22%)	
                  alignment=0, pos=6:         6.00	        4.71 ( 21.56%)	
                  alignment=0, pos=7:         6.11	        5.12 ( 16.22%)	
                  alignment=0, pos=7:         6.06	        5.11 ( 15.77%)	
                  alignment=0, pos=8:         6.00	        4.70 ( 21.66%)	
                  alignment=0, pos=8:         6.06	        5.09 ( 15.92%)	
                  alignment=0, pos=9:         5.84	        5.14 ( 11.96%)	
                  alignment=0, pos=9:         6.13	        5.10 ( 16.82%)	
                 alignment=0, pos=10:         5.84	        5.14 ( 11.92%)	
                 alignment=0, pos=10:         6.08	        5.11 ( 15.93%)	
                 alignment=0, pos=11:         5.84	        5.15 ( 11.82%)	
                 alignment=0, pos=11:         6.09	        5.09 ( 16.45%)	
                 alignment=0, pos=12:         6.43	        5.09 ( 20.90%)	
                 alignment=0, pos=12:         6.00	        4.71 ( 21.56%)	
                 alignment=0, pos=13:         8.71	        8.40 (  3.63%)	
                 alignment=0, pos=13:         7.05	        4.94 ( 29.88%)	
                 alignment=0, pos=14:         7.63	        5.58 ( 26.87%)	
                 alignment=0, pos=14:         7.70	        6.00 ( 22.02%)	
                 alignment=0, pos=15:         6.55	        5.66 ( 13.55%)	
                 alignment=0, pos=15:         6.40	        5.38 ( 15.89%)	
                 alignment=0, pos=16:         7.77	        8.30 ( -6.82%)	
                 alignment=0, pos=16:         7.06	        7.85 (-11.14%)	
                 alignment=0, pos=17:         7.18	        8.35 (-16.21%)	
                 alignment=0, pos=17:         7.35	        8.18 (-11.21%)	
                 alignment=0, pos=18:         7.11	        8.26 (-16.14%)	
                 alignment=0, pos=18:         6.92	        7.74 (-11.85%)	
                 alignment=0, pos=19:         6.80	        7.74 (-13.80%)	
                 alignment=0, pos=19:         6.89	        7.73 (-12.20%)	
                 alignment=0, pos=20:         6.75	        7.77 (-15.22%)	
                 alignment=0, pos=20:         6.67	        7.37 (-10.60%)	
                 alignment=0, pos=21:         6.76	        7.76 (-14.83%)	
                 alignment=0, pos=21:         6.73	        7.72 (-14.81%)	
                 alignment=0, pos=22:         6.73	        7.79 (-15.78%)	
                 alignment=0, pos=22:         6.75	        7.75 (-14.70%)	
                 alignment=0, pos=23:         6.73	        7.76 (-15.30%)	
                 alignment=0, pos=23:         6.72	        8.50 (-26.43%)	
                 alignment=0, pos=24:         8.03	        7.75 (  3.53%)	
                 alignment=0, pos=24:         7.43	        7.75 ( -4.32%)	
                 alignment=0, pos=25:         7.37	        7.78 ( -5.63%)	
                 alignment=0, pos=25:         7.51	        7.79 ( -3.67%)	
                 alignment=0, pos=26:         7.38	        7.80 ( -5.70%)	
                 alignment=0, pos=26:         7.43	        7.73 ( -3.96%)	
                 alignment=0, pos=27:         7.41	        7.74 ( -4.47%)	
                 alignment=0, pos=27:         7.39	        7.78 ( -5.24%)	
                 alignment=0, pos=28:         7.38	        7.78 ( -5.42%)	
                 alignment=0, pos=28:         7.39	        7.73 ( -4.73%)	
                 alignment=0, pos=29:         7.38	        7.74 ( -4.86%)	
                 alignment=0, pos=29:         7.43	        7.73 ( -4.08%)	
                 alignment=0, pos=30:         7.39	        7.77 ( -5.18%)	
                 alignment=0, pos=30:         7.38	        7.78 ( -5.37%)	
                 alignment=0, pos=31:         7.42	        7.78 ( -4.85%)	
                 alignment=0, pos=31:         7.40	        7.77 ( -5.04%)	
                 alignment=0, pos=15:         8.09	        7.80 (  3.58%)	
                 alignment=0, pos=15:         7.83	        6.74 ( 13.95%)	
                 alignment=0, pos=15:        10.61	       10.60 (  0.05%)	
                 alignment=0, pos=15:        10.04	        9.27 (  7.66%)	
                 alignment=0, pos=15:        11.82	       10.94 (  7.45%)	
                 alignment=0, pos=15:        11.59	       11.77 ( -1.56%)	
                 alignment=0, pos=15:        13.17	       12.26 (  6.84%)	
                 alignment=0, pos=15:        12.78	       11.66 (  8.75%)	
                 alignment=0, pos=15:        16.30	       14.06 ( 13.72%)	
                 alignment=0, pos=15:        16.66	       13.95 ( 16.30%)	
                 alignment=0, pos=15:        12.08	       10.97 (  9.21%)	
                 alignment=0, pos=15:        12.45	       10.89 ( 12.50%)	
                 alignment=0, pos=15:        10.73	        8.85 ( 17.50%)	
                 alignment=0, pos=15:        10.85	        8.95 ( 17.57%)	
                 alignment=0, pos=15:         8.69	        6.85 ( 21.21%)	
                 alignment=0, pos=15:         8.47	        7.00 ( 17.34%)	
                 alignment=0, pos=15:         8.11	        7.34 (  9.51%)	
                 alignment=0, pos=15:         7.94	        7.12 ( 10.31%)	

[-- Attachment #4: strchr.txt --]
[-- Type: text/plain, Size: 19865 bytes --]

Function: strchr
Variant: 
                                    __strchr_evex	__strchr_evex512
========================================================================================================================
                 alignment=0, pos=32:         9.22	        6.28 ( 31.96%)	
                 alignment=1, pos=32:         8.64	        5.81 ( 32.70%)	
                 alignment=0, pos=64:         8.78	        8.03 (  8.54%)	
                 alignment=2, pos=64:         8.24	        7.66 (  7.12%)	
                alignment=0, pos=128:         9.04	        7.80 ( 13.77%)	
                alignment=3, pos=128:         8.30	        6.96 ( 16.22%)	
                alignment=0, pos=256:        14.01	        8.76 ( 37.53%)	
                alignment=4, pos=256:        14.20	        8.72 ( 38.60%)	
                alignment=0, pos=512:        20.50	       16.21 ( 20.92%)	
                alignment=5, pos=512:        20.65	       16.47 ( 20.26%)	
               alignment=0, pos=1024:        31.78	       23.15 ( 27.14%)	
               alignment=6, pos=1024:        31.81	       23.13 ( 27.27%)	
               alignment=0, pos=2048:        54.18	       36.24 ( 33.11%)	
               alignment=7, pos=2048:        54.39	       36.22 ( 33.41%)	
                 alignment=0, pos=32:         6.09	        4.45 ( 26.87%)	
                 alignment=1, pos=32:         6.08	        4.45 ( 26.75%)	
                 alignment=0, pos=64:         6.78	        6.41 (  5.35%)	
                 alignment=2, pos=64:         6.71	        6.43 (  4.14%)	
                alignment=0, pos=128:         8.13	        7.08 ( 12.89%)	
                alignment=3, pos=128:         8.09	        7.14 ( 11.73%)	
                alignment=0, pos=256:        13.98	        8.68 ( 37.91%)	
                alignment=4, pos=256:        14.41	        8.71 ( 39.54%)	
                alignment=0, pos=512:        20.68	       16.19 ( 21.70%)	
                alignment=5, pos=512:        20.67	       16.25 ( 21.39%)	
               alignment=0, pos=1024:        32.91	       23.35 ( 29.06%)	
               alignment=6, pos=1024:        31.78	       22.96 ( 27.76%)	
               alignment=0, pos=2048:        53.93	       36.19 ( 32.89%)	
               alignment=7, pos=2048:        53.89	       36.19 ( 32.84%)	
                 alignment=1, pos=64:         6.74	        6.53 (  3.02%)	
                 alignment=1, pos=64:         6.78	        6.42 (  5.33%)	
                 alignment=2, pos=64:         6.74	        6.46 (  4.20%)	
                 alignment=2, pos=64:         6.77	        6.43 (  5.03%)	
                 alignment=3, pos=64:         6.59	        6.44 (  2.15%)	
                 alignment=3, pos=64:         6.76	        6.43 (  4.89%)	
                 alignment=4, pos=64:         6.71	        6.47 (  3.61%)	
                 alignment=4, pos=64:         6.75	        6.41 (  4.98%)	
                 alignment=5, pos=64:         6.77	        6.48 (  4.27%)	
                 alignment=5, pos=64:         6.86	        6.44 (  6.16%)	
                 alignment=6, pos=64:         6.77	        6.43 (  5.13%)	
                 alignment=6, pos=64:         6.74	        6.44 (  4.48%)	
                 alignment=7, pos=64:         7.16	        6.79 (  5.17%)	
                 alignment=7, pos=64:         7.32	        6.79 (  7.16%)	
                alignment=0, pos=256:        14.12	        8.72 ( 38.20%)	
                alignment=0, pos=256:        13.92	        8.74 ( 37.19%)	
               alignment=16, pos=256:        14.43	        8.73 ( 39.49%)	
               alignment=16, pos=256:        13.88	        8.77 ( 36.82%)	
               alignment=32, pos=256:        14.84	        8.78 ( 40.84%)	
               alignment=32, pos=256:        14.85	        8.76 ( 40.98%)	
               alignment=48, pos=256:        15.31	        8.77 ( 42.70%)	
               alignment=48, pos=256:        21.82	        8.78 ( 59.77%)	
               alignment=64, pos=256:        16.06	        8.73 ( 45.63%)	
               alignment=64, pos=256:        15.95	        8.58 ( 46.22%)	
               alignment=80, pos=256:        17.02	        9.22 ( 45.82%)	
               alignment=80, pos=256:        15.75	        8.74 ( 44.48%)	
               alignment=96, pos=256:        13.33	        8.80 ( 33.96%)	
               alignment=96, pos=256:        13.30	        8.76 ( 34.12%)	
              alignment=112, pos=256:        13.57	        8.77 ( 35.35%)	
              alignment=112, pos=256:        13.47	        8.74 ( 35.11%)	
                  alignment=0, pos=0:         3.89	        4.44 (-14.06%)	
                  alignment=0, pos=0:         4.04	        4.00 (  0.99%)	
                  alignment=0, pos=1:         4.00	        4.04 ( -0.98%)	
                  alignment=0, pos=1:         4.00	        4.04 ( -0.98%)	
                  alignment=0, pos=2:         4.04	        4.00 (  0.98%)	
                  alignment=0, pos=2:         4.06	        4.00 (  1.38%)	
                  alignment=0, pos=3:         4.00	        4.00 (  0.00%)	
                  alignment=0, pos=3:         4.00	        4.00 (  0.00%)	
                  alignment=0, pos=4:         4.26	        4.49 ( -5.34%)	
                  alignment=0, pos=4:         4.29	        4.44 ( -3.38%)	
                  alignment=0, pos=5:         4.00	        4.00 (  0.00%)	
                  alignment=0, pos=5:         4.00	        4.00 (  0.00%)	
                  alignment=0, pos=6:         4.00	        4.04 ( -1.00%)	
                  alignment=0, pos=6:         4.04	        4.00 (  0.98%)	
                  alignment=0, pos=7:         4.00	        4.00 ( -0.00%)	
                  alignment=0, pos=7:         4.00	        4.00 (  0.00%)	
                  alignment=0, pos=8:         4.31	        4.49 ( -4.12%)	
                  alignment=0, pos=8:         4.34	        4.44 ( -2.30%)	
                  alignment=0, pos=9:         4.00	        4.00 (  0.00%)	
                  alignment=0, pos=9:         4.00	        4.00 ( -0.00%)	
                 alignment=0, pos=10:         4.21	        4.49 ( -6.75%)	
                 alignment=0, pos=10:         4.00	        4.00 ( -0.00%)	
                 alignment=0, pos=11:         4.00	        4.04 ( -0.99%)	
                 alignment=0, pos=11:         4.18	        4.44 ( -6.31%)	
                 alignment=0, pos=12:         4.00	        4.00 (  0.00%)	
                 alignment=0, pos=12:         4.21	        4.44 ( -5.32%)	
                 alignment=0, pos=13:         4.00	        4.00 (  0.00%)	
                 alignment=0, pos=13:         4.18	        4.45 ( -6.37%)	
                 alignment=0, pos=14:         4.00	        4.00 ( -0.00%)	
                 alignment=0, pos=14:         4.19	        4.45 ( -6.12%)	
                 alignment=0, pos=15:         4.00	        4.00 (  0.01%)	
                 alignment=0, pos=15:         4.13	        4.46 ( -7.83%)	
                 alignment=0, pos=16:         4.26	        4.49 ( -5.44%)	
                 alignment=0, pos=16:         4.00	        4.00 (  0.00%)	
                 alignment=0, pos=17:         4.22	        4.46 ( -5.68%)	
                 alignment=0, pos=17:         4.14	        4.46 ( -7.77%)	
                 alignment=0, pos=18:         4.18	        4.51 ( -7.85%)	
                 alignment=0, pos=18:         4.17	        4.47 ( -7.05%)	
                 alignment=0, pos=19:         3.99	        4.49 (-12.42%)	
                 alignment=0, pos=19:         4.17	        4.48 ( -7.53%)	
                 alignment=0, pos=20:         4.16	        4.47 ( -7.53%)	
                 alignment=0, pos=20:         4.13	        4.45 ( -7.72%)	
                 alignment=0, pos=21:         4.18	        4.46 ( -6.86%)	
                 alignment=0, pos=21:         4.16	        4.45 ( -6.97%)	
                 alignment=0, pos=22:         4.13	        4.45 ( -7.68%)	
                 alignment=0, pos=22:         4.14	        4.45 ( -7.62%)	
                 alignment=0, pos=23:         4.16	        4.44 ( -6.77%)	
                 alignment=0, pos=23:         4.15	        4.46 ( -7.54%)	
                 alignment=0, pos=24:         4.16	        4.49 ( -7.83%)	
                 alignment=0, pos=24:         4.16	        4.49 ( -8.04%)	
                 alignment=0, pos=25:         4.19	        4.45 ( -6.34%)	
                 alignment=0, pos=25:         4.16	        4.50 ( -8.33%)	
                 alignment=0, pos=26:         4.17	        4.46 ( -7.10%)	
                 alignment=0, pos=26:         4.16	        4.50 ( -8.36%)	
                 alignment=0, pos=27:         4.20	        4.44 ( -5.66%)	
                 alignment=0, pos=27:         4.15	        4.45 ( -7.21%)	
                 alignment=0, pos=28:         4.20	        4.46 ( -6.19%)	
                 alignment=0, pos=28:         4.15	        4.49 ( -8.07%)	
                 alignment=0, pos=29:         4.19	        4.45 ( -6.28%)	
                 alignment=0, pos=29:         4.14	        4.46 ( -7.74%)	
                 alignment=0, pos=30:         4.14	        4.50 ( -8.67%)	
                 alignment=0, pos=30:         4.22	        4.45 ( -5.54%)	
                 alignment=0, pos=31:         4.16	        4.46 ( -7.11%)	
                 alignment=0, pos=31:         4.17	        4.44 ( -6.35%)	
                 alignment=0, pos=32:         6.06	        4.44 ( 26.70%)	
                 alignment=1, pos=32:         6.11	        4.44 ( 27.31%)	
                 alignment=0, pos=64:         6.78	        6.43 (  5.07%)	
                 alignment=2, pos=64:         6.71	        6.43 (  4.19%)	
                alignment=0, pos=128:         8.00	        7.09 ( 11.43%)	
                alignment=3, pos=128:         8.11	        7.14 ( 12.01%)	
                alignment=0, pos=256:        14.38	        8.72 ( 39.40%)	
                alignment=4, pos=256:        13.90	        8.74 ( 37.12%)	
                alignment=0, pos=512:        20.64	       16.18 ( 21.64%)	
                alignment=5, pos=512:        20.69	       16.23 ( 21.55%)	
               alignment=0, pos=1024:        31.85	       23.10 ( 27.48%)	
               alignment=6, pos=1024:        31.78	       23.15 ( 27.14%)	
               alignment=0, pos=2048:        54.06	       36.21 ( 33.01%)	
               alignment=7, pos=2048:        54.14	       36.28 ( 32.99%)	
                 alignment=0, pos=32:         6.09	        4.45 ( 27.00%)	
                 alignment=1, pos=32:         6.12	        4.46 ( 27.09%)	
                 alignment=0, pos=64:         6.81	        6.53 (  4.11%)	
                 alignment=2, pos=64:         6.76	        6.43 (  4.97%)	
                alignment=0, pos=128:         8.11	        7.10 ( 12.44%)	
                alignment=3, pos=128:         8.07	        7.13 ( 11.68%)	
                alignment=0, pos=256:        14.23	        8.57 ( 39.76%)	
                alignment=4, pos=256:        13.90	        8.74 ( 37.13%)	
                alignment=0, pos=512:        20.25	       16.19 ( 20.06%)	
                alignment=5, pos=512:        20.67	       16.45 ( 20.41%)	
               alignment=0, pos=1024:        31.78	       23.13 ( 27.24%)	
               alignment=6, pos=1024:        31.72	       23.14 ( 27.06%)	
               alignment=0, pos=2048:        53.96	       36.21 ( 32.89%)	
               alignment=7, pos=2048:        53.96	       36.16 ( 32.99%)	
                 alignment=1, pos=64:         7.17	        6.78 (  5.39%)	
                 alignment=1, pos=64:         6.78	        6.44 (  5.01%)	
                 alignment=2, pos=64:         6.74	        6.47 (  4.08%)	
                 alignment=2, pos=64:         6.74	        6.41 (  4.83%)	
                 alignment=3, pos=64:         6.75	        6.41 (  4.97%)	
                 alignment=3, pos=64:         6.73	        6.43 (  4.49%)	
                 alignment=4, pos=64:         6.76	        6.52 (  3.63%)	
                 alignment=4, pos=64:         6.73	        6.43 (  4.58%)	
                 alignment=5, pos=64:         6.73	        6.47 (  3.80%)	
                 alignment=5, pos=64:         6.74	        6.43 (  4.58%)	
                 alignment=6, pos=64:         6.58	        6.48 (  1.59%)	
                 alignment=6, pos=64:         6.74	        6.42 (  4.65%)	
                 alignment=7, pos=64:         6.71	        6.47 (  3.66%)	
                 alignment=7, pos=64:         6.74	        6.43 (  4.58%)	
                alignment=0, pos=256:        14.45	        8.71 ( 39.72%)	
                alignment=0, pos=256:        13.88	        8.73 ( 37.14%)	
               alignment=16, pos=256:        14.42	        8.77 ( 39.17%)	
               alignment=16, pos=256:        13.90	        8.59 ( 38.20%)	
               alignment=32, pos=256:        14.84	        8.73 ( 41.17%)	
               alignment=32, pos=256:        14.77	        8.74 ( 40.81%)	
               alignment=48, pos=256:        15.32	        8.58 ( 43.98%)	
               alignment=48, pos=256:        15.30	        8.79 ( 42.57%)	
               alignment=64, pos=256:        16.10	        8.72 ( 45.84%)	
               alignment=64, pos=256:        16.06	        8.68 ( 45.98%)	
               alignment=80, pos=256:        16.14	        8.74 ( 45.82%)	
               alignment=80, pos=256:        15.97	        8.71 ( 45.47%)	
               alignment=96, pos=256:        13.31	        8.76 ( 34.23%)	
               alignment=96, pos=256:        13.31	        8.62 ( 35.25%)	
              alignment=112, pos=256:        13.64	        8.56 ( 37.27%)	
              alignment=112, pos=256:        13.58	        8.73 ( 35.75%)	
                  alignment=0, pos=0:         3.93	        4.46 (-13.48%)	
                  alignment=0, pos=0:         4.00	        4.04 ( -0.98%)	
                  alignment=0, pos=1:         4.00	        4.00 ( -0.01%)	
                  alignment=0, pos=1:         4.00	        4.00 (  0.00%)	
                  alignment=0, pos=2:         4.00	        4.04 ( -0.98%)	
                  alignment=0, pos=2:         4.00	        4.04 ( -1.00%)	
                  alignment=0, pos=3:         4.00	        4.00 (  0.01%)	
                  alignment=0, pos=3:         4.00	        4.00 (  0.00%)	
                  alignment=0, pos=4:         4.00	        4.04 ( -0.99%)	
                  alignment=0, pos=4:         4.36	        4.45 ( -2.19%)	
                  alignment=0, pos=5:         4.00	        4.00 (  0.00%)	
                  alignment=0, pos=5:         4.00	        4.00 (  0.00%)	
                  alignment=0, pos=6:         4.00	        4.00 (  0.00%)	
                  alignment=0, pos=6:         4.00	        4.04 ( -0.99%)	
                  alignment=0, pos=7:         4.00	        4.00 (  0.00%)	
                  alignment=0, pos=7:         4.00	        4.00 ( -0.00%)	
                  alignment=0, pos=8:         4.31	        4.46 ( -3.38%)	
                  alignment=0, pos=8:         4.32	        4.44 ( -2.85%)	
                  alignment=0, pos=9:         4.00	        4.00 (  0.00%)	
                  alignment=0, pos=9:         4.00	        4.04 ( -0.99%)	
                 alignment=0, pos=10:         4.21	        4.46 ( -6.06%)	
                 alignment=0, pos=10:         4.00	        4.00 (  0.00%)	
                 alignment=0, pos=11:         4.23	        4.46 ( -5.47%)	
                 alignment=0, pos=11:         4.00	        4.00 (  0.00%)	
                 alignment=0, pos=12:         4.15	        4.51 ( -8.45%)	
                 alignment=0, pos=12:         4.00	        4.00 (  0.00%)	
                 alignment=0, pos=13:         4.26	        4.49 ( -5.59%)	
                 alignment=0, pos=13:         4.17	        4.45 ( -6.66%)	
                 alignment=0, pos=14:         4.16	        4.49 ( -7.77%)	
                 alignment=0, pos=14:         4.00	        4.00 (  0.00%)	
                 alignment=0, pos=15:         4.17	        4.45 ( -6.91%)	
                 alignment=0, pos=15:         4.17	        4.49 ( -7.81%)	
                 alignment=0, pos=16:         4.00	        4.00 ( -0.00%)	
                 alignment=0, pos=16:         4.17	        4.44 ( -6.58%)	
                 alignment=0, pos=17:         4.17	        4.44 ( -6.47%)	
                 alignment=0, pos=17:         4.16	        4.45 ( -6.91%)	
                 alignment=0, pos=18:         4.17	        4.45 ( -6.59%)	
                 alignment=0, pos=18:         4.13	        4.46 ( -7.95%)	
                 alignment=0, pos=19:         4.15	        4.47 ( -7.77%)	
                 alignment=0, pos=19:         4.13	        4.45 ( -7.75%)	
                 alignment=0, pos=20:         4.16	        4.43 ( -6.52%)	
                 alignment=0, pos=20:         4.16	        4.44 ( -6.70%)	
                 alignment=0, pos=21:         4.13	        4.46 ( -7.85%)	
                 alignment=0, pos=21:         4.17	        4.44 ( -6.52%)	
                 alignment=0, pos=22:         4.21	        4.46 ( -5.88%)	
                 alignment=0, pos=22:         4.18	        4.46 ( -6.49%)	
                 alignment=0, pos=23:         4.13	        4.44 ( -7.47%)	
                 alignment=0, pos=23:         4.14	        4.46 ( -7.60%)	
                 alignment=0, pos=24:         4.00	        4.04 ( -0.98%)	
                 alignment=0, pos=24:         4.15	        4.48 ( -8.00%)	
                 alignment=0, pos=25:         4.18	        4.45 ( -6.39%)	
                 alignment=0, pos=25:         4.17	        4.46 ( -6.94%)	
                 alignment=0, pos=26:         4.15	        4.44 ( -6.87%)	
                 alignment=0, pos=26:         4.16	        4.47 ( -7.47%)	
                 alignment=0, pos=27:         4.16	        4.49 ( -7.86%)	
                 alignment=0, pos=27:         4.19	        4.46 ( -6.43%)	
                 alignment=0, pos=28:         4.15	        4.45 ( -7.07%)	
                 alignment=0, pos=28:         4.14	        4.45 ( -7.51%)	
                 alignment=0, pos=29:         4.14	        4.49 ( -8.43%)	
                 alignment=0, pos=29:         4.21	        4.45 ( -5.74%)	
                 alignment=0, pos=30:         4.16	        4.43 ( -6.66%)	
                 alignment=0, pos=30:         4.15	        4.49 ( -8.31%)	
                 alignment=0, pos=31:         4.16	        4.54 ( -9.09%)	
                 alignment=0, pos=31:         4.15	        4.49 ( -8.12%)	
                 alignment=0, pos=15:         4.67	        4.71 ( -0.86%)	
                 alignment=0, pos=15:         5.11	        5.15 ( -0.78%)	
                 alignment=0, pos=15:         4.83	        5.11 ( -5.74%)	
                 alignment=0, pos=15:         5.20	        5.54 ( -6.59%)	
                 alignment=0, pos=15:         4.81	        5.11 ( -6.17%)	
                 alignment=0, pos=15:         5.22	        5.54 ( -6.14%)	
                 alignment=0, pos=15:         4.82	        5.11 ( -5.99%)	
                 alignment=0, pos=15:         5.24	        5.59 ( -6.57%)	
                 alignment=0, pos=15:         4.84	        5.12 ( -5.67%)	
                 alignment=0, pos=15:         5.24	        5.59 ( -6.56%)	
                 alignment=0, pos=15:         4.81	        5.11 ( -6.36%)	
                 alignment=0, pos=15:         5.22	        5.55 ( -6.22%)	
                 alignment=0, pos=15:         4.81	        5.11 ( -6.16%)	
                 alignment=0, pos=15:         5.24	        5.55 ( -5.88%)	
                 alignment=0, pos=15:         4.78	        5.15 ( -7.71%)	
                 alignment=0, pos=15:         5.21	        5.56 ( -6.54%)	
                 alignment=0, pos=15:         4.82	        5.12 ( -6.25%)	
                 alignment=0, pos=15:         5.23	        5.55 ( -6.14%)	

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] x86_64: Implement evex512 version of strchrnul, strchr and wcschr
  2022-09-23  3:57   ` Sunil Pandey
@ 2022-09-29  3:41     ` Sunil Pandey
  2022-09-29  4:07       ` Noah Goldstein
  0 siblings, 1 reply; 10+ messages in thread
From: Sunil Pandey @ 2022-09-29  3:41 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library

On Thu, Sep 22, 2022 at 8:57 PM Sunil Pandey <skpgkp2@gmail.com> wrote:
>
> Microbenchmark data collected on: Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz
>
>
>
> On Wed, Sep 21, 2022 at 5:50 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > On Wed, Sep 21, 2022 at 5:17 PM Sunil K Pandey via Libc-alpha
> > <libc-alpha@sourceware.org> wrote:
> > >
> > > This patch implements following evex512 version of string functions.
> > > evex512 version takes up to 30% less cycle as compared to evex,
> > > depending on length and alignment.
> >
> > Please attach benchmark numbers.
> > >
> > > - strchrnul function using 512 bit vectors.
> > > - strchr function using 512 bit vectors.
> > > - wcschr function using 512 bit vectors.
> > >
> > > Code size data:
> > >
> > > strchrnul-evex.o        615 byte
> > > strchrnul-evex512.o     573 byte (-7%)
> > >
> > > strchr-evex.o           670 byte
> > > strchr-evex512.o        616 byte (-8%)
> > >
> > > wcschr-evex.o           678 byte
> > > wcschr-evex512.o        620 byte (-9%)
> > >
> > > Placeholder function, not used by any processor at the moment.
> > > ---
> > >  sysdeps/x86_64/multiarch/Makefile            |   3 +
> > >  sysdeps/x86_64/multiarch/ifunc-impl-list.c   |  12 +
> > >  sysdeps/x86_64/multiarch/strchr-evex-base.S  | 294 +++++++++++++++++++
> > >  sysdeps/x86_64/multiarch/strchr-evex512.S    |   7 +
> > >  sysdeps/x86_64/multiarch/strchrnul-evex512.S |   8 +
> > >  sysdeps/x86_64/multiarch/wcschr-evex512.S    |   8 +
> > >  6 files changed, 332 insertions(+)
> > >  create mode 100644 sysdeps/x86_64/multiarch/strchr-evex-base.S
> > >  create mode 100644 sysdeps/x86_64/multiarch/strchr-evex512.S
> > >  create mode 100644 sysdeps/x86_64/multiarch/strchrnul-evex512.S
> > >  create mode 100644 sysdeps/x86_64/multiarch/wcschr-evex512.S
> > >
> > > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> > > index df4601c294..89b58fa557 100644
> > > --- a/sysdeps/x86_64/multiarch/Makefile
> > > +++ b/sysdeps/x86_64/multiarch/Makefile
> > > @@ -60,11 +60,13 @@ sysdep_routines += \
> > >    strchr-avx2 \
> > >    strchr-avx2-rtm \
> > >    strchr-evex \
> > > +  strchr-evex512 \
> > >    strchr-sse2 \
> > >    strchr-sse2-no-bsf \
> > >    strchrnul-avx2 \
> > >    strchrnul-avx2-rtm \
> > >    strchrnul-evex \
> > > +  strchrnul-evex512 \
> > >    strchrnul-sse2 \
> > >    strcmp-avx2 \
> > >    strcmp-avx2-rtm \
> > > @@ -129,6 +131,7 @@ sysdep_routines += \
> > >    wcschr-avx2 \
> > >    wcschr-avx2-rtm \
> > >    wcschr-evex \
> > > +  wcschr-evex512 \
> > >    wcschr-sse2 \
> > >    wcscmp-avx2 \
> > >    wcscmp-avx2-rtm \
> > > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > index a71444eccb..bce1d15171 100644
> > > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > @@ -518,6 +518,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > >                                       && CPU_FEATURE_USABLE (AVX512BW)
> > >                                       && CPU_FEATURE_USABLE (BMI2)),
> > >                                      __strchr_evex)
> > > +             X86_IFUNC_IMPL_ADD_V4 (array, i, strchr,
> > > +                                    (CPU_FEATURE_USABLE (AVX512VL)
> > > +                                     && CPU_FEATURE_USABLE (AVX512BW)),
> > > +                                    __strchr_evex512)
> > >               X86_IFUNC_IMPL_ADD_V3 (array, i, strchr,
> > >                                      (CPU_FEATURE_USABLE (AVX2)
> > >                                       && CPU_FEATURE_USABLE (BMI2)),
> > > @@ -543,6 +547,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > >                                       && CPU_FEATURE_USABLE (AVX512BW)
> > >                                       && CPU_FEATURE_USABLE (BMI2)),
> > >                                      __strchrnul_evex)
> > > +             X86_IFUNC_IMPL_ADD_V4 (array, i, strchrnul,
> > > +                                    (CPU_FEATURE_USABLE (AVX512VL)
> > > +                                     && CPU_FEATURE_USABLE (AVX512BW)),
> > > +                                    __strchrnul_evex512)
> > >               X86_IFUNC_IMPL_ADD_V3 (array, i, strchrnul,
> > >                                      (CPU_FEATURE_USABLE (AVX2)
> > >                                       && CPU_FEATURE_USABLE (BMI2)),
> > > @@ -753,6 +761,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > >                                       && CPU_FEATURE_USABLE (AVX512BW)
> > >                                       && CPU_FEATURE_USABLE (BMI2)),
> > >                                      __wcschr_evex)
> > > +             X86_IFUNC_IMPL_ADD_V4 (array, i, wcschr,
> > > +                                    (CPU_FEATURE_USABLE (AVX512VL)
> > > +                                     && CPU_FEATURE_USABLE (AVX512BW)),
> > > +                                    __wcschr_evex512)
> > >               X86_IFUNC_IMPL_ADD_V3 (array, i, wcschr,
> > >                                      (CPU_FEATURE_USABLE (AVX2)
> > >                                       && CPU_FEATURE_USABLE (BMI2)),
> > > diff --git a/sysdeps/x86_64/multiarch/strchr-evex-base.S b/sysdeps/x86_64/multiarch/strchr-evex-base.S
> > > new file mode 100644
> > > index 0000000000..919dafc8b6
> > > --- /dev/null
> > > +++ b/sysdeps/x86_64/multiarch/strchr-evex-base.S
> > > @@ -0,0 +1,294 @@
> > > +/* Placeholder function, not used by any processor at the moment.
> > > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > > +   This file is part of the GNU C Library.
> > > +
> > > +   The GNU C Library is free software; you can redistribute it and/or
> > > +   modify it under the terms of the GNU Lesser General Public
> > > +   License as published by the Free Software Foundation; either
> > > +   version 2.1 of the License, or (at your option) any later version.
> > > +
> > > +   The GNU C Library is distributed in the hope that it will be useful,
> > > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > +   Lesser General Public License for more details.
> > > +
> > > +   You should have received a copy of the GNU Lesser General Public
> > > +   License along with the GNU C Library; if not, see
> > > +   <https://www.gnu.org/licenses/>.  */
> > > +
> > > +/* UNUSED. Exists purely as reference implementation.  */
> > > +
> > > +#include <isa-level.h>
> > > +
> > > +#if ISA_SHOULD_BUILD (4)
> > > +
> > > +# include <sysdep.h>
> > > +
> > > +# ifdef USE_AS_WCSCHR
> > > +#  define CHAR_REG     esi
> > > +#  define CHAR_SIZE    4
> > > +#  define VPBROADCAST   vpbroadcastd
> > > +#  define VPCMP                vpcmpd
> > > +#  define VPMINU       vpminud
> > > +#  define VPTESTN      vptestnmd
> > > +# else
> > > +#  define CHAR_REG     sil
> > > +#  define CHAR_SIZE    1
> > > +#  define VPBROADCAST   vpbroadcastb
> > > +#  define VPCMP                vpcmpb
> > > +#  define VPMINU       vpminub
> > > +#  define VPTESTN      vptestnmb
> > > +# endif
> > > +
> > > +# define PAGE_SIZE     4096
> > > +# define CHAR_PER_VEC  (VEC_SIZE / CHAR_SIZE)
> > > +# define XMM1           xmm17
> > > +
> > > +# if VEC_SIZE == 64
> > > +#  define KMOV         kmovq
> > > +#  define KORTEST      kortestq
> > > +#  define RAX          rax
> > > +#  define RCX          rcx
> > > +#  define RDX          rdx
> > > +#  define SHR          shrq
> > > +#  define TEXTSUFFIX   evex512
> > > +#  define VMM0         zmm16
> > > +#  define VMM1         zmm17
> > > +#  define VMM2         zmm18
> > > +#  define VMM3         zmm19
> > > +#  define VMM4         zmm20
> > > +#  define VMM5         zmm21
> > > +#  define VMOVA                vmovdqa64
> > > +#  define VMOVU                vmovdqu64
> > > +
> > > +# elif VEC_SIZE == 32
> > > +/* Currently Unused.  */
> > > +#  define KMOV         kmovd
> > > +#  define KORTEST      kortestd
> > > +#  define RAX          eax
> > > +#  define RCX          ecx
> > > +#  define RDX          edx
> > > +#  define SHR          shrl
> > > +#  define TEXTSUFFIX   evex256
> > > +#  define VMM0         ymm16
> > > +#  define VMM1         ymm17
> > > +#  define VMM2         ymm18
> > > +#  define VMM3         ymm19
> > > +#  define VMM4         ymm20
> > > +#  define VMM5         ymm21
> > > +#  define VMOVA                vmovdqa32
> > > +#  define VMOVU                vmovdqu32
> > > +# endif
> > > +
> > > +       .section .text.TEXTSUFFIX, "ax", @progbits
> > > +/* Aligning the entry point to 64 bytes provides better performance
> > > +   for one-vector-length strings.  */
> > > +ENTRY_P2ALIGN (STRCHR, 6)
> > > +
> > > +       /* Broadcast CHAR to VMM0.  */
> > > +       VPBROADCAST %esi, %VMM0
> > > +       movl    %edi, %eax
> > > +       andl    $(PAGE_SIZE - 1), %eax
> > > +       cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
> > > +       ja      L(page_cross)
> > > +
> > > +       /* Check each [w]char for null or a match; the mask bit is
> > > +          set for each hit.  */
> > > +       VMOVU   (%rdi), %VMM1
> > > +
> > > +       vpxorq  %VMM1, %VMM0, %VMM2
> > > +       VPMINU  %VMM2, %VMM1, %VMM2
> > > +       VPTESTN %VMM2, %VMM2, %k0
> > > +
> > > +       KMOV    %k0, %RAX
> > > +# ifndef USE_AS_STRCHRNUL
> > > +       test    %RAX, %RAX
> > > +       jz      L(align_more)
> > > +       bsf     %RAX, %RAX
> > > +# else
> > > +       /* For strchrnul, use bsf so that, if the string is shorter
> > > +          than 64 bytes, the entire logic fits in a 64-byte cache
> > > +          line and offsets the perf gap compared to the evex
> > > +          version.  Although using bsf as the condition saves code
> > > +          size, it is not preferred for a conditional jump for two
> > > +          reasons: 1) its latency is 3, and 2) unlike test, it
> > > +          cannot be micro-fused with the jump.  */
> > > +       bsf     %RAX, %RAX
> > > +       jz      L(align_more)
> > > +# endif
> > > +
> > > +# ifdef USE_AS_WCSCHR
> > > +       leaq    (%rdi, %rax, CHAR_SIZE), %rax
> > > +# else
> > > +       add     %rdi, %rax
> > > +# endif
> > > +# ifndef USE_AS_STRCHRNUL
> > > +       cmp     (%rax), %CHAR_REG
> > > +       jne     L(zero)
> > > +# endif
> > > +       ret
> > > +
> > > +# ifndef USE_AS_STRCHRNUL
> > > +L(zero):
> > > +       xorl    %eax, %eax
> > > +       ret
> > > +# endif
> > > +
> > > +L(ret_vec_x2):
> > > +       subq    $-VEC_SIZE, %rax
> > > +L(ret_vec_x1):
> > > +       bsf     %RCX, %RCX
> > > +# ifdef USE_AS_WCSCHR
> > > +       leaq    (%rax, %rcx, CHAR_SIZE), %rax
> > > +# else
> > > +       add     %rcx, %rax
> > > +# endif
> > > +
> > > +# ifndef USE_AS_STRCHRNUL
> > > +       cmp     (%rax), %CHAR_REG
> > > +       jne     L(zero)
> > > +# endif
> > > +       ret
> > > +
> > > +L(align_more):
> > > +       leaq    VEC_SIZE(%rdi), %rax
> > > +       /* Align rax to VEC_SIZE.  */
> > > +       andq    $-VEC_SIZE, %rax
> > > +
> > > +       /* Unroll 4 times for the 4-vector loop.  */
> > > +       VMOVA   (%rax), %VMM1
> > > +       vpxorq  %VMM1, %VMM0, %VMM2
> > > +       VPMINU  %VMM2, %VMM1, %VMM2
> > > +       VPTESTN %VMM2, %VMM2, %k0
> > > +
> > > +       KMOV    %k0, %RCX
> > > +       test    %RCX, %RCX
> > > +       jnz     L(ret_vec_x1)
> > > +
> > > +       VMOVA   VEC_SIZE(%rax), %VMM1
> > > +       vpxorq  %VMM1, %VMM0, %VMM2
> > > +       VPMINU  %VMM2, %VMM1, %VMM2
> > > +       VPTESTN %VMM2, %VMM2, %k0
> > > +
> > > +       KMOV    %k0, %RCX
> > > +       test    %RCX, %RCX
> > > +       jnz     L(ret_vec_x2)
> > > +
> > > +       VMOVA   (VEC_SIZE * 2)(%rax), %VMM1
> > > +       vpxorq  %VMM1, %VMM0, %VMM2
> > > +       VPMINU  %VMM2, %VMM1, %VMM2
> > > +       VPTESTN %VMM2, %VMM2, %k0
> > > +       KMOV    %k0, %RCX
> > > +       test    %RCX, %RCX
> > > +       jnz     L(ret_vec_x3)
> > > +
> > > +       VMOVA   (VEC_SIZE * 3)(%rax), %VMM1
> > > +       vpxorq  %VMM1, %VMM0, %VMM2
> > > +       VPMINU  %VMM2, %VMM1, %VMM2
> > > +       VPTESTN %VMM2, %VMM2, %k0
> > > +       KMOV    %k0, %RCX
> > > +       test    %RCX, %RCX
> > > +       jnz     L(ret_vec_x4)
> > > +
> > > +       /* Align address to VEC_SIZE * 4 for loop.  */
> > > +       andq    $-(VEC_SIZE * 4), %rax
> > > +
> > > +       .p2align 4,,11
> > > +L(loop):
> > > +       /* The VPMINU and VPCMP combination provides better performance
> > > +          than the alternative combinations.  */
> > > +       VMOVA   (VEC_SIZE * 4)(%rax), %VMM1
> > > +       VMOVA   (VEC_SIZE * 5)(%rax), %VMM2
> > > +       VMOVA   (VEC_SIZE * 6)(%rax), %VMM3
> > > +       VMOVA   (VEC_SIZE * 7)(%rax), %VMM4
> > > +
> > > +       vpxorq  %VMM1, %VMM0, %VMM5
> > > +       VPMINU  %VMM5, %VMM1, %VMM1
> > > +
> > > +       VPCMP   $4, %VMM0, %VMM2, %k1
> > > +       VPMINU  %VMM1, %VMM2, %VMM2{%k1}{z}
> > > +
> > > +       VPCMP   $4, %VMM0, %VMM3, %k2
> > > +       VPMINU  %VMM2, %VMM3, %VMM3{%k2}{z}
> > > +
> > > +       VPCMP   $4, %VMM0, %VMM4, %k3
> > > +       VPMINU  %VMM3, %VMM4, %VMM4{%k3}{z}
> > > +
> > > +       VPTESTN %VMM4, %VMM4, %k3
> > > +
> > > +       subq    $-(VEC_SIZE * 4), %rax
> > > +       KORTEST %k3, %k3
> > > +       jz      L(loop)
> > > +
> > > +       VPTESTN %VMM1, %VMM1, %k0
> > > +       KMOV    %k0, %RCX
> > > +       test    %RCX, %RCX
> > > +       jnz     L(ret_vec_x1)
> > > +
> > > +       VPTESTN %VMM2, %VMM2, %k0
> > > +       KMOV    %k0, %RCX
> > > +       /* At this point, if k1 is nonzero, the null char must be in
> > > +          the second vector.  */
> > > +       test    %RCX, %RCX
> > > +       jnz     L(ret_vec_x2)
> > > +
> > > +       VPTESTN %VMM3, %VMM3, %k0
> > > +       KMOV    %k0, %RCX
> > > +       test    %RCX, %RCX
> > > +       jnz     L(ret_vec_x3)
> > > +       /* At this point the null [w]char must be in the fourth vector,
> > > +          so there is no need to check.  */
> > > +       KMOV    %k3, %RCX
> > > +
> > > +L(ret_vec_x4):
> > > +       bsf     %RCX, %RCX
> > > +       leaq    (VEC_SIZE * 3)(%rax, %rcx, CHAR_SIZE), %rax
> > > +# ifndef USE_AS_STRCHRNUL
> > > +       cmp     (%rax), %CHAR_REG
> > > +       jne     L(zero)
> > > +# endif
> > > +       ret
> > > +
> > > +L(ret_vec_x3):
> > > +       bsf     %RCX, %RCX
> > > +       leaq    (VEC_SIZE * 2)(%rax, %rcx, CHAR_SIZE), %rax
> > > +# ifndef USE_AS_STRCHRNUL
> > > +       cmp     (%rax), %CHAR_REG
> > > +       jne     L(zero)
> > > +# endif
> > > +       ret
> > > +
> > > +L(page_cross):
> > > +       movl    %eax, %ecx
> > > +# ifdef USE_AS_WCSCHR
> > > +       /* Calculate number of compare result bits to be skipped for
> > > +          wide string alignment adjustment.  */
> > > +       andl    $(VEC_SIZE - 1), %ecx
> > > +       sarl    $2, %ecx
> > > +# endif
> > > +       /* ecx contains number of w[char] to be skipped as a result
> > > +          of address alignment.  */
> > > +       xorq    %rdi, %rax
> > > +       VMOVA   (PAGE_SIZE - VEC_SIZE)(%rax), %VMM1
> > > +       vpxorq  %VMM1, %VMM0, %VMM2
> > > +       VPMINU  %VMM2, %VMM1, %VMM2
> > > +       VPTESTN %VMM2, %VMM2, %k0
> > > +       KMOV    %k0, %RAX
> > > +       /* Ignore number of character for alignment adjustment.  */
> > > +       SHR     %cl, %RAX
> > > +       jz      L(align_more)
> > > +
> > > +       bsf     %RAX, %RAX
> > > +# ifdef USE_AS_WCSCHR
> > > +       leaq    (%rdi, %rax, CHAR_SIZE), %rax
> > > +# else
> > > +       addq    %rdi, %rax
> > > +# endif
> > > +
> > > +# ifndef USE_AS_STRCHRNUL
> > > +       cmp     (%rax), %CHAR_REG
> > > +       jne     L(zero)
> > > +# endif
> > > +       ret
> > > +
> > > +END (STRCHR)
> > > +#endif
> > > diff --git a/sysdeps/x86_64/multiarch/strchr-evex512.S b/sysdeps/x86_64/multiarch/strchr-evex512.S
> > > new file mode 100644
> > > index 0000000000..4079bf387d
> > > --- /dev/null
> > > +++ b/sysdeps/x86_64/multiarch/strchr-evex512.S
> > > @@ -0,0 +1,7 @@
> > > +# ifndef STRCHR
> > > +#  define STRCHR       __strchr_evex512
> > > +# endif
> > > +
> > > +#define VEC_SIZE       64
> > > +
> > > +#include "strchr-evex-base.S"
> > > diff --git a/sysdeps/x86_64/multiarch/strchrnul-evex512.S b/sysdeps/x86_64/multiarch/strchrnul-evex512.S
> > > new file mode 100644
> > > index 0000000000..1be0b12f38
> > > --- /dev/null
> > > +++ b/sysdeps/x86_64/multiarch/strchrnul-evex512.S
> > > @@ -0,0 +1,8 @@
> > > +#ifndef STRCHRNUL
> > > +# define STRCHRNUL     __strchrnul_evex512
> > > +#endif
> > > +
> > > +#define STRCHR STRCHRNUL
> > > +#define USE_AS_STRCHRNUL 1
> > > +
> > > +#include "strchr-evex512.S"
> > > diff --git a/sysdeps/x86_64/multiarch/wcschr-evex512.S b/sysdeps/x86_64/multiarch/wcschr-evex512.S
> > > new file mode 100644
> > > index 0000000000..50c87ab1e5
> > > --- /dev/null
> > > +++ b/sysdeps/x86_64/multiarch/wcschr-evex512.S
> > > @@ -0,0 +1,8 @@
> > > +#ifndef WCSCHR
> > > +# define WCSCHR        __wcschr_evex512
> > > +#endif
> > > +
> > > +#define STRCHR WCSCHR
> > > +#define USE_AS_WCSCHR 1
> > > +
> > > +#include "strchr-evex512.S"
> > > --
> > > 2.36.1
> > >

ping

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH] x86_64: Implement evex512 version of strchrnul, strchr and wcschr
  2022-09-29  3:41     ` Sunil Pandey
@ 2022-09-29  4:07       ` Noah Goldstein
  2022-10-21 21:23         ` [PATCH v2] " Sunil K Pandey
  0 siblings, 1 reply; 10+ messages in thread
From: Noah Goldstein @ 2022-09-29  4:07 UTC (permalink / raw)
  To: Sunil Pandey; +Cc: GNU C Library

On Wed, Sep 28, 2022 at 8:42 PM Sunil Pandey <skpgkp2@gmail.com> wrote:
>
> On Thu, Sep 22, 2022 at 8:57 PM Sunil Pandey <skpgkp2@gmail.com> wrote:
> >
> > Microbenchmark data collected on: Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz
> >
> >
> >
> > On Wed, Sep 21, 2022 at 5:50 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > >
> > > On Wed, Sep 21, 2022 at 5:17 PM Sunil K Pandey via Libc-alpha
> > > <libc-alpha@sourceware.org> wrote:
> > > >
> > > > This patch implements following evex512 version of string functions.
> > > > evex512 version takes up to 30% less cycle as compared to evex,
> > > > depending on length and alignment.
> > >
> > > Please attach benchmark numbers.
> > > >
> > > > - strchrnul function using 512 bit vectors.
> > > > - strchr function using 512 bit vectors.
> > > > - wcschr function using 512 bit vectors.
> > > >
> > > > Code size data:
> > > >
> > > > strchrnul-evex.o        615 byte
> > > > strchrnul-evex512.o     573 byte (-7%)
> > > >
> > > > strchr-evex.o           670 byte
> > > > strchr-evex512.o        616 byte (-8%)
> > > >
> > > > wcschr-evex.o           678 byte
> > > > wcschr-evex512.o        620 byte (-9%)
> > > >
> > > > Placeholder function, not used by any processor at the moment.
> > > > ---
> > > >  sysdeps/x86_64/multiarch/Makefile            |   3 +
> > > >  sysdeps/x86_64/multiarch/ifunc-impl-list.c   |  12 +
> > > >  sysdeps/x86_64/multiarch/strchr-evex-base.S  | 294 +++++++++++++++++++
> > > >  sysdeps/x86_64/multiarch/strchr-evex512.S    |   7 +
> > > >  sysdeps/x86_64/multiarch/strchrnul-evex512.S |   8 +
> > > >  sysdeps/x86_64/multiarch/wcschr-evex512.S    |   8 +
> > > >  6 files changed, 332 insertions(+)
> > > >  create mode 100644 sysdeps/x86_64/multiarch/strchr-evex-base.S
> > > >  create mode 100644 sysdeps/x86_64/multiarch/strchr-evex512.S
> > > >  create mode 100644 sysdeps/x86_64/multiarch/strchrnul-evex512.S
> > > >  create mode 100644 sysdeps/x86_64/multiarch/wcschr-evex512.S
> > > >
> > > > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> > > > index df4601c294..89b58fa557 100644
> > > > --- a/sysdeps/x86_64/multiarch/Makefile
> > > > +++ b/sysdeps/x86_64/multiarch/Makefile
> > > > @@ -60,11 +60,13 @@ sysdep_routines += \
> > > >    strchr-avx2 \
> > > >    strchr-avx2-rtm \
> > > >    strchr-evex \
> > > > +  strchr-evex512 \
> > > >    strchr-sse2 \
> > > >    strchr-sse2-no-bsf \
> > > >    strchrnul-avx2 \
> > > >    strchrnul-avx2-rtm \
> > > >    strchrnul-evex \
> > > > +  strchrnul-evex512 \
> > > >    strchrnul-sse2 \
> > > >    strcmp-avx2 \
> > > >    strcmp-avx2-rtm \
> > > > @@ -129,6 +131,7 @@ sysdep_routines += \
> > > >    wcschr-avx2 \
> > > >    wcschr-avx2-rtm \
> > > >    wcschr-evex \
> > > > +  wcschr-evex512 \
> > > >    wcschr-sse2 \
> > > >    wcscmp-avx2 \
> > > >    wcscmp-avx2-rtm \
> > > > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > > index a71444eccb..bce1d15171 100644
> > > > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > > @@ -518,6 +518,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > > >                                       && CPU_FEATURE_USABLE (AVX512BW)
> > > >                                       && CPU_FEATURE_USABLE (BMI2)),
> > > >                                      __strchr_evex)
> > > > +             X86_IFUNC_IMPL_ADD_V4 (array, i, strchr,
> > > > +                                    (CPU_FEATURE_USABLE (AVX512VL)
> > > > +                                     && CPU_FEATURE_USABLE (AVX512BW)),
> > > > +                                    __strchr_evex512)
> > > >               X86_IFUNC_IMPL_ADD_V3 (array, i, strchr,
> > > >                                      (CPU_FEATURE_USABLE (AVX2)
> > > >                                       && CPU_FEATURE_USABLE (BMI2)),
> > > > @@ -543,6 +547,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > > >                                       && CPU_FEATURE_USABLE (AVX512BW)
> > > >                                       && CPU_FEATURE_USABLE (BMI2)),
> > > >                                      __strchrnul_evex)
> > > > +             X86_IFUNC_IMPL_ADD_V4 (array, i, strchrnul,
> > > > +                                    (CPU_FEATURE_USABLE (AVX512VL)
> > > > +                                     && CPU_FEATURE_USABLE (AVX512BW)),
> > > > +                                    __strchrnul_evex512)
> > > >               X86_IFUNC_IMPL_ADD_V3 (array, i, strchrnul,
> > > >                                      (CPU_FEATURE_USABLE (AVX2)
> > > >                                       && CPU_FEATURE_USABLE (BMI2)),
> > > > @@ -753,6 +761,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > > >                                       && CPU_FEATURE_USABLE (AVX512BW)
> > > >                                       && CPU_FEATURE_USABLE (BMI2)),
> > > >                                      __wcschr_evex)
> > > > +             X86_IFUNC_IMPL_ADD_V4 (array, i, wcschr,
> > > > +                                    (CPU_FEATURE_USABLE (AVX512VL)
> > > > +                                     && CPU_FEATURE_USABLE (AVX512BW)),
> > > > +                                    __wcschr_evex512)
> > > >               X86_IFUNC_IMPL_ADD_V3 (array, i, wcschr,
> > > >                                      (CPU_FEATURE_USABLE (AVX2)
> > > >                                       && CPU_FEATURE_USABLE (BMI2)),
> > > > diff --git a/sysdeps/x86_64/multiarch/strchr-evex-base.S b/sysdeps/x86_64/multiarch/strchr-evex-base.S
> > > > new file mode 100644
> > > > index 0000000000..919dafc8b6
> > > > --- /dev/null
> > > > +++ b/sysdeps/x86_64/multiarch/strchr-evex-base.S
> > > > @@ -0,0 +1,294 @@
> > > > +/* Placeholder function, not used by any processor at the moment.
> > > > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > > > +   This file is part of the GNU C Library.
> > > > +
> > > > +   The GNU C Library is free software; you can redistribute it and/or
> > > > +   modify it under the terms of the GNU Lesser General Public
> > > > +   License as published by the Free Software Foundation; either
> > > > +   version 2.1 of the License, or (at your option) any later version.
> > > > +
> > > > +   The GNU C Library is distributed in the hope that it will be useful,
> > > > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > > +   Lesser General Public License for more details.
> > > > +
> > > > +   You should have received a copy of the GNU Lesser General Public
> > > > +   License along with the GNU C Library; if not, see
> > > > +   <https://www.gnu.org/licenses/>.  */
> > > > +
> > > > +/* UNUSED. Exists purely as reference implementation.  */
> > > > +
> > > > +#include <isa-level.h>
> > > > +
> > > > +#if ISA_SHOULD_BUILD (4)
> > > > +
> > > > +# include <sysdep.h>
> > > > +
> > > > +# ifdef USE_AS_WCSCHR
> > > > +#  define CHAR_REG     esi
> > > > +#  define CHAR_SIZE    4
> > > > +#  define VPBROADCAST   vpbroadcastd
> > > > +#  define VPCMP                vpcmpd
> > > > +#  define VPMINU       vpminud
> > > > +#  define VPTESTN      vptestnmd
> > > > +# else
> > > > +#  define CHAR_REG     sil
> > > > +#  define CHAR_SIZE    1
> > > > +#  define VPBROADCAST   vpbroadcastb
> > > > +#  define VPCMP                vpcmpb
> > > > +#  define VPMINU       vpminub
> > > > +#  define VPTESTN      vptestnmb
> > > > +# endif
> > > > +
> > > > +# define PAGE_SIZE     4096
> > > > +# define CHAR_PER_VEC  (VEC_SIZE / CHAR_SIZE)
> > > > +# define XMM1           xmm17
> > > > +
> > > > +# if VEC_SIZE == 64
> > > > +#  define KMOV         kmovq
> > > > +#  define KORTEST      kortestq
> > > > +#  define RAX          rax
> > > > +#  define RCX          rcx
> > > > +#  define RDX          rdx
> > > > +#  define SHR          shrq
> > > > +#  define TEXTSUFFIX   evex512
> > > > +#  define VMM0         zmm16
> > > > +#  define VMM1         zmm17
> > > > +#  define VMM2         zmm18
> > > > +#  define VMM3         zmm19
> > > > +#  define VMM4         zmm20
> > > > +#  define VMM5         zmm21
> > > > +#  define VMOVA                vmovdqa64
> > > > +#  define VMOVU                vmovdqu64
> > > > +
> > > > +# elif VEC_SIZE == 32
> > > > +/* Currently Unused.  */
> > > > +#  define KMOV         kmovd
> > > > +#  define KORTEST      kortestd
> > > > +#  define RAX          eax
> > > > +#  define RCX          ecx
> > > > +#  define RDX          edx
> > > > +#  define SHR          shrl
> > > > +#  define TEXTSUFFIX   evex256
> > > > +#  define VMM0         ymm16
> > > > +#  define VMM1         ymm17
> > > > +#  define VMM2         ymm18
> > > > +#  define VMM3         ymm19
> > > > +#  define VMM4         ymm20
> > > > +#  define VMM5         ymm21
> > > > +#  define VMOVA                vmovdqa32
> > > > +#  define VMOVU                vmovdqu32
> > > > +# endif
> > > > +
> > > > +       .section .text.TEXTSUFFIX, "ax", @progbits
> > > > +/* Aligning entry point to 64 byte, provides better performance for
> > > > +   one vector length string.  */
> > > > +ENTRY_P2ALIGN (STRCHR, 6)
> > > > +
> > > > +       /* Broadcast CHAR to VMM0.  */
> > > > +       VPBROADCAST %esi, %VMM0
> > > > +       movl    %edi, %eax
> > > > +       andl    $(PAGE_SIZE - 1), %eax
> > > > +       cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
> > > > +       ja      L(page_cross)
> > > > +
> > > > +       /* Compare [w]char for null, mask bit will be set for match.  */
> > > > +       VMOVU   (%rdi), %VMM1
> > > > +
> > > > +       vpxorq  %VMM1, %VMM0, %VMM2
> > > > +       VPMINU  %VMM2, %VMM1, %VMM2
> > > > +       VPTESTN %VMM2, %VMM2, %k0
> > > > +
> > > > +       KMOV    %k0, %RAX
> > > > +# ifndef USE_AS_STRCHRNUL
> > > > +       test    %RAX, %RAX
> > > > +       jz      L(align_more)
> > > > +       bsf     %RAX, %RAX
> > > > +# else
> > > > +       /* For strchrnul, using bsf keeps the entire logic within a
> > > > +          64 byte cache line when the string is shorter than 64
> > > > +          bytes, offsetting the perf gap compared to the evex
> > > > +          version.  Although using bsf as the condition would save
> > > > +          code size, it is not preferred for the conditional jump
> > > > +          for two reasons: 1) its latency is 3; 2) unlike test, it
> > > > +          can't be micro-fused with the jump.  */
> > > > +       bsf     %RAX, %RAX
> > > > +       jz      L(align_more)
> > > > +# endif
> > > > +
> > > > +# ifdef USE_AS_WCSCHR
> > > > +       leaq    (%rdi, %rax, CHAR_SIZE), %rax
> > > > +# else
> > > > +       add     %rdi, %rax
> > > > +# endif
> > > > +# ifndef USE_AS_STRCHRNUL
> > > > +       cmp     (%rax), %CHAR_REG
> > > > +       jne     L(zero)
> > > > +# endif
> > > > +       ret
> > > > +
> > > > +# ifndef USE_AS_STRCHRNUL
> > > > +L(zero):
> > > > +       xorl    %eax, %eax
> > > > +       ret
> > > > +# endif
> > > > +
> > > > +L(ret_vec_x2):
> > > > +       subq    $-VEC_SIZE, %rax
> > > > +L(ret_vec_x1):
> > > > +       bsf     %RCX, %RCX
> > > > +# ifdef USE_AS_WCSCHR
> > > > +       leaq    (%rax, %rcx, CHAR_SIZE), %rax
> > > > +# else
> > > > +       add     %rcx, %rax
> > > > +# endif
> > > > +
> > > > +# ifndef USE_AS_STRCHRNUL
> > > > +       cmp     (%rax), %CHAR_REG
> > > > +       jne     L(zero)
> > > > +# endif
> > > > +       ret
> > > > +
> > > > +L(align_more):
> > > > +       leaq    VEC_SIZE(%rdi), %rax
> > > > +       /* Align rax to VEC_SIZE.  */
> > > > +       andq    $-VEC_SIZE, %rax
> > > > +
> > > > +       /* Loop unroll 4 times for 4 vector loop.  */
> > > > +       VMOVA   (%rax), %VMM1
> > > > +       vpxorq  %VMM1, %VMM0, %VMM2
> > > > +       VPMINU  %VMM2, %VMM1, %VMM2
> > > > +       VPTESTN %VMM2, %VMM2, %k0
> > > > +
> > > > +       KMOV    %k0, %RCX
> > > > +       test    %RCX, %RCX
> > > > +       jnz     L(ret_vec_x1)
> > > > +
> > > > +       VMOVA   VEC_SIZE(%rax), %VMM1
> > > > +       vpxorq  %VMM1, %VMM0, %VMM2
> > > > +       VPMINU  %VMM2, %VMM1, %VMM2
> > > > +       VPTESTN %VMM2, %VMM2, %k0
> > > > +
> > > > +       KMOV    %k0, %RCX
> > > > +       test    %RCX, %RCX
> > > > +       jnz     L(ret_vec_x2)
> > > > +
> > > > +       VMOVA   (VEC_SIZE * 2)(%rax), %VMM1
> > > > +       vpxorq  %VMM1, %VMM0, %VMM2
> > > > +       VPMINU  %VMM2, %VMM1, %VMM2
> > > > +       VPTESTN %VMM2, %VMM2, %k0
> > > > +       KMOV    %k0, %RCX
> > > > +       test    %RCX, %RCX
> > > > +       jnz     L(ret_vec_x3)
> > > > +
> > > > +       VMOVA   (VEC_SIZE * 3)(%rax), %VMM1
> > > > +       vpxorq  %VMM1, %VMM0, %VMM2
> > > > +       VPMINU  %VMM2, %VMM1, %VMM2
> > > > +       VPTESTN %VMM2, %VMM2, %k0
> > > > +       KMOV    %k0, %RCX
> > > > +       test    %RCX, %RCX
> > > > +       jnz     L(ret_vec_x4)
> > > > +
> > > > +       /* Align address to VEC_SIZE * 4 for loop.  */
> > > > +       andq    $-(VEC_SIZE * 4), %rax
> > > > +
> > > > +       .p2align 4,,11
> > > > +L(loop):
> > > > +       /* VPMINU and VPCMP combination provide better performance as
> > > > +          compared to alternative combinations.  */
> > > > +       VMOVA   (VEC_SIZE * 4)(%rax), %VMM1
> > > > +       VMOVA   (VEC_SIZE * 5)(%rax), %VMM2
> > > > +       VMOVA   (VEC_SIZE * 6)(%rax), %VMM3
> > > > +       VMOVA   (VEC_SIZE * 7)(%rax), %VMM4
> > > > +
> > > > +       vpxorq  %VMM1, %VMM0, %VMM5
> > > > +       VPMINU  %VMM5, %VMM1, %VMM1
> > > > +
> > > > +       VPCMP   $4, %VMM0, %VMM2, %k1
> > > > +       VPMINU  %VMM1, %VMM2, %VMM2{%k1}{z}
> > > > +
> > > > +       VPCMP   $4, %VMM0, %VMM3, %k2
> > > > +       VPMINU  %VMM2, %VMM3, %VMM3{%k2}{z}
> > > > +
> > > > +       VPCMP   $4, %VMM0, %VMM4, %k3
> > > > +       VPMINU  %VMM3, %VMM4, %VMM4{%k3}{z}
> > > > +
> > > > +       VPTESTN %VMM4, %VMM4, %k3
> > > > +
> > > > +       subq    $-(VEC_SIZE * 4), %rax
> > > > +       KORTEST %k3, %k3
> > > > +       jz      L(loop)
> > > > +
> > > > +       VPTESTN %VMM1, %VMM1, %k0
> > > > +       KMOV    %k0, %RCX
> > > > +       test    %RCX, %RCX
> > > > +       jnz     L(ret_vec_x1)
> > > > +
> > > > +       VPTESTN %VMM2, %VMM2, %k0
> > > > +       KMOV    %k0, %RCX
> > > > +       /* At this point, if k1 is non zero, null char must be in the
> > > > +          second vector.  */
> > > > +       test    %RCX, %RCX
> > > > +       jnz     L(ret_vec_x2)
> > > > +
> > > > +       VPTESTN %VMM3, %VMM3, %k0
> > > > +       KMOV    %k0, %RCX
> > > > +       test    %RCX, %RCX
> > > > +       jnz     L(ret_vec_x3)
> > > > +       /* At this point null [w]char must be in the fourth vector so no
> > > > +          need to check.  */
> > > > +       KMOV    %k3, %RCX
> > > > +
> > > > +L(ret_vec_x4):
> > > > +       bsf     %RCX, %RCX
> > > > +       leaq    (VEC_SIZE * 3)(%rax, %rcx, CHAR_SIZE), %rax
> > > > +# ifndef USE_AS_STRCHRNUL
> > > > +       cmp     (%rax), %CHAR_REG
> > > > +       jne     L(zero)
> > > > +# endif
> > > > +       ret
> > > > +
> > > > +L(ret_vec_x3):
> > > > +       bsf     %RCX, %RCX
> > > > +       leaq    (VEC_SIZE * 2)(%rax, %rcx, CHAR_SIZE), %rax
> > > > +# ifndef USE_AS_STRCHRNUL
> > > > +       cmp     (%rax), %CHAR_REG
> > > > +       jne     L(zero)
> > > > +# endif
> > > > +       ret
> > > > +
> > > > +L(page_cross):
> > > > +       movl    %eax, %ecx
> > > > +# ifdef USE_AS_WCSCHR
> > > > +       /* Calculate number of compare result bits to be skipped for
> > > > +          wide string alignment adjustment.  */
> > > > +       andl    $(VEC_SIZE - 1), %ecx
> > > > +       sarl    $2, %ecx
> > > > +# endif
> > > > +       /* ecx contains number of w[char] to be skipped as a result
> > > > +          of address alignment.  */
> > > > +       xorq    %rdi, %rax
> > > > +       VMOVA   (PAGE_SIZE - VEC_SIZE)(%rax), %VMM1
> > > > +       vpxorq  %VMM1, %VMM0, %VMM2
> > > > +       VPMINU  %VMM2, %VMM1, %VMM2
> > > > +       VPTESTN %VMM2, %VMM2, %k0
> > > > +       KMOV    %k0, %RAX
> > > > +       /* Ignore number of character for alignment adjustment.  */
> > > > +       SHR     %cl, %RAX
> > > > +       jz      L(align_more)
> > > > +
> > > > +       bsf     %RAX, %RAX
> > > > +# ifdef USE_AS_WCSCHR
> > > > +       leaq    (%rdi, %rax, CHAR_SIZE), %rax
> > > > +# else
> > > > +       addq    %rdi, %rax
> > > > +# endif
> > > > +
> > > > +# ifndef USE_AS_STRCHRNUL
> > > > +       cmp     (%rax), %CHAR_REG
> > > > +       jne     L(zero)
> > > > +# endif
> > > > +       ret
> > > > +
> > > > +END (STRCHR)
> > > > +#endif
> > > > diff --git a/sysdeps/x86_64/multiarch/strchr-evex512.S b/sysdeps/x86_64/multiarch/strchr-evex512.S
> > > > new file mode 100644
> > > > index 0000000000..4079bf387d
> > > > --- /dev/null
> > > > +++ b/sysdeps/x86_64/multiarch/strchr-evex512.S
> > > > @@ -0,0 +1,7 @@
> > > > +# ifndef STRCHR
> > > > +#  define STRCHR       __strchr_evex512
> > > > +# endif
> > > > +
> > > > +#define VEC_SIZE       64
> > > > +
> > > > +#include "strchr-evex-base.S"
> > > > diff --git a/sysdeps/x86_64/multiarch/strchrnul-evex512.S b/sysdeps/x86_64/multiarch/strchrnul-evex512.S
> > > > new file mode 100644
> > > > index 0000000000..1be0b12f38
> > > > --- /dev/null
> > > > +++ b/sysdeps/x86_64/multiarch/strchrnul-evex512.S
> > > > @@ -0,0 +1,8 @@
> > > > +#ifndef STRCHRNUL
> > > > +# define STRCHRNUL     __strchrnul_evex512
> > > > +#endif
> > > > +
> > > > +#define STRCHR STRCHRNUL
> > > > +#define USE_AS_STRCHRNUL 1
> > > > +
> > > > +#include "strchr-evex512.S"
> > > > diff --git a/sysdeps/x86_64/multiarch/wcschr-evex512.S b/sysdeps/x86_64/multiarch/wcschr-evex512.S
> > > > new file mode 100644
> > > > index 0000000000..50c87ab1e5
> > > > --- /dev/null
> > > > +++ b/sysdeps/x86_64/multiarch/wcschr-evex512.S
> > > > @@ -0,0 +1,8 @@
> > > > +#ifndef WCSCHR
> > > > +# define WCSCHR        __wcschr_evex512
> > > > +#endif
> > > > +
> > > > +#define STRCHR WCSCHR
> > > > +#define USE_AS_WCSCHR 1
> > > > +
> > > > +#include "strchr-evex512.S"
> > > > --
> > > > 2.36.1
> > > >
>
> ping
see my reply to strrchr.

^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH v2] x86_64: Implement evex512 version of strchrnul, strchr and wcschr
  2022-09-29  4:07       ` Noah Goldstein
@ 2022-10-21 21:23         ` Sunil K Pandey
  2022-10-25 23:35           ` [PATCH v3] " Sunil K Pandey
  0 siblings, 1 reply; 10+ messages in thread
From: Sunil K Pandey @ 2022-10-21 21:23 UTC (permalink / raw)
  To: libc-alpha

Changes from v1:
- Use VEC API.
- Replace vec load with vec load+op where possible.
- Replace extra lea in align_more with add.
- Restructure loop logic.
- Create zero_2 to avoid long jmp.
- Combine first, second and third vector return logic, as sketched below.
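
The combined return path keeps a single bsf/offset tail; each earlier label
only adds one more VEC_SIZE before falling through.  Roughly, in C (a
hypothetical sketch for the byte case; VEC_SIZE, CHAR_SIZE and
__builtin_ctzl stand in for the assembly's VEC_SIZE, CHAR_SIZE and bsf):

```
#include <stddef.h>

#define VEC_SIZE  64
#define CHAR_SIZE 1

/* mask must be nonzero here, mirroring the bsf precondition; the
   strchr-only NULL check is omitted.  */
static const char *
ret_tail (const char *p, unsigned long mask, int vec_index)
{
  switch (vec_index)
    {
    case 3: p += VEC_SIZE;	/* L(ret_vec_x3): fall through.  */
    case 2: p += VEC_SIZE;	/* L(ret_vec_x2): fall through.  */
    case 1: break;		/* L(ret_vec_x1).  */
    }
  return p + (size_t) __builtin_ctzl (mask) * CHAR_SIZE;
}
```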

This patch implements the following evex512 versions of string functions.
The evex512 version takes up to 30% fewer cycles than evex, depending on
length and alignment.

- strchrnul function using 512 bit vectors.
- strchr function using 512 bit vectors.
- wcschr function using 512 bit vectors.

Code size data:

strchrnul-evex.o	599 byte
strchrnul-evex512.o	547 byte (-9%)

strchr-evex.o		639 byte
strchr-evex512.o	577 byte (-10%)

wcschr-evex.o		644 byte
wcschr-evex512.o	572 byte (-11%)

Placeholder function, not used by any processor at the moment.
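
The per-vector test finds either CHAR or the terminating null in one pass:
XOR-ing with the broadcast CHAR turns matching elements into zero, and the
unsigned minimum with the original data is zero exactly where the element
is CHAR or null.  In scalar C the idea is roughly (a minimal sketch, one
byte at a time instead of a whole vector; the name is illustrative only):

```
#include <stdint.h>

/* Nonzero when byte x is either the search character c or '\0'.  */
static inline int
byte_matches (uint8_t x, uint8_t c)
{
  uint8_t t = x ^ c;		/* 0 iff x == c.  */
  uint8_t m = t < x ? t : x;	/* unsigned min: 0 iff x == c || x == 0.  */
  return m == 0;
}
```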
---
 sysdeps/x86_64/multiarch/Makefile            |   3 +
 sysdeps/x86_64/multiarch/ifunc-impl-list.c   |  12 +
 sysdeps/x86_64/multiarch/strchr-evex-base.S  | 270 +++++++++++++++++++
 sysdeps/x86_64/multiarch/strchr-evex512.S    |   8 +
 sysdeps/x86_64/multiarch/strchrnul-evex512.S |   8 +
 sysdeps/x86_64/multiarch/wcschr-evex512.S    |   9 +
 6 files changed, 310 insertions(+)
 create mode 100644 sysdeps/x86_64/multiarch/strchr-evex-base.S
 create mode 100644 sysdeps/x86_64/multiarch/strchr-evex512.S
 create mode 100644 sysdeps/x86_64/multiarch/strchrnul-evex512.S
 create mode 100644 sysdeps/x86_64/multiarch/wcschr-evex512.S

diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index e974b1ad97..597ac9d5e9 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -62,11 +62,13 @@ sysdep_routines += \
   strchr-avx2 \
   strchr-avx2-rtm \
   strchr-evex \
+  strchr-evex512 \
   strchr-sse2 \
   strchr-sse2-no-bsf \
   strchrnul-avx2 \
   strchrnul-avx2-rtm \
   strchrnul-evex \
+  strchrnul-evex512 \
   strchrnul-sse2 \
   strcmp-avx2 \
   strcmp-avx2-rtm \
@@ -131,6 +133,7 @@ sysdep_routines += \
   wcschr-avx2 \
   wcschr-avx2-rtm \
   wcschr-evex \
+  wcschr-evex512 \
   wcschr-sse2 \
   wcscmp-avx2 \
   wcscmp-avx2-rtm \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 529c0b0ef0..c3d75a09f4 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -544,6 +544,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 				      && CPU_FEATURE_USABLE (AVX512BW)
 				      && CPU_FEATURE_USABLE (BMI2)),
 				     __strchr_evex)
+	      X86_IFUNC_IMPL_ADD_V4 (array, i, strchr,
+				     (CPU_FEATURE_USABLE (AVX512VL)
+				      && CPU_FEATURE_USABLE (AVX512BW)),
+				     __strchr_evex512)
 	      X86_IFUNC_IMPL_ADD_V3 (array, i, strchr,
 				     (CPU_FEATURE_USABLE (AVX2)
 				      && CPU_FEATURE_USABLE (BMI2)),
@@ -569,6 +573,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 				      && CPU_FEATURE_USABLE (AVX512BW)
 				      && CPU_FEATURE_USABLE (BMI2)),
 				     __strchrnul_evex)
+	      X86_IFUNC_IMPL_ADD_V4 (array, i, strchrnul,
+				     (CPU_FEATURE_USABLE (AVX512VL)
+				      && CPU_FEATURE_USABLE (AVX512BW)),
+				     __strchrnul_evex512)
 	      X86_IFUNC_IMPL_ADD_V3 (array, i, strchrnul,
 				     (CPU_FEATURE_USABLE (AVX2)
 				      && CPU_FEATURE_USABLE (BMI2)),
@@ -793,6 +801,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 				      && CPU_FEATURE_USABLE (AVX512BW)
 				      && CPU_FEATURE_USABLE (BMI2)),
 				     __wcschr_evex)
+	      X86_IFUNC_IMPL_ADD_V4 (array, i, wcschr,
+				     (CPU_FEATURE_USABLE (AVX512VL)
+				      && CPU_FEATURE_USABLE (AVX512BW)),
+				     __wcschr_evex512)
 	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcschr,
 				     (CPU_FEATURE_USABLE (AVX2)
 				      && CPU_FEATURE_USABLE (BMI2)),
diff --git a/sysdeps/x86_64/multiarch/strchr-evex-base.S b/sysdeps/x86_64/multiarch/strchr-evex-base.S
new file mode 100644
index 0000000000..eb22171954
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strchr-evex-base.S
@@ -0,0 +1,270 @@
+/* Placeholder function, not used by any processor at the moment.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* UNUSED. Exists purely as reference implementation.  */
+
+#include <isa-level.h>
+
+#if ISA_SHOULD_BUILD (4)
+
+# include <sysdep.h>
+
+# ifdef USE_AS_WCSCHR
+#  define CHAR_REG	esi
+#  define CHAR_SIZE	4
+#  define VPBROADCAST   vpbroadcastd
+#  define VPCMP		vpcmpd
+#  define VPCMPNE	vpcmpneqd
+#  define VPMINU	vpminud
+#  define VPTEST	vptestmd
+#  define VPTESTN	vptestnmd
+# else
+#  define CHAR_REG	sil
+#  define CHAR_SIZE	1
+#  define VPBROADCAST   vpbroadcastb
+#  define VPCMP		vpcmpb
+#  define VPCMPNE	vpcmpneqb
+#  define VPMINU	vpminub
+#  define VPTEST	vptestmb
+#  define VPTESTN	vptestnmb
+# endif
+
+# define PAGE_SIZE	4096
+# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
+
+	.section SECTION(.text), "ax", @progbits
+/* Aligning the entry point to 64 bytes provides better performance for
+   strings that fit in one vector.  */
+ENTRY_P2ALIGN (STRCHR, 6)
+
+	/* Broadcast CHAR to VMM(0).  */
+	VPBROADCAST %esi, %VMM(0)
+	movl	%edi, %eax
+	andl	$(PAGE_SIZE - 1), %eax
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	ja	L(page_cross)
+
+	/* Compare [w]char for null, mask bit will be set for match.  */
+	vpxorq	(%rdi), %VMM(0), %VMM(1)
+	VPMINU	(%rdi), %VMM(1), %VMM(1)
+	VPTESTN	%VMM(1), %VMM(1), %k0
+
+	KMOV	%k0, %VRAX
+	bsf	%VRAX, %VRAX
+	jz	L(align_more)
+
+# ifdef USE_AS_WCSCHR
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+# else
+	add	%rdi, %rax
+# endif
+# ifndef USE_AS_STRCHRNUL
+	cmp	(%rax), %CHAR_REG
+	jne	L(zero)
+	ret
+L(zero):
+	xorl	%eax, %eax
+# endif
+	ret
+
+L(ret_vec_x3):
+	subq	$-VEC_SIZE, %rdi
+L(ret_vec_x2):
+	subq	$-VEC_SIZE, %rdi
+L(ret_vec_x1):
+	bsf     %VRAX, %VRAX
+# ifdef USE_AS_WCSCHR
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+# else
+	add	%rdi, %rax
+# endif
+
+# ifndef USE_AS_STRCHRNUL
+	cmp	(%rax), %CHAR_REG
+	jne	L(zero)
+# endif
+	ret
+
+L(page_cross):
+	movl	%eax, %ecx
+# ifdef USE_AS_WCSCHR
+	/* Calculate number of compare result bits to be skipped for
+	   wide string alignment adjustment.  */
+	andl	$(VEC_SIZE - 1), %ecx
+	sarl	$2, %ecx
+# endif
+	/* ecx contains the number of [w]chars to be skipped as a result
+	   of address alignment.  */
+	xorq	%rdi, %rax
+	vpxorq	(PAGE_SIZE - VEC_SIZE)(%rax), %VMM(0), %VMM(1)
+	VPMINU	(PAGE_SIZE - VEC_SIZE)(%rax), %VMM(1), %VMM(1)
+	VPTESTN	%VMM(1), %VMM(1), %k0
+	KMOV	%k0, %VRAX
+	/* Discard mask bits for the characters skipped due to the
+	   alignment adjustment.  */
+	shr	%cl, %VRAX
+	jz	L(align_more)
+
+	bsf	%VRAX, %VRAX
+# ifdef USE_AS_WCSCHR
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+# else
+	addq	%rdi, %rax
+# endif
+
+# ifndef USE_AS_STRCHRNUL
+	cmp	(%rax), %CHAR_REG
+	jne	L(zero)
+# endif
+	ret
+
+L(align_more):
+	/* Align rdi to VEC_SIZE.  */
+	andq	$-VEC_SIZE, %rdi
+
+	/* Loop unroll 4 times for 4 vector loop.  */
+	vpxorq	VEC_SIZE(%rdi), %VMM(0), %VMM(1)
+	VPMINU	VEC_SIZE(%rdi), %VMM(1), %VMM(1)
+
+	/* Add VEC_SIZE here so that, on a match, L(ret_vec_x1) can be
+	   used directly.  Doing the add early reduces the dependency on
+	   the vector load.  */
+	subq	$-VEC_SIZE, %rdi
+
+	VPTESTN	%VMM(1), %VMM(1), %k0
+
+	KMOV	%k0, %VRAX
+	test	%VRAX, %VRAX
+	jnz	L(ret_vec_x1)
+
+	vpxorq	VEC_SIZE(%rdi), %VMM(0), %VMM(1)
+	VPMINU	VEC_SIZE(%rdi), %VMM(1), %VMM(1)
+	VPTESTN	%VMM(1), %VMM(1), %k0
+
+	KMOV	%k0, %VRAX
+	test	%VRAX, %VRAX
+	jnz	L(ret_vec_x2)
+
+	vpxorq	(VEC_SIZE * 2)(%rdi), %VMM(0), %VMM(1)
+	VPMINU	(VEC_SIZE * 2)(%rdi), %VMM(1), %VMM(1)
+	VPTESTN	%VMM(1), %VMM(1), %k0
+
+	KMOV	%k0, %VRAX
+	test	%VRAX, %VRAX
+	jnz	L(ret_vec_x3)
+
+	vpxorq	(VEC_SIZE * 3)(%rdi), %VMM(0), %VMM(1)
+	VPMINU	(VEC_SIZE * 3)(%rdi), %VMM(1), %VMM(1)
+	VPTESTN	%VMM(1), %VMM(1), %k0
+
+	KMOV	%k0, %VRDX
+	test	%VRDX, %VRDX
+	jnz	L(ret_vec_x4)
+
+	/* Align address to VEC_SIZE * 4 for loop.  */
+	andq	$-(VEC_SIZE * 4), %rdi
+L(loop):
+	/* The VPMINU and VPCMP combination provides better performance
+	   than the alternative combinations.  */
+	VMOVA	(VEC_SIZE * 4)(%rdi), %VMM(1)
+	VMOVA	(VEC_SIZE * 6)(%rdi), %VMM(3)
+
+	VPCMPNE	%VMM(1), %VMM(0), %k1
+	VPCMPNE	(VEC_SIZE * 5)(%rdi), %VMM(0), %k2
+
+	VPMINU	(VEC_SIZE * 5)(%rdi), %VMM(1), %VMM(2)
+
+	VPCMPNE	%VMM(3), %VMM(0), %k3{%k1}
+	VPCMPNE	(VEC_SIZE * 7)(%rdi), %VMM(0), %k4{%k2}
+
+	VPMINU	(VEC_SIZE * 7)(%rdi), %VMM(3), %VMM(4)
+	VPMINU	%VMM(2), %VMM(4), %VMM(4){%k3}{z}
+
+	VPTEST	%VMM(4), %VMM(4), %k5{%k4}
+
+	KMOV	%k5, %VRDX
+	subq	$-(VEC_SIZE * 4), %rdi
+# ifdef USE_AS_WCSCHR
+#  if CHAR_PER_VEC == 8
+	sub	$0xff, %VRDX
+#  else
+	sub	$0xffff, %VRDX
+#  endif
+# else
+	inc	%VRDX
+# endif
+	jz	L(loop)
+
+	VPTEST	%VMM(1), %VMM(1), %k0{%k1}
+	KMOV	%k0, %VRAX
+# ifdef USE_AS_WCSCHR
+#  if CHAR_PER_VEC == 8
+	sub	$0xff, %VRAX
+#  else
+	sub	$0xffff, %VRAX
+#  endif
+# else
+	inc	%VRAX
+# endif
+	jnz	L(ret_vec_x1)
+
+	VPTEST	%VMM(2), %VMM(2), %k0{%k2}
+	KMOV	%k0, %VRAX
+	/* At this point there was no match in the first vector, so any
+	   match (CHAR or null) found here must be in the second vector.  */
+# ifdef USE_AS_WCSCHR
+#  if CHAR_PER_VEC == 8
+	sub	$0xff, %VRAX
+#  else
+	sub	$0xffff, %VRAX
+#  endif
+# else
+	inc	%VRAX
+# endif
+	jnz	L(ret_vec_x2)
+
+	VPTEST	%VMM(3), %VMM(3), %k0{%k3}
+	KMOV	%k0, %VRAX
+# ifdef USE_AS_WCSCHR
+#  if CHAR_PER_VEC == 8
+	sub	$0xff, %VRAX
+#  else
+	sub	$0xffff, %VRAX
+#  endif
+# else
+	inc	%VRAX
+# endif
+	jnz	L(ret_vec_x3)
+	/* At this point the match (CHAR or null) must be in the fourth
+	   vector, so there is no need to check.  */
+
+L(ret_vec_x4):
+	bsf	%VRDX, %VRAX
+	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+# ifndef USE_AS_STRCHRNUL
+	cmp	(%rax), %CHAR_REG
+	jne	L(zero_2)
+# endif
+	ret
+
+# ifndef USE_AS_STRCHRNUL
+L(zero_2):
+	xor	%eax, %eax
+	ret
+# endif
+END (STRCHR)
+#endif
diff --git a/sysdeps/x86_64/multiarch/strchr-evex512.S b/sysdeps/x86_64/multiarch/strchr-evex512.S
new file mode 100644
index 0000000000..a4ac022952
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strchr-evex512.S
@@ -0,0 +1,8 @@
+# ifndef STRCHR
+#  define STRCHR	__strchr_evex512
+# endif
+
+#include "x86-evex512-vecs.h"
+#include "reg-macros.h"
+
+#include "strchr-evex-base.S"
diff --git a/sysdeps/x86_64/multiarch/strchrnul-evex512.S b/sysdeps/x86_64/multiarch/strchrnul-evex512.S
new file mode 100644
index 0000000000..1be0b12f38
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strchrnul-evex512.S
@@ -0,0 +1,8 @@
+#ifndef STRCHRNUL
+# define STRCHRNUL	__strchrnul_evex512
+#endif
+
+#define STRCHR	STRCHRNUL
+#define USE_AS_STRCHRNUL 1
+
+#include "strchr-evex512.S"
diff --git a/sysdeps/x86_64/multiarch/wcschr-evex512.S b/sysdeps/x86_64/multiarch/wcschr-evex512.S
new file mode 100644
index 0000000000..3fe4e77a70
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcschr-evex512.S
@@ -0,0 +1,9 @@
+#ifndef WCSCHR
+# define WCSCHR	__wcschr_evex512
+#endif
+
+#define STRCHR	WCSCHR
+#define USE_AS_WCSCHR 1
+
+#define USE_WIDE_CHAR 1
+#include "strchr-evex512.S"
-- 
2.36.1


^ permalink raw reply	[flat|nested] 10+ messages in thread

* [PATCH v3] x86_64: Implement evex512 version of strchrnul, strchr and wcschr
  2022-10-21 21:23         ` [PATCH v2] " Sunil K Pandey
@ 2022-10-25 23:35           ` Sunil K Pandey
  2022-10-26  1:35             ` Noah Goldstein
  0 siblings, 1 reply; 10+ messages in thread
From: Sunil K Pandey @ 2022-10-25 23:35 UTC (permalink / raw)
  To: libc-alpha

Changes from v2:
- Replace 2x load with mask logic.
Changes from v1:
- Use VEC API.
- Replace extra lea in align_more with add.
- Restructure loop logic.
- Create zero_2 to avoid long jmp.
- Combine first, second and third vector return logic.

This patch implements the following evex512 versions of string functions.
The evex512 version takes up to 30% fewer cycles than evex, depending on
length and alignment.  A short C sketch of the semantics being implemented
follows the list below.

- strchrnul function using 512 bit vectors.
- strchr function using 512 bit vectors.
- wcschr function using 512 bit vectors.
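
For reference, the semantics the three entry points share can be sketched
in plain C as below (not the committed code; wcschr is the same logic over
wchar_t elements):

```
#include <stddef.h>

/* strchrnul: pointer to the first occurrence of c, or to the
   terminating null if c does not occur.  */
static char *
ref_strchrnul (const char *s, int c)
{
  while (*s != (char) c && *s != '\0')
    ++s;
  return (char *) s;
}

/* strchr only differs in mapping "not found" to NULL.  */
static char *
ref_strchr (const char *s, int c)
{
  char *p = ref_strchrnul (s, c);
  return *p == (char) c ? p : NULL;
}
```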

Code size data:

strchrnul-evex.o	599 byte
strchrnul-evex512.o	569 byte (-5%)

strchr-evex.o		639 byte
strchr-evex512.o	595 byte (-7%)

wcschr-evex.o		644 byte
wcschr-evex512.o	607 byte (-6%)

Placeholder function, not used by any processor at the moment.
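
With the masked compares, the k-register holds a set bit for every element
that is neither CHAR nor null, so "no match in this vector" shows up as an
all-ones mask.  For byte elements all 64 mask bits are meaningful and a
single inc detects that pattern; for wide chars only CHAR_PER_VEC bits are
meaningful, hence the sub of VEC_MATCH_MASK.  A rough C equivalent of the
check (hypothetical helper, not part of the patch):

```
#include <stdint.h>

/* 'nomatch' has one bit per [w]char, set when the element is neither
   CHAR nor null; chars_per_vec is CHAR_PER_VEC.  */
static inline int
vec_has_match (uint64_t nomatch, unsigned int chars_per_vec)
{
  if (chars_per_vec == 64)
    /* Every mask bit is meaningful: incrementing wraps to zero only
       when all bits are set, i.e. when nothing matched ('inc').  */
    return nomatch + 1 != 0;
  /* Only the low chars_per_vec bits are meaningful: compare against
     the all-ones constant for that width ('sub $VEC_MATCH_MASK').  */
  return nomatch != (((uint64_t) 1 << chars_per_vec) - 1);
}
```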
---
 sysdeps/x86_64/multiarch/Makefile            |   3 +
 sysdeps/x86_64/multiarch/ifunc-impl-list.c   |  12 +
 sysdeps/x86_64/multiarch/strchr-evex-base.S  | 282 +++++++++++++++++++
 sysdeps/x86_64/multiarch/strchr-evex512.S    |   8 +
 sysdeps/x86_64/multiarch/strchrnul-evex512.S |   8 +
 sysdeps/x86_64/multiarch/wcschr-evex512.S    |   9 +
 6 files changed, 322 insertions(+)
 create mode 100644 sysdeps/x86_64/multiarch/strchr-evex-base.S
 create mode 100644 sysdeps/x86_64/multiarch/strchr-evex512.S
 create mode 100644 sysdeps/x86_64/multiarch/strchrnul-evex512.S
 create mode 100644 sysdeps/x86_64/multiarch/wcschr-evex512.S

diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index e974b1ad97..597ac9d5e9 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -62,11 +62,13 @@ sysdep_routines += \
   strchr-avx2 \
   strchr-avx2-rtm \
   strchr-evex \
+  strchr-evex512 \
   strchr-sse2 \
   strchr-sse2-no-bsf \
   strchrnul-avx2 \
   strchrnul-avx2-rtm \
   strchrnul-evex \
+  strchrnul-evex512 \
   strchrnul-sse2 \
   strcmp-avx2 \
   strcmp-avx2-rtm \
@@ -131,6 +133,7 @@ sysdep_routines += \
   wcschr-avx2 \
   wcschr-avx2-rtm \
   wcschr-evex \
+  wcschr-evex512 \
   wcschr-sse2 \
   wcscmp-avx2 \
   wcscmp-avx2-rtm \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 529c0b0ef0..c3d75a09f4 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -544,6 +544,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 				      && CPU_FEATURE_USABLE (AVX512BW)
 				      && CPU_FEATURE_USABLE (BMI2)),
 				     __strchr_evex)
+	      X86_IFUNC_IMPL_ADD_V4 (array, i, strchr,
+				     (CPU_FEATURE_USABLE (AVX512VL)
+				      && CPU_FEATURE_USABLE (AVX512BW)),
+				     __strchr_evex512)
 	      X86_IFUNC_IMPL_ADD_V3 (array, i, strchr,
 				     (CPU_FEATURE_USABLE (AVX2)
 				      && CPU_FEATURE_USABLE (BMI2)),
@@ -569,6 +573,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 				      && CPU_FEATURE_USABLE (AVX512BW)
 				      && CPU_FEATURE_USABLE (BMI2)),
 				     __strchrnul_evex)
+	      X86_IFUNC_IMPL_ADD_V4 (array, i, strchrnul,
+				     (CPU_FEATURE_USABLE (AVX512VL)
+				      && CPU_FEATURE_USABLE (AVX512BW)),
+				     __strchrnul_evex512)
 	      X86_IFUNC_IMPL_ADD_V3 (array, i, strchrnul,
 				     (CPU_FEATURE_USABLE (AVX2)
 				      && CPU_FEATURE_USABLE (BMI2)),
@@ -793,6 +801,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 				      && CPU_FEATURE_USABLE (AVX512BW)
 				      && CPU_FEATURE_USABLE (BMI2)),
 				     __wcschr_evex)
+	      X86_IFUNC_IMPL_ADD_V4 (array, i, wcschr,
+				     (CPU_FEATURE_USABLE (AVX512VL)
+				      && CPU_FEATURE_USABLE (AVX512BW)),
+				     __wcschr_evex512)
 	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcschr,
 				     (CPU_FEATURE_USABLE (AVX2)
 				      && CPU_FEATURE_USABLE (BMI2)),
diff --git a/sysdeps/x86_64/multiarch/strchr-evex-base.S b/sysdeps/x86_64/multiarch/strchr-evex-base.S
new file mode 100644
index 0000000000..21a6bc5907
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strchr-evex-base.S
@@ -0,0 +1,282 @@
+/* Placeholder function, not used by any processor at the moment.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* UNUSED. Exists purely as reference implementation.  */
+
+#include <isa-level.h>
+
+#if ISA_SHOULD_BUILD (4)
+
+# include <sysdep.h>
+
+# ifdef USE_AS_WCSCHR
+#  define CHAR_REG	esi
+#  define CHAR_SIZE	4
+#  define VPBROADCAST   vpbroadcastd
+#  define VPCMP		vpcmpd
+#  define VPCMPNE	vpcmpneqd
+#  define VPMINU	vpminud
+#  define VPTEST	vptestmd
+#  define VPTESTN	vptestnmd
+# else
+#  define CHAR_REG	sil
+#  define CHAR_SIZE	1
+#  define VPBROADCAST   vpbroadcastb
+#  define VPCMP		vpcmpb
+#  define VPCMPNE	vpcmpneqb
+#  define VPMINU	vpminub
+#  define VPTEST	vptestmb
+#  define VPTESTN	vptestnmb
+# endif
+
+# define PAGE_SIZE	4096
+# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
+# define VEC_MATCH_MASK ((1 << CHAR_PER_VEC) - 1)
+
+	.section SECTION(.text), "ax", @progbits
+/* Aligning the entry point to 64 bytes provides better performance for
+   strings that fit in one vector.  */
+ENTRY_P2ALIGN (STRCHR, 6)
+
+	/* Broadcast CHAR to VMM(0).  */
+	VPBROADCAST %esi, %VMM(0)
+	movl	%edi, %eax
+	sall	$20,%eax
+	cmpl	$((PAGE_SIZE - VEC_SIZE) << 20), %eax
+	ja	L(page_cross)
+
+	VMOVU	(%rdi), %VMM(1)
+	VPCMPNE	%VMM(1), %VMM(0), %k1
+	VPTEST	%VMM(1), %VMM(1), %k0{%k1}
+	KMOV	%k0, %VRAX
+	/* The mask bit is zero for every [w]char that matches CHAR or
+	   null, so an all-ones mask means no match.  */
+
+# ifdef USE_AS_WCSCHR
+	sub	$VEC_MATCH_MASK, %VRAX
+# else
+	inc	%VRAX
+# endif
+	jz	L(align_more)
+
+	bsf	%VRAX, %VRAX
+
+# ifdef USE_AS_WCSCHR
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+# else
+	add	%rdi, %rax
+# endif
+# ifndef USE_AS_STRCHRNUL
+	cmp	(%rax), %CHAR_REG
+	jne	L(zero)
+	ret
+L(zero):
+	xorl	%eax, %eax
+# endif
+	ret
+
+L(ret_vec_x3):
+	subq	$-VEC_SIZE, %rdi
+L(ret_vec_x2):
+	subq	$-VEC_SIZE, %rdi
+L(ret_vec_x1):
+	bsf     %VRAX, %VRAX
+# ifdef USE_AS_WCSCHR
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+# else
+	add	%rdi, %rax
+# endif
+
+# ifndef USE_AS_STRCHRNUL
+	cmp	(%rax), %CHAR_REG
+	jne	L(zero)
+# endif
+	ret
+
+L(page_cross):
+	mov	%rdi, %rax
+	movl	%edi, %ecx
+# ifdef USE_AS_WCSCHR
+	/* Calculate number of compare result bits to be skipped for
+	   wide string alignment adjustment.  */
+	andl	$(VEC_SIZE - 1), %ecx
+	sarl	$2, %ecx
+# endif
+	/* ecx contains the number of [w]chars to be skipped as a result
+	   of address alignment.  */
+	andq    $-VEC_SIZE, %rax
+
+	VMOVA	(%rax), %VMM(1)
+	VPCMPNE	%VMM(1), %VMM(0), %k1
+	VPTEST	%VMM(1), %VMM(1), %k0{%k1}
+	KMOV	%k0, %VRAX
+# ifdef USE_AS_WCSCHR
+	sub	$VEC_MATCH_MASK, %VRAX
+# else
+	inc	%VRAX
+# endif
+	/* Discard mask bits for the characters skipped due to the
+	   alignment adjustment.  */
+	shr	%cl, %VRAX
+	jz	L(align_more)
+
+	bsf	%VRAX, %VRAX
+# ifdef USE_AS_WCSCHR
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+# else
+	addq	%rdi, %rax
+# endif
+
+# ifndef USE_AS_STRCHRNUL
+	cmp	(%rax), %CHAR_REG
+	jne	L(zero)
+# endif
+	ret
+
+L(align_more):
+	/* Align rdi to VEC_SIZE.  */
+	andq	$-VEC_SIZE, %rdi
+
+	/* Loop unroll 4 times for 4 vector loop.  */
+	VMOVA	VEC_SIZE(%rdi), %VMM(1)
+	VPCMPNE	%VMM(1), %VMM(0), %k1
+	VPTEST	%VMM(1), %VMM(1), %k0{%k1}
+
+	/* Increment rdi by vector size for further comparison and
+	   return.  */
+	subq	$-VEC_SIZE, %rdi
+	KMOV	%k0, %VRAX
+
+# ifdef USE_AS_WCSCHR
+	sub	$VEC_MATCH_MASK, %VRAX
+# else
+	inc	%VRAX
+# endif
+	jnz	L(ret_vec_x1)
+
+	VMOVA	VEC_SIZE(%rdi), %VMM(1)
+	VPCMPNE	%VMM(1), %VMM(0), %k1
+	VPTEST	%VMM(1), %VMM(1), %k0{%k1}
+	KMOV	%k0, %VRAX
+# ifdef USE_AS_WCSCHR
+	sub	$VEC_MATCH_MASK, %VRAX
+# else
+	inc	%VRAX
+# endif
+	jnz	L(ret_vec_x2)
+
+	VMOVA	(VEC_SIZE * 2)(%rdi), %VMM(1)
+	VPCMPNE	%VMM(1), %VMM(0), %k1
+	VPTEST	%VMM(1), %VMM(1), %k0{%k1}
+	KMOV	%k0, %VRAX
+# ifdef USE_AS_WCSCHR
+	sub	$VEC_MATCH_MASK, %VRAX
+# else
+	inc	%VRAX
+# endif
+	jnz	L(ret_vec_x3)
+
+	VMOVA	(VEC_SIZE * 3)(%rdi), %VMM(1)
+	VPCMPNE	%VMM(1), %VMM(0), %k1
+	VPTEST	%VMM(1), %VMM(1), %k0{%k1}
+	KMOV	%k0, %VRDX
+# ifdef USE_AS_WCSCHR
+	sub	$VEC_MATCH_MASK, %VRDX
+# else
+	inc	%VRDX
+# endif
+	jnz	L(ret_vec_x4)
+
+
+	/* Align address to VEC_SIZE * 4 for loop.  */
+	andq	$-(VEC_SIZE * 4), %rdi
+L(loop):
+	/* The VPMINU and VPCMP combination provides better performance
+	   than the alternative combinations.  */
+	VMOVA	(VEC_SIZE * 4)(%rdi), %VMM(1)
+	VMOVA	(VEC_SIZE * 5)(%rdi), %VMM(2)
+	VMOVA	(VEC_SIZE * 6)(%rdi), %VMM(3)
+	VMOVA	(VEC_SIZE * 7)(%rdi), %VMM(4)
+
+	VPCMPNE	%VMM(1), %VMM(0), %k1
+	VPCMPNE	%VMM(2), %VMM(0), %k2
+
+	VPMINU	%VMM(2), %VMM(1), %VMM(2)
+
+	VPCMPNE	%VMM(3), %VMM(0), %k3{%k1}
+	VPCMPNE	%VMM(4), %VMM(0), %k4{%k2}
+
+	VPMINU	%VMM(4), %VMM(3), %VMM(4)
+	VPMINU	%VMM(2), %VMM(4), %VMM(4){%k3}{z}
+
+	VPTEST	%VMM(4), %VMM(4), %k5{%k4}
+
+	KMOV	%k5, %VRDX
+	subq	$-(VEC_SIZE * 4), %rdi
+# ifdef USE_AS_WCSCHR
+	sub	$VEC_MATCH_MASK, %VRDX
+# else
+	inc	%VRDX
+# endif
+	jz	L(loop)
+
+	VPTEST	%VMM(1), %VMM(1), %k0{%k1}
+	KMOV	%k0, %VRAX
+# ifdef USE_AS_WCSCHR
+	sub	$VEC_MATCH_MASK, %VRAX
+# else
+	inc	%VRAX
+# endif
+	jnz	L(ret_vec_x1)
+
+	VPTEST	%VMM(2), %VMM(2), %k0{%k2}
+	KMOV	%k0, %VRAX
+	/* At this point there was no match in the first vector, so any
+	   match (CHAR or null) found here must be in the second vector.  */
+# ifdef USE_AS_WCSCHR
+	sub	$VEC_MATCH_MASK, %VRAX
+# else
+	inc	%VRAX
+# endif
+	jnz	L(ret_vec_x2)
+
+	VPTEST	%VMM(3), %VMM(3), %k0{%k3}
+	KMOV	%k0, %VRAX
+# ifdef USE_AS_WCSCHR
+	sub	$VEC_MATCH_MASK, %VRAX
+# else
+	inc	%VRAX
+# endif
+	jnz	L(ret_vec_x3)
+	/* At this point the match (CHAR or null) must be in the fourth
+	   vector, so there is no need to check.  */
+
+L(ret_vec_x4):
+	bsf	%VRDX, %VRAX
+	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+# ifndef USE_AS_STRCHRNUL
+	cmp	(%rax), %CHAR_REG
+	jne	L(zero_2)
+# endif
+	ret
+
+# ifndef USE_AS_STRCHRNUL
+L(zero_2):
+	xor	%eax, %eax
+	ret
+# endif
+END (STRCHR)
+#endif
diff --git a/sysdeps/x86_64/multiarch/strchr-evex512.S b/sysdeps/x86_64/multiarch/strchr-evex512.S
new file mode 100644
index 0000000000..a4ac022952
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strchr-evex512.S
@@ -0,0 +1,8 @@
+# ifndef STRCHR
+#  define STRCHR	__strchr_evex512
+# endif
+
+#include "x86-evex512-vecs.h"
+#include "reg-macros.h"
+
+#include "strchr-evex-base.S"
diff --git a/sysdeps/x86_64/multiarch/strchrnul-evex512.S b/sysdeps/x86_64/multiarch/strchrnul-evex512.S
new file mode 100644
index 0000000000..1be0b12f38
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strchrnul-evex512.S
@@ -0,0 +1,8 @@
+#ifndef STRCHRNUL
+# define STRCHRNUL	__strchrnul_evex512
+#endif
+
+#define STRCHR	STRCHRNUL
+#define USE_AS_STRCHRNUL 1
+
+#include "strchr-evex512.S"
diff --git a/sysdeps/x86_64/multiarch/wcschr-evex512.S b/sysdeps/x86_64/multiarch/wcschr-evex512.S
new file mode 100644
index 0000000000..3fe4e77a70
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcschr-evex512.S
@@ -0,0 +1,9 @@
+#ifndef WCSCHR
+# define WCSCHR	__wcschr_evex512
+#endif
+
+#define STRCHR	WCSCHR
+#define USE_AS_WCSCHR 1
+
+#define USE_WIDE_CHAR 1
+#include "strchr-evex512.S"
-- 
2.36.1


^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [PATCH v3] x86_64: Implement evex512 version of strchrnul, strchr and wcschr
  2022-10-25 23:35           ` [PATCH v3] " Sunil K Pandey
@ 2022-10-26  1:35             ` Noah Goldstein
  2022-10-26  2:06               ` [PATCH v4] " Sunil K Pandey
  0 siblings, 1 reply; 10+ messages in thread
From: Noah Goldstein @ 2022-10-26  1:35 UTC (permalink / raw)
  To: Sunil K Pandey; +Cc: libc-alpha

On Tue, Oct 25, 2022 at 6:35 PM Sunil K Pandey via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> Changes from v2:
> - Replace 2x load with mask logic.
> Changes from v1:
> - Use VEC API.
> - Replace extra lea in align_more with add.
> - Restructure loop logic.
> - Create zero_2 to avoid long jmp.
> - Combine first, second and third vector return logic.
>
> This patch implements following evex512 version of string functions.
> evex512 version takes up to 30% less cycle as compared to evex,
> depending on length and alignment.
>
> - strchrnul function using 512 bit vectors.
> - strchr function using 512 bit vectors.
> - wcschr function using 512 bit vectors.
>
> Code size data:
>
> strchrnul-evex.o        599 byte
> strchrnul-evex512.o     569 byte (-5%)
>
> strchr-evex.o           639 byte
> strchr-evex512.o        595 byte (-7%)
>
> wcschr-evex.o           644 byte
> wcschr-evex512.o        607 byte (-6%)
>
> Placeholder function, not used by any processor at the moment.
> ---
>  sysdeps/x86_64/multiarch/Makefile            |   3 +
>  sysdeps/x86_64/multiarch/ifunc-impl-list.c   |  12 +
>  sysdeps/x86_64/multiarch/strchr-evex-base.S  | 282 +++++++++++++++++++
>  sysdeps/x86_64/multiarch/strchr-evex512.S    |   8 +
>  sysdeps/x86_64/multiarch/strchrnul-evex512.S |   8 +
>  sysdeps/x86_64/multiarch/wcschr-evex512.S    |   9 +
>  6 files changed, 322 insertions(+)
>  create mode 100644 sysdeps/x86_64/multiarch/strchr-evex-base.S
>  create mode 100644 sysdeps/x86_64/multiarch/strchr-evex512.S
>  create mode 100644 sysdeps/x86_64/multiarch/strchrnul-evex512.S
>  create mode 100644 sysdeps/x86_64/multiarch/wcschr-evex512.S
>
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index e974b1ad97..597ac9d5e9 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -62,11 +62,13 @@ sysdep_routines += \
>    strchr-avx2 \
>    strchr-avx2-rtm \
>    strchr-evex \
> +  strchr-evex512 \
>    strchr-sse2 \
>    strchr-sse2-no-bsf \
>    strchrnul-avx2 \
>    strchrnul-avx2-rtm \
>    strchrnul-evex \
> +  strchrnul-evex512 \
>    strchrnul-sse2 \
>    strcmp-avx2 \
>    strcmp-avx2-rtm \
> @@ -131,6 +133,7 @@ sysdep_routines += \
>    wcschr-avx2 \
>    wcschr-avx2-rtm \
>    wcschr-evex \
> +  wcschr-evex512 \
>    wcschr-sse2 \
>    wcscmp-avx2 \
>    wcscmp-avx2-rtm \
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index 529c0b0ef0..c3d75a09f4 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -544,6 +544,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>                                       && CPU_FEATURE_USABLE (AVX512BW)
>                                       && CPU_FEATURE_USABLE (BMI2)),
>                                      __strchr_evex)
> +             X86_IFUNC_IMPL_ADD_V4 (array, i, strchr,
> +                                    (CPU_FEATURE_USABLE (AVX512VL)
> +                                     && CPU_FEATURE_USABLE (AVX512BW)),
> +                                    __strchr_evex512)
>               X86_IFUNC_IMPL_ADD_V3 (array, i, strchr,
>                                      (CPU_FEATURE_USABLE (AVX2)
>                                       && CPU_FEATURE_USABLE (BMI2)),
> @@ -569,6 +573,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>                                       && CPU_FEATURE_USABLE (AVX512BW)
>                                       && CPU_FEATURE_USABLE (BMI2)),
>                                      __strchrnul_evex)
> +             X86_IFUNC_IMPL_ADD_V4 (array, i, strchrnul,
> +                                    (CPU_FEATURE_USABLE (AVX512VL)
> +                                     && CPU_FEATURE_USABLE (AVX512BW)),
> +                                    __strchrnul_evex512)
>               X86_IFUNC_IMPL_ADD_V3 (array, i, strchrnul,
>                                      (CPU_FEATURE_USABLE (AVX2)
>                                       && CPU_FEATURE_USABLE (BMI2)),
> @@ -793,6 +801,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>                                       && CPU_FEATURE_USABLE (AVX512BW)
>                                       && CPU_FEATURE_USABLE (BMI2)),
>                                      __wcschr_evex)
> +             X86_IFUNC_IMPL_ADD_V4 (array, i, wcschr,
> +                                    (CPU_FEATURE_USABLE (AVX512VL)
> +                                     && CPU_FEATURE_USABLE (AVX512BW)),
> +                                    __wcschr_evex512)
>               X86_IFUNC_IMPL_ADD_V3 (array, i, wcschr,
>                                      (CPU_FEATURE_USABLE (AVX2)
>                                       && CPU_FEATURE_USABLE (BMI2)),
> diff --git a/sysdeps/x86_64/multiarch/strchr-evex-base.S b/sysdeps/x86_64/multiarch/strchr-evex-base.S
> new file mode 100644
> index 0000000000..21a6bc5907
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strchr-evex-base.S
> @@ -0,0 +1,282 @@
> +/* Placeholder function, not used by any processor at the moment.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +/* UNUSED. Exists purely as reference implementation.  */
> +
> +#include <isa-level.h>
> +
> +#if ISA_SHOULD_BUILD (4)
> +
> +# include <sysdep.h>
> +
> +# ifdef USE_AS_WCSCHR
> +#  define CHAR_REG     esi
> +#  define CHAR_SIZE    4
> +#  define VPBROADCAST   vpbroadcastd
> +#  define VPCMP                vpcmpd
> +#  define VPCMPNE      vpcmpneqd
> +#  define VPMINU       vpminud
> +#  define VPTEST       vptestmd
> +#  define VPTESTN      vptestnmd
> +# else
> +#  define CHAR_REG     sil
> +#  define CHAR_SIZE    1
> +#  define VPBROADCAST   vpbroadcastb
> +#  define VPCMP                vpcmpb
> +#  define VPCMPNE      vpcmpneqb
> +#  define VPMINU       vpminub
> +#  define VPTEST       vptestmb
> +#  define VPTESTN      vptestnmb
> +# endif
> +
> +# define PAGE_SIZE     4096
> +# define CHAR_PER_VEC  (VEC_SIZE / CHAR_SIZE)
> +# define VEC_MATCH_MASK ((1 << CHAR_PER_VEC) - 1)
> +
> +       .section SECTION(.text), "ax", @progbits
> +/* Aligning entry point to 64 byte, provides better performance for
> +   one vector length string.  */
> +ENTRY_P2ALIGN (STRCHR, 6)
> +
> +       /* Broadcast CHAR to VMM(0).  */
> +       VPBROADCAST %esi, %VMM(0)
> +       movl    %edi, %eax
> +       sall    $20,%eax
> +       cmpl    $((PAGE_SIZE - VEC_SIZE) << 20), %eax
> +       ja      L(page_cross)
> +
> +       VMOVU   (%rdi), %VMM(1)
> +       VPCMPNE %VMM(1), %VMM(0), %k1
> +       VPTEST  %VMM(1), %VMM(1), %k0{%k1}
> +       KMOV    %k0, %VRAX
> +       /* Compare [w]char for null, mask bit will be set for match.  */
> +
> +# ifdef USE_AS_WCSCHR
> +       sub     $VEC_MATCH_MASK, %VRAX
> +# else
> +       inc     %VRAX
> +# endif
> +       jz      L(align_more)
> +
> +       bsf     %VRAX, %VRAX
> +
> +# ifdef USE_AS_WCSCHR
> +       leaq    (%rdi, %rax, CHAR_SIZE), %rax
> +# else
> +       add     %rdi, %rax
> +# endif
> +# ifndef USE_AS_STRCHRNUL
> +       cmp     (%rax), %CHAR_REG
> +       jne     L(zero)
> +       ret
> +L(zero):
> +       xorl    %eax, %eax
> +# endif
> +       ret
> +
> +L(ret_vec_x3):
> +       subq    $-VEC_SIZE, %rdi
> +L(ret_vec_x2):
> +       subq    $-VEC_SIZE, %rdi
> +L(ret_vec_x1):
> +       bsf     %VRAX, %VRAX
> +# ifdef USE_AS_WCSCHR
> +       leaq    (%rdi, %rax, CHAR_SIZE), %rax
> +# else
> +       add     %rdi, %rax
> +# endif
> +
> +# ifndef USE_AS_STRCHRNUL
> +       cmp     (%rax), %CHAR_REG
> +       jne     L(zero)
> +# endif
> +       ret
> +
> +L(page_cross):
> +       mov     %rdi, %rax
> +       movl    %edi, %ecx
> +# ifdef USE_AS_WCSCHR
> +       /* Calculate number of compare result bits to be skipped for
> +          wide string alignment adjustment.  */
> +       andl    $(VEC_SIZE - 1), %ecx
> +       sarl    $2, %ecx
> +# endif
> +       /* ecx contains number of w[char] to be skipped as a result
> +          of address alignment.  */
> +       andq    $-VEC_SIZE, %rax
> +
> +       VMOVA   (%rax), %VMM(1)
> +       VPCMPNE %VMM(1), %VMM(0), %k1
> +       VPTEST  %VMM(1), %VMM(1), %k0{%k1}
> +       KMOV    %k0, %VRAX
> +# ifdef USE_AS_WCSCHR
> +       sub     $VEC_MATCH_MASK, %VRAX
> +# else
> +       inc     %VRAX
> +# endif
> +       /* Ignore number of character for alignment adjustment.  */
> +       shr     %cl, %VRAX
> +       jz      L(align_more)
> +
> +       bsf     %VRAX, %VRAX
> +# ifdef USE_AS_WCSCHR
> +       leaq    (%rdi, %rax, CHAR_SIZE), %rax
> +# else
> +       addq    %rdi, %rax
> +# endif
> +
> +# ifndef USE_AS_STRCHRNUL
> +       cmp     (%rax), %CHAR_REG
> +       jne     L(zero)
> +# endif
> +       ret
> +
> +L(align_more):
> +       /* Align rax to VEC_SIZE.  */
> +       andq    $-VEC_SIZE, %rdi
> +
> +       /* Loop unroll 4 times for 4 vector loop.  */
> +       VMOVA   VEC_SIZE(%rdi), %VMM(1)
> +       VPCMPNE %VMM(1), %VMM(0), %k1
> +       VPTEST  %VMM(1), %VMM(1), %k0{%k1}
> +
> +       /* Increment rdi by vector size for further comparison and
> +          return.  */
> +       subq    $-VEC_SIZE, %rdi
> +       KMOV    %k0, %VRAX
> +
> +# ifdef USE_AS_WCSCHR
> +       sub     $VEC_MATCH_MASK, %VRAX
> +# else
> +       inc     %VRAX
> +# endif
> +       jnz     L(ret_vec_x1)
> +
> +       VMOVA   VEC_SIZE(%rdi), %VMM(1)
> +       VPCMPNE %VMM(1), %VMM(0), %k1
> +       VPTEST  %VMM(1), %VMM(1), %k0{%k1}
> +       KMOV    %k0, %VRAX
> +# ifdef USE_AS_WCSCHR
> +       sub     $VEC_MATCH_MASK, %VRAX
> +# else
> +       inc     %VRAX
> +# endif
> +       jnz     L(ret_vec_x2)
> +
> +       VMOVA   (VEC_SIZE * 2)(%rdi), %VMM(1)
> +       VPCMPNE %VMM(1), %VMM(0), %k1
> +       VPTEST  %VMM(1), %VMM(1), %k0{%k1}
> +       KMOV    %k0, %VRAX
> +# ifdef USE_AS_WCSCHR
> +       sub     $VEC_MATCH_MASK, %VRAX
> +# else
> +       inc     %VRAX
> +# endif
> +       jnz     L(ret_vec_x3)
> +
> +       VMOVA   (VEC_SIZE * 3)(%rdi), %VMM(1)
> +       VPCMPNE %VMM(1), %VMM(0), %k1
> +       VPTEST  %VMM(1), %VMM(1), %k0{%k1}
> +       KMOV    %k0, %VRDX
> +# ifdef USE_AS_WCSCHR
> +       sub     $VEC_MATCH_MASK, %VRDX
> +# else
> +       inc     %VRDX
> +# endif
> +       jnz     L(ret_vec_x4)
> +
> +
> +       /* Align address to VEC_SIZE * 4 for loop.  */
> +       andq    $-(VEC_SIZE * 4), %rdi
> +L(loop):
> +       /* VPMINU and VPCMP combination provide better performance as
> +          compared to alternative combinations.  */
> +       VMOVA   (VEC_SIZE * 4)(%rdi), %VMM(1)
> +       VMOVA   (VEC_SIZE * 5)(%rdi), %VMM(2)
> +       VMOVA   (VEC_SIZE * 6)(%rdi), %VMM(3)
> +       VMOVA   (VEC_SIZE * 7)(%rdi), %VMM(4)
> +
> +       VPCMPNE %VMM(1), %VMM(0), %k1
> +       VPCMPNE %VMM(2), %VMM(0), %k2
> +
> +       VPMINU  %VMM(2), %VMM(1), %VMM(2)
> +
> +       VPCMPNE %VMM(3), %VMM(0), %k3{%k1}
> +       VPCMPNE %VMM(4), %VMM(0), %k4{%k2}
> +
> +       VPMINU  %VMM(4), %VMM(3), %VMM(4)
> +       VPMINU  %VMM(2), %VMM(4), %VMM(4){%k3}{z}
> +
> +       VPTEST  %VMM(4), %VMM(4), %k5{%k4}
> +
> +       KMOV    %k5, %VRDX
> +       subq    $-(VEC_SIZE * 4), %rdi
> +# ifdef USE_AS_WCSCHR
> +       sub     $VEC_MATCH_MASK, %VRDX
> +# else
> +       inc     %VRDX
> +# endif
> +       jz      L(loop)
> +
> +       VPTEST  %VMM(1), %VMM(1), %k0{%k1}
> +       KMOV    %k0, %VRAX
> +# ifdef USE_AS_WCSCHR
> +       sub     $VEC_MATCH_MASK, %VRAX
> +# else
> +       inc     %VRAX
> +# endif
> +       jnz     L(ret_vec_x1)
> +
> +       VPTEST  %VMM(2), %VMM(2), %k0{%k2}
> +       KMOV    %k0, %VRAX
> +       /* At this point, if k1 is non zero, null char must be in the
> +          second vector.  */
> +# ifdef USE_AS_WCSCHR
> +       sub     $VEC_MATCH_MASK, %VRAX
> +# else
> +       inc     %VRAX
> +# endif
> +       jnz     L(ret_vec_x2)
> +
> +       VPTEST  %VMM(3), %VMM(3), %k0{%k3}
> +       KMOV    %k0, %VRAX
> +# ifdef USE_AS_WCSCHR
> +       sub     $VEC_MATCH_MASK, %VRAX
> +# else
> +       inc     %VRAX
> +# endif
> +       jnz     L(ret_vec_x3)
> +       /* At this point null [w]char must be in the fourth vector so no
> +          need to check.  */
> +
> +L(ret_vec_x4):
> +       bsf     %VRDX, %VRAX
> +       leaq    (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax

Can you make this:
```
bsf %VRDX, %VRDX
leaq (VEC_SIZE * 3)(%rdi, %rdx, CHAR_SIZE), %rax
```
bsf has a false dependency on its destination register (when the source
is zero the destination is left unmodified on current CPUs, so the
hardware treats it as an input); reusing the source register as the
destination avoids a spurious wait on the previous value of %rax.
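
For illustration, the two forms side by side, using the same VEC/register
macros as the patch (comments added here for clarity):
```
	/* v3: bsf writes %rax but, because of the false dependency,
	   also has to wait on whatever last wrote %rax.  */
	bsf	%VRDX, %VRAX
	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax

	/* Suggested: keep the bit index in %rdx; %rax is then only
	   written (by the lea), so the stale-%rax dependency is gone.  */
	bsf	%VRDX, %VRDX
	leaq	(VEC_SIZE * 3)(%rdi, %rdx, CHAR_SIZE), %rax
```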

> +# ifndef USE_AS_STRCHRNUL
> +       cmp     (%rax), %CHAR_REG
> +       jne     L(zero_2)
> +# endif
> +       ret
> +
> +# ifndef USE_AS_STRCHRNUL
> +L(zero_2):
> +       xor     %eax, %eax
> +       ret
> +# endif
> +END (STRCHR)
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/strchr-evex512.S b/sysdeps/x86_64/multiarch/strchr-evex512.S
> new file mode 100644
> index 0000000000..a4ac022952
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strchr-evex512.S
> @@ -0,0 +1,8 @@
> +# ifndef STRCHR
> +#  define STRCHR       __strchr_evex512
> +# endif
> +
> +#include "x86-evex512-vecs.h"
> +#include "reg-macros.h"
> +
> +#include "strchr-evex-base.S"
> diff --git a/sysdeps/x86_64/multiarch/strchrnul-evex512.S b/sysdeps/x86_64/multiarch/strchrnul-evex512.S
> new file mode 100644
> index 0000000000..1be0b12f38
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strchrnul-evex512.S
> @@ -0,0 +1,8 @@
> +#ifndef STRCHRNUL
> +# define STRCHRNUL     __strchrnul_evex512
> +#endif
> +
> +#define STRCHR STRCHRNUL
> +#define USE_AS_STRCHRNUL 1
> +
> +#include "strchr-evex512.S"
> diff --git a/sysdeps/x86_64/multiarch/wcschr-evex512.S b/sysdeps/x86_64/multiarch/wcschr-evex512.S
> new file mode 100644
> index 0000000000..3fe4e77a70
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/wcschr-evex512.S
> @@ -0,0 +1,9 @@
> +#ifndef WCSCHR
> +# define WCSCHR        __wcschr_evex512
> +#endif
> +
> +#define STRCHR WCSCHR
> +#define USE_AS_WCSCHR 1
> +
> +#define USE_WIDE_CHAR 1
> +#include "strchr-evex512.S"
> --
> 2.36.1
>


* [PATCH v4] x86_64: Implement evex512 version of strchrnul, strchr and wcschr
  2022-10-26  1:35             ` Noah Goldstein
@ 2022-10-26  2:06               ` Sunil K Pandey
  2022-10-26  4:11                 ` Noah Goldstein
  0 siblings, 1 reply; 10+ messages in thread
From: Sunil K Pandey @ 2022-10-26  2:06 UTC (permalink / raw)
  To: libc-alpha

Changes from v3:
- Remove false dependency from 4th vector return computation.
Changes from v2:
- Replace 2x load with mask logic.
Changes from v1:
- Use VEC API.
- Replace extra lea in align_more with add.
- Restructure loop logic.
- Create zero_2 to avoid long jmp.
- Combine first, second and third vector return logic.

This patch implements the following evex512 versions of string
functions.  The evex512 version takes up to 30% fewer cycles than the
evex version, depending on length and alignment.

- strchrnul function using 512 bit vectors.
- strchr function using 512 bit vectors.
- wcschr function using 512 bit vectors.

Code size data:

strchrnul-evex.o	599 byte
strchrnul-evex512.o	569 byte (-5%)

strchr-evex.o		639 byte
strchr-evex512.o	595 byte (-7%)

wcschr-evex.o		644 byte
wcschr-evex512.o	607 byte (-6%)

Placeholder function, not used by any processor at the moment.
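
A note on the per-vector termination test used throughout: each bit of
the KMOV'd mask is set for a [w]char that is neither CHAR nor null, so a
fully set mask means "keep scanning".  A minimal sketch of the byte/wide
split (illustrative only; the real code branches to L(align_more),
L(loop) or one of the L(ret_vec_*) labels depending on the call site):
```
	/* strchr/strchrnul: CHAR_PER_VEC == 64, the mask fills the
	   register and "all bits set" is -1, so inc yields zero when
	   nothing was found.  */
	inc	%VRAX
	jz	L(align_more)

	/* wcschr: only CHAR_PER_VEC == 16 mask bits are in use, so
	   compare against VEC_MATCH_MASK == 0xffff instead.  */
	sub	$VEC_MATCH_MASK, %VRAX
	jz	L(align_more)
```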
---
 sysdeps/x86_64/multiarch/Makefile            |   3 +
 sysdeps/x86_64/multiarch/ifunc-impl-list.c   |  12 +
 sysdeps/x86_64/multiarch/strchr-evex-base.S  | 282 +++++++++++++++++++
 sysdeps/x86_64/multiarch/strchr-evex512.S    |   8 +
 sysdeps/x86_64/multiarch/strchrnul-evex512.S |   8 +
 sysdeps/x86_64/multiarch/wcschr-evex512.S    |   9 +
 6 files changed, 322 insertions(+)
 create mode 100644 sysdeps/x86_64/multiarch/strchr-evex-base.S
 create mode 100644 sysdeps/x86_64/multiarch/strchr-evex512.S
 create mode 100644 sysdeps/x86_64/multiarch/strchrnul-evex512.S
 create mode 100644 sysdeps/x86_64/multiarch/wcschr-evex512.S

diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index e974b1ad97..597ac9d5e9 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -62,11 +62,13 @@ sysdep_routines += \
   strchr-avx2 \
   strchr-avx2-rtm \
   strchr-evex \
+  strchr-evex512 \
   strchr-sse2 \
   strchr-sse2-no-bsf \
   strchrnul-avx2 \
   strchrnul-avx2-rtm \
   strchrnul-evex \
+  strchrnul-evex512 \
   strchrnul-sse2 \
   strcmp-avx2 \
   strcmp-avx2-rtm \
@@ -131,6 +133,7 @@ sysdep_routines += \
   wcschr-avx2 \
   wcschr-avx2-rtm \
   wcschr-evex \
+  wcschr-evex512 \
   wcschr-sse2 \
   wcscmp-avx2 \
   wcscmp-avx2-rtm \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 529c0b0ef0..c3d75a09f4 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -544,6 +544,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 				      && CPU_FEATURE_USABLE (AVX512BW)
 				      && CPU_FEATURE_USABLE (BMI2)),
 				     __strchr_evex)
+	      X86_IFUNC_IMPL_ADD_V4 (array, i, strchr,
+				     (CPU_FEATURE_USABLE (AVX512VL)
+				      && CPU_FEATURE_USABLE (AVX512BW)),
+				     __strchr_evex512)
 	      X86_IFUNC_IMPL_ADD_V3 (array, i, strchr,
 				     (CPU_FEATURE_USABLE (AVX2)
 				      && CPU_FEATURE_USABLE (BMI2)),
@@ -569,6 +573,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 				      && CPU_FEATURE_USABLE (AVX512BW)
 				      && CPU_FEATURE_USABLE (BMI2)),
 				     __strchrnul_evex)
+	      X86_IFUNC_IMPL_ADD_V4 (array, i, strchrnul,
+				     (CPU_FEATURE_USABLE (AVX512VL)
+				      && CPU_FEATURE_USABLE (AVX512BW)),
+				     __strchrnul_evex512)
 	      X86_IFUNC_IMPL_ADD_V3 (array, i, strchrnul,
 				     (CPU_FEATURE_USABLE (AVX2)
 				      && CPU_FEATURE_USABLE (BMI2)),
@@ -793,6 +801,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 				      && CPU_FEATURE_USABLE (AVX512BW)
 				      && CPU_FEATURE_USABLE (BMI2)),
 				     __wcschr_evex)
+	      X86_IFUNC_IMPL_ADD_V4 (array, i, wcschr,
+				     (CPU_FEATURE_USABLE (AVX512VL)
+				      && CPU_FEATURE_USABLE (AVX512BW)),
+				     __wcschr_evex512)
 	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcschr,
 				     (CPU_FEATURE_USABLE (AVX2)
 				      && CPU_FEATURE_USABLE (BMI2)),
diff --git a/sysdeps/x86_64/multiarch/strchr-evex-base.S b/sysdeps/x86_64/multiarch/strchr-evex-base.S
new file mode 100644
index 0000000000..75fee8c82a
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strchr-evex-base.S
@@ -0,0 +1,282 @@
+/* Placeholder function, not used by any processor at the moment.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* UNUSED. Exists purely as reference implementation.  */
+
+#include <isa-level.h>
+
+#if ISA_SHOULD_BUILD (4)
+
+# include <sysdep.h>
+
+# ifdef USE_AS_WCSCHR
+#  define CHAR_REG	esi
+#  define CHAR_SIZE	4
+#  define VPBROADCAST   vpbroadcastd
+#  define VPCMP		vpcmpd
+#  define VPCMPNE	vpcmpneqd
+#  define VPMINU	vpminud
+#  define VPTEST	vptestmd
+#  define VPTESTN	vptestnmd
+# else
+#  define CHAR_REG	sil
+#  define CHAR_SIZE	1
+#  define VPBROADCAST   vpbroadcastb
+#  define VPCMP		vpcmpb
+#  define VPCMPNE	vpcmpneqb
+#  define VPMINU	vpminub
+#  define VPTEST	vptestmb
+#  define VPTESTN	vptestnmb
+# endif
+
+# define PAGE_SIZE	4096
+# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
+# define VEC_MATCH_MASK ((1 << CHAR_PER_VEC) - 1)
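+/* With the evex512 wrappers, VEC_SIZE is 64: CHAR_PER_VEC is 64 for
+   strchr/strchrnul and 16 for wcschr, so VEC_MATCH_MASK (used only in
+   the wide-char code paths) evaluates to 0xffff.  */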
+
+	.section SECTION(.text), "ax", @progbits
+/* Aligning the entry point to 64 bytes provides better performance for
+   strings that fit in a single vector.  */
+ENTRY_P2ALIGN (STRCHR, 6)
+
+	/* Broadcast CHAR to VMM(0).  */
+	VPBROADCAST %esi, %VMM(0)
+	movl	%edi, %eax
+	sall	$20, %eax
+	cmpl	$((PAGE_SIZE - VEC_SIZE) << 20), %eax
+	ja	L(page_cross)
+
+	VMOVU	(%rdi), %VMM(1)
+	VPCMPNE	%VMM(1), %VMM(0), %k1
+	VPTEST	%VMM(1), %VMM(1), %k0{%k1}
+	KMOV	%k0, %VRAX
+	/* Each mask bit is set for a [w]char that is neither CHAR nor
+	   null, so a fully set mask means nothing was found; the
+	   inc/sub below then yields zero.  */
+
+# ifdef USE_AS_WCSCHR
+	sub	$VEC_MATCH_MASK, %VRAX
+# else
+	inc	%VRAX
+# endif
+	jz	L(align_more)
+
+	bsf	%VRAX, %VRAX
+
+# ifdef USE_AS_WCSCHR
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+# else
+	add	%rdi, %rax
+# endif
+# ifndef USE_AS_STRCHRNUL
+	cmp	(%rax), %CHAR_REG
+	jne	L(zero)
+	ret
+L(zero):
+	xorl	%eax, %eax
+# endif
+	ret
+
+L(ret_vec_x3):
+	subq	$-VEC_SIZE, %rdi
+L(ret_vec_x2):
+	subq	$-VEC_SIZE, %rdi
+L(ret_vec_x1):
+	bsf     %VRAX, %VRAX
+# ifdef USE_AS_WCSCHR
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+# else
+	add	%rdi, %rax
+# endif
+
+# ifndef USE_AS_STRCHRNUL
+	cmp	(%rax), %CHAR_REG
+	jne	L(zero)
+# endif
+	ret
+
+L(page_cross):
+	mov	%rdi, %rax
+	movl	%edi, %ecx
+# ifdef USE_AS_WCSCHR
+	/* Calculate number of compare result bits to be skipped for
+	   wide string alignment adjustment.  */
+	andl	$(VEC_SIZE - 1), %ecx
+	sarl	$2, %ecx
+# endif
+	/* ecx contains the number of [w]chars to skip as a result of
+	   the address alignment.  */
+	andq    $-VEC_SIZE, %rax
+
+	VMOVA	(%rax), %VMM(1)
+	VPCMPNE	%VMM(1), %VMM(0), %k1
+	VPTEST	%VMM(1), %VMM(1), %k0{%k1}
+	KMOV	%k0, %VRAX
+# ifdef USE_AS_WCSCHR
+	sub	$VEC_MATCH_MASK, %VRAX
+# else
+	inc	%VRAX
+# endif
+	/* Shift out the [w]chars that precede the start of the string
+	   (alignment adjustment).  */
+	shr	%cl, %VRAX
+	jz	L(align_more)
+
+	bsf	%VRAX, %VRAX
+# ifdef USE_AS_WCSCHR
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+# else
+	addq	%rdi, %rax
+# endif
+
+# ifndef USE_AS_STRCHRNUL
+	cmp	(%rax), %CHAR_REG
+	jne	L(zero)
+# endif
+	ret
+
+L(align_more):
+	/* Align rdi to VEC_SIZE.  */
+	andq	$-VEC_SIZE, %rdi
+
+	/* Check the next four vectors individually before entering the
+	   aligned four-vector loop.  */
+	VMOVA	VEC_SIZE(%rdi), %VMM(1)
+	VPCMPNE	%VMM(1), %VMM(0), %k1
+	VPTEST	%VMM(1), %VMM(1), %k0{%k1}
+
+	/* Increment rdi by vector size for further comparison and
+	   return.  */
+	subq	$-VEC_SIZE, %rdi
+	KMOV	%k0, %VRAX
+
+# ifdef USE_AS_WCSCHR
+	sub	$VEC_MATCH_MASK, %VRAX
+# else
+	inc	%VRAX
+# endif
+	jnz	L(ret_vec_x1)
+
+	VMOVA	VEC_SIZE(%rdi), %VMM(1)
+	VPCMPNE	%VMM(1), %VMM(0), %k1
+	VPTEST	%VMM(1), %VMM(1), %k0{%k1}
+	KMOV	%k0, %VRAX
+# ifdef USE_AS_WCSCHR
+	sub	$VEC_MATCH_MASK, %VRAX
+# else
+	inc	%VRAX
+# endif
+	jnz	L(ret_vec_x2)
+
+	VMOVA	(VEC_SIZE * 2)(%rdi), %VMM(1)
+	VPCMPNE	%VMM(1), %VMM(0), %k1
+	VPTEST	%VMM(1), %VMM(1), %k0{%k1}
+	KMOV	%k0, %VRAX
+# ifdef USE_AS_WCSCHR
+	sub	$VEC_MATCH_MASK, %VRAX
+# else
+	inc	%VRAX
+# endif
+	jnz	L(ret_vec_x3)
+
+	VMOVA	(VEC_SIZE * 3)(%rdi), %VMM(1)
+	VPCMPNE	%VMM(1), %VMM(0), %k1
+	VPTEST	%VMM(1), %VMM(1), %k0{%k1}
+	KMOV	%k0, %VRDX
+# ifdef USE_AS_WCSCHR
+	sub	$VEC_MATCH_MASK, %VRDX
+# else
+	inc	%VRDX
+# endif
+	jnz	L(ret_vec_x4)
+
+
+	/* Align address to VEC_SIZE * 4 for loop.  */
+	andq	$-(VEC_SIZE * 4), %rdi
+L(loop):
+	/* The VPMINU and VPCMP combination provides better performance
+	   than alternative combinations.  */
+	VMOVA	(VEC_SIZE * 4)(%rdi), %VMM(1)
+	VMOVA	(VEC_SIZE * 5)(%rdi), %VMM(2)
+	VMOVA	(VEC_SIZE * 6)(%rdi), %VMM(3)
+	VMOVA	(VEC_SIZE * 7)(%rdi), %VMM(4)
+
+	VPCMPNE	%VMM(1), %VMM(0), %k1
+	VPCMPNE	%VMM(2), %VMM(0), %k2
+
+	VPMINU	%VMM(2), %VMM(1), %VMM(2)
+
+	VPCMPNE	%VMM(3), %VMM(0), %k3{%k1}
+	VPCMPNE	%VMM(4), %VMM(0), %k4{%k2}
+
+	VPMINU	%VMM(4), %VMM(3), %VMM(4)
+	VPMINU	%VMM(2), %VMM(4), %VMM(4){%k3}{z}
+
+	VPTEST	%VMM(4), %VMM(4), %k5{%k4}
+
+	KMOV	%k5, %VRDX
+	subq	$-(VEC_SIZE * 4), %rdi
+# ifdef USE_AS_WCSCHR
+	sub	$VEC_MATCH_MASK, %VRDX
+# else
+	inc	%VRDX
+# endif
+	jz	L(loop)
+
+	VPTEST	%VMM(1), %VMM(1), %k0{%k1}
+	KMOV	%k0, %VRAX
+# ifdef USE_AS_WCSCHR
+	sub	$VEC_MATCH_MASK, %VRAX
+# else
+	inc	%VRAX
+# endif
+	jnz	L(ret_vec_x1)
+
+	VPTEST	%VMM(2), %VMM(2), %k0{%k2}
+	KMOV	%k0, %VRAX
+	/* Vector 1 had no match or null, so anything detected here (a
+	   CHAR match via k2 or a null seen through the min of vectors 1
+	   and 2) must be in the second vector.  */
+# ifdef USE_AS_WCSCHR
+	sub	$VEC_MATCH_MASK, %VRAX
+# else
+	inc	%VRAX
+# endif
+	jnz	L(ret_vec_x2)
+
+	VPTEST	%VMM(3), %VMM(3), %k0{%k3}
+	KMOV	%k0, %VRAX
+# ifdef USE_AS_WCSCHR
+	sub	$VEC_MATCH_MASK, %VRAX
+# else
+	inc	%VRAX
+# endif
+	jnz	L(ret_vec_x3)
+	/* At this point the match or null [w]char must be in the fourth
+	   vector, so no further check is needed.  */
+
+L(ret_vec_x4):
+	bsf	%VRDX, %VRDX
+	leaq	(VEC_SIZE * 3)(%rdi, %rdx, CHAR_SIZE), %rax
+# ifndef USE_AS_STRCHRNUL
+	cmp	(%rax), %CHAR_REG
+	jne	L(zero_2)
+# endif
+	ret
+
+# ifndef USE_AS_STRCHRNUL
+L(zero_2):
+	xor	%eax, %eax
+	ret
+# endif
+END (STRCHR)
+#endif
diff --git a/sysdeps/x86_64/multiarch/strchr-evex512.S b/sysdeps/x86_64/multiarch/strchr-evex512.S
new file mode 100644
index 0000000000..a4ac022952
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strchr-evex512.S
@@ -0,0 +1,8 @@
+# ifndef STRCHR
+#  define STRCHR	__strchr_evex512
+# endif
+
+#include "x86-evex512-vecs.h"
+#include "reg-macros.h"
+
+#include "strchr-evex-base.S"
diff --git a/sysdeps/x86_64/multiarch/strchrnul-evex512.S b/sysdeps/x86_64/multiarch/strchrnul-evex512.S
new file mode 100644
index 0000000000..1be0b12f38
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strchrnul-evex512.S
@@ -0,0 +1,8 @@
+#ifndef STRCHRNUL
+# define STRCHRNUL	__strchrnul_evex512
+#endif
+
+#define STRCHR	STRCHRNUL
+#define USE_AS_STRCHRNUL 1
+
+#include "strchr-evex512.S"
diff --git a/sysdeps/x86_64/multiarch/wcschr-evex512.S b/sysdeps/x86_64/multiarch/wcschr-evex512.S
new file mode 100644
index 0000000000..3fe4e77a70
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcschr-evex512.S
@@ -0,0 +1,9 @@
+#ifndef WCSCHR
+# define WCSCHR	__wcschr_evex512
+#endif
+
+#define STRCHR	WCSCHR
+#define USE_AS_WCSCHR 1
+
+#define USE_WIDE_CHAR 1
+#include "strchr-evex512.S"
-- 
2.36.1



* Re: [PATCH v4] x86_64: Implement evex512 version of strchrnul, strchr and wcschr
  2022-10-26  2:06               ` [PATCH v4] " Sunil K Pandey
@ 2022-10-26  4:11                 ` Noah Goldstein
  0 siblings, 0 replies; 10+ messages in thread
From: Noah Goldstein @ 2022-10-26  4:11 UTC (permalink / raw)
  To: Sunil K Pandey; +Cc: libc-alpha

On Tue, Oct 25, 2022 at 7:07 PM Sunil K Pandey via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> Changes from v3:
> - Remove false dependency from 4th vector return computation.
> Changes from v2:
> - Replace 2x load with mask logic.
> Changes from v1:
> - Use VEC API.
> - Replace extra lea in align_more with add.
> - Restructure loop logic.
> - Create zero_2 to avoid long jmp.
> - Combine first, second and third vector return logic.
> [...]

LGTM.


Thread overview: 10+ messages
2022-09-22  0:16 [PATCH] x86_64: Implement evex512 version of strchrnul, strchr and wcschr Sunil K Pandey
2022-09-22  0:50 ` Noah Goldstein
2022-09-23  3:57   ` Sunil Pandey
2022-09-29  3:41     ` Sunil Pandey
2022-09-29  4:07       ` Noah Goldstein
2022-10-21 21:23         ` [PATCH v2] " Sunil K Pandey
2022-10-25 23:35           ` [PATCH v3] " Sunil K Pandey
2022-10-26  1:35             ` Noah Goldstein
2022-10-26  2:06               ` [PATCH v4] " Sunil K Pandey
2022-10-26  4:11                 ` Noah Goldstein
