[PATCH] x86_64: Implement evex512 version of strrchr and wcsrchr

public inbox for libc-alpha@sourceware.org
 help / color / mirror / Atom feed

* [PATCH] x86_64: Implement evex512 version of strrchr and wcsrchr
@ 2022-09-22  0:24 Sunil K Pandey
  2022-09-22  0:50 ` Noah Goldstein
  0 siblings, 1 reply; 7+ messages in thread
From: Sunil K Pandey @ 2022-09-22  0:24 UTC (permalink / raw)
  To: libc-alpha; +Cc: hjl.tools

This patch implements the following evex512 versions of string functions.
The evex512 versions take up to 30% fewer cycles than the evex versions,
depending on length and alignment.

- strrchr function using 512 bit vectors.
- wcsrchr function using 512 bit vectors.

Code size data:

strrchr-evex.o		833 byte
strrchr-evex512.o	573 byte (-31%)

wcsrchr-evex.o		836 byte
wcsrchr-evex512.o	581 byte (-31%)

Placeholder function, not used by any processor at the moment.
---
 sysdeps/x86_64/multiarch/Makefile            |   2 +
 sysdeps/x86_64/multiarch/ifunc-impl-list.c   |  10 +
 sysdeps/x86_64/multiarch/strrchr-evex-base.S | 307 +++++++++++++++++++
 sysdeps/x86_64/multiarch/strrchr-evex512.S   |   7 +
 sysdeps/x86_64/multiarch/wcsrchr-evex512.S   |   8 +
 5 files changed, 334 insertions(+)
 create mode 100644 sysdeps/x86_64/multiarch/strrchr-evex-base.S
 create mode 100644 sysdeps/x86_64/multiarch/strrchr-evex512.S
 create mode 100644 sysdeps/x86_64/multiarch/wcsrchr-evex512.S

diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index df4601c294..6a275f1c3d 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -110,6 +110,7 @@ sysdep_routines += \
   strrchr-avx2 \
   strrchr-avx2-rtm \
   strrchr-evex \
+  strrchr-evex512 \
   strrchr-sse2 \
   strspn-sse4 \
   strstr-avx512 \
@@ -152,6 +153,7 @@ sysdep_routines += \
   wcsrchr-avx2 \
   wcsrchr-avx2-rtm \
   wcsrchr-evex \
+  wcsrchr-evex512 \
   wcsrchr-sse2 \
   wmemchr-avx2 \
   wmemchr-avx2-rtm \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index a71444eccb..26c941023a 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -564,6 +564,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 				     (CPU_FEATURE_USABLE (AVX512VL)
 				      && CPU_FEATURE_USABLE (AVX512BW)),
 				     __strrchr_evex)
+	      X86_IFUNC_IMPL_ADD_V4 (array, i, strrchr,
+				     (CPU_FEATURE_USABLE (AVX512VL)
+				      && CPU_FEATURE_USABLE (AVX512BW)
+				      && CPU_FEATURE_USABLE (BMI2)),
+				     __strrchr_evex512)
 	      X86_IFUNC_IMPL_ADD_V3 (array, i, strrchr,
 				     CPU_FEATURE_USABLE (AVX2),
 				     __strrchr_avx2)
@@ -775,6 +780,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 				      && CPU_FEATURE_USABLE (AVX512BW)
 				      && CPU_FEATURE_USABLE (BMI2)),
 				     __wcsrchr_evex)
+	      X86_IFUNC_IMPL_ADD_V4 (array, i, wcsrchr,
+				     (CPU_FEATURE_USABLE (AVX512VL)
+				      && CPU_FEATURE_USABLE (AVX512BW)
+				      && CPU_FEATURE_USABLE (BMI2)),
+				     __wcsrchr_evex512)
 	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcsrchr,
 				     CPU_FEATURE_USABLE (AVX2),
 				     __wcsrchr_avx2)
diff --git a/sysdeps/x86_64/multiarch/strrchr-evex-base.S b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
new file mode 100644
index 0000000000..e937cb193c
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
@@ -0,0 +1,307 @@
+/* Placeholder function, not used by any processor at the moment.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* UNUSED. Exists purely as reference implementation.  */
+
+#include <isa-level.h>
+
+#if ISA_SHOULD_BUILD (4)
+
+# include <sysdep.h>
+
+# ifdef USE_AS_WCSRCHR
+#  define CHAR_SIZE	4
+#  define VPBROADCAST   vpbroadcastd
+#  define VPCMP		vpcmpd
+#  define VPMINU	vpminud
+#  define VPTESTN	vptestnmd
+# else
+#  define CHAR_SIZE	1
+#  define VPBROADCAST   vpbroadcastb
+#  define VPCMP		vpcmpb
+#  define VPMINU	vpminub
+#  define VPTESTN	vptestnmb
+# endif
+
+# define PAGE_SIZE	4096
+# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
+
+# if VEC_SIZE == 64
+#  define BLSMSK	blsmskq
+#  define BSR		bsrq
+#  define KMOV		kmovq
+#  define KOR		korq
+#  define KORTEST	kortestq
+#  define R8		r8
+#  define RAX		rax
+#  define RCX		rcx
+#  define RDX		rdx
+#  define SHR		shrq
+#  define TEXTSUFFIX	evex512
+#  define VMM0		zmm16
+#  define VMM1		zmm17
+#  define VMM2		zmm18
+#  define VMM3		zmm19
+#  define VMM4		zmm20
+#  define VMM5		zmm21
+#  define VMOVA		vmovdqa64
+#  define VMOVU		vmovdqu64
+
+# elif VEC_SIZE == 32
+/* Currently Unused.  */
+#  define BLSMSK	blsmskl
+#  define BSR		bsrl
+#  define KMOV		kmovd
+#  define KOR		kord
+#  define KORTEST	kortestd
+#  define R8		r8d
+#  define RAX		eax
+#  define RCX		ecx
+#  define RDX		edx
+#  define SHR		shrl
+#  define TEXTSUFFIX	evex256
+#  define VMM0		ymm16
+#  define VMM1		ymm17
+#  define VMM2		ymm18
+#  define VMM3		ymm19
+#  define VMM4		ymm20
+#  define VMM5		ymm21
+#  define VMOVA		vmovdqa32
+#  define VMOVU		vmovdqu32
+# endif
+
+	.section .text.TEXTSUFFIX, "ax", @progbits
+/* Align the entry point to 64 bytes; this gives better performance
+   for strings that fit in one vector.  */
+ENTRY_P2ALIGN (STRRCHR, 6)
+
+	/* Broadcast CHAR to VMM0 and check for a page-crossing load.  */
+	VPBROADCAST %esi, %VMM0
+	movl	%edi, %eax
+	andl	$(PAGE_SIZE - 1), %eax
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	ja	L(page_cross)
+
+L(page_cross_continue):
+	/* Load the first vector; set a k1 mask bit for each null [w]char.  */
+	VMOVU	(%rdi), %VMM1
+
+	VPTESTN	%VMM1, %VMM1, %k1
+	KMOV	%k1, %RCX
+	test	%RCX, %RCX
+	jz	L(align_more)
+
+	VPCMP	$0, %VMM1, %VMM0, %k0
+	KMOV	%k0, %RAX
+	BLSMSK	%RCX, %RCX
+	and	%RCX, %RAX
+	jz	L(ret)
+
+	BSR	%RAX, %RAX
+# ifdef USE_AS_WCSRCHR
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+# else
+	add	%rdi, %rax
+# endif
+L(ret):
+	ret
+
+L(vector_x2_end):
+	VPCMP	$0, %VMM2, %VMM0, %k2
+	KMOV	%k2, %RAX
+	BLSMSK	%RCX, %RCX
+	and	%RCX, %RAX
+	jz	L(vector_x1_ret)
+
+	BSR	%RAX, %RAX
+	leaq	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
+
+	/* As a last resort, look for a match in the first vector (VMM1).  */
+L(vector_x1_ret):
+	VPCMP   $0, %VMM1, %VMM0, %k2
+	KMOV	%k2, %RAX
+	test	%RAX, %RAX
+	jz	L(ret)
+
+	BSR	%RAX, %RAX
+# ifdef USE_AS_WCSRCHR
+	leaq	(%rsi, %rax, CHAR_SIZE), %rax
+# else
+	add	%rsi, %rax
+# endif
+	ret
+
+L(align_more):
+	/* Zero r8; it records the last match found by the loop.  */
+	xorq	%r8, %r8
+	/* Save first vector address, used if no later match is found.  */
+	movq	%rdi, %rsi
+	/* Align pointer down to vector size.  */
+	andq	$-VEC_SIZE, %rdi
+	/* Check the next aligned vector (VMM2) before entering the loop.  */
+	VMOVA	(VEC_SIZE)(%rdi), %VMM2
+	VPTESTN	%VMM2, %VMM2, %k0
+	KMOV	%k0, %RCX
+	test	%RCX, %RCX
+	jnz	L(vector_x2_end)
+
+	/* Save the base of the second vector, which is at VEC_SIZE(%r9),
+	   in case no later match is found.  */
+	movq	%rdi, %r9
+	/* Align address down to VEC_SIZE * 2 for the loop.  */
+	andq	$-(VEC_SIZE * 2), %rdi
+
+	.p2align 4,,11
+L(loop):
+	/* Process 2 vectors per iteration, as this performs better than a
+	   4 vector loop.  */
+	VMOVA	(VEC_SIZE * 2)(%rdi), %VMM3
+	VMOVA	(VEC_SIZE * 3)(%rdi), %VMM4
+	VPCMP	$0, %VMM3, %VMM0, %k1
+	VPCMP	$0, %VMM4, %VMM0, %k2
+	VPMINU	%VMM3, %VMM4, %VMM5
+	VPTESTN	%VMM5, %VMM5, %k0
+	KOR	%k1, %k2, %k3
+	subq	$-(VEC_SIZE * 2), %rdi
+	/* If k0 and k3 are zero, no match and no end of string found.  */
+	KORTEST	%k0, %k3
+	jz	L(loop)
+
+	/* If k0 is non-zero, the end of the string was found.  */
+	KORTEST %k0, %k0
+	jnz	L(endloop)
+
+	/* A match was found; it needs to be stored in r8 before the loop
+	   continues.  */
+	/* Check the second vector first; a match there is the later one.  */
+	KMOV	%k2, %RDX
+	test	%RDX, %RDX
+	jz	L(loop_vec_x3_ret)
+
+	BSR	%RDX, %RDX
+	leaq	(VEC_SIZE)(%rdi, %rdx, CHAR_SIZE), %r8
+	jmp	L(loop)
+
+	/* If the second vector doesn't have a match, the first vector must
+	   have one.  */
+L(loop_vec_x3_ret):
+	KMOV	%k1, %R8
+	BSR	%R8, %R8
+# ifdef USE_AS_WCSRCHR
+	leaq	(%rdi, %r8, CHAR_SIZE), %r8
+# else
+	add	%rdi, %r8
+# endif
+	jmp	L(loop)
+
+L(endloop):
+	/* Check if the string ends in the first loop vector.  */
+	VPTESTN	%VMM3, %VMM3, %k0
+	KMOV	%k0, %RCX
+	test	%RCX, %RCX
+	jnz	L(vector_x3_end)
+
+	/* Check if the first loop vector has a match.  */
+	KMOV	%k1, %RAX
+	test	%RAX, %RAX
+	jz	L(vector_x4_end)
+
+	BSR	%RAX, %RAX
+	leaq	(%rdi, %rax, CHAR_SIZE), %r8
+
+	/* The string must end in the second loop vector.  */
+L(vector_x4_end):
+	VPTESTN	%VMM4, %VMM4, %k0
+	KMOV	%k0, %RCX
+	KMOV	%k2, %RAX
+	BLSMSK	%RCX, %RCX
+	/* Check for a match in the second loop vector, up to the null.  */
+	and	%RCX, %RAX
+	jz	L(check_last_match)
+
+	BSR	%RAX, %RAX
+	leaq	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
+
+	/* The string ends in the first loop vector.  */
+L(vector_x3_end):
+	KMOV	%k1, %RAX
+	BLSMSK	%RCX, %RCX
+	/* Check for a match in the first loop vector, up to the null.  */
+	and	%RCX, %RAX
+	jz	L(check_last_match)
+
+	BSR	%RAX, %RAX
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+	ret
+
+	/* No match in the first and second loop vectors.  */
+L(check_last_match):
+	/* Check if any match was recorded in r8.  */
+	test	%r8, %r8
+	jz	L(vector_x2_ret)
+	movq	%r8, %rax
+	ret
+
+	/* No match recorded in r8.  Check the second vector saved at the
+	   beginning.  */
+L(vector_x2_ret):
+	VPCMP   $0, %VMM2, %VMM0, %k2
+	KMOV	%k2, %RAX
+	test	%RAX, %RAX
+	jz	L(vector_x1_ret)
+
+	/* Match found in the second saved vector.  */
+	BSR	%RAX, %RAX
+	leaq	(VEC_SIZE)(%r9, %rax, CHAR_SIZE), %rax
+	ret
+
+L(page_cross):
+	movl	%eax, %ecx
+# ifdef USE_AS_WCSRCHR
+	/* Calculate the number of compare result bits to skip for the
+	   wide string alignment adjustment.  */
+	andl	$(VEC_SIZE - 1), %ecx
+	sarl	$2, %ecx
+# endif
+	/* The shift count in cl is the number of [w]chars to skip as a
+	   result of address alignment.  */
+	xorq	%rdi, %rax
+	VMOVA	(PAGE_SIZE - VEC_SIZE)(%rax), %VMM1
+
+	VPTESTN	%VMM1, %VMM1, %k1
+	KMOV	%k1, %RAX
+	SHR     %cl, %RAX
+	jz	L(page_cross_continue)
+	VPCMP	$0, %VMM1, %VMM0, %k0
+	KMOV	%k0, %RDX
+	SHR     %cl, %RDX
+	BLSMSK	%RAX, %RAX
+	and	%RDX, %RAX
+	jz	L(ret)
+	BSR	%RAX, %RAX
+# ifdef USE_AS_WCSRCHR
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+# else
+	add	%rdi, %rax
+# endif
+
+	ret
+END (STRRCHR)
+#endif
diff --git a/sysdeps/x86_64/multiarch/strrchr-evex512.S b/sysdeps/x86_64/multiarch/strrchr-evex512.S
new file mode 100644
index 0000000000..f880848e09
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strrchr-evex512.S
@@ -0,0 +1,7 @@
+# ifndef STRRCHR
+#  define STRRCHR	__strrchr_evex512
+# endif
+
+#define VEC_SIZE	64
+
+#include "strrchr-evex-base.S"
diff --git a/sysdeps/x86_64/multiarch/wcsrchr-evex512.S b/sysdeps/x86_64/multiarch/wcsrchr-evex512.S
new file mode 100644
index 0000000000..65b7710b22
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsrchr-evex512.S
@@ -0,0 +1,8 @@
+#ifndef WCSRCHR
+# define WCSRCHR	__wcsrchr_evex512
+#endif
+
+#define STRRCHR	WCSRCHR
+#define USE_AS_WCSRCHR 1
+
+#include "strrchr-evex512.S"
-- 
2.36.1


^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] x86_64: Implement evex512 version of strrchr and wcsrchr
  2022-09-22  0:24 [PATCH] x86_64: Implement evex512 version of strrchr and wcsrchr Sunil K Pandey
@ 2022-09-22  0:50 ` Noah Goldstein
  2022-09-23  3:57   ` Sunil Pandey
  0 siblings, 1 reply; 7+ messages in thread
From: Noah Goldstein @ 2022-09-22  0:50 UTC (permalink / raw)
  To: Sunil K Pandey; +Cc: GNU C Library

On Wed, Sep 21, 2022 at 5:24 PM Sunil K Pandey via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> This patch implements following evex512 version of string functions.
> evex512 version takes up to 30% less cycle as compared to evex,
> depending on length and alignment.
>

Please attach benchmark numbers.

> - strrchr function using 512 bit vectors.
> - wcsrchr function using 512 bit vectors.
>
> Code size data:
>
> strrchr-evex.o          833 byte
> strrchr-evex512.o       573 byte (-31%)
>
> wcsrchr-evex.o          836 byte
> wcsrchr-evex512.o       581 byte (-31%)
>
> Placeholder function, not used by any processor at the moment.
> ---
>  sysdeps/x86_64/multiarch/Makefile            |   2 +
>  sysdeps/x86_64/multiarch/ifunc-impl-list.c   |  10 +
>  sysdeps/x86_64/multiarch/strrchr-evex-base.S | 307 +++++++++++++++++++
>  sysdeps/x86_64/multiarch/strrchr-evex512.S   |   7 +
>  sysdeps/x86_64/multiarch/wcsrchr-evex512.S   |   8 +
>  5 files changed, 334 insertions(+)
>  create mode 100644 sysdeps/x86_64/multiarch/strrchr-evex-base.S
>  create mode 100644 sysdeps/x86_64/multiarch/strrchr-evex512.S
>  create mode 100644 sysdeps/x86_64/multiarch/wcsrchr-evex512.S
>
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index df4601c294..6a275f1c3d 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -110,6 +110,7 @@ sysdep_routines += \
>    strrchr-avx2 \
>    strrchr-avx2-rtm \
>    strrchr-evex \
> +  strrchr-evex512 \
>    strrchr-sse2 \
>    strspn-sse4 \
>    strstr-avx512 \
> @@ -152,6 +153,7 @@ sysdep_routines += \
>    wcsrchr-avx2 \
>    wcsrchr-avx2-rtm \
>    wcsrchr-evex \
> +  wcsrchr-evex512 \
>    wcsrchr-sse2 \
>    wmemchr-avx2 \
>    wmemchr-avx2-rtm \
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index a71444eccb..26c941023a 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -564,6 +564,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>                                      (CPU_FEATURE_USABLE (AVX512VL)
>                                       && CPU_FEATURE_USABLE (AVX512BW)),
>                                      __strrchr_evex)
> +             X86_IFUNC_IMPL_ADD_V4 (array, i, strrchr,
> +                                    (CPU_FEATURE_USABLE (AVX512VL)
> +                                     && CPU_FEATURE_USABLE (AVX512BW)
> +                                     && CPU_FEATURE_USABLE (BMI2)),
> +                                    __strrchr_evex512)
>               X86_IFUNC_IMPL_ADD_V3 (array, i, strrchr,
>                                      CPU_FEATURE_USABLE (AVX2),
>                                      __strrchr_avx2)
> @@ -775,6 +780,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>                                       && CPU_FEATURE_USABLE (AVX512BW)
>                                       && CPU_FEATURE_USABLE (BMI2)),
>                                      __wcsrchr_evex)
> +             X86_IFUNC_IMPL_ADD_V4 (array, i, wcsrchr,
> +                                    (CPU_FEATURE_USABLE (AVX512VL)
> +                                     && CPU_FEATURE_USABLE (AVX512BW)
> +                                     && CPU_FEATURE_USABLE (BMI2)),
> +                                    __wcsrchr_evex512)
>               X86_IFUNC_IMPL_ADD_V3 (array, i, wcsrchr,
>                                      CPU_FEATURE_USABLE (AVX2),
>                                      __wcsrchr_avx2)
> diff --git a/sysdeps/x86_64/multiarch/strrchr-evex-base.S b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
> new file mode 100644
> index 0000000000..e937cb193c
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
> @@ -0,0 +1,307 @@
> +/* Placeholder function, not used by any processor at the moment.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +/* UNUSED. Exists purely as reference implementation.  */
> +
> +#include <isa-level.h>
> +
> +#if ISA_SHOULD_BUILD (4)
> +
> +# include <sysdep.h>
> +
> +# ifdef USE_AS_WCSRCHR
> +#  define CHAR_SIZE    4
> +#  define VPBROADCAST   vpbroadcastd
> +#  define VPCMP                vpcmpd
> +#  define VPMINU       vpminud
> +#  define VPTESTN      vptestnmd
> +# else
> +#  define CHAR_SIZE    1
> +#  define VPBROADCAST   vpbroadcastb
> +#  define VPCMP                vpcmpb
> +#  define VPMINU       vpminub
> +#  define VPTESTN      vptestnmb
> +# endif
> +
> +# define PAGE_SIZE     4096
> +# define CHAR_PER_VEC  (VEC_SIZE / CHAR_SIZE)
> +
> +# if VEC_SIZE == 64
> +#  define BLSMSK       blsmskq
> +#  define BSR          bsrq
> +#  define KMOV         kmovq
> +#  define KOR          korq
> +#  define KORTEST      kortestq
> +#  define R8           r8
> +#  define RAX          rax
> +#  define RCX          rcx
> +#  define RDX          rdx
> +#  define SHR          shrq
> +#  define TEXTSUFFIX   evex512
> +#  define VMM0         zmm16
> +#  define VMM1         zmm17
> +#  define VMM2         zmm18
> +#  define VMM3         zmm19
> +#  define VMM4         zmm20
> +#  define VMM5         zmm21
> +#  define VMOVA                vmovdqa64
> +#  define VMOVU                vmovdqu64
> +
> +# elif VEC_SIZE == 32
> +/* Currently Unused.  */
> +#  define BLSMSK       blsmskl
> +#  define BSR          bsrl
> +#  define KMOV         kmovd
> +#  define KOR          kord
> +#  define KORTEST      kortestd
> +#  define R8           r8d
> +#  define RAX          eax
> +#  define RCX          ecx
> +#  define RDX          edx
> +#  define SHR          shrl
> +#  define TEXTSUFFIX   evex256
> +#  define VMM0         ymm16
> +#  define VMM1         ymm17
> +#  define VMM2         ymm18
> +#  define VMM3         ymm19
> +#  define VMM4         ymm20
> +#  define VMM5         ymm21
> +#  define VMOVA                vmovdqa32
> +#  define VMOVU                vmovdqu32
> +# endif
> +
> +       .section .text.TEXTSUFFIX, "ax", @progbits
> +/* Aligning entry point to 64 byte, provides better performance for
> +   one vector length string.  */
> +ENTRY_P2ALIGN (STRRCHR, 6)
> +
> +       /* Broadcast CHAR to VMM0.  */
> +       VPBROADCAST %esi, %VMM0
> +       movl    %edi, %eax
> +       andl    $(PAGE_SIZE - 1), %eax
> +       cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
> +       ja      L(page_cross)
> +
> +L(page_cross_continue):
> +       /* Compare [w]char for null, mask bit will be set for match.  */
> +       VMOVU   (%rdi), %VMM1
> +
> +       VPTESTN %VMM1, %VMM1, %k1
> +       KMOV    %k1, %RCX
> +       test    %RCX, %RCX
> +       jz      L(align_more)
> +
> +       VPCMP   $0, %VMM1, %VMM0, %k0
> +       KMOV    %k0, %RAX
> +       BLSMSK  %RCX, %RCX
> +       and     %RCX, %RAX
> +       jz      L(ret)
> +
> +       BSR     %RAX, %RAX
> +# ifdef USE_AS_WCSRCHR
> +       leaq    (%rdi, %rax, CHAR_SIZE), %rax
> +# else
> +       add     %rdi, %rax
> +# endif
> +L(ret):
> +       ret
> +
> +L(vector_x2_end):
> +       VPCMP   $0, %VMM2, %VMM0, %k2
> +       KMOV    %k2, %RAX
> +       BLSMSK  %RCX, %RCX
> +       and     %RCX, %RAX
> +       jz      L(vector_x1_ret)
> +
> +       BSR     %RAX, %RAX
> +       leaq    (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
> +       ret
> +
> +       /* Check the first vector at very last to look for match.  */
> +L(vector_x1_ret):
> +       VPCMP   $0, %VMM1, %VMM0, %k2
> +       KMOV    %k2, %RAX
> +       test    %RAX, %RAX
> +       jz      L(ret)
> +
> +       BSR     %RAX, %RAX
> +# ifdef USE_AS_WCSRCHR
> +       leaq    (%rsi, %rax, CHAR_SIZE), %rax
> +# else
> +       add     %rsi, %rax
> +# endif
> +       ret
> +
> +L(align_more):
> +       /* Zero r8 to store match result.  */
> +       xorq    %r8, %r8
> +       /* Save pointer of first vector, in case if no match found.  */
> +       movq    %rdi, %rsi
> +       /* Align pointer to vector size.  */
> +       andq    $-VEC_SIZE, %rdi
> +       /* Loop unroll 2 times for 2 vector loop.  */
> +       VMOVA   (VEC_SIZE)(%rdi), %VMM2
> +       VPTESTN %VMM2, %VMM2, %k0
> +       KMOV    %k0, %RCX
> +       test    %RCX, %RCX
> +       jnz     L(vector_x2_end)
> +
> +       /* Save pointer of second vector, in case if no match
> +          found.  */
> +       movq    %rdi, %r9
> +       /* Align address to VEC_SIZE * 2 for loop.  */
> +       andq    $-(VEC_SIZE * 2), %rdi
> +
> +       .p2align 4,,11
> +L(loop):
> +       /* 2 vector loop, as it provide better performance as compared
> +          to 4 vector loop.  */
> +       VMOVA   (VEC_SIZE * 2)(%rdi), %VMM3
> +       VMOVA   (VEC_SIZE * 3)(%rdi), %VMM4
> +       VPCMP   $0, %VMM3, %VMM0, %k1
> +       VPCMP   $0, %VMM4, %VMM0, %k2
> +       VPMINU  %VMM3, %VMM4, %VMM5
> +       VPTESTN %VMM5, %VMM5, %k0
> +       KOR     %k1, %k2, %k3
> +       subq    $-(VEC_SIZE * 2), %rdi
> +       /* If k0 and k3 zero, match and end of string not found.  */
> +       KORTEST %k0, %k3
> +       jz      L(loop)
> +
> +       /* If k0 is non zero, end of string found.  */
> +       KORTEST %k0, %k0
> +       jnz     L(endloop)
> +
> +       /* A match found, it need to be stored in r8 before loop
> +          continue.  */
> +       /* Check second vector first.  */
> +       KMOV    %k2, %RDX
> +       test    %RDX, %RDX
> +       jz      L(loop_vec_x3_ret)
> +
> +       BSR     %RDX, %RDX
> +       leaq    (VEC_SIZE)(%rdi, %rdx, CHAR_SIZE), %r8
> +       jmp     L(loop)
> +
> +       /* If second vector doesn't have match, first vector must
> +          have match.  */
> +L(loop_vec_x3_ret):
> +       KMOV    %k1, %R8
> +       BSR     %R8, %R8
> +# ifdef USE_AS_WCSRCHR
> +       leaq    (%rdi, %r8, CHAR_SIZE), %r8
> +# else
> +       add     %rdi, %r8
> +# endif
> +       jmp     L(loop)
> +
> +L(endloop):
> +       /* Check if string end in first loop vector.  */
> +       VPTESTN %VMM3, %VMM3, %k0
> +       KMOV    %k0, %RCX
> +       test    %RCX, %RCX
> +       jnz     L(vector_x3_end)
> +
> +       /* Check if it has match in first loop vector.  */
> +       KMOV    %k1, %RAX
> +       test    %RAX, %RAX
> +       jz      L(vector_x4_end)
> +
> +       BSR     %RAX, %RAX
> +       leaq    (%rdi, %rax, CHAR_SIZE), %r8
> +
> +       /* String must end in second loop vector.  */
> +L(vector_x4_end):
> +       VPTESTN %VMM4, %VMM4, %k0
> +       KMOV    %k0, %RCX
> +       KMOV    %k2, %RAX
> +       BLSMSK  %RCX, %RCX
> +       /* Check if it has match in second loop vector.  */
> +       and     %RCX, %RAX
> +       jz      L(check_last_match)
> +
> +       BSR     %RAX, %RAX
> +       leaq    (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
> +       ret
> +
> +       /* String end in first loop vector.  */
> +L(vector_x3_end):
> +       KMOV    %k1, %RAX
> +       BLSMSK  %RCX, %RCX
> +       /* Check if it has match in second loop vector.  */
> +       and     %RCX, %RAX
> +       jz      L(check_last_match)
> +
> +       BSR     %RAX, %RAX
> +       leaq    (%rdi, %rax, CHAR_SIZE), %rax
> +       ret
> +
> +       /* No match in first and second loop vector.  */
> +L(check_last_match):
> +       /* Check if any match recorded in r8.  */
> +       test    %r8, %r8
> +       jz      L(vector_x2_ret)
> +       movq    %r8, %rax
> +       ret
> +
> +       /* No match recorded in r8. Check the second saved vector
> +          in begining.  */
> +L(vector_x2_ret):
> +       VPCMP   $0, %VMM2, %VMM0, %k2
> +       KMOV    %k2, %RAX
> +       test    %RAX, %RAX
> +       jz      L(vector_x1_ret)
> +
> +       /* Match found in the second saved vector.  */
> +       BSR     %RAX, %RAX
> +       leaq    (VEC_SIZE)(%r9, %rax, CHAR_SIZE), %rax
> +       ret
> +
> +L(page_cross):
> +       movl    %eax, %ecx
> +# ifdef USE_AS_WCSRCHR
> +       /* Calculate number of compare result bits to be skipped for
> +          wide string alignment adjustment.  */
> +       andl    $(VEC_SIZE - 1), %ecx
> +       sarl    $2, %ecx
> +# endif
> +       /* ecx contains number of w[char] to be skipped as a result
> +          of address alignment.  */
> +       xorq    %rdi, %rax
> +       VMOVA   (PAGE_SIZE - VEC_SIZE)(%rax), %VMM1
> +
> +       VPTESTN %VMM1, %VMM1, %k1
> +       KMOV    %k1, %RAX
> +       SHR     %cl, %RAX
> +       jz      L(page_cross_continue)
> +       VPCMP   $0, %VMM1, %VMM0, %k0
> +       KMOV    %k0, %RDX
> +       SHR     %cl, %RDX
> +       BLSMSK  %RAX, %RAX
> +       and     %RDX, %RAX
> +       jz      L(ret)
> +       BSR     %RAX, %RAX
> +# ifdef USE_AS_WCSRCHR
> +       leaq    (%rdi, %rax, CHAR_SIZE), %rax
> +# else
> +       add     %rdi, %rax
> +# endif
> +
> +       ret
> +END (STRRCHR)
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/strrchr-evex512.S b/sysdeps/x86_64/multiarch/strrchr-evex512.S
> new file mode 100644
> index 0000000000..f880848e09
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/strrchr-evex512.S
> @@ -0,0 +1,7 @@
> +# ifndef STRRCHR
> +#  define STRRCHR      __strrchr_evex512
> +# endif
> +
> +#define VEC_SIZE       64
> +
> +#include "strrchr-evex-base.S"
> diff --git a/sysdeps/x86_64/multiarch/wcsrchr-evex512.S b/sysdeps/x86_64/multiarch/wcsrchr-evex512.S
> new file mode 100644
> index 0000000000..65b7710b22
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/wcsrchr-evex512.S
> @@ -0,0 +1,8 @@
> +#ifndef WCSRCHR
> +# define WCSRCHR       __wcsrchr_evex512
> +#endif
> +
> +#define STRRCHR        WCSRCHR
> +#define USE_AS_WCSRCHR 1
> +
> +#include "strrchr-evex512.S"
> --
> 2.36.1
>

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] x86_64: Implement evex512 version of strrchr and wcsrchr
  2022-09-22  0:50 ` Noah Goldstein
@ 2022-09-23  3:57   ` Sunil Pandey
  2022-09-29  3:42     ` Sunil Pandey
  0 siblings, 1 reply; 7+ messages in thread
From: Sunil Pandey @ 2022-09-23  3:57 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library

[-- Attachment #1: Type: text/plain, Size: 15299 bytes --]

Microbenchmark data collected on: Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz


On Wed, Sep 21, 2022 at 5:50 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Wed, Sep 21, 2022 at 5:24 PM Sunil K Pandey via Libc-alpha
> <libc-alpha@sourceware.org> wrote:
> >
> > > This patch implements the following evex512 versions of string functions.
> > > The evex512 versions take up to 30% fewer cycles than evex,
> > > depending on length and alignment.
> >
>
> Please attach benchmark numbers.
>
> > - strrchr function using 512 bit vectors.
> > - wcsrchr function using 512 bit vectors.
> >
> > Code size data:
> >
> > strrchr-evex.o          833 byte
> > strrchr-evex512.o       573 byte (-31%)
> >
> > wcsrchr-evex.o          836 byte
> > wcsrchr-evex512.o       581 byte (-31%)
> >
> > Placeholder function, not used by any processor at the moment.
> > ---
> >  sysdeps/x86_64/multiarch/Makefile            |   2 +
> >  sysdeps/x86_64/multiarch/ifunc-impl-list.c   |  10 +
> >  sysdeps/x86_64/multiarch/strrchr-evex-base.S | 307 +++++++++++++++++++
> >  sysdeps/x86_64/multiarch/strrchr-evex512.S   |   7 +
> >  sysdeps/x86_64/multiarch/wcsrchr-evex512.S   |   8 +
> >  5 files changed, 334 insertions(+)
> >  create mode 100644 sysdeps/x86_64/multiarch/strrchr-evex-base.S
> >  create mode 100644 sysdeps/x86_64/multiarch/strrchr-evex512.S
> >  create mode 100644 sysdeps/x86_64/multiarch/wcsrchr-evex512.S
> >
> > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> > index df4601c294..6a275f1c3d 100644
> > --- a/sysdeps/x86_64/multiarch/Makefile
> > +++ b/sysdeps/x86_64/multiarch/Makefile
> > @@ -110,6 +110,7 @@ sysdep_routines += \
> >    strrchr-avx2 \
> >    strrchr-avx2-rtm \
> >    strrchr-evex \
> > +  strrchr-evex512 \
> >    strrchr-sse2 \
> >    strspn-sse4 \
> >    strstr-avx512 \
> > @@ -152,6 +153,7 @@ sysdep_routines += \
> >    wcsrchr-avx2 \
> >    wcsrchr-avx2-rtm \
> >    wcsrchr-evex \
> > +  wcsrchr-evex512 \
> >    wcsrchr-sse2 \
> >    wmemchr-avx2 \
> >    wmemchr-avx2-rtm \
> > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > index a71444eccb..26c941023a 100644
> > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > @@ -564,6 +564,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> >                                      (CPU_FEATURE_USABLE (AVX512VL)
> >                                       && CPU_FEATURE_USABLE (AVX512BW)),
> >                                      __strrchr_evex)
> > +             X86_IFUNC_IMPL_ADD_V4 (array, i, strrchr,
> > +                                    (CPU_FEATURE_USABLE (AVX512VL)
> > +                                     && CPU_FEATURE_USABLE (AVX512BW)
> > +                                     && CPU_FEATURE_USABLE (BMI2)),
> > +                                    __strrchr_evex512)
> >               X86_IFUNC_IMPL_ADD_V3 (array, i, strrchr,
> >                                      CPU_FEATURE_USABLE (AVX2),
> >                                      __strrchr_avx2)
> > @@ -775,6 +780,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> >                                       && CPU_FEATURE_USABLE (AVX512BW)
> >                                       && CPU_FEATURE_USABLE (BMI2)),
> >                                      __wcsrchr_evex)
> > +             X86_IFUNC_IMPL_ADD_V4 (array, i, wcsrchr,
> > +                                    (CPU_FEATURE_USABLE (AVX512VL)
> > +                                     && CPU_FEATURE_USABLE (AVX512BW)
> > +                                     && CPU_FEATURE_USABLE (BMI2)),
> > +                                    __wcsrchr_evex512)
> >               X86_IFUNC_IMPL_ADD_V3 (array, i, wcsrchr,
> >                                      CPU_FEATURE_USABLE (AVX2),
> >                                      __wcsrchr_avx2)
> > diff --git a/sysdeps/x86_64/multiarch/strrchr-evex-base.S b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
> > new file mode 100644
> > index 0000000000..e937cb193c
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
> > @@ -0,0 +1,307 @@
> > +/* Placeholder function, not used by any processor at the moment.
> > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +/* UNUSED. Exists purely as reference implementation.  */
> > +
> > +#include <isa-level.h>
> > +
> > +#if ISA_SHOULD_BUILD (4)
> > +
> > +# include <sysdep.h>
> > +
> > +# ifdef USE_AS_WCSRCHR
> > +#  define CHAR_SIZE    4
> > +#  define VPBROADCAST   vpbroadcastd
> > +#  define VPCMP                vpcmpd
> > +#  define VPMINU       vpminud
> > +#  define VPTESTN      vptestnmd
> > +# else
> > +#  define CHAR_SIZE    1
> > +#  define VPBROADCAST   vpbroadcastb
> > +#  define VPCMP                vpcmpb
> > +#  define VPMINU       vpminub
> > +#  define VPTESTN      vptestnmb
> > +# endif
> > +
> > +# define PAGE_SIZE     4096
> > +# define CHAR_PER_VEC  (VEC_SIZE / CHAR_SIZE)
> > +
> > +# if VEC_SIZE == 64
> > +#  define BLSMSK       blsmskq
> > +#  define BSR          bsrq
> > +#  define KMOV         kmovq
> > +#  define KOR          korq
> > +#  define KORTEST      kortestq
> > +#  define R8           r8
> > +#  define RAX          rax
> > +#  define RCX          rcx
> > +#  define RDX          rdx
> > +#  define SHR          shrq
> > +#  define TEXTSUFFIX   evex512
> > +#  define VMM0         zmm16
> > +#  define VMM1         zmm17
> > +#  define VMM2         zmm18
> > +#  define VMM3         zmm19
> > +#  define VMM4         zmm20
> > +#  define VMM5         zmm21
> > +#  define VMOVA                vmovdqa64
> > +#  define VMOVU                vmovdqu64
> > +
> > +# elif VEC_SIZE == 32
> > +/* Currently Unused.  */
> > +#  define BLSMSK       blsmskl
> > +#  define BSR          bsrl
> > +#  define KMOV         kmovd
> > +#  define KOR          kord
> > +#  define KORTEST      kortestd
> > +#  define R8           r8d
> > +#  define RAX          eax
> > +#  define RCX          ecx
> > +#  define RDX          edx
> > +#  define SHR          shrl
> > +#  define TEXTSUFFIX   evex256
> > +#  define VMM0         ymm16
> > +#  define VMM1         ymm17
> > +#  define VMM2         ymm18
> > +#  define VMM3         ymm19
> > +#  define VMM4         ymm20
> > +#  define VMM5         ymm21
> > +#  define VMOVA                vmovdqa32
> > +#  define VMOVU                vmovdqu32
> > +# endif
> > +
> > +       .section .text.TEXTSUFFIX, "ax", @progbits
> > > +/* Aligning the entry point to 64 bytes provides better performance
> > > +   for strings of one vector length.  */
> > +ENTRY_P2ALIGN (STRRCHR, 6)
> > +
> > +       /* Broadcast CHAR to VMM0.  */
> > +       VPBROADCAST %esi, %VMM0
> > +       movl    %edi, %eax
> > +       andl    $(PAGE_SIZE - 1), %eax
> > +       cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
> > +       ja      L(page_cross)
> > +
> > +L(page_cross_continue):
> > +       /* Compare [w]char for null, mask bit will be set for match.  */
> > +       VMOVU   (%rdi), %VMM1
> > +
> > +       VPTESTN %VMM1, %VMM1, %k1
> > +       KMOV    %k1, %RCX
> > +       test    %RCX, %RCX
> > +       jz      L(align_more)
> > +
> > +       VPCMP   $0, %VMM1, %VMM0, %k0
> > +       KMOV    %k0, %RAX
> > +       BLSMSK  %RCX, %RCX
> > +       and     %RCX, %RAX
> > +       jz      L(ret)
> > +
> > +       BSR     %RAX, %RAX
> > +# ifdef USE_AS_WCSRCHR
> > +       leaq    (%rdi, %rax, CHAR_SIZE), %rax
> > +# else
> > +       add     %rdi, %rax
> > +# endif
> > +L(ret):
> > +       ret
> > +
> > +L(vector_x2_end):
> > +       VPCMP   $0, %VMM2, %VMM0, %k2
> > +       KMOV    %k2, %RAX
> > +       BLSMSK  %RCX, %RCX
> > +       and     %RCX, %RAX
> > +       jz      L(vector_x1_ret)
> > +
> > +       BSR     %RAX, %RAX
> > +       leaq    (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
> > +       ret
> > +
> > +       /* Check the first vector at very last to look for match.  */
> > +L(vector_x1_ret):
> > +       VPCMP   $0, %VMM1, %VMM0, %k2
> > +       KMOV    %k2, %RAX
> > +       test    %RAX, %RAX
> > +       jz      L(ret)
> > +
> > +       BSR     %RAX, %RAX
> > +# ifdef USE_AS_WCSRCHR
> > +       leaq    (%rsi, %rax, CHAR_SIZE), %rax
> > +# else
> > +       add     %rsi, %rax
> > +# endif
> > +       ret
> > +
> > +L(align_more):
> > +       /* Zero r8 to store match result.  */
> > +       xorq    %r8, %r8
> > > +       /* Save pointer to the first vector, in case no match is found.  */
> > +       movq    %rdi, %rsi
> > +       /* Align pointer to vector size.  */
> > +       andq    $-VEC_SIZE, %rdi
> > +       /* Loop unroll 2 times for 2 vector loop.  */
> > +       VMOVA   (VEC_SIZE)(%rdi), %VMM2
> > +       VPTESTN %VMM2, %VMM2, %k0
> > +       KMOV    %k0, %RCX
> > +       test    %RCX, %RCX
> > +       jnz     L(vector_x2_end)
> > +
> > > +       /* Save pointer to the second vector, in case no match is
> > > +          found.  */
> > +       movq    %rdi, %r9
> > +       /* Align address to VEC_SIZE * 2 for loop.  */
> > +       andq    $-(VEC_SIZE * 2), %rdi
> > +
> > +       .p2align 4,,11
> > +L(loop):
> > > +       /* Use a 2 vector loop, as it provides better performance than
> > > +          a 4 vector loop.  */
> > +       VMOVA   (VEC_SIZE * 2)(%rdi), %VMM3
> > +       VMOVA   (VEC_SIZE * 3)(%rdi), %VMM4
> > +       VPCMP   $0, %VMM3, %VMM0, %k1
> > +       VPCMP   $0, %VMM4, %VMM0, %k2
> > +       VPMINU  %VMM3, %VMM4, %VMM5
> > +       VPTESTN %VMM5, %VMM5, %k0
> > +       KOR     %k1, %k2, %k3
> > +       subq    $-(VEC_SIZE * 2), %rdi
> > > +       /* If k0 and k3 are both zero, neither a match nor the end of string was found.  */
> > +       KORTEST %k0, %k3
> > +       jz      L(loop)
> > +
> > +       /* If k0 is non zero, end of string found.  */
> > +       KORTEST %k0, %k0
> > +       jnz     L(endloop)
> > +
> > > +       /* A match was found; it needs to be stored in r8 before the
> > > +          loop continues.  */
> > +       /* Check second vector first.  */
> > +       KMOV    %k2, %RDX
> > +       test    %RDX, %RDX
> > +       jz      L(loop_vec_x3_ret)
> > +
> > +       BSR     %RDX, %RDX
> > +       leaq    (VEC_SIZE)(%rdi, %rdx, CHAR_SIZE), %r8
> > +       jmp     L(loop)
> > +
> > +       /* If second vector doesn't have match, first vector must
> > +          have match.  */
> > +L(loop_vec_x3_ret):
> > +       KMOV    %k1, %R8
> > +       BSR     %R8, %R8
> > +# ifdef USE_AS_WCSRCHR
> > +       leaq    (%rdi, %r8, CHAR_SIZE), %r8
> > +# else
> > +       add     %rdi, %r8
> > +# endif
> > +       jmp     L(loop)
> > +
> > +L(endloop):
> > +       /* Check if string end in first loop vector.  */
> > +       VPTESTN %VMM3, %VMM3, %k0
> > +       KMOV    %k0, %RCX
> > +       test    %RCX, %RCX
> > +       jnz     L(vector_x3_end)
> > +
> > +       /* Check if it has match in first loop vector.  */
> > +       KMOV    %k1, %RAX
> > +       test    %RAX, %RAX
> > +       jz      L(vector_x4_end)
> > +
> > +       BSR     %RAX, %RAX
> > +       leaq    (%rdi, %rax, CHAR_SIZE), %r8
> > +
> > +       /* String must end in second loop vector.  */
> > +L(vector_x4_end):
> > +       VPTESTN %VMM4, %VMM4, %k0
> > +       KMOV    %k0, %RCX
> > +       KMOV    %k2, %RAX
> > +       BLSMSK  %RCX, %RCX
> > +       /* Check if it has match in second loop vector.  */
> > +       and     %RCX, %RAX
> > +       jz      L(check_last_match)
> > +
> > +       BSR     %RAX, %RAX
> > +       leaq    (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
> > +       ret
> > +
> > +       /* String end in first loop vector.  */
> > +L(vector_x3_end):
> > +       KMOV    %k1, %RAX
> > +       BLSMSK  %RCX, %RCX
> > > +       /* Check if it has match in first loop vector.  */
> > +       and     %RCX, %RAX
> > +       jz      L(check_last_match)
> > +
> > +       BSR     %RAX, %RAX
> > +       leaq    (%rdi, %rax, CHAR_SIZE), %rax
> > +       ret
> > +
> > +       /* No match in first and second loop vector.  */
> > +L(check_last_match):
> > +       /* Check if any match recorded in r8.  */
> > +       test    %r8, %r8
> > +       jz      L(vector_x2_ret)
> > +       movq    %r8, %rax
> > +       ret
> > +
> > > +       /* No match recorded in r8. Check the second vector saved
> > > +          at the beginning.  */
> > +L(vector_x2_ret):
> > +       VPCMP   $0, %VMM2, %VMM0, %k2
> > +       KMOV    %k2, %RAX
> > +       test    %RAX, %RAX
> > +       jz      L(vector_x1_ret)
> > +
> > +       /* Match found in the second saved vector.  */
> > +       BSR     %RAX, %RAX
> > +       leaq    (VEC_SIZE)(%r9, %rax, CHAR_SIZE), %rax
> > +       ret
> > +
> > +L(page_cross):
> > +       movl    %eax, %ecx
> > +# ifdef USE_AS_WCSRCHR
> > +       /* Calculate number of compare result bits to be skipped for
> > +          wide string alignment adjustment.  */
> > +       andl    $(VEC_SIZE - 1), %ecx
> > +       sarl    $2, %ecx
> > +# endif
> > +       /* ecx contains number of w[char] to be skipped as a result
> > +          of address alignment.  */
> > +       xorq    %rdi, %rax
> > +       VMOVA   (PAGE_SIZE - VEC_SIZE)(%rax), %VMM1
> > +
> > +       VPTESTN %VMM1, %VMM1, %k1
> > +       KMOV    %k1, %RAX
> > +       SHR     %cl, %RAX
> > +       jz      L(page_cross_continue)
> > +       VPCMP   $0, %VMM1, %VMM0, %k0
> > +       KMOV    %k0, %RDX
> > +       SHR     %cl, %RDX
> > +       BLSMSK  %RAX, %RAX
> > +       and     %RDX, %RAX
> > +       jz      L(ret)
> > +       BSR     %RAX, %RAX
> > +# ifdef USE_AS_WCSRCHR
> > +       leaq    (%rdi, %rax, CHAR_SIZE), %rax
> > +# else
> > +       add     %rdi, %rax
> > +# endif
> > +
> > +       ret
> > +END (STRRCHR)
> > +#endif
> > diff --git a/sysdeps/x86_64/multiarch/strrchr-evex512.S b/sysdeps/x86_64/multiarch/strrchr-evex512.S
> > new file mode 100644
> > index 0000000000..f880848e09
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/strrchr-evex512.S
> > @@ -0,0 +1,7 @@
> > +# ifndef STRRCHR
> > +#  define STRRCHR      __strrchr_evex512
> > +# endif
> > +
> > +#define VEC_SIZE       64
> > +
> > +#include "strrchr-evex-base.S"
> > diff --git a/sysdeps/x86_64/multiarch/wcsrchr-evex512.S b/sysdeps/x86_64/multiarch/wcsrchr-evex512.S
> > new file mode 100644
> > index 0000000000..65b7710b22
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/wcsrchr-evex512.S
> > @@ -0,0 +1,8 @@
> > +#ifndef WCSRCHR
> > +# define WCSRCHR       __wcsrchr_evex512
> > +#endif
> > +
> > +#define STRRCHR        WCSRCHR
> > +#define USE_AS_WCSRCHR 1
> > +
> > +#include "strrchr-evex512.S"
> > --
> > 2.36.1
> >

[-- Attachment #2: wcsrchr.txt --]
[-- Type: text/plain, Size: 41618 bytes --]

Function: wcsrchr
Variant: 
                                    __wcsrchr_evex	__wcsrchr_evex512
========================================================================================================================
            len=256, pos=64, align=1:        21.29	       16.97 ( 20.28%)	
            len=256, pos=64, align=1:        22.62	       15.73 ( 30.45%)	
           len=256, pos=64, align=15:        21.59	       16.63 ( 22.97%)	
           len=256, pos=64, align=15:        21.23	       17.50 ( 17.57%)	
            len=256, pos=64, align=2:        20.73	       14.58 ( 29.65%)	
            len=256, pos=64, align=2:        19.78	       15.80 ( 20.10%)	
           len=256, pos=64, align=30:        20.09	       17.02 ( 15.29%)	
           len=256, pos=64, align=30:        19.06	       16.43 ( 13.81%)	
            len=256, pos=64, align=3:        18.44	       14.15 ( 23.24%)	
            len=256, pos=64, align=3:        16.87	       11.87 ( 29.62%)	
           len=256, pos=64, align=45:        17.03	       13.87 ( 18.58%)	
           len=256, pos=64, align=45:        17.36	       14.59 ( 15.96%)	
            len=256, pos=64, align=4:        17.64	       13.28 ( 24.72%)	
            len=256, pos=64, align=4:        17.13	       12.94 ( 24.44%)	
           len=256, pos=64, align=60:        17.21	       14.14 ( 17.82%)	
           len=256, pos=64, align=60:        16.67	       12.99 ( 22.09%)	
            len=256, pos=64, align=5:        16.32	       13.24 ( 18.86%)	
            len=256, pos=64, align=5:        15.09	       12.53 ( 16.95%)	
           len=256, pos=64, align=75:        15.11	       12.34 ( 18.32%)	
           len=256, pos=64, align=75:        14.67	       12.00 ( 18.19%)	
            len=256, pos=64, align=6:        14.67	       10.22 ( 30.30%)	
            len=256, pos=64, align=6:        15.11	       11.95 ( 20.90%)	
           len=256, pos=64, align=90:        14.85	       13.65 (  8.09%)	
           len=256, pos=64, align=90:        14.50	       13.53 (  6.70%)	
            len=256, pos=64, align=7:        14.53	       12.00 ( 17.40%)	
            len=256, pos=64, align=7:        14.53	       12.00 ( 17.41%)	
          len=256, pos=64, align=105:        14.67	       10.22 ( 30.30%)	
          len=256, pos=64, align=105:        16.23	       11.89 ( 26.74%)	
               len=1, pos=0, align=0:         4.67	        4.67 (  0.03%)	
               len=1, pos=0, align=0:         4.67	        5.33 (-14.27%)	
               len=2, pos=1, align=0:         4.67	        5.28 (-13.12%)	
               len=2, pos=1, align=0:         4.67	        5.33 (-14.23%)	
               len=3, pos=2, align=0:         5.33	        5.33 ( -0.00%)	
               len=3, pos=2, align=0:         4.67	        4.67 (  0.00%)	
               len=4, pos=3, align=0:         5.33	        5.33 (  0.00%)	
               len=4, pos=3, align=0:         4.67	        5.07 ( -8.66%)	
               len=5, pos=4, align=0:         4.67	        5.33 (-14.23%)	
               len=5, pos=4, align=0:         4.67	        5.33 (-14.23%)	
               len=6, pos=5, align=0:         4.67	        4.67 (  0.04%)	
               len=6, pos=5, align=0:         4.67	        5.33 (-14.22%)	
               len=7, pos=6, align=0:         4.67	        5.33 (-14.22%)	
               len=7, pos=6, align=0:         4.67	        5.33 (-14.23%)	
               len=8, pos=7, align=0:         4.67	        4.67 (  0.01%)	
               len=8, pos=7, align=0:         4.67	        4.67 (  0.05%)	
               len=9, pos=8, align=0:         8.00	        4.67 ( 41.67%)	
               len=9, pos=8, align=0:         8.00	        5.33 ( 33.33%)	
              len=10, pos=9, align=0:         8.00	        4.67 ( 41.64%)	
              len=10, pos=9, align=0:         7.33	        5.33 ( 27.27%)	
             len=11, pos=10, align=0:         8.00	        4.67 ( 41.66%)	
             len=11, pos=10, align=0:         8.17	        5.33 ( 34.71%)	
             len=12, pos=11, align=0:         8.00	        5.33 ( 33.33%)	
             len=12, pos=11, align=0:         8.00	        4.67 ( 41.66%)	
             len=13, pos=12, align=0:         8.00	        4.67 ( 41.66%)	
             len=13, pos=12, align=0:         8.00	        4.67 ( 41.64%)	
             len=14, pos=13, align=0:         8.00	        4.67 ( 41.64%)	
             len=14, pos=13, align=0:         8.00	        5.33 ( 33.33%)	
             len=15, pos=14, align=0:         8.00	        4.67 ( 41.66%)	
             len=15, pos=14, align=0:         8.00	        4.67 ( 41.64%)	
             len=16, pos=15, align=0:         7.33	        4.67 ( 36.34%)	
             len=16, pos=15, align=0:         8.00	        4.67 ( 41.66%)	
             len=17, pos=16, align=0:         6.67	        6.01 (  9.98%)	
             len=17, pos=16, align=0:         6.67	        6.00 ( 10.00%)	
             len=18, pos=17, align=0:         6.67	        6.00 (  9.99%)	
             len=18, pos=17, align=0:         6.67	        6.00 (  9.99%)	
             len=19, pos=18, align=0:         6.00	        6.00 ( -0.02%)	
             len=19, pos=18, align=0:         6.67	        6.00 ( 10.00%)	
             len=20, pos=19, align=0:         6.67	        6.00 ( 10.00%)	
             len=20, pos=19, align=0:         6.67	        6.00 ( 10.03%)	
             len=21, pos=20, align=0:         6.67	        6.00 ( 10.03%)	
             len=21, pos=20, align=0:         6.67	        6.17 (  7.49%)	
             len=22, pos=21, align=0:         6.67	        6.00 ( 10.00%)	
             len=22, pos=21, align=0:         6.67	        6.00 ( 10.02%)	
             len=23, pos=22, align=0:         6.67	        6.00 ( 10.01%)	
             len=23, pos=22, align=0:         6.67	        6.00 ( 10.00%)	
             len=24, pos=23, align=0:         6.67	        6.17 (  7.50%)	
             len=24, pos=23, align=0:         6.67	        6.00 ( 10.03%)	
             len=25, pos=24, align=0:         8.00	        6.00 ( 25.03%)	
             len=25, pos=24, align=0:         8.67	        6.00 ( 30.75%)	
             len=26, pos=25, align=0:         8.67	        6.00 ( 30.77%)	
             len=26, pos=25, align=0:         8.00	        6.00 ( 25.01%)	
             len=27, pos=26, align=0:         8.00	        6.00 ( 24.98%)	
             len=27, pos=26, align=0:         8.00	        6.00 ( 24.97%)	
             len=28, pos=27, align=0:         8.00	        6.00 ( 25.01%)	
             len=28, pos=27, align=0:         8.00	        6.00 ( 25.01%)	
             len=29, pos=28, align=0:         8.00	        6.00 ( 24.99%)	
             len=29, pos=28, align=0:         8.00	        6.00 ( 25.01%)	
             len=30, pos=29, align=0:         8.67	        6.00 ( 30.74%)	
             len=30, pos=29, align=0:         8.00	        6.00 ( 25.01%)	
             len=31, pos=30, align=0:         8.00	        6.00 ( 25.01%)	
             len=31, pos=30, align=0:         8.00	        6.00 ( 24.98%)	
             len=32, pos=31, align=0:         8.67	        6.00 ( 30.77%)	
             len=32, pos=31, align=0:         8.00	        6.00 ( 25.01%)	
            len=256, pos=64, align=1:        35.05	       27.10 ( 22.68%)	
            len=256, pos=64, align=1:        35.09	       27.05 ( 22.92%)	
           len=256, pos=64, align=15:        33.65	       27.42 ( 18.51%)	
           len=256, pos=64, align=15:        34.41	       26.49 ( 23.01%)	
            len=256, pos=64, align=2:        35.11	       27.10 ( 22.83%)	
            len=256, pos=64, align=2:        34.71	       26.18 ( 24.58%)	
           len=256, pos=64, align=30:        35.01	       26.04 ( 25.62%)	
           len=256, pos=64, align=30:        34.90	       25.79 ( 26.10%)	
            len=256, pos=64, align=3:        33.87	       26.38 ( 22.10%)	
            len=256, pos=64, align=3:        33.96	       26.76 ( 21.18%)	
           len=256, pos=64, align=45:        34.63	       26.45 ( 23.62%)	
           len=256, pos=64, align=45:        33.97	       26.06 ( 23.27%)	
            len=256, pos=64, align=4:        34.63	       26.34 ( 23.95%)	
            len=256, pos=64, align=4:        34.74	       27.33 ( 21.33%)	
           len=256, pos=64, align=60:        33.84	       25.74 ( 23.92%)	
           len=256, pos=64, align=60:        35.60	       28.81 ( 19.07%)	
            len=256, pos=64, align=5:        35.25	       27.10 ( 23.11%)	
            len=256, pos=64, align=5:        34.73	       26.43 ( 23.91%)	
           len=256, pos=64, align=75:        33.96	       26.59 ( 21.69%)	
           len=256, pos=64, align=75:        35.08	       26.48 ( 24.52%)	
            len=256, pos=64, align=6:        36.00	       26.15 ( 27.38%)	
            len=256, pos=64, align=6:        34.76	       26.63 ( 23.38%)	
           len=256, pos=64, align=90:        34.88	       27.15 ( 22.17%)	
           len=256, pos=64, align=90:        34.69	       27.05 ( 22.00%)	
            len=256, pos=64, align=7:        33.85	       26.29 ( 22.32%)	
            len=256, pos=64, align=7:        35.03	       27.08 ( 22.70%)	
          len=256, pos=64, align=105:        35.22	       26.11 ( 25.87%)	
          len=256, pos=64, align=105:        34.91	       27.23 ( 21.99%)	
               len=1, pos=0, align=0:         5.19	        6.47 (-24.54%)	
               len=1, pos=0, align=0:         4.67	        5.33 (-14.24%)	
               len=2, pos=1, align=0:         4.83	        4.67 (  3.42%)	
               len=2, pos=1, align=0:         4.67	        4.67 (  0.00%)	
               len=3, pos=2, align=0:         5.33	        5.33 ( -0.00%)	
               len=3, pos=2, align=0:         4.67	        4.67 (  0.04%)	
               len=4, pos=3, align=0:         4.67	        5.33 (-14.23%)	
               len=4, pos=3, align=0:         4.67	        5.33 (-14.24%)	
               len=5, pos=4, align=0:         4.67	        5.33 (-14.23%)	
               len=5, pos=4, align=0:         4.67	        4.67 (  0.03%)	
               len=6, pos=5, align=0:         4.67	        4.67 (  0.03%)	
               len=6, pos=5, align=0:         4.67	        4.67 (  0.04%)	
               len=7, pos=6, align=0:         4.67	        5.33 (-14.23%)	
               len=7, pos=6, align=0:         4.67	        5.33 (-14.24%)	
               len=8, pos=7, align=0:         7.34	        5.33 ( 27.32%)	
               len=8, pos=7, align=0:         7.33	        5.33 ( 27.28%)	
               len=9, pos=8, align=0:         8.00	        4.67 ( 41.66%)	
               len=9, pos=8, align=0:         8.00	        5.33 ( 33.33%)	
              len=10, pos=9, align=0:         8.00	        5.39 ( 32.61%)	
              len=10, pos=9, align=0:         8.00	        4.67 ( 41.66%)	
             len=11, pos=10, align=0:         8.16	        5.33 ( 34.66%)	
             len=11, pos=10, align=0:         8.00	        5.33 ( 33.33%)	
             len=12, pos=11, align=0:         8.00	        4.67 ( 41.64%)	
             len=12, pos=11, align=0:         8.00	        5.33 ( 33.33%)	
             len=13, pos=12, align=0:         8.00	        4.67 ( 41.65%)	
             len=13, pos=12, align=0:         7.33	        5.33 ( 27.26%)	
             len=14, pos=13, align=0:         8.00	        5.33 ( 33.33%)	
             len=14, pos=13, align=0:         8.00	        5.33 ( 33.33%)	
             len=15, pos=14, align=0:         8.00	        4.67 ( 41.64%)	
             len=15, pos=14, align=0:         8.00	        4.67 ( 41.66%)	
             len=16, pos=15, align=0:         8.67	        6.89 ( 20.50%)	
             len=16, pos=15, align=0:         8.67	        6.67 ( 23.06%)	
             len=17, pos=16, align=0:         6.85	        6.00 ( 12.31%)	
             len=17, pos=16, align=0:         6.00	        6.00 (  0.04%)	
             len=18, pos=17, align=0:         6.00	        6.00 (  0.02%)	
             len=18, pos=17, align=0:         6.67	        6.00 ( 10.03%)	
             len=19, pos=18, align=0:         6.67	        6.00 (  9.97%)	
             len=19, pos=18, align=0:         6.67	        6.00 (  9.99%)	
             len=20, pos=19, align=0:         6.67	        6.00 ( 10.02%)	
             len=20, pos=19, align=0:         6.67	        6.00 ( 10.02%)	
             len=21, pos=20, align=0:         6.35	        6.35 (  0.01%)	
             len=21, pos=20, align=0:         7.06	        6.79 (  3.87%)	
             len=22, pos=21, align=0:         6.67	        6.17 (  7.56%)	
             len=22, pos=21, align=0:         6.67	        6.00 ( 10.02%)	
             len=23, pos=22, align=0:         6.67	        6.00 ( 10.03%)	
             len=23, pos=22, align=0:         6.00	        6.00 (  0.02%)	
             len=24, pos=23, align=0:         8.67	        6.00 ( 30.80%)	
             len=24, pos=23, align=0:         9.34	        6.00 ( 35.72%)	
             len=25, pos=24, align=0:         8.00	        6.00 ( 24.99%)	
             len=25, pos=24, align=0:         8.00	        6.00 ( 25.01%)	
             len=26, pos=25, align=0:         8.00	        6.00 ( 25.01%)	
             len=26, pos=25, align=0:         8.00	        6.00 ( 25.01%)	
             len=27, pos=26, align=0:         8.00	        6.00 ( 24.99%)	
             len=27, pos=26, align=0:         8.94	        6.00 ( 32.88%)	
             len=28, pos=27, align=0:         8.00	        6.00 ( 24.99%)	
             len=28, pos=27, align=0:         8.00	        6.00 ( 25.01%)	
             len=29, pos=28, align=0:         8.00	        6.00 ( 25.01%)	
             len=29, pos=28, align=0:         8.00	        6.00 ( 25.01%)	
             len=30, pos=29, align=0:         8.00	        6.00 ( 25.01%)	
             len=30, pos=29, align=0:         8.00	        6.00 ( 25.01%)	
             len=31, pos=30, align=0:         8.00	        6.00 ( 25.01%)	
             len=31, pos=30, align=0:         8.00	        6.00 ( 25.01%)	
             len=32, pos=31, align=0:        11.34	       10.01 ( 11.75%)	
             len=32, pos=31, align=0:        11.34	       10.00 ( 11.77%)	
            len=256, pos=64, align=1:        35.77	       29.33 ( 18.01%)	
            len=256, pos=64, align=1:        35.59	       29.50 ( 17.12%)	
           len=256, pos=64, align=15:        35.83	       29.30 ( 18.20%)	
           len=256, pos=64, align=15:        35.58	       29.17 ( 18.02%)	
            len=256, pos=64, align=2:        35.72	       29.50 ( 17.42%)	
            len=256, pos=64, align=2:        35.59	       29.33 ( 17.58%)	
           len=256, pos=64, align=30:        35.72	       29.16 ( 18.37%)	
           len=256, pos=64, align=30:        35.76	       26.18 ( 26.79%)	
            len=256, pos=64, align=3:        35.41	       29.44 ( 16.86%)	
            len=256, pos=64, align=3:        36.34	       29.30 ( 19.38%)	
           len=256, pos=64, align=45:        35.98	       28.28 ( 21.38%)	
           len=256, pos=64, align=45:        35.87	       29.65 ( 17.34%)	
            len=256, pos=64, align=4:        35.74	       29.37 ( 17.83%)	
            len=256, pos=64, align=4:        35.72	       31.72 ( 11.21%)	
           len=256, pos=64, align=60:        36.87	       26.12 ( 29.16%)	
           len=256, pos=64, align=60:        35.67	       26.67 ( 25.24%)	
            len=256, pos=64, align=5:        35.89	       29.31 ( 18.35%)	
            len=256, pos=64, align=5:        35.60	       29.50 ( 17.14%)	
           len=256, pos=64, align=75:        37.68	       30.98 ( 17.78%)	
           len=256, pos=64, align=75:        35.70	       28.69 ( 19.62%)	
            len=256, pos=64, align=6:        36.02	       29.06 ( 19.33%)	
            len=256, pos=64, align=6:        35.73	       29.21 ( 18.23%)	
           len=256, pos=64, align=90:        36.28	       28.19 ( 22.30%)	
           len=256, pos=64, align=90:        36.05	       26.19 ( 27.35%)	
            len=256, pos=64, align=7:        35.75	       28.19 ( 21.13%)	
            len=256, pos=64, align=7:        35.80	       29.34 ( 18.04%)	
          len=256, pos=64, align=105:        36.05	       28.68 ( 20.44%)	
          len=256, pos=64, align=105:        35.85	       29.14 ( 18.72%)	
               len=1, pos=0, align=0:         4.54	        6.40 (-40.85%)	
               len=1, pos=0, align=0:         4.67	        4.67 (  0.03%)	
               len=2, pos=1, align=0:         5.33	        4.67 ( 12.46%)	
               len=2, pos=1, align=0:         4.67	        4.83 ( -3.51%)	
               len=3, pos=2, align=0:         4.67	        4.67 ( -0.04%)	
               len=3, pos=2, align=0:         4.67	        4.67 ( -0.01%)	
               len=4, pos=3, align=0:         4.67	        5.33 (-14.26%)	
               len=4, pos=3, align=0:         4.67	        5.33 (-14.30%)	
               len=5, pos=4, align=0:         4.67	        5.33 (-14.25%)	
               len=5, pos=4, align=0:         4.67	        4.67 (  0.03%)	
               len=6, pos=5, align=0:         4.94	        5.65 (-14.22%)	
               len=6, pos=5, align=0:         4.94	        4.94 (  0.00%)	
               len=7, pos=6, align=0:         4.67	        4.67 (  0.03%)	
               len=7, pos=6, align=0:         4.67	        4.67 (  0.00%)	
               len=8, pos=7, align=0:         7.34	        5.33 ( 27.31%)	
               len=8, pos=7, align=0:         8.00	        5.33 ( 33.33%)	
               len=9, pos=8, align=0:         8.00	        4.67 ( 41.66%)	
               len=9, pos=8, align=0:         7.33	        4.67 ( 36.34%)	
              len=10, pos=9, align=0:         7.33	        5.33 ( 27.27%)	
              len=10, pos=9, align=0:         8.16	        5.33 ( 34.67%)	
             len=11, pos=10, align=0:         8.00	        5.33 ( 33.34%)	
             len=11, pos=10, align=0:         8.00	        5.33 ( 33.33%)	
             len=12, pos=11, align=0:         8.00	        4.67 ( 41.64%)	
             len=12, pos=11, align=0:         8.00	        4.67 ( 41.64%)	
             len=13, pos=12, align=0:         8.16	        5.33 ( 34.67%)	
             len=13, pos=12, align=0:         7.33	        4.67 ( 36.36%)	
             len=14, pos=13, align=0:         8.00	        4.67 ( 41.64%)	
             len=14, pos=13, align=0:         8.16	        5.33 ( 34.65%)	
             len=15, pos=14, align=0:         8.00	        5.33 ( 33.33%)	
             len=15, pos=14, align=0:         8.00	        4.67 ( 41.64%)	
             len=16, pos=15, align=0:         8.45	        6.89 ( 18.45%)	
             len=16, pos=15, align=0:         8.67	        6.67 ( 23.06%)	
             len=17, pos=16, align=0:         7.34	        6.00 ( 18.17%)	
             len=17, pos=16, align=0:         6.67	        6.00 (  9.99%)	
             len=18, pos=17, align=0:         6.00	        6.00 (  0.02%)	
             len=18, pos=17, align=0:         6.00	        6.00 (  0.03%)	
             len=19, pos=18, align=0:         6.56	        6.00 (  8.58%)	
             len=19, pos=18, align=0:         6.67	        6.00 ( 10.03%)	
             len=20, pos=19, align=0:         6.67	        6.14 (  7.90%)	
             len=20, pos=19, align=0:         6.67	        6.00 ( 10.01%)	
             len=21, pos=20, align=0:         6.67	        6.00 ( 10.03%)	
             len=21, pos=20, align=0:         6.67	        6.00 ( 10.01%)	
             len=22, pos=21, align=0:         6.67	        6.00 ( 10.01%)	
             len=22, pos=21, align=0:         6.76	        6.00 ( 11.25%)	
             len=23, pos=22, align=0:         6.67	        6.00 ( 10.00%)	
             len=23, pos=22, align=0:         6.67	        6.00 ( 10.03%)	
             len=24, pos=23, align=0:         8.67	        6.00 ( 30.80%)	
             len=24, pos=23, align=0:         9.34	        6.00 ( 35.72%)	
             len=25, pos=24, align=0:         8.00	        6.00 ( 25.02%)	
             len=25, pos=24, align=0:         8.00	        6.00 ( 25.01%)	
             len=26, pos=25, align=0:         8.00	        6.00 ( 24.98%)	
             len=26, pos=25, align=0:         8.00	        6.00 ( 25.01%)	
             len=27, pos=26, align=0:         8.00	        6.00 ( 25.01%)	
             len=27, pos=26, align=0:         8.00	        6.00 ( 25.01%)	
             len=28, pos=27, align=0:         8.00	        6.00 ( 25.01%)	
             len=28, pos=27, align=0:         8.00	        6.00 ( 25.01%)	
             len=29, pos=28, align=0:         8.00	        6.00 ( 25.01%)	
             len=29, pos=28, align=0:         8.00	        6.00 ( 25.01%)	
             len=30, pos=29, align=0:         8.00	        6.00 ( 25.00%)	
             len=30, pos=29, align=0:         8.00	        6.00 ( 25.01%)	
             len=31, pos=30, align=0:         8.00	        6.00 ( 25.01%)	
             len=31, pos=30, align=0:         8.00	        6.00 ( 25.01%)	
             len=32, pos=31, align=0:        12.67	       10.01 ( 21.04%)	
             len=32, pos=31, align=0:        12.00	       10.00 ( 16.67%)	
            len=256, pos=64, align=1:        35.59	       30.41 ( 14.58%)	
            len=256, pos=64, align=1:        35.40	       29.31 ( 17.18%)	
           len=256, pos=64, align=15:        35.64	       27.78 ( 22.06%)	
           len=256, pos=64, align=15:        35.32	       30.57 ( 13.44%)	
            len=256, pos=64, align=2:        35.83	       30.82 ( 13.98%)	
            len=256, pos=64, align=2:        35.37	       29.08 ( 17.79%)	
           len=256, pos=64, align=30:        36.50	       26.16 ( 28.32%)	
           len=256, pos=64, align=30:        36.09	       26.28 ( 27.18%)	
            len=256, pos=64, align=3:        35.82	       28.42 ( 20.67%)	
            len=256, pos=64, align=3:        34.95	       27.73 ( 20.66%)	
           len=256, pos=64, align=45:        35.65	       27.80 ( 22.01%)	
           len=256, pos=64, align=45:        35.44	       28.46 ( 19.70%)	
            len=256, pos=64, align=4:        35.34	       30.41 ( 13.97%)	
            len=256, pos=64, align=4:        35.60	       27.78 ( 21.97%)	
           len=256, pos=64, align=60:        35.43	       27.70 ( 21.83%)	
           len=256, pos=64, align=60:        36.41	       27.75 ( 23.79%)	
            len=256, pos=64, align=5:        35.44	       30.23 ( 14.71%)	
            len=256, pos=64, align=5:        36.77	       30.43 ( 17.25%)	
           len=256, pos=64, align=75:        35.31	       27.94 ( 20.88%)	
           len=256, pos=64, align=75:        35.76	       28.37 ( 20.67%)	
            len=256, pos=64, align=6:        35.38	       28.28 ( 20.08%)	
            len=256, pos=64, align=6:        35.69	       27.78 ( 22.17%)	
           len=256, pos=64, align=90:        36.20	       27.77 ( 23.29%)	
           len=256, pos=64, align=90:        36.49	       29.13 ( 20.18%)	
            len=256, pos=64, align=7:        35.43	       27.78 ( 21.60%)	
            len=256, pos=64, align=7:        37.20	       29.20 ( 21.52%)	
          len=256, pos=64, align=105:        35.57	       27.73 ( 22.06%)	
          len=256, pos=64, align=105:        36.33	       28.43 ( 21.73%)	
               len=1, pos=0, align=0:         4.54	        6.39 (-40.72%)	
               len=1, pos=0, align=0:         4.67	        4.67 (  0.04%)	
               len=2, pos=1, align=0:         5.33	        5.33 ( -0.01%)	
               len=2, pos=1, align=0:         4.67	        5.33 (-14.29%)	
               len=3, pos=2, align=0:         4.67	        4.67 (  0.03%)	
               len=3, pos=2, align=0:         4.67	        4.67 (  0.04%)	
               len=4, pos=3, align=0:         4.67	        4.67 (  0.03%)	
               len=4, pos=3, align=0:         4.67	        4.67 (  0.00%)	
               len=5, pos=4, align=0:         4.67	        5.33 (-14.23%)	
               len=5, pos=4, align=0:         4.67	        4.67 (  0.04%)	
               len=6, pos=5, align=0:         4.67	        4.67 (  0.00%)	
               len=6, pos=5, align=0:         4.67	        5.33 (-14.24%)	
               len=7, pos=6, align=0:         4.90	        5.33 ( -8.94%)	
               len=7, pos=6, align=0:         4.67	        5.33 (-14.28%)	
               len=8, pos=7, align=0:         7.51	        5.33 ( 29.00%)	
               len=8, pos=7, align=0:         8.00	        5.33 ( 33.33%)	
               len=9, pos=8, align=0:         8.00	        5.33 ( 33.35%)	
               len=9, pos=8, align=0:         8.00	        4.67 ( 41.66%)	
              len=10, pos=9, align=0:         8.00	        5.33 ( 33.33%)	
              len=10, pos=9, align=0:         8.00	        5.33 ( 33.33%)	
             len=11, pos=10, align=0:         8.00	        4.67 ( 41.66%)	
             len=11, pos=10, align=0:         8.16	        4.67 ( 42.83%)	
             len=12, pos=11, align=0:         8.00	        4.67 ( 41.64%)	
             len=12, pos=11, align=0:         8.00	        5.33 ( 33.33%)	
             len=13, pos=12, align=0:         7.33	        5.33 ( 27.27%)	
             len=13, pos=12, align=0:         8.00	        4.67 ( 41.64%)	
             len=14, pos=13, align=0:         8.00	        4.67 ( 41.64%)	
             len=14, pos=13, align=0:         8.00	        4.67 ( 41.66%)	
             len=15, pos=14, align=0:         7.33	        4.67 ( 36.36%)	
             len=15, pos=14, align=0:         8.00	        4.67 ( 41.64%)	
             len=16, pos=15, align=0:         8.67	        6.89 ( 20.50%)	
             len=16, pos=15, align=0:         8.67	        6.83 ( 21.15%)	
             len=17, pos=16, align=0:         6.67	        6.00 ( 10.01%)	
             len=17, pos=16, align=0:         6.00	        6.00 ( -0.01%)	
             len=18, pos=17, align=0:         6.67	        6.16 (  7.60%)	
             len=18, pos=17, align=0:         6.67	        6.00 ( 10.03%)	
             len=19, pos=18, align=0:         6.00	        6.00 (  0.02%)	
             len=19, pos=18, align=0:         6.67	        6.00 ( 10.03%)	
             len=20, pos=19, align=0:         6.83	        6.00 ( 12.16%)	
             len=20, pos=19, align=0:         6.67	        6.00 ( 10.00%)	
             len=21, pos=20, align=0:         6.00	        6.00 (  0.05%)	
             len=21, pos=20, align=0:         6.67	        6.00 ( 10.03%)	
             len=22, pos=21, align=0:         6.67	        6.00 ( 10.03%)	
             len=22, pos=21, align=0:         6.67	        6.00 ( 10.00%)	
             len=23, pos=22, align=0:         6.67	        6.16 (  7.62%)	
             len=23, pos=22, align=0:         6.67	        6.00 ( 10.00%)	
             len=24, pos=23, align=0:         8.67	        6.00 ( 30.80%)	
             len=24, pos=23, align=0:         9.33	        6.00 ( 35.71%)	
             len=25, pos=24, align=0:         8.00	        6.00 ( 25.00%)	
             len=25, pos=24, align=0:         8.00	        6.00 ( 25.01%)	
             len=26, pos=25, align=0:         8.00	        6.00 ( 25.01%)	
             len=26, pos=25, align=0:         8.67	        6.00 ( 30.76%)	
             len=27, pos=26, align=0:         8.67	        6.00 ( 30.74%)	
             len=27, pos=26, align=0:         8.00	        6.00 ( 25.01%)	
             len=28, pos=27, align=0:         8.00	        6.00 ( 25.01%)	
             len=28, pos=27, align=0:         8.00	        6.00 ( 25.01%)	
             len=29, pos=28, align=0:         8.00	        6.00 ( 25.01%)	
             len=29, pos=28, align=0:         8.00	        6.00 ( 25.01%)	
             len=30, pos=29, align=0:         8.00	        6.00 ( 24.99%)	
             len=30, pos=29, align=0:         8.00	        6.00 ( 25.01%)	
             len=31, pos=30, align=0:         8.00	        6.00 ( 24.99%)	
             len=31, pos=30, align=0:         8.00	        6.00 ( 25.01%)	
             len=32, pos=31, align=0:        12.67	       10.01 ( 21.05%)	
             len=32, pos=31, align=0:        12.76	       10.00 ( 21.63%)	
            len=256, pos=64, align=1:        35.37	       29.05 ( 17.87%)	
            len=256, pos=64, align=1:        35.73	       28.45 ( 20.38%)	
           len=256, pos=64, align=15:        35.40	       29.74 ( 16.00%)	
           len=256, pos=64, align=15:        36.82	       29.23 ( 20.62%)	
            len=256, pos=64, align=2:        35.51	       27.76 ( 21.82%)	
            len=256, pos=64, align=2:        35.31	       27.99 ( 20.72%)	
           len=256, pos=64, align=30:        35.03	       27.77 ( 20.72%)	
           len=256, pos=64, align=30:        35.72	       27.15 ( 24.00%)	
            len=256, pos=64, align=3:        35.46	       30.45 ( 14.12%)	
            len=256, pos=64, align=3:        35.43	       27.88 ( 21.31%)	
           len=256, pos=64, align=45:        35.36	       27.73 ( 21.58%)	
           len=256, pos=64, align=45:        35.55	       27.73 ( 21.99%)	
            len=256, pos=64, align=4:        35.45	       27.98 ( 21.07%)	
            len=256, pos=64, align=4:        38.56	       30.94 ( 19.77%)	
           len=256, pos=64, align=60:        35.43	       29.30 ( 17.29%)	
           len=256, pos=64, align=60:        35.34	       28.17 ( 20.29%)	
            len=256, pos=64, align=5:        36.10	       30.37 ( 15.88%)	
            len=256, pos=64, align=5:        35.38	       28.48 ( 19.51%)	
           len=256, pos=64, align=75:        36.42	       27.73 ( 23.87%)	
           len=256, pos=64, align=75:        35.39	       27.93 ( 21.10%)	
            len=256, pos=64, align=6:        36.58	       30.37 ( 16.97%)	
            len=256, pos=64, align=6:        35.37	       28.90 ( 18.31%)	
           len=256, pos=64, align=90:        35.55	       26.13 ( 26.50%)	
           len=256, pos=64, align=90:        36.39	       29.42 ( 19.14%)	
            len=256, pos=64, align=7:        35.32	       29.51 ( 16.44%)	
            len=256, pos=64, align=7:        35.73	       27.73 ( 22.41%)	
          len=256, pos=64, align=105:        35.36	       30.36 ( 14.14%)	
          len=256, pos=64, align=105:        35.41	       27.82 ( 21.42%)	
               len=1, pos=0, align=0:         4.80	        6.77 (-41.02%)	
               len=1, pos=0, align=0:         5.65	        5.65 ( -0.00%)	
               len=2, pos=1, align=0:         4.94	        4.94 ( -0.01%)	
               len=2, pos=1, align=0:         5.65	        5.65 (  0.00%)	
               len=3, pos=2, align=0:         4.94	        4.94 (  0.00%)	
               len=3, pos=2, align=0:         4.94	        4.94 (  0.00%)	
               len=4, pos=3, align=0:         4.94	        5.15 ( -4.17%)	
               len=4, pos=3, align=0:         4.94	        4.94 (  0.03%)	
               len=5, pos=4, align=0:         5.16	        4.94 (  4.12%)	
               len=5, pos=4, align=0:         4.94	        4.94 (  0.04%)	
               len=6, pos=5, align=0:         4.94	        5.21 ( -5.35%)	
               len=6, pos=5, align=0:         4.94	        4.94 ( -0.01%)	
               len=7, pos=6, align=0:         4.94	        5.17 ( -4.55%)	
               len=7, pos=6, align=0:         4.94	        4.94 (  0.04%)	
               len=8, pos=7, align=0:         7.97	        5.65 ( 29.17%)	
               len=8, pos=7, align=0:         8.47	        5.65 ( 33.33%)	
               len=9, pos=8, align=0:         8.47	        5.65 ( 33.34%)	
               len=9, pos=8, align=0:         8.47	        4.94 ( 41.64%)	
              len=10, pos=9, align=0:         8.47	        4.94 ( 41.64%)	
              len=10, pos=9, align=0:         7.97	        5.65 ( 29.17%)	
             len=11, pos=10, align=0:         8.47	        5.65 ( 33.33%)	
             len=11, pos=10, align=0:         8.47	        4.94 ( 41.64%)	
             len=12, pos=11, align=0:         8.47	        5.65 ( 33.33%)	
             len=12, pos=11, align=0:         8.47	        5.65 ( 33.32%)	
             len=13, pos=12, align=0:         8.47	        5.65 ( 33.32%)	
             len=13, pos=12, align=0:         8.47	        5.65 ( 33.32%)	
             len=14, pos=13, align=0:         8.47	        5.65 ( 33.33%)	
             len=14, pos=13, align=0:         7.77	        5.65 ( 27.27%)	
             len=15, pos=14, align=0:         7.77	        5.14 ( 33.82%)	
             len=15, pos=14, align=0:         7.77	        5.65 ( 27.27%)	
             len=16, pos=15, align=0:         9.21	        7.30 ( 20.77%)	
             len=16, pos=15, align=0:         8.70	        6.67 ( 23.37%)	
             len=17, pos=16, align=0:         6.95	        6.00 ( 13.59%)	
             len=17, pos=16, align=0:         6.67	        6.00 ( 10.03%)	
             len=18, pos=17, align=0:         6.00	        6.00 (  0.03%)	
             len=18, pos=17, align=0:         6.67	        6.00 ( 10.02%)	
             len=19, pos=18, align=0:         6.67	        6.00 ( 10.03%)	
             len=19, pos=18, align=0:         6.67	        6.18 (  7.38%)	
             len=20, pos=19, align=0:         6.67	        6.00 ( 10.00%)	
             len=20, pos=19, align=0:         6.67	        6.00 ( 10.00%)	
             len=21, pos=20, align=0:         6.00	        6.00 (  0.02%)	
             len=21, pos=20, align=0:         6.67	        6.00 ( 10.03%)	
             len=22, pos=21, align=0:         6.67	        6.17 (  7.55%)	
             len=22, pos=21, align=0:         6.67	        6.00 ( 10.00%)	
             len=23, pos=22, align=0:         6.67	        6.00 ( 10.03%)	
             len=23, pos=22, align=0:         6.67	        6.00 ( 10.03%)	
             len=24, pos=23, align=0:         8.67	        6.00 ( 30.79%)	
             len=24, pos=23, align=0:         8.67	        6.00 ( 30.79%)	
             len=25, pos=24, align=0:         8.00	        6.00 ( 25.02%)	
             len=25, pos=24, align=0:         8.00	        6.00 ( 25.01%)	
             len=26, pos=25, align=0:         8.00	        6.00 ( 25.01%)	
             len=26, pos=25, align=0:         8.00	        6.00 ( 25.01%)	
             len=27, pos=26, align=0:         8.00	        6.00 ( 24.98%)	
             len=27, pos=26, align=0:         8.00	        6.00 ( 25.01%)	
             len=28, pos=27, align=0:         8.00	        6.00 ( 25.01%)	
             len=28, pos=27, align=0:         8.00	        6.00 ( 25.01%)	
             len=29, pos=28, align=0:         8.00	        6.00 ( 25.01%)	
             len=29, pos=28, align=0:         8.00	        6.00 ( 25.01%)	
             len=30, pos=29, align=0:         8.00	        6.00 ( 25.01%)	
             len=30, pos=29, align=0:         8.00	        6.00 ( 25.01%)	
             len=31, pos=30, align=0:         8.00	        6.00 ( 25.01%)	
             len=31, pos=30, align=0:         8.00	        6.00 ( 25.01%)	
             len=32, pos=31, align=0:        12.67	       10.00 ( 21.04%)	
             len=32, pos=31, align=0:        12.00	       10.00 ( 16.65%)	
            len=256, pos=64, align=1:        35.59	       30.38 ( 14.63%)	
            len=256, pos=64, align=1:        35.64	       30.35 ( 14.86%)	
           len=256, pos=64, align=15:        35.12	       30.56 ( 12.98%)	
           len=256, pos=64, align=15:        36.68	       27.83 ( 24.13%)	
            len=256, pos=64, align=2:        35.39	       30.60 ( 13.53%)	
            len=256, pos=64, align=2:        36.85	       27.83 ( 24.48%)	
           len=256, pos=64, align=30:        35.54	       29.10 ( 18.12%)	
           len=256, pos=64, align=30:        36.33	       27.73 ( 23.67%)	
            len=256, pos=64, align=3:        35.59	       29.07 ( 18.32%)	
            len=256, pos=64, align=3:        35.58	       29.68 ( 16.58%)	
           len=256, pos=64, align=45:        35.62	       30.36 ( 14.74%)	
           len=256, pos=64, align=45:        35.40	       27.93 ( 21.09%)	
            len=256, pos=64, align=4:        35.48	       27.78 ( 21.70%)	
            len=256, pos=64, align=4:        35.42	       27.94 ( 21.11%)	
           len=256, pos=64, align=60:        35.43	       29.13 ( 17.77%)	
           len=256, pos=64, align=60:        36.30	       26.18 ( 27.86%)	
            len=256, pos=64, align=5:        35.39	       27.73 ( 21.66%)	
            len=256, pos=64, align=5:        35.48	       27.80 ( 21.64%)	
           len=256, pos=64, align=75:        34.94	       27.92 ( 20.08%)	
           len=256, pos=64, align=75:        36.40	       27.82 ( 23.57%)	
            len=256, pos=64, align=6:        35.58	       30.01 ( 15.67%)	
            len=256, pos=64, align=6:        35.54	       30.36 ( 14.55%)	
           len=256, pos=64, align=90:        36.26	       27.06 ( 25.38%)	
           len=256, pos=64, align=90:        35.80	       29.05 ( 18.86%)	
            len=256, pos=64, align=7:        36.35	       29.73 ( 18.22%)	
            len=256, pos=64, align=7:        36.33	       27.74 ( 23.65%)	
          len=256, pos=64, align=105:        35.49	       30.41 ( 14.31%)	
          len=256, pos=64, align=105:        35.59	       29.50 ( 17.12%)	
               len=1, pos=0, align=0:         4.54	        6.47 (-42.30%)	
               len=1, pos=0, align=0:         4.67	        4.67 (  0.00%)	
               len=2, pos=1, align=0:         4.67	        4.67 (  0.04%)	
               len=2, pos=1, align=0:         5.33	        4.67 ( 12.47%)	
               len=3, pos=2, align=0:         4.94	        4.94 (  0.04%)	
               len=3, pos=2, align=0:         4.94	        4.94 (  0.00%)	
               len=4, pos=3, align=0:         4.84	        5.33 (-10.27%)	
               len=4, pos=3, align=0:         4.67	        5.33 (-14.24%)	
               len=5, pos=4, align=0:         4.83	        4.67 (  3.42%)	
               len=5, pos=4, align=0:         4.67	        4.67 ( -0.00%)	
               len=6, pos=5, align=0:         4.67	        5.30 (-13.40%)	
               len=6, pos=5, align=0:         4.67	        5.33 (-14.24%)	
               len=7, pos=6, align=0:         4.67	        4.67 (  0.02%)	
               len=7, pos=6, align=0:         4.67	        5.33 (-14.24%)	
               len=8, pos=7, align=0:         7.34	        5.33 ( 27.30%)	
               len=8, pos=7, align=0:         8.00	        5.33 ( 33.33%)	
               len=9, pos=8, align=0:         8.16	        5.33 ( 34.66%)	
               len=9, pos=8, align=0:         8.00	        5.33 ( 33.33%)	
              len=10, pos=9, align=0:         8.00	        5.33 ( 33.33%)	
              len=10, pos=9, align=0:         8.00	        5.33 ( 33.33%)	
             len=11, pos=10, align=0:         8.00	        4.67 ( 41.66%)	
             len=11, pos=10, align=0:         7.33	        5.33 ( 27.27%)	
             len=12, pos=11, align=0:         8.00	        4.67 ( 41.66%)	
             len=12, pos=11, align=0:         8.00	        5.33 ( 33.32%)	
             len=13, pos=12, align=0:         8.00	        5.33 ( 33.32%)	
             len=13, pos=12, align=0:         8.00	        5.33 ( 33.32%)	
             len=14, pos=13, align=0:         8.10	        4.67 ( 42.36%)	
             len=14, pos=13, align=0:         7.33	        4.67 ( 36.36%)	
             len=15, pos=14, align=0:         7.33	        4.67 ( 36.34%)	
             len=15, pos=14, align=0:         8.18	        5.33 ( 34.77%)	
             len=16, pos=15, align=0:         8.70	        6.67 ( 23.34%)	
             len=16, pos=15, align=0:         8.70	        6.89 ( 20.80%)	
             len=17, pos=16, align=0:         6.69	        6.00 ( 10.30%)	
             len=17, pos=16, align=0:         6.69	        6.00 ( 10.36%)	
             len=18, pos=17, align=0:         6.70	        6.00 ( 10.37%)	
             len=18, pos=17, align=0:         6.70	        6.00 ( 10.38%)	
             len=19, pos=18, align=0:         6.70	        6.00 ( 10.39%)	
             len=19, pos=18, align=0:         6.70	        6.22 (  7.07%)	
             len=20, pos=19, align=0:         6.70	        6.00 ( 10.37%)	
             len=20, pos=19, align=0:         6.70	        6.00 ( 10.41%)	
             len=21, pos=20, align=0:         6.63	        6.00 (  9.56%)	
             len=21, pos=20, align=0:         6.64	        6.00 (  9.57%)	
             len=22, pos=21, align=0:         6.70	        6.16 (  7.95%)	
             len=22, pos=21, align=0:         6.64	        6.00 (  9.58%)	
             len=23, pos=22, align=0:         6.65	        6.00 (  9.72%)	
             len=23, pos=22, align=0:         6.70	        6.00 ( 10.39%)	
             len=24, pos=23, align=0:         8.67	        6.00 ( 30.80%)	
             len=24, pos=23, align=0:         8.67	        6.00 ( 30.79%)	
             len=25, pos=24, align=0:         8.67	        6.00 ( 30.77%)	
             len=25, pos=24, align=0:         8.00	        6.00 ( 24.99%)	
             len=26, pos=25, align=0:         8.00	        6.00 ( 25.01%)	
             len=26, pos=25, align=0:         8.00	        6.00 ( 24.98%)	
             len=27, pos=26, align=0:         8.00	        6.00 ( 25.01%)	
             len=27, pos=26, align=0:         8.00	        6.00 ( 25.01%)	
             len=28, pos=27, align=0:         8.00	        6.00 ( 25.01%)	
             len=28, pos=27, align=0:         8.00	        6.00 ( 25.01%)	
             len=29, pos=28, align=0:         8.00	        6.00 ( 24.99%)	
             len=29, pos=28, align=0:         8.00	        6.00 ( 25.01%)	
             len=30, pos=29, align=0:         8.00	        6.00 ( 25.01%)	
             len=30, pos=29, align=0:         8.00	        6.00 ( 25.01%)	
             len=31, pos=30, align=0:         8.00	        6.00 ( 25.01%)	
             len=31, pos=30, align=0:         8.00	        6.00 ( 25.01%)	
             len=32, pos=31, align=0:        12.67	       10.00 ( 21.05%)	
             len=32, pos=31, align=0:        12.00	       10.00 ( 16.66%)	

[-- Attachment #3: strrchr.txt --]
[-- Type: text/plain, Size: 63218 bytes --]

Function: strrchr
Variant: 
                                    __strrchr_evex	__strrchr_evex512
========================================================================================================================
           len=2048, pos=32, align=0:         9.50	        8.16 ( 14.13%)	
           len=2048, pos=32, align=1:         9.00	        7.26 ( 19.37%)	
           len=2048, pos=64, align=0:        10.52	       10.20 (  3.00%)	
           len=2048, pos=64, align=2:        10.58	       10.02 (  5.34%)	
          len=2048, pos=128, align=0:        15.96	       11.74 ( 26.44%)	
          len=2048, pos=128, align=3:        15.70	       11.43 ( 27.23%)	
          len=2048, pos=256, align=0:        17.59	       16.30 (  7.34%)	
          len=2048, pos=256, align=4:        16.54	       14.24 ( 13.87%)	
          len=2048, pos=512, align=0:        23.84	       16.44 ( 31.03%)	
          len=2048, pos=512, align=5:        23.30	       15.14 ( 35.02%)	
         len=2048, pos=1024, align=0:        34.75	       24.35 ( 29.93%)	
         len=2048, pos=1024, align=6:        34.59	       24.42 ( 29.41%)	
         len=2048, pos=2048, align=0:        62.52	       40.61 ( 35.05%)	
         len=2048, pos=2048, align=7:        63.02	       40.42 ( 35.87%)	
         len=2048, pos=4096, align=0:        62.52	       40.51 ( 35.21%)	
         len=2048, pos=4096, align=8:        62.59	       40.44 ( 35.40%)	
            len=256, pos=64, align=1:         7.45	        7.67 ( -2.93%)	
            len=256, pos=64, align=1:         7.51	        7.73 ( -2.95%)	
           len=256, pos=64, align=15:         6.67	        6.00 ( 10.00%)	
           len=256, pos=64, align=15:         7.50	        7.90 ( -5.28%)	
            len=256, pos=64, align=2:         6.67	        6.00 (  9.99%)	
            len=256, pos=64, align=2:         6.67	        6.00 (  9.99%)	
           len=256, pos=64, align=30:         7.50	        7.88 ( -5.12%)	
           len=256, pos=64, align=30:         6.67	        6.00 ( 10.00%)	
            len=256, pos=64, align=3:         6.67	        6.00 ( 10.00%)	
            len=256, pos=64, align=3:         7.60	        7.84 ( -3.12%)	
           len=256, pos=64, align=45:         6.67	        6.00 ( 10.00%)	
           len=256, pos=64, align=45:         6.67	        6.00 ( 10.00%)	
            len=256, pos=64, align=4:         7.56	        7.66 ( -1.32%)	
            len=256, pos=64, align=4:         6.67	        6.00 (  9.97%)	
           len=256, pos=64, align=60:         7.89	        7.67 (  2.81%)	
           len=256, pos=64, align=60:         7.58	        7.73 ( -1.95%)	
            len=256, pos=64, align=5:         6.67	        6.00 ( 10.00%)	
            len=256, pos=64, align=5:         7.38	        7.65 ( -3.76%)	
           len=256, pos=64, align=75:         7.39	        7.71 ( -4.33%)	
           len=256, pos=64, align=75:         6.67	        6.00 ( 10.00%)	
            len=256, pos=64, align=6:         7.47	        7.90 ( -5.76%)	
            len=256, pos=64, align=6:         6.67	        6.00 ( 10.00%)	
           len=256, pos=64, align=90:         6.67	        6.00 ( 10.00%)	
           len=256, pos=64, align=90:         7.47	        7.72 ( -3.43%)	
            len=256, pos=64, align=7:         6.67	        6.00 ( 10.00%)	
            len=256, pos=64, align=7:         6.67	        6.00 ( 10.00%)	
          len=256, pos=64, align=105:         7.56	        7.70 ( -1.75%)	
          len=256, pos=64, align=105:         6.67	        6.00 ( 10.00%)	
               len=1, pos=0, align=0:         4.01	        4.67 (-16.61%)	
               len=1, pos=0, align=0:         4.00	        4.67 (-16.69%)	
            len=1, pos=0, align=4095:         5.34	        5.34 (  0.03%)	
               len=2, pos=1, align=0:         4.00	        4.67 (-16.69%)	
               len=2, pos=1, align=0:         4.00	        4.67 (-16.67%)	
            len=2, pos=1, align=4095:         6.67	        6.00 ( 10.00%)	
               len=3, pos=2, align=0:         4.00	        4.67 (-16.62%)	
               len=3, pos=2, align=0:         4.00	        4.67 (-16.70%)	
            len=3, pos=2, align=4094:         6.84	        6.00 ( 12.19%)	
               len=4, pos=3, align=0:         4.00	        4.68 (-16.94%)	
               len=4, pos=3, align=0:         4.00	        4.84 (-20.93%)	
            len=4, pos=3, align=4094:         6.67	        6.00 ( 10.00%)	
               len=5, pos=4, align=0:         4.00	        4.84 (-20.87%)	
               len=5, pos=4, align=0:         4.00	        4.67 (-16.63%)	
            len=5, pos=4, align=4093:         6.67	        6.00 (  9.98%)	
               len=6, pos=5, align=0:         4.00	        4.67 (-16.72%)	
               len=6, pos=5, align=0:         4.00	        4.67 (-16.66%)	
            len=6, pos=5, align=4093:         6.67	        6.00 (  9.96%)	
               len=7, pos=6, align=0:         4.00	        4.67 (-16.64%)	
               len=7, pos=6, align=0:         4.00	        4.67 (-16.66%)	
            len=7, pos=6, align=4092:         6.67	        6.00 (  9.99%)	
               len=8, pos=7, align=0:         4.00	        4.67 (-16.70%)	
               len=8, pos=7, align=0:         4.00	        4.67 (-16.63%)	
            len=8, pos=7, align=4092:         6.67	        6.00 ( 10.00%)	
               len=9, pos=8, align=0:         4.00	        4.67 (-16.64%)	
               len=9, pos=8, align=0:         4.00	        4.67 (-16.65%)	
            len=9, pos=8, align=4091:         6.67	        6.00 (  9.99%)	
              len=10, pos=9, align=0:         4.00	        4.67 (-16.71%)	
              len=10, pos=9, align=0:         4.00	        4.67 (-16.70%)	
           len=10, pos=9, align=4091:         6.84	        6.00 ( 12.19%)	
             len=11, pos=10, align=0:         4.00	        4.67 (-16.64%)	
             len=11, pos=10, align=0:         4.00	        4.67 (-16.70%)	
          len=11, pos=10, align=4090:         6.67	        6.00 (  9.99%)	
             len=12, pos=11, align=0:         4.00	        4.67 (-16.69%)	
             len=12, pos=11, align=0:         4.00	        4.67 (-16.63%)	
          len=12, pos=11, align=4090:         6.67	        6.00 (  9.97%)	
             len=13, pos=12, align=0:         4.00	        4.67 (-16.64%)	
             len=13, pos=12, align=0:         4.00	        4.67 (-16.65%)	
          len=13, pos=12, align=4089:         6.67	        6.00 (  9.99%)	
             len=14, pos=13, align=0:         4.00	        4.67 (-16.67%)	
             len=14, pos=13, align=0:         4.00	        4.67 (-16.66%)	
          len=14, pos=13, align=4089:         6.67	        6.00 ( 10.00%)	
             len=15, pos=14, align=0:         4.00	        4.67 (-16.64%)	
             len=15, pos=14, align=0:         4.00	        4.67 (-16.63%)	
          len=15, pos=14, align=4088:         6.67	        6.16 (  7.63%)	
             len=16, pos=15, align=0:         4.00	        4.67 (-16.67%)	
             len=16, pos=15, align=0:         4.00	        4.67 (-16.66%)	
          len=16, pos=15, align=4088:         6.67	        6.00 (  9.99%)	
             len=17, pos=16, align=0:         4.00	        4.68 (-16.99%)	
             len=17, pos=16, align=0:         4.00	        4.68 (-16.91%)	
          len=17, pos=16, align=4087:         7.06	        6.35 (  9.99%)	
             len=18, pos=17, align=0:         4.24	        4.96 (-17.01%)	
             len=18, pos=17, align=0:         4.00	        4.68 (-16.94%)	
          len=18, pos=17, align=4087:         6.67	        6.00 (  9.97%)	
             len=19, pos=18, align=0:         4.00	        4.68 (-16.93%)	
             len=19, pos=18, align=0:         4.00	        4.68 (-16.97%)	
          len=19, pos=18, align=4086:         6.84	        6.00 ( 12.21%)	
             len=20, pos=19, align=0:         4.00	        4.68 (-17.00%)	
             len=20, pos=19, align=0:         4.00	        4.68 (-17.00%)	
          len=20, pos=19, align=4086:         6.67	        6.00 (  9.99%)	
             len=21, pos=20, align=0:         4.00	        4.68 (-16.91%)	
             len=21, pos=20, align=0:         4.00	        4.68 (-17.02%)	
          len=21, pos=20, align=4085:         6.67	        6.00 ( 10.00%)	
             len=22, pos=21, align=0:         4.00	        4.68 (-16.95%)	
             len=22, pos=21, align=0:         4.00	        4.84 (-20.93%)	
          len=22, pos=21, align=4085:         6.67	        6.00 (  9.99%)	
             len=23, pos=22, align=0:         4.00	        4.68 (-16.95%)	
             len=23, pos=22, align=0:         4.00	        4.68 (-16.97%)	
          len=23, pos=22, align=4084:         6.67	        6.00 (  9.97%)	
             len=24, pos=23, align=0:         4.00	        4.68 (-16.97%)	
             len=24, pos=23, align=0:         4.00	        4.68 (-16.91%)	
          len=24, pos=23, align=4084:         6.67	        6.00 (  9.97%)	
             len=25, pos=24, align=0:         4.00	        4.68 (-17.02%)	
             len=25, pos=24, align=0:         4.00	        4.68 (-16.91%)	
          len=25, pos=24, align=4083:         6.67	        6.00 ( 10.00%)	
             len=26, pos=25, align=0:         4.00	        4.68 (-16.98%)	
             len=26, pos=25, align=0:         4.00	        4.68 (-16.92%)	
          len=26, pos=25, align=4083:         6.67	        6.00 (  9.99%)	
             len=27, pos=26, align=0:         4.00	        4.68 (-17.00%)	
             len=27, pos=26, align=0:         4.00	        4.68 (-16.98%)	
          len=27, pos=26, align=4082:         6.67	        6.00 (  9.99%)	
             len=28, pos=27, align=0:         4.00	        4.68 (-16.95%)	
             len=28, pos=27, align=0:         4.17	        4.68 (-12.18%)	
          len=28, pos=27, align=4082:         6.67	        6.00 ( 10.00%)	
             len=29, pos=28, align=0:         4.00	        4.68 (-16.91%)	
             len=29, pos=28, align=0:         4.00	        4.68 (-16.99%)	
          len=29, pos=28, align=4081:         6.67	        6.00 ( 10.00%)	
             len=30, pos=29, align=0:         4.00	        4.68 (-16.91%)	
             len=30, pos=29, align=0:         4.00	        4.68 (-16.98%)	
          len=30, pos=29, align=4081:         6.84	        6.00 ( 12.28%)	
             len=31, pos=30, align=0:         4.00	        4.68 (-16.94%)	
             len=31, pos=30, align=0:         4.00	        4.68 (-16.92%)	
          len=31, pos=30, align=4080:         6.67	        6.00 (  9.97%)	
             len=32, pos=31, align=0:         4.17	        4.68 (-12.23%)	
             len=32, pos=31, align=0:         4.00	        4.68 (-16.97%)	
          len=32, pos=31, align=4080:         6.67	        6.00 (  9.99%)	
           len=2048, pos=32, align=0:        87.08	       43.87 ( 49.62%)	
           len=2048, pos=32, align=1:        85.90	       43.75 ( 49.06%)	
           len=2048, pos=64, align=0:        92.56	       46.43 ( 49.84%)	
           len=2048, pos=64, align=2:        85.68	       43.81 ( 48.87%)	
          len=2048, pos=128, align=0:        61.45	       42.93 ( 30.13%)	
          len=2048, pos=128, align=3:        61.15	       43.53 ( 28.82%)	
          len=2048, pos=256, align=0:        64.65	       44.51 ( 31.16%)	
          len=2048, pos=256, align=4:        64.00	       45.53 ( 28.87%)	
          len=2048, pos=512, align=0:        64.18	       45.71 ( 28.78%)	
          len=2048, pos=512, align=5:        61.30	       42.45 ( 30.76%)	
         len=2048, pos=1024, align=0:        67.65	       47.13 ( 30.33%)	
         len=2048, pos=1024, align=6:        62.01	       45.89 ( 26.00%)	
         len=2048, pos=2048, align=0:        60.77	       43.90 ( 27.77%)	
         len=2048, pos=2048, align=7:        61.80	       43.89 ( 28.98%)	
         len=2048, pos=4096, align=0:        62.35	       45.05 ( 27.75%)	
         len=2048, pos=4096, align=8:        61.32	       43.86 ( 28.47%)	
            len=256, pos=64, align=1:        13.92	       14.40 ( -3.49%)	
            len=256, pos=64, align=1:        14.34	       15.56 ( -8.53%)	
           len=256, pos=64, align=15:        13.88	       14.48 ( -4.30%)	
           len=256, pos=64, align=15:        14.58	       14.48 (  0.68%)	
            len=256, pos=64, align=2:        14.02	       14.62 ( -4.27%)	
            len=256, pos=64, align=2:        13.95	       14.53 ( -4.11%)	
           len=256, pos=64, align=30:        13.99	       14.53 ( -3.82%)	
           len=256, pos=64, align=30:        13.92	       14.35 ( -3.09%)	
            len=256, pos=64, align=3:        14.18	       13.96 (  1.52%)	
            len=256, pos=64, align=3:        13.93	       14.56 ( -4.48%)	
           len=256, pos=64, align=45:        13.90	       14.62 ( -5.19%)	
           len=256, pos=64, align=45:        14.00	       14.70 ( -5.02%)	
            len=256, pos=64, align=4:        13.88	       14.50 ( -4.50%)	
            len=256, pos=64, align=4:        14.00	       15.59 (-11.30%)	
           len=256, pos=64, align=60:        13.94	       14.53 ( -4.21%)	
           len=256, pos=64, align=60:        14.06	       14.46 ( -2.83%)	
            len=256, pos=64, align=5:        13.97	       14.02 ( -0.38%)	
            len=256, pos=64, align=5:        14.01	       14.52 ( -3.64%)	
           len=256, pos=64, align=75:        13.98	       14.26 ( -2.00%)	
           len=256, pos=64, align=75:        14.54	       13.73 (  5.55%)	
            len=256, pos=64, align=6:        14.13	       14.38 ( -1.76%)	
            len=256, pos=64, align=6:        13.43	       14.11 ( -5.05%)	
           len=256, pos=64, align=90:        14.03	       14.44 ( -2.92%)	
           len=256, pos=64, align=90:        13.94	       14.63 ( -4.97%)	
            len=256, pos=64, align=7:        13.88	       13.98 ( -0.73%)	
            len=256, pos=64, align=7:        14.09	       14.26 ( -1.20%)	
          len=256, pos=64, align=105:        13.88	       13.44 (  3.17%)	
          len=256, pos=64, align=105:        14.02	       13.40 (  4.43%)	
               len=1, pos=0, align=0:         4.00	        4.67 (-16.67%)	
               len=1, pos=0, align=0:         4.00	        4.67 (-16.62%)	
            len=1, pos=0, align=4095:         6.67	        6.17 (  7.56%)	
               len=2, pos=1, align=0:         4.00	        4.67 (-16.64%)	
               len=2, pos=1, align=0:         4.00	        4.67 (-16.67%)	
            len=2, pos=1, align=4095:         6.67	        6.00 (  9.99%)	
               len=3, pos=2, align=0:         4.01	        4.68 (-16.65%)	
               len=3, pos=2, align=0:         4.01	        4.68 (-16.65%)	
            len=3, pos=2, align=4094:         6.67	        6.00 (  9.97%)	
               len=4, pos=3, align=0:         4.01	        4.68 (-16.58%)	
               len=4, pos=3, align=0:         4.01	        4.68 (-16.64%)	
            len=4, pos=3, align=4094:         6.85	        6.00 ( 12.44%)	
               len=5, pos=4, align=0:         4.01	        4.68 (-16.66%)	
               len=5, pos=4, align=0:         4.01	        4.84 (-20.72%)	
            len=5, pos=4, align=4093:         6.67	        6.00 ( 10.00%)	
               len=6, pos=5, align=0:         4.17	        4.68 (-12.20%)	
               len=6, pos=5, align=0:         4.00	        4.68 (-16.94%)	
            len=6, pos=5, align=4093:         6.67	        6.24 (  6.35%)	
               len=7, pos=6, align=0:         4.01	        4.68 (-16.63%)	
               len=7, pos=6, align=0:         4.01	        4.68 (-16.60%)	
            len=7, pos=6, align=4092:         6.67	        6.00 (  9.99%)	
               len=8, pos=7, align=0:         4.01	        4.83 (-20.48%)	
               len=8, pos=7, align=0:         4.01	        4.67 (-16.30%)	
            len=8, pos=7, align=4092:         6.67	        6.00 (  9.99%)	
               len=9, pos=8, align=0:         4.01	        4.68 (-16.66%)	
               len=9, pos=8, align=0:         4.01	        4.68 (-16.60%)	
            len=9, pos=8, align=4091:         6.67	        6.00 (  9.99%)	
              len=10, pos=9, align=0:         4.01	        4.84 (-20.62%)	
              len=10, pos=9, align=0:         4.01	        4.68 (-16.61%)	
           len=10, pos=9, align=4091:         6.67	        6.18 (  7.33%)	
             len=11, pos=10, align=0:         4.00	        4.68 (-16.91%)	
             len=11, pos=10, align=0:         4.00	        4.84 (-20.90%)	
          len=11, pos=10, align=4090:         6.67	        6.00 (  9.99%)	
             len=12, pos=11, align=0:         4.17	        4.68 (-12.27%)	
             len=12, pos=11, align=0:         4.01	        4.68 (-16.57%)	
          len=12, pos=11, align=4090:         6.83	        6.00 ( 12.08%)	
             len=13, pos=12, align=0:         4.01	        4.68 (-16.64%)	
             len=13, pos=12, align=0:         4.01	        4.68 (-16.63%)	
          len=13, pos=12, align=4089:         6.67	        6.00 (  9.97%)	
             len=14, pos=13, align=0:         4.01	        4.68 (-16.61%)	
             len=14, pos=13, align=0:         4.01	        4.68 (-16.63%)	
          len=14, pos=13, align=4089:         6.67	        6.00 (  9.99%)	
             len=15, pos=14, align=0:         4.01	        4.68 (-16.66%)	
             len=15, pos=14, align=0:         4.01	        4.68 (-16.63%)	
          len=15, pos=14, align=4088:         6.67	        6.17 (  7.48%)	
             len=16, pos=15, align=0:         4.01	        4.68 (-16.62%)	
             len=16, pos=15, align=0:         4.01	        4.68 (-16.63%)	
          len=16, pos=15, align=4088:         6.67	        6.00 (  9.99%)	
             len=17, pos=16, align=0:         4.01	        4.68 (-16.56%)	
             len=17, pos=16, align=0:         4.01	        4.68 (-16.56%)	
          len=17, pos=16, align=4087:         6.67	        6.00 (  9.99%)	
             len=18, pos=17, align=0:         4.01	        4.68 (-16.56%)	
             len=18, pos=17, align=0:         4.01	        4.68 (-16.56%)	
          len=18, pos=17, align=4087:         6.67	        6.00 (  9.99%)	
             len=19, pos=18, align=0:         4.01	        4.68 (-16.57%)	
             len=19, pos=18, align=0:         4.01	        4.68 (-16.58%)	
          len=19, pos=18, align=4086:         6.83	        6.00 ( 12.11%)	
             len=20, pos=19, align=0:         4.01	        4.68 (-16.61%)	
             len=20, pos=19, align=0:         4.01	        4.68 (-16.66%)	
          len=20, pos=19, align=4086:         6.67	        6.14 (  7.93%)	
             len=21, pos=20, align=0:         4.01	        4.68 (-16.59%)	
             len=21, pos=20, align=0:         4.01	        4.68 (-16.60%)	
          len=21, pos=20, align=4085:         6.67	        6.00 ( 10.00%)	
             len=22, pos=21, align=0:         4.01	        4.68 (-16.56%)	
             len=22, pos=21, align=0:         4.01	        4.68 (-16.61%)	
          len=22, pos=21, align=4085:         6.67	        6.00 ( 10.00%)	
             len=23, pos=22, align=0:         4.01	        4.68 (-16.58%)	
             len=23, pos=22, align=0:         4.01	        4.68 (-16.53%)	
          len=23, pos=22, align=4084:         6.67	        6.00 ( 10.00%)	
             len=24, pos=23, align=0:         4.01	        4.68 (-16.58%)	
             len=24, pos=23, align=0:         4.16	        4.68 (-12.36%)	
          len=24, pos=23, align=4084:         6.67	        6.00 (  9.98%)	
             len=25, pos=24, align=0:         4.01	        4.68 (-16.60%)	
             len=25, pos=24, align=0:         4.02	        4.68 (-16.47%)	
          len=25, pos=24, align=4083:         6.67	        6.00 (  9.99%)	
             len=26, pos=25, align=0:         4.01	        4.68 (-16.58%)	
             len=26, pos=25, align=0:         4.01	        4.68 (-16.57%)	
          len=26, pos=25, align=4083:         6.67	        6.00 (  9.99%)	
             len=27, pos=26, align=0:         4.01	        4.68 (-16.57%)	
             len=27, pos=26, align=0:         4.01	        4.68 (-16.57%)	
          len=27, pos=26, align=4082:         6.67	        6.00 ( 10.00%)	
             len=28, pos=27, align=0:         4.01	        4.68 (-16.55%)	
             len=28, pos=27, align=0:         4.01	        4.68 (-16.59%)	
          len=28, pos=27, align=4082:         6.83	        6.00 ( 12.11%)	
             len=29, pos=28, align=0:         4.01	        4.68 (-16.61%)	
             len=29, pos=28, align=0:         4.01	        4.68 (-16.56%)	
          len=29, pos=28, align=4081:         6.67	        6.00 (  9.99%)	
             len=30, pos=29, align=0:         4.01	        4.83 (-20.49%)	
             len=30, pos=29, align=0:         4.01	        4.68 (-16.57%)	
          len=30, pos=29, align=4081:         6.67	        6.00 ( 10.00%)	
             len=31, pos=30, align=0:         4.01	        4.68 (-16.54%)	
             len=31, pos=30, align=0:         4.01	        4.68 (-16.58%)	
          len=31, pos=30, align=4080:         6.67	        6.00 (  9.99%)	
             len=32, pos=31, align=0:         6.67	        4.68 ( 29.91%)	
             len=32, pos=31, align=0:         6.00	        4.67 ( 22.24%)	
          len=32, pos=31, align=4080:         9.16	        6.00 ( 34.51%)	
           len=2048, pos=32, align=0:        88.33	       44.81 ( 49.28%)	
           len=2048, pos=32, align=1:        86.53	       43.80 ( 49.39%)	
           len=2048, pos=64, align=0:        88.52	       43.85 ( 50.46%)	
           len=2048, pos=64, align=2:        86.64	       43.87 ( 49.36%)	
          len=2048, pos=128, align=0:        61.30	       44.25 ( 27.82%)	
          len=2048, pos=128, align=3:        61.02	       44.11 ( 27.72%)	
          len=2048, pos=256, align=0:        63.58	       44.55 ( 29.93%)	
          len=2048, pos=256, align=4:        63.56	       44.44 ( 30.09%)	
          len=2048, pos=512, align=0:        63.08	       45.16 ( 28.40%)	
          len=2048, pos=512, align=5:        62.77	       44.46 ( 29.17%)	
         len=2048, pos=1024, align=0:        63.36	       45.16 ( 28.74%)	
         len=2048, pos=1024, align=6:        65.55	       46.72 ( 28.73%)	
         len=2048, pos=2048, align=0:        62.27	       42.25 ( 32.16%)	
         len=2048, pos=2048, align=7:        62.75	       42.16 ( 32.82%)	
         len=2048, pos=4096, align=0:        66.74	       46.42 ( 30.45%)	
         len=2048, pos=4096, align=8:        61.43	       43.88 ( 28.57%)	
            len=256, pos=64, align=1:        14.69	       13.93 (  5.13%)	
            len=256, pos=64, align=1:        14.86	       15.09 ( -1.50%)	
           len=256, pos=64, align=15:        14.00	       14.05 ( -0.35%)	
           len=256, pos=64, align=15:        14.05	       14.47 ( -2.97%)	
            len=256, pos=64, align=2:        14.46	       14.47 ( -0.13%)	
            len=256, pos=64, align=2:        13.95	       14.51 ( -3.99%)	
           len=256, pos=64, align=30:        14.00	       14.52 ( -3.71%)	
           len=256, pos=64, align=30:        13.95	       14.89 ( -6.72%)	
            len=256, pos=64, align=3:        14.11	       14.52 ( -2.85%)	
            len=256, pos=64, align=3:        13.42	       13.94 ( -3.92%)	
           len=256, pos=64, align=45:        13.95	       14.49 ( -3.84%)	
           len=256, pos=64, align=45:        14.52	       14.41 (  0.71%)	
            len=256, pos=64, align=4:        14.60	       14.35 (  1.66%)	
            len=256, pos=64, align=4:        14.71	       14.34 (  2.52%)	
           len=256, pos=64, align=60:        14.03	       14.31 ( -1.99%)	
           len=256, pos=64, align=60:        14.59	       14.01 (  3.98%)	
            len=256, pos=64, align=5:        15.26	       14.55 (  4.62%)	
            len=256, pos=64, align=5:        13.98	       14.04 ( -0.45%)	
           len=256, pos=64, align=75:        14.65	       13.46 (  8.13%)	
           len=256, pos=64, align=75:        13.89	       13.45 (  3.15%)	
            len=256, pos=64, align=6:        14.00	       14.70 ( -5.03%)	
            len=256, pos=64, align=6:        13.96	       14.59 ( -4.57%)	
           len=256, pos=64, align=90:        14.00	       13.38 (  4.40%)	
           len=256, pos=64, align=90:        14.61	       15.13 ( -3.58%)	
            len=256, pos=64, align=7:        14.02	       13.98 (  0.26%)	
            len=256, pos=64, align=7:        13.90	       14.01 ( -0.77%)	
          len=256, pos=64, align=105:        13.89	       15.27 ( -9.90%)	
          len=256, pos=64, align=105:        13.98	       15.07 ( -7.82%)	
               len=1, pos=0, align=0:         4.00	        4.67 (-16.66%)	
               len=1, pos=0, align=0:         4.00	        4.83 (-20.66%)	
            len=1, pos=0, align=4095:         6.67	        6.00 ( 10.02%)	
               len=2, pos=1, align=0:         4.00	        4.67 (-16.65%)	
               len=2, pos=1, align=0:         4.00	        4.67 (-16.67%)	
            len=2, pos=1, align=4095:         6.67	        6.00 (  9.99%)	
               len=3, pos=2, align=0:         4.00	        4.68 (-16.87%)	
               len=3, pos=2, align=0:         4.00	        4.68 (-16.95%)	
            len=3, pos=2, align=4094:         6.67	        6.18 (  7.33%)	
               len=4, pos=3, align=0:         4.00	        4.68 (-16.89%)	
               len=4, pos=3, align=0:         4.00	        4.84 (-21.02%)	
            len=4, pos=3, align=4094:         6.67	        6.00 (  9.99%)	
               len=5, pos=4, align=0:         4.00	        4.68 (-16.88%)	
               len=5, pos=4, align=0:         4.00	        4.68 (-16.90%)	
            len=5, pos=4, align=4093:         6.67	        6.00 (  9.99%)	
               len=6, pos=5, align=0:         4.00	        4.68 (-16.83%)	
               len=6, pos=5, align=0:         4.00	        4.68 (-16.91%)	
            len=6, pos=5, align=4093:         6.67	        6.00 ( 10.00%)	
               len=7, pos=6, align=0:         4.00	        4.68 (-16.91%)	
               len=7, pos=6, align=0:         4.00	        4.68 (-16.87%)	
            len=7, pos=6, align=4092:         6.67	        6.00 ( 10.00%)	
               len=8, pos=7, align=0:         4.00	        4.68 (-16.87%)	
               len=8, pos=7, align=0:         4.00	        4.68 (-16.87%)	
            len=8, pos=7, align=4092:         6.67	        6.00 (  9.99%)	
               len=9, pos=8, align=0:         4.00	        4.67 (-16.64%)	
               len=9, pos=8, align=0:         4.00	        4.67 (-16.70%)	
            len=9, pos=8, align=4091:         6.67	        6.00 (  9.99%)	
              len=10, pos=9, align=0:         4.00	        4.68 (-16.98%)	
              len=10, pos=9, align=0:         4.00	        4.68 (-16.88%)	
           len=10, pos=9, align=4091:         6.67	        6.00 (  9.99%)	
             len=11, pos=10, align=0:         4.00	        4.68 (-16.88%)	
             len=11, pos=10, align=0:         4.00	        4.68 (-16.95%)	
          len=11, pos=10, align=4090:         6.67	        6.00 (  9.99%)	
             len=12, pos=11, align=0:         4.00	        4.68 (-16.94%)	
             len=12, pos=11, align=0:         4.00	        4.68 (-16.91%)	
          len=12, pos=11, align=4090:         6.67	        6.00 (  9.99%)	
             len=13, pos=12, align=0:         4.00	        4.84 (-21.00%)	
             len=13, pos=12, align=0:         4.00	        4.67 (-16.66%)	
          len=13, pos=12, align=4089:         6.83	        6.00 ( 12.19%)	
             len=14, pos=13, align=0:         4.01	        4.68 (-16.56%)	
             len=14, pos=13, align=0:         4.01	        4.68 (-16.63%)	
          len=14, pos=13, align=4089:         6.67	        6.00 ( 10.00%)	
             len=15, pos=14, align=0:         4.01	        4.68 (-16.54%)	
             len=15, pos=14, align=0:         4.01	        4.68 (-16.57%)	
          len=15, pos=14, align=4088:         6.67	        6.00 (  9.99%)	
             len=16, pos=15, align=0:         4.01	        4.83 (-20.49%)	
             len=16, pos=15, align=0:         4.01	        4.67 (-16.34%)	
          len=16, pos=15, align=4088:         6.82	        6.00 ( 12.06%)	
             len=17, pos=16, align=0:         4.01	        4.68 (-16.53%)	
             len=17, pos=16, align=0:         4.01	        4.68 (-16.57%)	
          len=17, pos=16, align=4087:         7.06	        6.36 (  9.97%)	
             len=18, pos=17, align=0:         4.95	        4.68 (  5.47%)	
             len=18, pos=17, align=0:         4.00	        4.68 (-16.90%)	
          len=18, pos=17, align=4087:         6.67	        6.00 (  9.99%)	
             len=19, pos=18, align=0:         4.00	        4.68 (-16.88%)	
             len=19, pos=18, align=0:         4.00	        4.84 (-20.92%)	
          len=19, pos=18, align=4086:         6.67	        6.00 (  9.97%)	
             len=20, pos=19, align=0:         4.00	        4.68 (-16.85%)	
             len=20, pos=19, align=0:         4.00	        4.68 (-16.93%)	
          len=20, pos=19, align=4086:         6.67	        6.00 (  9.99%)	
             len=21, pos=20, align=0:         4.00	        4.68 (-16.88%)	
             len=21, pos=20, align=0:         4.00	        4.68 (-16.95%)	
          len=21, pos=20, align=4085:         6.67	        6.00 (  9.99%)	
             len=22, pos=21, align=0:         4.00	        4.87 (-21.67%)	
             len=22, pos=21, align=0:         4.00	        4.68 (-16.96%)	
          len=22, pos=21, align=4085:         6.67	        6.00 (  9.99%)	
             len=23, pos=22, align=0:         4.00	        4.68 (-16.88%)	
             len=23, pos=22, align=0:         4.00	        4.68 (-16.88%)	
          len=23, pos=22, align=4084:         6.67	        6.00 (  9.99%)	
             len=24, pos=23, align=0:         4.00	        4.68 (-16.85%)	
             len=24, pos=23, align=0:         4.00	        4.68 (-16.88%)	
          len=24, pos=23, align=4084:         6.67	        6.00 (  9.99%)	
             len=25, pos=24, align=0:         4.00	        4.68 (-16.88%)	
             len=25, pos=24, align=0:         4.00	        4.68 (-16.88%)	
          len=25, pos=24, align=4083:         6.67	        6.00 (  9.99%)	
             len=26, pos=25, align=0:         4.00	        4.83 (-20.79%)	
             len=26, pos=25, align=0:         4.00	        4.67 (-16.67%)	
          len=26, pos=25, align=4083:         6.67	        6.00 (  9.96%)	
             len=27, pos=26, align=0:         4.16	        4.68 (-12.35%)	
             len=27, pos=26, align=0:         4.00	        4.68 (-16.91%)	
          len=27, pos=26, align=4082:         6.67	        6.00 (  9.99%)	
             len=28, pos=27, align=0:         4.00	        4.68 (-16.90%)	
             len=28, pos=27, align=0:         4.00	        4.68 (-16.91%)	
          len=28, pos=27, align=4082:         6.67	        6.00 ( 10.00%)	
             len=29, pos=28, align=0:         4.00	        4.68 (-16.88%)	
             len=29, pos=28, align=0:         4.00	        4.84 (-20.87%)	
          len=29, pos=28, align=4081:         6.67	        6.00 (  9.99%)	
             len=30, pos=29, align=0:         4.00	        4.68 (-16.87%)	
             len=30, pos=29, align=0:         4.00	        4.68 (-16.91%)	
          len=30, pos=29, align=4081:         6.67	        6.00 (  9.99%)	
             len=31, pos=30, align=0:         4.00	        4.68 (-16.91%)	
             len=31, pos=30, align=0:         4.00	        4.68 (-16.91%)	
          len=31, pos=30, align=4080:         6.67	        6.00 (  9.97%)	
             len=32, pos=31, align=0:         6.01	        4.84 ( 19.50%)	
             len=32, pos=31, align=0:         6.00	        4.68 ( 22.05%)	
          len=32, pos=31, align=4080:         9.00	        6.00 ( 33.34%)	
           len=2048, pos=32, align=0:        85.36	       44.00 ( 48.46%)	
           len=2048, pos=32, align=1:        86.66	       43.76 ( 49.50%)	
           len=2048, pos=64, align=0:        90.74	       43.79 ( 51.74%)	
           len=2048, pos=64, align=2:        86.03	       43.95 ( 48.91%)	
          len=2048, pos=128, align=0:        62.48	       43.37 ( 30.59%)	
          len=2048, pos=128, align=3:        61.36	       44.03 ( 28.24%)	
          len=2048, pos=256, align=0:        61.49	       44.26 ( 28.02%)	
          len=2048, pos=256, align=4:        61.74	       44.60 ( 27.77%)	
          len=2048, pos=512, align=0:        65.32	       46.82 ( 28.33%)	
          len=2048, pos=512, align=5:        66.22	       47.36 ( 28.49%)	
         len=2048, pos=1024, align=0:        65.56	       48.78 ( 25.60%)	
         len=2048, pos=1024, align=6:        66.16	       49.63 ( 24.99%)	
         len=2048, pos=2048, align=0:        64.26	       44.68 ( 30.46%)	
         len=2048, pos=2048, align=7:        64.79	       44.85 ( 30.78%)	
         len=2048, pos=4096, align=0:        62.72	       42.25 ( 32.63%)	
         len=2048, pos=4096, align=8:        61.97	       43.09 ( 30.47%)	
            len=256, pos=64, align=1:        13.90	       14.57 ( -4.84%)	
            len=256, pos=64, align=1:        13.90	       14.44 ( -3.93%)	
           len=256, pos=64, align=15:        14.44	       14.49 ( -0.39%)	
           len=256, pos=64, align=15:        13.96	       14.47 ( -3.66%)	
            len=256, pos=64, align=2:        14.04	       14.29 ( -1.75%)	
            len=256, pos=64, align=2:        14.57	       14.41 (  1.11%)	
           len=256, pos=64, align=30:        13.92	       14.50 ( -4.19%)	
           len=256, pos=64, align=30:        14.14	       14.50 ( -2.57%)	
            len=256, pos=64, align=3:        13.96	       14.31 ( -2.56%)	
            len=256, pos=64, align=3:        13.89	       15.16 ( -9.15%)	
           len=256, pos=64, align=45:        14.00	       14.34 ( -2.41%)	
           len=256, pos=64, align=45:        13.94	       14.53 ( -4.22%)	
            len=256, pos=64, align=4:        14.80	       14.54 (  1.78%)	
            len=256, pos=64, align=4:        13.96	       14.29 ( -2.37%)	
           len=256, pos=64, align=60:        13.92	       14.54 ( -4.46%)	
           len=256, pos=64, align=60:        13.89	       14.20 ( -2.21%)	
            len=256, pos=64, align=5:        13.98	       14.04 ( -0.38%)	
            len=256, pos=64, align=5:        14.46	       14.00 (  3.23%)	
           len=256, pos=64, align=75:        13.97	       13.45 (  3.74%)	
           len=256, pos=64, align=75:        14.94	       13.49 (  9.76%)	
            len=256, pos=64, align=6:        14.01	       14.42 ( -2.87%)	
            len=256, pos=64, align=6:        14.01	       13.99 (  0.15%)	
           len=256, pos=64, align=90:        14.18	       14.42 ( -1.68%)	
           len=256, pos=64, align=90:        13.89	       13.43 (  3.28%)	
            len=256, pos=64, align=7:        13.99	       13.97 (  0.14%)	
            len=256, pos=64, align=7:        14.01	       15.04 ( -7.29%)	
          len=256, pos=64, align=105:        14.03	       13.46 (  4.04%)	
          len=256, pos=64, align=105:        14.49	       13.45 (  7.17%)	
               len=1, pos=0, align=0:         4.00	        4.68 (-17.04%)	
               len=1, pos=0, align=0:         4.00	        4.69 (-17.08%)	
            len=1, pos=0, align=4095:         6.67	        6.00 (  9.96%)	
               len=2, pos=1, align=0:         4.00	        4.68 (-16.91%)	
               len=2, pos=1, align=0:         4.00	        4.68 (-16.90%)	
            len=2, pos=1, align=4095:         6.67	        6.00 (  9.99%)	
               len=3, pos=2, align=0:         4.00	        4.68 (-16.89%)	
               len=3, pos=2, align=0:         4.16	        4.68 (-12.41%)	
            len=3, pos=2, align=4094:         6.67	        6.00 ( 10.01%)	
               len=4, pos=3, align=0:         4.16	        4.68 (-12.47%)	
               len=4, pos=3, align=0:         4.00	        4.68 (-16.93%)	
            len=4, pos=3, align=4094:         6.67	        6.16 (  7.61%)	
               len=5, pos=4, align=0:         4.00	        4.68 (-16.91%)	
               len=5, pos=4, align=0:         4.00	        4.68 (-16.92%)	
            len=5, pos=4, align=4093:         6.67	        6.00 ( 10.00%)	
               len=6, pos=5, align=0:         4.00	        4.68 (-16.95%)	
               len=6, pos=5, align=0:         4.00	        4.68 (-16.92%)	
            len=6, pos=5, align=4093:         6.67	        6.00 (  9.99%)	
               len=7, pos=6, align=0:         4.00	        4.68 (-16.90%)	
               len=7, pos=6, align=0:         4.00	        4.68 (-16.92%)	
            len=7, pos=6, align=4092:         6.67	        6.00 ( 10.00%)	
               len=8, pos=7, align=0:         4.00	        4.68 (-16.88%)	
               len=8, pos=7, align=0:         4.00	        4.68 (-16.90%)	
            len=8, pos=7, align=4092:         6.67	        6.00 (  9.99%)	
               len=9, pos=8, align=0:         4.00	        4.68 (-16.92%)	
               len=9, pos=8, align=0:         4.00	        4.68 (-16.90%)	
            len=9, pos=8, align=4091:         6.67	        6.00 (  9.99%)	
              len=10, pos=9, align=0:         4.00	        4.68 (-16.94%)	
              len=10, pos=9, align=0:         4.00	        4.68 (-16.88%)	
           len=10, pos=9, align=4091:         6.67	        6.00 (  9.99%)	
             len=11, pos=10, align=0:         4.00	        4.68 (-16.90%)	
             len=11, pos=10, align=0:         4.00	        4.68 (-16.91%)	
          len=11, pos=10, align=4090:         6.67	        6.00 (  9.99%)	
             len=12, pos=11, align=0:         4.00	        4.68 (-16.93%)	
             len=12, pos=11, align=0:         4.00	        4.68 (-16.91%)	
          len=12, pos=11, align=4090:         6.67	        6.16 (  7.60%)	
             len=13, pos=12, align=0:         4.00	        4.68 (-16.89%)	
             len=13, pos=12, align=0:         4.16	        4.68 (-12.44%)	
          len=13, pos=12, align=4089:         6.67	        6.00 ( 10.00%)	
             len=14, pos=13, align=0:         4.00	        4.68 (-16.91%)	
             len=14, pos=13, align=0:         4.00	        4.68 (-16.89%)	
          len=14, pos=13, align=4089:         6.67	        6.00 (  9.99%)	
             len=15, pos=14, align=0:         4.00	        4.68 (-16.91%)	
             len=15, pos=14, align=0:         4.00	        4.68 (-16.88%)	
          len=15, pos=14, align=4088:         6.67	        6.00 (  9.97%)	
             len=16, pos=15, align=0:         4.00	        4.68 (-16.91%)	
             len=16, pos=15, align=0:         4.16	        4.68 (-12.45%)	
          len=16, pos=15, align=4088:         6.67	        6.00 (  9.99%)	
             len=17, pos=16, align=0:         4.00	        4.68 (-16.92%)	
             len=17, pos=16, align=0:         4.00	        4.68 (-16.91%)	
          len=17, pos=16, align=4087:         6.67	        6.00 (  9.99%)	
             len=18, pos=17, align=0:         4.00	        4.84 (-20.88%)	
             len=18, pos=17, align=0:         4.00	        4.68 (-16.87%)	
          len=18, pos=17, align=4087:         6.67	        6.00 (  9.99%)	
             len=19, pos=18, align=0:         4.00	        4.68 (-16.91%)	
             len=19, pos=18, align=0:         4.00	        4.68 (-16.90%)	
          len=19, pos=18, align=4086:         6.67	        6.16 (  7.58%)	
             len=20, pos=19, align=0:         4.00	        4.68 (-16.88%)	
             len=20, pos=19, align=0:         4.00	        4.68 (-16.90%)	
          len=20, pos=19, align=4086:         6.67	        6.00 (  9.99%)	
             len=21, pos=20, align=0:         4.00	        4.68 (-16.89%)	
             len=21, pos=20, align=0:         4.00	        4.68 (-16.90%)	
          len=21, pos=20, align=4085:         6.67	        6.00 (  9.99%)	
             len=22, pos=21, align=0:         4.00	        4.68 (-16.97%)	
             len=22, pos=21, align=0:         4.16	        4.68 (-12.44%)	
          len=22, pos=21, align=4085:         6.67	        6.00 (  9.99%)	
             len=23, pos=22, align=0:         4.00	        4.68 (-16.90%)	
             len=23, pos=22, align=0:         4.00	        4.83 (-20.79%)	
          len=23, pos=22, align=4084:         6.67	        6.00 ( 10.00%)	
             len=24, pos=23, align=0:         4.00	        4.68 (-16.91%)	
             len=24, pos=23, align=0:         4.00	        4.83 (-20.81%)	
          len=24, pos=23, align=4084:         6.67	        6.00 ( 10.00%)	
             len=25, pos=24, align=0:         4.00	        4.68 (-16.93%)	
             len=25, pos=24, align=0:         4.00	        4.83 (-20.84%)	
          len=25, pos=24, align=4083:         6.67	        6.00 (  9.99%)	
             len=26, pos=25, align=0:         4.00	        4.68 (-16.91%)	
             len=26, pos=25, align=0:         4.16	        4.68 (-12.47%)	
          len=26, pos=25, align=4083:         6.67	        6.00 ( 10.00%)	
             len=27, pos=26, align=0:         4.00	        4.68 (-16.90%)	
             len=27, pos=26, align=0:         4.00	        4.68 (-16.90%)	
          len=27, pos=26, align=4082:         6.67	        6.00 (  9.99%)	
             len=28, pos=27, align=0:         4.00	        4.68 (-16.91%)	
             len=28, pos=27, align=0:         4.00	        4.68 (-16.90%)	
          len=28, pos=27, align=4082:         6.67	        6.17 (  7.53%)	
             len=29, pos=28, align=0:         4.00	        4.68 (-16.89%)	
             len=29, pos=28, align=0:         4.00	        4.68 (-16.91%)	
          len=29, pos=28, align=4081:         6.67	        6.00 (  9.99%)	
             len=30, pos=29, align=0:         4.00	        4.67 (-16.67%)	
             len=30, pos=29, align=0:         4.00	        4.67 (-16.66%)	
          len=30, pos=29, align=4081:         6.67	        6.00 (  9.99%)	
             len=31, pos=30, align=0:         4.00	        4.68 (-16.90%)	
             len=31, pos=30, align=0:         4.00	        4.68 (-16.91%)	
          len=31, pos=30, align=4080:         6.67	        6.00 (  9.99%)	
             len=32, pos=31, align=0:         6.00	        4.68 ( 22.07%)	
             len=32, pos=31, align=0:         6.00	        4.68 ( 22.09%)	
          len=32, pos=31, align=4080:         9.00	        6.16 ( 31.53%)	
           len=2048, pos=32, align=0:        64.76	       43.76 ( 32.43%)	
           len=2048, pos=32, align=1:        59.89	       43.78 ( 26.90%)	
           len=2048, pos=64, align=0:        63.03	       46.42 ( 26.36%)	
           len=2048, pos=64, align=2:        60.22	       43.99 ( 26.95%)	
          len=2048, pos=128, align=0:        60.54	       44.18 ( 27.03%)	
          len=2048, pos=128, align=3:        61.40	       43.48 ( 29.19%)	
          len=2048, pos=256, align=0:        62.04	       44.21 ( 28.73%)	
          len=2048, pos=256, align=4:        61.04	       46.58 ( 23.68%)	
          len=2048, pos=512, align=0:        67.69	       45.74 ( 32.43%)	
          len=2048, pos=512, align=5:        68.13	       47.23 ( 30.68%)	
         len=2048, pos=1024, align=0:        72.87	       52.68 ( 27.71%)	
         len=2048, pos=1024, align=6:        73.18	       54.81 ( 25.10%)	
         len=2048, pos=2048, align=0:        67.89	       55.91 ( 17.64%)	
         len=2048, pos=2048, align=7:        68.21	       55.83 ( 18.15%)	
         len=2048, pos=4096, align=0:        63.24	       45.10 ( 28.68%)	
         len=2048, pos=4096, align=8:        61.78	       45.91 ( 25.68%)	
            len=256, pos=64, align=1:        13.99	       14.56 ( -4.06%)	
            len=256, pos=64, align=1:        13.94	       14.57 ( -4.49%)	
           len=256, pos=64, align=15:        14.00	       14.51 ( -3.65%)	
           len=256, pos=64, align=15:        15.77	       14.49 (  8.14%)	
            len=256, pos=64, align=2:        14.03	       15.27 ( -8.86%)	
            len=256, pos=64, align=2:        13.89	       14.49 ( -4.30%)	
           len=256, pos=64, align=30:        14.14	       14.04 (  0.71%)	
           len=256, pos=64, align=30:        13.46	       13.98 ( -3.91%)	
            len=256, pos=64, align=3:        13.90	       14.50 ( -4.33%)	
            len=256, pos=64, align=3:        14.16	       14.48 ( -2.27%)	
           len=256, pos=64, align=45:        13.88	       14.61 ( -5.24%)	
           len=256, pos=64, align=45:        14.00	       14.56 ( -4.01%)	
            len=256, pos=64, align=4:        13.94	       14.70 ( -5.46%)	
            len=256, pos=64, align=4:        13.91	       14.44 ( -3.82%)	
           len=256, pos=64, align=60:        14.01	       14.90 ( -6.39%)	
           len=256, pos=64, align=60:        14.01	       14.55 ( -3.90%)	
            len=256, pos=64, align=5:        15.16	       14.59 (  3.79%)	
            len=256, pos=64, align=5:        13.93	       14.53 ( -4.28%)	
           len=256, pos=64, align=75:        14.02	       13.43 (  4.21%)	
           len=256, pos=64, align=75:        14.00	       13.39 (  4.36%)	
            len=256, pos=64, align=6:        13.97	       14.51 ( -3.85%)	
            len=256, pos=64, align=6:        14.17	       14.54 ( -2.61%)	
           len=256, pos=64, align=90:        13.89	       13.50 (  2.80%)	
           len=256, pos=64, align=90:        14.17	       14.41 ( -1.74%)	
            len=256, pos=64, align=7:        13.90	       14.04 ( -1.05%)	
            len=256, pos=64, align=7:        14.00	       14.04 ( -0.27%)	
          len=256, pos=64, align=105:        13.98	       13.57 (  2.89%)	
          len=256, pos=64, align=105:        13.88	       15.42 (-11.09%)	
               len=1, pos=0, align=0:         4.00	        4.67 (-16.68%)	
               len=1, pos=0, align=0:         4.00	        4.67 (-16.65%)	
            len=1, pos=0, align=4095:         6.67	        6.00 ( 10.03%)	
               len=2, pos=1, align=0:         4.00	        4.67 (-16.64%)	
               len=2, pos=1, align=0:         4.00	        4.67 (-16.65%)	
            len=2, pos=1, align=4095:         6.67	        6.00 (  9.97%)	
               len=3, pos=2, align=0:         4.00	        4.88 (-21.90%)	
               len=3, pos=2, align=0:         4.00	        4.67 (-16.63%)	
            len=3, pos=2, align=4094:         6.67	        6.00 (  9.97%)	
               len=4, pos=3, align=0:         4.00	        4.67 (-16.66%)	
               len=4, pos=3, align=0:         4.00	        4.67 (-16.67%)	
            len=4, pos=3, align=4094:         6.67	        6.00 ( 10.00%)	
               len=5, pos=4, align=0:         4.00	        4.67 (-16.66%)	
               len=5, pos=4, align=0:         4.00	        4.67 (-16.63%)	
            len=5, pos=4, align=4093:         6.67	        6.00 ( 10.00%)	
               len=6, pos=5, align=0:         4.00	        4.67 (-16.65%)	
               len=6, pos=5, align=0:         4.00	        4.67 (-16.66%)	
            len=6, pos=5, align=4093:         6.67	        6.00 (  9.99%)	
               len=7, pos=6, align=0:         4.00	        4.67 (-16.65%)	
               len=7, pos=6, align=0:         4.00	        4.67 (-16.63%)	
            len=7, pos=6, align=4092:         6.67	        6.00 ( 10.00%)	
               len=8, pos=7, align=0:         4.00	        4.67 (-16.65%)	
               len=8, pos=7, align=0:         4.00	        4.67 (-16.66%)	
            len=8, pos=7, align=4092:         6.85	        6.00 ( 12.35%)	
               len=9, pos=8, align=0:         4.00	        4.67 (-16.66%)	
               len=9, pos=8, align=0:         4.00	        4.67 (-16.62%)	
            len=9, pos=8, align=4091:         6.67	        6.00 (  9.99%)	
              len=10, pos=9, align=0:         4.00	        4.67 (-16.60%)	
              len=10, pos=9, align=0:         4.00	        4.67 (-16.65%)	
           len=10, pos=9, align=4091:         6.67	        6.00 (  9.99%)	
             len=11, pos=10, align=0:         4.00	        4.67 (-16.60%)	
             len=11, pos=10, align=0:         4.00	        4.67 (-16.65%)	
          len=11, pos=10, align=4090:         6.67	        6.00 (  9.99%)	
             len=12, pos=11, align=0:         4.00	        4.67 (-16.63%)	
             len=12, pos=11, align=0:         4.00	        4.67 (-16.66%)	
          len=12, pos=11, align=4090:         6.67	        6.00 (  9.99%)	
             len=13, pos=12, align=0:         4.00	        4.67 (-16.63%)	
             len=13, pos=12, align=0:         4.00	        4.67 (-16.70%)	
          len=13, pos=12, align=4089:         6.67	        6.00 (  9.99%)	
             len=14, pos=13, align=0:         4.00	        4.67 (-16.71%)	
             len=14, pos=13, align=0:         4.00	        4.83 (-20.69%)	
          len=14, pos=13, align=4089:         6.67	        6.00 (  9.99%)	
             len=15, pos=14, align=0:         4.00	        4.67 (-16.66%)	
             len=15, pos=14, align=0:         4.00	        4.67 (-16.65%)	
          len=15, pos=14, align=4088:         6.67	        6.00 (  9.97%)	
             len=16, pos=15, align=0:         4.00	        4.67 (-16.70%)	
             len=16, pos=15, align=0:         4.00	        4.67 (-16.65%)	
          len=16, pos=15, align=4088:         6.67	        6.00 (  9.99%)	
             len=17, pos=16, align=0:         4.00	        4.67 (-16.66%)	
             len=17, pos=16, align=0:         4.16	        4.67 (-12.07%)	
          len=17, pos=16, align=4087:         6.67	        6.00 (  9.99%)	
             len=18, pos=17, align=0:         4.00	        4.67 (-16.65%)	
             len=18, pos=17, align=0:         4.00	        4.67 (-16.66%)	
          len=18, pos=17, align=4087:         6.67	        6.00 (  9.99%)	
             len=19, pos=18, align=0:         4.00	        4.67 (-16.65%)	
             len=19, pos=18, align=0:         4.00	        4.67 (-16.65%)	
          len=19, pos=18, align=4086:         6.67	        6.00 (  9.99%)	
             len=20, pos=19, align=0:         4.00	        4.67 (-16.70%)	
             len=20, pos=19, align=0:         4.00	        4.67 (-16.66%)	
          len=20, pos=19, align=4086:         6.67	        6.00 (  9.96%)	
             len=21, pos=20, align=0:         4.00	        4.82 (-20.53%)	
             len=21, pos=20, align=0:         4.00	        4.67 (-16.65%)	
          len=21, pos=20, align=4085:         6.67	        6.00 (  9.99%)	
             len=22, pos=21, align=0:         4.00	        4.67 (-16.63%)	
             len=22, pos=21, align=0:         4.00	        4.67 (-16.65%)	
          len=22, pos=21, align=4085:         6.67	        6.00 (  9.99%)	
             len=23, pos=22, align=0:         4.00	        4.67 (-16.65%)	
             len=23, pos=22, align=0:         4.00	        4.67 (-16.70%)	
          len=23, pos=22, align=4084:         6.67	        6.00 (  9.96%)	
             len=24, pos=23, align=0:         4.00	        4.67 (-16.65%)	
             len=24, pos=23, align=0:         4.00	        4.67 (-16.70%)	
          len=24, pos=23, align=4084:         6.67	        6.00 (  9.99%)	
             len=25, pos=24, align=0:         4.00	        4.67 (-16.63%)	
             len=25, pos=24, align=0:         4.00	        4.67 (-16.66%)	
          len=25, pos=24, align=4083:         6.67	        6.00 (  9.97%)	
             len=26, pos=25, align=0:         4.00	        4.67 (-16.65%)	
             len=26, pos=25, align=0:         4.00	        4.67 (-16.64%)	
          len=26, pos=25, align=4083:         6.67	        6.00 (  9.99%)	
             len=27, pos=26, align=0:         4.00	        4.83 (-20.66%)	
             len=27, pos=26, align=0:         4.00	        4.67 (-16.66%)	
          len=27, pos=26, align=4082:         6.67	        6.00 (  9.99%)	
             len=28, pos=27, align=0:         4.00	        4.67 (-16.66%)	
             len=28, pos=27, align=0:         4.00	        4.67 (-16.65%)	
          len=28, pos=27, align=4082:         6.67	        6.00 (  9.99%)	
             len=29, pos=28, align=0:         4.00	        4.67 (-16.62%)	
             len=29, pos=28, align=0:         4.00	        4.67 (-16.65%)	
          len=29, pos=28, align=4081:         6.67	        6.00 ( 10.00%)	
             len=30, pos=29, align=0:         4.00	        4.67 (-16.65%)	
             len=30, pos=29, align=0:         4.00	        4.67 (-16.65%)	
          len=30, pos=29, align=4081:         6.67	        6.00 (  9.99%)	
             len=31, pos=30, align=0:         4.00	        4.67 (-16.63%)	
             len=31, pos=30, align=0:         4.00	        4.67 (-16.66%)	
          len=31, pos=30, align=4080:         6.67	        6.00 ( 10.02%)	
             len=32, pos=31, align=0:         6.01	        4.67 ( 22.28%)	
             len=32, pos=31, align=0:         6.00	        4.67 ( 22.20%)	
          len=32, pos=31, align=4080:         9.00	        6.00 ( 33.34%)	
           len=2048, pos=32, align=0:        66.79	       43.91 ( 34.25%)	
           len=2048, pos=32, align=1:        86.30	       43.76 ( 49.29%)	
           len=2048, pos=64, align=0:        93.06	       48.59 ( 47.78%)	
           len=2048, pos=64, align=2:        91.49	       43.89 ( 52.03%)	
          len=2048, pos=128, align=0:        60.63	       42.77 ( 29.46%)	
          len=2048, pos=128, align=3:        61.23	       42.61 ( 30.41%)	
          len=2048, pos=256, align=0:        62.65	       44.54 ( 28.92%)	
          len=2048, pos=256, align=4:        62.56	       44.47 ( 28.91%)	
          len=2048, pos=512, align=0:        69.30	       47.13 ( 31.99%)	
          len=2048, pos=512, align=5:        68.40	       48.17 ( 29.57%)	
         len=2048, pos=1024, align=0:        80.41	       52.68 ( 34.48%)	
         len=2048, pos=1024, align=6:        80.58	       52.92 ( 34.32%)	
         len=2048, pos=2048, align=0:        78.58	       67.28 ( 14.37%)	
         len=2048, pos=2048, align=7:        78.98	       67.49 ( 14.54%)	
         len=2048, pos=4096, align=0:        67.66	       55.91 ( 17.37%)	
         len=2048, pos=4096, align=8:        67.71	       55.39 ( 18.19%)	
            len=256, pos=64, align=1:        15.74	       14.18 (  9.91%)	
            len=256, pos=64, align=1:        16.02	       14.31 ( 10.72%)	
           len=256, pos=64, align=15:        14.22	       14.06 (  1.11%)	
           len=256, pos=64, align=15:        13.89	       13.96 ( -0.47%)	
            len=256, pos=64, align=2:        14.01	       14.03 ( -0.14%)	
            len=256, pos=64, align=2:        13.95	       14.21 ( -1.89%)	
           len=256, pos=64, align=30:        13.89	       13.98 ( -0.69%)	
           len=256, pos=64, align=30:        14.08	       14.05 (  0.21%)	
            len=256, pos=64, align=3:        14.08	       13.97 (  0.80%)	
            len=256, pos=64, align=3:        13.89	       14.28 ( -2.80%)	
           len=256, pos=64, align=45:        14.20	       14.14 (  0.43%)	
           len=256, pos=64, align=45:        13.90	       14.29 ( -2.76%)	
            len=256, pos=64, align=4:        13.96	       15.58 (-11.58%)	
            len=256, pos=64, align=4:        14.58	       16.07 (-10.16%)	
           len=256, pos=64, align=60:        14.54	       14.31 (  1.58%)	
           len=256, pos=64, align=60:        14.14	       14.03 (  0.77%)	
            len=256, pos=64, align=5:        13.93	       14.04 ( -0.79%)	
            len=256, pos=64, align=5:        13.43	       13.99 ( -4.16%)	
           len=256, pos=64, align=75:        13.92	       13.94 ( -0.10%)	
           len=256, pos=64, align=75:        13.99	       13.77 (  1.54%)	
            len=256, pos=64, align=6:        14.60	       14.04 (  3.89%)	
            len=256, pos=64, align=6:        14.18	       13.97 (  1.46%)	
           len=256, pos=64, align=90:        14.03	       13.45 (  4.19%)	
           len=256, pos=64, align=90:        14.58	       13.91 (  4.56%)	
            len=256, pos=64, align=7:        14.02	       14.28 ( -1.81%)	
            len=256, pos=64, align=7:        14.11	       14.28 ( -1.17%)	
          len=256, pos=64, align=105:        13.88	       13.56 (  2.31%)	
          len=256, pos=64, align=105:        14.57	       13.46 (  7.61%)	
               len=1, pos=0, align=0:         4.00	        4.67 (-16.59%)	
               len=1, pos=0, align=0:         4.00	        4.67 (-16.66%)	
            len=1, pos=0, align=4095:         6.83	        6.00 ( 12.09%)	
               len=2, pos=1, align=0:         4.01	        4.67 (-16.35%)	
               len=2, pos=1, align=0:         4.01	        4.67 (-16.33%)	
            len=2, pos=1, align=4095:         6.67	        6.00 (  9.99%)	
               len=3, pos=2, align=0:         4.00	        4.68 (-16.95%)	
               len=3, pos=2, align=0:         4.00	        4.68 (-16.89%)	
            len=3, pos=2, align=4094:         6.67	        6.00 ( 10.00%)	
               len=4, pos=3, align=0:         4.01	        4.68 (-16.60%)	
               len=4, pos=3, align=0:         4.01	        4.68 (-16.59%)	
            len=4, pos=3, align=4094:         6.67	        6.00 (  9.99%)	
               len=5, pos=4, align=0:         4.01	        4.68 (-16.59%)	
               len=5, pos=4, align=0:         4.01	        4.68 (-16.62%)	
            len=5, pos=4, align=4093:         6.67	        6.00 (  9.99%)	
               len=6, pos=5, align=0:         4.01	        4.68 (-16.59%)	
               len=6, pos=5, align=0:         4.01	        4.68 (-16.62%)	
            len=6, pos=5, align=4093:         6.83	        6.00 ( 12.08%)	
               len=7, pos=6, align=0:         4.01	        4.68 (-16.57%)	
               len=7, pos=6, align=0:         4.01	        4.84 (-20.65%)	
            len=7, pos=6, align=4092:         6.67	        6.00 (  9.97%)	
               len=8, pos=7, align=0:         4.24	        4.68 (-10.43%)	
               len=8, pos=7, align=0:         4.00	        4.68 (-16.95%)	
            len=8, pos=7, align=4092:         6.67	        6.17 (  7.50%)	
               len=9, pos=8, align=0:         4.01	        4.68 (-16.62%)	
               len=9, pos=8, align=0:         4.01	        4.84 (-20.54%)	
            len=9, pos=8, align=4091:         6.67	        6.00 (  9.99%)	
              len=10, pos=9, align=0:         4.01	        4.84 (-20.63%)	
              len=10, pos=9, align=0:         4.01	        4.68 (-16.66%)	
           len=10, pos=9, align=4091:         6.67	        6.16 (  7.57%)	
             len=11, pos=10, align=0:         4.01	        4.68 (-16.64%)	
             len=11, pos=10, align=0:         4.00	        4.85 (-21.08%)	
          len=11, pos=10, align=4090:         6.67	        6.00 (  9.99%)	
             len=12, pos=11, align=0:         4.01	        4.68 (-16.60%)	
             len=12, pos=11, align=0:         4.01	        4.68 (-16.64%)	
          len=12, pos=11, align=4090:         6.83	        6.00 ( 12.12%)	
             len=13, pos=12, align=0:         4.01	        4.68 (-16.65%)	
             len=13, pos=12, align=0:         4.01	        4.68 (-16.63%)	
          len=13, pos=12, align=4089:         6.67	        6.00 ( 10.00%)	
             len=14, pos=13, align=0:         4.01	        4.68 (-16.61%)	
             len=14, pos=13, align=0:         4.01	        4.68 (-16.60%)	
          len=14, pos=13, align=4089:         6.67	        6.00 (  9.99%)	
             len=15, pos=14, align=0:         4.01	        4.68 (-16.62%)	
             len=15, pos=14, align=0:         4.01	        4.68 (-16.59%)	
          len=15, pos=14, align=4088:         6.67	        6.18 (  7.37%)	
             len=16, pos=15, align=0:         4.01	        4.67 (-16.40%)	
             len=16, pos=15, align=0:         4.01	        4.67 (-16.31%)	
          len=16, pos=15, align=4088:         6.67	        6.00 (  9.99%)	
             len=17, pos=16, align=0:         4.01	        4.68 (-16.62%)	
             len=17, pos=16, align=0:         4.01	        4.68 (-16.59%)	
          len=17, pos=16, align=4087:         6.67	        6.00 ( 10.00%)	
             len=18, pos=17, align=0:         4.01	        4.68 (-16.63%)	
             len=18, pos=17, align=0:         4.01	        4.68 (-16.63%)	
          len=18, pos=17, align=4087:         6.67	        6.00 (  9.99%)	
             len=19, pos=18, align=0:         4.01	        4.68 (-16.60%)	
             len=19, pos=18, align=0:         4.01	        4.68 (-16.65%)	
          len=19, pos=18, align=4086:         6.67	        6.00 ( 10.00%)	
             len=20, pos=19, align=0:         5.35	        4.96 (  7.42%)	
             len=20, pos=19, align=0:         4.24	        4.96 (-17.01%)	
          len=20, pos=19, align=4086:         7.06	        6.35 ( 10.00%)	
             len=21, pos=20, align=0:         4.25	        4.96 (-16.65%)	
             len=21, pos=20, align=0:         4.25	        4.95 (-16.64%)	
          len=21, pos=20, align=4085:         7.06	        6.35 (  9.99%)	
             len=22, pos=21, align=0:         4.25	        4.95 (-16.66%)	
             len=22, pos=21, align=0:         4.25	        4.96 (-16.70%)	
          len=22, pos=21, align=4085:         7.06	        6.35 (  9.99%)	
             len=23, pos=22, align=0:         4.25	        4.95 (-16.61%)	
             len=23, pos=22, align=0:         4.25	        4.95 (-16.59%)	
          len=23, pos=22, align=4084:         7.06	        6.35 ( 10.00%)	
             len=24, pos=23, align=0:         4.25	        4.95 (-16.64%)	
             len=24, pos=23, align=0:         4.25	        4.96 (-16.63%)	
          len=24, pos=23, align=4084:         7.06	        6.35 ( 10.00%)	
             len=25, pos=24, align=0:         4.25	        4.95 (-16.60%)	
             len=25, pos=24, align=0:         4.42	        4.95 (-12.17%)	
          len=25, pos=24, align=4083:         7.06	        6.35 (  9.99%)	
             len=26, pos=25, align=0:         4.25	        4.95 (-16.64%)	
             len=26, pos=25, align=0:         4.25	        4.95 (-16.64%)	
          len=26, pos=25, align=4083:         7.06	        6.57 (  6.91%)	
             len=27, pos=26, align=0:         4.25	        4.94 (-16.39%)	
             len=27, pos=26, align=0:         4.25	        4.94 (-16.35%)	
          len=27, pos=26, align=4082:         7.06	        6.35 (  9.99%)	
             len=28, pos=27, align=0:         4.25	        5.16 (-21.52%)	
             len=28, pos=27, align=0:         4.25	        4.96 (-16.68%)	
          len=28, pos=27, align=4082:         7.06	        6.35 ( 10.00%)	
             len=29, pos=28, align=0:         4.25	        4.95 (-16.61%)	
             len=29, pos=28, align=0:         4.25	        5.15 (-21.28%)	
          len=29, pos=28, align=4081:         7.06	        6.35 (  9.99%)	
             len=30, pos=29, align=0:         4.01	        4.68 (-16.66%)	
             len=30, pos=29, align=0:         4.01	        4.68 (-16.62%)	
          len=30, pos=29, align=4081:         6.87	        6.00 ( 12.59%)	
             len=31, pos=30, align=0:         4.01	        4.68 (-16.66%)	
             len=31, pos=30, align=0:         4.25	        4.95 (-16.63%)	
          len=31, pos=30, align=4080:         6.67	        6.00 (  9.99%)	
             len=32, pos=31, align=0:         6.01	        4.68 ( 22.09%)	
             len=32, pos=31, align=0:         6.00	        4.88 ( 18.74%)	
          len=32, pos=31, align=4080:         8.67	        6.00 ( 30.79%)	

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] x86_64: Implement evex512 version of strrchr and wcsrchr
  2022-09-23  3:57   ` Sunil Pandey
@ 2022-09-29  3:42     ` Sunil Pandey
  2022-09-29  4:06       ` Noah Goldstein
  0 siblings, 1 reply; 7+ messages in thread
From: Sunil Pandey @ 2022-09-29  3:42 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library

On Thu, Sep 22, 2022 at 8:57 PM Sunil Pandey <skpgkp2@gmail.com> wrote:
>
> Microbenchmark data collected on: Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz
>
>
> On Wed, Sep 21, 2022 at 5:50 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > On Wed, Sep 21, 2022 at 5:24 PM Sunil K Pandey via Libc-alpha
> > <libc-alpha@sourceware.org> wrote:
> > >
> > > > This patch implements the following evex512 versions of string functions.
> > > > The evex512 versions take up to 30% fewer cycles than evex,
> > > > depending on length and alignment.
> > >
> >
> > Please attach benchmark numbers.
> >
> > > - strrchr function using 512 bit vectors.
> > > - wcsrchr function using 512 bit vectors.
> > >
> > > Code size data:
> > >
> > > > strrchr-evex.o          833 bytes
> > > > strrchr-evex512.o       573 bytes (-31%)
> > > >
> > > > wcsrchr-evex.o          836 bytes
> > > > wcsrchr-evex512.o       581 bytes (-31%)
> > >
> > > Placeholder function, not used by any processor at the moment.
> > > ---
> > >  sysdeps/x86_64/multiarch/Makefile            |   2 +
> > >  sysdeps/x86_64/multiarch/ifunc-impl-list.c   |  10 +
> > >  sysdeps/x86_64/multiarch/strrchr-evex-base.S | 307 +++++++++++++++++++
> > >  sysdeps/x86_64/multiarch/strrchr-evex512.S   |   7 +
> > >  sysdeps/x86_64/multiarch/wcsrchr-evex512.S   |   8 +
> > >  5 files changed, 334 insertions(+)
> > >  create mode 100644 sysdeps/x86_64/multiarch/strrchr-evex-base.S
> > >  create mode 100644 sysdeps/x86_64/multiarch/strrchr-evex512.S
> > >  create mode 100644 sysdeps/x86_64/multiarch/wcsrchr-evex512.S
> > >
> > > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> > > index df4601c294..6a275f1c3d 100644
> > > --- a/sysdeps/x86_64/multiarch/Makefile
> > > +++ b/sysdeps/x86_64/multiarch/Makefile
> > > @@ -110,6 +110,7 @@ sysdep_routines += \
> > >    strrchr-avx2 \
> > >    strrchr-avx2-rtm \
> > >    strrchr-evex \
> > > +  strrchr-evex512 \
> > >    strrchr-sse2 \
> > >    strspn-sse4 \
> > >    strstr-avx512 \
> > > @@ -152,6 +153,7 @@ sysdep_routines += \
> > >    wcsrchr-avx2 \
> > >    wcsrchr-avx2-rtm \
> > >    wcsrchr-evex \
> > > +  wcsrchr-evex512 \
> > >    wcsrchr-sse2 \
> > >    wmemchr-avx2 \
> > >    wmemchr-avx2-rtm \
> > > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > index a71444eccb..26c941023a 100644
> > > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > @@ -564,6 +564,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > >                                      (CPU_FEATURE_USABLE (AVX512VL)
> > >                                       && CPU_FEATURE_USABLE (AVX512BW)),
> > >                                      __strrchr_evex)
> > > +             X86_IFUNC_IMPL_ADD_V4 (array, i, strrchr,
> > > +                                    (CPU_FEATURE_USABLE (AVX512VL)
> > > +                                     && CPU_FEATURE_USABLE (AVX512BW)
> > > +                                     && CPU_FEATURE_USABLE (BMI2)),
> > > +                                    __strrchr_evex512)
> > >               X86_IFUNC_IMPL_ADD_V3 (array, i, strrchr,
> > >                                      CPU_FEATURE_USABLE (AVX2),
> > >                                      __strrchr_avx2)
> > > @@ -775,6 +780,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > >                                       && CPU_FEATURE_USABLE (AVX512BW)
> > >                                       && CPU_FEATURE_USABLE (BMI2)),
> > >                                      __wcsrchr_evex)
> > > +             X86_IFUNC_IMPL_ADD_V4 (array, i, wcsrchr,
> > > +                                    (CPU_FEATURE_USABLE (AVX512VL)
> > > +                                     && CPU_FEATURE_USABLE (AVX512BW)
> > > +                                     && CPU_FEATURE_USABLE (BMI2)),
> > > +                                    __wcsrchr_evex512)
> > >               X86_IFUNC_IMPL_ADD_V3 (array, i, wcsrchr,
> > >                                      CPU_FEATURE_USABLE (AVX2),
> > >                                      __wcsrchr_avx2)
> > > diff --git a/sysdeps/x86_64/multiarch/strrchr-evex-base.S b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
> > > new file mode 100644
> > > index 0000000000..e937cb193c
> > > --- /dev/null
> > > +++ b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
> > > @@ -0,0 +1,307 @@
> > > +/* Placeholder function, not used by any processor at the moment.
> > > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > > +   This file is part of the GNU C Library.
> > > +
> > > +   The GNU C Library is free software; you can redistribute it and/or
> > > +   modify it under the terms of the GNU Lesser General Public
> > > +   License as published by the Free Software Foundation; either
> > > +   version 2.1 of the License, or (at your option) any later version.
> > > +
> > > +   The GNU C Library is distributed in the hope that it will be useful,
> > > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > +   Lesser General Public License for more details.
> > > +
> > > +   You should have received a copy of the GNU Lesser General Public
> > > +   License along with the GNU C Library; if not, see
> > > +   <https://www.gnu.org/licenses/>.  */
> > > +
> > > +/* UNUSED. Exists purely as reference implementation.  */
> > > +
> > > +#include <isa-level.h>
> > > +
> > > +#if ISA_SHOULD_BUILD (4)
> > > +
> > > +# include <sysdep.h>
> > > +
> > > +# ifdef USE_AS_WCSRCHR
> > > +#  define CHAR_SIZE    4
> > > +#  define VPBROADCAST   vpbroadcastd
> > > +#  define VPCMP                vpcmpd
> > > +#  define VPMINU       vpminud
> > > +#  define VPTESTN      vptestnmd
> > > +# else
> > > +#  define CHAR_SIZE    1
> > > +#  define VPBROADCAST   vpbroadcastb
> > > +#  define VPCMP                vpcmpb
> > > +#  define VPMINU       vpminub
> > > +#  define VPTESTN      vptestnmb
> > > +# endif
> > > +
> > > +# define PAGE_SIZE     4096
> > > +# define CHAR_PER_VEC  (VEC_SIZE / CHAR_SIZE)
> > > +
> > > +# if VEC_SIZE == 64
> > > +#  define BLSMSK       blsmskq
> > > +#  define BSR          bsrq
> > > +#  define KMOV         kmovq
> > > +#  define KOR          korq
> > > +#  define KORTEST      kortestq
> > > +#  define R8           r8
> > > +#  define RAX          rax
> > > +#  define RCX          rcx
> > > +#  define RDX          rdx
> > > +#  define SHR          shrq
> > > +#  define TEXTSUFFIX   evex512
> > > +#  define VMM0         zmm16
> > > +#  define VMM1         zmm17
> > > +#  define VMM2         zmm18
> > > +#  define VMM3         zmm19
> > > +#  define VMM4         zmm20
> > > +#  define VMM5         zmm21
> > > +#  define VMOVA                vmovdqa64
> > > +#  define VMOVU                vmovdqu64
> > > +
> > > +# elif VEC_SIZE == 32
> > > +/* Currently Unused.  */
> > > +#  define BLSMSK       blsmskl
> > > +#  define BSR          bsrl
> > > +#  define KMOV         kmovd
> > > +#  define KOR          kord
> > > +#  define KORTEST      kortestd
> > > +#  define R8           r8d
> > > +#  define RAX          eax
> > > +#  define RCX          ecx
> > > +#  define RDX          edx
> > > +#  define SHR          shrl
> > > +#  define TEXTSUFFIX   evex256
> > > +#  define VMM0         ymm16
> > > +#  define VMM1         ymm17
> > > +#  define VMM2         ymm18
> > > +#  define VMM3         ymm19
> > > +#  define VMM4         ymm20
> > > +#  define VMM5         ymm21
> > > +#  define VMOVA                vmovdqa32
> > > +#  define VMOVU                vmovdqu32
> > > +# endif
> > > +
> > > +       .section .text.TEXTSUFFIX, "ax", @progbits
> > > +/* Aligning entry point to 64 byte, provides better performance for
> > > +   one vector length string.  */
> > > +ENTRY_P2ALIGN (STRRCHR, 6)
> > > +
> > > +       /* Broadcast CHAR to VMM0.  */
> > > +       VPBROADCAST %esi, %VMM0
> > > +       movl    %edi, %eax
> > > +       andl    $(PAGE_SIZE - 1), %eax
> > > +       cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
> > > +       ja      L(page_cross)
> > > +
> > > +L(page_cross_continue):
> > > +       /* Compare [w]char for null, mask bit will be set for match.  */
> > > +       VMOVU   (%rdi), %VMM1
> > > +
> > > +       VPTESTN %VMM1, %VMM1, %k1
> > > +       KMOV    %k1, %RCX
> > > +       test    %RCX, %RCX
> > > +       jz      L(align_more)
> > > +
> > > +       VPCMP   $0, %VMM1, %VMM0, %k0
> > > +       KMOV    %k0, %RAX
> > > +       BLSMSK  %RCX, %RCX
> > > +       and     %RCX, %RAX
> > > +       jz      L(ret)
> > > +
> > > +       BSR     %RAX, %RAX
> > > +# ifdef USE_AS_WCSRCHR
> > > +       leaq    (%rdi, %rax, CHAR_SIZE), %rax
> > > +# else
> > > +       add     %rdi, %rax
> > > +# endif
> > > +L(ret):
> > > +       ret
> > > +
> > > +L(vector_x2_end):
> > > +       VPCMP   $0, %VMM2, %VMM0, %k2
> > > +       KMOV    %k2, %RAX
> > > +       BLSMSK  %RCX, %RCX
> > > +       and     %RCX, %RAX
> > > +       jz      L(vector_x1_ret)
> > > +
> > > +       BSR     %RAX, %RAX
> > > +       leaq    (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
> > > +       ret
> > > +
> > > +       /* Check the first vector at very last to look for match.  */
> > > +L(vector_x1_ret):
> > > +       VPCMP   $0, %VMM1, %VMM0, %k2
> > > +       KMOV    %k2, %RAX
> > > +       test    %RAX, %RAX
> > > +       jz      L(ret)
> > > +
> > > +       BSR     %RAX, %RAX
> > > +# ifdef USE_AS_WCSRCHR
> > > +       leaq    (%rsi, %rax, CHAR_SIZE), %rax
> > > +# else
> > > +       add     %rsi, %rax
> > > +# endif
> > > +       ret
> > > +
> > > +L(align_more):
> > > +       /* Zero r8 to store match result.  */
> > > +       xorq    %r8, %r8
> > > +       /* Save pointer of first vector, in case if no match found.  */
> > > +       movq    %rdi, %rsi
> > > +       /* Align pointer to vector size.  */
> > > +       andq    $-VEC_SIZE, %rdi
> > > +       /* Loop unroll 2 times for 2 vector loop.  */
> > > +       VMOVA   (VEC_SIZE)(%rdi), %VMM2
> > > +       VPTESTN %VMM2, %VMM2, %k0
> > > +       KMOV    %k0, %RCX
> > > +       test    %RCX, %RCX
> > > +       jnz     L(vector_x2_end)
> > > +
> > > +       /* Save pointer of second vector, in case if no match
> > > +          found.  */
> > > +       movq    %rdi, %r9
> > > +       /* Align address to VEC_SIZE * 2 for loop.  */
> > > +       andq    $-(VEC_SIZE * 2), %rdi
> > > +
> > > +       .p2align 4,,11
> > > +L(loop):
> > > +       /* 2 vector loop, as it provide better performance as compared
> > > +          to 4 vector loop.  */
> > > +       VMOVA   (VEC_SIZE * 2)(%rdi), %VMM3
> > > +       VMOVA   (VEC_SIZE * 3)(%rdi), %VMM4
> > > +       VPCMP   $0, %VMM3, %VMM0, %k1
> > > +       VPCMP   $0, %VMM4, %VMM0, %k2
> > > +       VPMINU  %VMM3, %VMM4, %VMM5
> > > +       VPTESTN %VMM5, %VMM5, %k0
> > > +       KOR     %k1, %k2, %k3
> > > +       subq    $-(VEC_SIZE * 2), %rdi
> > > +       /* If k0 and k3 zero, match and end of string not found.  */
> > > +       KORTEST %k0, %k3
> > > +       jz      L(loop)
> > > +
> > > +       /* If k0 is non zero, end of string found.  */
> > > +       KORTEST %k0, %k0
> > > +       jnz     L(endloop)
> > > +
> > > +       /* A match found, it need to be stored in r8 before loop
> > > +          continue.  */
> > > +       /* Check second vector first.  */
> > > +       KMOV    %k2, %RDX
> > > +       test    %RDX, %RDX
> > > +       jz      L(loop_vec_x3_ret)
> > > +
> > > +       BSR     %RDX, %RDX
> > > +       leaq    (VEC_SIZE)(%rdi, %rdx, CHAR_SIZE), %r8
> > > +       jmp     L(loop)
> > > +
> > > +       /* If second vector doesn't have match, first vector must
> > > +          have match.  */
> > > +L(loop_vec_x3_ret):
> > > +       KMOV    %k1, %R8
> > > +       BSR     %R8, %R8
> > > +# ifdef USE_AS_WCSRCHR
> > > +       leaq    (%rdi, %r8, CHAR_SIZE), %r8
> > > +# else
> > > +       add     %rdi, %r8
> > > +# endif
> > > +       jmp     L(loop)
> > > +
> > > +L(endloop):
> > > +       /* Check if string end in first loop vector.  */
> > > +       VPTESTN %VMM3, %VMM3, %k0
> > > +       KMOV    %k0, %RCX
> > > +       test    %RCX, %RCX
> > > +       jnz     L(vector_x3_end)
> > > +
> > > +       /* Check if it has match in first loop vector.  */
> > > +       KMOV    %k1, %RAX
> > > +       test    %RAX, %RAX
> > > +       jz      L(vector_x4_end)
> > > +
> > > +       BSR     %RAX, %RAX
> > > +       leaq    (%rdi, %rax, CHAR_SIZE), %r8
> > > +
> > > +       /* String must end in second loop vector.  */
> > > +L(vector_x4_end):
> > > +       VPTESTN %VMM4, %VMM4, %k0
> > > +       KMOV    %k0, %RCX
> > > +       KMOV    %k2, %RAX
> > > +       BLSMSK  %RCX, %RCX
> > > +       /* Check if it has match in second loop vector.  */
> > > +       and     %RCX, %RAX
> > > +       jz      L(check_last_match)
> > > +
> > > +       BSR     %RAX, %RAX
> > > +       leaq    (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
> > > +       ret
> > > +
> > > +       /* String end in first loop vector.  */
> > > +L(vector_x3_end):
> > > +       KMOV    %k1, %RAX
> > > +       BLSMSK  %RCX, %RCX
> > > +       /* Check if it has match in first loop vector.  */
> > > +       and     %RCX, %RAX
> > > +       jz      L(check_last_match)
> > > +
> > > +       BSR     %RAX, %RAX
> > > +       leaq    (%rdi, %rax, CHAR_SIZE), %rax
> > > +       ret
> > > +
> > > +       /* No match in first and second loop vector.  */
> > > +L(check_last_match):
> > > +       /* Check if any match recorded in r8.  */
> > > +       test    %r8, %r8
> > > +       jz      L(vector_x2_ret)
> > > +       movq    %r8, %rax
> > > +       ret
> > > +
> > > +       /* No match recorded in r8. Check the second saved vector
> > > +          at the beginning.  */
> > > +L(vector_x2_ret):
> > > +       VPCMP   $0, %VMM2, %VMM0, %k2
> > > +       KMOV    %k2, %RAX
> > > +       test    %RAX, %RAX
> > > +       jz      L(vector_x1_ret)
> > > +
> > > +       /* Match found in the second saved vector.  */
> > > +       BSR     %RAX, %RAX
> > > +       leaq    (VEC_SIZE)(%r9, %rax, CHAR_SIZE), %rax
> > > +       ret
> > > +
> > > +L(page_cross):
> > > +       movl    %eax, %ecx
> > > +# ifdef USE_AS_WCSRCHR
> > > +       /* Calculate number of compare result bits to be skipped for
> > > +          wide string alignment adjustment.  */
> > > +       andl    $(VEC_SIZE - 1), %ecx
> > > +       sarl    $2, %ecx
> > > +# endif
> > > +       /* ecx contains number of w[char] to be skipped as a result
> > > +          of address alignment.  */
> > > +       xorq    %rdi, %rax
> > > +       VMOVA   (PAGE_SIZE - VEC_SIZE)(%rax), %VMM1
> > > +
> > > +       VPTESTN %VMM1, %VMM1, %k1
> > > +       KMOV    %k1, %RAX
> > > +       SHR     %cl, %RAX
> > > +       jz      L(page_cross_continue)
> > > +       VPCMP   $0, %VMM1, %VMM0, %k0
> > > +       KMOV    %k0, %RDX
> > > +       SHR     %cl, %RDX
> > > +       BLSMSK  %RAX, %RAX
> > > +       and     %RDX, %RAX
> > > +       jz      L(ret)
> > > +       BSR     %RAX, %RAX
> > > +# ifdef USE_AS_WCSRCHR
> > > +       leaq    (%rdi, %rax, CHAR_SIZE), %rax
> > > +# else
> > > +       add     %rdi, %rax
> > > +# endif
> > > +
> > > +       ret
> > > +END (STRRCHR)
> > > +#endif
> > > diff --git a/sysdeps/x86_64/multiarch/strrchr-evex512.S b/sysdeps/x86_64/multiarch/strrchr-evex512.S
> > > new file mode 100644
> > > index 0000000000..f880848e09
> > > --- /dev/null
> > > +++ b/sysdeps/x86_64/multiarch/strrchr-evex512.S
> > > @@ -0,0 +1,7 @@
> > > +# ifndef STRRCHR
> > > +#  define STRRCHR      __strrchr_evex512
> > > +# endif
> > > +
> > > +#define VEC_SIZE       64
> > > +
> > > +#include "strrchr-evex-base.S"
> > > diff --git a/sysdeps/x86_64/multiarch/wcsrchr-evex512.S b/sysdeps/x86_64/multiarch/wcsrchr-evex512.S
> > > new file mode 100644
> > > index 0000000000..65b7710b22
> > > --- /dev/null
> > > +++ b/sysdeps/x86_64/multiarch/wcsrchr-evex512.S
> > > @@ -0,0 +1,8 @@
> > > +#ifndef WCSRCHR
> > > +# define WCSRCHR       __wcsrchr_evex512
> > > +#endif
> > > +
> > > +#define STRRCHR        WCSRCHR
> > > +#define USE_AS_WCSRCHR 1
> > > +
> > > +#include "strrchr-evex512.S"
> > > --
> > > 2.36.1
> > >

ping

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] x86_64: Implement evex512 version of strrchr and wcsrchr
  2022-09-29  3:42     ` Sunil Pandey
@ 2022-09-29  4:06       ` Noah Goldstein
  2022-09-30 18:49         ` Sunil Pandey
  0 siblings, 1 reply; 7+ messages in thread
From: Noah Goldstein @ 2022-09-29  4:06 UTC (permalink / raw)
  To: Sunil Pandey; +Cc: GNU C Library

On Wed, Sep 28, 2022 at 8:42 PM Sunil Pandey <skpgkp2@gmail.com> wrote:
>
> On Thu, Sep 22, 2022 at 8:57 PM Sunil Pandey <skpgkp2@gmail.com> wrote:
> >
> > Microbenchmark data collected on: Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz
> >
> >
> > On Wed, Sep 21, 2022 at 5:50 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > >
> > > On Wed, Sep 21, 2022 at 5:24 PM Sunil K Pandey via Libc-alpha
> > > <libc-alpha@sourceware.org> wrote:
> > > >
> > > > This patch implements following evex512 version of string functions.
> > > > evex512 version takes up to 30% less cycle as compared to evex,
> > > > depending on length and alignment.
> > > >
> > >
> > > Please attach benchmark numbers.
> > >
> > > > - strrchr function using 512 bit vectors.
> > > > - wcsrchr function using 512 bit vectors.
> > > >
> > > > Code size data:
> > > >
> > > > strrchr-evex.o          833 byte
> > > > strrchr-evex512.o       573 byte (-31%)
> > > >
> > > > wcsrchr-evex.o          836 byte
> > > > wcsrchr-evex512.o       581 byte (-31%)
> > > >
> > > > Placeholder function, not used by any processor at the moment.
> > > > ---
> > > >  sysdeps/x86_64/multiarch/Makefile            |   2 +
> > > >  sysdeps/x86_64/multiarch/ifunc-impl-list.c   |  10 +
> > > >  sysdeps/x86_64/multiarch/strrchr-evex-base.S | 307 +++++++++++++++++++
> > > >  sysdeps/x86_64/multiarch/strrchr-evex512.S   |   7 +
> > > >  sysdeps/x86_64/multiarch/wcsrchr-evex512.S   |   8 +
> > > >  5 files changed, 334 insertions(+)
> > > >  create mode 100644 sysdeps/x86_64/multiarch/strrchr-evex-base.S
> > > >  create mode 100644 sysdeps/x86_64/multiarch/strrchr-evex512.S
> > > >  create mode 100644 sysdeps/x86_64/multiarch/wcsrchr-evex512.S
> > > >
> > > > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> > > > index df4601c294..6a275f1c3d 100644
> > > > --- a/sysdeps/x86_64/multiarch/Makefile
> > > > +++ b/sysdeps/x86_64/multiarch/Makefile
> > > > @@ -110,6 +110,7 @@ sysdep_routines += \
> > > >    strrchr-avx2 \
> > > >    strrchr-avx2-rtm \
> > > >    strrchr-evex \
> > > > +  strrchr-evex512 \
> > > >    strrchr-sse2 \
> > > >    strspn-sse4 \
> > > >    strstr-avx512 \
> > > > @@ -152,6 +153,7 @@ sysdep_routines += \
> > > >    wcsrchr-avx2 \
> > > >    wcsrchr-avx2-rtm \
> > > >    wcsrchr-evex \
> > > > +  wcsrchr-evex512 \
> > > >    wcsrchr-sse2 \
> > > >    wmemchr-avx2 \
> > > >    wmemchr-avx2-rtm \
> > > > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > > index a71444eccb..26c941023a 100644
> > > > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > > @@ -564,6 +564,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > > >                                      (CPU_FEATURE_USABLE (AVX512VL)
> > > >                                       && CPU_FEATURE_USABLE (AVX512BW)),
> > > >                                      __strrchr_evex)
> > > > +             X86_IFUNC_IMPL_ADD_V4 (array, i, strrchr,
> > > > +                                    (CPU_FEATURE_USABLE (AVX512VL)
> > > > +                                     && CPU_FEATURE_USABLE (AVX512BW)
> > > > +                                     && CPU_FEATURE_USABLE (BMI2)),
> > > > +                                    __strrchr_evex512)
> > > >               X86_IFUNC_IMPL_ADD_V3 (array, i, strrchr,
> > > >                                      CPU_FEATURE_USABLE (AVX2),
> > > >                                      __strrchr_avx2)
> > > > @@ -775,6 +780,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > > >                                       && CPU_FEATURE_USABLE (AVX512BW)
> > > >                                       && CPU_FEATURE_USABLE (BMI2)),
> > > >                                      __wcsrchr_evex)
> > > > +             X86_IFUNC_IMPL_ADD_V4 (array, i, wcsrchr,
> > > > +                                    (CPU_FEATURE_USABLE (AVX512VL)
> > > > +                                     && CPU_FEATURE_USABLE (AVX512BW)
> > > > +                                     && CPU_FEATURE_USABLE (BMI2)),
> > > > +                                    __wcsrchr_evex512)
> > > >               X86_IFUNC_IMPL_ADD_V3 (array, i, wcsrchr,
> > > >                                      CPU_FEATURE_USABLE (AVX2),
> > > >                                      __wcsrchr_avx2)
> > > > diff --git a/sysdeps/x86_64/multiarch/strrchr-evex-base.S b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
> > > > new file mode 100644
> > > > index 0000000000..e937cb193c
> > > > --- /dev/null
> > > > +++ b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
> > > > @@ -0,0 +1,307 @@
> > > > +/* Placeholder function, not used by any processor at the moment.
> > > > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > > > +   This file is part of the GNU C Library.
> > > > +
> > > > +   The GNU C Library is free software; you can redistribute it and/or
> > > > +   modify it under the terms of the GNU Lesser General Public
> > > > +   License as published by the Free Software Foundation; either
> > > > +   version 2.1 of the License, or (at your option) any later version.
> > > > +
> > > > +   The GNU C Library is distributed in the hope that it will be useful,
> > > > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > > +   Lesser General Public License for more details.
> > > > +
> > > > +   You should have received a copy of the GNU Lesser General Public
> > > > +   License along with the GNU C Library; if not, see
> > > > +   <https://www.gnu.org/licenses/>.  */
> > > > +
> > > > +/* UNUSED. Exists purely as reference implementation.  */
> > > > +
> > > > +#include <isa-level.h>
> > > > +
> > > > +#if ISA_SHOULD_BUILD (4)
> > > > +
> > > > +# include <sysdep.h>
> > > > +
> > > > +# ifdef USE_AS_WCSRCHR
> > > > +#  define CHAR_SIZE    4
> > > > +#  define VPBROADCAST   vpbroadcastd
> > > > +#  define VPCMP                vpcmpd
> > > > +#  define VPMINU       vpminud
> > > > +#  define VPTESTN      vptestnmd
> > > > +# else
> > > > +#  define CHAR_SIZE    1
> > > > +#  define VPBROADCAST   vpbroadcastb
> > > > +#  define VPCMP                vpcmpb
> > > > +#  define VPMINU       vpminub
> > > > +#  define VPTESTN      vptestnmb
> > > > +# endif
> > > > +
> > > > +# define PAGE_SIZE     4096
> > > > +# define CHAR_PER_VEC  (VEC_SIZE / CHAR_SIZE)
> > > > +
> > > > +# if VEC_SIZE == 64
> > > > +#  define BLSMSK       blsmskq
> > > > +#  define BSR          bsrq
> > > > +#  define KMOV         kmovq
> > > > +#  define KOR          korq
> > > > +#  define KORTEST      kortestq
> > > > +#  define R8           r8
> > > > +#  define RAX          rax
> > > > +#  define RCX          rcx
> > > > +#  define RDX          rdx
> > > > +#  define SHR          shrq
> > > > +#  define TEXTSUFFIX   evex512
> > > > +#  define VMM0         zmm16
> > > > +#  define VMM1         zmm17
> > > > +#  define VMM2         zmm18
> > > > +#  define VMM3         zmm19
> > > > +#  define VMM4         zmm20
> > > > +#  define VMM5         zmm21
> > > > +#  define VMOVA                vmovdqa64
> > > > +#  define VMOVU                vmovdqu64
> > > > +
> > > > +# elif VEC_SIZE == 32
> > > > +/* Currently Unused.  */
> > > > +#  define BLSMSK       blsmskl
> > > > +#  define BSR          bsrl
> > > > +#  define KMOV         kmovd
> > > > +#  define KOR          kord
> > > > +#  define KORTEST      kortestd
> > > > +#  define R8           r8d
> > > > +#  define RAX          eax
> > > > +#  define RCX          ecx
> > > > +#  define RDX          edx
> > > > +#  define SHR          shrl
> > > > +#  define TEXTSUFFIX   evex256
> > > > +#  define VMM0         ymm16
> > > > +#  define VMM1         ymm17
> > > > +#  define VMM2         ymm18
> > > > +#  define VMM3         ymm19
> > > > +#  define VMM4         ymm20
> > > > +#  define VMM5         ymm21
> > > > +#  define VMOVA                vmovdqa32
> > > > +#  define VMOVU                vmovdqu32
> > > > +# endif
> > > > +
> > > > +       .section .text.TEXTSUFFIX, "ax", @progbits
> > > > +/* Aligning entry point to 64 byte, provides better performance for
> > > > +   one vector length string.  */
> > > > +ENTRY_P2ALIGN (STRRCHR, 6)
> > > > +
> > > > +       /* Broadcast CHAR to VMM0.  */
> > > > +       VPBROADCAST %esi, %VMM0
> > > > +       movl    %edi, %eax
> > > > +       andl    $(PAGE_SIZE - 1), %eax
> > > > +       cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
> > > > +       ja      L(page_cross)
> > > > +
> > > > +L(page_cross_continue):
> > > > +       /* Compare [w]char for null, mask bit will be set for match.  */
> > > > +       VMOVU   (%rdi), %VMM1
> > > > +
> > > > +       VPTESTN %VMM1, %VMM1, %k1
> > > > +       KMOV    %k1, %RCX
> > > > +       test    %RCX, %RCX
> > > > +       jz      L(align_more)
> > > > +
> > > > +       VPCMP   $0, %VMM1, %VMM0, %k0
> > > > +       KMOV    %k0, %RAX
> > > > +       BLSMSK  %RCX, %RCX
> > > > +       and     %RCX, %RAX
> > > > +       jz      L(ret)
> > > > +
> > > > +       BSR     %RAX, %RAX
> > > > +# ifdef USE_AS_WCSRCHR
> > > > +       leaq    (%rdi, %rax, CHAR_SIZE), %rax
> > > > +# else
> > > > +       add     %rdi, %rax
> > > > +# endif
> > > > +L(ret):
> > > > +       ret
> > > > +
> > > > +L(vector_x2_end):
> > > > +       VPCMP   $0, %VMM2, %VMM0, %k2
> > > > +       KMOV    %k2, %RAX
> > > > +       BLSMSK  %RCX, %RCX
> > > > +       and     %RCX, %RAX
> > > > +       jz      L(vector_x1_ret)
> > > > +
> > > > +       BSR     %RAX, %RAX
> > > > +       leaq    (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
> > > > +       ret
> > > > +
> > > > +       /* Check the first vector at very last to look for match.  */
> > > > +L(vector_x1_ret):
> > > > +       VPCMP   $0, %VMM1, %VMM0, %k2
> > > > +       KMOV    %k2, %RAX
> > > > +       test    %RAX, %RAX
> > > > +       jz      L(ret)
> > > > +
> > > > +       BSR     %RAX, %RAX
> > > > +# ifdef USE_AS_WCSRCHR
> > > > +       leaq    (%rsi, %rax, CHAR_SIZE), %rax
> > > > +# else
> > > > +       add     %rsi, %rax
> > > > +# endif
> > > > +       ret
> > > > +
> > > > +L(align_more):
> > > > +       /* Zero r8 to store match result.  */
> > > > +       xorq    %r8, %r8
> > > > +       /* Save pointer of first vector, in case if no match found.  */
> > > > +       movq    %rdi, %rsi
> > > > +       /* Align pointer to vector size.  */
> > > > +       andq    $-VEC_SIZE, %rdi
> > > > +       /* Loop unroll 2 times for 2 vector loop.  */
> > > > +       VMOVA   (VEC_SIZE)(%rdi), %VMM2
> > > > +       VPTESTN %VMM2, %VMM2, %k0
> > > > +       KMOV    %k0, %RCX
> > > > +       test    %RCX, %RCX
> > > > +       jnz     L(vector_x2_end)
> > > > +
> > > > +       /* Save pointer of second vector, in case if no match
> > > > +          found.  */
> > > > +       movq    %rdi, %r9
> > > > +       /* Align address to VEC_SIZE * 2 for loop.  */
> > > > +       andq    $-(VEC_SIZE * 2), %rdi
> > > > +
> > > > +       .p2align 4,,11
> > > > +L(loop):
> > > > +       /* 2 vector loop, as it provide better performance as compared
> > > > +          to 4 vector loop.  */
> > > > +       VMOVA   (VEC_SIZE * 2)(%rdi), %VMM3
> > > > +       VMOVA   (VEC_SIZE * 3)(%rdi), %VMM4
> > > > +       VPCMP   $0, %VMM3, %VMM0, %k1
> > > > +       VPCMP   $0, %VMM4, %VMM0, %k2
> > > > +       VPMINU  %VMM3, %VMM4, %VMM5
> > > > +       VPTESTN %VMM5, %VMM5, %k0
> > > > +       KOR     %k1, %k2, %k3
> > > > +       subq    $-(VEC_SIZE * 2), %rdi
> > > > +       /* If k0 and k3 zero, match and end of string not found.  */
> > > > +       KORTEST %k0, %k3
> > > > +       jz      L(loop)
> > > > +
> > > > +       /* If k0 is non zero, end of string found.  */
> > > > +       KORTEST %k0, %k0
> > > > +       jnz     L(endloop)
> > > > +
> > > > +       /* A match found, it need to be stored in r8 before loop
> > > > +          continue.  */
> > > > +       /* Check second vector first.  */
> > > > +       KMOV    %k2, %RDX
> > > > +       test    %RDX, %RDX
> > > > +       jz      L(loop_vec_x3_ret)
> > > > +
> > > > +       BSR     %RDX, %RDX
> > > > +       leaq    (VEC_SIZE)(%rdi, %rdx, CHAR_SIZE), %r8
> > > > +       jmp     L(loop)
> > > > +
> > > > +       /* If second vector doesn't have match, first vector must
> > > > +          have match.  */
> > > > +L(loop_vec_x3_ret):
> > > > +       KMOV    %k1, %R8
> > > > +       BSR     %R8, %R8
> > > > +# ifdef USE_AS_WCSRCHR
> > > > +       leaq    (%rdi, %r8, CHAR_SIZE), %r8
> > > > +# else
> > > > +       add     %rdi, %r8
> > > > +# endif
> > > > +       jmp     L(loop)
> > > > +
> > > > +L(endloop):
> > > > +       /* Check if string end in first loop vector.  */
> > > > +       VPTESTN %VMM3, %VMM3, %k0
> > > > +       KMOV    %k0, %RCX
> > > > +       test    %RCX, %RCX
> > > > +       jnz     L(vector_x3_end)
> > > > +
> > > > +       /* Check if it has match in first loop vector.  */
> > > > +       KMOV    %k1, %RAX
> > > > +       test    %RAX, %RAX
> > > > +       jz      L(vector_x4_end)
> > > > +
> > > > +       BSR     %RAX, %RAX
> > > > +       leaq    (%rdi, %rax, CHAR_SIZE), %r8
> > > > +
> > > > +       /* String must end in second loop vector.  */
> > > > +L(vector_x4_end):
> > > > +       VPTESTN %VMM4, %VMM4, %k0
> > > > +       KMOV    %k0, %RCX
> > > > +       KMOV    %k2, %RAX
> > > > +       BLSMSK  %RCX, %RCX
> > > > +       /* Check if it has match in second loop vector.  */
> > > > +       and     %RCX, %RAX
> > > > +       jz      L(check_last_match)
> > > > +
> > > > +       BSR     %RAX, %RAX
> > > > +       leaq    (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
> > > > +       ret
> > > > +
> > > > +       /* String end in first loop vector.  */
> > > > +L(vector_x3_end):
> > > > +       KMOV    %k1, %RAX
> > > > +       BLSMSK  %RCX, %RCX
> > > > +       /* Check if it has match in first loop vector.  */
> > > > +       and     %RCX, %RAX
> > > > +       jz      L(check_last_match)
> > > > +
> > > > +       BSR     %RAX, %RAX
> > > > +       leaq    (%rdi, %rax, CHAR_SIZE), %rax
> > > > +       ret
> > > > +
> > > > +       /* No match in first and second loop vector.  */
> > > > +L(check_last_match):
> > > > +       /* Check if any match recorded in r8.  */
> > > > +       test    %r8, %r8
> > > > +       jz      L(vector_x2_ret)
> > > > +       movq    %r8, %rax
> > > > +       ret
> > > > +
> > > > +       /* No match recorded in r8. Check the second saved vector
> > > > +          at the beginning.  */
> > > > +L(vector_x2_ret):
> > > > +       VPCMP   $0, %VMM2, %VMM0, %k2
> > > > +       KMOV    %k2, %RAX
> > > > +       test    %RAX, %RAX
> > > > +       jz      L(vector_x1_ret)
> > > > +
> > > > +       /* Match found in the second saved vector.  */
> > > > +       BSR     %RAX, %RAX
> > > > +       leaq    (VEC_SIZE)(%r9, %rax, CHAR_SIZE), %rax
> > > > +       ret
> > > > +
> > > > +L(page_cross):
> > > > +       movl    %eax, %ecx
> > > > +# ifdef USE_AS_WCSRCHR
> > > > +       /* Calculate number of compare result bits to be skipped for
> > > > +          wide string alignment adjustment.  */
> > > > +       andl    $(VEC_SIZE - 1), %ecx
> > > > +       sarl    $2, %ecx
> > > > +# endif
> > > > +       /* ecx contains number of w[char] to be skipped as a result
> > > > +          of address alignment.  */
> > > > +       xorq    %rdi, %rax
> > > > +       VMOVA   (PAGE_SIZE - VEC_SIZE)(%rax), %VMM1
> > > > +
> > > > +       VPTESTN %VMM1, %VMM1, %k1
> > > > +       KMOV    %k1, %RAX
> > > > +       SHR     %cl, %RAX
> > > > +       jz      L(page_cross_continue)
> > > > +       VPCMP   $0, %VMM1, %VMM0, %k0
> > > > +       KMOV    %k0, %RDX
> > > > +       SHR     %cl, %RDX
> > > > +       BLSMSK  %RAX, %RAX
> > > > +       and     %RDX, %RAX
> > > > +       jz      L(ret)
> > > > +       BSR     %RAX, %RAX
> > > > +# ifdef USE_AS_WCSRCHR
> > > > +       leaq    (%rdi, %rax, CHAR_SIZE), %rax
> > > > +# else
> > > > +       add     %rdi, %rax
> > > > +# endif
> > > > +
> > > > +       ret
> > > > +END (STRRCHR)
> > > > +#endif
> > > > diff --git a/sysdeps/x86_64/multiarch/strrchr-evex512.S b/sysdeps/x86_64/multiarch/strrchr-evex512.S
> > > > new file mode 100644
> > > > index 0000000000..f880848e09
> > > > --- /dev/null
> > > > +++ b/sysdeps/x86_64/multiarch/strrchr-evex512.S
> > > > @@ -0,0 +1,7 @@
> > > > +# ifndef STRRCHR
> > > > +#  define STRRCHR      __strrchr_evex512
> > > > +# endif
> > > > +
> > > > +#define VEC_SIZE       64
> > > > +
> > > > +#include "strrchr-evex-base.S"
> > > > diff --git a/sysdeps/x86_64/multiarch/wcsrchr-evex512.S b/sysdeps/x86_64/multiarch/wcsrchr-evex512.S
> > > > new file mode 100644
> > > > index 0000000000..65b7710b22
> > > > --- /dev/null
> > > > +++ b/sysdeps/x86_64/multiarch/wcsrchr-evex512.S
> > > > @@ -0,0 +1,8 @@
> > > > +#ifndef WCSRCHR
> > > > +# define WCSRCHR       __wcsrchr_evex512
> > > > +#endif
> > > > +
> > > > +#define STRRCHR        WCSRCHR
> > > > +#define USE_AS_WCSRCHR 1
> > > > +
> > > > +#include "strrchr-evex512.S"
> > > > --
> > > > 2.36.1
> > > >
>
> ping

Regarding this patch, along with the corresponding memchr and strchr
ones, I would prefer to try to implement the ZMM version alongside
the YMM version, similar to what we do in memset/memmove.

Since all/nearly all of the instructions are the same this shouldn't
be too difficult with the `VEC(n)` macros.

Examples are:
https://gitlab.com/x86-glibc/glibc/-/tree/users/goldsteinn/memcmp-evex512

and there is a congruent patch to strlen to do the same (still in the
works):
https://gitlab.com/x86-glibc/glibc/-/tree/users/goldsteinn/evex512

There are many good ideas in these patches that I believe would also
apply to the YMM implementations, and I think it would be best to ensure
both files are as close to optimal as we can get them, as opposed to
adding yet another bespoke implementation we need to maintain / keep
optimized.

Can you try and integrate this and the memchr/strchr implementations
similar to how we do memmove/memset?

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] x86_64: Implement evex512 version of strrchr and wcsrchr
  2022-09-29  4:06       ` Noah Goldstein
@ 2022-09-30 18:49         ` Sunil Pandey
  2022-09-30 19:09           ` Noah Goldstein
  0 siblings, 1 reply; 7+ messages in thread
From: Sunil Pandey @ 2022-09-30 18:49 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library

On Wed, Sep 28, 2022 at 9:06 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Wed, Sep 28, 2022 at 8:42 PM Sunil Pandey <skpgkp2@gmail.com> wrote:
> >
> > On Thu, Sep 22, 2022 at 8:57 PM Sunil Pandey <skpgkp2@gmail.com> wrote:
> > >
> > > Microbenchmark data collected on: Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz
> > >
> > >
> > > On Wed, Sep 21, 2022 at 5:50 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > > >
> > > > On Wed, Sep 21, 2022 at 5:24 PM Sunil K Pandey via Libc-alpha
> > > > <libc-alpha@sourceware.org> wrote:
> > > > >
> > > > > This patch implements following evex512 version of string functions.
> > > > > evex512 version takes up to 30% less cycle as compared to evex,
> > > > > depending on length and alignment.
> > > > >
> > > >
> > > > Please attach benchmark numbers.
> > > >
> > > > > - strrchr function using 512 bit vectors.
> > > > > - wcsrchr function using 512 bit vectors.
> > > > >
> > > > > Code size data:
> > > > >
> > > > > strrchr-evex.o          833 byte
> > > > > strrchr-evex512.o       573 byte (-31%)
> > > > >
> > > > > wcsrchr-evex.o          836 byte
> > > > > wcsrchr-evex512.o       581 byte (-31%)
> > > > >
> > > > > Placeholder function, not used by any processor at the moment.
> > > > > ---
> > > > >  sysdeps/x86_64/multiarch/Makefile            |   2 +
> > > > >  sysdeps/x86_64/multiarch/ifunc-impl-list.c   |  10 +
> > > > >  sysdeps/x86_64/multiarch/strrchr-evex-base.S | 307 +++++++++++++++++++
> > > > >  sysdeps/x86_64/multiarch/strrchr-evex512.S   |   7 +
> > > > >  sysdeps/x86_64/multiarch/wcsrchr-evex512.S   |   8 +
> > > > >  5 files changed, 334 insertions(+)
> > > > >  create mode 100644 sysdeps/x86_64/multiarch/strrchr-evex-base.S
> > > > >  create mode 100644 sysdeps/x86_64/multiarch/strrchr-evex512.S
> > > > >  create mode 100644 sysdeps/x86_64/multiarch/wcsrchr-evex512.S
> > > > >
> > > > > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> > > > > index df4601c294..6a275f1c3d 100644
> > > > > --- a/sysdeps/x86_64/multiarch/Makefile
> > > > > +++ b/sysdeps/x86_64/multiarch/Makefile
> > > > > @@ -110,6 +110,7 @@ sysdep_routines += \
> > > > >    strrchr-avx2 \
> > > > >    strrchr-avx2-rtm \
> > > > >    strrchr-evex \
> > > > > +  strrchr-evex512 \
> > > > >    strrchr-sse2 \
> > > > >    strspn-sse4 \
> > > > >    strstr-avx512 \
> > > > > @@ -152,6 +153,7 @@ sysdep_routines += \
> > > > >    wcsrchr-avx2 \
> > > > >    wcsrchr-avx2-rtm \
> > > > >    wcsrchr-evex \
> > > > > +  wcsrchr-evex512 \
> > > > >    wcsrchr-sse2 \
> > > > >    wmemchr-avx2 \
> > > > >    wmemchr-avx2-rtm \
> > > > > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > > > index a71444eccb..26c941023a 100644
> > > > > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > > > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > > > @@ -564,6 +564,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > > > >                                      (CPU_FEATURE_USABLE (AVX512VL)
> > > > >                                       && CPU_FEATURE_USABLE (AVX512BW)),
> > > > >                                      __strrchr_evex)
> > > > > +             X86_IFUNC_IMPL_ADD_V4 (array, i, strrchr,
> > > > > +                                    (CPU_FEATURE_USABLE (AVX512VL)
> > > > > +                                     && CPU_FEATURE_USABLE (AVX512BW)
> > > > > +                                     && CPU_FEATURE_USABLE (BMI2)),
> > > > > +                                    __strrchr_evex512)
> > > > >               X86_IFUNC_IMPL_ADD_V3 (array, i, strrchr,
> > > > >                                      CPU_FEATURE_USABLE (AVX2),
> > > > >                                      __strrchr_avx2)
> > > > > @@ -775,6 +780,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > > > >                                       && CPU_FEATURE_USABLE (AVX512BW)
> > > > >                                       && CPU_FEATURE_USABLE (BMI2)),
> > > > >                                      __wcsrchr_evex)
> > > > > +             X86_IFUNC_IMPL_ADD_V4 (array, i, wcsrchr,
> > > > > +                                    (CPU_FEATURE_USABLE (AVX512VL)
> > > > > +                                     && CPU_FEATURE_USABLE (AVX512BW)
> > > > > +                                     && CPU_FEATURE_USABLE (BMI2)),
> > > > > +                                    __wcsrchr_evex512)
> > > > >               X86_IFUNC_IMPL_ADD_V3 (array, i, wcsrchr,
> > > > >                                      CPU_FEATURE_USABLE (AVX2),
> > > > >                                      __wcsrchr_avx2)
> > > > > diff --git a/sysdeps/x86_64/multiarch/strrchr-evex-base.S b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
> > > > > new file mode 100644
> > > > > index 0000000000..e937cb193c
> > > > > --- /dev/null
> > > > > +++ b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
> > > > > @@ -0,0 +1,307 @@
> > > > > +/* Placeholder function, not used by any processor at the moment.
> > > > > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > > > > +   This file is part of the GNU C Library.
> > > > > +
> > > > > +   The GNU C Library is free software; you can redistribute it and/or
> > > > > +   modify it under the terms of the GNU Lesser General Public
> > > > > +   License as published by the Free Software Foundation; either
> > > > > +   version 2.1 of the License, or (at your option) any later version.
> > > > > +
> > > > > +   The GNU C Library is distributed in the hope that it will be useful,
> > > > > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > > > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > > > +   Lesser General Public License for more details.
> > > > > +
> > > > > +   You should have received a copy of the GNU Lesser General Public
> > > > > +   License along with the GNU C Library; if not, see
> > > > > +   <https://www.gnu.org/licenses/>.  */
> > > > > +
> > > > > +/* UNUSED. Exists purely as reference implementation.  */
> > > > > +
> > > > > +#include <isa-level.h>
> > > > > +
> > > > > +#if ISA_SHOULD_BUILD (4)
> > > > > +
> > > > > +# include <sysdep.h>
> > > > > +
> > > > > +# ifdef USE_AS_WCSRCHR
> > > > > +#  define CHAR_SIZE    4
> > > > > +#  define VPBROADCAST   vpbroadcastd
> > > > > +#  define VPCMP                vpcmpd
> > > > > +#  define VPMINU       vpminud
> > > > > +#  define VPTESTN      vptestnmd
> > > > > +# else
> > > > > +#  define CHAR_SIZE    1
> > > > > +#  define VPBROADCAST   vpbroadcastb
> > > > > +#  define VPCMP                vpcmpb
> > > > > +#  define VPMINU       vpminub
> > > > > +#  define VPTESTN      vptestnmb
> > > > > +# endif
> > > > > +
> > > > > +# define PAGE_SIZE     4096
> > > > > +# define CHAR_PER_VEC  (VEC_SIZE / CHAR_SIZE)
> > > > > +
> > > > > +# if VEC_SIZE == 64
> > > > > +#  define BLSMSK       blsmskq
> > > > > +#  define BSR          bsrq
> > > > > +#  define KMOV         kmovq
> > > > > +#  define KOR          korq
> > > > > +#  define KORTEST      kortestq
> > > > > +#  define R8           r8
> > > > > +#  define RAX          rax
> > > > > +#  define RCX          rcx
> > > > > +#  define RDX          rdx
> > > > > +#  define SHR          shrq
> > > > > +#  define TEXTSUFFIX   evex512
> > > > > +#  define VMM0         zmm16
> > > > > +#  define VMM1         zmm17
> > > > > +#  define VMM2         zmm18
> > > > > +#  define VMM3         zmm19
> > > > > +#  define VMM4         zmm20
> > > > > +#  define VMM5         zmm21
> > > > > +#  define VMOVA                vmovdqa64
> > > > > +#  define VMOVU                vmovdqu64
> > > > > +
> > > > > +# elif VEC_SIZE == 32
> > > > > +/* Currently Unused.  */
> > > > > +#  define BLSMSK       blsmskl
> > > > > +#  define BSR          bsrl
> > > > > +#  define KMOV         kmovd
> > > > > +#  define KOR          kord
> > > > > +#  define KORTEST      kortestd
> > > > > +#  define R8           r8d
> > > > > +#  define RAX          eax
> > > > > +#  define RCX          ecx
> > > > > +#  define RDX          edx
> > > > > +#  define SHR          shrl
> > > > > +#  define TEXTSUFFIX   evex256
> > > > > +#  define VMM0         ymm16
> > > > > +#  define VMM1         ymm17
> > > > > +#  define VMM2         ymm18
> > > > > +#  define VMM3         ymm19
> > > > > +#  define VMM4         ymm20
> > > > > +#  define VMM5         ymm21
> > > > > +#  define VMOVA                vmovdqa32
> > > > > +#  define VMOVU                vmovdqu32
> > > > > +# endif
> > > > > +
> > > > > +       .section .text.TEXTSUFFIX, "ax", @progbits
> > > > > +/* Aligning entry point to 64 byte, provides better performance for
> > > > > +   one vector length string.  */
> > > > > +ENTRY_P2ALIGN (STRRCHR, 6)
> > > > > +
> > > > > +       /* Broadcast CHAR to VMM0.  */
> > > > > +       VPBROADCAST %esi, %VMM0
> > > > > +       movl    %edi, %eax
> > > > > +       andl    $(PAGE_SIZE - 1), %eax
> > > > > +       cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
> > > > > +       ja      L(page_cross)
> > > > > +
> > > > > +L(page_cross_continue):
> > > > > +       /* Compare [w]char for null, mask bit will be set for match.  */
> > > > > +       VMOVU   (%rdi), %VMM1
> > > > > +
> > > > > +       VPTESTN %VMM1, %VMM1, %k1
> > > > > +       KMOV    %k1, %RCX
> > > > > +       test    %RCX, %RCX
> > > > > +       jz      L(align_more)
> > > > > +
> > > > > +       VPCMP   $0, %VMM1, %VMM0, %k0
> > > > > +       KMOV    %k0, %RAX
> > > > > +       BLSMSK  %RCX, %RCX
> > > > > +       and     %RCX, %RAX
> > > > > +       jz      L(ret)
> > > > > +
> > > > > +       BSR     %RAX, %RAX
> > > > > +# ifdef USE_AS_WCSRCHR
> > > > > +       leaq    (%rdi, %rax, CHAR_SIZE), %rax
> > > > > +# else
> > > > > +       add     %rdi, %rax
> > > > > +# endif
> > > > > +L(ret):
> > > > > +       ret
> > > > > +
> > > > > +L(vector_x2_end):
> > > > > +       VPCMP   $0, %VMM2, %VMM0, %k2
> > > > > +       KMOV    %k2, %RAX
> > > > > +       BLSMSK  %RCX, %RCX
> > > > > +       and     %RCX, %RAX
> > > > > +       jz      L(vector_x1_ret)
> > > > > +
> > > > > +       BSR     %RAX, %RAX
> > > > > +       leaq    (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
> > > > > +       ret
> > > > > +
> > > > > +       /* Check the first vector at very last to look for match.  */
> > > > > +L(vector_x1_ret):
> > > > > +       VPCMP   $0, %VMM1, %VMM0, %k2
> > > > > +       KMOV    %k2, %RAX
> > > > > +       test    %RAX, %RAX
> > > > > +       jz      L(ret)
> > > > > +
> > > > > +       BSR     %RAX, %RAX
> > > > > +# ifdef USE_AS_WCSRCHR
> > > > > +       leaq    (%rsi, %rax, CHAR_SIZE), %rax
> > > > > +# else
> > > > > +       add     %rsi, %rax
> > > > > +# endif
> > > > > +       ret
> > > > > +
> > > > > +L(align_more):
> > > > > +       /* Zero r8 to store match result.  */
> > > > > +       xorq    %r8, %r8
> > > > > +       /* Save pointer of first vector, in case if no match found.  */
> > > > > +       movq    %rdi, %rsi
> > > > > +       /* Align pointer to vector size.  */
> > > > > +       andq    $-VEC_SIZE, %rdi
> > > > > +       /* Loop unroll 2 times for 2 vector loop.  */
> > > > > +       VMOVA   (VEC_SIZE)(%rdi), %VMM2
> > > > > +       VPTESTN %VMM2, %VMM2, %k0
> > > > > +       KMOV    %k0, %RCX
> > > > > +       test    %RCX, %RCX
> > > > > +       jnz     L(vector_x2_end)
> > > > > +
> > > > > +       /* Save pointer of second vector, in case if no match
> > > > > +          found.  */
> > > > > +       movq    %rdi, %r9
> > > > > +       /* Align address to VEC_SIZE * 2 for loop.  */
> > > > > +       andq    $-(VEC_SIZE * 2), %rdi
> > > > > +
> > > > > +       .p2align 4,,11
> > > > > +L(loop):
> > > > > +       /* 2 vector loop, as it provide better performance as compared
> > > > > +          to 4 vector loop.  */
> > > > > +       VMOVA   (VEC_SIZE * 2)(%rdi), %VMM3
> > > > > +       VMOVA   (VEC_SIZE * 3)(%rdi), %VMM4
> > > > > +       VPCMP   $0, %VMM3, %VMM0, %k1
> > > > > +       VPCMP   $0, %VMM4, %VMM0, %k2
> > > > > +       VPMINU  %VMM3, %VMM4, %VMM5
> > > > > +       VPTESTN %VMM5, %VMM5, %k0
> > > > > +       KOR     %k1, %k2, %k3
> > > > > +       subq    $-(VEC_SIZE * 2), %rdi
> > > > > +       /* If k0 and k3 zero, match and end of string not found.  */
> > > > > +       KORTEST %k0, %k3
> > > > > +       jz      L(loop)
> > > > > +
> > > > > +       /* If k0 is non zero, end of string found.  */
> > > > > +       KORTEST %k0, %k0
> > > > > +       jnz     L(endloop)
> > > > > +
> > > > > +       /* A match found, it need to be stored in r8 before loop
> > > > > +          continue.  */
> > > > > +       /* Check second vector first.  */
> > > > > +       KMOV    %k2, %RDX
> > > > > +       test    %RDX, %RDX
> > > > > +       jz      L(loop_vec_x3_ret)
> > > > > +
> > > > > +       BSR     %RDX, %RDX
> > > > > +       leaq    (VEC_SIZE)(%rdi, %rdx, CHAR_SIZE), %r8
> > > > > +       jmp     L(loop)
> > > > > +
> > > > > +       /* If second vector doesn't have match, first vector must
> > > > > +          have match.  */
> > > > > +L(loop_vec_x3_ret):
> > > > > +       KMOV    %k1, %R8
> > > > > +       BSR     %R8, %R8
> > > > > +# ifdef USE_AS_WCSRCHR
> > > > > +       leaq    (%rdi, %r8, CHAR_SIZE), %r8
> > > > > +# else
> > > > > +       add     %rdi, %r8
> > > > > +# endif
> > > > > +       jmp     L(loop)
> > > > > +
> > > > > +L(endloop):
> > > > > +       /* Check if string end in first loop vector.  */
> > > > > +       VPTESTN %VMM3, %VMM3, %k0
> > > > > +       KMOV    %k0, %RCX
> > > > > +       test    %RCX, %RCX
> > > > > +       jnz     L(vector_x3_end)
> > > > > +
> > > > > +       /* Check if it has match in first loop vector.  */
> > > > > +       KMOV    %k1, %RAX
> > > > > +       test    %RAX, %RAX
> > > > > +       jz      L(vector_x4_end)
> > > > > +
> > > > > +       BSR     %RAX, %RAX
> > > > > +       leaq    (%rdi, %rax, CHAR_SIZE), %r8
> > > > > +
> > > > > +       /* String must end in second loop vector.  */
> > > > > +L(vector_x4_end):
> > > > > +       VPTESTN %VMM4, %VMM4, %k0
> > > > > +       KMOV    %k0, %RCX
> > > > > +       KMOV    %k2, %RAX
> > > > > +       BLSMSK  %RCX, %RCX
> > > > > +       /* Check if it has match in second loop vector.  */
> > > > > +       and     %RCX, %RAX
> > > > > +       jz      L(check_last_match)
> > > > > +
> > > > > +       BSR     %RAX, %RAX
> > > > > +       leaq    (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
> > > > > +       ret
> > > > > +
> > > > > +       /* String end in first loop vector.  */
> > > > > +L(vector_x3_end):
> > > > > +       KMOV    %k1, %RAX
> > > > > +       BLSMSK  %RCX, %RCX
> > > > > +       /* Check if it has match in second loop vector.  */
> > > > > +       and     %RCX, %RAX
> > > > > +       jz      L(check_last_match)
> > > > > +
> > > > > +       BSR     %RAX, %RAX
> > > > > +       leaq    (%rdi, %rax, CHAR_SIZE), %rax
> > > > > +       ret
> > > > > +
> > > > > +       /* No match in first and second loop vector.  */
> > > > > +L(check_last_match):
> > > > > +       /* Check if any match recorded in r8.  */
> > > > > +       test    %r8, %r8
> > > > > +       jz      L(vector_x2_ret)
> > > > > +       movq    %r8, %rax
> > > > > +       ret
> > > > > +
> > > > > +       /* No match recorded in r8. Check the second saved vector
> > > > > +          in begining.  */
> > > > > +L(vector_x2_ret):
> > > > > +       VPCMP   $0, %VMM2, %VMM0, %k2
> > > > > +       KMOV    %k2, %RAX
> > > > > +       test    %RAX, %RAX
> > > > > +       jz      L(vector_x1_ret)
> > > > > +
> > > > > +       /* Match found in the second saved vector.  */
> > > > > +       BSR     %RAX, %RAX
> > > > > +       leaq    (VEC_SIZE)(%r9, %rax, CHAR_SIZE), %rax
> > > > > +       ret
> > > > > +
> > > > > +L(page_cross):
> > > > > +       movl    %eax, %ecx
> > > > > +# ifdef USE_AS_WCSRCHR
> > > > > +       /* Calculate number of compare result bits to be skipped for
> > > > > +          wide string alignment adjustment.  */
> > > > > +       andl    $(VEC_SIZE - 1), %ecx
> > > > > +       sarl    $2, %ecx
> > > > > +# endif
> > > > > +       /* ecx contains number of w[char] to be skipped as a result
> > > > > +          of address alignment.  */
> > > > > +       xorq    %rdi, %rax
> > > > > +       VMOVA   (PAGE_SIZE - VEC_SIZE)(%rax), %VMM1
> > > > > +
> > > > > +       VPTESTN %VMM1, %VMM1, %k1
> > > > > +       KMOV    %k1, %RAX
> > > > > +       SHR     %cl, %RAX
> > > > > +       jz      L(page_cross_continue)
> > > > > +       VPCMP   $0, %VMM1, %VMM0, %k0
> > > > > +       KMOV    %k0, %RDX
> > > > > +       SHR     %cl, %RDX
> > > > > +       BLSMSK  %RAX, %RAX
> > > > > +       and     %RDX, %RAX
> > > > > +       jz      L(ret)
> > > > > +       BSR     %RAX, %RAX
> > > > > +# ifdef USE_AS_WCSRCHR
> > > > > +       leaq    (%rdi, %rax, CHAR_SIZE), %rax
> > > > > +# else
> > > > > +       add     %rdi, %rax
> > > > > +# endif
> > > > > +
> > > > > +       ret
> > > > > +END (STRRCHR)
> > > > > +#endif
> > > > > diff --git a/sysdeps/x86_64/multiarch/strrchr-evex512.S b/sysdeps/x86_64/multiarch/strrchr-evex512.S
> > > > > new file mode 100644
> > > > > index 0000000000..f880848e09
> > > > > --- /dev/null
> > > > > +++ b/sysdeps/x86_64/multiarch/strrchr-evex512.S
> > > > > @@ -0,0 +1,7 @@
> > > > > +# ifndef STRRCHR
> > > > > +#  define STRRCHR      __strrchr_evex512
> > > > > +# endif
> > > > > +
> > > > > +#define VEC_SIZE       64
> > > > > +
> > > > > +#include "strrchr-evex-base.S"
> > > > > diff --git a/sysdeps/x86_64/multiarch/wcsrchr-evex512.S b/sysdeps/x86_64/multiarch/wcsrchr-evex512.S
> > > > > new file mode 100644
> > > > > index 0000000000..65b7710b22
> > > > > --- /dev/null
> > > > > +++ b/sysdeps/x86_64/multiarch/wcsrchr-evex512.S
> > > > > @@ -0,0 +1,8 @@
> > > > > +#ifndef WCSRCHR
> > > > > +# define WCSRCHR       __wcsrchr_evex512
> > > > > +#endif
> > > > > +
> > > > > +#define STRRCHR        WCSRCHR
> > > > > +#define USE_AS_WCSRCHR 1
> > > > > +
> > > > > +#include "strrchr-evex512.S"
> > > > > --
> > > > > 2.36.1
> > > > >
> >
> > ping
>
> Regarding this patch along with the corresponding memchr and strchr
> ones, I would prefer to try and implement the ZMM version alongside
> the YMM similar to what we do in memset/memmove.

This is a question of methodology. Everyone has different ways of
implementing things.  I don't think it's fair to expect everyone to
follow the same existing methodology.

>
> Since all/nearly all of the instructions are the same this shouldn't
> be too difficult with the `VEC(n)` macros.
>

VEC(n) requires following 3 extra levels of indirection just to
understand which registers are actually used.

memrchr-evex.S->evex256-vecs.h->evex-vecs-common.h->vec-macros.h

> Examples are:
> https://gitlab.com/x86-glibc/glibc/-/tree/users/goldsteinn/memcmp-evex512
>
> and there is a congruent patch to strlen to do the same (still in the
> works):
> https://gitlab.com/x86-glibc/glibc/-/tree/users/goldsteinn/evex512
>
> There are many good ideas in these patches that I believe would also
> apply to the YMM implementations and think it would be best to ensure
> both files are as close to optimal as we can get them as opposed to
> adding yet another bespoke implementation we need to maintain / keep
> optimized.
>

I don't think it's a good idea to centralize when the entire ecosystem is
moving towards modularization and inclusion.

Also, it will not encourage new contributors if the good ideas are
taken from a patch and the patch itself is discarded just because it
uses a different implementation methodology.

> Can you try and integrate this and the memchr/strchr implementations
> similar to how we do memmove/memset?

Why? I don't see any reason for that.

^ permalink raw reply	[flat|nested] 7+ messages in thread

* Re: [PATCH] x86_64: Implement evex512 version of strrchr and wcsrchr
  2022-09-30 18:49         ` Sunil Pandey
@ 2022-09-30 19:09           ` Noah Goldstein
  0 siblings, 0 replies; 7+ messages in thread
From: Noah Goldstein @ 2022-09-30 19:09 UTC (permalink / raw)
  To: Sunil Pandey; +Cc: GNU C Library

On Fri, Sep 30, 2022 at 11:49 AM Sunil Pandey <skpgkp2@gmail.com> wrote:
>
> On Wed, Sep 28, 2022 at 9:06 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > On Wed, Sep 28, 2022 at 8:42 PM Sunil Pandey <skpgkp2@gmail.com> wrote:
> > >
> > > On Thu, Sep 22, 2022 at 8:57 PM Sunil Pandey <skpgkp2@gmail.com> wrote:
> > > >
> > > > Microbenchmark data collected on: Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz
> > > >
> > > >
> > > > On Wed, Sep 21, 2022 at 5:50 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > > > >
> > > > > On Wed, Sep 21, 2022 at 5:24 PM Sunil K Pandey via Libc-alpha
> > > > > <libc-alpha@sourceware.org> wrote:
> > > > > >
> > > > > > This patch implements following evex512 version of string functions.
> > > > > > evex512 version takes up to 30% less cycle as compared to evex,
> > > > > > depending on length and alignment.
> > > > > >
> > > > >
> > > > > Please attach benchmark numbers.
> > > > >
> > > > > > - strrchr function using 512 bit vectors.
> > > > > > - wcsrchr function using 512 bit vectors.
> > > > > >
> > > > > > Code size data:
> > > > > >
> > > > > > strrchr-evex.o          833 byte
> > > > > > strrchr-evex512.o       573 byte (-31%)
> > > > > >
> > > > > > wcsrchr-evex.o          836 byte
> > > > > > wcsrchr-evex512.o       581 byte (-31%)
> > > > > >
> > > > > > Placeholder function, not used by any processor at the moment.
> > > > > > ---
> > > > > >  sysdeps/x86_64/multiarch/Makefile            |   2 +
> > > > > >  sysdeps/x86_64/multiarch/ifunc-impl-list.c   |  10 +
> > > > > >  sysdeps/x86_64/multiarch/strrchr-evex-base.S | 307 +++++++++++++++++++
> > > > > >  sysdeps/x86_64/multiarch/strrchr-evex512.S   |   7 +
> > > > > >  sysdeps/x86_64/multiarch/wcsrchr-evex512.S   |   8 +
> > > > > >  5 files changed, 334 insertions(+)
> > > > > >  create mode 100644 sysdeps/x86_64/multiarch/strrchr-evex-base.S
> > > > > >  create mode 100644 sysdeps/x86_64/multiarch/strrchr-evex512.S
> > > > > >  create mode 100644 sysdeps/x86_64/multiarch/wcsrchr-evex512.S
> > > > > >
> > > > > > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> > > > > > index df4601c294..6a275f1c3d 100644
> > > > > > --- a/sysdeps/x86_64/multiarch/Makefile
> > > > > > +++ b/sysdeps/x86_64/multiarch/Makefile
> > > > > > @@ -110,6 +110,7 @@ sysdep_routines += \
> > > > > >    strrchr-avx2 \
> > > > > >    strrchr-avx2-rtm \
> > > > > >    strrchr-evex \
> > > > > > +  strrchr-evex512 \
> > > > > >    strrchr-sse2 \
> > > > > >    strspn-sse4 \
> > > > > >    strstr-avx512 \
> > > > > > @@ -152,6 +153,7 @@ sysdep_routines += \
> > > > > >    wcsrchr-avx2 \
> > > > > >    wcsrchr-avx2-rtm \
> > > > > >    wcsrchr-evex \
> > > > > > +  wcsrchr-evex512 \
> > > > > >    wcsrchr-sse2 \
> > > > > >    wmemchr-avx2 \
> > > > > >    wmemchr-avx2-rtm \
> > > > > > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > > > > index a71444eccb..26c941023a 100644
> > > > > > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > > > > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > > > > > @@ -564,6 +564,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > > > > >                                      (CPU_FEATURE_USABLE (AVX512VL)
> > > > > >                                       && CPU_FEATURE_USABLE (AVX512BW)),
> > > > > >                                      __strrchr_evex)
> > > > > > +             X86_IFUNC_IMPL_ADD_V4 (array, i, strrchr,
> > > > > > +                                    (CPU_FEATURE_USABLE (AVX512VL)
> > > > > > +                                     && CPU_FEATURE_USABLE (AVX512BW)
> > > > > > +                                     && CPU_FEATURE_USABLE (BMI2)),
> > > > > > +                                    __strrchr_evex512)
> > > > > >               X86_IFUNC_IMPL_ADD_V3 (array, i, strrchr,
> > > > > >                                      CPU_FEATURE_USABLE (AVX2),
> > > > > >                                      __strrchr_avx2)
> > > > > > @@ -775,6 +780,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> > > > > >                                       && CPU_FEATURE_USABLE (AVX512BW)
> > > > > >                                       && CPU_FEATURE_USABLE (BMI2)),
> > > > > >                                      __wcsrchr_evex)
> > > > > > +             X86_IFUNC_IMPL_ADD_V4 (array, i, wcsrchr,
> > > > > > +                                    (CPU_FEATURE_USABLE (AVX512VL)
> > > > > > +                                     && CPU_FEATURE_USABLE (AVX512BW)
> > > > > > +                                     && CPU_FEATURE_USABLE (BMI2)),
> > > > > > +                                    __wcsrchr_evex512)
> > > > > >               X86_IFUNC_IMPL_ADD_V3 (array, i, wcsrchr,
> > > > > >                                      CPU_FEATURE_USABLE (AVX2),
> > > > > >                                      __wcsrchr_avx2)
> > > > > > diff --git a/sysdeps/x86_64/multiarch/strrchr-evex-base.S b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
> > > > > > new file mode 100644
> > > > > > index 0000000000..e937cb193c
> > > > > > --- /dev/null
> > > > > > +++ b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
> > > > > > @@ -0,0 +1,307 @@
> > > > > > +/* Placeholder function, not used by any processor at the moment.
> > > > > > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > > > > > +   This file is part of the GNU C Library.
> > > > > > +
> > > > > > +   The GNU C Library is free software; you can redistribute it and/or
> > > > > > +   modify it under the terms of the GNU Lesser General Public
> > > > > > +   License as published by the Free Software Foundation; either
> > > > > > +   version 2.1 of the License, or (at your option) any later version.
> > > > > > +
> > > > > > +   The GNU C Library is distributed in the hope that it will be useful,
> > > > > > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > > > > > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > > > > > +   Lesser General Public License for more details.
> > > > > > +
> > > > > > +   You should have received a copy of the GNU Lesser General Public
> > > > > > +   License along with the GNU C Library; if not, see
> > > > > > +   <https://www.gnu.org/licenses/>.  */
> > > > > > +
> > > > > > +/* UNUSED. Exists purely as reference implementation.  */
> > > > > > +
> > > > > > +#include <isa-level.h>
> > > > > > +
> > > > > > +#if ISA_SHOULD_BUILD (4)
> > > > > > +
> > > > > > +# include <sysdep.h>
> > > > > > +
> > > > > > +# ifdef USE_AS_WCSRCHR
> > > > > > +#  define CHAR_SIZE    4
> > > > > > +#  define VPBROADCAST   vpbroadcastd
> > > > > > +#  define VPCMP                vpcmpd
> > > > > > +#  define VPMINU       vpminud
> > > > > > +#  define VPTESTN      vptestnmd
> > > > > > +# else
> > > > > > +#  define CHAR_SIZE    1
> > > > > > +#  define VPBROADCAST   vpbroadcastb
> > > > > > +#  define VPCMP                vpcmpb
> > > > > > +#  define VPMINU       vpminub
> > > > > > +#  define VPTESTN      vptestnmb
> > > > > > +# endif
> > > > > > +
> > > > > > +# define PAGE_SIZE     4096
> > > > > > +# define CHAR_PER_VEC  (VEC_SIZE / CHAR_SIZE)
> > > > > > +
> > > > > > +# if VEC_SIZE == 64
> > > > > > +#  define BLSMSK       blsmskq
> > > > > > +#  define BSR          bsrq
> > > > > > +#  define KMOV         kmovq
> > > > > > +#  define KOR          korq
> > > > > > +#  define KORTEST      kortestq
> > > > > > +#  define R8           r8
> > > > > > +#  define RAX          rax
> > > > > > +#  define RCX          rcx
> > > > > > +#  define RDX          rdx
> > > > > > +#  define SHR          shrq
> > > > > > +#  define TEXTSUFFIX   evex512
> > > > > > +#  define VMM0         zmm16
> > > > > > +#  define VMM1         zmm17
> > > > > > +#  define VMM2         zmm18
> > > > > > +#  define VMM3         zmm19
> > > > > > +#  define VMM4         zmm20
> > > > > > +#  define VMM5         zmm21
> > > > > > +#  define VMOVA                vmovdqa64
> > > > > > +#  define VMOVU                vmovdqu64
> > > > > > +
> > > > > > +# elif VEC_SIZE == 32
> > > > > > +/* Currently Unused.  */
> > > > > > +#  define BLSMSK       blsmskl
> > > > > > +#  define BSR          bsrl
> > > > > > +#  define KMOV         kmovd
> > > > > > +#  define KOR          kord
> > > > > > +#  define KORTEST      kortestd
> > > > > > +#  define R8           r8d
> > > > > > +#  define RAX          eax
> > > > > > +#  define RCX          ecx
> > > > > > +#  define RDX          edx
> > > > > > +#  define SHR          shrl
> > > > > > +#  define TEXTSUFFIX   evex256
> > > > > > +#  define VMM0         ymm16
> > > > > > +#  define VMM1         ymm17
> > > > > > +#  define VMM2         ymm18
> > > > > > +#  define VMM3         ymm19
> > > > > > +#  define VMM4         ymm20
> > > > > > +#  define VMM5         ymm21
> > > > > > +#  define VMOVA                vmovdqa32
> > > > > > +#  define VMOVU                vmovdqu32
> > > > > > +# endif
> > > > > > +
> > > > > > +       .section .text.TEXTSUFFIX, "ax", @progbits
> > > > > > +/* Aligning entry point to 64 byte, provides better performance for
> > > > > > +   one vector length string.  */
> > > > > > +ENTRY_P2ALIGN (STRRCHR, 6)
> > > > > > +
> > > > > > +       /* Broadcast CHAR to VMM0.  */
> > > > > > +       VPBROADCAST %esi, %VMM0
> > > > > > +       movl    %edi, %eax
> > > > > > +       andl    $(PAGE_SIZE - 1), %eax
> > > > > > +       cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
> > > > > > +       ja      L(page_cross)
> > > > > > +
> > > > > > +L(page_cross_continue):
> > > > > > +       /* Compare [w]char for null, mask bit will be set for match.  */
> > > > > > +       VMOVU   (%rdi), %VMM1
> > > > > > +
> > > > > > +       VPTESTN %VMM1, %VMM1, %k1
> > > > > > +       KMOV    %k1, %RCX
> > > > > > +       test    %RCX, %RCX
> > > > > > +       jz      L(align_more)
> > > > > > +
> > > > > > +       VPCMP   $0, %VMM1, %VMM0, %k0
> > > > > > +       KMOV    %k0, %RAX
> > > > > > +       BLSMSK  %RCX, %RCX
> > > > > > +       and     %RCX, %RAX
> > > > > > +       jz      L(ret)
> > > > > > +
> > > > > > +       BSR     %RAX, %RAX
> > > > > > +# ifdef USE_AS_WCSRCHR
> > > > > > +       leaq    (%rdi, %rax, CHAR_SIZE), %rax
> > > > > > +# else
> > > > > > +       add     %rdi, %rax
> > > > > > +# endif
> > > > > > +L(ret):
> > > > > > +       ret
> > > > > > +
> > > > > > +L(vector_x2_end):
> > > > > > +       VPCMP   $0, %VMM2, %VMM0, %k2
> > > > > > +       KMOV    %k2, %RAX
> > > > > > +       BLSMSK  %RCX, %RCX
> > > > > > +       and     %RCX, %RAX
> > > > > > +       jz      L(vector_x1_ret)
> > > > > > +
> > > > > > +       BSR     %RAX, %RAX
> > > > > > +       leaq    (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
> > > > > > +       ret
> > > > > > +
> > > > > > +       /* Check the first vector at very last to look for match.  */
> > > > > > +L(vector_x1_ret):
> > > > > > +       VPCMP   $0, %VMM1, %VMM0, %k2
> > > > > > +       KMOV    %k2, %RAX
> > > > > > +       test    %RAX, %RAX
> > > > > > +       jz      L(ret)
> > > > > > +
> > > > > > +       BSR     %RAX, %RAX
> > > > > > +# ifdef USE_AS_WCSRCHR
> > > > > > +       leaq    (%rsi, %rax, CHAR_SIZE), %rax
> > > > > > +# else
> > > > > > +       add     %rsi, %rax
> > > > > > +# endif
> > > > > > +       ret
> > > > > > +
> > > > > > +L(align_more):
> > > > > > +       /* Zero r8 to store match result.  */
> > > > > > +       xorq    %r8, %r8
> > > > > > +       /* Save pointer of first vector, in case if no match found.  */
> > > > > > +       movq    %rdi, %rsi
> > > > > > +       /* Align pointer to vector size.  */
> > > > > > +       andq    $-VEC_SIZE, %rdi
> > > > > > +       /* Loop unroll 2 times for 2 vector loop.  */
> > > > > > +       VMOVA   (VEC_SIZE)(%rdi), %VMM2
> > > > > > +       VPTESTN %VMM2, %VMM2, %k0
> > > > > > +       KMOV    %k0, %RCX
> > > > > > +       test    %RCX, %RCX
> > > > > > +       jnz     L(vector_x2_end)
> > > > > > +
> > > > > > +       /* Save pointer of second vector, in case if no match
> > > > > > +          found.  */
> > > > > > +       movq    %rdi, %r9
> > > > > > +       /* Align address to VEC_SIZE * 2 for loop.  */
> > > > > > +       andq    $-(VEC_SIZE * 2), %rdi
> > > > > > +
> > > > > > +       .p2align 4,,11
> > > > > > +L(loop):
> > > > > > +       /* 2 vector loop, as it provide better performance as compared
> > > > > > +          to 4 vector loop.  */
> > > > > > +       VMOVA   (VEC_SIZE * 2)(%rdi), %VMM3
> > > > > > +       VMOVA   (VEC_SIZE * 3)(%rdi), %VMM4
> > > > > > +       VPCMP   $0, %VMM3, %VMM0, %k1
> > > > > > +       VPCMP   $0, %VMM4, %VMM0, %k2
> > > > > > +       VPMINU  %VMM3, %VMM4, %VMM5
> > > > > > +       VPTESTN %VMM5, %VMM5, %k0
> > > > > > +       KOR     %k1, %k2, %k3
> > > > > > +       subq    $-(VEC_SIZE * 2), %rdi
> > > > > > +       /* If k0 and k3 zero, match and end of string not found.  */
> > > > > > +       KORTEST %k0, %k3
> > > > > > +       jz      L(loop)
> > > > > > +
> > > > > > +       /* If k0 is non zero, end of string found.  */
> > > > > > +       KORTEST %k0, %k0
> > > > > > +       jnz     L(endloop)
> > > > > > +
> > > > > > +       /* A match found, it need to be stored in r8 before loop
> > > > > > +          continue.  */
> > > > > > +       /* Check second vector first.  */
> > > > > > +       KMOV    %k2, %RDX
> > > > > > +       test    %RDX, %RDX
> > > > > > +       jz      L(loop_vec_x3_ret)
> > > > > > +
> > > > > > +       BSR     %RDX, %RDX
> > > > > > +       leaq    (VEC_SIZE)(%rdi, %rdx, CHAR_SIZE), %r8
> > > > > > +       jmp     L(loop)
> > > > > > +
> > > > > > +       /* If second vector doesn't have match, first vector must
> > > > > > +          have match.  */
> > > > > > +L(loop_vec_x3_ret):
> > > > > > +       KMOV    %k1, %R8
> > > > > > +       BSR     %R8, %R8
> > > > > > +# ifdef USE_AS_WCSRCHR
> > > > > > +       leaq    (%rdi, %r8, CHAR_SIZE), %r8
> > > > > > +# else
> > > > > > +       add     %rdi, %r8
> > > > > > +# endif
> > > > > > +       jmp     L(loop)
> > > > > > +
> > > > > > +L(endloop):
> > > > > > +       /* Check if string end in first loop vector.  */
> > > > > > +       VPTESTN %VMM3, %VMM3, %k0
> > > > > > +       KMOV    %k0, %RCX
> > > > > > +       test    %RCX, %RCX
> > > > > > +       jnz     L(vector_x3_end)
> > > > > > +
> > > > > > +       /* Check if it has match in first loop vector.  */
> > > > > > +       KMOV    %k1, %RAX
> > > > > > +       test    %RAX, %RAX
> > > > > > +       jz      L(vector_x4_end)
> > > > > > +
> > > > > > +       BSR     %RAX, %RAX
> > > > > > +       leaq    (%rdi, %rax, CHAR_SIZE), %r8
> > > > > > +
> > > > > > +       /* String must end in second loop vector.  */
> > > > > > +L(vector_x4_end):
> > > > > > +       VPTESTN %VMM4, %VMM4, %k0
> > > > > > +       KMOV    %k0, %RCX
> > > > > > +       KMOV    %k2, %RAX
> > > > > > +       BLSMSK  %RCX, %RCX
> > > > > > +       /* Check if it has match in second loop vector.  */
> > > > > > +       and     %RCX, %RAX
> > > > > > +       jz      L(check_last_match)
> > > > > > +
> > > > > > +       BSR     %RAX, %RAX
> > > > > > +       leaq    (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
> > > > > > +       ret
> > > > > > +
> > > > > > +       /* String end in first loop vector.  */
> > > > > > +L(vector_x3_end):
> > > > > > +       KMOV    %k1, %RAX
> > > > > > +       BLSMSK  %RCX, %RCX
> > > > > > +       /* Check if it has match in second loop vector.  */
> > > > > > +       and     %RCX, %RAX
> > > > > > +       jz      L(check_last_match)
> > > > > > +
> > > > > > +       BSR     %RAX, %RAX
> > > > > > +       leaq    (%rdi, %rax, CHAR_SIZE), %rax
> > > > > > +       ret
> > > > > > +
> > > > > > +       /* No match in first and second loop vector.  */
> > > > > > +L(check_last_match):
> > > > > > +       /* Check if any match recorded in r8.  */
> > > > > > +       test    %r8, %r8
> > > > > > +       jz      L(vector_x2_ret)
> > > > > > +       movq    %r8, %rax
> > > > > > +       ret
> > > > > > +
> > > > > > +       /* No match recorded in r8. Check the second saved vector
> > > > > > +          in begining.  */
> > > > > > +L(vector_x2_ret):
> > > > > > +       VPCMP   $0, %VMM2, %VMM0, %k2
> > > > > > +       KMOV    %k2, %RAX
> > > > > > +       test    %RAX, %RAX
> > > > > > +       jz      L(vector_x1_ret)
> > > > > > +
> > > > > > +       /* Match found in the second saved vector.  */
> > > > > > +       BSR     %RAX, %RAX
> > > > > > +       leaq    (VEC_SIZE)(%r9, %rax, CHAR_SIZE), %rax
> > > > > > +       ret
> > > > > > +
> > > > > > +L(page_cross):
> > > > > > +       movl    %eax, %ecx
> > > > > > +# ifdef USE_AS_WCSRCHR
> > > > > > +       /* Calculate number of compare result bits to be skipped for
> > > > > > +          wide string alignment adjustment.  */
> > > > > > +       andl    $(VEC_SIZE - 1), %ecx
> > > > > > +       sarl    $2, %ecx
> > > > > > +# endif
> > > > > > +       /* ecx contains number of w[char] to be skipped as a result
> > > > > > +          of address alignment.  */
> > > > > > +       xorq    %rdi, %rax
> > > > > > +       VMOVA   (PAGE_SIZE - VEC_SIZE)(%rax), %VMM1
> > > > > > +
> > > > > > +       VPTESTN %VMM1, %VMM1, %k1
> > > > > > +       KMOV    %k1, %RAX
> > > > > > +       SHR     %cl, %RAX
> > > > > > +       jz      L(page_cross_continue)
> > > > > > +       VPCMP   $0, %VMM1, %VMM0, %k0
> > > > > > +       KMOV    %k0, %RDX
> > > > > > +       SHR     %cl, %RDX
> > > > > > +       BLSMSK  %RAX, %RAX
> > > > > > +       and     %RDX, %RAX
> > > > > > +       jz      L(ret)
> > > > > > +       BSR     %RAX, %RAX
> > > > > > +# ifdef USE_AS_WCSRCHR
> > > > > > +       leaq    (%rdi, %rax, CHAR_SIZE), %rax
> > > > > > +# else
> > > > > > +       add     %rdi, %rax
> > > > > > +# endif
> > > > > > +
> > > > > > +       ret
> > > > > > +END (STRRCHR)
> > > > > > +#endif
> > > > > > diff --git a/sysdeps/x86_64/multiarch/strrchr-evex512.S b/sysdeps/x86_64/multiarch/strrchr-evex512.S
> > > > > > new file mode 100644
> > > > > > index 0000000000..f880848e09
> > > > > > --- /dev/null
> > > > > > +++ b/sysdeps/x86_64/multiarch/strrchr-evex512.S
> > > > > > @@ -0,0 +1,7 @@
> > > > > > +# ifndef STRRCHR
> > > > > > +#  define STRRCHR      __strrchr_evex512
> > > > > > +# endif
> > > > > > +
> > > > > > +#define VEC_SIZE       64
> > > > > > +
> > > > > > +#include "strrchr-evex-base.S"
> > > > > > diff --git a/sysdeps/x86_64/multiarch/wcsrchr-evex512.S b/sysdeps/x86_64/multiarch/wcsrchr-evex512.S
> > > > > > new file mode 100644
> > > > > > index 0000000000..65b7710b22
> > > > > > --- /dev/null
> > > > > > +++ b/sysdeps/x86_64/multiarch/wcsrchr-evex512.S
> > > > > > @@ -0,0 +1,8 @@
> > > > > > +#ifndef WCSRCHR
> > > > > > +# define WCSRCHR       __wcsrchr_evex512
> > > > > > +#endif
> > > > > > +
> > > > > > +#define STRRCHR        WCSRCHR
> > > > > > +#define USE_AS_WCSRCHR 1
> > > > > > +
> > > > > > +#include "strrchr-evex512.S"
> > > > > > --
> > > > > > 2.36.1
> > > > > >
> > >
> > > ping
> >
> > Regarding this patch along with the corresponding memchr and strchr
> > ones, I would prefer to try and implement the ZMM version alongside
> > the YMM similar to what we do in memset/memmove.
>
> This is a question of methodology. Everyone has a different way of
> implementing things.  I don't think it's fair to expect everyone to
> follow the same existing methodology.
>
> >
> > Since all/nearly all of the instructions are the same this shouldn't
> > be too difficult with the `VEC(n)` macros.
> >
>
> VEC(n) uses 3 levels of extra indirection to simply understand what
> actual registers are used.
>
> memrchr-evex.S->evex256-vecs.h->evex-vecs-common.h->vec-macros.h

IMO it beats re-copying the upper-cased GPR and VMM macros into each file.

>
> > Examples are:
> > https://gitlab.com/x86-glibc/glibc/-/tree/users/goldsteinn/memcmp-evex512
> >
> > and there is a congruent patch to strlen to do the same (still in the
> > works):
> > https://gitlab.com/x86-glibc/glibc/-/tree/users/goldsteinn/evex512
> >
> > There are many good ideas in these patches that I believe would also
> > apply to the YMM implementations and think it would be best to ensure
> > both files are as close to optimal as we can get them as opposed to
> > adding yet another bespoke implementation we need to maintain / keep
> > optimized.
> >
>
> I don't think it's a good idea to centralize when the entire ecosystem is
>  moving towards modularization and inclusion.

Reusing code promotes modularity.

Tell me which is more modular?

template<typename T>
T max(T a, T b) {
 return a < b ? b : a;
}

or

max_int(int a, int b) {
....
}

max_long(long a, long b) {
...
}

?

>
> Also, it will not encourage new contributors if good ideas are taken
> from a patch and the patch itself is discarded just because it uses a
> different implementation methodology.

Is there a reason the evex512 implementation methodology doesn't
suit evex256 or vice versa?

They use almost exactly the same instructions. Apart from a few edge cases
where the evex256 version combines 2x GPR for a bit-scan, there seem to
be few places where the two can't share logic.

>
> > Can you try and integrate this and the memchr/strchr implementations
> > similar to how we do memmove/memset?
>
> Why? I don't see any reason for that.

The reasons are above.

^ permalink raw reply	[flat|nested] 7+ messages in thread

end of thread, other threads:[~2022-09-30 19:09 UTC | newest]

Thread overview: 7+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-09-22  0:24 [PATCH] x86_64: Implement evex512 version of strrchr and wcsrchr Sunil K Pandey
2022-09-22  0:50 ` Noah Goldstein
2022-09-23  3:57   ` Sunil Pandey
2022-09-29  3:42     ` Sunil Pandey
2022-09-29  4:06       ` Noah Goldstein
2022-09-30 18:49         ` Sunil Pandey
2022-09-30 19:09           ` Noah Goldstein

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).