* [PATCH v1] x86: Remove AVX512-VBMI2 instruction from strrchr-evex.S
@ 2022-10-20 17:26 Noah Goldstein
2022-10-20 17:59 ` H.J. Lu
From: Noah Goldstein @ 2022-10-20 17:26 UTC
To: libc-alpha; +Cc: goldstein.w.n, hjl.tools, carlos
commit b412213eee0afa3b51dfe92b736dfc7c981309f5
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Tue Oct 18 17:44:07 2022 -0700
x86: Optimize strrchr-evex.S and implement with VMM headers
Added `vpcompress{b|d}` to the page-cross logic. `vpcompressb` is an
AVX512-VBMI2 instruction, which is not supported on SKX. Since the
page-cross logic is relatively cold and the benefit is minimal,
revert the page-cross case back to the old logic, which is supported
on SKX.
Tested on x86-64.
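
For reference, SKX (Skylake Server) implements AVX512{F,CD,BW,DQ,VL} but
not AVX512-VBMI2, so `vpcompressb` raises #UD there while `vpcompressd`
(plain AVX512F+VL) is fine. AVX512-VBMI2 is advertised in CPUID leaf 7,
subleaf 0, ECX bit 6. A minimal stand-alone probe -- an illustrative
sketch, not part of this patch -- could look like:

#include <cpuid.h>
#include <stdio.h>

/* AVX512-VBMI2 is CPUID.(EAX=07H, ECX=0):ECX[bit 6].  */
static int
has_avx512_vbmi2 (void)
{
  unsigned int eax, ebx, ecx, edx;
  if (!__get_cpuid_count (7, 0, &eax, &ebx, &ecx, &edx))
    return 0;
  return (ecx >> 6) & 1;
}

int
main (void)
{
  puts (has_avx512_vbmi2 ()
	? "AVX512-VBMI2 available"
	: "AVX512-VBMI2 not available (e.g. SKX)");
  return 0;
}

(glibc's own selectors gate on its internal CPU_FEATURE_USABLE
bookkeeping rather than raw CPUID, but the feature bit is the same.)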
---
sysdeps/x86_64/multiarch/strrchr-evex.S | 69 +++++++++++--------------
1 file changed, 29 insertions(+), 40 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/strrchr-evex.S b/sysdeps/x86_64/multiarch/strrchr-evex.S
index 45487dc87a..26b3457875 100644
--- a/sysdeps/x86_64/multiarch/strrchr-evex.S
+++ b/sysdeps/x86_64/multiarch/strrchr-evex.S
@@ -29,9 +29,7 @@
# include "x86-evex256-vecs.h"
# ifdef USE_AS_WCSRCHR
-# define RCX_M cl
-# define SHIFT_REG rcx
-# define VPCOMPRESS vpcompressd
+# define SHIFT_REG rsi
# define kunpck_2x kunpckbw
# define kmov_2x kmovd
# define maskz_2x ecx
@@ -46,9 +44,7 @@
# define USE_WIDE_CHAR
# else
-# define RCX_M ecx
# define SHIFT_REG rdi
-# define VPCOMPRESS vpcompressb
# define kunpck_2x kunpckdq
# define kmov_2x kmovq
# define maskz_2x rcx
@@ -78,7 +74,7 @@ ENTRY_P2ALIGN(STRRCHR, 6)
andl $(PAGE_SIZE - 1), %eax
cmpl $(PAGE_SIZE - VEC_SIZE), %eax
jg L(cross_page_boundary)
-
+L(page_cross_continue):
VMOVU (%rdi), %VMM(1)
/* k0 has a 1 for each zero CHAR in VEC(1). */
VPTESTN %VMM(1), %VMM(1), %k0
@@ -86,7 +82,6 @@ ENTRY_P2ALIGN(STRRCHR, 6)
test %VRSI, %VRSI
jz L(aligned_more)
/* fallthrough: zero CHAR in first VEC. */
-L(page_cross_return):
/* K1 has a 1 for each search CHAR match in VEC(1). */
VPCMPEQ %VMATCH, %VMM(1), %k1
KMOV %k1, %VRAX
@@ -197,7 +192,6 @@ L(first_vec_x2):
.p2align 4,, 12
L(aligned_more):
-L(page_cross_continue):
/* Need to keep original pointer incase VEC(1) has last match.
*/
movq %rdi, %rsi
@@ -353,53 +347,48 @@ L(return_new_match):
leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
ret
- .p2align 4,, 4
L(cross_page_boundary):
+ /* eax contains all the page offset bits of src (rdi). `xor rdi,
+ rax` sets rax to the pointer with all page offset bits
+ cleared, so loading at offset (PAGE_SIZE - VEC_SIZE) gets the
+ last aligned VEC before the page cross (guaranteed to be safe
+ to read). Doing this as opposed to `movq %rdi, %rax; andq
+ $-VEC_SIZE, %rax` saves a bit of code size. */
xorq %rdi, %rax
- mov $-1, %VRDX
- VMOVU (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(6)
- VPTESTN %VMM(6), %VMM(6), %k0
- KMOV %k0, %VRSI
-
-# ifdef USE_AS_WCSRCHR
- movl %edi, %ecx
- and $(VEC_SIZE - 1), %ecx
- shrl $2, %ecx
-# endif
- shlx %VGPR(SHIFT_REG), %VRDX, %VRDX
+ VMOVU (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(1)
+ VPTESTN %VMM(1), %VMM(1), %k0
+ KMOV %k0, %VRCX
+ /* Shift out zero CHAR matches that are before the beginning of
+ src (rdi). */
# ifdef USE_AS_WCSRCHR
- kmovb %edx, %k1
-# else
- KMOV %VRDX, %k1
+ movl %edi, %esi
+ andl $(VEC_SIZE - 1), %esi
+ shrl $2, %esi
# endif
+ shrx %VGPR(SHIFT_REG), %VRCX, %VRCX
- /* Need to adjust result to VEC(1) so it can be re-used by
- L(return_vec_x0_test). The alternative is to collect VEC(1)
- will a page cross load which is far more expensive. */
- VPCOMPRESS %VMM(6), %VMM(1){%k1}{z}
-
- /* We could technically just jmp back after the vpcompress but
- it doesn't save any 16-byte blocks. */
- shrx %VGPR(SHIFT_REG), %VRSI, %VRSI
- test %VRSI, %VRSI
+ test %VRCX, %VRCX
jz L(page_cross_continue)
- /* Duplicate of return logic from ENTRY. Doesn't cause spill to
- next cache line so might as well copy it here. */
- VPCMPEQ %VMATCH, %VMM(1), %k1
+ /* Found zero CHAR so need to test for search CHAR. */
+ VPCMP $0, %VMATCH, %VMM(1), %k1
KMOV %k1, %VRAX
- blsmsk %VRSI, %VRSI
- and %VRSI, %VRAX
- jz L(ret_page_cross)
+ /* Shift out search CHAR matches that are before the beginning of
+ src (rdi). */
+ shrx %VGPR(SHIFT_REG), %VRAX, %VRAX
+
+ /* Check if any search CHAR match in range. */
+ blsmsk %VRCX, %VRCX
+ and %VRCX, %VRAX
+ jz L(ret3)
bsr %VRAX, %VRAX
# ifdef USE_AS_WCSRCHR
leaq (%rdi, %rax, CHAR_SIZE), %rax
# else
addq %rdi, %rax
# endif
-L(ret_page_cross):
+L(ret3):
ret
- /* 1 byte till next cache line. */
END(STRRCHR)
#endif
--
2.34.1
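
The reverted sequence is easier to follow in scalar form. The following
C sketch is a hypothetical model, not code from this patch: it assumes
byte-sized CHARs and VEC_SIZE == 32, models the EVEX mask registers
with 64-bit integers, and uses an illustrative helper name
(page_cross_head). It mirrors the new L(cross_page_boundary): an
aligned over-read, shrx to discard positions before the start of the
string, blsmsk to keep only positions at or before the null, and bsr to
pick the last match. (The wcsrchr variant additionally scales the shift
count, as the `shrl $2` above does.)

#include <stdint.h>

#define VEC_SIZE 32	/* 256-bit vectors, as in x86-evex256-vecs.h */

/* Bitmask with bit i set when block[i] == c; a scalar stand-in for
   VPCMPEQ (or VPTESTN for c == 0) followed by KMOV into a GPR.  */
static uint64_t
match_mask (const unsigned char *block, unsigned char c)
{
  uint64_t m = 0;
  for (unsigned int i = 0; i < VEC_SIZE; i++)
    m |= (uint64_t) (block[i] == c) << i;
  return m;
}

/* Handle the first, possibly page-crossing, vector of s.  Sets
   *found_nul when the string ends inside this vector; in that case the
   return value is the final answer (last match or NULL).  Otherwise
   the caller continues with aligned full-vector loads, as the asm does
   via L(page_cross_continue).  */
static const char *
page_cross_head (const char *s, char c, int *found_nul)
{
  /* The last VEC_SIZE-aligned block containing s.  Because VEC_SIZE
     divides the page size, it never crosses a page, so it is safe to
     read even when s + VEC_SIZE would fault.  */
  const unsigned char *block =
    (const unsigned char *) ((uintptr_t) s & ~(uintptr_t) (VEC_SIZE - 1));
  unsigned int shift = (uintptr_t) s & (VEC_SIZE - 1);

  /* Shift out matches before the beginning of s: the shrx step.  */
  uint64_t zmask = match_mask (block, 0) >> shift;
  uint64_t cmask = match_mask (block, (unsigned char) c) >> shift;

  *found_nul = (zmask != 0);
  if (zmask == 0)
    return 0;			/* jz L(page_cross_continue) */

  /* blsmsk: all bits up to and including the lowest set bit of zmask,
     i.e. positions at or before the null terminator.  */
  cmask &= zmask ^ (zmask - 1);
  if (cmask == 0)
    return 0;			/* jz L(ret3): no match at all */

  /* bsr: the highest surviving bit is the last occurrence.  */
  return s + (63 - __builtin_clzll (cmask));
}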
* Re: [PATCH v1] x86: Remove AVX512-VBMI2 instruction from strrchr-evex.S
2022-10-20 17:26 [PATCH v1] x86: Remove AVX512-VBMI2 instruction from strrchr-evex.S Noah Goldstein
@ 2022-10-20 17:59 ` H.J. Lu
From: H.J. Lu @ 2022-10-20 17:59 UTC
To: Noah Goldstein; +Cc: libc-alpha, carlos
On Thu, Oct 20, 2022 at 10:26 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> commit b412213eee0afa3b51dfe92b736dfc7c981309f5
> Author: Noah Goldstein <goldstein.w.n@gmail.com>
> Date: Tue Oct 18 17:44:07 2022 -0700
>
> x86: Optimize strrchr-evex.S and implement with VMM headers
>
> Added `vpcompress{b|d}` to the page-cross logic. `vpcompressb` is an
> AVX512-VBMI2 instruction, which is not supported on SKX. Since the
> page-cross logic is relatively cold and the benefit is minimal,
> revert the page-cross case back to the old logic, which is supported
> on SKX.
Sorry I didn't catch it. Since the IFUNC selector wasn't updated,
strrchr-evex will fail on SKX.
LGTM.
Thanks.
> Tested on x86-64.
> [...]
--
H.J.
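
H.J.'s point concerns the feature gate in the IFUNC resolver: the EVEX
variant is selected based on features SKX does have (AVX512BW/VL and
related bits), so once it contained a VBMI2 instruction the selector
would still pick it on SKX and the process would fault. A schematic of
that selection pattern -- illustrative names and gates, not glibc's
actual resolver code -- is:

#include <string.h>

/* Baseline and "EVEX" stand-ins; both just defer to libc here.  */
static char *
my_strrchr_generic (const char *s, int c)
{
  return strrchr (s, c);
}

static char *
my_strrchr_evex (const char *s, int c)
{
  /* Imagine this version executes vpcompressb: it would then need an
     extra avx512vbmi2 check below, or must avoid the instruction (the
     fix this patch takes).  */
  return strrchr (s, c);
}

static void *
resolve_my_strrchr (void)
{
  __builtin_cpu_init ();
  /* Gate only on features SKX has; this is what selects the EVEX
     version there.  */
  if (__builtin_cpu_supports ("avx512bw")
      && __builtin_cpu_supports ("avx512vl"))
    return my_strrchr_evex;
  return my_strrchr_generic;
}

char *my_strrchr (const char *s, int c)
  __attribute__ ((ifunc ("resolve_my_strrchr")));

Both possible fixes are visible in the sketch: tighten the gate to also
require VBMI2, or, as this patch does, keep the implementation within
the features the gate already checks.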