public inbox for glibc-cvs@sourceware.org
help / color / mirror / Atom feed
* [glibc] x86: Remove AVX512-VBMI2 instruction from strrchr-evex.S
@ 2022-10-20 19:03 Noah Goldstein
0 siblings, 0 replies; only message in thread
From: Noah Goldstein @ 2022-10-20 19:03 UTC (permalink / raw)
To: glibc-cvs
https://sourceware.org/git/gitweb.cgi?p=glibc.git;h=c25eb94aed942761aabdcd05239b7e76dcdde098
commit c25eb94aed942761aabdcd05239b7e76dcdde098
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Thu Oct 20 10:26:46 2022 -0700
x86: Remove AVX512-VBMI2 instruction from strrchr-evex.S
commit b412213eee0afa3b51dfe92b736dfc7c981309f5
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Tue Oct 18 17:44:07 2022 -0700
x86: Optimize strrchr-evex.S and implement with VMM headers
Added `vpcompress{b|d}` to the page-cross logic, which is an
AVX512-VBMI2 instruction. This is not supported on SKX. Since the
page-cross logic is relatively cold and the benefit is minimal,
revert the page-cross case back to the old logic, which is supported
on SKX.
Tested on x86-64.
Diff:
---
sysdeps/x86_64/multiarch/strrchr-evex.S | 69 ++++++++++++++-------------------
1 file changed, 29 insertions(+), 40 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/strrchr-evex.S b/sysdeps/x86_64/multiarch/strrchr-evex.S
index 45487dc87a..26b3457875 100644
--- a/sysdeps/x86_64/multiarch/strrchr-evex.S
+++ b/sysdeps/x86_64/multiarch/strrchr-evex.S
@@ -29,9 +29,7 @@
# include "x86-evex256-vecs.h"
# ifdef USE_AS_WCSRCHR
-# define RCX_M cl
-# define SHIFT_REG rcx
-# define VPCOMPRESS vpcompressd
+# define SHIFT_REG rsi
# define kunpck_2x kunpckbw
# define kmov_2x kmovd
# define maskz_2x ecx
@@ -46,9 +44,7 @@
# define USE_WIDE_CHAR
# else
-# define RCX_M ecx
# define SHIFT_REG rdi
-# define VPCOMPRESS vpcompressb
# define kunpck_2x kunpckdq
# define kmov_2x kmovq
# define maskz_2x rcx
@@ -78,7 +74,7 @@ ENTRY_P2ALIGN(STRRCHR, 6)
andl $(PAGE_SIZE - 1), %eax
cmpl $(PAGE_SIZE - VEC_SIZE), %eax
jg L(cross_page_boundary)
-
+L(page_cross_continue):
VMOVU (%rdi), %VMM(1)
/* k0 has a 1 for each zero CHAR in VEC(1). */
VPTESTN %VMM(1), %VMM(1), %k0
@@ -86,7 +82,6 @@ ENTRY_P2ALIGN(STRRCHR, 6)
test %VRSI, %VRSI
jz L(aligned_more)
/* fallthrough: zero CHAR in first VEC. */
-L(page_cross_return):
/* K1 has a 1 for each search CHAR match in VEC(1). */
VPCMPEQ %VMATCH, %VMM(1), %k1
KMOV %k1, %VRAX
@@ -197,7 +192,6 @@ L(first_vec_x2):
.p2align 4,, 12
L(aligned_more):
-L(page_cross_continue):
/* Need to keep original pointer incase VEC(1) has last match.
*/
movq %rdi, %rsi
@@ -353,53 +347,48 @@ L(return_new_match):
leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
ret
- .p2align 4,, 4
L(cross_page_boundary):
+ /* eax contains all the page offset bits of src (rdi). `xor rdi,
+ rax` sets pointer will all page offset bits cleared so
+ offset of (PAGE_SIZE - VEC_SIZE) will get last aligned VEC
+ before page cross (guranteed to be safe to read). Doing this
+ as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax` saves
+ a bit of code size. */
xorq %rdi, %rax
- mov $-1, %VRDX
- VMOVU (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(6)
- VPTESTN %VMM(6), %VMM(6), %k0
- KMOV %k0, %VRSI
-
-# ifdef USE_AS_WCSRCHR
- movl %edi, %ecx
- and $(VEC_SIZE - 1), %ecx
- shrl $2, %ecx
-# endif
- shlx %VGPR(SHIFT_REG), %VRDX, %VRDX
+ VMOVU (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(1)
+ VPTESTN %VMM(1), %VMM(1), %k0
+ KMOV %k0, %VRCX
+ /* Shift out zero CHAR matches that are before the begining of
+ src (rdi). */
# ifdef USE_AS_WCSRCHR
- kmovb %edx, %k1
-# else
- KMOV %VRDX, %k1
+ movl %edi, %esi
+ andl $(VEC_SIZE - 1), %esi
+ shrl $2, %esi
# endif
+ shrx %VGPR(SHIFT_REG), %VRCX, %VRCX
- /* Need to adjust result to VEC(1) so it can be re-used by
- L(return_vec_x0_test). The alternative is to collect VEC(1)
- will a page cross load which is far more expensive. */
- VPCOMPRESS %VMM(6), %VMM(1){%k1}{z}
-
- /* We could technically just jmp back after the vpcompress but
- it doesn't save any 16-byte blocks. */
- shrx %VGPR(SHIFT_REG), %VRSI, %VRSI
- test %VRSI, %VRSI
+ test %VRCX, %VRCX
jz L(page_cross_continue)
- /* Duplicate of return logic from ENTRY. Doesn't cause spill to
- next cache line so might as well copy it here. */
- VPCMPEQ %VMATCH, %VMM(1), %k1
+ /* Found zero CHAR so need to test for search CHAR. */
+ VPCMP $0, %VMATCH, %VMM(1), %k1
KMOV %k1, %VRAX
- blsmsk %VRSI, %VRSI
- and %VRSI, %VRAX
- jz L(ret_page_cross)
+ /* Shift out search CHAR matches that are before the begining of
+ src (rdi). */
+ shrx %VGPR(SHIFT_REG), %VRAX, %VRAX
+
+ /* Check if any search CHAR match in range. */
+ blsmsk %VRCX, %VRCX
+ and %VRCX, %VRAX
+ jz L(ret3)
bsr %VRAX, %VRAX
# ifdef USE_AS_WCSRCHR
leaq (%rdi, %rax, CHAR_SIZE), %rax
# else
addq %rdi, %rax
# endif
-L(ret_page_cross):
+L(ret3):
ret
- /* 1 byte till next cache line. */
END(STRRCHR)
#endif
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2022-10-20 19:03 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-10-20 19:03 [glibc] x86: Remove AVX512-VBMI2 instruction from strrchr-evex.S Noah Goldstein
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).