From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: by sourceware.org (Postfix, from userid 7844) id 2CD0B384D14D; Thu, 20 Oct 2022 19:03:55 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 2CD0B384D14D DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=sourceware.org; s=default; t=1666292635; bh=8tLg7d+LdTwBGjLmAW822k3lWabQI92ptx3Z/W63c08=; h=From:To:Subject:Date:From; b=uNlAhkRnD0qHgTvsUulzZX33Bm2zGb39PE19/c/ZFrXMKb6T6PWI/B+nTB2pre/ZP 7DbJo9L19/wkUsarWjXIS+Qcnx47uqvGywN2D/xpAT1t+BzoFak73CFt1srKoXQp/f 2Ndsd3bYd11f7oKNNZbjZ5gYs+c+UZDsdaJr6Mig= Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: Noah Goldstein To: glibc-cvs@sourceware.org Subject: [glibc] x86: Remove AVX512-BVMI2 instruction from strrchr-evex.S X-Act-Checkin: glibc X-Git-Author: Noah Goldstein X-Git-Refname: refs/heads/master X-Git-Oldrev: a885fc2d6897d3baa438e2594a92f0125f1258a9 X-Git-Newrev: c25eb94aed942761aabdcd05239b7e76dcdde098 Message-Id: <20221020190355.2CD0B384D14D@sourceware.org> Date: Thu, 20 Oct 2022 19:03:55 +0000 (GMT) List-Id: https://sourceware.org/git/gitweb.cgi?p=glibc.git;h=c25eb94aed942761aabdcd05239b7e76dcdde098 commit c25eb94aed942761aabdcd05239b7e76dcdde098 Author: Noah Goldstein Date: Thu Oct 20 10:26:46 2022 -0700 x86: Remove AVX512-BVMI2 instruction from strrchr-evex.S commit b412213eee0afa3b51dfe92b736dfc7c981309f5 Author: Noah Goldstein Date: Tue Oct 18 17:44:07 2022 -0700 x86: Optimize strrchr-evex.S and implement with VMM headers Added `vpcompress{b|d}` to the page-cross logic with is an AVX512-VBMI2 instruction. This is not supported on SKX. Since the page-cross logic is relatively cold and the benefit is minimal revert the page-cross case back to the old logic which is supported on SKX. Tested on x86-64. Diff: --- sysdeps/x86_64/multiarch/strrchr-evex.S | 69 ++++++++++++++------------------- 1 file changed, 29 insertions(+), 40 deletions(-) diff --git a/sysdeps/x86_64/multiarch/strrchr-evex.S b/sysdeps/x86_64/multiarch/strrchr-evex.S index 45487dc87a..26b3457875 100644 --- a/sysdeps/x86_64/multiarch/strrchr-evex.S +++ b/sysdeps/x86_64/multiarch/strrchr-evex.S @@ -29,9 +29,7 @@ # include "x86-evex256-vecs.h" # ifdef USE_AS_WCSRCHR -# define RCX_M cl -# define SHIFT_REG rcx -# define VPCOMPRESS vpcompressd +# define SHIFT_REG rsi # define kunpck_2x kunpckbw # define kmov_2x kmovd # define maskz_2x ecx @@ -46,9 +44,7 @@ # define USE_WIDE_CHAR # else -# define RCX_M ecx # define SHIFT_REG rdi -# define VPCOMPRESS vpcompressb # define kunpck_2x kunpckdq # define kmov_2x kmovq # define maskz_2x rcx @@ -78,7 +74,7 @@ ENTRY_P2ALIGN(STRRCHR, 6) andl $(PAGE_SIZE - 1), %eax cmpl $(PAGE_SIZE - VEC_SIZE), %eax jg L(cross_page_boundary) - +L(page_cross_continue): VMOVU (%rdi), %VMM(1) /* k0 has a 1 for each zero CHAR in VEC(1). */ VPTESTN %VMM(1), %VMM(1), %k0 @@ -86,7 +82,6 @@ ENTRY_P2ALIGN(STRRCHR, 6) test %VRSI, %VRSI jz L(aligned_more) /* fallthrough: zero CHAR in first VEC. */ -L(page_cross_return): /* K1 has a 1 for each search CHAR match in VEC(1). */ VPCMPEQ %VMATCH, %VMM(1), %k1 KMOV %k1, %VRAX @@ -197,7 +192,6 @@ L(first_vec_x2): .p2align 4,, 12 L(aligned_more): -L(page_cross_continue): /* Need to keep original pointer incase VEC(1) has last match. */ movq %rdi, %rsi @@ -353,53 +347,48 @@ L(return_new_match): leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax ret - .p2align 4,, 4 L(cross_page_boundary): + /* eax contains all the page offset bits of src (rdi). `xor rdi, + rax` sets pointer will all page offset bits cleared so + offset of (PAGE_SIZE - VEC_SIZE) will get last aligned VEC + before page cross (guranteed to be safe to read). Doing this + as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax` saves + a bit of code size. */ xorq %rdi, %rax - mov $-1, %VRDX - VMOVU (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(6) - VPTESTN %VMM(6), %VMM(6), %k0 - KMOV %k0, %VRSI - -# ifdef USE_AS_WCSRCHR - movl %edi, %ecx - and $(VEC_SIZE - 1), %ecx - shrl $2, %ecx -# endif - shlx %VGPR(SHIFT_REG), %VRDX, %VRDX + VMOVU (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(1) + VPTESTN %VMM(1), %VMM(1), %k0 + KMOV %k0, %VRCX + /* Shift out zero CHAR matches that are before the begining of + src (rdi). */ # ifdef USE_AS_WCSRCHR - kmovb %edx, %k1 -# else - KMOV %VRDX, %k1 + movl %edi, %esi + andl $(VEC_SIZE - 1), %esi + shrl $2, %esi # endif + shrx %VGPR(SHIFT_REG), %VRCX, %VRCX - /* Need to adjust result to VEC(1) so it can be re-used by - L(return_vec_x0_test). The alternative is to collect VEC(1) - will a page cross load which is far more expensive. */ - VPCOMPRESS %VMM(6), %VMM(1){%k1}{z} - - /* We could technically just jmp back after the vpcompress but - it doesn't save any 16-byte blocks. */ - shrx %VGPR(SHIFT_REG), %VRSI, %VRSI - test %VRSI, %VRSI + test %VRCX, %VRCX jz L(page_cross_continue) - /* Duplicate of return logic from ENTRY. Doesn't cause spill to - next cache line so might as well copy it here. */ - VPCMPEQ %VMATCH, %VMM(1), %k1 + /* Found zero CHAR so need to test for search CHAR. */ + VPCMP $0, %VMATCH, %VMM(1), %k1 KMOV %k1, %VRAX - blsmsk %VRSI, %VRSI - and %VRSI, %VRAX - jz L(ret_page_cross) + /* Shift out search CHAR matches that are before the begining of + src (rdi). */ + shrx %VGPR(SHIFT_REG), %VRAX, %VRAX + + /* Check if any search CHAR match in range. */ + blsmsk %VRCX, %VRCX + and %VRCX, %VRAX + jz L(ret3) bsr %VRAX, %VRAX # ifdef USE_AS_WCSRCHR leaq (%rdi, %rax, CHAR_SIZE), %rax # else addq %rdi, %rax # endif -L(ret_page_cross): +L(ret3): ret - /* 1 byte till next cache line. */ END(STRRCHR) #endif