From mboxrd@z Thu Jan 1 00:00:00 1970
From: Noah Goldstein
To: glibc-cvs@sourceware.org
Subject: [glibc] x86: Optimize strnlen-evex.S and implement with VMM headers
X-Act-Checkin: glibc
X-Git-Author: Noah Goldstein
X-Git-Refname: refs/heads/master
X-Git-Oldrev: 69717709ec5c2769322678e96a7672d1e270de3a
X-Git-Newrev: b79f8ff26aa6151d2d2167afcddcd1ec46cfbc81
Message-Id: <20221020014508.28142385382E@sourceware.org>
Date: Thu, 20 Oct 2022 01:45:08 +0000 (GMT)

https://sourceware.org/git/gitweb.cgi?p=glibc.git;h=b79f8ff26aa6151d2d2167afcddcd1ec46cfbc81

commit b79f8ff26aa6151d2d2167afcddcd1ec46cfbc81
Author: Noah Goldstein
Date:   Tue Oct 18 17:44:05 2022 -0700

    x86: Optimize strnlen-evex.S and implement with VMM headers

    Optimizations are:

    1. Use the fact that bsf(0) leaves the destination unchanged to save
       a branch in the short string case.
    2. Restructure the code so that small strings get the hot path.
       - This is a net zero on the benchmark suite, but in general it
         makes sense as smaller sizes are far more common.
    3. Use more code-size-efficient instructions.
       - tzcnt ...     -> bsf ...
       - vpcmpb $0 ... -> vpcmpeq ...
    4. Align labels less aggressively, especially if doing so doesn't
       save fetch blocks or causes the basic block to span extra
       cache lines.

    The optimizations (especially point 2) make the strnlen and strlen
    code essentially incompatible, so split strnlen-evex out into a new
    file.

    Code Size Changes:
    strlen-evex.S  :  -23 bytes
    strnlen-evex.S : -167 bytes

    Net perf changes:

    Reported as the geometric mean of all improvements / regressions
    from N=10 runs of the benchtests. Values are New Time / Old Time,
    so < 1.0 is an improvement and > 1.0 is a regression.

    strlen-evex.S  : 0.992 (no real change)
    strnlen-evex.S : 0.947

    Full results attached in email.

    Full check passes on x86-64.
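As a minimal sketch of optimization 1 (editor's illustration, not code from the patch; the helper name and register roles are hypothetical, with the null-byte mask assumed in %rdi and the maximum length in %rsi): when the mask is zero, bsf leaves its destination unchanged, so preloading the result with the maximum length removes the separate "no null found" branch.

	.text
	.globl	bsf_or_maxlen_demo
	.type	bsf_or_maxlen_demo, @function
bsf_or_maxlen_demo:
	movq	%rsi, %rax	/* Preload the result with maxlen.  */
	bsfq	%rdi, %rax	/* A zero mask leaves rax (maxlen) unchanged;
				   otherwise rax = index of lowest set bit.  */
	cmpq	%rax, %rsi
	cmovb	%rsi, %rax	/* Result = min(bit index, maxlen).  */
	ret
	.size	bsf_or_maxlen_demo, .-bsf_or_maxlen_demo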
Diff: --- sysdeps/x86_64/multiarch/strlen-evex.S | 544 +++++++++----------------------- sysdeps/x86_64/multiarch/strnlen-evex.S | 427 ++++++++++++++++++++++++- sysdeps/x86_64/multiarch/wcsnlen-evex.S | 5 +- 3 files changed, 572 insertions(+), 404 deletions(-) diff --git a/sysdeps/x86_64/multiarch/strlen-evex.S b/sysdeps/x86_64/multiarch/strlen-evex.S index 2109ec2f7a..487846f098 100644 --- a/sysdeps/x86_64/multiarch/strlen-evex.S +++ b/sysdeps/x86_64/multiarch/strlen-evex.S @@ -26,466 +26,220 @@ # define STRLEN __strlen_evex # endif -# define VMOVA vmovdqa64 +# ifndef VEC_SIZE +# include "x86-evex256-vecs.h" +# endif # ifdef USE_AS_WCSLEN -# define VPCMP vpcmpd +# define VPCMPEQ vpcmpeqd +# define VPCMPNEQ vpcmpneqd +# define VPTESTN vptestnmd +# define VPTEST vptestmd # define VPMINU vpminud -# define SHIFT_REG ecx # define CHAR_SIZE 4 +# define CHAR_SIZE_SHIFT_REG(reg) sar $2, %reg # else -# define VPCMP vpcmpb +# define VPCMPEQ vpcmpeqb +# define VPCMPNEQ vpcmpneqb +# define VPTESTN vptestnmb +# define VPTEST vptestmb # define VPMINU vpminub -# define SHIFT_REG edx # define CHAR_SIZE 1 +# define CHAR_SIZE_SHIFT_REG(reg) + +# define REG_WIDTH VEC_SIZE # endif -# define XMMZERO xmm16 -# define YMMZERO ymm16 -# define YMM1 ymm17 -# define YMM2 ymm18 -# define YMM3 ymm19 -# define YMM4 ymm20 -# define YMM5 ymm21 -# define YMM6 ymm22 - -# define VEC_SIZE 32 -# define PAGE_SIZE 4096 -# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) - - .section .text.evex,"ax",@progbits -ENTRY (STRLEN) -# ifdef USE_AS_STRNLEN - /* Check zero length. */ - test %RSI_LP, %RSI_LP - jz L(zero) -# ifdef __ILP32__ - /* Clear the upper 32 bits. */ - movl %esi, %esi -# endif - mov %RSI_LP, %R8_LP +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) + +# include "reg-macros.h" + +# if CHAR_PER_VEC == 64 + +# define TAIL_RETURN_LBL first_vec_x2 +# define TAIL_RETURN_OFFSET (CHAR_PER_VEC * 2) + +# define FALLTHROUGH_RETURN_LBL first_vec_x3 +# define FALLTHROUGH_RETURN_OFFSET (CHAR_PER_VEC * 3) + +# else + +# define TAIL_RETURN_LBL first_vec_x3 +# define TAIL_RETURN_OFFSET (CHAR_PER_VEC * 3) + +# define FALLTHROUGH_RETURN_LBL first_vec_x2 +# define FALLTHROUGH_RETURN_OFFSET (CHAR_PER_VEC * 2) # endif + +# define XZERO VMM_128(0) +# define VZERO VMM(0) +# define PAGE_SIZE 4096 + + .section SECTION(.text), "ax", @progbits +ENTRY_P2ALIGN (STRLEN, 6) movl %edi, %eax - vpxorq %XMMZERO, %XMMZERO, %XMMZERO - /* Clear high bits from edi. Only keeping bits relevant to page - cross check. */ + vpxorq %XZERO, %XZERO, %XZERO andl $(PAGE_SIZE - 1), %eax - /* Check if we may cross page boundary with one vector load. */ cmpl $(PAGE_SIZE - VEC_SIZE), %eax ja L(cross_page_boundary) /* Check the first VEC_SIZE bytes. Each bit in K0 represents a null byte. */ - VPCMP $0, (%rdi), %YMMZERO, %k0 - kmovd %k0, %eax -# ifdef USE_AS_STRNLEN - /* If length < CHAR_PER_VEC handle special. */ - cmpq $CHAR_PER_VEC, %rsi - jbe L(first_vec_x0) -# endif - testl %eax, %eax + VPCMPEQ (%rdi), %VZERO, %k0 + KMOV %k0, %VRAX + test %VRAX, %VRAX jz L(aligned_more) - tzcntl %eax, %eax - ret -# ifdef USE_AS_STRNLEN -L(zero): - xorl %eax, %eax - ret - - .p2align 4 -L(first_vec_x0): - /* Set bit for max len so that tzcnt will return min of max len - and position of first match. */ - btsq %rsi, %rax - tzcntl %eax, %eax - ret -# endif - - .p2align 4 -L(first_vec_x1): - tzcntl %eax, %eax - /* Safe to use 32 bit instructions as these are only called for - size = [1, 159]. */ -# ifdef USE_AS_STRNLEN - /* Use ecx which was computed earlier to compute correct value. 
- */ - leal -(CHAR_PER_VEC * 4 + 1)(%rcx, %rax), %eax -# else - subl %edx, %edi -# ifdef USE_AS_WCSLEN - /* NB: Divide bytes by 4 to get the wchar_t count. */ - sarl $2, %edi -# endif - leal CHAR_PER_VEC(%rdi, %rax), %eax -# endif - ret - - .p2align 4 -L(first_vec_x2): - tzcntl %eax, %eax - /* Safe to use 32 bit instructions as these are only called for - size = [1, 159]. */ -# ifdef USE_AS_STRNLEN - /* Use ecx which was computed earlier to compute correct value. - */ - leal -(CHAR_PER_VEC * 3 + 1)(%rcx, %rax), %eax -# else - subl %edx, %edi -# ifdef USE_AS_WCSLEN - /* NB: Divide bytes by 4 to get the wchar_t count. */ - sarl $2, %edi -# endif - leal (CHAR_PER_VEC * 2)(%rdi, %rax), %eax -# endif + bsf %VRAX, %VRAX ret - .p2align 4 -L(first_vec_x3): - tzcntl %eax, %eax - /* Safe to use 32 bit instructions as these are only called for - size = [1, 159]. */ -# ifdef USE_AS_STRNLEN - /* Use ecx which was computed earlier to compute correct value. - */ - leal -(CHAR_PER_VEC * 2 + 1)(%rcx, %rax), %eax -# else - subl %edx, %edi -# ifdef USE_AS_WCSLEN - /* NB: Divide bytes by 4 to get the wchar_t count. */ - sarl $2, %edi -# endif - leal (CHAR_PER_VEC * 3)(%rdi, %rax), %eax -# endif - ret - - .p2align 4 + .p2align 4,, 8 L(first_vec_x4): - tzcntl %eax, %eax - /* Safe to use 32 bit instructions as these are only called for - size = [1, 159]. */ -# ifdef USE_AS_STRNLEN - /* Use ecx which was computed earlier to compute correct value. - */ - leal -(CHAR_PER_VEC + 1)(%rcx, %rax), %eax -# else - subl %edx, %edi -# ifdef USE_AS_WCSLEN - /* NB: Divide bytes by 4 to get the wchar_t count. */ - sarl $2, %edi -# endif + bsf %VRAX, %VRAX + subl %ecx, %edi + CHAR_SIZE_SHIFT_REG (edi) leal (CHAR_PER_VEC * 4)(%rdi, %rax), %eax -# endif ret - .p2align 5 + + + /* Aligned more for strnlen compares remaining length vs 2 * + CHAR_PER_VEC, 4 * CHAR_PER_VEC, and 8 * CHAR_PER_VEC before + going to the loop. */ + .p2align 4,, 10 L(aligned_more): - movq %rdi, %rdx - /* Align data to VEC_SIZE. */ - andq $-(VEC_SIZE), %rdi + movq %rdi, %rcx + andq $(VEC_SIZE * -1), %rdi L(cross_page_continue): - /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time - since data is only aligned to VEC_SIZE. */ -# ifdef USE_AS_STRNLEN - /* + CHAR_SIZE because it simplies the logic in - last_4x_vec_or_less. */ - leaq (VEC_SIZE * 5 + CHAR_SIZE)(%rdi), %rcx - subq %rdx, %rcx -# ifdef USE_AS_WCSLEN - /* NB: Divide bytes by 4 to get the wchar_t count. */ - sarl $2, %ecx -# endif -# endif - /* Load first VEC regardless. */ - VPCMP $0, VEC_SIZE(%rdi), %YMMZERO, %k0 -# ifdef USE_AS_STRNLEN - /* Adjust length. If near end handle specially. */ - subq %rcx, %rsi - jb L(last_4x_vec_or_less) -# endif - kmovd %k0, %eax - testl %eax, %eax + /* Remaining length >= 2 * CHAR_PER_VEC so do VEC0/VEC1 without + rechecking bounds. 
*/ + VPCMPEQ (VEC_SIZE * 1)(%rdi), %VZERO, %k0 + KMOV %k0, %VRAX + test %VRAX, %VRAX jnz L(first_vec_x1) - VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0 - kmovd %k0, %eax - test %eax, %eax + VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %k0 + KMOV %k0, %VRAX + test %VRAX, %VRAX jnz L(first_vec_x2) - VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0 - kmovd %k0, %eax - testl %eax, %eax + VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %k0 + KMOV %k0, %VRAX + test %VRAX, %VRAX jnz L(first_vec_x3) - VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0 - kmovd %k0, %eax - testl %eax, %eax + VPCMPEQ (VEC_SIZE * 4)(%rdi), %VZERO, %k0 + KMOV %k0, %VRAX + test %VRAX, %VRAX jnz L(first_vec_x4) - addq $VEC_SIZE, %rdi -# ifdef USE_AS_STRNLEN - /* Check if at last VEC_SIZE * 4 length. */ - cmpq $(CHAR_PER_VEC * 4 - 1), %rsi - jbe L(last_4x_vec_or_less_load) - movl %edi, %ecx - andl $(VEC_SIZE * 4 - 1), %ecx -# ifdef USE_AS_WCSLEN - /* NB: Divide bytes by 4 to get the wchar_t count. */ - sarl $2, %ecx -# endif - /* Readjust length. */ - addq %rcx, %rsi -# endif - /* Align data to VEC_SIZE * 4. */ + subq $(VEC_SIZE * -1), %rdi + +# if CHAR_PER_VEC == 64 + /* No partial register stalls on processors that we use evex512 + on and this saves code size. */ + xorb %dil, %dil +# else andq $-(VEC_SIZE * 4), %rdi +# endif + + /* Compare 4 * VEC at a time forward. */ .p2align 4 L(loop_4x_vec): - /* Load first VEC regardless. */ - VMOVA (VEC_SIZE * 4)(%rdi), %YMM1 -# ifdef USE_AS_STRNLEN - /* Break if at end of length. */ - subq $(CHAR_PER_VEC * 4), %rsi - jb L(last_4x_vec_or_less_cmpeq) -# endif - /* Save some code size by microfusing VPMINU with the load. Since - the matches in ymm2/ymm4 can only be returned if there where no - matches in ymm1/ymm3 respectively there is no issue with overlap. - */ - VPMINU (VEC_SIZE * 5)(%rdi), %YMM1, %YMM2 - VMOVA (VEC_SIZE * 6)(%rdi), %YMM3 - VPMINU (VEC_SIZE * 7)(%rdi), %YMM3, %YMM4 + VMOVA (VEC_SIZE * 4)(%rdi), %VMM(1) + VPMINU (VEC_SIZE * 5)(%rdi), %VMM(1), %VMM(2) + VMOVA (VEC_SIZE * 6)(%rdi), %VMM(3) + VPMINU (VEC_SIZE * 7)(%rdi), %VMM(3), %VMM(4) + VPTESTN %VMM(2), %VMM(2), %k0 + VPTESTN %VMM(4), %VMM(4), %k2 - VPCMP $0, %YMM2, %YMMZERO, %k0 - VPCMP $0, %YMM4, %YMMZERO, %k1 subq $-(VEC_SIZE * 4), %rdi - kortestd %k0, %k1 + KORTEST %k0, %k2 jz L(loop_4x_vec) - /* Check if end was in first half. */ - kmovd %k0, %eax - subq %rdx, %rdi -# ifdef USE_AS_WCSLEN - shrq $2, %rdi -# endif - testl %eax, %eax - jz L(second_vec_return) + VPTESTN %VMM(1), %VMM(1), %k1 + KMOV %k1, %VRAX + test %VRAX, %VRAX + jnz L(first_vec_x0) - VPCMP $0, %YMM1, %YMMZERO, %k2 - kmovd %k2, %edx - /* Combine VEC1 matches (edx) with VEC2 matches (eax). */ -# ifdef USE_AS_WCSLEN - sall $CHAR_PER_VEC, %eax - orl %edx, %eax - tzcntl %eax, %eax -# else - salq $CHAR_PER_VEC, %rax - orq %rdx, %rax - tzcntq %rax, %rax -# endif - addq %rdi, %rax - ret - - -# ifdef USE_AS_STRNLEN - -L(last_4x_vec_or_less_load): - /* Depending on entry adjust rdi / prepare first VEC in YMM1. */ - VMOVA (VEC_SIZE * 4)(%rdi), %YMM1 -L(last_4x_vec_or_less_cmpeq): - VPCMP $0, %YMM1, %YMMZERO, %k0 - addq $(VEC_SIZE * 3), %rdi -L(last_4x_vec_or_less): - kmovd %k0, %eax - /* If remaining length > VEC_SIZE * 2. This works if esi is off by - VEC_SIZE * 4. */ - testl $(CHAR_PER_VEC * 2), %esi - jnz L(last_4x_vec) - - /* length may have been negative or positive by an offset of - CHAR_PER_VEC * 4 depending on where this was called from. This - fixes that. 
*/ - andl $(CHAR_PER_VEC * 4 - 1), %esi - testl %eax, %eax - jnz L(last_vec_x1_check) + KMOV %k0, %VRAX + test %VRAX, %VRAX + jnz L(first_vec_x1) - /* Check the end of data. */ - subl $CHAR_PER_VEC, %esi - jb L(max) + VPTESTN %VMM(3), %VMM(3), %k0 - VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0 - kmovd %k0, %eax - tzcntl %eax, %eax - /* Check the end of data. */ - cmpl %eax, %esi - jb L(max) - - subq %rdx, %rdi -# ifdef USE_AS_WCSLEN - /* NB: Divide bytes by 4 to get the wchar_t count. */ - sarq $2, %rdi -# endif - leaq (CHAR_PER_VEC * 2)(%rdi, %rax), %rax - ret -L(max): - movq %r8, %rax - ret -# endif - - /* Placed here in strnlen so that the jcc L(last_4x_vec_or_less) - in the 4x VEC loop can use 2 byte encoding. */ - .p2align 4 -L(second_vec_return): - VPCMP $0, %YMM3, %YMMZERO, %k0 - /* Combine YMM3 matches (k0) with YMM4 matches (k1). */ -# ifdef USE_AS_WCSLEN - kunpckbw %k0, %k1, %k0 - kmovd %k0, %eax - tzcntl %eax, %eax +# if CHAR_PER_VEC == 64 + KMOV %k0, %VRAX + test %VRAX, %VRAX + jnz L(first_vec_x2) + KMOV %k2, %VRAX # else - kunpckdq %k0, %k1, %k0 - kmovq %k0, %rax - tzcntq %rax, %rax + /* We can only combine last 2x VEC masks if CHAR_PER_VEC <= 32. + */ + kmovd %k2, %edx + kmovd %k0, %eax + salq $CHAR_PER_VEC, %rdx + orq %rdx, %rax # endif - leaq (CHAR_PER_VEC * 2)(%rdi, %rax), %rax - ret - -# ifdef USE_AS_STRNLEN -L(last_vec_x1_check): - tzcntl %eax, %eax - /* Check the end of data. */ - cmpl %eax, %esi - jb L(max) - subq %rdx, %rdi -# ifdef USE_AS_WCSLEN - /* NB: Divide bytes by 4 to get the wchar_t count. */ - sarq $2, %rdi -# endif - leaq (CHAR_PER_VEC)(%rdi, %rax), %rax + /* first_vec_x3 for strlen-ZMM and first_vec_x2 for strlen-YMM. + */ + .p2align 4,, 2 +L(FALLTHROUGH_RETURN_LBL): + bsfq %rax, %rax + subq %rcx, %rdi + CHAR_SIZE_SHIFT_REG (rdi) + leaq (FALLTHROUGH_RETURN_OFFSET)(%rdi, %rax), %rax ret - .p2align 4 -L(last_4x_vec): - /* Test first 2x VEC normally. */ - testl %eax, %eax - jnz L(last_vec_x1) - - VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0 - kmovd %k0, %eax - testl %eax, %eax - jnz L(last_vec_x2) - - /* Normalize length. */ - andl $(CHAR_PER_VEC * 4 - 1), %esi - VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0 - kmovd %k0, %eax - testl %eax, %eax - jnz L(last_vec_x3) - - /* Check the end of data. */ - subl $(CHAR_PER_VEC * 3), %esi - jb L(max) - - VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0 - kmovd %k0, %eax - tzcntl %eax, %eax - /* Check the end of data. */ - cmpl %eax, %esi - jb L(max_end) - - subq %rdx, %rdi -# ifdef USE_AS_WCSLEN - /* NB: Divide bytes by 4 to get the wchar_t count. */ - sarq $2, %rdi -# endif - leaq (CHAR_PER_VEC * 4)(%rdi, %rax), %rax + .p2align 4,, 8 +L(first_vec_x0): + bsf %VRAX, %VRAX + sub %rcx, %rdi + CHAR_SIZE_SHIFT_REG (rdi) + addq %rdi, %rax ret - .p2align 4 -L(last_vec_x1): - tzcntl %eax, %eax - subq %rdx, %rdi -# ifdef USE_AS_WCSLEN - /* NB: Divide bytes by 4 to get the wchar_t count. */ - sarq $2, %rdi -# endif + .p2align 4,, 10 +L(first_vec_x1): + bsf %VRAX, %VRAX + sub %rcx, %rdi + CHAR_SIZE_SHIFT_REG (rdi) leaq (CHAR_PER_VEC)(%rdi, %rax), %rax ret - .p2align 4 -L(last_vec_x2): - tzcntl %eax, %eax - subq %rdx, %rdi -# ifdef USE_AS_WCSLEN - /* NB: Divide bytes by 4 to get the wchar_t count. */ - sarq $2, %rdi -# endif - leaq (CHAR_PER_VEC * 2)(%rdi, %rax), %rax - ret - - .p2align 4 -L(last_vec_x3): - tzcntl %eax, %eax - subl $(CHAR_PER_VEC * 2), %esi - /* Check the end of data. */ - cmpl %eax, %esi - jb L(max_end) - subq %rdx, %rdi -# ifdef USE_AS_WCSLEN - /* NB: Divide bytes by 4 to get the wchar_t count. 
*/ - sarq $2, %rdi -# endif - leaq (CHAR_PER_VEC * 3)(%rdi, %rax), %rax - ret -L(max_end): - movq %r8, %rax + .p2align 4,, 10 + /* first_vec_x2 for strlen-ZMM and first_vec_x3 for strlen-YMM. + */ +L(TAIL_RETURN_LBL): + bsf %VRAX, %VRAX + sub %VRCX, %VRDI + CHAR_SIZE_SHIFT_REG (VRDI) + lea (TAIL_RETURN_OFFSET)(%rdi, %rax), %VRAX ret -# endif - /* Cold case for crossing page with first load. */ - .p2align 4 + .p2align 4,, 8 L(cross_page_boundary): - movq %rdi, %rdx + movq %rdi, %rcx /* Align data to VEC_SIZE. */ andq $-VEC_SIZE, %rdi - VPCMP $0, (%rdi), %YMMZERO, %k0 - kmovd %k0, %eax - /* Remove the leading bytes. */ + + VPCMPEQ (%rdi), %VZERO, %k0 + + KMOV %k0, %VRAX # ifdef USE_AS_WCSLEN - /* NB: Divide shift count by 4 since each bit in K0 represent 4 - bytes. */ - movl %edx, %ecx - shrl $2, %ecx - andl $(CHAR_PER_VEC - 1), %ecx -# endif - /* SHIFT_REG is ecx for USE_AS_WCSLEN and edx otherwise. */ - sarxl %SHIFT_REG, %eax, %eax + movl %ecx, %edx + shrl $2, %edx + andl $(CHAR_PER_VEC - 1), %edx + shrx %edx, %eax, %eax testl %eax, %eax -# ifndef USE_AS_STRNLEN - jz L(cross_page_continue) - tzcntl %eax, %eax - ret # else - jnz L(cross_page_less_vec) -# ifndef USE_AS_WCSLEN - movl %edx, %ecx - andl $(CHAR_PER_VEC - 1), %ecx -# endif - movl $CHAR_PER_VEC, %eax - subl %ecx, %eax - /* Check the end of data. */ - cmpq %rax, %rsi - ja L(cross_page_continue) - movl %esi, %eax - ret -L(cross_page_less_vec): - tzcntl %eax, %eax - /* Select min of length and position of first null. */ - cmpq %rax, %rsi - cmovb %esi, %eax - ret + shr %cl, %VRAX # endif + jz L(cross_page_continue) + bsf %VRAX, %VRAX + ret END (STRLEN) #endif diff --git a/sysdeps/x86_64/multiarch/strnlen-evex.S b/sysdeps/x86_64/multiarch/strnlen-evex.S index 64a9fc2606..443a32a749 100644 --- a/sysdeps/x86_64/multiarch/strnlen-evex.S +++ b/sysdeps/x86_64/multiarch/strnlen-evex.S @@ -1,8 +1,423 @@ -#ifndef STRNLEN -# define STRNLEN __strnlen_evex -#endif +/* strnlen/wcsnlen optimized with 256-bit EVEX instructions. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . 
*/ + +#include +#include + +#if ISA_SHOULD_BUILD (4) + +# ifndef VEC_SIZE +# include "x86-evex256-vecs.h" +# endif + + +# ifndef STRNLEN +# define STRNLEN __strnlen_evex +# endif + +# ifdef USE_AS_WCSLEN +# define VPCMPEQ vpcmpeqd +# define VPCMPNEQ vpcmpneqd +# define VPTESTN vptestnmd +# define VPTEST vptestmd +# define VPMINU vpminud +# define CHAR_SIZE 4 + +# else +# define VPCMPEQ vpcmpeqb +# define VPCMPNEQ vpcmpneqb +# define VPTESTN vptestnmb +# define VPTEST vptestmb +# define VPMINU vpminub +# define CHAR_SIZE 1 + +# define REG_WIDTH VEC_SIZE +# endif + +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) + +# include "reg-macros.h" + +# if CHAR_PER_VEC == 32 +# define SUB_SHORT(imm, reg) subb $(imm), %VGPR_SZ(reg, 8) +# else +# define SUB_SHORT(imm, reg) subl $(imm), %VGPR_SZ(reg, 32) +# endif + + + +# if CHAR_PER_VEC == 64 +# define FALLTHROUGH_RETURN_OFFSET (CHAR_PER_VEC * 3) +# else +# define FALLTHROUGH_RETURN_OFFSET (CHAR_PER_VEC * 2) +# endif + + +# define XZERO VMM_128(0) +# define VZERO VMM(0) +# define PAGE_SIZE 4096 + + .section SECTION(.text), "ax", @progbits +ENTRY_P2ALIGN (STRNLEN, 6) + /* Check zero length. */ + test %RSI_LP, %RSI_LP + jz L(zero) +# ifdef __ILP32__ + /* Clear the upper 32 bits. */ + movl %esi, %esi +# endif + + movl %edi, %eax + vpxorq %XZERO, %XZERO, %XZERO + andl $(PAGE_SIZE - 1), %eax + cmpl $(PAGE_SIZE - VEC_SIZE), %eax + ja L(cross_page_boundary) + + /* Check the first VEC_SIZE bytes. Each bit in K0 represents a + null byte. */ + VPCMPEQ (%rdi), %VZERO, %k0 + + KMOV %k0, %VRCX + movq %rsi, %rax + + /* If src (rcx) is zero, bsf does not change the result. NB: + Must use 64-bit bsf here so that upper bits of len are not + cleared. */ + bsfq %rcx, %rax + /* If rax > CHAR_PER_VEC then rcx must have been zero (no null + CHAR) and rsi must be > CHAR_PER_VEC. */ + cmpq $CHAR_PER_VEC, %rax + ja L(more_1x_vec) + /* Check if first match in bounds. */ + cmpq %rax, %rsi + cmovb %esi, %eax + ret + + +# if CHAR_PER_VEC != 32 + .p2align 4,, 2 +L(zero): +L(max_0): + movl %esi, %eax + ret +# endif + + /* Aligned more for strnlen compares remaining length vs 2 * + CHAR_PER_VEC, 4 * CHAR_PER_VEC, and 8 * CHAR_PER_VEC before + going to the loop. */ + .p2align 4,, 10 +L(more_1x_vec): +L(cross_page_continue): + /* Compute number of words checked after aligning. */ +# ifdef USE_AS_WCSLEN + /* Need to compute directly for wcslen as CHAR_SIZE * rsi can + overflow. */ + movq %rdi, %rax + andq $(VEC_SIZE * -1), %rdi + subq %rdi, %rax + sarq $2, %rax + leaq -(CHAR_PER_VEC * 1)(%rax, %rsi), %rax +# else + leaq (VEC_SIZE * -1)(%rsi, %rdi), %rax + andq $(VEC_SIZE * -1), %rdi + subq %rdi, %rax +# endif + + + VPCMPEQ VEC_SIZE(%rdi), %VZERO, %k0 + + cmpq $(CHAR_PER_VEC * 2), %rax + ja L(more_2x_vec) + +L(last_2x_vec_or_less): + KMOV %k0, %VRDX + test %VRDX, %VRDX + jnz L(last_vec_check) + + /* Check the end of data. */ + SUB_SHORT (CHAR_PER_VEC, rax) + jbe L(max_0) + VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %k0 + KMOV %k0, %VRDX + test %VRDX, %VRDX + jz L(max_0) + /* Best place for LAST_VEC_CHECK if ZMM. 
*/ + .p2align 4,, 8 +L(last_vec_check): + bsf %VRDX, %VRDX + sub %eax, %edx + lea (%rsi, %rdx), %eax + cmovae %esi, %eax + ret + +# if CHAR_PER_VEC == 32 + .p2align 4,, 2 +L(zero): +L(max_0): + movl %esi, %eax + ret +# endif + + .p2align 4,, 8 +L(last_4x_vec_or_less): + addl $(CHAR_PER_VEC * -4), %eax + VPCMPEQ (VEC_SIZE * 5)(%rdi), %VZERO, %k0 + subq $(VEC_SIZE * -4), %rdi + cmpl $(CHAR_PER_VEC * 2), %eax + jbe L(last_2x_vec_or_less) + + .p2align 4,, 6 +L(more_2x_vec): + /* Remaining length >= 2 * CHAR_PER_VEC so do VEC0/VEC1 without + rechecking bounds. */ -#define USE_AS_STRNLEN 1 -#define STRLEN STRNLEN + KMOV %k0, %VRDX -#include "strlen-evex.S" + test %VRDX, %VRDX + jnz L(first_vec_x1) + + VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %k0 + KMOV %k0, %VRDX + test %VRDX, %VRDX + jnz L(first_vec_x2) + + cmpq $(CHAR_PER_VEC * 4), %rax + ja L(more_4x_vec) + + + VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %k0 + KMOV %k0, %VRDX + addl $(CHAR_PER_VEC * -2), %eax + test %VRDX, %VRDX + jnz L(last_vec_check) + + subl $(CHAR_PER_VEC), %eax + jbe L(max_1) + + VPCMPEQ (VEC_SIZE * 4)(%rdi), %VZERO, %k0 + KMOV %k0, %VRDX + + test %VRDX, %VRDX + jnz L(last_vec_check) +L(max_1): + movl %esi, %eax + ret + + .p2align 4,, 3 +L(first_vec_x2): +# if VEC_SIZE == 64 + /* If VEC_SIZE == 64 we can fit logic for full return label in + spare bytes before next cache line. */ + bsf %VRDX, %VRDX + sub %eax, %esi + leal (CHAR_PER_VEC * 1)(%rsi, %rdx), %eax + ret + .p2align 4,, 6 +# else + addl $CHAR_PER_VEC, %esi +# endif +L(first_vec_x1): + bsf %VRDX, %VRDX + sub %eax, %esi + leal (CHAR_PER_VEC * 0)(%rsi, %rdx), %eax + ret + + + .p2align 4,, 6 +L(first_vec_x4): +# if VEC_SIZE == 64 + /* If VEC_SIZE == 64 we can fit logic for full return label in + spare bytes before next cache line. */ + bsf %VRDX, %VRDX + sub %eax, %esi + leal (CHAR_PER_VEC * 3)(%rsi, %rdx), %eax + ret + .p2align 4,, 6 +# else + addl $CHAR_PER_VEC, %esi +# endif +L(first_vec_x3): + bsf %VRDX, %VRDX + sub %eax, %esi + leal (CHAR_PER_VEC * 2)(%rsi, %rdx), %eax + ret + + .p2align 4,, 5 +L(more_4x_vec): + VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %k0 + KMOV %k0, %VRDX + test %VRDX, %VRDX + jnz L(first_vec_x3) + + VPCMPEQ (VEC_SIZE * 4)(%rdi), %VZERO, %k0 + KMOV %k0, %VRDX + test %VRDX, %VRDX + jnz L(first_vec_x4) + + /* Check if at last VEC_SIZE * 4 length before aligning for the + loop. */ + cmpq $(CHAR_PER_VEC * 8), %rax + jbe L(last_4x_vec_or_less) + + + /* Compute number of words checked after aligning. */ +# ifdef USE_AS_WCSLEN + /* Need to compute directly for wcslen as CHAR_SIZE * rsi can + overflow. */ + leaq (VEC_SIZE * -3)(%rdi), %rdx +# else + leaq (VEC_SIZE * -3)(%rdi, %rax), %rax +# endif + + subq $(VEC_SIZE * -1), %rdi + + /* Align data to VEC_SIZE * 4. */ +# if VEC_SIZE == 64 + /* Saves code size. No evex512 processor has partial register + stalls. If that change this can be replaced with `andq + $-(VEC_SIZE * 4), %rdi`. */ + xorb %dil, %dil +# else + andq $-(VEC_SIZE * 4), %rdi +# endif + +# ifdef USE_AS_WCSLEN + subq %rdi, %rdx + sarq $2, %rdx + addq %rdx, %rax +# else + subq %rdi, %rax +# endif + /* Compare 4 * VEC at a time forward. */ + .p2align 4,, 11 +L(loop_4x_vec): + VMOVA (VEC_SIZE * 4)(%rdi), %VMM(1) + VPMINU (VEC_SIZE * 5)(%rdi), %VMM(1), %VMM(2) + VMOVA (VEC_SIZE * 6)(%rdi), %VMM(3) + VPMINU (VEC_SIZE * 7)(%rdi), %VMM(3), %VMM(4) + VPTESTN %VMM(2), %VMM(2), %k0 + VPTESTN %VMM(4), %VMM(4), %k2 + subq $-(VEC_SIZE * 4), %rdi + /* Break if at end of length. 
*/ + subq $(CHAR_PER_VEC * 4), %rax + jbe L(loop_len_end) + + + KORTEST %k0, %k2 + jz L(loop_4x_vec) + + +L(loop_last_4x_vec): + movq %rsi, %rcx + subq %rax, %rsi + VPTESTN %VMM(1), %VMM(1), %k1 + KMOV %k1, %VRDX + test %VRDX, %VRDX + jnz L(last_vec_x0) + + KMOV %k0, %VRDX + test %VRDX, %VRDX + jnz L(last_vec_x1) + + VPTESTN %VMM(3), %VMM(3), %k0 + + /* Seperate logic for VEC_SIZE == 64 and VEC_SIZE == 32 for + returning last 2x VEC. For VEC_SIZE == 64 we test each VEC + individually, for VEC_SIZE == 32 we combine them in a single + 64-bit GPR. */ +# if CHAR_PER_VEC == 64 + KMOV %k0, %VRDX + test %VRDX, %VRDX + jnz L(last_vec_x2) + KMOV %k2, %VRDX +# else + /* We can only combine last 2x VEC masks if CHAR_PER_VEC <= 32. + */ + kmovd %k2, %edx + kmovd %k0, %eax + salq $CHAR_PER_VEC, %rdx + orq %rax, %rdx +# endif + + /* first_vec_x3 for strlen-ZMM and first_vec_x2 for strlen-YMM. + */ + bsfq %rdx, %rdx + leaq (FALLTHROUGH_RETURN_OFFSET - CHAR_PER_VEC * 4)(%rsi, %rdx), %rax + cmpq %rax, %rcx + cmovb %rcx, %rax + ret + + /* Handle last 4x VEC after loop. All VECs have been loaded. */ + .p2align 4,, 4 +L(loop_len_end): + KORTEST %k0, %k2 + jnz L(loop_last_4x_vec) + movq %rsi, %rax + ret + + +# if CHAR_PER_VEC == 64 + /* Since we can't combine the last 2x VEC for VEC_SIZE == 64 + need return label for it. */ + .p2align 4,, 8 +L(last_vec_x2): + bsf %VRDX, %VRDX + leaq (CHAR_PER_VEC * -2)(%rsi, %rdx), %rax + cmpq %rax, %rcx + cmovb %rcx, %rax + ret +# endif + + + .p2align 4,, 10 +L(last_vec_x1): + addq $CHAR_PER_VEC, %rsi +L(last_vec_x0): + bsf %VRDX, %VRDX + leaq (CHAR_PER_VEC * -4)(%rsi, %rdx), %rax + cmpq %rax, %rcx + cmovb %rcx, %rax + ret + + + .p2align 4,, 8 +L(cross_page_boundary): + /* Align data to VEC_SIZE. */ + movq %rdi, %rcx + andq $-VEC_SIZE, %rcx + VPCMPEQ (%rcx), %VZERO, %k0 + + KMOV %k0, %VRCX +# ifdef USE_AS_WCSLEN + shrl $2, %eax + andl $(CHAR_PER_VEC - 1), %eax +# endif + shrx %VRAX, %VRCX, %VRCX + + negl %eax + andl $(CHAR_PER_VEC - 1), %eax + movq %rsi, %rdx + bsf %VRCX, %VRDX + cmpq %rax, %rdx + ja L(cross_page_continue) + movl %edx, %eax + cmpq %rdx, %rsi + cmovb %esi, %eax + ret +END (STRNLEN) +#endif diff --git a/sysdeps/x86_64/multiarch/wcsnlen-evex.S b/sysdeps/x86_64/multiarch/wcsnlen-evex.S index e2aad94c1e..57a7e93fbf 100644 --- a/sysdeps/x86_64/multiarch/wcsnlen-evex.S +++ b/sysdeps/x86_64/multiarch/wcsnlen-evex.S @@ -2,8 +2,7 @@ # define WCSNLEN __wcsnlen_evex #endif -#define STRLEN WCSNLEN +#define STRNLEN WCSNLEN #define USE_AS_WCSLEN 1 -#define USE_AS_STRNLEN 1 -#include "strlen-evex.S" +#include "strnlen-evex.S"
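
For reference, the instruction substitutions from optimization 3 above can be checked by assembling the two forms side by side. The fragment below is the editor's illustration, not part of the patch; the label is hypothetical, and the vector compares need AVX512BW/AVX512VL (the EVEX forms these files already require). In each pair the replacement is one byte shorter: bsf drops tzcnt's f3 prefix, and vpcmpeqb needs no imm8.

	.text
encoding_size_demo:
	tzcntl	%eax, %eax		/* f3 0f bc c0 -- 4 bytes.  */
	bsfl	%eax, %eax		/* 0f bc c0 -- 3 bytes.  */
	vpcmpb	$0, (%rdi), %ymm16, %k0	/* EVEX + opcode + modrm + imm8.  */
	vpcmpeqb (%rdi), %ymm16, %k0	/* Same, but without the imm8 byte.  */
	ret

Assembling this with as and running objdump -d on the object file shows the size difference directly.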