On Mon, Oct 17, 2022 at 7:49 PM Noah Goldstein wrote: > > Optimizations are: > 1. Use the fact that bsf(0) leaves the destination unchanged to save a > branch in short string case. > 2. Restructure code so that small strings are given the hot path. > - This is a net-zero on the benchmark suite but in general makes > sense as smaller sizes are far more common. > 3. Use more code-size efficient instructions. > - tzcnt ... -> bsf ... > - vpcmpb $0 ... -> vpcmpeq ... > 4. Align labels less aggressively, especially if it doesn't save fetch > blocks / causes the basic-block to span extra cache-lines. > > The optimizations (especially for point 2) make the strnlen and > strlen code essentially incompatible so split strnlen-evex > to a new file. > > Code Size Changes: > strlen-evex.S : -23 bytes > strnlen-evex.S : -167 bytes > > Net perf changes: > > Reported as geometric mean of all improvements / regressions from N=10 > runs of the benchtests. Value as New Time / Old Time so < 1.0 is > improvement and > 1.0 is regression. > > strlen-evex.S : 0.992 (No real change) > strnlen-evex.S : 0.947 > > Full results attached in email. > > Full check passes on x86-64. 
> --- > sysdeps/x86_64/multiarch/strlen-evex.S | 544 +++++++----------------- > sysdeps/x86_64/multiarch/strnlen-evex.S | 427 ++++++++++++++++++- > sysdeps/x86_64/multiarch/wcsnlen-evex.S | 5 +- > 3 files changed, 572 insertions(+), 404 deletions(-) > > diff --git a/sysdeps/x86_64/multiarch/strlen-evex.S b/sysdeps/x86_64/multiarch/strlen-evex.S > index 2109ec2f7a..487846f098 100644 > --- a/sysdeps/x86_64/multiarch/strlen-evex.S > +++ b/sysdeps/x86_64/multiarch/strlen-evex.S > @@ -26,466 +26,220 @@ > # define STRLEN __strlen_evex > # endif > > -# define VMOVA vmovdqa64 > +# ifndef VEC_SIZE > +# include "x86-evex256-vecs.h" > +# endif > > # ifdef USE_AS_WCSLEN > -# define VPCMP vpcmpd > +# define VPCMPEQ vpcmpeqd > +# define VPCMPNEQ vpcmpneqd > +# define VPTESTN vptestnmd > +# define VPTEST vptestmd > # define VPMINU vpminud > -# define SHIFT_REG ecx > # define CHAR_SIZE 4 > +# define CHAR_SIZE_SHIFT_REG(reg) sar $2, %reg > # else > -# define VPCMP vpcmpb > +# define VPCMPEQ vpcmpeqb > +# define VPCMPNEQ vpcmpneqb > +# define VPTESTN vptestnmb > +# define VPTEST vptestmb > # define VPMINU vpminub > -# define SHIFT_REG edx > # define CHAR_SIZE 1 > +# define CHAR_SIZE_SHIFT_REG(reg) > + > +# define REG_WIDTH VEC_SIZE > # endif > > -# define XMMZERO xmm16 > -# define YMMZERO ymm16 > -# define YMM1 ymm17 > -# define YMM2 ymm18 > -# define YMM3 ymm19 > -# define YMM4 ymm20 > -# define YMM5 ymm21 > -# define YMM6 ymm22 > - > -# define VEC_SIZE 32 > -# define PAGE_SIZE 4096 > -# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) > - > - .section .text.evex,"ax",@progbits > -ENTRY (STRLEN) > -# ifdef USE_AS_STRNLEN > - /* Check zero length. */ > - test %RSI_LP, %RSI_LP > - jz L(zero) > -# ifdef __ILP32__ > - /* Clear the upper 32 bits. 
*/ > - movl %esi, %esi > -# endif > - mov %RSI_LP, %R8_LP > +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) > + > +# include "reg-macros.h" > + > +# if CHAR_PER_VEC == 64 > + > +# define TAIL_RETURN_LBL first_vec_x2 > +# define TAIL_RETURN_OFFSET (CHAR_PER_VEC * 2) > + > +# define FALLTHROUGH_RETURN_LBL first_vec_x3 > +# define FALLTHROUGH_RETURN_OFFSET (CHAR_PER_VEC * 3) > + > +# else > + > +# define TAIL_RETURN_LBL first_vec_x3 > +# define TAIL_RETURN_OFFSET (CHAR_PER_VEC * 3) > + > +# define FALLTHROUGH_RETURN_LBL first_vec_x2 > +# define FALLTHROUGH_RETURN_OFFSET (CHAR_PER_VEC * 2) > # endif > + > +# define XZERO VMM_128(0) > +# define VZERO VMM(0) > +# define PAGE_SIZE 4096 > + > + .section SECTION(.text), "ax", @progbits > +ENTRY_P2ALIGN (STRLEN, 6) > movl %edi, %eax > - vpxorq %XMMZERO, %XMMZERO, %XMMZERO > - /* Clear high bits from edi. Only keeping bits relevant to page > - cross check. */ > + vpxorq %XZERO, %XZERO, %XZERO > andl $(PAGE_SIZE - 1), %eax > - /* Check if we may cross page boundary with one vector load. */ > cmpl $(PAGE_SIZE - VEC_SIZE), %eax > ja L(cross_page_boundary) > > /* Check the first VEC_SIZE bytes. Each bit in K0 represents a > null byte. */ > - VPCMP $0, (%rdi), %YMMZERO, %k0 > - kmovd %k0, %eax > -# ifdef USE_AS_STRNLEN > - /* If length < CHAR_PER_VEC handle special. */ > - cmpq $CHAR_PER_VEC, %rsi > - jbe L(first_vec_x0) > -# endif > - testl %eax, %eax > + VPCMPEQ (%rdi), %VZERO, %k0 > + KMOV %k0, %VRAX > + test %VRAX, %VRAX > jz L(aligned_more) > - tzcntl %eax, %eax > - ret > -# ifdef USE_AS_STRNLEN > -L(zero): > - xorl %eax, %eax > - ret > - > - .p2align 4 > -L(first_vec_x0): > - /* Set bit for max len so that tzcnt will return min of max len > - and position of first match. */ > - btsq %rsi, %rax > - tzcntl %eax, %eax > - ret > -# endif > - > - .p2align 4 > -L(first_vec_x1): > - tzcntl %eax, %eax > - /* Safe to use 32 bit instructions as these are only called for > - size = [1, 159]. 
*/ > -# ifdef USE_AS_STRNLEN > - /* Use ecx which was computed earlier to compute correct value. > - */ > - leal -(CHAR_PER_VEC * 4 + 1)(%rcx, %rax), %eax > -# else > - subl %edx, %edi > -# ifdef USE_AS_WCSLEN > - /* NB: Divide bytes by 4 to get the wchar_t count. */ > - sarl $2, %edi > -# endif > - leal CHAR_PER_VEC(%rdi, %rax), %eax > -# endif > - ret > - > - .p2align 4 > -L(first_vec_x2): > - tzcntl %eax, %eax > - /* Safe to use 32 bit instructions as these are only called for > - size = [1, 159]. */ > -# ifdef USE_AS_STRNLEN > - /* Use ecx which was computed earlier to compute correct value. > - */ > - leal -(CHAR_PER_VEC * 3 + 1)(%rcx, %rax), %eax > -# else > - subl %edx, %edi > -# ifdef USE_AS_WCSLEN > - /* NB: Divide bytes by 4 to get the wchar_t count. */ > - sarl $2, %edi > -# endif > - leal (CHAR_PER_VEC * 2)(%rdi, %rax), %eax > -# endif > + bsf %VRAX, %VRAX > ret > > - .p2align 4 > -L(first_vec_x3): > - tzcntl %eax, %eax > - /* Safe to use 32 bit instructions as these are only called for > - size = [1, 159]. */ > -# ifdef USE_AS_STRNLEN > - /* Use ecx which was computed earlier to compute correct value. > - */ > - leal -(CHAR_PER_VEC * 2 + 1)(%rcx, %rax), %eax > -# else > - subl %edx, %edi > -# ifdef USE_AS_WCSLEN > - /* NB: Divide bytes by 4 to get the wchar_t count. */ > - sarl $2, %edi > -# endif > - leal (CHAR_PER_VEC * 3)(%rdi, %rax), %eax > -# endif > - ret > - > - .p2align 4 > + .p2align 4,, 8 > L(first_vec_x4): > - tzcntl %eax, %eax > - /* Safe to use 32 bit instructions as these are only called for > - size = [1, 159]. */ > -# ifdef USE_AS_STRNLEN > - /* Use ecx which was computed earlier to compute correct value. > - */ > - leal -(CHAR_PER_VEC + 1)(%rcx, %rax), %eax > -# else > - subl %edx, %edi > -# ifdef USE_AS_WCSLEN > - /* NB: Divide bytes by 4 to get the wchar_t count. 
*/ > - sarl $2, %edi > -# endif > + bsf %VRAX, %VRAX > + subl %ecx, %edi > + CHAR_SIZE_SHIFT_REG (edi) > leal (CHAR_PER_VEC * 4)(%rdi, %rax), %eax > -# endif > ret > > - .p2align 5 > + > + > + /* Aligned more for strnlen compares remaining length vs 2 * > + CHAR_PER_VEC, 4 * CHAR_PER_VEC, and 8 * CHAR_PER_VEC before > + going to the loop. */ > + .p2align 4,, 10 > L(aligned_more): > - movq %rdi, %rdx > - /* Align data to VEC_SIZE. */ > - andq $-(VEC_SIZE), %rdi > + movq %rdi, %rcx > + andq $(VEC_SIZE * -1), %rdi > L(cross_page_continue): > - /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time > - since data is only aligned to VEC_SIZE. */ > -# ifdef USE_AS_STRNLEN > - /* + CHAR_SIZE because it simplies the logic in > - last_4x_vec_or_less. */ > - leaq (VEC_SIZE * 5 + CHAR_SIZE)(%rdi), %rcx > - subq %rdx, %rcx > -# ifdef USE_AS_WCSLEN > - /* NB: Divide bytes by 4 to get the wchar_t count. */ > - sarl $2, %ecx > -# endif > -# endif > - /* Load first VEC regardless. */ > - VPCMP $0, VEC_SIZE(%rdi), %YMMZERO, %k0 > -# ifdef USE_AS_STRNLEN > - /* Adjust length. If near end handle specially. */ > - subq %rcx, %rsi > - jb L(last_4x_vec_or_less) > -# endif > - kmovd %k0, %eax > - testl %eax, %eax > + /* Remaining length >= 2 * CHAR_PER_VEC so do VEC0/VEC1 without > + rechecking bounds. 
*/ > + VPCMPEQ (VEC_SIZE * 1)(%rdi), %VZERO, %k0 > + KMOV %k0, %VRAX > + test %VRAX, %VRAX > jnz L(first_vec_x1) > > - VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0 > - kmovd %k0, %eax > - test %eax, %eax > + VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %k0 > + KMOV %k0, %VRAX > + test %VRAX, %VRAX > jnz L(first_vec_x2) > > - VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0 > - kmovd %k0, %eax > - testl %eax, %eax > + VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %k0 > + KMOV %k0, %VRAX > + test %VRAX, %VRAX > jnz L(first_vec_x3) > > - VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0 > - kmovd %k0, %eax > - testl %eax, %eax > + VPCMPEQ (VEC_SIZE * 4)(%rdi), %VZERO, %k0 > + KMOV %k0, %VRAX > + test %VRAX, %VRAX > jnz L(first_vec_x4) > > - addq $VEC_SIZE, %rdi > -# ifdef USE_AS_STRNLEN > - /* Check if at last VEC_SIZE * 4 length. */ > - cmpq $(CHAR_PER_VEC * 4 - 1), %rsi > - jbe L(last_4x_vec_or_less_load) > - movl %edi, %ecx > - andl $(VEC_SIZE * 4 - 1), %ecx > -# ifdef USE_AS_WCSLEN > - /* NB: Divide bytes by 4 to get the wchar_t count. */ > - sarl $2, %ecx > -# endif > - /* Readjust length. */ > - addq %rcx, %rsi > -# endif > - /* Align data to VEC_SIZE * 4. */ > + subq $(VEC_SIZE * -1), %rdi > + > +# if CHAR_PER_VEC == 64 > + /* No partial register stalls on processors that we use evex512 > + on and this saves code size. */ > + xorb %dil, %dil > +# else > andq $-(VEC_SIZE * 4), %rdi > +# endif > + > + > > /* Compare 4 * VEC at a time forward. */ > .p2align 4 > L(loop_4x_vec): > - /* Load first VEC regardless. */ > - VMOVA (VEC_SIZE * 4)(%rdi), %YMM1 > -# ifdef USE_AS_STRNLEN > - /* Break if at end of length. */ > - subq $(CHAR_PER_VEC * 4), %rsi > - jb L(last_4x_vec_or_less_cmpeq) > -# endif > - /* Save some code size by microfusing VPMINU with the load. Since > - the matches in ymm2/ymm4 can only be returned if there where no > - matches in ymm1/ymm3 respectively there is no issue with overlap. 
> - */ > - VPMINU (VEC_SIZE * 5)(%rdi), %YMM1, %YMM2 > - VMOVA (VEC_SIZE * 6)(%rdi), %YMM3 > - VPMINU (VEC_SIZE * 7)(%rdi), %YMM3, %YMM4 > + VMOVA (VEC_SIZE * 4)(%rdi), %VMM(1) > + VPMINU (VEC_SIZE * 5)(%rdi), %VMM(1), %VMM(2) > + VMOVA (VEC_SIZE * 6)(%rdi), %VMM(3) > + VPMINU (VEC_SIZE * 7)(%rdi), %VMM(3), %VMM(4) > + VPTESTN %VMM(2), %VMM(2), %k0 > + VPTESTN %VMM(4), %VMM(4), %k2 > > - VPCMP $0, %YMM2, %YMMZERO, %k0 > - VPCMP $0, %YMM4, %YMMZERO, %k1 > subq $-(VEC_SIZE * 4), %rdi > - kortestd %k0, %k1 > + KORTEST %k0, %k2 > jz L(loop_4x_vec) > > - /* Check if end was in first half. */ > - kmovd %k0, %eax > - subq %rdx, %rdi > -# ifdef USE_AS_WCSLEN > - shrq $2, %rdi > -# endif > - testl %eax, %eax > - jz L(second_vec_return) > + VPTESTN %VMM(1), %VMM(1), %k1 > + KMOV %k1, %VRAX > + test %VRAX, %VRAX > + jnz L(first_vec_x0) > > - VPCMP $0, %YMM1, %YMMZERO, %k2 > - kmovd %k2, %edx > - /* Combine VEC1 matches (edx) with VEC2 matches (eax). */ > -# ifdef USE_AS_WCSLEN > - sall $CHAR_PER_VEC, %eax > - orl %edx, %eax > - tzcntl %eax, %eax > -# else > - salq $CHAR_PER_VEC, %rax > - orq %rdx, %rax > - tzcntq %rax, %rax > -# endif > - addq %rdi, %rax > - ret > - > - > -# ifdef USE_AS_STRNLEN > - > -L(last_4x_vec_or_less_load): > - /* Depending on entry adjust rdi / prepare first VEC in YMM1. */ > - VMOVA (VEC_SIZE * 4)(%rdi), %YMM1 > -L(last_4x_vec_or_less_cmpeq): > - VPCMP $0, %YMM1, %YMMZERO, %k0 > - addq $(VEC_SIZE * 3), %rdi > -L(last_4x_vec_or_less): > - kmovd %k0, %eax > - /* If remaining length > VEC_SIZE * 2. This works if esi is off by > - VEC_SIZE * 4. */ > - testl $(CHAR_PER_VEC * 2), %esi > - jnz L(last_4x_vec) > - > - /* length may have been negative or positive by an offset of > - CHAR_PER_VEC * 4 depending on where this was called from. This > - fixes that. */ > - andl $(CHAR_PER_VEC * 4 - 1), %esi > - testl %eax, %eax > - jnz L(last_vec_x1_check) > + KMOV %k0, %VRAX > + test %VRAX, %VRAX > + jnz L(first_vec_x1) > > - /* Check the end of data. 
*/ > - subl $CHAR_PER_VEC, %esi > - jb L(max) > + VPTESTN %VMM(3), %VMM(3), %k0 > > - VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0 > - kmovd %k0, %eax > - tzcntl %eax, %eax > - /* Check the end of data. */ > - cmpl %eax, %esi > - jb L(max) > - > - subq %rdx, %rdi > -# ifdef USE_AS_WCSLEN > - /* NB: Divide bytes by 4 to get the wchar_t count. */ > - sarq $2, %rdi > -# endif > - leaq (CHAR_PER_VEC * 2)(%rdi, %rax), %rax > - ret > -L(max): > - movq %r8, %rax > - ret > -# endif > - > - /* Placed here in strnlen so that the jcc L(last_4x_vec_or_less) > - in the 4x VEC loop can use 2 byte encoding. */ > - .p2align 4 > -L(second_vec_return): > - VPCMP $0, %YMM3, %YMMZERO, %k0 > - /* Combine YMM3 matches (k0) with YMM4 matches (k1). */ > -# ifdef USE_AS_WCSLEN > - kunpckbw %k0, %k1, %k0 > - kmovd %k0, %eax > - tzcntl %eax, %eax > +# if CHAR_PER_VEC == 64 > + KMOV %k0, %VRAX > + test %VRAX, %VRAX > + jnz L(first_vec_x2) > + KMOV %k2, %VRAX > # else > - kunpckdq %k0, %k1, %k0 > - kmovq %k0, %rax > - tzcntq %rax, %rax > + /* We can only combine last 2x VEC masks if CHAR_PER_VEC <= 32. > + */ > + kmovd %k2, %edx > + kmovd %k0, %eax > + salq $CHAR_PER_VEC, %rdx > + orq %rdx, %rax > # endif > - leaq (CHAR_PER_VEC * 2)(%rdi, %rax), %rax > - ret > > - > -# ifdef USE_AS_STRNLEN > -L(last_vec_x1_check): > - tzcntl %eax, %eax > - /* Check the end of data. */ > - cmpl %eax, %esi > - jb L(max) > - subq %rdx, %rdi > -# ifdef USE_AS_WCSLEN > - /* NB: Divide bytes by 4 to get the wchar_t count. */ > - sarq $2, %rdi > -# endif > - leaq (CHAR_PER_VEC)(%rdi, %rax), %rax > + /* first_vec_x3 for strlen-ZMM and first_vec_x2 for strlen-YMM. > + */ > + .p2align 4,, 2 > +L(FALLTHROUGH_RETURN_LBL): > + bsfq %rax, %rax > + subq %rcx, %rdi > + CHAR_SIZE_SHIFT_REG (rdi) > + leaq (FALLTHROUGH_RETURN_OFFSET)(%rdi, %rax), %rax > ret > > - .p2align 4 > -L(last_4x_vec): > - /* Test first 2x VEC normally. 
*/ > - testl %eax, %eax > - jnz L(last_vec_x1) > - > - VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0 > - kmovd %k0, %eax > - testl %eax, %eax > - jnz L(last_vec_x2) > - > - /* Normalize length. */ > - andl $(CHAR_PER_VEC * 4 - 1), %esi > - VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0 > - kmovd %k0, %eax > - testl %eax, %eax > - jnz L(last_vec_x3) > - > - /* Check the end of data. */ > - subl $(CHAR_PER_VEC * 3), %esi > - jb L(max) > - > - VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0 > - kmovd %k0, %eax > - tzcntl %eax, %eax > - /* Check the end of data. */ > - cmpl %eax, %esi > - jb L(max_end) > - > - subq %rdx, %rdi > -# ifdef USE_AS_WCSLEN > - /* NB: Divide bytes by 4 to get the wchar_t count. */ > - sarq $2, %rdi > -# endif > - leaq (CHAR_PER_VEC * 4)(%rdi, %rax), %rax > + .p2align 4,, 8 > +L(first_vec_x0): > + bsf %VRAX, %VRAX > + sub %rcx, %rdi > + CHAR_SIZE_SHIFT_REG (rdi) > + addq %rdi, %rax > ret > > - .p2align 4 > -L(last_vec_x1): > - tzcntl %eax, %eax > - subq %rdx, %rdi > -# ifdef USE_AS_WCSLEN > - /* NB: Divide bytes by 4 to get the wchar_t count. */ > - sarq $2, %rdi > -# endif > + .p2align 4,, 10 > +L(first_vec_x1): > + bsf %VRAX, %VRAX > + sub %rcx, %rdi > + CHAR_SIZE_SHIFT_REG (rdi) > leaq (CHAR_PER_VEC)(%rdi, %rax), %rax > ret > > - .p2align 4 > -L(last_vec_x2): > - tzcntl %eax, %eax > - subq %rdx, %rdi > -# ifdef USE_AS_WCSLEN > - /* NB: Divide bytes by 4 to get the wchar_t count. */ > - sarq $2, %rdi > -# endif > - leaq (CHAR_PER_VEC * 2)(%rdi, %rax), %rax > - ret > - > - .p2align 4 > -L(last_vec_x3): > - tzcntl %eax, %eax > - subl $(CHAR_PER_VEC * 2), %esi > - /* Check the end of data. */ > - cmpl %eax, %esi > - jb L(max_end) > - subq %rdx, %rdi > -# ifdef USE_AS_WCSLEN > - /* NB: Divide bytes by 4 to get the wchar_t count. */ > - sarq $2, %rdi > -# endif > - leaq (CHAR_PER_VEC * 3)(%rdi, %rax), %rax > - ret > -L(max_end): > - movq %r8, %rax > + .p2align 4,, 10 > + /* first_vec_x2 for strlen-ZMM and first_vec_x3 for strlen-YMM. 
> + */ > +L(TAIL_RETURN_LBL): > + bsf %VRAX, %VRAX > + sub %VRCX, %VRDI > + CHAR_SIZE_SHIFT_REG (VRDI) > + lea (TAIL_RETURN_OFFSET)(%rdi, %rax), %VRAX > ret > -# endif > > - /* Cold case for crossing page with first load. */ > - .p2align 4 > + .p2align 4,, 8 > L(cross_page_boundary): > - movq %rdi, %rdx > + movq %rdi, %rcx > /* Align data to VEC_SIZE. */ > andq $-VEC_SIZE, %rdi > - VPCMP $0, (%rdi), %YMMZERO, %k0 > - kmovd %k0, %eax > - /* Remove the leading bytes. */ > + > + VPCMPEQ (%rdi), %VZERO, %k0 > + > + KMOV %k0, %VRAX > # ifdef USE_AS_WCSLEN > - /* NB: Divide shift count by 4 since each bit in K0 represent 4 > - bytes. */ > - movl %edx, %ecx > - shrl $2, %ecx > - andl $(CHAR_PER_VEC - 1), %ecx > -# endif > - /* SHIFT_REG is ecx for USE_AS_WCSLEN and edx otherwise. */ > - sarxl %SHIFT_REG, %eax, %eax > + movl %ecx, %edx > + shrl $2, %edx > + andl $(CHAR_PER_VEC - 1), %edx > + shrx %edx, %eax, %eax > testl %eax, %eax > -# ifndef USE_AS_STRNLEN > - jz L(cross_page_continue) > - tzcntl %eax, %eax > - ret > # else > - jnz L(cross_page_less_vec) > -# ifndef USE_AS_WCSLEN > - movl %edx, %ecx > - andl $(CHAR_PER_VEC - 1), %ecx > -# endif > - movl $CHAR_PER_VEC, %eax > - subl %ecx, %eax > - /* Check the end of data. */ > - cmpq %rax, %rsi > - ja L(cross_page_continue) > - movl %esi, %eax > - ret > -L(cross_page_less_vec): > - tzcntl %eax, %eax > - /* Select min of length and position of first null. */ > - cmpq %rax, %rsi > - cmovb %esi, %eax > - ret > + shr %cl, %VRAX > # endif > + jz L(cross_page_continue) > + bsf %VRAX, %VRAX > + ret > > END (STRLEN) > #endif > diff --git a/sysdeps/x86_64/multiarch/strnlen-evex.S b/sysdeps/x86_64/multiarch/strnlen-evex.S > index 64a9fc2606..443a32a749 100644 > --- a/sysdeps/x86_64/multiarch/strnlen-evex.S > +++ b/sysdeps/x86_64/multiarch/strnlen-evex.S > @@ -1,8 +1,423 @@ > -#ifndef STRNLEN > -# define STRNLEN __strnlen_evex > -#endif > +/* strnlen/wcsnlen optimized with 256-bit EVEX instructions. 
> + Copyright (C) 2022 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +#include <isa-level.h> > +#include <sysdep.h> > + > +#if ISA_SHOULD_BUILD (4) > + > +# ifndef VEC_SIZE > +# include "x86-evex256-vecs.h" > +# endif > + > + > +# ifndef STRNLEN > +# define STRNLEN __strnlen_evex > +# endif > + > +# ifdef USE_AS_WCSLEN > +# define VPCMPEQ vpcmpeqd > +# define VPCMPNEQ vpcmpneqd > +# define VPTESTN vptestnmd > +# define VPTEST vptestmd > +# define VPMINU vpminud > +# define CHAR_SIZE 4 > + > +# else > +# define VPCMPEQ vpcmpeqb > +# define VPCMPNEQ vpcmpneqb > +# define VPTESTN vptestnmb > +# define VPTEST vptestmb > +# define VPMINU vpminub > +# define CHAR_SIZE 1 > + > +# define REG_WIDTH VEC_SIZE > +# endif > + > +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) > + > +# include "reg-macros.h" > + > +# if CHAR_PER_VEC == 32 > +# define SUB_SHORT(imm, reg) subb $(imm), %VGPR_SZ(reg, 8) > +# else > +# define SUB_SHORT(imm, reg) subl $(imm), %VGPR_SZ(reg, 32) > +# endif > + > + > + > +# if CHAR_PER_VEC == 64 > +# define FALLTHROUGH_RETURN_OFFSET (CHAR_PER_VEC * 3) > +# else > +# define FALLTHROUGH_RETURN_OFFSET (CHAR_PER_VEC * 2) > +# endif > + > + > +# define XZERO VMM_128(0) > +# define VZERO VMM(0) > +# define PAGE_SIZE 4096 > + > + .section SECTION(.text), "ax", @progbits > +ENTRY_P2ALIGN 
(STRNLEN, 6) > + /* Check zero length. */ > + test %RSI_LP, %RSI_LP > + jz L(zero) > +# ifdef __ILP32__ > + /* Clear the upper 32 bits. */ > + movl %esi, %esi > +# endif > + > + movl %edi, %eax > + vpxorq %XZERO, %XZERO, %XZERO > + andl $(PAGE_SIZE - 1), %eax > + cmpl $(PAGE_SIZE - VEC_SIZE), %eax > + ja L(cross_page_boundary) > + > + /* Check the first VEC_SIZE bytes. Each bit in K0 represents a > + null byte. */ > + VPCMPEQ (%rdi), %VZERO, %k0 > + > + KMOV %k0, %VRCX > + movq %rsi, %rax > + > + /* If src (rcx) is zero, bsf does not change the result. NB: > + Must use 64-bit bsf here so that upper bits of len are not > + cleared. */ > + bsfq %rcx, %rax > + /* If rax > CHAR_PER_VEC then rcx must have been zero (no null > + CHAR) and rsi must be > CHAR_PER_VEC. */ > + cmpq $CHAR_PER_VEC, %rax > + ja L(more_1x_vec) > + /* Check if first match in bounds. */ > + cmpq %rax, %rsi > + cmovb %esi, %eax > + ret > + > + > +# if CHAR_PER_VEC != 32 > + .p2align 4,, 2 > +L(zero): > +L(max_0): > + movl %esi, %eax > + ret > +# endif > + > + /* Aligned more for strnlen compares remaining length vs 2 * > + CHAR_PER_VEC, 4 * CHAR_PER_VEC, and 8 * CHAR_PER_VEC before > + going to the loop. */ > + .p2align 4,, 10 > +L(more_1x_vec): > +L(cross_page_continue): > + /* Compute number of words checked after aligning. */ > +# ifdef USE_AS_WCSLEN > + /* Need to compute directly for wcslen as CHAR_SIZE * rsi can > + overflow. */ > + movq %rdi, %rax > + andq $(VEC_SIZE * -1), %rdi > + subq %rdi, %rax > + sarq $2, %rax > + leaq -(CHAR_PER_VEC * 1)(%rax, %rsi), %rax > +# else > + leaq (VEC_SIZE * -1)(%rsi, %rdi), %rax > + andq $(VEC_SIZE * -1), %rdi > + subq %rdi, %rax > +# endif > + > + > + VPCMPEQ VEC_SIZE(%rdi), %VZERO, %k0 > + > + cmpq $(CHAR_PER_VEC * 2), %rax > + ja L(more_2x_vec) > + > +L(last_2x_vec_or_less): > + KMOV %k0, %VRDX > + test %VRDX, %VRDX > + jnz L(last_vec_check) > + > + /* Check the end of data. 
*/ > + SUB_SHORT (CHAR_PER_VEC, rax) > + jbe L(max_0) > + VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %k0 > + KMOV %k0, %VRDX > + test %VRDX, %VRDX > + jz L(max_0) > + /* Best place for LAST_VEC_CHECK if ZMM. */ > + .p2align 4,, 8 > +L(last_vec_check): > + bsf %VRDX, %VRDX > + sub %eax, %edx > + lea (%rsi, %rdx), %eax > + cmovae %esi, %eax > + ret > + > +# if CHAR_PER_VEC == 32 > + .p2align 4,, 2 > +L(zero): > +L(max_0): > + movl %esi, %eax > + ret > +# endif > + > + .p2align 4,, 8 > +L(last_4x_vec_or_less): > + addl $(CHAR_PER_VEC * -4), %eax > + VPCMPEQ (VEC_SIZE * 5)(%rdi), %VZERO, %k0 > + subq $(VEC_SIZE * -4), %rdi > + cmpl $(CHAR_PER_VEC * 2), %eax > + jbe L(last_2x_vec_or_less) > + > + .p2align 4,, 6 > +L(more_2x_vec): > + /* Remaining length >= 2 * CHAR_PER_VEC so do VEC0/VEC1 without > + rechecking bounds. */ > > -#define USE_AS_STRNLEN 1 > -#define STRLEN STRNLEN > + KMOV %k0, %VRDX > > -#include "strlen-evex.S" > + test %VRDX, %VRDX > + jnz L(first_vec_x1) > + > + VPCMPEQ (VEC_SIZE * 2)(%rdi), %VZERO, %k0 > + KMOV %k0, %VRDX > + test %VRDX, %VRDX > + jnz L(first_vec_x2) > + > + cmpq $(CHAR_PER_VEC * 4), %rax > + ja L(more_4x_vec) > + > + > + VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %k0 > + KMOV %k0, %VRDX > + addl $(CHAR_PER_VEC * -2), %eax > + test %VRDX, %VRDX > + jnz L(last_vec_check) > + > + subl $(CHAR_PER_VEC), %eax > + jbe L(max_1) > + > + VPCMPEQ (VEC_SIZE * 4)(%rdi), %VZERO, %k0 > + KMOV %k0, %VRDX > + > + test %VRDX, %VRDX > + jnz L(last_vec_check) > +L(max_1): > + movl %esi, %eax > + ret > + > + .p2align 4,, 3 > +L(first_vec_x2): > +# if VEC_SIZE == 64 > + /* If VEC_SIZE == 64 we can fit logic for full return label in > + spare bytes before next cache line. 
*/ > + bsf %VRDX, %VRDX > + sub %eax, %esi > + leal (CHAR_PER_VEC * 1)(%rsi, %rdx), %eax > + ret > + .p2align 4,, 6 > +# else > + addl $CHAR_PER_VEC, %esi > +# endif > +L(first_vec_x1): > + bsf %VRDX, %VRDX > + sub %eax, %esi > + leal (CHAR_PER_VEC * 0)(%rsi, %rdx), %eax > + ret > + > + > + .p2align 4,, 6 > +L(first_vec_x4): > +# if VEC_SIZE == 64 > + /* If VEC_SIZE == 64 we can fit logic for full return label in > + spare bytes before next cache line. */ > + bsf %VRDX, %VRDX > + sub %eax, %esi > + leal (CHAR_PER_VEC * 3)(%rsi, %rdx), %eax > + ret > + .p2align 4,, 6 > +# else > + addl $CHAR_PER_VEC, %esi > +# endif > +L(first_vec_x3): > + bsf %VRDX, %VRDX > + sub %eax, %esi > + leal (CHAR_PER_VEC * 2)(%rsi, %rdx), %eax > + ret > + > + .p2align 4,, 5 > +L(more_4x_vec): > + VPCMPEQ (VEC_SIZE * 3)(%rdi), %VZERO, %k0 > + KMOV %k0, %VRDX > + test %VRDX, %VRDX > + jnz L(first_vec_x3) > + > + VPCMPEQ (VEC_SIZE * 4)(%rdi), %VZERO, %k0 > + KMOV %k0, %VRDX > + test %VRDX, %VRDX > + jnz L(first_vec_x4) > + > + /* Check if at last VEC_SIZE * 4 length before aligning for the > + loop. */ > + cmpq $(CHAR_PER_VEC * 8), %rax > + jbe L(last_4x_vec_or_less) > + > + > + /* Compute number of words checked after aligning. */ > +# ifdef USE_AS_WCSLEN > + /* Need to compute directly for wcslen as CHAR_SIZE * rsi can > + overflow. */ > + leaq (VEC_SIZE * -3)(%rdi), %rdx > +# else > + leaq (VEC_SIZE * -3)(%rdi, %rax), %rax > +# endif > + > + subq $(VEC_SIZE * -1), %rdi > + > + /* Align data to VEC_SIZE * 4. */ > +# if VEC_SIZE == 64 > + /* Saves code size. No evex512 processor has partial register > + stalls. If that change this can be replaced with `andq > + $-(VEC_SIZE * 4), %rdi`. */ > + xorb %dil, %dil > +# else > + andq $-(VEC_SIZE * 4), %rdi > +# endif > + > +# ifdef USE_AS_WCSLEN > + subq %rdi, %rdx > + sarq $2, %rdx > + addq %rdx, %rax > +# else > + subq %rdi, %rax > +# endif > + /* Compare 4 * VEC at a time forward. 
*/ > + .p2align 4,, 11 > +L(loop_4x_vec): > + VMOVA (VEC_SIZE * 4)(%rdi), %VMM(1) > + VPMINU (VEC_SIZE * 5)(%rdi), %VMM(1), %VMM(2) > + VMOVA (VEC_SIZE * 6)(%rdi), %VMM(3) > + VPMINU (VEC_SIZE * 7)(%rdi), %VMM(3), %VMM(4) > + VPTESTN %VMM(2), %VMM(2), %k0 > + VPTESTN %VMM(4), %VMM(4), %k2 > + subq $-(VEC_SIZE * 4), %rdi > + /* Break if at end of length. */ > + subq $(CHAR_PER_VEC * 4), %rax > + jbe L(loop_len_end) > + > + > + KORTEST %k0, %k2 > + jz L(loop_4x_vec) > + > + > +L(loop_last_4x_vec): > + movq %rsi, %rcx > + subq %rax, %rsi > + VPTESTN %VMM(1), %VMM(1), %k1 > + KMOV %k1, %VRDX > + test %VRDX, %VRDX > + jnz L(last_vec_x0) > + > + KMOV %k0, %VRDX > + test %VRDX, %VRDX > + jnz L(last_vec_x1) > + > + VPTESTN %VMM(3), %VMM(3), %k0 > + > + /* Seperate logic for VEC_SIZE == 64 and VEC_SIZE == 32 for > + returning last 2x VEC. For VEC_SIZE == 64 we test each VEC > + individually, for VEC_SIZE == 32 we combine them in a single > + 64-bit GPR. */ > +# if CHAR_PER_VEC == 64 > + KMOV %k0, %VRDX > + test %VRDX, %VRDX > + jnz L(last_vec_x2) > + KMOV %k2, %VRDX > +# else > + /* We can only combine last 2x VEC masks if CHAR_PER_VEC <= 32. > + */ > + kmovd %k2, %edx > + kmovd %k0, %eax > + salq $CHAR_PER_VEC, %rdx > + orq %rax, %rdx > +# endif > + > + /* first_vec_x3 for strlen-ZMM and first_vec_x2 for strlen-YMM. > + */ > + bsfq %rdx, %rdx > + leaq (FALLTHROUGH_RETURN_OFFSET - CHAR_PER_VEC * 4)(%rsi, %rdx), %rax > + cmpq %rax, %rcx > + cmovb %rcx, %rax > + ret > + > + /* Handle last 4x VEC after loop. All VECs have been loaded. */ > + .p2align 4,, 4 > +L(loop_len_end): > + KORTEST %k0, %k2 > + jnz L(loop_last_4x_vec) > + movq %rsi, %rax > + ret > + > + > +# if CHAR_PER_VEC == 64 > + /* Since we can't combine the last 2x VEC for VEC_SIZE == 64 > + need return label for it. 
*/ > + .p2align 4,, 8 > +L(last_vec_x2): > + bsf %VRDX, %VRDX > + leaq (CHAR_PER_VEC * -2)(%rsi, %rdx), %rax > + cmpq %rax, %rcx > + cmovb %rcx, %rax > + ret > +# endif > + > + > + .p2align 4,, 10 > +L(last_vec_x1): > + addq $CHAR_PER_VEC, %rsi > +L(last_vec_x0): > + bsf %VRDX, %VRDX > + leaq (CHAR_PER_VEC * -4)(%rsi, %rdx), %rax > + cmpq %rax, %rcx > + cmovb %rcx, %rax > + ret > + > + > + .p2align 4,, 8 > +L(cross_page_boundary): > + /* Align data to VEC_SIZE. */ > + movq %rdi, %rcx > + andq $-VEC_SIZE, %rcx > + VPCMPEQ (%rcx), %VZERO, %k0 > + > + KMOV %k0, %VRCX > +# ifdef USE_AS_WCSLEN > + shrl $2, %eax > + andl $(CHAR_PER_VEC - 1), %eax > +# endif > + shrx %VRAX, %VRCX, %VRCX > + > + negl %eax > + andl $(CHAR_PER_VEC - 1), %eax > + movq %rsi, %rdx > + bsf %VRCX, %VRDX > + cmpq %rax, %rdx > + ja L(cross_page_continue) > + movl %edx, %eax > + cmpq %rdx, %rsi > + cmovb %esi, %eax > + ret > +END (STRNLEN) > +#endif > diff --git a/sysdeps/x86_64/multiarch/wcsnlen-evex.S b/sysdeps/x86_64/multiarch/wcsnlen-evex.S > index e2aad94c1e..57a7e93fbf 100644 > --- a/sysdeps/x86_64/multiarch/wcsnlen-evex.S > +++ b/sysdeps/x86_64/multiarch/wcsnlen-evex.S > @@ -2,8 +2,7 @@ > # define WCSNLEN __wcsnlen_evex > #endif > > -#define STRLEN WCSNLEN > +#define STRNLEN WCSNLEN > #define USE_AS_WCSLEN 1 > -#define USE_AS_STRNLEN 1 > > -#include "strlen-evex.S" > +#include "strnlen-evex.S" > -- > 2.34.1 >