On Mon, Oct 17, 2022 at 7:49 PM Noah Goldstein wrote: > > Optimizations are: > > 1. Use the fact that tzcnt(0) -> VEC_SIZE for memchr to save a branch > in the short string case. > 2. Restructure code so that small strings are given the hot path. > - This is a net-zero on the benchmark suite but in general makes > sense as smaller sizes are far more common. > 3. Use more code-size efficient instructions. > - tzcnt ... -> bsf ... > - vpcmpb $0 ... -> vpcmpeq ... > 4. Align labels less aggressively, especially if it doesn't save fetch > blocks / causes the basic block to span extra cache lines. > > The optimizations (especially for point 2) make the memchr and > rawmemchr code essentially incompatible so split rawmemchr-evex > out into a new file. > > Code Size Changes: > memchr-evex.S : -107 bytes > rawmemchr-evex.S : -53 bytes > > Net perf changes: > > Reported as the geometric mean of all improvements / regressions from N=10 > runs of the benchtests. Value is New Time / Old Time, so < 1.0 is > an improvement and above 1.0 is a regression. > > memchr-evex.S : 0.928 > rawmemchr-evex.S : 0.986 (Fewer targets cross cache lines) > > Full results attached in email. > > Full check passes on x86-64. > --- > sysdeps/x86_64/multiarch/memchr-evex.S | 939 ++++++++++-------- > sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S | 9 +- > sysdeps/x86_64/multiarch/rawmemchr-evex.S | 313 +++++- > 3 files changed, 851 insertions(+), 410 deletions(-) > > diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S > index 0dd4f1dcce..23a1c0018e 100644 > --- a/sysdeps/x86_64/multiarch/memchr-evex.S > +++ b/sysdeps/x86_64/multiarch/memchr-evex.S > @@ -21,17 +21,27 @@ > > #if ISA_SHOULD_BUILD (4) > > +# ifndef VEC_SIZE > +# include "x86-evex256-vecs.h" > +# endif > + > # ifndef MEMCHR > # define MEMCHR __memchr_evex > # endif > > # ifdef USE_AS_WMEMCHR > +# define PC_SHIFT_GPR rcx > +# define VPTESTN vptestnmd > # define VPBROADCAST vpbroadcastd > # define VPMINU vpminud > # define VPCMP vpcmpd > # define VPCMPEQ vpcmpeqd > # define CHAR_SIZE 4 > + > +# define USE_WIDE_CHAR > # else > +# define PC_SHIFT_GPR rdi > +# define VPTESTN vptestnmb > # define VPBROADCAST vpbroadcastb > # define VPMINU vpminub > # define VPCMP vpcmpb > @@ -39,534 +49,661 @@ > # define CHAR_SIZE 1 > # endif > > - /* In the 4x loop the RTM and non-RTM versions have data pointer > - off by VEC_SIZE * 4 with RTM version being VEC_SIZE * 4 greater. > - This is represented by BASE_OFFSET. As well because the RTM > - version uses vpcmp which stores a bit per element compared where > - the non-RTM version uses vpcmpeq which stores a bit per byte > - compared RET_SCALE of CHAR_SIZE is only relevant for the RTM > - version. */ > -# ifdef USE_IN_RTM > +# include "reg-macros.h" > + > + > +/* If not in an RTM and VEC_SIZE != 64 (the VEC_SIZE = 64 > + doesn't have VEX encoding), use VEX encoding in loop so we > + can use vpcmpeqb + vptern which is more efficient than the > + EVEX alternative.
*/ > +# if defined USE_IN_RTM || VEC_SIZE == 64 > +# undef COND_VZEROUPPER > +# undef VZEROUPPER_RETURN > +# undef VZEROUPPER > + > +# define COND_VZEROUPPER > +# define VZEROUPPER_RETURN ret > # define VZEROUPPER > -# define BASE_OFFSET (VEC_SIZE * 4) > -# define RET_SCALE CHAR_SIZE > + > +# define USE_TERN_IN_LOOP 0 > # else > +# define USE_TERN_IN_LOOP 1 > +# undef VZEROUPPER > # define VZEROUPPER vzeroupper > -# define BASE_OFFSET 0 > -# define RET_SCALE 1 > # endif > > - /* In the return from 4x loop memchr and rawmemchr versions have > - data pointers off by VEC_SIZE * 4 with memchr version being > - VEC_SIZE * 4 greater. */ > -# ifdef USE_AS_RAWMEMCHR > -# define RET_OFFSET (BASE_OFFSET - (VEC_SIZE * 4)) > -# define RAW_PTR_REG rcx > -# define ALGN_PTR_REG rdi > +# if USE_TERN_IN_LOOP > + /* Resulting bitmask for vpmovmskb has 4-bits set for each wchar > + so we don't want to multiply resulting index. */ > +# define TERN_CHAR_MULT 1 > + > +# ifdef USE_AS_WMEMCHR > +# define TEST_END() inc %VRCX > +# else > +# define TEST_END() add %rdx, %rcx > +# endif > # else > -# define RET_OFFSET BASE_OFFSET > -# define RAW_PTR_REG rdi > -# define ALGN_PTR_REG rcx > +# define TERN_CHAR_MULT CHAR_SIZE > +# define TEST_END() KORTEST %k2, %k3 > # endif > > -# define XMMZERO xmm23 > -# define YMMZERO ymm23 > -# define XMMMATCH xmm16 > -# define YMMMATCH ymm16 > -# define YMM1 ymm17 > -# define YMM2 ymm18 > -# define YMM3 ymm19 > -# define YMM4 ymm20 > -# define YMM5 ymm21 > -# define YMM6 ymm22 > +# if defined USE_AS_WMEMCHR || !USE_TERN_IN_LOOP > +# ifndef USE_AS_WMEMCHR > +# define GPR_X0_IS_RET 1 > +# else > +# define GPR_X0_IS_RET 0 > +# endif > +# define GPR_X0 rax > +# else > +# define GPR_X0_IS_RET 0 > +# define GPR_X0 rdx > +# endif > + > +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) > > -# ifndef SECTION > -# define SECTION(p) p##.evex > +# if CHAR_PER_VEC == 64 > +# define LAST_VEC_OFFSET (VEC_SIZE * 3) > +# else > +# define LAST_VEC_OFFSET (VEC_SIZE * 2) > +# endif > +# if CHAR_PER_VEC >= 32 > +# define MASK_GPR(...) VGPR(__VA_ARGS__) > +# elif CHAR_PER_VEC == 16 > +# define MASK_GPR(reg) VGPR_SZ(reg, 16) > +# else > +# define MASK_GPR(reg) VGPR_SZ(reg, 8) > # endif > > -# define VEC_SIZE 32 > -# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) > -# define PAGE_SIZE 4096 > +# define VMATCH VMM(0) > +# define VMATCH_LO VMM_lo(0) > > - .section SECTION(.text),"ax",@progbits > +# define PAGE_SIZE 4096 > + > + > + .section SECTION(.text), "ax", @progbits > ENTRY_P2ALIGN (MEMCHR, 6) > -# ifndef USE_AS_RAWMEMCHR > /* Check for zero length. */ > test %RDX_LP, %RDX_LP > - jz L(zero) > + jz L(zero_0) > > -# ifdef __ILP32__ > +# ifdef __ILP32__ > /* Clear the upper 32 bits. */ > movl %edx, %edx > -# endif > # endif > - /* Broadcast CHAR to YMMMATCH. */ > - VPBROADCAST %esi, %YMMMATCH > + VPBROADCAST %esi, %VMATCH > /* Check if we may cross page boundary with one vector load. */ > movl %edi, %eax > andl $(PAGE_SIZE - 1), %eax > cmpl $(PAGE_SIZE - VEC_SIZE), %eax > - ja L(cross_page_boundary) > + ja L(page_cross) > + > + VPCMPEQ (%rdi), %VMATCH, %k0 > + KMOV %k0, %VRAX > +# ifndef USE_AS_WMEMCHR > + /* If rcx is zero then tzcnt -> CHAR_PER_VEC. NB: there is a > + already a dependency between rcx and rsi so no worries about > + false-dep here. */ > + tzcnt %VRAX, %VRSI > + /* If rdx <= rsi then either 1) rcx was non-zero (there was a > + match) but it was out of bounds or 2) rcx was zero and rdx > + was <= VEC_SIZE so we are done scanning. 
*/ > + cmpq %rsi, %rdx > + /* NB: Use branch to return zero/non-zero. Common usage will > + branch on result of function (if return is null/non-null). > + This branch can be used to predict the ensuing one so there > + is no reason to extend the data-dependency with cmovcc. */ > + jbe L(zero_0) > + > + /* If rcx is zero then len must be > RDX, otherwise since we > + already tested len vs lzcnt(rcx) (in rsi) we are good to > + return this match. */ > + test %VRAX, %VRAX > + jz L(more_1x_vec) > + leaq (%rdi, %rsi), %rax > +# else > > - /* Check the first VEC_SIZE bytes. */ > - VPCMP $0, (%rdi), %YMMMATCH, %k0 > - kmovd %k0, %eax > -# ifndef USE_AS_RAWMEMCHR > - /* If length < CHAR_PER_VEC handle special. */ > + /* We can't use the `tzcnt` trick for wmemchr because CHAR_SIZE > + > 1 so if rcx is tzcnt != CHAR_PER_VEC. */ > cmpq $CHAR_PER_VEC, %rdx > - jbe L(first_vec_x0) > -# endif > - testl %eax, %eax > - jz L(aligned_more) > - tzcntl %eax, %eax > -# ifdef USE_AS_WMEMCHR > - /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ > + ja L(more_1x_vec) > + tzcnt %VRAX, %VRAX > + cmpl %eax, %edx > + jbe L(zero_0) > +L(first_vec_x0_ret): > leaq (%rdi, %rax, CHAR_SIZE), %rax > -# else > - addq %rdi, %rax > # endif > ret > > -# ifndef USE_AS_RAWMEMCHR > -L(zero): > - xorl %eax, %eax > - ret > - > - .p2align 4 > -L(first_vec_x0): > - /* Check if first match was before length. NB: tzcnt has false data- > - dependency on destination. eax already had a data-dependency on esi > - so this should have no affect here. */ > - tzcntl %eax, %esi > -# ifdef USE_AS_WMEMCHR > - leaq (%rdi, %rsi, CHAR_SIZE), %rdi > -# else > - addq %rsi, %rdi > -# endif > + /* Only fits in first cache line for VEC_SIZE == 32. */ > +# if VEC_SIZE == 32 > + .p2align 4,, 2 > +L(zero_0): > xorl %eax, %eax > - cmpl %esi, %edx > - cmovg %rdi, %rax > ret > # endif > > - .p2align 4 > -L(cross_page_boundary): > - /* Save pointer before aligning as its original value is > - necessary for computer return address if byte is found or > - adjusting length if it is not and this is memchr. */ > - movq %rdi, %rcx > - /* Align data to VEC_SIZE. ALGN_PTR_REG is rcx for memchr and rdi > - for rawmemchr. */ > - andq $-VEC_SIZE, %ALGN_PTR_REG > - VPCMP $0, (%ALGN_PTR_REG), %YMMMATCH, %k0 > - kmovd %k0, %r8d > + .p2align 4,, 9 > +L(more_1x_vec): > # ifdef USE_AS_WMEMCHR > - /* NB: Divide shift count by 4 since each bit in K0 represent 4 > - bytes. */ > - sarl $2, %eax > -# endif > -# ifndef USE_AS_RAWMEMCHR > - movl $(PAGE_SIZE / CHAR_SIZE), %esi > - subl %eax, %esi > + /* If wmemchr still need to test if there was a match in first > + VEC. Use bsf to test here so we can reuse > + L(first_vec_x0_ret). */ > + bsf %VRAX, %VRAX > + jnz L(first_vec_x0_ret) > # endif > + > +L(page_cross_continue): > # ifdef USE_AS_WMEMCHR > - andl $(CHAR_PER_VEC - 1), %eax > -# endif > - /* Remove the leading bytes. */ > - sarxl %eax, %r8d, %eax > -# ifndef USE_AS_RAWMEMCHR > - /* Check the end of data. */ > - cmpq %rsi, %rdx > - jbe L(first_vec_x0) > + /* We can't use end of the buffer to re-calculate length for > + wmemchr as len * CHAR_SIZE may overflow. */ > + leaq -(VEC_SIZE + CHAR_SIZE)(%rdi), %rax > + andq $(VEC_SIZE * -1), %rdi > + subq %rdi, %rax > + sarq $2, %rax > + addq %rdx, %rax > +# else > + leaq -(VEC_SIZE + 1)(%rdx, %rdi), %rax > + andq $(VEC_SIZE * -1), %rdi > + subq %rdi, %rax > # endif > - testl %eax, %eax > - jz L(cross_page_continue) > - tzcntl %eax, %eax > + > + /* rax contains remaining length - 1. 
-1 so we can get imm8 > + encoding in a few additional places saving code size. */ > + > + /* Needed regardless of remaining length. */ > + VPCMPEQ VEC_SIZE(%rdi), %VMATCH, %k0 > + KMOV %k0, %VRDX > + > + /* We cannot fold the above `sub %rdi, %rax` with the `cmp > + $(CHAR_PER_VEC * 2), %rax` because its possible for a very > + large length to overflow and cause the subtract to carry > + despite length being above CHAR_PER_VEC * 2. */ > + cmpq $(CHAR_PER_VEC * 2 - 1), %rax > + ja L(more_2x_vec) > +L(last_2x_vec): > + > + test %VRDX, %VRDX > + jnz L(first_vec_x1_check) > + > + /* Check the end of data. NB: use 8-bit operations to save code > + size. We no longer need the full-width of eax and will > + perform a write-only operation over eax so there will be no > + partial-register stalls. */ > + subb $(CHAR_PER_VEC * 1 - 1), %al > + jle L(zero_0) > + > + VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMATCH, %k0 > + KMOV %k0, %VRCX > # ifdef USE_AS_WMEMCHR > - /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ > - leaq (%RAW_PTR_REG, %rax, CHAR_SIZE), %rax > + /* For wmemchr against we can't take advantage of tzcnt(0) == > + VEC_SIZE as CHAR_PER_VEC != VEC_SIZE. */ > + test %VRCX, %VRCX > + jz L(zero_0) > +# endif > + tzcnt %VRCX, %VRCX > + cmp %cl, %al > + > + /* Same CFG for VEC_SIZE == 64 and VEC_SIZE == 32. We give > + fallthrough to L(zero_0) for VEC_SIZE == 64 here as there is > + not enough space before the next cache line to fit the `lea` > + for return. */ > +# if VEC_SIZE == 64 > + ja L(first_vec_x2_ret) > +L(zero_0): > + xorl %eax, %eax > + ret > # else > - addq %RAW_PTR_REG, %rax > + jbe L(zero_0) > + leaq (VEC_SIZE * 2)(%rdi, %rcx, CHAR_SIZE), %rax > + ret > # endif > + > + .p2align 4,, 5 > +L(first_vec_x1_check): > + bsf %VRDX, %VRDX > + cmpb %dl, %al > + jb L(zero_4) > + leaq (VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %rax > ret > > - .p2align 4 > -L(first_vec_x1): > - tzcntl %eax, %eax > - leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax > + /* Fits at the end of the cache line here for VEC_SIZE == 32. > + */ > +# if VEC_SIZE == 32 > +L(zero_4): > + xorl %eax, %eax > ret > +# endif > > - .p2align 4 > + > + .p2align 4,, 4 > L(first_vec_x2): > - tzcntl %eax, %eax > - leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax > + bsf %VRCX, %VRCX > +L(first_vec_x2_ret): > + leaq (VEC_SIZE * 2)(%rdi, %rcx, CHAR_SIZE), %rax > ret > > - .p2align 4 > -L(first_vec_x3): > - tzcntl %eax, %eax > - leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax > + /* Fits at the end of the cache line here for VEC_SIZE == 64. > + */ > +# if VEC_SIZE == 64 > +L(zero_4): > + xorl %eax, %eax > ret > +# endif > > - .p2align 4 > -L(first_vec_x4): > - tzcntl %eax, %eax > - leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax > + .p2align 4,, 4 > +L(first_vec_x1): > + bsf %VRDX, %VRDX > + leaq (VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %rax > ret > > - .p2align 5 > -L(aligned_more): > - /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time > - since data is only aligned to VEC_SIZE. */ > > -# ifndef USE_AS_RAWMEMCHR > - /* Align data to VEC_SIZE. */ > -L(cross_page_continue): > - xorl %ecx, %ecx > - subl %edi, %ecx > - andq $-VEC_SIZE, %rdi > - /* esi is for adjusting length to see if near the end. */ > - leal (VEC_SIZE * 5)(%rdi, %rcx), %esi > -# ifdef USE_AS_WMEMCHR > - /* NB: Divide bytes by 4 to get the wchar_t count. */ > - sarl $2, %esi > -# endif > -# else > - andq $-VEC_SIZE, %rdi > -L(cross_page_continue): > -# endif > - /* Load first VEC regardless. 
*/ > - VPCMP $0, (VEC_SIZE)(%rdi), %YMMMATCH, %k0 > - kmovd %k0, %eax > -# ifndef USE_AS_RAWMEMCHR > - /* Adjust length. If near end handle specially. */ > - subq %rsi, %rdx > - jbe L(last_4x_vec_or_less) > -# endif > - testl %eax, %eax > + .p2align 4,, 5 > +L(more_2x_vec): > + /* Length > VEC_SIZE * 2 so check first 2x VEC before rechecking > + length. */ > + > + > + /* Already computed matches for first VEC in rdx. */ > + test %VRDX, %VRDX > jnz L(first_vec_x1) > > - VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0 > - kmovd %k0, %eax > - testl %eax, %eax > + > + VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMATCH, %k0 > + KMOV %k0, %VRCX > + test %VRCX, %VRCX > jnz L(first_vec_x2) > > - VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0 > - kmovd %k0, %eax > - testl %eax, %eax > + /* Needed regardless of next length check. */ > + VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMATCH, %k0 > + KMOV %k0, %VRCX > + > + /* Check if we are near the end. */ > + cmpq $(CHAR_PER_VEC * 4 - 1), %rax > + ja L(more_4x_vec) > + > + test %VRCX, %VRCX > + jnz L(first_vec_x3_check) > + > + /* Use 8-bit instructions to save code size. We won't use full- > + width eax again and will perform a write-only operation to > + eax so no worries about partial-register stalls. */ > + subb $(CHAR_PER_VEC * 3), %al > + jb L(zero_2) > +L(last_vec_check): > + VPCMPEQ (VEC_SIZE * 4)(%rdi), %VMATCH, %k0 > + KMOV %k0, %VRCX > +# ifdef USE_AS_WMEMCHR > + /* For wmemchr against we can't take advantage of tzcnt(0) == > + VEC_SIZE as CHAR_PER_VEC != VEC_SIZE. */ > + test %VRCX, %VRCX > + jz L(zero_2) > +# endif > + tzcnt %VRCX, %VRCX > + cmp %cl, %al > + jae L(first_vec_x4_ret) > +L(zero_2): > + xorl %eax, %eax > + ret > + > + /* Fits at the end of the cache line here for VEC_SIZE == 64. > + For VEC_SIZE == 32 we put the return label at the end of > + L(first_vec_x4). */ > +# if VEC_SIZE == 64 > +L(first_vec_x4_ret): > + leaq (VEC_SIZE * 4)(%rdi, %rcx, CHAR_SIZE), %rax > + ret > +# endif > + > + .p2align 4,, 6 > +L(first_vec_x4): > + bsf %VRCX, %VRCX > +# if VEC_SIZE == 32 > + /* Place L(first_vec_x4_ret) here as we can't fit it in the same > + cache line as where it is called from so we might as well > + save code size by reusing return of L(first_vec_x4). */ > +L(first_vec_x4_ret): > +# endif > + leaq (VEC_SIZE * 4)(%rdi, %rcx, CHAR_SIZE), %rax > + ret > + > + .p2align 4,, 6 > +L(first_vec_x3_check): > + /* Need to adjust remaining length before checking. */ > + addb $-(CHAR_PER_VEC * 2), %al > + bsf %VRCX, %VRCX > + cmpb %cl, %al > + jb L(zero_2) > + leaq (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %rax > + ret > + > + .p2align 4,, 6 > +L(first_vec_x3): > + bsf %VRCX, %VRCX > + leaq (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %rax > + ret > + > + .p2align 4,, 3 > +# if !USE_TERN_IN_LOOP > + .p2align 4,, 10 > +# endif > +L(more_4x_vec): > + test %VRCX, %VRCX > jnz L(first_vec_x3) > > - VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0 > - kmovd %k0, %eax > - testl %eax, %eax > + VPCMPEQ (VEC_SIZE * 4)(%rdi), %VMATCH, %k0 > + KMOV %k0, %VRCX > + test %VRCX, %VRCX > jnz L(first_vec_x4) > > + subq $-(VEC_SIZE * 5), %rdi > + subq $(CHAR_PER_VEC * 8), %rax > + jb L(last_4x_vec) > > -# ifndef USE_AS_RAWMEMCHR > - /* Check if at last CHAR_PER_VEC * 4 length. */ > - subq $(CHAR_PER_VEC * 4), %rdx > - jbe L(last_4x_vec_or_less_cmpeq) > - /* +VEC_SIZE if USE_IN_RTM otherwise +VEC_SIZE * 5. */ > - addq $(VEC_SIZE + (VEC_SIZE * 4 - BASE_OFFSET)), %rdi > - > - /* Align data to VEC_SIZE * 4 for the loop and readjust length. 
> - */ > -# ifdef USE_AS_WMEMCHR > +# ifdef USE_AS_WMEMCHR > movl %edi, %ecx > - andq $-(4 * VEC_SIZE), %rdi > +# else > + addq %rdi, %rax > +# endif > + > + > +# if VEC_SIZE == 64 > + /* use xorb to do `andq $-(VEC_SIZE * 4), %rdi`. No evex > + processor has partial register stalls (all have merging > + uop). If that changes this can be removed. */ > + xorb %dil, %dil > +# else > + andq $-(VEC_SIZE * 4), %rdi > +# endif > + > +# ifdef USE_AS_WMEMCHR > subl %edi, %ecx > - /* NB: Divide bytes by 4 to get the wchar_t count. */ > sarl $2, %ecx > - addq %rcx, %rdx > -# else > - addq %rdi, %rdx > - andq $-(4 * VEC_SIZE), %rdi > - subq %rdi, %rdx > -# endif > + addq %rcx, %rax > # else > - addq $(VEC_SIZE + (VEC_SIZE * 4 - BASE_OFFSET)), %rdi > - andq $-(4 * VEC_SIZE), %rdi > + subq %rdi, %rax > # endif > -# ifdef USE_IN_RTM > - vpxorq %XMMZERO, %XMMZERO, %XMMZERO > -# else > - /* copy ymmmatch to ymm0 so we can use vpcmpeq which is not > - encodable with EVEX registers (ymm16-ymm31). */ > - vmovdqa64 %YMMMATCH, %ymm0 > + > + > + > +# if USE_TERN_IN_LOOP > + /* copy VMATCH to low ymm so we can use vpcmpeq which is not > + encodable with EVEX registers. NB: this is VEC_SIZE == 32 > + only as there is no way to encode vpcmpeq with zmm0-15. */ > + vmovdqa64 %VMATCH, %VMATCH_LO > # endif > > - /* Compare 4 * VEC at a time forward. */ > - .p2align 4 > + .p2align 4,, 11 > L(loop_4x_vec): > - /* Two versions of the loop. One that does not require > - vzeroupper by not using ymm0-ymm15 and another does that require > - vzeroupper because it uses ymm0-ymm15. The reason why ymm0-ymm15 > - is used at all is because there is no EVEX encoding vpcmpeq and > - with vpcmpeq this loop can be performed more efficiently. The > - non-vzeroupper version is safe for RTM while the vzeroupper > - version should be prefered if RTM are not supported. */ > -# ifdef USE_IN_RTM > - /* It would be possible to save some instructions using 4x VPCMP > - but bottleneck on port 5 makes it not woth it. */ > - VPCMP $4, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k1 > - /* xor will set bytes match esi to zero. */ > - vpxorq (VEC_SIZE * 5)(%rdi), %YMMMATCH, %YMM2 > - vpxorq (VEC_SIZE * 6)(%rdi), %YMMMATCH, %YMM3 > - VPCMP $0, (VEC_SIZE * 7)(%rdi), %YMMMATCH, %k3 > - /* Reduce VEC2 / VEC3 with min and VEC1 with zero mask. */ > - VPMINU %YMM2, %YMM3, %YMM3{%k1}{z} > - VPCMP $0, %YMM3, %YMMZERO, %k2 > -# else > + /* Two versions of the loop. One that does not require > + vzeroupper by not using ymmm0-15 and another does that > + require vzeroupper because it uses ymmm0-15. The reason why > + ymm0-15 is used at all is because there is no EVEX encoding > + vpcmpeq and with vpcmpeq this loop can be performed more > + efficiently. The non-vzeroupper version is safe for RTM > + while the vzeroupper version should be prefered if RTM are > + not supported. Which loop version we use is determined by > + USE_TERN_IN_LOOP. */ > + > +# if USE_TERN_IN_LOOP > /* Since vptern can only take 3x vectors fastest to do 1 vec > seperately with EVEX vpcmp. */ > # ifdef USE_AS_WMEMCHR > /* vptern can only accept masks for epi32/epi64 so can only save > - instruction using not equals mask on vptern with wmemchr. */ > - VPCMP $4, (%rdi), %YMMMATCH, %k1 > + instruction using not equals mask on vptern with wmemchr. > + */ > + VPCMP $4, (VEC_SIZE * 0)(%rdi), %VMATCH, %k1 > # else > - VPCMP $0, (%rdi), %YMMMATCH, %k1 > + VPCMPEQ (VEC_SIZE * 0)(%rdi), %VMATCH, %k1 > # endif > /* Compare 3x with vpcmpeq and or them all together with vptern. 
> */ > - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm2 > - VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm3 > - VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm4 > + VPCMPEQ (VEC_SIZE * 1)(%rdi), %VMATCH_LO, %VMM_lo(2) > + VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMATCH_LO, %VMM_lo(3) > + VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMATCH_LO, %VMM_lo(4) > # ifdef USE_AS_WMEMCHR > - /* This takes the not of or between ymm2, ymm3, ymm4 as well as > - combines result from VEC0 with zero mask. */ > - vpternlogd $1, %ymm2, %ymm3, %ymm4{%k1}{z} > - vpmovmskb %ymm4, %ecx > + /* This takes the not of or between VEC_lo(2), VEC_lo(3), > + VEC_lo(4) as well as combines result from VEC(0) with zero > + mask. */ > + vpternlogd $1, %VMM_lo(2), %VMM_lo(3), %VMM_lo(4){%k1}{z} > + vpmovmskb %VMM_lo(4), %VRCX > # else > - /* 254 is mask for oring ymm2, ymm3, ymm4 into ymm4. */ > - vpternlogd $254, %ymm2, %ymm3, %ymm4 > - vpmovmskb %ymm4, %ecx > - kmovd %k1, %eax > + /* 254 is mask for oring VEC_lo(2), VEC_lo(3), VEC_lo(4) into > + VEC_lo(4). */ > + vpternlogd $254, %VMM_lo(2), %VMM_lo(3), %VMM_lo(4) > + vpmovmskb %VMM_lo(4), %VRCX > + KMOV %k1, %edx > # endif > -# endif > > -# ifdef USE_AS_RAWMEMCHR > - subq $-(VEC_SIZE * 4), %rdi > -# endif > -# ifdef USE_IN_RTM > - kortestd %k2, %k3 > # else > -# ifdef USE_AS_WMEMCHR > - /* ecx contains not of matches. All 1s means no matches. incl will > - overflow and set zeroflag if that is the case. */ > - incl %ecx > -# else > - /* If either VEC1 (eax) or VEC2-VEC4 (ecx) are not zero. Adding > - to ecx is not an issue because if eax is non-zero it will be > - used for returning the match. If it is zero the add does > - nothing. */ > - addq %rax, %rcx > -# endif > + /* Loop version that uses EVEX encoding. */ > + VPCMP $4, (VEC_SIZE * 0)(%rdi), %VMATCH, %k1 > + vpxorq (VEC_SIZE * 1)(%rdi), %VMATCH, %VMM(2) > + vpxorq (VEC_SIZE * 2)(%rdi), %VMATCH, %VMM(3) > + VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMATCH, %k3 > + VPMINU %VMM(2), %VMM(3), %VMM(3){%k1}{z} > + VPTESTN %VMM(3), %VMM(3), %k2 > # endif > -# ifdef USE_AS_RAWMEMCHR > - jz L(loop_4x_vec) > -# else > - jnz L(loop_4x_vec_end) > + > + > + TEST_END () > + jnz L(loop_vec_ret) > > subq $-(VEC_SIZE * 4), %rdi > > - subq $(CHAR_PER_VEC * 4), %rdx > - ja L(loop_4x_vec) > + subq $(CHAR_PER_VEC * 4), %rax > + jae L(loop_4x_vec) > > - /* Fall through into less than 4 remaining vectors of length case. > + /* COND_VZEROUPPER is vzeroupper if we use the VEX encoded loop. > */ > - VPCMP $0, BASE_OFFSET(%rdi), %YMMMATCH, %k0 > - addq $(BASE_OFFSET - VEC_SIZE), %rdi > - kmovd %k0, %eax > - VZEROUPPER > - > -L(last_4x_vec_or_less): > - /* Check if first VEC contained match. */ > - testl %eax, %eax > - jnz L(first_vec_x1_check) > + COND_VZEROUPPER > > - /* If remaining length > CHAR_PER_VEC * 2. */ > - addl $(CHAR_PER_VEC * 2), %edx > - jg L(last_4x_vec) > - > -L(last_2x_vec): > - /* If remaining length < CHAR_PER_VEC. */ > - addl $CHAR_PER_VEC, %edx > - jle L(zero_end) > - > - /* Check VEC2 and compare any match with remaining length. */ > - VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0 > - kmovd %k0, %eax > - tzcntl %eax, %eax > - cmpl %eax, %edx > - jbe L(set_zero_end) > - leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax > -L(zero_end): > - ret > + .p2align 4,, 10 > +L(last_4x_vec): > + /* For CHAR_PER_VEC == 64 we don't need to mask as we use 8-bit > + instructions on eax from here on out. 
*/ > +# if CHAR_PER_VEC != 64 > + andl $(CHAR_PER_VEC * 4 - 1), %eax > +# endif > + VPCMPEQ (VEC_SIZE * 0)(%rdi), %VMATCH, %k0 > + subq $(VEC_SIZE * 1), %rdi > + KMOV %k0, %VRDX > + cmpb $(CHAR_PER_VEC * 2 - 1), %al > + jbe L(last_2x_vec) > + test %VRDX, %VRDX > + jnz L(last_vec_x1_novzero) > + > + VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMATCH, %k0 > + KMOV %k0, %VRDX > + test %VRDX, %VRDX > + jnz L(last_vec_x2_novzero) > + > + VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMATCH, %k0 > + KMOV %k0, %VRCX > + test %VRCX, %VRCX > + jnz L(first_vec_x3_check) > + > + subb $(CHAR_PER_VEC * 3), %al > + jae L(last_vec_check) > > -L(set_zero_end): > xorl %eax, %eax > ret > > - .p2align 4 > -L(first_vec_x1_check): > - /* eax must be non-zero. Use bsfl to save code size. */ > - bsfl %eax, %eax > - /* Adjust length. */ > - subl $-(CHAR_PER_VEC * 4), %edx > - /* Check if match within remaining length. */ > - cmpl %eax, %edx > - jbe L(set_zero_end) > - /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ > - leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax > +# if defined USE_AS_WMEMCHR && USE_TERN_IN_LOOP > +L(last_vec_x2_novzero): > + addq $VEC_SIZE, %rdi > +L(last_vec_x1_novzero): > + bsf %VRDX, %VRDX > + leaq (VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %rax > ret > +# endif > > - .p2align 4 > -L(loop_4x_vec_end): > +# if CHAR_PER_VEC == 64 > + /* Since we can't combine the last 2x VEC when CHAR_PER_VEC == > + 64 it needs a seperate return label. */ > + .p2align 4,, 4 > +L(last_vec_x2): > +L(last_vec_x2_novzero): > + bsf %VRDX, %VRDX > + leaq (VEC_SIZE * 2)(%rdi, %rdx, TERN_CHAR_MULT), %rax > + ret > # endif > - /* rawmemchr will fall through into this if match was found in > - loop. */ > > -# if defined USE_IN_RTM || defined USE_AS_WMEMCHR > - /* k1 has not of matches with VEC1. */ > - kmovd %k1, %eax > -# ifdef USE_AS_WMEMCHR > - subl $((1 << CHAR_PER_VEC) - 1), %eax > -# else > - incl %eax > -# endif > + .p2align 4,, 4 > +L(loop_vec_ret): > +# if defined USE_AS_WMEMCHR || !USE_TERN_IN_LOOP > + KMOV %k1, %VRAX > + inc %MASK_GPR(rax) > # else > - /* eax already has matches for VEC1. */ > - testl %eax, %eax > + test %VRDX, %VRDX > # endif > - jnz L(last_vec_x1_return) > + jnz L(last_vec_x0) > > -# ifdef USE_IN_RTM > - VPCMP $0, %YMM2, %YMMZERO, %k0 > - kmovd %k0, %eax > + > +# if USE_TERN_IN_LOOP > + vpmovmskb %VMM_lo(2), %VRDX > # else > - vpmovmskb %ymm2, %eax > + VPTESTN %VMM(2), %VMM(2), %k1 > + KMOV %k1, %VRDX > # endif > - testl %eax, %eax > - jnz L(last_vec_x2_return) > + test %VRDX, %VRDX > + jnz L(last_vec_x1) > > -# ifdef USE_IN_RTM > - kmovd %k2, %eax > - testl %eax, %eax > - jnz L(last_vec_x3_return) > > - kmovd %k3, %eax > - tzcntl %eax, %eax > - leaq (VEC_SIZE * 3 + RET_OFFSET)(%rdi, %rax, CHAR_SIZE), %rax > +# if USE_TERN_IN_LOOP > + vpmovmskb %VMM_lo(3), %VRDX > # else > - vpmovmskb %ymm3, %eax > - /* Combine matches in VEC3 (eax) with matches in VEC4 (ecx). */ > - salq $VEC_SIZE, %rcx > - orq %rcx, %rax > - tzcntq %rax, %rax > - leaq (VEC_SIZE * 2 + RET_OFFSET)(%rdi, %rax), %rax > - VZEROUPPER > + KMOV %k2, %VRDX > # endif > - ret > > - .p2align 4,, 10 > -L(last_vec_x1_return): > - tzcntl %eax, %eax > -# if defined USE_AS_WMEMCHR || RET_OFFSET != 0 > - /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ > - leaq RET_OFFSET(%rdi, %rax, CHAR_SIZE), %rax > + /* No longer need any of the lo vecs (ymm0-15) so vzeroupper > + (only if used VEX encoded loop). */ > + COND_VZEROUPPER > + > + /* Seperate logic for CHAR_PER_VEC == 64 vs the rest. 
For > + CHAR_PER_VEC we test the last 2x VEC seperately, for > + CHAR_PER_VEC <= 32 we can combine the results from the 2x > + VEC in a single GPR. */ > +# if CHAR_PER_VEC == 64 > +# if USE_TERN_IN_LOOP > +# error "Unsupported" > +# endif > + > + > + /* If CHAR_PER_VEC == 64 we can't combine the last two VEC. */ > + test %VRDX, %VRDX > + jnz L(last_vec_x2) > + KMOV %k3, %VRDX > # else > - addq %rdi, %rax > + /* CHAR_PER_VEC <= 32 so we can combine the results from the > + last 2x VEC. */ > + > +# if !USE_TERN_IN_LOOP > + KMOV %k3, %VRCX > +# endif > + salq $(VEC_SIZE / TERN_CHAR_MULT), %rcx > + addq %rcx, %rdx > +# if !defined USE_AS_WMEMCHR || !USE_TERN_IN_LOOP > +L(last_vec_x2_novzero): > +# endif > # endif > - VZEROUPPER > + bsf %rdx, %rdx > + leaq (LAST_VEC_OFFSET)(%rdi, %rdx, TERN_CHAR_MULT), %rax > ret > > - .p2align 4 > -L(last_vec_x2_return): > - tzcntl %eax, %eax > - /* NB: Multiply bytes by RET_SCALE to get the wchar_t count > - if relevant (RET_SCALE = CHAR_SIZE if USE_AS_WMEMCHAR and > - USE_IN_RTM are both defined. Otherwise RET_SCALE = 1. */ > - leaq (VEC_SIZE + RET_OFFSET)(%rdi, %rax, RET_SCALE), %rax > - VZEROUPPER > + .p2align 4,, 8 > +L(last_vec_x1): > + COND_VZEROUPPER > +# if !defined USE_AS_WMEMCHR || !USE_TERN_IN_LOOP > +L(last_vec_x1_novzero): > +# endif > + bsf %VRDX, %VRDX > + leaq (VEC_SIZE * 1)(%rdi, %rdx, TERN_CHAR_MULT), %rax > ret > > -# ifdef USE_IN_RTM > - .p2align 4 > -L(last_vec_x3_return): > - tzcntl %eax, %eax > - /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ > - leaq (VEC_SIZE * 2 + RET_OFFSET)(%rdi, %rax, CHAR_SIZE), %rax > + > + .p2align 4,, 4 > +L(last_vec_x0): > + COND_VZEROUPPER > + bsf %VGPR(GPR_X0), %VGPR(GPR_X0) > +# if GPR_X0_IS_RET > + addq %rdi, %rax > +# else > + leaq (%rdi, %GPR_X0, CHAR_SIZE), %rax > +# endif > ret > + > + .p2align 4,, 6 > +L(page_cross): > + /* Need to preserve eax to compute inbound bytes we are > + checking. */ > +# ifdef USE_AS_WMEMCHR > + movl %eax, %ecx > +# else > + xorl %ecx, %ecx > + subl %eax, %ecx > # endif > > -# ifndef USE_AS_RAWMEMCHR > - .p2align 4,, 5 > -L(last_4x_vec_or_less_cmpeq): > - VPCMP $0, (VEC_SIZE * 5)(%rdi), %YMMMATCH, %k0 > - kmovd %k0, %eax > - subq $-(VEC_SIZE * 4), %rdi > - /* Check first VEC regardless. */ > - testl %eax, %eax > - jnz L(first_vec_x1_check) > + xorq %rdi, %rax > + VPCMPEQ (PAGE_SIZE - VEC_SIZE)(%rax), %VMATCH, %k0 > + KMOV %k0, %VRAX > > - /* If remaining length <= CHAR_PER_VEC * 2. */ > - addl $(CHAR_PER_VEC * 2), %edx > - jle L(last_2x_vec) > +# ifdef USE_AS_WMEMCHR > + /* NB: Divide by CHAR_SIZE to shift out out of bounds bytes. */ > + shrl $2, %ecx > + andl $(CHAR_PER_VEC - 1), %ecx > +# endif > > - .p2align 4 > -L(last_4x_vec): > - VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0 > - kmovd %k0, %eax > - testl %eax, %eax > - jnz L(last_vec_x2) > > + shrx %VGPR(PC_SHIFT_GPR), %VRAX, %VRAX > > - VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0 > - kmovd %k0, %eax > - /* Create mask for possible matches within remaining length. */ > -# ifdef USE_AS_WMEMCHR > - movl $((1 << (CHAR_PER_VEC * 2)) - 1), %ecx > - bzhil %edx, %ecx, %ecx > -# else > - movq $-1, %rcx > - bzhiq %rdx, %rcx, %rcx > -# endif > - /* Test matches in data against length match. */ > - andl %ecx, %eax > - jnz L(last_vec_x3) > +# ifdef USE_AS_WMEMCHR > + negl %ecx > +# endif > > - /* if remaining length <= CHAR_PER_VEC * 3 (Note this is after > - remaining length was found to be > CHAR_PER_VEC * 2. 
*/ > - subl $CHAR_PER_VEC, %edx > - jbe L(zero_end2) > + /* mask lower bits from ecx (negative eax) to get bytes till > + next VEC. */ > + andl $(CHAR_PER_VEC - 1), %ecx > > + /* Check if VEC is entirely contained in the remainder of the > + page. */ > + cmpq %rcx, %rdx > + jbe L(page_cross_ret) > > - VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0 > - kmovd %k0, %eax > - /* Shift remaining length mask for last VEC. */ > -# ifdef USE_AS_WMEMCHR > - shrl $CHAR_PER_VEC, %ecx > -# else > - shrq $CHAR_PER_VEC, %rcx > -# endif > - andl %ecx, %eax > - jz L(zero_end2) > - bsfl %eax, %eax > - leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax > -L(zero_end2): > - ret > + /* Length crosses the page so if rax is zero (no matches) > + continue. */ > + test %VRAX, %VRAX > + jz L(page_cross_continue) > > -L(last_vec_x2): > - tzcntl %eax, %eax > - leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax > + /* if rdx > rcx then any match here must be in [buf:buf + len]. > + */ > + tzcnt %VRAX, %VRAX > +# ifdef USE_AS_WMEMCHR > + leaq (%rdi, %rax, CHAR_SIZE), %rax > +# else > + addq %rdi, %rax > +# endif > ret > > - .p2align 4 > -L(last_vec_x3): > - tzcntl %eax, %eax > - leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax > + .p2align 4,, 2 > +L(page_cross_zero): > + xorl %eax, %eax > ret > + > + .p2align 4,, 4 > +L(page_cross_ret): > + /* Search is entirely contained in page cross case. */ > +# ifdef USE_AS_WMEMCHR > + test %VRAX, %VRAX > + jz L(page_cross_zero) > +# endif > + tzcnt %VRAX, %VRAX > + cmpl %eax, %edx > + jbe L(page_cross_zero) > +# ifdef USE_AS_WMEMCHR > + leaq (%rdi, %rax, CHAR_SIZE), %rax > +# else > + addq %rdi, %rax > # endif > - /* 7 bytes from next cache line. */ > + ret > END (MEMCHR) > #endif > diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S b/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S > index deda1ca395..2073eaa620 100644 > --- a/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S > +++ b/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S > @@ -1,3 +1,6 @@ > -#define MEMCHR __rawmemchr_evex_rtm > -#define USE_AS_RAWMEMCHR 1 > -#include "memchr-evex-rtm.S" > +#define RAWMEMCHR __rawmemchr_evex_rtm > + > +#define USE_IN_RTM 1 > +#define SECTION(p) p##.evex.rtm > + > +#include "rawmemchr-evex.S" > diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex.S b/sysdeps/x86_64/multiarch/rawmemchr-evex.S > index dc1c450699..dad54def2b 100644 > --- a/sysdeps/x86_64/multiarch/rawmemchr-evex.S > +++ b/sysdeps/x86_64/multiarch/rawmemchr-evex.S > @@ -1,7 +1,308 @@ > -#ifndef RAWMEMCHR > -# define RAWMEMCHR __rawmemchr_evex > -#endif > -#define USE_AS_RAWMEMCHR 1 > -#define MEMCHR RAWMEMCHR > +/* rawmemchr optimized with 256-bit EVEX instructions. > + Copyright (C) 2022 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + . 
*/ > + > +#include > +#include > + > +#if ISA_SHOULD_BUILD (4) > + > +# ifndef VEC_SIZE > +# include "x86-evex256-vecs.h" > +# endif > + > +# ifndef RAWMEMCHR > +# define RAWMEMCHR __rawmemchr_evex > +# endif > + > + > +# define PC_SHIFT_GPR rdi > +# define REG_WIDTH VEC_SIZE > +# define VPTESTN vptestnmb > +# define VPBROADCAST vpbroadcastb > +# define VPMINU vpminub > +# define VPCMP vpcmpb > +# define VPCMPEQ vpcmpeqb > +# define CHAR_SIZE 1 > + > +# include "reg-macros.h" > + > +/* If not in an RTM and VEC_SIZE != 64 (the VEC_SIZE = 64 > + doesn't have VEX encoding), use VEX encoding in loop so we > + can use vpcmpeqb + vptern which is more efficient than the > + EVEX alternative. */ > +# if defined USE_IN_RTM || VEC_SIZE == 64 > +# undef COND_VZEROUPPER > +# undef VZEROUPPER_RETURN > +# undef VZEROUPPER > + > + > +# define COND_VZEROUPPER > +# define VZEROUPPER_RETURN ret > +# define VZEROUPPER > + > +# define USE_TERN_IN_LOOP 0 > +# else > +# define USE_TERN_IN_LOOP 1 > +# undef VZEROUPPER > +# define VZEROUPPER vzeroupper > +# endif > + > +# define CHAR_PER_VEC VEC_SIZE > + > +# if CHAR_PER_VEC == 64 > + > +# define TAIL_RETURN_LBL first_vec_x2 > +# define TAIL_RETURN_OFFSET (CHAR_PER_VEC * 2) > + > +# define FALLTHROUGH_RETURN_LBL first_vec_x3 > +# define FALLTHROUGH_RETURN_OFFSET (CHAR_PER_VEC * 3) > + > +# else /* !(CHAR_PER_VEC == 64) */ > + > +# define TAIL_RETURN_LBL first_vec_x3 > +# define TAIL_RETURN_OFFSET (CHAR_PER_VEC * 3) > + > +# define FALLTHROUGH_RETURN_LBL first_vec_x2 > +# define FALLTHROUGH_RETURN_OFFSET (CHAR_PER_VEC * 2) > +# endif /* !(CHAR_PER_VEC == 64) */ > + > + > +# define VMATCH VMM(0) > +# define VMATCH_LO VMM_lo(0) > + > +# define PAGE_SIZE 4096 > + > + .section SECTION(.text), "ax", @progbits > +ENTRY_P2ALIGN (RAWMEMCHR, 6) > + VPBROADCAST %esi, %VMATCH > + /* Check if we may cross page boundary with one vector load. */ > + movl %edi, %eax > + andl $(PAGE_SIZE - 1), %eax > + cmpl $(PAGE_SIZE - VEC_SIZE), %eax > + ja L(page_cross) > + > + VPCMPEQ (%rdi), %VMATCH, %k0 > + KMOV %k0, %VRAX > + > + test %VRAX, %VRAX > + jz L(aligned_more) > +L(first_vec_x0): > + bsf %VRAX, %VRAX > + addq %rdi, %rax > + ret > + > + .p2align 4,, 4 > +L(first_vec_x4): > + bsf %VRAX, %VRAX > + leaq (VEC_SIZE * 4)(%rdi, %rax), %rax > + ret > > -#include "memchr-evex.S" > + /* For VEC_SIZE == 32 we can fit this in aligning bytes so might > + as well place it more locally. For VEC_SIZE == 64 we reuse > + return code at the end of loop's return. */ > +# if VEC_SIZE == 32 > + .p2align 4,, 4 > +L(FALLTHROUGH_RETURN_LBL): > + bsf %VRAX, %VRAX > + leaq (FALLTHROUGH_RETURN_OFFSET)(%rdi, %rax), %rax > + ret > +# endif > + > + .p2align 4,, 6 > +L(page_cross): > + /* eax has lower page-offset bits of rdi so xor will zero them > + out. */ > + xorq %rdi, %rax > + VPCMPEQ (PAGE_SIZE - VEC_SIZE)(%rax), %VMATCH, %k0 > + KMOV %k0, %VRAX > + > + /* Shift out out-of-bounds matches. */ > + shrx %VRDI, %VRAX, %VRAX > + test %VRAX, %VRAX > + jnz L(first_vec_x0) > + > + .p2align 4,, 10 > +L(aligned_more): > +L(page_cross_continue): > + /* Align pointer. 
*/ > + andq $(VEC_SIZE * -1), %rdi > + > + VPCMPEQ VEC_SIZE(%rdi), %VMATCH, %k0 > + KMOV %k0, %VRAX > + test %VRAX, %VRAX > + jnz L(first_vec_x1) > + > + VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMATCH, %k0 > + KMOV %k0, %VRAX > + test %VRAX, %VRAX > + jnz L(first_vec_x2) > + > + VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMATCH, %k0 > + KMOV %k0, %VRAX > + test %VRAX, %VRAX > + jnz L(first_vec_x3) > + > + VPCMPEQ (VEC_SIZE * 4)(%rdi), %VMATCH, %k0 > + KMOV %k0, %VRAX > + test %VRAX, %VRAX > + jnz L(first_vec_x4) > + > + subq $-(VEC_SIZE * 1), %rdi > +# if VEC_SIZE == 64 > + /* Saves code size. No evex512 processor has partial register > + stalls. If that change this can be replaced with `andq > + $-(VEC_SIZE * 4), %rdi`. */ > + xorb %dil, %dil > +# else > + andq $-(VEC_SIZE * 4), %rdi > +# endif > + > +# if USE_TERN_IN_LOOP > + /* copy VMATCH to low ymm so we can use vpcmpeq which is not > + encodable with EVEX registers. NB: this is VEC_SIZE == 32 > + only as there is no way to encode vpcmpeq with zmm0-15. */ > + vmovdqa64 %VMATCH, %VMATCH_LO > +# endif > + > + .p2align 4 > +L(loop_4x_vec): > + /* Two versions of the loop. One that does not require > + vzeroupper by not using ymm0-15 and another does that > + require vzeroupper because it uses ymm0-15. The reason why > + ymm0-15 is used at all is because there is no EVEX encoding > + vpcmpeq and with vpcmpeq this loop can be performed more > + efficiently. The non-vzeroupper version is safe for RTM > + while the vzeroupper version should be prefered if RTM are > + not supported. Which loop version we use is determined by > + USE_TERN_IN_LOOP. */ > + > +# if USE_TERN_IN_LOOP > + /* Since vptern can only take 3x vectors fastest to do 1 vec > + seperately with EVEX vpcmp. */ > + VPCMPEQ (VEC_SIZE * 4)(%rdi), %VMATCH, %k1 > + /* Compare 3x with vpcmpeq and or them all together with vptern. > + */ > + > + VPCMPEQ (VEC_SIZE * 5)(%rdi), %VMATCH_LO, %VMM_lo(2) > + subq $(VEC_SIZE * -4), %rdi > + VPCMPEQ (VEC_SIZE * 2)(%rdi), %VMATCH_LO, %VMM_lo(3) > + VPCMPEQ (VEC_SIZE * 3)(%rdi), %VMATCH_LO, %VMM_lo(4) > + > + /* 254 is mask for oring VEC_lo(2), VEC_lo(3), VEC_lo(4) into > + VEC_lo(4). */ > + vpternlogd $254, %VMM_lo(2), %VMM_lo(3), %VMM_lo(4) > + vpmovmskb %VMM_lo(4), %VRCX > + > + KMOV %k1, %eax > + > + /* NB: rax has match from first VEC and rcx has matches from > + VEC 2-4. If rax is non-zero we will return that match. If > + rax is zero adding won't disturb the bits in rcx. */ > + add %rax, %rcx > +# else > + /* Loop version that uses EVEX encoding. */ > + VPCMP $4, (VEC_SIZE * 4)(%rdi), %VMATCH, %k1 > + vpxorq (VEC_SIZE * 5)(%rdi), %VMATCH, %VMM(2) > + vpxorq (VEC_SIZE * 6)(%rdi), %VMATCH, %VMM(3) > + VPCMPEQ (VEC_SIZE * 7)(%rdi), %VMATCH, %k3 > + VPMINU %VMM(2), %VMM(3), %VMM(3){%k1}{z} > + VPTESTN %VMM(3), %VMM(3), %k2 > + subq $(VEC_SIZE * -4), %rdi > + KORTEST %k2, %k3 > +# endif > + jz L(loop_4x_vec) > + > +# if USE_TERN_IN_LOOP > + test %VRAX, %VRAX > +# else > + KMOV %k1, %VRAX > + inc %VRAX > +# endif > + jnz L(last_vec_x0) > + > + > +# if USE_TERN_IN_LOOP > + vpmovmskb %VMM_lo(2), %VRAX > +# else > + VPTESTN %VMM(2), %VMM(2), %k1 > + KMOV %k1, %VRAX > +# endif > + test %VRAX, %VRAX > + jnz L(last_vec_x1) > + > + > +# if USE_TERN_IN_LOOP > + vpmovmskb %VMM_lo(3), %VRAX > +# else > + KMOV %k2, %VRAX > +# endif > + > + /* No longer need any of the lo vecs (ymm0-15) so vzeroupper > + (only if used VEX encoded loop). */ > + COND_VZEROUPPER > + > + /* Seperate logic for VEC_SIZE == 64 and VEC_SIZE == 32 for > + returning last 2x VEC. 
For VEC_SIZE == 64 we test each VEC > + individually, for VEC_SIZE == 32 we combine them in a single > + 64-bit GPR. */ > +# if CHAR_PER_VEC == 64 > +# if USE_TERN_IN_LOOP > +# error "Unsupported" > +# endif > + > + > + /* If CHAR_PER_VEC == 64 we can't combine the last two VEC. */ > + test %VRAX, %VRAX > + jnz L(first_vec_x2) > + KMOV %k3, %VRAX > +L(FALLTHROUGH_RETURN_LBL): > +# else > + /* CHAR_PER_VEC <= 32 so we can combine the results from the > + last 2x VEC. */ > +# if !USE_TERN_IN_LOOP > + KMOV %k3, %VRCX > +# endif > + salq $CHAR_PER_VEC, %rcx > + addq %rcx, %rax > +# endif > + bsf %rax, %rax > + leaq (FALLTHROUGH_RETURN_OFFSET)(%rdi, %rax), %rax > + ret > + > + .p2align 4,, 8 > +L(TAIL_RETURN_LBL): > + bsf %rax, %rax > + leaq (TAIL_RETURN_OFFSET)(%rdi, %rax), %rax > + ret > + > + .p2align 4,, 8 > +L(last_vec_x1): > + COND_VZEROUPPER > +L(first_vec_x1): > + bsf %VRAX, %VRAX > + leaq (VEC_SIZE * 1)(%rdi, %rax), %rax > + ret > + > + .p2align 4,, 8 > +L(last_vec_x0): > + COND_VZEROUPPER > + bsf %VRAX, %VRAX > + addq %rdi, %rax > + ret > +END (RAWMEMCHR) > +#endif > -- > 2.34.1 >