From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from mail-pg1-x536.google.com (mail-pg1-x536.google.com [IPv6:2607:f8b0:4864:20::536]) by sourceware.org (Postfix) with ESMTPS id D73C7385783D for ; Mon, 3 May 2021 22:58:26 +0000 (GMT) DMARC-Filter: OpenDMARC Filter v1.3.2 sourceware.org D73C7385783D Received: by mail-pg1-x536.google.com with SMTP id q10so4949019pgj.2 for ; Mon, 03 May 2021 15:58:26 -0700 (PDT) X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20161025; h=x-gm-message-state:mime-version:references:in-reply-to:from:date :message-id:subject:to:cc; bh=JNrXeDlmcJsIXIRLhabH6hW8pCYbamxMfUf0C6WnPQs=; b=IDTBy+EQsN03Cq7cs9VaR6Kbe0C36xQQJMH+LOTbpCQH94xuzwd0k1t9k8eGjN8Dcf 3eTjNZ0Ev3ryYkpDGL/LUOjsf0yax/d1z7+XXHzsyHL1rvol4JCWyQxDUi7io9ZkInLM 71coYhaKeZn02eC4/pSbT74lHrz2D78picmEhi0GNdzClHyhIwlL1C946GP06ER0st9u f6Ig0IV+2i1mnRq/rx14oGZXVTgiFj4OPtk6wytBK6cDxivAqtHEBdUqvBPefayN15Hu QgRZCp717f37F4fBQlMPZS6CZeNGYbKLNwS83+uv6cr0fwWVejdSOsQk1zvwqsgZUe7U WrUQ== X-Gm-Message-State: AOAM532MDFI29J1FyYmmJnQhNUPQ2CkFYXKx4JCmInTl1ORuirNwYkv2 IOH14/G+ckokSEpbrWhbxhkrifynotsb5u3V4xY= X-Google-Smtp-Source: ABdhPJwpmocawmO3qcwF8Pd7kbNfemDYbJwCwaZ875ZddMTaHVjAbWed4x17XpaXZk9k7DAxcTXt3auFCv7eZFrZPCY= X-Received: by 2002:a17:90b:184:: with SMTP id t4mr1120452pjs.223.1620082705822; Mon, 03 May 2021 15:58:25 -0700 (PDT) MIME-Version: 1.0 References: <20210503084435.160548-2-goldstein.w.n@gmail.com> <20210503200655.2164487-1-goldstein.w.n@gmail.com> In-Reply-To: From: Noah Goldstein Date: Mon, 3 May 2021 18:58:15 -0400 Message-ID: Subject: Re: [PATCH v2 2/3] x86: Optimize memchr-avx2.S To: "H.J. Lu" Cc: GNU C Library , "Carlos O'Donell" Content-Type: text/plain; charset="UTF-8" X-Spam-Status: No, score=-9.9 required=5.0 tests=BAYES_00, DKIM_SIGNED, DKIM_VALID, DKIM_VALID_AU, DKIM_VALID_EF, FREEMAIL_FROM, GIT_PATCH_0, RCVD_IN_DNSWL_NONE, SPF_HELO_NONE, SPF_PASS, TXREP autolearn=ham autolearn_force=no version=3.4.2 X-Spam-Checker-Version: SpamAssassin 3.4.2 (2018-09-13) on server2.sourceware.org X-BeenThere: libc-alpha@sourceware.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Libc-alpha mailing list List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Mon, 03 May 2021 22:58:29 -0000 On Mon, May 3, 2021 at 6:25 PM H.J. Lu wrote: > > On Mon, May 03, 2021 at 04:06:54PM -0400, Noah Goldstein wrote: > > No bug. This commit optimizes memchr-avx2.S. The optimizations include > > replacing some branches with cmovcc, avoiding some branches entirely > > in the less_4x_vec case, making the page cross logic less strict, > > asaving a few instructions the in loop return loop. test-memchr, > > test-rawmemchr, and test-wmemchr are all passing. > > > > Signed-off-by: Noah Goldstein > > --- > > sysdeps/x86_64/multiarch/memchr-avx2.S | 426 ++++++++++++++----------- > > 1 file changed, 247 insertions(+), 179 deletions(-) > > > > diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S > > index 1fcb1c350f..8b862fb9d1 100644 > > --- a/sysdeps/x86_64/multiarch/memchr-avx2.S > > +++ b/sysdeps/x86_64/multiarch/memchr-avx2.S > > @@ -26,8 +26,22 @@ > > > > # ifdef USE_AS_WMEMCHR > > # define VPCMPEQ vpcmpeqd > > +# define VPBROADCAST vpbroadcastd > > +# define CHAR_SIZE 4 > > # else > > # define VPCMPEQ vpcmpeqb > > +# define VPBROADCAST vpbroadcastb > > +# define CHAR_SIZE 1 > > +# endif > > + > > +# ifdef USE_AS_RAWMEMCHR > > +# define ERAW_PTR_REG ecx > > +# define RRAW_PTR_REG rcx > > +# define ALGN_PTR_REG rdi > > +# else > > +# define ERAW_PTR_REG edi > > +# define RRAW_PTR_REG rdi > > +# define ALGN_PTR_REG rcx > > # endif > > > > # ifndef VZEROUPPER > > @@ -39,6 +53,7 @@ > > # endif > > > > # define VEC_SIZE 32 > > +# define PAGE_SIZE 4096 > > > > .section SECTION(.text),"ax",@progbits > > ENTRY (MEMCHR) > > @@ -47,295 +62,348 @@ ENTRY (MEMCHR) > > test %RDX_LP, %RDX_LP > > jz L(null) > > # endif > > - movl %edi, %ecx > > - /* Broadcast CHAR to YMM0. */ > > - vmovd %esi, %xmm0 > > # ifdef USE_AS_WMEMCHR > > shl $2, %RDX_LP > > - vpbroadcastd %xmm0, %ymm0 > > # else > > # ifdef __ILP32__ > > /* Clear the upper 32 bits. */ > > movl %edx, %edx > > # endif > > - vpbroadcastb %xmm0, %ymm0 > > # endif > > + /* Broadcast CHAR to YMMMATCH. */ > > + vmovd %esi, %xmm0 > > + VPBROADCAST %xmm0, %ymm0 > > /* Check if we may cross page boundary with one vector load. */ > > - andl $(2 * VEC_SIZE - 1), %ecx > > - cmpl $VEC_SIZE, %ecx > > - ja L(cros_page_boundary) > > + movl %edi, %eax > > + andl $(PAGE_SIZE - 1), %eax > > + cmpl $(PAGE_SIZE - VEC_SIZE), %eax > > + ja L(cross_page_boundary) > > > > /* Check the first VEC_SIZE bytes. */ > > - VPCMPEQ (%rdi), %ymm0, %ymm1 > > + VPCMPEQ (%rdi), %ymm0, %ymm1 > > vpmovmskb %ymm1, %eax > > - testl %eax, %eax > > - > > # ifndef USE_AS_RAWMEMCHR > > - jnz L(first_vec_x0_check) > > - /* Adjust length and check the end of data. */ > > - subq $VEC_SIZE, %rdx > > - jbe L(zero) > > -# else > > - jnz L(first_vec_x0) > > + /* If length < CHAR_PER_VEC handle special. */ > > + cmpq $VEC_SIZE, %rdx > > + jbe L(first_vec_x0) > > # endif > > - > > - /* Align data for aligned loads in the loop. */ > > - addq $VEC_SIZE, %rdi > > - andl $(VEC_SIZE - 1), %ecx > > - andq $-VEC_SIZE, %rdi > > + testl %eax, %eax > > + jz L(aligned_more) > > + tzcntl %eax, %eax > > + addq %rdi, %rax > > + VZEROUPPER_RETURN > > > > # ifndef USE_AS_RAWMEMCHR > > - /* Adjust length. */ > > - addq %rcx, %rdx > > - > > - subq $(VEC_SIZE * 4), %rdx > > - jbe L(last_4x_vec_or_less) > > + .p2align 5 > > +L(first_vec_x0): > > + /* Check if first match was before length. */ > > + tzcntl %eax, %eax > > + xorl %ecx, %ecx > > + cmpl %eax, %edx > > + leaq (%rdi, %rax), %rax > > + cmovle %rcx, %rax > > + VZEROUPPER_RETURN > > Please add a blank line here to indicate this begin a new block. Done. > OK with this change. You should be able to push it yourself now. Will do. Thanks! > > Thanks. > > > +L(null): > > + xorl %eax, %eax > > + ret > > # endif > > - jmp L(more_4x_vec) > > - > > .p2align 4 > > -L(cros_page_boundary): > > - andl $(VEC_SIZE - 1), %ecx > > - andq $-VEC_SIZE, %rdi > > - VPCMPEQ (%rdi), %ymm0, %ymm1 > > +L(cross_page_boundary): > > + /* Save pointer before aligning as its original value is necessary > > + for computer return address if byte is found or adjusting length > > + if it is not and this is memchr. */ > > + movq %rdi, %rcx > > + /* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr and > > + rdi for rawmemchr. */ > > + orq $(VEC_SIZE - 1), %ALGN_PTR_REG > > + VPCMPEQ -(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1 > > vpmovmskb %ymm1, %eax > > +# ifndef USE_AS_RAWMEMCHR > > + /* Calculate length until end of page (length checked for a > > + match). */ > > + leaq 1(%ALGN_PTR_REG), %rsi > > + subq %RRAW_PTR_REG, %rsi > > +# endif > > /* Remove the leading bytes. */ > > - sarl %cl, %eax > > - testl %eax, %eax > > - jz L(aligned_more) > > - tzcntl %eax, %eax > > + sarxl %ERAW_PTR_REG, %eax, %eax > > # ifndef USE_AS_RAWMEMCHR > > /* Check the end of data. */ > > - cmpq %rax, %rdx > > - jbe L(zero) > > + cmpq %rsi, %rdx > > + jbe L(first_vec_x0) > > # endif > > - addq %rdi, %rax > > - addq %rcx, %rax > > + testl %eax, %eax > > + jz L(cross_page_continue) > > + tzcntl %eax, %eax > > + addq %RRAW_PTR_REG, %rax > > L(return_vzeroupper): > > ZERO_UPPER_VEC_REGISTERS_RETURN > > > > .p2align 4 > > -L(aligned_more): > > -# ifndef USE_AS_RAWMEMCHR > > - /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)" > > - instead of "(rdx + rcx) - VEC_SIZE" to void possible addition > > - overflow. */ > > - negq %rcx > > - addq $VEC_SIZE, %rcx > > +L(first_vec_x1): > > + tzcntl %eax, %eax > > + incq %rdi > > + addq %rdi, %rax > > + VZEROUPPER_RETURN > > > > - /* Check the end of data. */ > > - subq %rcx, %rdx > > - jbe L(zero) > > -# endif > > + .p2align 4 > > +L(first_vec_x2): > > + tzcntl %eax, %eax > > + addq $(VEC_SIZE + 1), %rdi > > + addq %rdi, %rax > > + VZEROUPPER_RETURN > > > > - addq $VEC_SIZE, %rdi > > + .p2align 4 > > +L(first_vec_x3): > > + tzcntl %eax, %eax > > + addq $(VEC_SIZE * 2 + 1), %rdi > > + addq %rdi, %rax > > + VZEROUPPER_RETURN > > > > -# ifndef USE_AS_RAWMEMCHR > > - subq $(VEC_SIZE * 4), %rdx > > - jbe L(last_4x_vec_or_less) > > -# endif > > > > -L(more_4x_vec): > > + .p2align 4 > > +L(first_vec_x4): > > + tzcntl %eax, %eax > > + addq $(VEC_SIZE * 3 + 1), %rdi > > + addq %rdi, %rax > > + VZEROUPPER_RETURN > > + > > + .p2align 4 > > +L(aligned_more): > > /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time > > since data is only aligned to VEC_SIZE. */ > > - VPCMPEQ (%rdi), %ymm0, %ymm1 > > - vpmovmskb %ymm1, %eax > > - testl %eax, %eax > > - jnz L(first_vec_x0) > > > > - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 > > +# ifndef USE_AS_RAWMEMCHR > > +L(cross_page_continue): > > + /* Align data to VEC_SIZE - 1. */ > > + xorl %ecx, %ecx > > + subl %edi, %ecx > > + orq $(VEC_SIZE - 1), %rdi > > + /* esi is for adjusting length to see if near the end. */ > > + leal (VEC_SIZE * 4 + 1)(%rdi, %rcx), %esi > > +# else > > + orq $(VEC_SIZE - 1), %rdi > > +L(cross_page_continue): > > +# endif > > + /* Load first VEC regardless. */ > > + VPCMPEQ 1(%rdi), %ymm0, %ymm1 > > vpmovmskb %ymm1, %eax > > +# ifndef USE_AS_RAWMEMCHR > > + /* Adjust length. If near end handle specially. */ > > + subq %rsi, %rdx > > + jbe L(last_4x_vec_or_less) > > +# endif > > testl %eax, %eax > > jnz L(first_vec_x1) > > > > - VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 > > + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 > > vpmovmskb %ymm1, %eax > > testl %eax, %eax > > jnz L(first_vec_x2) > > > > - VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 > > + VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1 > > vpmovmskb %ymm1, %eax > > testl %eax, %eax > > jnz L(first_vec_x3) > > > > - addq $(VEC_SIZE * 4), %rdi > > + VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1 > > + vpmovmskb %ymm1, %eax > > + testl %eax, %eax > > + jnz L(first_vec_x4) > > > > # ifndef USE_AS_RAWMEMCHR > > + /* Check if at last VEC_SIZE * 4 length. */ > > subq $(VEC_SIZE * 4), %rdx > > - jbe L(last_4x_vec_or_less) > > -# endif > > - > > - /* Align data to 4 * VEC_SIZE. */ > > - movq %rdi, %rcx > > - andl $(4 * VEC_SIZE - 1), %ecx > > - andq $-(4 * VEC_SIZE), %rdi > > - > > -# ifndef USE_AS_RAWMEMCHR > > - /* Adjust length. */ > > + jbe L(last_4x_vec_or_less_cmpeq) > > + /* Align data to VEC_SIZE * 4 - 1 for the loop and readjust > > + length. */ > > + incq %rdi > > + movl %edi, %ecx > > + orq $(VEC_SIZE * 4 - 1), %rdi > > + andl $(VEC_SIZE * 4 - 1), %ecx > > addq %rcx, %rdx > > +# else > > + /* Align data to VEC_SIZE * 4 - 1 for loop. */ > > + incq %rdi > > + orq $(VEC_SIZE * 4 - 1), %rdi > > # endif > > > > + /* Compare 4 * VEC at a time forward. */ > > .p2align 4 > > L(loop_4x_vec): > > - /* Compare 4 * VEC at a time forward. */ > > - VPCMPEQ (%rdi), %ymm0, %ymm1 > > - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm2 > > - VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm3 > > - VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm4 > > - > > + VPCMPEQ 1(%rdi), %ymm0, %ymm1 > > + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm2 > > + VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm3 > > + VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm4 > > vpor %ymm1, %ymm2, %ymm5 > > vpor %ymm3, %ymm4, %ymm6 > > vpor %ymm5, %ymm6, %ymm5 > > > > - vpmovmskb %ymm5, %eax > > - testl %eax, %eax > > - jnz L(4x_vec_end) > > - > > - addq $(VEC_SIZE * 4), %rdi > > - > > + vpmovmskb %ymm5, %ecx > > # ifdef USE_AS_RAWMEMCHR > > - jmp L(loop_4x_vec) > > + subq $-(VEC_SIZE * 4), %rdi > > + testl %ecx, %ecx > > + jz L(loop_4x_vec) > > # else > > - subq $(VEC_SIZE * 4), %rdx > > - ja L(loop_4x_vec) > > + testl %ecx, %ecx > > + jnz L(loop_4x_vec_end) > > > > -L(last_4x_vec_or_less): > > - /* Less than 4 * VEC and aligned to VEC_SIZE. */ > > - addl $(VEC_SIZE * 2), %edx > > - jle L(last_2x_vec) > > + subq $-(VEC_SIZE * 4), %rdi > > > > - VPCMPEQ (%rdi), %ymm0, %ymm1 > > - vpmovmskb %ymm1, %eax > > - testl %eax, %eax > > - jnz L(first_vec_x0) > > + subq $(VEC_SIZE * 4), %rdx > > + ja L(loop_4x_vec) > > > > - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 > > + /* Fall through into less than 4 remaining vectors of length case. > > + */ > > + VPCMPEQ (VEC_SIZE * 0 + 1)(%rdi), %ymm0, %ymm1 > > vpmovmskb %ymm1, %eax > > + .p2align 4 > > +L(last_4x_vec_or_less): > > + /* Check if first VEC contained match. */ > > testl %eax, %eax > > - jnz L(first_vec_x1) > > + jnz L(first_vec_x1_check) > > > > - VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 > > - vpmovmskb %ymm1, %eax > > - testl %eax, %eax > > + /* If remaining length > VEC_SIZE * 2. */ > > + addl $(VEC_SIZE * 2), %edx > > + jg L(last_4x_vec) > > > > - jnz L(first_vec_x2_check) > > - subl $VEC_SIZE, %edx > > - jle L(zero) > > +L(last_2x_vec): > > + /* If remaining length < VEC_SIZE. */ > > + addl $VEC_SIZE, %edx > > + jle L(zero_end) > > > > - VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 > > + /* Check VEC2 and compare any match with remaining length. */ > > + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 > > vpmovmskb %ymm1, %eax > > - testl %eax, %eax > > - > > - jnz L(first_vec_x3_check) > > - xorl %eax, %eax > > + tzcntl %eax, %eax > > + cmpl %eax, %edx > > + jbe L(set_zero_end) > > + addq $(VEC_SIZE + 1), %rdi > > + addq %rdi, %rax > > +L(zero_end): > > VZEROUPPER_RETURN > > > > .p2align 4 > > -L(last_2x_vec): > > - addl $(VEC_SIZE * 2), %edx > > - VPCMPEQ (%rdi), %ymm0, %ymm1 > > +L(loop_4x_vec_end): > > +# endif > > + /* rawmemchr will fall through into this if match was found in > > + loop. */ > > + > > vpmovmskb %ymm1, %eax > > testl %eax, %eax > > + jnz L(last_vec_x1_return) > > > > - jnz L(first_vec_x0_check) > > - subl $VEC_SIZE, %edx > > - jle L(zero) > > - > > - VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 > > - vpmovmskb %ymm1, %eax > > + vpmovmskb %ymm2, %eax > > testl %eax, %eax > > - jnz L(first_vec_x1_check) > > - xorl %eax, %eax > > - VZEROUPPER_RETURN > > + jnz L(last_vec_x2_return) > > > > - .p2align 4 > > -L(first_vec_x0_check): > > - tzcntl %eax, %eax > > - /* Check the end of data. */ > > - cmpq %rax, %rdx > > - jbe L(zero) > > + vpmovmskb %ymm3, %eax > > + /* Combine VEC3 matches (eax) with VEC4 matches (ecx). */ > > + salq $32, %rcx > > + orq %rcx, %rax > > + tzcntq %rax, %rax > > +# ifdef USE_AS_RAWMEMCHR > > + subq $(VEC_SIZE * 2 - 1), %rdi > > +# else > > + subq $-(VEC_SIZE * 2 + 1), %rdi > > +# endif > > addq %rdi, %rax > > VZEROUPPER_RETURN > > +# ifndef USE_AS_RAWMEMCHR > > > > .p2align 4 > > L(first_vec_x1_check): > > tzcntl %eax, %eax > > - /* Check the end of data. */ > > - cmpq %rax, %rdx > > - jbe L(zero) > > - addq $VEC_SIZE, %rax > > + /* Adjust length. */ > > + subl $-(VEC_SIZE * 4), %edx > > + /* Check if match within remaining length. */ > > + cmpl %eax, %edx > > + jbe L(set_zero_end) > > + incq %rdi > > addq %rdi, %rax > > VZEROUPPER_RETURN > > + .p2align 4 > > +L(set_zero_end): > > + xorl %eax, %eax > > + VZEROUPPER_RETURN > > +# endif > > > > .p2align 4 > > -L(first_vec_x2_check): > > +L(last_vec_x1_return): > > tzcntl %eax, %eax > > - /* Check the end of data. */ > > - cmpq %rax, %rdx > > - jbe L(zero) > > - addq $(VEC_SIZE * 2), %rax > > +# ifdef USE_AS_RAWMEMCHR > > + subq $(VEC_SIZE * 4 - 1), %rdi > > +# else > > + incq %rdi > > +# endif > > addq %rdi, %rax > > VZEROUPPER_RETURN > > > > .p2align 4 > > -L(first_vec_x3_check): > > +L(last_vec_x2_return): > > tzcntl %eax, %eax > > - /* Check the end of data. */ > > - cmpq %rax, %rdx > > - jbe L(zero) > > - addq $(VEC_SIZE * 3), %rax > > +# ifdef USE_AS_RAWMEMCHR > > + subq $(VEC_SIZE * 3 - 1), %rdi > > +# else > > + subq $-(VEC_SIZE + 1), %rdi > > +# endif > > addq %rdi, %rax > > VZEROUPPER_RETURN > > > > +# ifndef USE_AS_RAWMEMCHR > > .p2align 4 > > -L(zero): > > - xorl %eax, %eax > > - jmp L(return_vzeroupper) > > +L(last_4x_vec_or_less_cmpeq): > > + VPCMPEQ (VEC_SIZE * 4 + 1)(%rdi), %ymm0, %ymm1 > > + vpmovmskb %ymm1, %eax > > + subq $-(VEC_SIZE * 4), %rdi > > + /* Check first VEC regardless. */ > > + testl %eax, %eax > > + jnz L(first_vec_x1_check) > > > > + /* If remaining length <= CHAR_PER_VEC * 2. */ > > + addl $(VEC_SIZE * 2), %edx > > + jle L(last_2x_vec) > > .p2align 4 > > -L(null): > > - xorl %eax, %eax > > - ret > > -# endif > > +L(last_4x_vec): > > + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 > > + vpmovmskb %ymm1, %eax > > + testl %eax, %eax > > + jnz L(last_vec_x2_return) > > > > - .p2align 4 > > -L(first_vec_x0): > > - tzcntl %eax, %eax > > - addq %rdi, %rax > > - VZEROUPPER_RETURN > > + VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1 > > + vpmovmskb %ymm1, %eax > > > > - .p2align 4 > > -L(first_vec_x1): > > - tzcntl %eax, %eax > > - addq $VEC_SIZE, %rax > > - addq %rdi, %rax > > - VZEROUPPER_RETURN > > + /* Create mask for possible matches within remaining length. */ > > + movq $-1, %rcx > > + bzhiq %rdx, %rcx, %rcx > > > > - .p2align 4 > > -L(first_vec_x2): > > + /* Test matches in data against length match. */ > > + andl %ecx, %eax > > + jnz L(last_vec_x3) > > + > > + /* if remaining length <= VEC_SIZE * 3 (Note this is after > > + remaining length was found to be > VEC_SIZE * 2. */ > > + subl $VEC_SIZE, %edx > > + jbe L(zero_end2) > > + > > + VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1 > > + vpmovmskb %ymm1, %eax > > + /* Shift remaining length mask for last VEC. */ > > + shrq $32, %rcx > > + andl %ecx, %eax > > + jz L(zero_end2) > > tzcntl %eax, %eax > > - addq $(VEC_SIZE * 2), %rax > > + addq $(VEC_SIZE * 3 + 1), %rdi > > addq %rdi, %rax > > +L(zero_end2): > > VZEROUPPER_RETURN > > > > .p2align 4 > > -L(4x_vec_end): > > - vpmovmskb %ymm1, %eax > > - testl %eax, %eax > > - jnz L(first_vec_x0) > > - vpmovmskb %ymm2, %eax > > - testl %eax, %eax > > - jnz L(first_vec_x1) > > - vpmovmskb %ymm3, %eax > > - testl %eax, %eax > > - jnz L(first_vec_x2) > > - vpmovmskb %ymm4, %eax > > - testl %eax, %eax > > -L(first_vec_x3): > > +L(last_vec_x3): > > tzcntl %eax, %eax > > - addq $(VEC_SIZE * 3), %rax > > + subq $-(VEC_SIZE * 2 + 1), %rdi > > addq %rdi, %rax > > VZEROUPPER_RETURN > > +# endif > > > > END (MEMCHR) > > #endif > > -- > > 2.29.2 > >