From: "H.J. Lu"
Date: Tue, 7 Jun 2022 11:17:26 -0700
Subject: Re: [PATCH v6 6/8] x86: Optimize memrchr-avx2.S
To: Noah Goldstein
Cc: GNU C Library, "Carlos O'Donell"
In-Reply-To: <20220607041134.2369903-6-goldstein.w.n@gmail.com>

On Mon, Jun 6, 2022 at 9:11 PM Noah Goldstein wrote:
>
> The new code:
>    1. prioritizes smaller user-arg lengths more.
>    2. optimizes target placement more carefully.
>    3. reuses logic more.
>    4. fixes up various inefficiencies in the logic. The biggest
>       case here is the `lzcnt` logic for checking returns, which
>       saves either a branch or multiple instructions.
>
> The total code size saving is 306 bytes.
> Geometric mean of all benchmarks, New / Old: 0.760
>
> Regressions:
> There are some regressions, particularly where the length (user-arg
> length) is large but the position of the match char is near the
> beginning of the string (in the first VEC). This case has roughly a
> 10-20% regression.
>
> This is because the new logic gives the hot path for immediate matches
> to shorter lengths (the more common input). That case has roughly a
> 15-45% speedup.
>
> Full xcheck passes on x86_64.
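[For intuition when reading the diff below: a minimal C sketch of the `lzcnt` return trick described in point 4. This is illustrative only, not the glibc routine; the function name, the single-vector scope, and the assumption that the 32-byte load stays within readable memory are mine.]

  #include <immintrin.h>
  #include <stddef.h>

  /* Sketch only: find the last occurrence of C in the final 32-byte window
     of a buffer whose last byte is at END and whose remaining length is LEN
     (LEN <= 32).  Mirrors the "end pointer minus lzcnt" pattern.
     Assumes -mavx2 -mlzcnt and that [END - 31, END] is readable.  */
  static const char *
  last_match_in_vec (const char *end, int c, size_t len)
  {
    __m256i vc = _mm256_set1_epi8 ((char) c);
    __m256i data = _mm256_loadu_si256 ((const __m256i *) (end - 31));
    unsigned int mask = _mm256_movemask_epi8 (_mm256_cmpeq_epi8 (data, vc));

    /* Bit i of MASK corresponds to byte END - 31 + i, so the last match is
       at END - lzcnt (MASK).  lzcnt of a zero mask is defined as 32, so a
       single compare against LEN rejects both the no-match case and matches
       that fall before the start of the buffer -- that is where the saved
       branch comes from.  */
    unsigned int lz = _lzcnt_u32 (mask);
    if (lz >= len)
      return NULL;
    return end - lz;
  }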
> --- > sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S | 1 + > sysdeps/x86_64/multiarch/memrchr-avx2.S | 534 ++++++++++---------- > 2 files changed, 257 insertions(+), 278 deletions(-) > > diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S > index cea2d2a72d..5e9beeeef2 100644 > --- a/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S > +++ b/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S > @@ -2,6 +2,7 @@ > # define MEMRCHR __memrchr_avx2_rtm > #endif > > +#define COND_VZEROUPPER COND_VZEROUPPER_XTEST > #define ZERO_UPPER_VEC_REGISTERS_RETURN \ > ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST > > diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2.S b/sysdeps/x86_64/multiarch/memrchr-avx2.S > index ba2ce7cb03..bea4528068 100644 > --- a/sysdeps/x86_64/multiarch/memrchr-avx2.S > +++ b/sysdeps/x86_64/multiarch/memrchr-avx2.S > @@ -21,340 +21,318 @@ > # include > > # ifndef MEMRCHR > -# define MEMRCHR __memrchr_avx2 > +# define MEMRCHR __memrchr_avx2 > # endif > > # ifndef VZEROUPPER > -# define VZEROUPPER vzeroupper > +# define VZEROUPPER vzeroupper > # endif > > # ifndef SECTION > # define SECTION(p) p##.avx > # endif > > -# define VEC_SIZE 32 > +# define VEC_SIZE 32 > +# define PAGE_SIZE 4096 > + .section SECTION(.text), "ax", @progbits > +ENTRY(MEMRCHR) > +# ifdef __ILP32__ > + /* Clear upper bits. */ > + and %RDX_LP, %RDX_LP > +# else > + test %RDX_LP, %RDX_LP > +# endif > + jz L(zero_0) > > - .section SECTION(.text),"ax",@progbits > -ENTRY (MEMRCHR) > - /* Broadcast CHAR to YMM0. */ > vmovd %esi, %xmm0 > - vpbroadcastb %xmm0, %ymm0 > - > - sub $VEC_SIZE, %RDX_LP > - jbe L(last_vec_or_less) > - > - add %RDX_LP, %RDI_LP > - > - /* Check the last VEC_SIZE bytes. */ > - vpcmpeqb (%rdi), %ymm0, %ymm1 > - vpmovmskb %ymm1, %eax > - testl %eax, %eax > - jnz L(last_vec_x0) > + /* Get end pointer. Minus one for two reasons. 1) It is necessary for a > + correct page cross check and 2) it correctly sets up end ptr to be > + subtract by lzcnt aligned. */ > + leaq -1(%rdx, %rdi), %rax > > - subq $(VEC_SIZE * 4), %rdi > - movl %edi, %ecx > - andl $(VEC_SIZE - 1), %ecx > - jz L(aligned_more) > + vpbroadcastb %xmm0, %ymm0 > > - /* Align data for aligned loads in the loop. */ > - addq $VEC_SIZE, %rdi > - addq $VEC_SIZE, %rdx > - andq $-VEC_SIZE, %rdi > - subq %rcx, %rdx > + /* Check if we can load 1x VEC without cross a page. */ > + testl $(PAGE_SIZE - VEC_SIZE), %eax > + jz L(page_cross) > + > + vpcmpeqb -(VEC_SIZE - 1)(%rax), %ymm0, %ymm1 > + vpmovmskb %ymm1, %ecx > + cmpq $VEC_SIZE, %rdx > + ja L(more_1x_vec) > + > +L(ret_vec_x0_test): > + /* If ecx is zero (no matches) lzcnt will set it 32 (VEC_SIZE) which > + will gurantee edx (len) is less than it. */ > + lzcntl %ecx, %ecx > + > + /* Hoist vzeroupper (not great for RTM) to save code size. This allows > + all logic for edx (len) <= VEC_SIZE to fit in first cache line. */ > + COND_VZEROUPPER > + cmpl %ecx, %edx > + jle L(zero_0) > + subq %rcx, %rax > + ret > > - .p2align 4 > -L(aligned_more): > - subq $(VEC_SIZE * 4), %rdx > - jbe L(last_4x_vec_or_less) > - > - /* Check the last 4 * VEC_SIZE. Only one VEC_SIZE at a time > - since data is only aligned to VEC_SIZE. 
*/ > - vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 > - vpmovmskb %ymm1, %eax > - testl %eax, %eax > - jnz L(last_vec_x3) > - > - vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2 > - vpmovmskb %ymm2, %eax > - testl %eax, %eax > - jnz L(last_vec_x2) > - > - vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3 > - vpmovmskb %ymm3, %eax > - testl %eax, %eax > - jnz L(last_vec_x1) > - > - vpcmpeqb (%rdi), %ymm0, %ymm4 > - vpmovmskb %ymm4, %eax > - testl %eax, %eax > - jnz L(last_vec_x0) > - > - /* Align data to 4 * VEC_SIZE for loop with fewer branches. > - There are some overlaps with above if data isn't aligned > - to 4 * VEC_SIZE. */ > - movl %edi, %ecx > - andl $(VEC_SIZE * 4 - 1), %ecx > - jz L(loop_4x_vec) > - > - addq $(VEC_SIZE * 4), %rdi > - addq $(VEC_SIZE * 4), %rdx > - andq $-(VEC_SIZE * 4), %rdi > - subq %rcx, %rdx > + /* Fits in aligning bytes of first cache line. */ > +L(zero_0): > + xorl %eax, %eax > + ret > > - .p2align 4 > -L(loop_4x_vec): > - /* Compare 4 * VEC at a time forward. */ > - subq $(VEC_SIZE * 4), %rdi > - subq $(VEC_SIZE * 4), %rdx > - jbe L(last_4x_vec_or_less) > - > - vmovdqa (%rdi), %ymm1 > - vmovdqa VEC_SIZE(%rdi), %ymm2 > - vmovdqa (VEC_SIZE * 2)(%rdi), %ymm3 > - vmovdqa (VEC_SIZE * 3)(%rdi), %ymm4 > - > - vpcmpeqb %ymm1, %ymm0, %ymm1 > - vpcmpeqb %ymm2, %ymm0, %ymm2 > - vpcmpeqb %ymm3, %ymm0, %ymm3 > - vpcmpeqb %ymm4, %ymm0, %ymm4 > - > - vpor %ymm1, %ymm2, %ymm5 > - vpor %ymm3, %ymm4, %ymm6 > - vpor %ymm5, %ymm6, %ymm5 > - > - vpmovmskb %ymm5, %eax > - testl %eax, %eax > - jz L(loop_4x_vec) > - > - /* There is a match. */ > - vpmovmskb %ymm4, %eax > - testl %eax, %eax > - jnz L(last_vec_x3) > - > - vpmovmskb %ymm3, %eax > - testl %eax, %eax > - jnz L(last_vec_x2) > - > - vpmovmskb %ymm2, %eax > - testl %eax, %eax > - jnz L(last_vec_x1) > - > - vpmovmskb %ymm1, %eax > - bsrl %eax, %eax > - addq %rdi, %rax > + .p2align 4,, 9 > +L(ret_vec_x0): > + lzcntl %ecx, %ecx > + subq %rcx, %rax > L(return_vzeroupper): > ZERO_UPPER_VEC_REGISTERS_RETURN > > - .p2align 4 > -L(last_4x_vec_or_less): > - addl $(VEC_SIZE * 4), %edx > - cmpl $(VEC_SIZE * 2), %edx > - jbe L(last_2x_vec) > - > - vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 > - vpmovmskb %ymm1, %eax > - testl %eax, %eax > - jnz L(last_vec_x3) > - > - vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2 > - vpmovmskb %ymm2, %eax > - testl %eax, %eax > - jnz L(last_vec_x2) > - > - vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3 > - vpmovmskb %ymm3, %eax > - testl %eax, %eax > - jnz L(last_vec_x1_check) > - cmpl $(VEC_SIZE * 3), %edx > - jbe L(zero) > - > - vpcmpeqb (%rdi), %ymm0, %ymm4 > - vpmovmskb %ymm4, %eax > - testl %eax, %eax > - jz L(zero) > - bsrl %eax, %eax > - subq $(VEC_SIZE * 4), %rdx > - addq %rax, %rdx > - jl L(zero) > - addq %rdi, %rax > - VZEROUPPER_RETURN > - > - .p2align 4 > + .p2align 4,, 10 > +L(more_1x_vec): > + testl %ecx, %ecx > + jnz L(ret_vec_x0) > + > + /* Align rax (string pointer). */ > + andq $-VEC_SIZE, %rax > + > + /* Recompute remaining length after aligning. */ > + movq %rax, %rdx > + /* Need this comparison next no matter what. */ > + vpcmpeqb -(VEC_SIZE)(%rax), %ymm0, %ymm1 > + subq %rdi, %rdx > + decq %rax > + vpmovmskb %ymm1, %ecx > + /* Fall through for short (hotter than length). 
*/ > + cmpq $(VEC_SIZE * 2), %rdx > + ja L(more_2x_vec) > L(last_2x_vec): > - vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 > - vpmovmskb %ymm1, %eax > - testl %eax, %eax > - jnz L(last_vec_x3_check) > cmpl $VEC_SIZE, %edx > - jbe L(zero) > - > - vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 > - vpmovmskb %ymm1, %eax > - testl %eax, %eax > - jz L(zero) > - bsrl %eax, %eax > - subq $(VEC_SIZE * 2), %rdx > - addq %rax, %rdx > - jl L(zero) > - addl $(VEC_SIZE * 2), %eax > - addq %rdi, %rax > - VZEROUPPER_RETURN > - > - .p2align 4 > -L(last_vec_x0): > - bsrl %eax, %eax > - addq %rdi, %rax > - VZEROUPPER_RETURN > + jbe L(ret_vec_x0_test) > + > + testl %ecx, %ecx > + jnz L(ret_vec_x0) > + > + vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm1 > + vpmovmskb %ymm1, %ecx > + /* 64-bit lzcnt. This will naturally add 32 to position. */ > + lzcntq %rcx, %rcx > + COND_VZEROUPPER > + cmpl %ecx, %edx > + jle L(zero_0) > + subq %rcx, %rax > + ret > > - .p2align 4 > -L(last_vec_x1): > - bsrl %eax, %eax > - addl $VEC_SIZE, %eax > - addq %rdi, %rax > - VZEROUPPER_RETURN > > - .p2align 4 > -L(last_vec_x2): > - bsrl %eax, %eax > - addl $(VEC_SIZE * 2), %eax > - addq %rdi, %rax > + /* Inexpensive place to put this regarding code size / target alignments > + / ICache NLP. Necessary for 2-byte encoding of jump to page cross > + case which in turn in necessary for hot path (len <= VEC_SIZE) to fit is necessary? > + in first cache line. */ > +L(page_cross): > + movq %rax, %rsi > + andq $-VEC_SIZE, %rsi > + vpcmpeqb (%rsi), %ymm0, %ymm1 > + vpmovmskb %ymm1, %ecx > + /* Shift out negative alignment (because we are starting from endptr and > + working backwards). */ > + movl %eax, %r8d > + /* notl because eax already has endptr - 1. (-x = ~(x - 1)). */ > + notl %r8d > + shlxl %r8d, %ecx, %ecx > + cmpq %rdi, %rsi > + ja L(more_1x_vec) > + lzcntl %ecx, %ecx > + COND_VZEROUPPER > + cmpl %ecx, %edx > + jle L(zero_0) > + subq %rcx, %rax > + ret > + .p2align 4,, 11 > +L(ret_vec_x1): > + /* This will naturally add 32 to position. */ > + lzcntq %rcx, %rcx > + subq %rcx, %rax > VZEROUPPER_RETURN > + .p2align 4,, 10 > +L(more_2x_vec): > + testl %ecx, %ecx > + jnz L(ret_vec_x0) > > - .p2align 4 > -L(last_vec_x3): > - bsrl %eax, %eax > - addl $(VEC_SIZE * 3), %eax > - addq %rdi, %rax > - ret > + vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm1 > + vpmovmskb %ymm1, %ecx > + testl %ecx, %ecx > + jnz L(ret_vec_x1) > > - .p2align 4 > -L(last_vec_x1_check): > - bsrl %eax, %eax > - subq $(VEC_SIZE * 3), %rdx > - addq %rax, %rdx > - jl L(zero) > - addl $VEC_SIZE, %eax > - addq %rdi, %rax > - VZEROUPPER_RETURN > > - .p2align 4 > -L(last_vec_x3_check): > - bsrl %eax, %eax > - subq $VEC_SIZE, %rdx > - addq %rax, %rdx > - jl L(zero) > - addl $(VEC_SIZE * 3), %eax > - addq %rdi, %rax > - VZEROUPPER_RETURN > + /* Needed no matter what. */ > + vpcmpeqb -(VEC_SIZE * 3 - 1)(%rax), %ymm0, %ymm1 > + vpmovmskb %ymm1, %ecx > > - .p2align 4 > -L(zero): > - xorl %eax, %eax > - VZEROUPPER_RETURN > + subq $(VEC_SIZE * 4), %rdx > + ja L(more_4x_vec) > + > + cmpl $(VEC_SIZE * -1), %edx > + jle L(ret_vec_x2_test) > + > +L(last_vec): > + testl %ecx, %ecx > + jnz L(ret_vec_x2) > + > + /* Needed no matter what. */ > + vpcmpeqb -(VEC_SIZE * 4 - 1)(%rax), %ymm0, %ymm1 > + vpmovmskb %ymm1, %ecx > + lzcntl %ecx, %ecx > + subq $(VEC_SIZE * 3), %rax > + COND_VZEROUPPER > + subq %rcx, %rax > + cmpq %rax, %rdi > + ja L(zero_2) > + ret > > - .p2align 4 > -L(null): > + /* First in aligning bytes. 
*/ > +L(zero_2): > xorl %eax, %eax > ret > > - .p2align 4 > -L(last_vec_or_less_aligned): > - movl %edx, %ecx > + .p2align 4,, 4 > +L(ret_vec_x2_test): > + lzcntl %ecx, %ecx > + subq $(VEC_SIZE * 2), %rax > + COND_VZEROUPPER > + subq %rcx, %rax > + cmpq %rax, %rdi > + ja L(zero_2) > + ret > > - vpcmpeqb (%rdi), %ymm0, %ymm1 > > - movl $1, %edx > - /* Support rdx << 32. */ > - salq %cl, %rdx > - subq $1, %rdx > + .p2align 4,, 11 > +L(ret_vec_x2): > + /* ecx must be non-zero. */ > + bsrl %ecx, %ecx > + leaq (VEC_SIZE * -3 + 1)(%rcx, %rax), %rax > + VZEROUPPER_RETURN > > - vpmovmskb %ymm1, %eax > + .p2align 4,, 14 > +L(ret_vec_x3): > + /* ecx must be non-zero. */ > + bsrl %ecx, %ecx > + leaq (VEC_SIZE * -4 + 1)(%rcx, %rax), %rax > + VZEROUPPER_RETURN > > - /* Remove the trailing bytes. */ > - andl %edx, %eax > - testl %eax, %eax > - jz L(zero) > > - bsrl %eax, %eax > - addq %rdi, %rax > - VZEROUPPER_RETURN > > .p2align 4 > -L(last_vec_or_less): > - addl $VEC_SIZE, %edx > +L(more_4x_vec): > + testl %ecx, %ecx > + jnz L(ret_vec_x2) > > - /* Check for zero length. */ > - testl %edx, %edx > - jz L(null) > + vpcmpeqb -(VEC_SIZE * 4 - 1)(%rax), %ymm0, %ymm1 > + vpmovmskb %ymm1, %ecx > > - movl %edi, %ecx > - andl $(VEC_SIZE - 1), %ecx > - jz L(last_vec_or_less_aligned) > + testl %ecx, %ecx > + jnz L(ret_vec_x3) > > - movl %ecx, %esi > - movl %ecx, %r8d > - addl %edx, %esi > - andq $-VEC_SIZE, %rdi > + /* Check if near end before re-aligning (otherwise might do an > + unnecissary loop iteration). */ > + addq $-(VEC_SIZE * 4), %rax > + cmpq $(VEC_SIZE * 4), %rdx > + jbe L(last_4x_vec) > > - subl $VEC_SIZE, %esi > - ja L(last_vec_2x_aligned) > + /* Align rax to (VEC_SIZE - 1). */ > + orq $(VEC_SIZE * 4 - 1), %rax > + movq %rdi, %rdx > + /* Get endptr for loop in rdx. NB: Can't just do while rax > rdi because > + lengths that overflow can be valid and break the comparison. */ > + orq $(VEC_SIZE * 4 - 1), %rdx > > - /* Check the last VEC. */ > - vpcmpeqb (%rdi), %ymm0, %ymm1 > - vpmovmskb %ymm1, %eax > - > - /* Remove the leading and trailing bytes. */ > - sarl %cl, %eax > - movl %edx, %ecx > + .p2align 4 > +L(loop_4x_vec): > + /* Need this comparison next no matter what. */ > + vpcmpeqb -(VEC_SIZE * 1 - 1)(%rax), %ymm0, %ymm1 > + vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm2 > + vpcmpeqb -(VEC_SIZE * 3 - 1)(%rax), %ymm0, %ymm3 > + vpcmpeqb -(VEC_SIZE * 4 - 1)(%rax), %ymm0, %ymm4 > > - movl $1, %edx > - sall %cl, %edx > - subl $1, %edx > + vpor %ymm1, %ymm2, %ymm2 > + vpor %ymm3, %ymm4, %ymm4 > + vpor %ymm2, %ymm4, %ymm4 > + vpmovmskb %ymm4, %esi > > - andl %edx, %eax > - testl %eax, %eax > - jz L(zero) > + testl %esi, %esi > + jnz L(loop_end) > > - bsrl %eax, %eax > - addq %rdi, %rax > - addq %r8, %rax > - VZEROUPPER_RETURN > + addq $(VEC_SIZE * -4), %rax > + cmpq %rdx, %rax > + jne L(loop_4x_vec) > > - .p2align 4 > -L(last_vec_2x_aligned): > - movl %esi, %ecx > + subl %edi, %edx > + incl %edx > > - /* Check the last VEC. */ > - vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm1 > +L(last_4x_vec): > + /* Used no matter what. */ > + vpcmpeqb -(VEC_SIZE * 1 - 1)(%rax), %ymm0, %ymm1 > + vpmovmskb %ymm1, %ecx > > - movl $1, %edx > - sall %cl, %edx > - subl $1, %edx > + cmpl $(VEC_SIZE * 2), %edx > + jbe L(last_2x_vec) > > - vpmovmskb %ymm1, %eax > + testl %ecx, %ecx > + jnz L(ret_vec_x0_end) > > - /* Remove the trailing bytes. 
*/ > - andl %edx, %eax > + vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm1 > + vpmovmskb %ymm1, %ecx > + testl %ecx, %ecx > + jnz L(ret_vec_x1_end) > > - testl %eax, %eax > - jnz L(last_vec_x1) > + /* Used no matter what. */ > + vpcmpeqb -(VEC_SIZE * 3 - 1)(%rax), %ymm0, %ymm1 > + vpmovmskb %ymm1, %ecx > > - /* Check the second last VEC. */ > - vpcmpeqb (%rdi), %ymm0, %ymm1 > + cmpl $(VEC_SIZE * 3), %edx > + ja L(last_vec) > + > + lzcntl %ecx, %ecx > + subq $(VEC_SIZE * 2), %rax > + COND_VZEROUPPER > + subq %rcx, %rax > + cmpq %rax, %rdi > + jbe L(ret0) > + xorl %eax, %eax > +L(ret0): > + ret > > - movl %r8d, %ecx > > - vpmovmskb %ymm1, %eax > + .p2align 4 > +L(loop_end): > + vpmovmskb %ymm1, %ecx > + testl %ecx, %ecx > + jnz L(ret_vec_x0_end) > + > + vpmovmskb %ymm2, %ecx > + testl %ecx, %ecx > + jnz L(ret_vec_x1_end) > + > + vpmovmskb %ymm3, %ecx > + /* Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3) > + then it won't affect the result in esi (VEC4). If ecx is non-zero > + then CHAR in VEC3 and bsrq will use that position. */ > + salq $32, %rcx > + orq %rsi, %rcx > + bsrq %rcx, %rcx > + leaq (VEC_SIZE * -4 + 1)(%rcx, %rax), %rax > + VZEROUPPER_RETURN > > - /* Remove the leading bytes. Must use unsigned right shift for > - bsrl below. */ > - shrl %cl, %eax > - testl %eax, %eax > - jz L(zero) > + .p2align 4,, 4 > +L(ret_vec_x1_end): > + /* 64-bit version will automatically add 32 (VEC_SIZE). */ > + lzcntq %rcx, %rcx > + subq %rcx, %rax > + VZEROUPPER_RETURN > > - bsrl %eax, %eax > - addq %rdi, %rax > - addq %r8, %rax > + .p2align 4,, 4 > +L(ret_vec_x0_end): > + lzcntl %ecx, %ecx > + subq %rcx, %rax > VZEROUPPER_RETURN > -END (MEMRCHR) > + > + /* 2 bytes until next cache line. */ > +END(MEMRCHR) > #endif > -- > 2.34.1 > OK with the updated comments. Reviewed-by: H.J. Lu Thanks. -- H.J.
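[One more aside on the L(loop_end) mask-combining trick quoted above, where the VEC3 match bits are shifted above the VEC4 bits so a single bsrq resolves both vectors. Below is a scalar C sketch of the idea only; the names base, mask_lo and mask_hi are mine, and at least one mask is assumed non-zero, as it is on that path.]

  #include <stdint.h>

  /* Sketch only: MASK_LO covers bytes BASE + 0 .. BASE + 31 and MASK_HI
     covers bytes BASE + 32 .. BASE + 63 (bit i of each mask <-> byte i of
     its vector).  Placing the higher-address mask in the upper 32 bits lets
     one 64-bit bit-scan-reverse pick the match at the highest address, in
     the spirit of the salq/orq/bsrq sequence.  Requires a non-zero value.  */
  static const char *
  last_match_in_2vec (const char *base, uint32_t mask_lo, uint32_t mask_hi)
  {
    uint64_t combined = ((uint64_t) mask_hi << 32) | mask_lo;
    int pos = 63 - __builtin_clzll (combined);   /* equivalent to bsrq  */
    return base + pos;
  }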