From: "H.J. Lu"
Date: Tue, 7 Jun 2022 11:17:26 -0700
Subject: Re: [PATCH v6 6/8] x86: Optimize memrchr-avx2.S
To: Noah Goldstein
Cc: GNU C Library, "Carlos O'Donell"
In-Reply-To: <20220607041134.2369903-6-goldstein.w.n@gmail.com>

On Mon, Jun 6, 2022 at 9:11 PM Noah Goldstein wrote:
>
> The new code:
>    1. prioritizes smaller user-arg lengths more.
>    2. optimizes target placement more carefully.
>    3. reuses logic more.
>    4. fixes up various inefficiencies in the logic. The biggest
>       case here is the `lzcnt` logic for checking returns, which
>       saves either a branch or multiple instructions.
>
> The total code size saving is 306 bytes.
> Geometric mean of all benchmarks, New / Old: 0.760
>
> Regressions:
> There are some regressions, particularly where the length (user-arg
> length) is large but the position of the match char is near the
> beginning of the string (in the first VEC). This case has roughly a
> 10-20% regression.
>
> This is because the new logic gives the hot path for immediate matches
> to shorter lengths (the more common input). That case has roughly a
> 15-45% speedup.
>
> Full xcheck passes on x86_64.
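[For intuition when reading the diff below: a minimal C sketch of the `lzcnt` return trick described in point 4. This is illustrative only, not the glibc routine; the function name, the single-vector scope, and the assumption that the 32-byte load stays within readable memory are mine.]

  #include <immintrin.h>
  #include <stddef.h>

  /* Sketch only: find the last occurrence of C in the final 32-byte window
     of a buffer whose last byte is at END and whose remaining length is LEN
     (LEN <= 32).  Mirrors the "end pointer minus lzcnt" pattern.
     Assumes -mavx2 -mlzcnt and that [END - 31, END] is readable.  */
  static const char *
  last_match_in_vec (const char *end, int c, size_t len)
  {
    __m256i vc = _mm256_set1_epi8 ((char) c);
    __m256i data = _mm256_loadu_si256 ((const __m256i *) (end - 31));
    unsigned int mask = _mm256_movemask_epi8 (_mm256_cmpeq_epi8 (data, vc));

    /* Bit i of MASK corresponds to byte END - 31 + i, so the last match is
       at END - lzcnt (MASK).  lzcnt of a zero mask is defined as 32, so a
       single compare against LEN rejects both the no-match case and matches
       that fall before the start of the buffer -- that is where the saved
       branch comes from.  */
    unsigned int lz = _lzcnt_u32 (mask);
    if (lz >= len)
      return NULL;
    return end - lz;
  }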
> --- > sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S | 1 + > sysdeps/x86_64/multiarch/memrchr-avx2.S | 534 ++++++++++---------- > 2 files changed, 257 insertions(+), 278 deletions(-) > > diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S > index cea2d2a72d..5e9beeeef2 100644 > --- a/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S > +++ b/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S > @@ -2,6 +2,7 @@ > # define MEMRCHR __memrchr_avx2_rtm > #endif > > +#define COND_VZEROUPPER COND_VZEROUPPER_XTEST > #define ZERO_UPPER_VEC_REGISTERS_RETURN \ > ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST > > diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2.S b/sysdeps/x86_64/multiarch/memrchr-avx2.S > index ba2ce7cb03..bea4528068 100644 > --- a/sysdeps/x86_64/multiarch/memrchr-avx2.S > +++ b/sysdeps/x86_64/multiarch/memrchr-avx2.S > @@ -21,340 +21,318 @@ > # include > > # ifndef MEMRCHR > -# define MEMRCHR __memrchr_avx2 > +# define MEMRCHR __memrchr_avx2 > # endif > > # ifndef VZEROUPPER > -# define VZEROUPPER vzeroupper > +# define VZEROUPPER vzeroupper > # endif > > # ifndef SECTION > # define SECTION(p) p##.avx > # endif > > -# define VEC_SIZE 32 > +# define VEC_SIZE 32 > +# define PAGE_SIZE 4096 > + .section SECTION(.text), "ax", @progbits > +ENTRY(MEMRCHR) > +# ifdef __ILP32__ > + /* Clear upper bits. */ > + and %RDX_LP, %RDX_LP > +# else > + test %RDX_LP, %RDX_LP > +# endif > + jz L(zero_0) > > - .section SECTION(.text),"ax",@progbits > -ENTRY (MEMRCHR) > - /* Broadcast CHAR to YMM0. */ > vmovd %esi, %xmm0 > - vpbroadcastb %xmm0, %ymm0 > - > - sub $VEC_SIZE, %RDX_LP > - jbe L(last_vec_or_less) > - > - add %RDX_LP, %RDI_LP > - > - /* Check the last VEC_SIZE bytes. */ > - vpcmpeqb (%rdi), %ymm0, %ymm1 > - vpmovmskb %ymm1, %eax > - testl %eax, %eax > - jnz L(last_vec_x0) > + /* Get end pointer. Minus one for two reasons. 1) It is necessary for a > + correct page cross check and 2) it correctly sets up end ptr to be > + subtract by lzcnt aligned. */ > + leaq -1(%rdx, %rdi), %rax > > - subq $(VEC_SIZE * 4), %rdi > - movl %edi, %ecx > - andl $(VEC_SIZE - 1), %ecx > - jz L(aligned_more) > + vpbroadcastb %xmm0, %ymm0 > > - /* Align data for aligned loads in the loop. */ > - addq $VEC_SIZE, %rdi > - addq $VEC_SIZE, %rdx > - andq $-VEC_SIZE, %rdi > - subq %rcx, %rdx > + /* Check if we can load 1x VEC without cross a page. */ > + testl $(PAGE_SIZE - VEC_SIZE), %eax > + jz L(page_cross) > + > + vpcmpeqb -(VEC_SIZE - 1)(%rax), %ymm0, %ymm1 > + vpmovmskb %ymm1, %ecx > + cmpq $VEC_SIZE, %rdx > + ja L(more_1x_vec) > + > +L(ret_vec_x0_test): > + /* If ecx is zero (no matches) lzcnt will set it 32 (VEC_SIZE) which > + will gurantee edx (len) is less than it. */ > + lzcntl %ecx, %ecx > + > + /* Hoist vzeroupper (not great for RTM) to save code size. This allows > + all logic for edx (len) <= VEC_SIZE to fit in first cache line. */ > + COND_VZEROUPPER > + cmpl %ecx, %edx > + jle L(zero_0) > + subq %rcx, %rax > + ret > > - .p2align 4 > -L(aligned_more): > - subq $(VEC_SIZE * 4), %rdx > - jbe L(last_4x_vec_or_less) > - > - /* Check the last 4 * VEC_SIZE. Only one VEC_SIZE at a time > - since data is only aligned to VEC_SIZE. 
*/ > - vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 > - vpmovmskb %ymm1, %eax > - testl %eax, %eax > - jnz L(last_vec_x3) > - > - vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2 > - vpmovmskb %ymm2, %eax > - testl %eax, %eax > - jnz L(last_vec_x2) > - > - vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3 > - vpmovmskb %ymm3, %eax > - testl %eax, %eax > - jnz L(last_vec_x1) > - > - vpcmpeqb (%rdi), %ymm0, %ymm4 > - vpmovmskb %ymm4, %eax > - testl %eax, %eax > - jnz L(last_vec_x0) > - > - /* Align data to 4 * VEC_SIZE for loop with fewer branches. > - There are some overlaps with above if data isn't aligned > - to 4 * VEC_SIZE. */ > - movl %edi, %ecx > - andl $(VEC_SIZE * 4 - 1), %ecx > - jz L(loop_4x_vec) > - > - addq $(VEC_SIZE * 4), %rdi > - addq $(VEC_SIZE * 4), %rdx > - andq $-(VEC_SIZE * 4), %rdi > - subq %rcx, %rdx > + /* Fits in aligning bytes of first cache line. */ > +L(zero_0): > + xorl %eax, %eax > + ret > > - .p2align 4 > -L(loop_4x_vec): > - /* Compare 4 * VEC at a time forward. */ > - subq $(VEC_SIZE * 4), %rdi > - subq $(VEC_SIZE * 4), %rdx > - jbe L(last_4x_vec_or_less) > - > - vmovdqa (%rdi), %ymm1 > - vmovdqa VEC_SIZE(%rdi), %ymm2 > - vmovdqa (VEC_SIZE * 2)(%rdi), %ymm3 > - vmovdqa (VEC_SIZE * 3)(%rdi), %ymm4 > - > - vpcmpeqb %ymm1, %ymm0, %ymm1 > - vpcmpeqb %ymm2, %ymm0, %ymm2 > - vpcmpeqb %ymm3, %ymm0, %ymm3 > - vpcmpeqb %ymm4, %ymm0, %ymm4 > - > - vpor %ymm1, %ymm2, %ymm5 > - vpor %ymm3, %ymm4, %ymm6 > - vpor %ymm5, %ymm6, %ymm5 > - > - vpmovmskb %ymm5, %eax > - testl %eax, %eax > - jz L(loop_4x_vec) > - > - /* There is a match. */ > - vpmovmskb %ymm4, %eax > - testl %eax, %eax > - jnz L(last_vec_x3) > - > - vpmovmskb %ymm3, %eax > - testl %eax, %eax > - jnz L(last_vec_x2) > - > - vpmovmskb %ymm2, %eax > - testl %eax, %eax > - jnz L(last_vec_x1) > - > - vpmovmskb %ymm1, %eax > - bsrl %eax, %eax > - addq %rdi, %rax > + .p2align 4,, 9 > +L(ret_vec_x0): > + lzcntl %ecx, %ecx > + subq %rcx, %rax > L(return_vzeroupper): > ZERO_UPPER_VEC_REGISTERS_RETURN > > - .p2align 4 > -L(last_4x_vec_or_less): > - addl $(VEC_SIZE * 4), %edx > - cmpl $(VEC_SIZE * 2), %edx > - jbe L(last_2x_vec) > - > - vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 > - vpmovmskb %ymm1, %eax > - testl %eax, %eax > - jnz L(last_vec_x3) > - > - vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2 > - vpmovmskb %ymm2, %eax > - testl %eax, %eax > - jnz L(last_vec_x2) > - > - vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3 > - vpmovmskb %ymm3, %eax > - testl %eax, %eax > - jnz L(last_vec_x1_check) > - cmpl $(VEC_SIZE * 3), %edx > - jbe L(zero) > - > - vpcmpeqb (%rdi), %ymm0, %ymm4 > - vpmovmskb %ymm4, %eax > - testl %eax, %eax > - jz L(zero) > - bsrl %eax, %eax > - subq $(VEC_SIZE * 4), %rdx > - addq %rax, %rdx > - jl L(zero) > - addq %rdi, %rax > - VZEROUPPER_RETURN > - > - .p2align 4 > + .p2align 4,, 10 > +L(more_1x_vec): > + testl %ecx, %ecx > + jnz L(ret_vec_x0) > + > + /* Align rax (string pointer). */ > + andq $-VEC_SIZE, %rax > + > + /* Recompute remaining length after aligning. */ > + movq %rax, %rdx > + /* Need this comparison next no matter what. */ > + vpcmpeqb -(VEC_SIZE)(%rax), %ymm0, %ymm1 > + subq %rdi, %rdx > + decq %rax > + vpmovmskb %ymm1, %ecx > + /* Fall through for short (hotter than length). 
*/ > + cmpq $(VEC_SIZE * 2), %rdx > + ja L(more_2x_vec) > L(last_2x_vec): > - vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 > - vpmovmskb %ymm1, %eax > - testl %eax, %eax > - jnz L(last_vec_x3_check) > cmpl $VEC_SIZE, %edx > - jbe L(zero) > - > - vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 > - vpmovmskb %ymm1, %eax > - testl %eax, %eax > - jz L(zero) > - bsrl %eax, %eax > - subq $(VEC_SIZE * 2), %rdx > - addq %rax, %rdx > - jl L(zero) > - addl $(VEC_SIZE * 2), %eax > - addq %rdi, %rax > - VZEROUPPER_RETURN > - > - .p2align 4 > -L(last_vec_x0): > - bsrl %eax, %eax > - addq %rdi, %rax > - VZEROUPPER_RETURN > + jbe L(ret_vec_x0_test) > + > + testl %ecx, %ecx > + jnz L(ret_vec_x0) > + > + vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm1 > + vpmovmskb %ymm1, %ecx > + /* 64-bit lzcnt. This will naturally add 32 to position. */ > + lzcntq %rcx, %rcx > + COND_VZEROUPPER > + cmpl %ecx, %edx > + jle L(zero_0) > + subq %rcx, %rax > + ret > > - .p2align 4 > -L(last_vec_x1): > - bsrl %eax, %eax > - addl $VEC_SIZE, %eax > - addq %rdi, %rax > - VZEROUPPER_RETURN > > - .p2align 4 > -L(last_vec_x2): > - bsrl %eax, %eax > - addl $(VEC_SIZE * 2), %eax > - addq %rdi, %rax > + /* Inexpensive place to put this regarding code size / target alignments > + / ICache NLP. Necessary for 2-byte encoding of jump to page cross > + case which in turn in necessary for hot path (len <= VEC_SIZE) to fit is necessary? > + in first cache line. */ > +L(page_cross): > + movq %rax, %rsi > + andq $-VEC_SIZE, %rsi > + vpcmpeqb (%rsi), %ymm0, %ymm1 > + vpmovmskb %ymm1, %ecx > + /* Shift out negative alignment (because we are starting from endptr and > + working backwards). */ > + movl %eax, %r8d > + /* notl because eax already has endptr - 1. (-x = ~(x - 1)). */ > + notl %r8d > + shlxl %r8d, %ecx, %ecx > + cmpq %rdi, %rsi > + ja L(more_1x_vec) > + lzcntl %ecx, %ecx > + COND_VZEROUPPER > + cmpl %ecx, %edx > + jle L(zero_0) > + subq %rcx, %rax > + ret > + .p2align 4,, 11 > +L(ret_vec_x1): > + /* This will naturally add 32 to position. */ > + lzcntq %rcx, %rcx > + subq %rcx, %rax > VZEROUPPER_RETURN > + .p2align 4,, 10 > +L(more_2x_vec): > + testl %ecx, %ecx > + jnz L(ret_vec_x0) > > - .p2align 4 > -L(last_vec_x3): > - bsrl %eax, %eax > - addl $(VEC_SIZE * 3), %eax > - addq %rdi, %rax > - ret > + vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm1 > + vpmovmskb %ymm1, %ecx > + testl %ecx, %ecx > + jnz L(ret_vec_x1) > > - .p2align 4 > -L(last_vec_x1_check): > - bsrl %eax, %eax > - subq $(VEC_SIZE * 3), %rdx > - addq %rax, %rdx > - jl L(zero) > - addl $VEC_SIZE, %eax > - addq %rdi, %rax > - VZEROUPPER_RETURN > > - .p2align 4 > -L(last_vec_x3_check): > - bsrl %eax, %eax > - subq $VEC_SIZE, %rdx > - addq %rax, %rdx > - jl L(zero) > - addl $(VEC_SIZE * 3), %eax > - addq %rdi, %rax > - VZEROUPPER_RETURN > + /* Needed no matter what. */ > + vpcmpeqb -(VEC_SIZE * 3 - 1)(%rax), %ymm0, %ymm1 > + vpmovmskb %ymm1, %ecx > > - .p2align 4 > -L(zero): > - xorl %eax, %eax > - VZEROUPPER_RETURN > + subq $(VEC_SIZE * 4), %rdx > + ja L(more_4x_vec) > + > + cmpl $(VEC_SIZE * -1), %edx > + jle L(ret_vec_x2_test) > + > +L(last_vec): > + testl %ecx, %ecx > + jnz L(ret_vec_x2) > + > + /* Needed no matter what. */ > + vpcmpeqb -(VEC_SIZE * 4 - 1)(%rax), %ymm0, %ymm1 > + vpmovmskb %ymm1, %ecx > + lzcntl %ecx, %ecx > + subq $(VEC_SIZE * 3), %rax > + COND_VZEROUPPER > + subq %rcx, %rax > + cmpq %rax, %rdi > + ja L(zero_2) > + ret > > - .p2align 4 > -L(null): > + /* First in aligning bytes. 
*/ > +L(zero_2): > xorl %eax, %eax > ret > > - .p2align 4 > -L(last_vec_or_less_aligned): > - movl %edx, %ecx > + .p2align 4,, 4 > +L(ret_vec_x2_test): > + lzcntl %ecx, %ecx > + subq $(VEC_SIZE * 2), %rax > + COND_VZEROUPPER > + subq %rcx, %rax > + cmpq %rax, %rdi > + ja L(zero_2) > + ret > > - vpcmpeqb (%rdi), %ymm0, %ymm1 > > - movl $1, %edx > - /* Support rdx << 32. */ > - salq %cl, %rdx > - subq $1, %rdx > + .p2align 4,, 11 > +L(ret_vec_x2): > + /* ecx must be non-zero. */ > + bsrl %ecx, %ecx > + leaq (VEC_SIZE * -3 + 1)(%rcx, %rax), %rax > + VZEROUPPER_RETURN > > - vpmovmskb %ymm1, %eax > + .p2align 4,, 14 > +L(ret_vec_x3): > + /* ecx must be non-zero. */ > + bsrl %ecx, %ecx > + leaq (VEC_SIZE * -4 + 1)(%rcx, %rax), %rax > + VZEROUPPER_RETURN > > - /* Remove the trailing bytes. */ > - andl %edx, %eax > - testl %eax, %eax > - jz L(zero) > > - bsrl %eax, %eax > - addq %rdi, %rax > - VZEROUPPER_RETURN > > .p2align 4 > -L(last_vec_or_less): > - addl $VEC_SIZE, %edx > +L(more_4x_vec): > + testl %ecx, %ecx > + jnz L(ret_vec_x2) > > - /* Check for zero length. */ > - testl %edx, %edx > - jz L(null) > + vpcmpeqb -(VEC_SIZE * 4 - 1)(%rax), %ymm0, %ymm1 > + vpmovmskb %ymm1, %ecx > > - movl %edi, %ecx > - andl $(VEC_SIZE - 1), %ecx > - jz L(last_vec_or_less_aligned) > + testl %ecx, %ecx > + jnz L(ret_vec_x3) > > - movl %ecx, %esi > - movl %ecx, %r8d > - addl %edx, %esi > - andq $-VEC_SIZE, %rdi > + /* Check if near end before re-aligning (otherwise might do an > + unnecissary loop iteration). */ > + addq $-(VEC_SIZE * 4), %rax > + cmpq $(VEC_SIZE * 4), %rdx > + jbe L(last_4x_vec) > > - subl $VEC_SIZE, %esi > - ja L(last_vec_2x_aligned) > + /* Align rax to (VEC_SIZE - 1). */ > + orq $(VEC_SIZE * 4 - 1), %rax > + movq %rdi, %rdx > + /* Get endptr for loop in rdx. NB: Can't just do while rax > rdi because > + lengths that overflow can be valid and break the comparison. */ > + orq $(VEC_SIZE * 4 - 1), %rdx > > - /* Check the last VEC. */ > - vpcmpeqb (%rdi), %ymm0, %ymm1 > - vpmovmskb %ymm1, %eax > - > - /* Remove the leading and trailing bytes. */ > - sarl %cl, %eax > - movl %edx, %ecx > + .p2align 4 > +L(loop_4x_vec): > + /* Need this comparison next no matter what. */ > + vpcmpeqb -(VEC_SIZE * 1 - 1)(%rax), %ymm0, %ymm1 > + vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm2 > + vpcmpeqb -(VEC_SIZE * 3 - 1)(%rax), %ymm0, %ymm3 > + vpcmpeqb -(VEC_SIZE * 4 - 1)(%rax), %ymm0, %ymm4 > > - movl $1, %edx > - sall %cl, %edx > - subl $1, %edx > + vpor %ymm1, %ymm2, %ymm2 > + vpor %ymm3, %ymm4, %ymm4 > + vpor %ymm2, %ymm4, %ymm4 > + vpmovmskb %ymm4, %esi > > - andl %edx, %eax > - testl %eax, %eax > - jz L(zero) > + testl %esi, %esi > + jnz L(loop_end) > > - bsrl %eax, %eax > - addq %rdi, %rax > - addq %r8, %rax > - VZEROUPPER_RETURN > + addq $(VEC_SIZE * -4), %rax > + cmpq %rdx, %rax > + jne L(loop_4x_vec) > > - .p2align 4 > -L(last_vec_2x_aligned): > - movl %esi, %ecx > + subl %edi, %edx > + incl %edx > > - /* Check the last VEC. */ > - vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm1 > +L(last_4x_vec): > + /* Used no matter what. */ > + vpcmpeqb -(VEC_SIZE * 1 - 1)(%rax), %ymm0, %ymm1 > + vpmovmskb %ymm1, %ecx > > - movl $1, %edx > - sall %cl, %edx > - subl $1, %edx > + cmpl $(VEC_SIZE * 2), %edx > + jbe L(last_2x_vec) > > - vpmovmskb %ymm1, %eax > + testl %ecx, %ecx > + jnz L(ret_vec_x0_end) > > - /* Remove the trailing bytes. 
*/ > - andl %edx, %eax > + vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm1 > + vpmovmskb %ymm1, %ecx > + testl %ecx, %ecx > + jnz L(ret_vec_x1_end) > > - testl %eax, %eax > - jnz L(last_vec_x1) > + /* Used no matter what. */ > + vpcmpeqb -(VEC_SIZE * 3 - 1)(%rax), %ymm0, %ymm1 > + vpmovmskb %ymm1, %ecx > > - /* Check the second last VEC. */ > - vpcmpeqb (%rdi), %ymm0, %ymm1 > + cmpl $(VEC_SIZE * 3), %edx > + ja L(last_vec) > + > + lzcntl %ecx, %ecx > + subq $(VEC_SIZE * 2), %rax > + COND_VZEROUPPER > + subq %rcx, %rax > + cmpq %rax, %rdi > + jbe L(ret0) > + xorl %eax, %eax > +L(ret0): > + ret > > - movl %r8d, %ecx > > - vpmovmskb %ymm1, %eax > + .p2align 4 > +L(loop_end): > + vpmovmskb %ymm1, %ecx > + testl %ecx, %ecx > + jnz L(ret_vec_x0_end) > + > + vpmovmskb %ymm2, %ecx > + testl %ecx, %ecx > + jnz L(ret_vec_x1_end) > + > + vpmovmskb %ymm3, %ecx > + /* Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3) > + then it won't affect the result in esi (VEC4). If ecx is non-zero > + then CHAR in VEC3 and bsrq will use that position. */ > + salq $32, %rcx > + orq %rsi, %rcx > + bsrq %rcx, %rcx > + leaq (VEC_SIZE * -4 + 1)(%rcx, %rax), %rax > + VZEROUPPER_RETURN > > - /* Remove the leading bytes. Must use unsigned right shift for > - bsrl below. */ > - shrl %cl, %eax > - testl %eax, %eax > - jz L(zero) > + .p2align 4,, 4 > +L(ret_vec_x1_end): > + /* 64-bit version will automatically add 32 (VEC_SIZE). */ > + lzcntq %rcx, %rcx > + subq %rcx, %rax > + VZEROUPPER_RETURN > > - bsrl %eax, %eax > - addq %rdi, %rax > - addq %r8, %rax > + .p2align 4,, 4 > +L(ret_vec_x0_end): > + lzcntl %ecx, %ecx > + subq %rcx, %rax > VZEROUPPER_RETURN > -END (MEMRCHR) > + > + /* 2 bytes until next cache line. */ > +END(MEMRCHR) > #endif > -- > 2.34.1 > OK with the updated comments. Reviewed-by: H.J. Lu Thanks. -- H.J.
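[One more aside on the L(loop_end) mask-combining trick quoted above, where the VEC3 match bits are shifted above the VEC4 bits so a single bsrq resolves both vectors. Below is a scalar C sketch of the idea only; the names base, mask_lo and mask_hi are mine, and at least one mask is assumed non-zero, as it is on that path.]

  #include <stdint.h>

  /* Sketch only: MASK_LO covers bytes BASE + 0 .. BASE + 31 and MASK_HI
     covers bytes BASE + 32 .. BASE + 63 (bit i of each mask <-> byte i of
     its vector).  Placing the higher-address mask in the upper 32 bits lets
     one 64-bit bit-scan-reverse pick the match at the highest address, in
     the spirit of the salq/orq/bsrq sequence.  Requires a non-zero value.  */
  static const char *
  last_match_in_2vec (const char *base, uint32_t mask_lo, uint32_t mask_hi)
  {
    uint64_t combined = ((uint64_t) mask_hi << 32) | mask_lo;
    int pos = 63 - __builtin_clzll (combined);   /* equivalent to bsrq  */
    return base + pos;
  }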