From: "H.J. Lu"
Date: Tue, 12 Jul 2022 15:58:42 -0700
Subject: Re: [PATCH v1] x86: Move memrchr SSE2 implementation to multiarch/memrchr-sse2.S
To: Noah Goldstein
Cc: GNU C Library, "Carlos O'Donell"
In-Reply-To: <20220712192910.351121-3-goldstein.w.n@gmail.com>
References: <20220712192910.351121-1-goldstein.w.n@gmail.com> <20220712192910.351121-3-goldstein.w.n@gmail.com>

On Tue, Jul 12, 2022 at 12:29 PM Noah Goldstein wrote:
>
> This commit doesn't affect libc.so.6, its just housekeeping to prepare
> for adding explicit ISA level support.
>
> Tested build on x86_64 and x86_32 with/without multiarch.
> ---
>  sysdeps/x86_64/memrchr.S                | 332 +----------------------
>  sysdeps/x86_64/multiarch/memrchr-sse2.S | 336 +++++++++++++++++++++++-
>  2 files changed, 334 insertions(+), 334 deletions(-)
>
> diff --git a/sysdeps/x86_64/memrchr.S b/sysdeps/x86_64/memrchr.S
> index b0dffd2ae2..385e2c5668 100644
> --- a/sysdeps/x86_64/memrchr.S
> +++ b/sysdeps/x86_64/memrchr.S
> @@ -17,334 +17,6 @@
>     License along with the GNU C Library; if not, see
>     <https://www.gnu.org/licenses/>. */
>
> -#include <sysdep.h>
> -#define VEC_SIZE 16
> -#define PAGE_SIZE 4096
> -
> -	.text
> -ENTRY_P2ALIGN(__memrchr, 6)
> -#ifdef __ILP32__
> -	/* Clear upper bits. */
> -	mov %RDX_LP, %RDX_LP
> -#endif
> -	movd %esi, %xmm0
> -
> -	/* Get end pointer. */
> -	leaq (%rdx, %rdi), %rcx
> -
> -	punpcklbw %xmm0, %xmm0
> -	punpcklwd %xmm0, %xmm0
> -	pshufd $0, %xmm0, %xmm0
> -
> -	/* Check if we can load 1x VEC without cross a page. */
> -	testl $(PAGE_SIZE - VEC_SIZE), %ecx
> -	jz L(page_cross)
> -
> -	/* NB: This load happens regardless of whether rdx (len) is zero. Since
> -	   it doesn't cross a page and the standard gurantees any pointer have
> -	   at least one-valid byte this load must be safe. For the entire
> -	   history of the x86 memrchr implementation this has been possible so
> -	   no code "should" be relying on a zero-length check before this load.
> -	   The zero-length check is moved to the page cross case because it is
> -	   1) pretty cold and including it pushes the hot case len <= VEC_SIZE
> -	   into 2-cache lines. */
> -	movups -(VEC_SIZE)(%rcx), %xmm1
> -	pcmpeqb %xmm0, %xmm1
> -	pmovmskb %xmm1, %eax
> -
> -	subq $VEC_SIZE, %rdx
> -	ja L(more_1x_vec)
> -L(ret_vec_x0_test):
> -	/* Zero-flag set if eax (src) is zero. Destination unchanged if src is
> -	   zero. */
> -	bsrl %eax, %eax
> -	jz L(ret_0)
> -	/* Check if the CHAR match is in bounds. Need to truly zero `eax` here
> -	   if out of bounds. */
> -	addl %edx, %eax
> -	jl L(zero_0)
> -	/* Since we subtracted VEC_SIZE from rdx earlier we can just add to base
> -	   ptr. */
> -	addq %rdi, %rax
> -L(ret_0):
> -	ret
> -
> -	.p2align 4,, 5
> -L(ret_vec_x0):
> -	bsrl %eax, %eax
> -	leaq -(VEC_SIZE)(%rcx, %rax), %rax
> -	ret
> -
> -	.p2align 4,, 2
> -L(zero_0):
> -	xorl %eax, %eax
> -	ret
> -
> -
> -	.p2align 4,, 8
> -L(more_1x_vec):
> -	testl %eax, %eax
> -	jnz L(ret_vec_x0)
> -
> -	/* Align rcx (pointer to string). */
> -	decq %rcx
> -	andq $-VEC_SIZE, %rcx
> -
> -	movq %rcx, %rdx
> -	/* NB: We could consistenyl save 1-byte in this pattern with `movaps
> -	   %xmm0, %xmm1; pcmpeq IMM8(r), %xmm1; ...`. The reason against it is
> -	   it adds more frontend uops (even if the moves can be eliminated) and
> -	   some percentage of the time actual backend uops. */
> -	movaps -(VEC_SIZE)(%rcx), %xmm1
> -	pcmpeqb %xmm0, %xmm1
> -	subq %rdi, %rdx
> -	pmovmskb %xmm1, %eax
> -
> -	cmpq $(VEC_SIZE * 2), %rdx
> -	ja L(more_2x_vec)
> -L(last_2x_vec):
> -	subl $VEC_SIZE, %edx
> -	jbe L(ret_vec_x0_test)
> -
> -	testl %eax, %eax
> -	jnz L(ret_vec_x0)
> -
> -	movaps -(VEC_SIZE * 2)(%rcx), %xmm1
> -	pcmpeqb %xmm0, %xmm1
> -	pmovmskb %xmm1, %eax
> -
> -	subl $VEC_SIZE, %edx
> -	bsrl %eax, %eax
> -	jz L(ret_1)
> -	addl %edx, %eax
> -	jl L(zero_0)
> -	addq %rdi, %rax
> -L(ret_1):
> -	ret
> -
> -	/* Don't align. Otherwise lose 2-byte encoding in jump to L(page_cross)
> -	   causes the hot pause (length <= VEC_SIZE) to span multiple cache
> -	   lines. Naturally aligned % 16 to 8-bytes. */
> -L(page_cross):
> -	/* Zero length check. */
> -	testq %rdx, %rdx
> -	jz L(zero_0)
> -
> -	leaq -1(%rcx), %r8
> -	andq $-(VEC_SIZE), %r8
> -
> -	movaps (%r8), %xmm1
> -	pcmpeqb %xmm0, %xmm1
> -	pmovmskb %xmm1, %esi
> -	/* Shift out negative alignment (because we are starting from endptr and
> -	   working backwards). */
> -	negl %ecx
> -	/* 32-bit shift but VEC_SIZE=16 so need to mask the shift count
> -	   explicitly. */
> -	andl $(VEC_SIZE - 1), %ecx
> -	shl %cl, %esi
> -	movzwl %si, %eax
> -	leaq (%rdi, %rdx), %rcx
> -	cmpq %rdi, %r8
> -	ja L(more_1x_vec)
> -	subl $VEC_SIZE, %edx
> -	bsrl %eax, %eax
> -	jz L(ret_2)
> -	addl %edx, %eax
> -	jl L(zero_1)
> -	addq %rdi, %rax
> -L(ret_2):
> -	ret
> -
> -	/* Fits in aliging bytes. */
> -L(zero_1):
> -	xorl %eax, %eax
> -	ret
> -
> -	.p2align 4,, 5
> -L(ret_vec_x1):
> -	bsrl %eax, %eax
> -	leaq -(VEC_SIZE * 2)(%rcx, %rax), %rax
> -	ret
> -
> -	.p2align 4,, 8
> -L(more_2x_vec):
> -	testl %eax, %eax
> -	jnz L(ret_vec_x0)
> -
> -	movaps -(VEC_SIZE * 2)(%rcx), %xmm1
> -	pcmpeqb %xmm0, %xmm1
> -	pmovmskb %xmm1, %eax
> -	testl %eax, %eax
> -	jnz L(ret_vec_x1)
> -
> -
> -	movaps -(VEC_SIZE * 3)(%rcx), %xmm1
> -	pcmpeqb %xmm0, %xmm1
> -	pmovmskb %xmm1, %eax
> -
> -	subq $(VEC_SIZE * 4), %rdx
> -	ja L(more_4x_vec)
> -
> -	addl $(VEC_SIZE), %edx
> -	jle L(ret_vec_x2_test)
> -
> -L(last_vec):
> -	testl %eax, %eax
> -	jnz L(ret_vec_x2)
> -
> -	movaps -(VEC_SIZE * 4)(%rcx), %xmm1
> -	pcmpeqb %xmm0, %xmm1
> -	pmovmskb %xmm1, %eax
> -
> -	subl $(VEC_SIZE), %edx
> -	bsrl %eax, %eax
> -	jz L(ret_3)
> -	addl %edx, %eax
> -	jl L(zero_2)
> -	addq %rdi, %rax
> -L(ret_3):
> -	ret
> -
> -	.p2align 4,, 6
> -L(ret_vec_x2_test):
> -	bsrl %eax, %eax
> -	jz L(zero_2)
> -	addl %edx, %eax
> -	jl L(zero_2)
> -	addq %rdi, %rax
> -	ret
> -
> -L(zero_2):
> -	xorl %eax, %eax
> -	ret
> -
> -
> -	.p2align 4,, 5
> -L(ret_vec_x2):
> -	bsrl %eax, %eax
> -	leaq -(VEC_SIZE * 3)(%rcx, %rax), %rax
> -	ret
> -
> -	.p2align 4,, 5
> -L(ret_vec_x3):
> -	bsrl %eax, %eax
> -	leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax
> -	ret
> -
> -	.p2align 4,, 8
> -L(more_4x_vec):
> -	testl %eax, %eax
> -	jnz L(ret_vec_x2)
> -
> -	movaps -(VEC_SIZE * 4)(%rcx), %xmm1
> -	pcmpeqb %xmm0, %xmm1
> -	pmovmskb %xmm1, %eax
> -
> -	testl %eax, %eax
> -	jnz L(ret_vec_x3)
> -
> -	addq $-(VEC_SIZE * 4), %rcx
> -	cmpq $(VEC_SIZE * 4), %rdx
> -	jbe L(last_4x_vec)
> -
> -	/* Offset everything by 4x VEC_SIZE here to save a few bytes at the end
> -	   keeping the code from spilling to the next cache line. */
> -	addq $(VEC_SIZE * 4 - 1), %rcx
> -	andq $-(VEC_SIZE * 4), %rcx
> -	leaq (VEC_SIZE * 4)(%rdi), %rdx
> -	andq $-(VEC_SIZE * 4), %rdx
> -
> -	.p2align 4,, 11
> -L(loop_4x_vec):
> -	movaps (VEC_SIZE * -1)(%rcx), %xmm1
> -	movaps (VEC_SIZE * -2)(%rcx), %xmm2
> -	movaps (VEC_SIZE * -3)(%rcx), %xmm3
> -	movaps (VEC_SIZE * -4)(%rcx), %xmm4
> -	pcmpeqb %xmm0, %xmm1
> -	pcmpeqb %xmm0, %xmm2
> -	pcmpeqb %xmm0, %xmm3
> -	pcmpeqb %xmm0, %xmm4
> -
> -	por %xmm1, %xmm2
> -	por %xmm3, %xmm4
> -	por %xmm2, %xmm4
> -
> -	pmovmskb %xmm4, %esi
> -	testl %esi, %esi
> -	jnz L(loop_end)
> -
> -	addq $-(VEC_SIZE * 4), %rcx
> -	cmpq %rdx, %rcx
> -	jne L(loop_4x_vec)
> -
> -	subl %edi, %edx
> -
> -	/* Ends up being 1-byte nop. */
> -	.p2align 4,, 2
> -L(last_4x_vec):
> -	movaps -(VEC_SIZE)(%rcx), %xmm1
> -	pcmpeqb %xmm0, %xmm1
> -	pmovmskb %xmm1, %eax
> -
> -	cmpl $(VEC_SIZE * 2), %edx
> -	jbe L(last_2x_vec)
> -
> -	testl %eax, %eax
> -	jnz L(ret_vec_x0)
> -
> -
> -	movaps -(VEC_SIZE * 2)(%rcx), %xmm1
> -	pcmpeqb %xmm0, %xmm1
> -	pmovmskb %xmm1, %eax
> -
> -	testl %eax, %eax
> -	jnz L(ret_vec_end)
> -
> -	movaps -(VEC_SIZE * 3)(%rcx), %xmm1
> -	pcmpeqb %xmm0, %xmm1
> -	pmovmskb %xmm1, %eax
> -
> -	subl $(VEC_SIZE * 3), %edx
> -	ja L(last_vec)
> -	bsrl %eax, %eax
> -	jz L(ret_4)
> -	addl %edx, %eax
> -	jl L(zero_3)
> -	addq %rdi, %rax
> -L(ret_4):
> -	ret
> -
> -	/* Ends up being 1-byte nop. */
> -	.p2align 4,, 3
> -L(loop_end):
> -	pmovmskb %xmm1, %eax
> -	sall $16, %eax
> -	jnz L(ret_vec_end)
> -
> -	pmovmskb %xmm2, %eax
> -	testl %eax, %eax
> -	jnz L(ret_vec_end)
> -
> -	pmovmskb %xmm3, %eax
> -	/* Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3)
> -	   then it won't affect the result in esi (VEC4). If ecx is non-zero
> -	   then CHAR in VEC3 and bsrq will use that position. */
> -	sall $16, %eax
> -	orl %esi, %eax
> -	bsrl %eax, %eax
> -	leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax
> -	ret
> -
> -L(ret_vec_end):
> -	bsrl %eax, %eax
> -	leaq (VEC_SIZE * -2)(%rax, %rcx), %rax
> -	ret
> -	/* Use in L(last_4x_vec). In the same cache line. This is just a spare
> -	   aligning bytes. */
> -L(zero_3):
> -	xorl %eax, %eax
> -	ret
> -	/* 2-bytes from next cache line. */
> -END(__memrchr)
> +#define MEMRCHR __memrchr
> +#include "multiarch/memrchr-sse2.S"
>  weak_alias (__memrchr, memrchr)
> diff --git a/sysdeps/x86_64/multiarch/memrchr-sse2.S b/sysdeps/x86_64/multiarch/memrchr-sse2.S
> index b04202e171..d92a4022dc 100644
> --- a/sysdeps/x86_64/multiarch/memrchr-sse2.S
> +++ b/sysdeps/x86_64/multiarch/memrchr-sse2.S
> @@ -17,10 +17,338 @@
>     <https://www.gnu.org/licenses/>. */
>
>  #if IS_IN (libc)
> -# define __memrchr __memrchr_sse2
> +# ifndef MEMRCHR
> +#  define MEMRCHR __memrchr_sse2
> +# endif
> +#endif
> +
> +#include <sysdep.h>
> +#define VEC_SIZE 16
> +#define PAGE_SIZE 4096
>
> -# undef weak_alias
> -# define weak_alias(__memrchr, memrchr)
> +	.text
> +ENTRY_P2ALIGN(MEMRCHR, 6)
> +#ifdef __ILP32__
> +	/* Clear upper bits. */
> +	mov %RDX_LP, %RDX_LP
>  #endif
> +	movd %esi, %xmm0
> +
> +	/* Get end pointer. */
> +	leaq (%rdx, %rdi), %rcx
> +
> +	punpcklbw %xmm0, %xmm0
> +	punpcklwd %xmm0, %xmm0
> +	pshufd $0, %xmm0, %xmm0
> +
> +	/* Check if we can load 1x VEC without cross a page. */
> +	testl $(PAGE_SIZE - VEC_SIZE), %ecx
> +	jz L(page_cross)
> +
> +	/* NB: This load happens regardless of whether rdx (len) is zero. Since
> +	   it doesn't cross a page and the standard gurantees any pointer have
> +	   at least one-valid byte this load must be safe. For the entire
> +	   history of the x86 memrchr implementation this has been possible so
> +	   no code "should" be relying on a zero-length check before this load.
> +	   The zero-length check is moved to the page cross case because it is
> +	   1) pretty cold and including it pushes the hot case len <= VEC_SIZE
> +	   into 2-cache lines. */
> +	movups -(VEC_SIZE)(%rcx), %xmm1
> +	pcmpeqb %xmm0, %xmm1
> +	pmovmskb %xmm1, %eax
> +
> +	subq $VEC_SIZE, %rdx
> +	ja L(more_1x_vec)
> +L(ret_vec_x0_test):
> +	/* Zero-flag set if eax (src) is zero. Destination unchanged if src is
> +	   zero. */
> +	bsrl %eax, %eax
> +	jz L(ret_0)
> +	/* Check if the CHAR match is in bounds. Need to truly zero `eax` here
> +	   if out of bounds. */
> +	addl %edx, %eax
> +	jl L(zero_0)
> +	/* Since we subtracted VEC_SIZE from rdx earlier we can just add to base
> +	   ptr. */
> +	addq %rdi, %rax
> +L(ret_0):
> +	ret
> +
> +	.p2align 4,, 5
> +L(ret_vec_x0):
> +	bsrl %eax, %eax
> +	leaq -(VEC_SIZE)(%rcx, %rax), %rax
> +	ret
> +
> +	.p2align 4,, 2
> +L(zero_0):
> +	xorl %eax, %eax
> +	ret
> +
> +
> +	.p2align 4,, 8
> +L(more_1x_vec):
> +	testl %eax, %eax
> +	jnz L(ret_vec_x0)
> +
> +	/* Align rcx (pointer to string). */
> +	decq %rcx
> +	andq $-VEC_SIZE, %rcx
> +
> +	movq %rcx, %rdx
> +	/* NB: We could consistenyl save 1-byte in this pattern with `movaps
> +	   %xmm0, %xmm1; pcmpeq IMM8(r), %xmm1; ...`. The reason against it is
> +	   it adds more frontend uops (even if the moves can be eliminated) and
> +	   some percentage of the time actual backend uops. */
> +	movaps -(VEC_SIZE)(%rcx), %xmm1
> +	pcmpeqb %xmm0, %xmm1
> +	subq %rdi, %rdx
> +	pmovmskb %xmm1, %eax
> +
> +	cmpq $(VEC_SIZE * 2), %rdx
> +	ja L(more_2x_vec)
> +L(last_2x_vec):
> +	subl $VEC_SIZE, %edx
> +	jbe L(ret_vec_x0_test)
> +
> +	testl %eax, %eax
> +	jnz L(ret_vec_x0)
> +
> +	movaps -(VEC_SIZE * 2)(%rcx), %xmm1
> +	pcmpeqb %xmm0, %xmm1
> +	pmovmskb %xmm1, %eax
> +
> +	subl $VEC_SIZE, %edx
> +	bsrl %eax, %eax
> +	jz L(ret_1)
> +	addl %edx, %eax
> +	jl L(zero_0)
> +	addq %rdi, %rax
> +L(ret_1):
> +	ret
> +
> +	/* Don't align. Otherwise lose 2-byte encoding in jump to L(page_cross)
> +	   causes the hot pause (length <= VEC_SIZE) to span multiple cache
> +	   lines. Naturally aligned % 16 to 8-bytes. */
> +L(page_cross):
> +	/* Zero length check. */
> +	testq %rdx, %rdx
> +	jz L(zero_0)
> +
> +	leaq -1(%rcx), %r8
> +	andq $-(VEC_SIZE), %r8
> +
> +	movaps (%r8), %xmm1
> +	pcmpeqb %xmm0, %xmm1
> +	pmovmskb %xmm1, %esi
> +	/* Shift out negative alignment (because we are starting from endptr and
> +	   working backwards). */
> +	negl %ecx
> +	/* 32-bit shift but VEC_SIZE=16 so need to mask the shift count
> +	   explicitly. */
> +	andl $(VEC_SIZE - 1), %ecx
> +	shl %cl, %esi
> +	movzwl %si, %eax
> +	leaq (%rdi, %rdx), %rcx
> +	cmpq %rdi, %r8
> +	ja L(more_1x_vec)
> +	subl $VEC_SIZE, %edx
> +	bsrl %eax, %eax
> +	jz L(ret_2)
> +	addl %edx, %eax
> +	jl L(zero_1)
> +	addq %rdi, %rax
> +L(ret_2):
> +	ret
> +
> +	/* Fits in aliging bytes. */
> +L(zero_1):
> +	xorl %eax, %eax
> +	ret
> +
> +	.p2align 4,, 5
> +L(ret_vec_x1):
> +	bsrl %eax, %eax
> +	leaq -(VEC_SIZE * 2)(%rcx, %rax), %rax
> +	ret
> +
> +	.p2align 4,, 8
> +L(more_2x_vec):
> +	testl %eax, %eax
> +	jnz L(ret_vec_x0)
> +
> +	movaps -(VEC_SIZE * 2)(%rcx), %xmm1
> +	pcmpeqb %xmm0, %xmm1
> +	pmovmskb %xmm1, %eax
> +	testl %eax, %eax
> +	jnz L(ret_vec_x1)
> +
> +
> +	movaps -(VEC_SIZE * 3)(%rcx), %xmm1
> +	pcmpeqb %xmm0, %xmm1
> +	pmovmskb %xmm1, %eax
> +
> +	subq $(VEC_SIZE * 4), %rdx
> +	ja L(more_4x_vec)
> +
> +	addl $(VEC_SIZE), %edx
> +	jle L(ret_vec_x2_test)
> +
> +L(last_vec):
> +	testl %eax, %eax
> +	jnz L(ret_vec_x2)
> +
> +	movaps -(VEC_SIZE * 4)(%rcx), %xmm1
> +	pcmpeqb %xmm0, %xmm1
> +	pmovmskb %xmm1, %eax
> +
> +	subl $(VEC_SIZE), %edx
> +	bsrl %eax, %eax
> +	jz L(ret_3)
> +	addl %edx, %eax
> +	jl L(zero_2)
> +	addq %rdi, %rax
> +L(ret_3):
> +	ret
> +
> +	.p2align 4,, 6
> +L(ret_vec_x2_test):
> +	bsrl %eax, %eax
> +	jz L(zero_2)
> +	addl %edx, %eax
> +	jl L(zero_2)
> +	addq %rdi, %rax
> +	ret
> +
> +L(zero_2):
> +	xorl %eax, %eax
> +	ret
> +
> +
> +	.p2align 4,, 5
> +L(ret_vec_x2):
> +	bsrl %eax, %eax
> +	leaq -(VEC_SIZE * 3)(%rcx, %rax), %rax
> +	ret
> +
> +	.p2align 4,, 5
> +L(ret_vec_x3):
> +	bsrl %eax, %eax
> +	leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax
> +	ret
> +
> +	.p2align 4,, 8
> +L(more_4x_vec):
> +	testl %eax, %eax
> +	jnz L(ret_vec_x2)
> +
> +	movaps -(VEC_SIZE * 4)(%rcx), %xmm1
> +	pcmpeqb %xmm0, %xmm1
> +	pmovmskb %xmm1, %eax
> +
> +	testl %eax, %eax
> +	jnz L(ret_vec_x3)
> +
> +	addq $-(VEC_SIZE * 4), %rcx
> +	cmpq $(VEC_SIZE * 4), %rdx
> +	jbe L(last_4x_vec)
> +
> +	/* Offset everything by 4x VEC_SIZE here to save a few bytes at the end
> +	   keeping the code from spilling to the next cache line. */
> +	addq $(VEC_SIZE * 4 - 1), %rcx
> +	andq $-(VEC_SIZE * 4), %rcx
> +	leaq (VEC_SIZE * 4)(%rdi), %rdx
> +	andq $-(VEC_SIZE * 4), %rdx
> +
> +	.p2align 4,, 11
> +L(loop_4x_vec):
> +	movaps (VEC_SIZE * -1)(%rcx), %xmm1
> +	movaps (VEC_SIZE * -2)(%rcx), %xmm2
> +	movaps (VEC_SIZE * -3)(%rcx), %xmm3
> +	movaps (VEC_SIZE * -4)(%rcx), %xmm4
> +	pcmpeqb %xmm0, %xmm1
> +	pcmpeqb %xmm0, %xmm2
> +	pcmpeqb %xmm0, %xmm3
> +	pcmpeqb %xmm0, %xmm4
> +
> +	por %xmm1, %xmm2
> +	por %xmm3, %xmm4
> +	por %xmm2, %xmm4
> +
> +	pmovmskb %xmm4, %esi
> +	testl %esi, %esi
> +	jnz L(loop_end)
> +
> +	addq $-(VEC_SIZE * 4), %rcx
> +	cmpq %rdx, %rcx
> +	jne L(loop_4x_vec)
> +
> +	subl %edi, %edx
> +
> +	/* Ends up being 1-byte nop. */
> +	.p2align 4,, 2
> +L(last_4x_vec):
> +	movaps -(VEC_SIZE)(%rcx), %xmm1
> +	pcmpeqb %xmm0, %xmm1
> +	pmovmskb %xmm1, %eax
> +
> +	cmpl $(VEC_SIZE * 2), %edx
> +	jbe L(last_2x_vec)
> +
> +	testl %eax, %eax
> +	jnz L(ret_vec_x0)
> +
> +
> +	movaps -(VEC_SIZE * 2)(%rcx), %xmm1
> +	pcmpeqb %xmm0, %xmm1
> +	pmovmskb %xmm1, %eax
> +
> +	testl %eax, %eax
> +	jnz L(ret_vec_end)
> +
> +	movaps -(VEC_SIZE * 3)(%rcx), %xmm1
> +	pcmpeqb %xmm0, %xmm1
> +	pmovmskb %xmm1, %eax
> +
> +	subl $(VEC_SIZE * 3), %edx
> +	ja L(last_vec)
> +	bsrl %eax, %eax
> +	jz L(ret_4)
> +	addl %edx, %eax
> +	jl L(zero_3)
> +	addq %rdi, %rax
> +L(ret_4):
> +	ret
> +
> +	/* Ends up being 1-byte nop. */
> +	.p2align 4,, 3
> +L(loop_end):
> +	pmovmskb %xmm1, %eax
> +	sall $16, %eax
> +	jnz L(ret_vec_end)
> +
> +	pmovmskb %xmm2, %eax
> +	testl %eax, %eax
> +	jnz L(ret_vec_end)
> +
> +	pmovmskb %xmm3, %eax
> +	/* Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3)
> +	   then it won't affect the result in esi (VEC4). If ecx is non-zero
> +	   then CHAR in VEC3 and bsrq will use that position. */
> +	sall $16, %eax
> +	orl %esi, %eax
> +	bsrl %eax, %eax
> +	leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax
> +	ret
>
> -#include "../memrchr.S"
> +L(ret_vec_end):
> +	bsrl %eax, %eax
> +	leaq (VEC_SIZE * -2)(%rax, %rcx), %rax
> +	ret
> +	/* Use in L(last_4x_vec). In the same cache line. This is just a spare
> +	   aligning bytes. */
> +L(zero_3):
> +	xorl %eax, %eax
> +	ret
> +	/* 2-bytes from next cache line. */
> +END(MEMRCHR)
> --
> 2.34.1
>

LGTM.

Thanks.

--
H.J.
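
For readers skimming the quoted patch: the core SSE2 idiom the assembly builds on is to broadcast the search byte into an XMM register (punpcklbw/punpcklwd/pshufd), compare 16 bytes at once with pcmpeqb, turn the compare result into a bitmask with pmovmskb, and pick the last match with bsr. The following is only an editor's illustrative sketch of that idea in C intrinsics, not glibc code and not part of the patch; the function name memrchr_sse2_sketch is hypothetical, and the sketch deliberately omits the alignment, page-cross, and short-length handling that the hand-written assembly above exists to optimize.

/* Simplified sketch of the SSE2 memrchr technique: scan 16-byte chunks
   from the end of the buffer, use a compare bitmask to locate the last
   matching byte, and fall back to a plain byte loop for the tail.
   Requires SSE2 and a GCC/Clang-compatible compiler (__builtin_clz).  */

#include <emmintrin.h>   /* SSE2 intrinsics */
#include <stddef.h>
#include <stdio.h>

static void *
memrchr_sse2_sketch (const void *s, int c, size_t n)
{
  const unsigned char *base = s;
  /* Broadcast the search byte to all 16 lanes (the assembly does this
     with punpcklbw/punpcklwd/pshufd).  */
  const __m128i vc = _mm_set1_epi8 ((char) c);

  /* Whole 16-byte chunks, highest addresses first.  */
  while (n >= 16)
    {
      n -= 16;
      __m128i chunk = _mm_loadu_si128 ((const __m128i *) (base + n));
      /* pcmpeqb + pmovmskb: bit i of MASK is set iff base[n + i] == c.  */
      unsigned int mask = _mm_movemask_epi8 (_mm_cmpeq_epi8 (chunk, vc));
      if (mask != 0)
	{
	  /* Highest set bit == last match in this chunk (what bsr does).  */
	  int last = 31 - __builtin_clz (mask);
	  return (void *) (base + n + last);
	}
    }

  /* Remaining tail bytes (lowest addresses), checked from the top down.  */
  while (n-- > 0)
    if (base[n] == (unsigned char) c)
      return (void *) (base + n);
  return NULL;
}

int
main (void)
{
  const char buf[] = "abcXdefXghi";
  char *p = memrchr_sse2_sketch (buf, 'X', sizeof buf - 1);
  printf ("last 'X' at offset %td\n", p - buf);   /* prints 7 */
  return 0;
}

The real implementation quoted above gets its speed from details the sketch skips: it loads the final vector unconditionally (safe because the load never crosses a page), aligns the working pointer so the 4x-vector main loop can use movaps, and folds the zero-length check into the cold page-cross path.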