From: "H.J. Lu"
Date: Tue, 12 Jul 2022 15:58:42 -0700
Subject: Re: [PATCH v1] x86: Move memrchr SSE2 implementation to multiarch/memrchr-sse2.S
To: Noah Goldstein
Cc: GNU C Library, "Carlos O'Donell"
In-Reply-To: <20220712192910.351121-3-goldstein.w.n@gmail.com>
References: <20220712192910.351121-1-goldstein.w.n@gmail.com> <20220712192910.351121-3-goldstein.w.n@gmail.com>

On Tue, Jul 12, 2022 at 12:29 PM Noah Goldstein wrote:
>
> This commit doesn't affect libc.so.6, its just housekeeping to prepare
> for adding explicit ISA level support.
>
> Tested build on x86_64 and x86_32 with/without multiarch.
> ---
>  sysdeps/x86_64/memrchr.S                | 332 +----------------------
>  sysdeps/x86_64/multiarch/memrchr-sse2.S | 336 +++++++++++++++++++++++-
>  2 files changed, 334 insertions(+), 334 deletions(-)
>
> diff --git a/sysdeps/x86_64/memrchr.S b/sysdeps/x86_64/memrchr.S
> index b0dffd2ae2..385e2c5668 100644
> --- a/sysdeps/x86_64/memrchr.S
> +++ b/sysdeps/x86_64/memrchr.S
> @@ -17,334 +17,6 @@
>     License along with the GNU C Library; if not, see
>     <https://www.gnu.org/licenses/>. */
>
> -#include <sysdep.h>
> -#define VEC_SIZE 16
> -#define PAGE_SIZE 4096
> -
> -	.text
> -ENTRY_P2ALIGN(__memrchr, 6)
> -#ifdef __ILP32__
> -	/* Clear upper bits. */
> -	mov %RDX_LP, %RDX_LP
> -#endif
> -	movd %esi, %xmm0
> -
> -	/* Get end pointer. */
> -	leaq (%rdx, %rdi), %rcx
> -
> -	punpcklbw %xmm0, %xmm0
> -	punpcklwd %xmm0, %xmm0
> -	pshufd $0, %xmm0, %xmm0
> -
> -	/* Check if we can load 1x VEC without cross a page. */
> -	testl $(PAGE_SIZE - VEC_SIZE), %ecx
> -	jz L(page_cross)
> -
> -	/* NB: This load happens regardless of whether rdx (len) is zero. Since
> -	   it doesn't cross a page and the standard gurantees any pointer have
> -	   at least one-valid byte this load must be safe. For the entire
> -	   history of the x86 memrchr implementation this has been possible so
> -	   no code "should" be relying on a zero-length check before this load.
> -	   The zero-length check is moved to the page cross case because it is
> -	   1) pretty cold and including it pushes the hot case len <= VEC_SIZE
> -	   into 2-cache lines. */
> -	movups -(VEC_SIZE)(%rcx), %xmm1
> -	pcmpeqb %xmm0, %xmm1
> -	pmovmskb %xmm1, %eax
> -
> -	subq $VEC_SIZE, %rdx
> -	ja L(more_1x_vec)
> -L(ret_vec_x0_test):
> -	/* Zero-flag set if eax (src) is zero. Destination unchanged if src is
> -	   zero. */
> -	bsrl %eax, %eax
> -	jz L(ret_0)
> -	/* Check if the CHAR match is in bounds. Need to truly zero `eax` here
> -	   if out of bounds. */
> -	addl %edx, %eax
> -	jl L(zero_0)
> -	/* Since we subtracted VEC_SIZE from rdx earlier we can just add to base
> -	   ptr. */
> -	addq %rdi, %rax
> -L(ret_0):
> -	ret
> -
> -	.p2align 4,, 5
> -L(ret_vec_x0):
> -	bsrl %eax, %eax
> -	leaq -(VEC_SIZE)(%rcx, %rax), %rax
> -	ret
> -
> -	.p2align 4,, 2
> -L(zero_0):
> -	xorl %eax, %eax
> -	ret
> -
> -
> -	.p2align 4,, 8
> -L(more_1x_vec):
> -	testl %eax, %eax
> -	jnz L(ret_vec_x0)
> -
> -	/* Align rcx (pointer to string). */
> -	decq %rcx
> -	andq $-VEC_SIZE, %rcx
> -
> -	movq %rcx, %rdx
> -	/* NB: We could consistenyl save 1-byte in this pattern with `movaps
> -	   %xmm0, %xmm1; pcmpeq IMM8(r), %xmm1; ...`. The reason against it is
> -	   it adds more frontend uops (even if the moves can be eliminated) and
> -	   some percentage of the time actual backend uops. */
> -	movaps -(VEC_SIZE)(%rcx), %xmm1
> -	pcmpeqb %xmm0, %xmm1
> -	subq %rdi, %rdx
> -	pmovmskb %xmm1, %eax
> -
> -	cmpq $(VEC_SIZE * 2), %rdx
> -	ja L(more_2x_vec)
> -L(last_2x_vec):
> -	subl $VEC_SIZE, %edx
> -	jbe L(ret_vec_x0_test)
> -
> -	testl %eax, %eax
> -	jnz L(ret_vec_x0)
> -
> -	movaps -(VEC_SIZE * 2)(%rcx), %xmm1
> -	pcmpeqb %xmm0, %xmm1
> -	pmovmskb %xmm1, %eax
> -
> -	subl $VEC_SIZE, %edx
> -	bsrl %eax, %eax
> -	jz L(ret_1)
> -	addl %edx, %eax
> -	jl L(zero_0)
> -	addq %rdi, %rax
> -L(ret_1):
> -	ret
> -
> -	/* Don't align. Otherwise lose 2-byte encoding in jump to L(page_cross)
> -	   causes the hot pause (length <= VEC_SIZE) to span multiple cache
> -	   lines. Naturally aligned % 16 to 8-bytes. */
> -L(page_cross):
> -	/* Zero length check. */
> -	testq %rdx, %rdx
> -	jz L(zero_0)
> -
> -	leaq -1(%rcx), %r8
> -	andq $-(VEC_SIZE), %r8
> -
> -	movaps (%r8), %xmm1
> -	pcmpeqb %xmm0, %xmm1
> -	pmovmskb %xmm1, %esi
> -	/* Shift out negative alignment (because we are starting from endptr and
> -	   working backwards). */
> -	negl %ecx
> -	/* 32-bit shift but VEC_SIZE=16 so need to mask the shift count
> -	   explicitly. */
> -	andl $(VEC_SIZE - 1), %ecx
> -	shl %cl, %esi
> -	movzwl %si, %eax
> -	leaq (%rdi, %rdx), %rcx
> -	cmpq %rdi, %r8
> -	ja L(more_1x_vec)
> -	subl $VEC_SIZE, %edx
> -	bsrl %eax, %eax
> -	jz L(ret_2)
> -	addl %edx, %eax
> -	jl L(zero_1)
> -	addq %rdi, %rax
> -L(ret_2):
> -	ret
> -
> -	/* Fits in aliging bytes. */
> -L(zero_1):
> -	xorl %eax, %eax
> -	ret
> -
> -	.p2align 4,, 5
> -L(ret_vec_x1):
> -	bsrl %eax, %eax
> -	leaq -(VEC_SIZE * 2)(%rcx, %rax), %rax
> -	ret
> -
> -	.p2align 4,, 8
> -L(more_2x_vec):
> -	testl %eax, %eax
> -	jnz L(ret_vec_x0)
> -
> -	movaps -(VEC_SIZE * 2)(%rcx), %xmm1
> -	pcmpeqb %xmm0, %xmm1
> -	pmovmskb %xmm1, %eax
> -	testl %eax, %eax
> -	jnz L(ret_vec_x1)
> -
> -
> -	movaps -(VEC_SIZE * 3)(%rcx), %xmm1
> -	pcmpeqb %xmm0, %xmm1
> -	pmovmskb %xmm1, %eax
> -
> -	subq $(VEC_SIZE * 4), %rdx
> -	ja L(more_4x_vec)
> -
> -	addl $(VEC_SIZE), %edx
> -	jle L(ret_vec_x2_test)
> -
> -L(last_vec):
> -	testl %eax, %eax
> -	jnz L(ret_vec_x2)
> -
> -	movaps -(VEC_SIZE * 4)(%rcx), %xmm1
> -	pcmpeqb %xmm0, %xmm1
> -	pmovmskb %xmm1, %eax
> -
> -	subl $(VEC_SIZE), %edx
> -	bsrl %eax, %eax
> -	jz L(ret_3)
> -	addl %edx, %eax
> -	jl L(zero_2)
> -	addq %rdi, %rax
> -L(ret_3):
> -	ret
> -
> -	.p2align 4,, 6
> -L(ret_vec_x2_test):
> -	bsrl %eax, %eax
> -	jz L(zero_2)
> -	addl %edx, %eax
> -	jl L(zero_2)
> -	addq %rdi, %rax
> -	ret
> -
> -L(zero_2):
> -	xorl %eax, %eax
> -	ret
> -
> -
> -	.p2align 4,, 5
> -L(ret_vec_x2):
> -	bsrl %eax, %eax
> -	leaq -(VEC_SIZE * 3)(%rcx, %rax), %rax
> -	ret
> -
> -	.p2align 4,, 5
> -L(ret_vec_x3):
> -	bsrl %eax, %eax
> -	leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax
> -	ret
> -
> -	.p2align 4,, 8
> -L(more_4x_vec):
> -	testl %eax, %eax
> -	jnz L(ret_vec_x2)
> -
> -	movaps -(VEC_SIZE * 4)(%rcx), %xmm1
> -	pcmpeqb %xmm0, %xmm1
> -	pmovmskb %xmm1, %eax
> -
> -	testl %eax, %eax
> -	jnz L(ret_vec_x3)
> -
> -	addq $-(VEC_SIZE * 4), %rcx
> -	cmpq $(VEC_SIZE * 4), %rdx
> -	jbe L(last_4x_vec)
> -
> -	/* Offset everything by 4x VEC_SIZE here to save a few bytes at the end
> -	   keeping the code from spilling to the next cache line. */
> -	addq $(VEC_SIZE * 4 - 1), %rcx
> -	andq $-(VEC_SIZE * 4), %rcx
> -	leaq (VEC_SIZE * 4)(%rdi), %rdx
> -	andq $-(VEC_SIZE * 4), %rdx
> -
> -	.p2align 4,, 11
> -L(loop_4x_vec):
> -	movaps (VEC_SIZE * -1)(%rcx), %xmm1
> -	movaps (VEC_SIZE * -2)(%rcx), %xmm2
> -	movaps (VEC_SIZE * -3)(%rcx), %xmm3
> -	movaps (VEC_SIZE * -4)(%rcx), %xmm4
> -	pcmpeqb %xmm0, %xmm1
> -	pcmpeqb %xmm0, %xmm2
> -	pcmpeqb %xmm0, %xmm3
> -	pcmpeqb %xmm0, %xmm4
> -
> -	por %xmm1, %xmm2
> -	por %xmm3, %xmm4
> -	por %xmm2, %xmm4
> -
> -	pmovmskb %xmm4, %esi
> -	testl %esi, %esi
> -	jnz L(loop_end)
> -
> -	addq $-(VEC_SIZE * 4), %rcx
> -	cmpq %rdx, %rcx
> -	jne L(loop_4x_vec)
> -
> -	subl %edi, %edx
> -
> -	/* Ends up being 1-byte nop. */
> -	.p2align 4,, 2
> -L(last_4x_vec):
> -	movaps -(VEC_SIZE)(%rcx), %xmm1
> -	pcmpeqb %xmm0, %xmm1
> -	pmovmskb %xmm1, %eax
> -
> -	cmpl $(VEC_SIZE * 2), %edx
> -	jbe L(last_2x_vec)
> -
> -	testl %eax, %eax
> -	jnz L(ret_vec_x0)
> -
> -
> -	movaps -(VEC_SIZE * 2)(%rcx), %xmm1
> -	pcmpeqb %xmm0, %xmm1
> -	pmovmskb %xmm1, %eax
> -
> -	testl %eax, %eax
> -	jnz L(ret_vec_end)
> -
> -	movaps -(VEC_SIZE * 3)(%rcx), %xmm1
> -	pcmpeqb %xmm0, %xmm1
> -	pmovmskb %xmm1, %eax
> -
> -	subl $(VEC_SIZE * 3), %edx
> -	ja L(last_vec)
> -	bsrl %eax, %eax
> -	jz L(ret_4)
> -	addl %edx, %eax
> -	jl L(zero_3)
> -	addq %rdi, %rax
> -L(ret_4):
> -	ret
> -
> -	/* Ends up being 1-byte nop. */
> -	.p2align 4,, 3
> -L(loop_end):
> -	pmovmskb %xmm1, %eax
> -	sall $16, %eax
> -	jnz L(ret_vec_end)
> -
> -	pmovmskb %xmm2, %eax
> -	testl %eax, %eax
> -	jnz L(ret_vec_end)
> -
> -	pmovmskb %xmm3, %eax
> -	/* Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3)
> -	   then it won't affect the result in esi (VEC4). If ecx is non-zero
> -	   then CHAR in VEC3 and bsrq will use that position. */
> -	sall $16, %eax
> -	orl %esi, %eax
> -	bsrl %eax, %eax
> -	leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax
> -	ret
> -
> -L(ret_vec_end):
> -	bsrl %eax, %eax
> -	leaq (VEC_SIZE * -2)(%rax, %rcx), %rax
> -	ret
> -	/* Use in L(last_4x_vec). In the same cache line. This is just a spare
> -	   aligning bytes. */
> -L(zero_3):
> -	xorl %eax, %eax
> -	ret
> -	/* 2-bytes from next cache line. */
> -END(__memrchr)
> +#define MEMRCHR __memrchr
> +#include "multiarch/memrchr-sse2.S"
>  weak_alias (__memrchr, memrchr)
> diff --git a/sysdeps/x86_64/multiarch/memrchr-sse2.S b/sysdeps/x86_64/multiarch/memrchr-sse2.S
> index b04202e171..d92a4022dc 100644
> --- a/sysdeps/x86_64/multiarch/memrchr-sse2.S
> +++ b/sysdeps/x86_64/multiarch/memrchr-sse2.S
> @@ -17,10 +17,338 @@
>     <https://www.gnu.org/licenses/>. */
>
>  #if IS_IN (libc)
> -# define __memrchr __memrchr_sse2
> +# ifndef MEMRCHR
> +#  define MEMRCHR __memrchr_sse2
> +# endif
> +#endif
> +
> +#include <sysdep.h>
> +#define VEC_SIZE 16
> +#define PAGE_SIZE 4096
>
> -# undef weak_alias
> -# define weak_alias(__memrchr, memrchr)
> +	.text
> +ENTRY_P2ALIGN(MEMRCHR, 6)
> +#ifdef __ILP32__
> +	/* Clear upper bits. */
> +	mov %RDX_LP, %RDX_LP
>  #endif
> +	movd %esi, %xmm0
> +
> +	/* Get end pointer. */
> +	leaq (%rdx, %rdi), %rcx
> +
> +	punpcklbw %xmm0, %xmm0
> +	punpcklwd %xmm0, %xmm0
> +	pshufd $0, %xmm0, %xmm0
> +
> +	/* Check if we can load 1x VEC without cross a page. */
> +	testl $(PAGE_SIZE - VEC_SIZE), %ecx
> +	jz L(page_cross)
> +
> +	/* NB: This load happens regardless of whether rdx (len) is zero. Since
> +	   it doesn't cross a page and the standard gurantees any pointer have
> +	   at least one-valid byte this load must be safe. For the entire
> +	   history of the x86 memrchr implementation this has been possible so
> +	   no code "should" be relying on a zero-length check before this load.
> +	   The zero-length check is moved to the page cross case because it is
> +	   1) pretty cold and including it pushes the hot case len <= VEC_SIZE
> +	   into 2-cache lines. */
> +	movups -(VEC_SIZE)(%rcx), %xmm1
> +	pcmpeqb %xmm0, %xmm1
> +	pmovmskb %xmm1, %eax
> +
> +	subq $VEC_SIZE, %rdx
> +	ja L(more_1x_vec)
> +L(ret_vec_x0_test):
> +	/* Zero-flag set if eax (src) is zero. Destination unchanged if src is
> +	   zero. */
> +	bsrl %eax, %eax
> +	jz L(ret_0)
> +	/* Check if the CHAR match is in bounds. Need to truly zero `eax` here
> +	   if out of bounds. */
> +	addl %edx, %eax
> +	jl L(zero_0)
> +	/* Since we subtracted VEC_SIZE from rdx earlier we can just add to base
> +	   ptr. */
> +	addq %rdi, %rax
> +L(ret_0):
> +	ret
> +
> +	.p2align 4,, 5
> +L(ret_vec_x0):
> +	bsrl %eax, %eax
> +	leaq -(VEC_SIZE)(%rcx, %rax), %rax
> +	ret
> +
> +	.p2align 4,, 2
> +L(zero_0):
> +	xorl %eax, %eax
> +	ret
> +
> +
> +	.p2align 4,, 8
> +L(more_1x_vec):
> +	testl %eax, %eax
> +	jnz L(ret_vec_x0)
> +
> +	/* Align rcx (pointer to string). */
> +	decq %rcx
> +	andq $-VEC_SIZE, %rcx
> +
> +	movq %rcx, %rdx
> +	/* NB: We could consistenyl save 1-byte in this pattern with `movaps
> +	   %xmm0, %xmm1; pcmpeq IMM8(r), %xmm1; ...`. The reason against it is
> +	   it adds more frontend uops (even if the moves can be eliminated) and
> +	   some percentage of the time actual backend uops. */
> +	movaps -(VEC_SIZE)(%rcx), %xmm1
> +	pcmpeqb %xmm0, %xmm1
> +	subq %rdi, %rdx
> +	pmovmskb %xmm1, %eax
> +
> +	cmpq $(VEC_SIZE * 2), %rdx
> +	ja L(more_2x_vec)
> +L(last_2x_vec):
> +	subl $VEC_SIZE, %edx
> +	jbe L(ret_vec_x0_test)
> +
> +	testl %eax, %eax
> +	jnz L(ret_vec_x0)
> +
> +	movaps -(VEC_SIZE * 2)(%rcx), %xmm1
> +	pcmpeqb %xmm0, %xmm1
> +	pmovmskb %xmm1, %eax
> +
> +	subl $VEC_SIZE, %edx
> +	bsrl %eax, %eax
> +	jz L(ret_1)
> +	addl %edx, %eax
> +	jl L(zero_0)
> +	addq %rdi, %rax
> +L(ret_1):
> +	ret
> +
> +	/* Don't align. Otherwise lose 2-byte encoding in jump to L(page_cross)
> +	   causes the hot pause (length <= VEC_SIZE) to span multiple cache
> +	   lines. Naturally aligned % 16 to 8-bytes. */
> +L(page_cross):
> +	/* Zero length check. */
> +	testq %rdx, %rdx
> +	jz L(zero_0)
> +
> +	leaq -1(%rcx), %r8
> +	andq $-(VEC_SIZE), %r8
> +
> +	movaps (%r8), %xmm1
> +	pcmpeqb %xmm0, %xmm1
> +	pmovmskb %xmm1, %esi
> +	/* Shift out negative alignment (because we are starting from endptr and
> +	   working backwards). */
> +	negl %ecx
> +	/* 32-bit shift but VEC_SIZE=16 so need to mask the shift count
> +	   explicitly. */
> +	andl $(VEC_SIZE - 1), %ecx
> +	shl %cl, %esi
> +	movzwl %si, %eax
> +	leaq (%rdi, %rdx), %rcx
> +	cmpq %rdi, %r8
> +	ja L(more_1x_vec)
> +	subl $VEC_SIZE, %edx
> +	bsrl %eax, %eax
> +	jz L(ret_2)
> +	addl %edx, %eax
> +	jl L(zero_1)
> +	addq %rdi, %rax
> +L(ret_2):
> +	ret
> +
> +	/* Fits in aliging bytes. */
> +L(zero_1):
> +	xorl %eax, %eax
> +	ret
> +
> +	.p2align 4,, 5
> +L(ret_vec_x1):
> +	bsrl %eax, %eax
> +	leaq -(VEC_SIZE * 2)(%rcx, %rax), %rax
> +	ret
> +
> +	.p2align 4,, 8
> +L(more_2x_vec):
> +	testl %eax, %eax
> +	jnz L(ret_vec_x0)
> +
> +	movaps -(VEC_SIZE * 2)(%rcx), %xmm1
> +	pcmpeqb %xmm0, %xmm1
> +	pmovmskb %xmm1, %eax
> +	testl %eax, %eax
> +	jnz L(ret_vec_x1)
> +
> +
> +	movaps -(VEC_SIZE * 3)(%rcx), %xmm1
> +	pcmpeqb %xmm0, %xmm1
> +	pmovmskb %xmm1, %eax
> +
> +	subq $(VEC_SIZE * 4), %rdx
> +	ja L(more_4x_vec)
> +
> +	addl $(VEC_SIZE), %edx
> +	jle L(ret_vec_x2_test)
> +
> +L(last_vec):
> +	testl %eax, %eax
> +	jnz L(ret_vec_x2)
> +
> +	movaps -(VEC_SIZE * 4)(%rcx), %xmm1
> +	pcmpeqb %xmm0, %xmm1
> +	pmovmskb %xmm1, %eax
> +
> +	subl $(VEC_SIZE), %edx
> +	bsrl %eax, %eax
> +	jz L(ret_3)
> +	addl %edx, %eax
> +	jl L(zero_2)
> +	addq %rdi, %rax
> +L(ret_3):
> +	ret
> +
> +	.p2align 4,, 6
> +L(ret_vec_x2_test):
> +	bsrl %eax, %eax
> +	jz L(zero_2)
> +	addl %edx, %eax
> +	jl L(zero_2)
> +	addq %rdi, %rax
> +	ret
> +
> +L(zero_2):
> +	xorl %eax, %eax
> +	ret
> +
> +
> +	.p2align 4,, 5
> +L(ret_vec_x2):
> +	bsrl %eax, %eax
> +	leaq -(VEC_SIZE * 3)(%rcx, %rax), %rax
> +	ret
> +
> +	.p2align 4,, 5
> +L(ret_vec_x3):
> +	bsrl %eax, %eax
> +	leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax
> +	ret
> +
> +	.p2align 4,, 8
> +L(more_4x_vec):
> +	testl %eax, %eax
> +	jnz L(ret_vec_x2)
> +
> +	movaps -(VEC_SIZE * 4)(%rcx), %xmm1
> +	pcmpeqb %xmm0, %xmm1
> +	pmovmskb %xmm1, %eax
> +
> +	testl %eax, %eax
> +	jnz L(ret_vec_x3)
> +
> +	addq $-(VEC_SIZE * 4), %rcx
> +	cmpq $(VEC_SIZE * 4), %rdx
> +	jbe L(last_4x_vec)
> +
> +	/* Offset everything by 4x VEC_SIZE here to save a few bytes at the end
> +	   keeping the code from spilling to the next cache line. */
> +	addq $(VEC_SIZE * 4 - 1), %rcx
> +	andq $-(VEC_SIZE * 4), %rcx
> +	leaq (VEC_SIZE * 4)(%rdi), %rdx
> +	andq $-(VEC_SIZE * 4), %rdx
> +
> +	.p2align 4,, 11
> +L(loop_4x_vec):
> +	movaps (VEC_SIZE * -1)(%rcx), %xmm1
> +	movaps (VEC_SIZE * -2)(%rcx), %xmm2
> +	movaps (VEC_SIZE * -3)(%rcx), %xmm3
> +	movaps (VEC_SIZE * -4)(%rcx), %xmm4
> +	pcmpeqb %xmm0, %xmm1
> +	pcmpeqb %xmm0, %xmm2
> +	pcmpeqb %xmm0, %xmm3
> +	pcmpeqb %xmm0, %xmm4
> +
> +	por %xmm1, %xmm2
> +	por %xmm3, %xmm4
> +	por %xmm2, %xmm4
> +
> +	pmovmskb %xmm4, %esi
> +	testl %esi, %esi
> +	jnz L(loop_end)
> +
> +	addq $-(VEC_SIZE * 4), %rcx
> +	cmpq %rdx, %rcx
> +	jne L(loop_4x_vec)
> +
> +	subl %edi, %edx
> +
> +	/* Ends up being 1-byte nop. */
> +	.p2align 4,, 2
> +L(last_4x_vec):
> +	movaps -(VEC_SIZE)(%rcx), %xmm1
> +	pcmpeqb %xmm0, %xmm1
> +	pmovmskb %xmm1, %eax
> +
> +	cmpl $(VEC_SIZE * 2), %edx
> +	jbe L(last_2x_vec)
> +
> +	testl %eax, %eax
> +	jnz L(ret_vec_x0)
> +
> +
> +	movaps -(VEC_SIZE * 2)(%rcx), %xmm1
> +	pcmpeqb %xmm0, %xmm1
> +	pmovmskb %xmm1, %eax
> +
> +	testl %eax, %eax
> +	jnz L(ret_vec_end)
> +
> +	movaps -(VEC_SIZE * 3)(%rcx), %xmm1
> +	pcmpeqb %xmm0, %xmm1
> +	pmovmskb %xmm1, %eax
> +
> +	subl $(VEC_SIZE * 3), %edx
> +	ja L(last_vec)
> +	bsrl %eax, %eax
> +	jz L(ret_4)
> +	addl %edx, %eax
> +	jl L(zero_3)
> +	addq %rdi, %rax
> +L(ret_4):
> +	ret
> +
> +	/* Ends up being 1-byte nop. */
> +	.p2align 4,, 3
> +L(loop_end):
> +	pmovmskb %xmm1, %eax
> +	sall $16, %eax
> +	jnz L(ret_vec_end)
> +
> +	pmovmskb %xmm2, %eax
> +	testl %eax, %eax
> +	jnz L(ret_vec_end)
> +
> +	pmovmskb %xmm3, %eax
> +	/* Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3)
> +	   then it won't affect the result in esi (VEC4). If ecx is non-zero
> +	   then CHAR in VEC3 and bsrq will use that position. */
> +	sall $16, %eax
> +	orl %esi, %eax
> +	bsrl %eax, %eax
> +	leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax
> +	ret
>
> -#include "../memrchr.S"
> +L(ret_vec_end):
> +	bsrl %eax, %eax
> +	leaq (VEC_SIZE * -2)(%rax, %rcx), %rax
> +	ret
> +	/* Use in L(last_4x_vec). In the same cache line. This is just a spare
> +	   aligning bytes. */
> +L(zero_3):
> +	xorl %eax, %eax
> +	ret
> +	/* 2-bytes from next cache line. */
> +END(MEMRCHR)
> --
> 2.34.1
>

LGTM.

Thanks.

--
H.J.
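
For readers skimming the quoted patch: the core SSE2 idiom the assembly builds on is to broadcast the search byte into an XMM register (punpcklbw/punpcklwd/pshufd), compare 16 bytes at once with pcmpeqb, turn the compare result into a bitmask with pmovmskb, and pick the last match with bsr. The following is only an editor's illustrative sketch of that idea in C intrinsics, not glibc code and not part of the patch; the function name memrchr_sse2_sketch is hypothetical, and the sketch deliberately omits the alignment, page-cross, and short-length handling that the hand-written assembly above exists to optimize.

/* Simplified sketch of the SSE2 memrchr technique: scan 16-byte chunks
   from the end of the buffer, use a compare bitmask to locate the last
   matching byte, and fall back to a plain byte loop for the tail.
   Requires SSE2 and a GCC/Clang-compatible compiler (__builtin_clz).  */

#include <emmintrin.h>   /* SSE2 intrinsics */
#include <stddef.h>
#include <stdio.h>

static void *
memrchr_sse2_sketch (const void *s, int c, size_t n)
{
  const unsigned char *base = s;
  /* Broadcast the search byte to all 16 lanes (the assembly does this
     with punpcklbw/punpcklwd/pshufd).  */
  const __m128i vc = _mm_set1_epi8 ((char) c);

  /* Whole 16-byte chunks, highest addresses first.  */
  while (n >= 16)
    {
      n -= 16;
      __m128i chunk = _mm_loadu_si128 ((const __m128i *) (base + n));
      /* pcmpeqb + pmovmskb: bit i of MASK is set iff base[n + i] == c.  */
      unsigned int mask = _mm_movemask_epi8 (_mm_cmpeq_epi8 (chunk, vc));
      if (mask != 0)
	{
	  /* Highest set bit == last match in this chunk (what bsr does).  */
	  int last = 31 - __builtin_clz (mask);
	  return (void *) (base + n + last);
	}
    }

  /* Remaining tail bytes (lowest addresses), checked from the top down.  */
  while (n-- > 0)
    if (base[n] == (unsigned char) c)
      return (void *) (base + n);
  return NULL;
}

int
main (void)
{
  const char buf[] = "abcXdefXghi";
  char *p = memrchr_sse2_sketch (buf, 'X', sizeof buf - 1);
  printf ("last 'X' at offset %td\n", p - buf);   /* prints 7 */
  return 0;
}

The real implementation quoted above gets its speed from details the sketch skips: it loads the final vector unconditionally (safe because the load never crosses a page), aligns the working pointer so the 4x-vector main loop can use movaps, and folds the zero-length check into the cold page-cross path.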