From mboxrd@z Thu Jan 1 00:00:00 1970
From: "H.J. Lu"
Date: Tue, 12 Jul 2022 15:28:53 -0700
Subject: Re: [PATCH v1] x86: Move strrchr SSE2 implementation to multiarch/strrchr-sse2.S
To: Noah Goldstein <goldstein.w.n@gmail.com>
Cc: GNU C Library <libc-alpha@sourceware.org>, "Carlos O'Donell"
In-Reply-To: <20220712192910.351121-4-goldstein.w.n@gmail.com>
References: <20220712192910.351121-1-goldstein.w.n@gmail.com>
 <20220712192910.351121-4-goldstein.w.n@gmail.com>
MIME-Version: 1.0
Content-Type: text/plain; charset="UTF-8"

On Tue, Jul 12, 2022 at 12:29 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> This commit doesn't affect libc.so.6; it's just housekeeping to prepare
> for adding explicit ISA level support.
>
> Tested build on x86_64 and x86_32 with/without multiarch.
> ---
>  sysdeps/x86_64/multiarch/strrchr-sse2.S | 358 ++++++++++++++++++++++-
>  sysdeps/x86_64/multiarch/wcsrchr-sse2.S |  10 +-
>  sysdeps/x86_64/strrchr.S                | 364 +-----------------------
>  sysdeps/x86_64/wcsrchr.S                |  11 +-
>  4 files changed, 366 insertions(+), 377 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/strrchr-sse2.S b/sysdeps/x86_64/multiarch/strrchr-sse2.S
> index 866396e947..6ee7a5e33a 100644
> --- a/sysdeps/x86_64/multiarch/strrchr-sse2.S
> +++ b/sysdeps/x86_64/multiarch/strrchr-sse2.S
> @@ -17,12 +17,358 @@
>    .
*/ > > #if IS_IN (libc) > -# define STRRCHR __strrchr_sse2 > +# ifndef STRRCHR > +# define STRRCHR __strrchr_sse2 > +# endif > +#endif > + > +#include > + > +#ifdef USE_AS_WCSRCHR > +# define PCMPEQ pcmpeqd > +# define CHAR_SIZE 4 > +# define PMINU pminud > +#else > +# define PCMPEQ pcmpeqb > +# define CHAR_SIZE 1 > +# define PMINU pminub > +#endif > + > +#define PAGE_SIZE 4096 > +#define VEC_SIZE 16 > + > + .text > +ENTRY(STRRCHR) > + movd %esi, %xmm0 > + movq %rdi, %rax > + andl $(PAGE_SIZE - 1), %eax > +#ifndef USE_AS_WCSRCHR > + punpcklbw %xmm0, %xmm0 > + punpcklwd %xmm0, %xmm0 > +#endif > + pshufd $0, %xmm0, %xmm0 > + cmpl $(PAGE_SIZE - VEC_SIZE), %eax > + ja L(cross_page) > + > +L(cross_page_continue): > + movups (%rdi), %xmm1 > + pxor %xmm2, %xmm2 > + PCMPEQ %xmm1, %xmm2 > + pmovmskb %xmm2, %ecx > + testl %ecx, %ecx > + jz L(aligned_more) > + > + PCMPEQ %xmm0, %xmm1 > + pmovmskb %xmm1, %eax > + leal -1(%rcx), %edx > + xorl %edx, %ecx > + andl %ecx, %eax > + jz L(ret0) > + bsrl %eax, %eax > + addq %rdi, %rax > + /* We are off by 3 for wcsrchr if search CHAR is non-zero. If > + search CHAR is zero we are correct. Either way `andq > + -CHAR_SIZE, %rax` gets the correct result. */ > +#ifdef USE_AS_WCSRCHR > + andq $-CHAR_SIZE, %rax > +#endif > +L(ret0): > + ret > + > + /* Returns for first vec x1/x2 have hard coded backward search > + path for earlier matches. */ > + .p2align 4 > +L(first_vec_x0_test): > + PCMPEQ %xmm0, %xmm1 > + pmovmskb %xmm1, %eax > + testl %eax, %eax > + jz L(ret0) > + bsrl %eax, %eax > + addq %r8, %rax > +#ifdef USE_AS_WCSRCHR > + andq $-CHAR_SIZE, %rax > +#endif > + ret > + > + .p2align 4 > +L(first_vec_x1): > + PCMPEQ %xmm0, %xmm2 > + pmovmskb %xmm2, %eax > + leal -1(%rcx), %edx > + xorl %edx, %ecx > + andl %ecx, %eax > + jz L(first_vec_x0_test) > + bsrl %eax, %eax > + leaq (VEC_SIZE)(%rdi, %rax), %rax > +#ifdef USE_AS_WCSRCHR > + andq $-CHAR_SIZE, %rax > +#endif > + ret > + > + .p2align 4 > +L(first_vec_x1_test): > + PCMPEQ %xmm0, %xmm2 > + pmovmskb %xmm2, %eax > + testl %eax, %eax > + jz L(first_vec_x0_test) > + bsrl %eax, %eax > + leaq (VEC_SIZE)(%rdi, %rax), %rax > +#ifdef USE_AS_WCSRCHR > + andq $-CHAR_SIZE, %rax > +#endif > + ret > + > + .p2align 4 > +L(first_vec_x2): > + PCMPEQ %xmm0, %xmm3 > + pmovmskb %xmm3, %eax > + leal -1(%rcx), %edx > + xorl %edx, %ecx > + andl %ecx, %eax > + jz L(first_vec_x1_test) > + bsrl %eax, %eax > + leaq (VEC_SIZE * 2)(%rdi, %rax), %rax > +#ifdef USE_AS_WCSRCHR > + andq $-CHAR_SIZE, %rax > +#endif > + ret > + > + .p2align 4 > +L(aligned_more): > + /* Save original pointer if match was in VEC 0. */ > + movq %rdi, %r8 > + andq $-VEC_SIZE, %rdi > + > + movaps VEC_SIZE(%rdi), %xmm2 > + pxor %xmm3, %xmm3 > + PCMPEQ %xmm2, %xmm3 > + pmovmskb %xmm3, %ecx > + testl %ecx, %ecx > + jnz L(first_vec_x1) > + > + movaps (VEC_SIZE * 2)(%rdi), %xmm3 > + pxor %xmm4, %xmm4 > + PCMPEQ %xmm3, %xmm4 > + pmovmskb %xmm4, %ecx > + testl %ecx, %ecx > + jnz L(first_vec_x2) > + > + addq $VEC_SIZE, %rdi > + /* Save pointer again before realigning. */ > + movq %rdi, %rsi > + andq $-(VEC_SIZE * 2), %rdi > + .p2align 4 > +L(first_loop): > + /* Do 2x VEC at a time. */ > + movaps (VEC_SIZE * 2)(%rdi), %xmm4 > + movaps (VEC_SIZE * 3)(%rdi), %xmm5 > + /* Since SSE2 no pminud so wcsrchr needs seperate logic for > + detecting zero. Note if this is found to be a bottleneck it > + may be worth adding an SSE4.1 wcsrchr implementation. 
*/ > +#ifdef USE_AS_WCSRCHR > + movaps %xmm5, %xmm6 > + pxor %xmm8, %xmm8 > + > + PCMPEQ %xmm8, %xmm5 > + PCMPEQ %xmm4, %xmm8 > + por %xmm5, %xmm8 > +#else > + movaps %xmm5, %xmm6 > + PMINU %xmm4, %xmm5 > +#endif > + > + movaps %xmm4, %xmm9 > + PCMPEQ %xmm0, %xmm4 > + PCMPEQ %xmm0, %xmm6 > + movaps %xmm6, %xmm7 > + por %xmm4, %xmm6 > +#ifndef USE_AS_WCSRCHR > + pxor %xmm8, %xmm8 > + PCMPEQ %xmm5, %xmm8 > +#endif > + pmovmskb %xmm8, %ecx > + pmovmskb %xmm6, %eax > > -# undef weak_alias > -# define weak_alias(strrchr, rindex) > -# undef libc_hidden_builtin_def > -# define libc_hidden_builtin_def(strrchr) > + addq $(VEC_SIZE * 2), %rdi > + /* Use `addl` 1) so we can undo it with `subl` and 2) it can > + macro-fuse with `jz`. */ > + addl %ecx, %eax > + jz L(first_loop) > + > + /* Check if there is zero match. */ > + testl %ecx, %ecx > + jz L(second_loop_match) > + > + /* Check if there was a match in last iteration. */ > + subl %ecx, %eax > + jnz L(new_match) > + > +L(first_loop_old_match): > + PCMPEQ %xmm0, %xmm2 > + PCMPEQ %xmm0, %xmm3 > + pmovmskb %xmm2, %ecx > + pmovmskb %xmm3, %eax > + addl %eax, %ecx > + jz L(first_vec_x0_test) > + /* NB: We could move this shift to before the branch and save a > + bit of code size / performance on the fall through. The > + branch leads to the null case which generally seems hotter > + than char in first 3x VEC. */ > + sall $16, %eax > + orl %ecx, %eax > + > + bsrl %eax, %eax > + addq %rsi, %rax > +#ifdef USE_AS_WCSRCHR > + andq $-CHAR_SIZE, %rax > +#endif > + ret > + > + .p2align 4 > +L(new_match): > + pxor %xmm6, %xmm6 > + PCMPEQ %xmm9, %xmm6 > + pmovmskb %xmm6, %eax > + sall $16, %ecx > + orl %eax, %ecx > + > + /* We can't reuse either of the old comparisons as since we mask > + of zeros after first zero (instead of using the full > + comparison) we can't gurantee no interference between match > + after end of string and valid match. */ > + pmovmskb %xmm4, %eax > + pmovmskb %xmm7, %edx > + sall $16, %edx > + orl %edx, %eax > + > + leal -1(%ecx), %edx > + xorl %edx, %ecx > + andl %ecx, %eax > + jz L(first_loop_old_match) > + bsrl %eax, %eax > + addq %rdi, %rax > +#ifdef USE_AS_WCSRCHR > + andq $-CHAR_SIZE, %rax > +#endif > + ret > + > + /* Save minimum state for getting most recent match. We can > + throw out all previous work. */ > + .p2align 4 > +L(second_loop_match): > + movq %rdi, %rsi > + movaps %xmm4, %xmm2 > + movaps %xmm7, %xmm3 > + > + .p2align 4 > +L(second_loop): > + movaps (VEC_SIZE * 2)(%rdi), %xmm4 > + movaps (VEC_SIZE * 3)(%rdi), %xmm5 > + /* Since SSE2 no pminud so wcsrchr needs seperate logic for > + detecting zero. Note if this is found to be a bottleneck it > + may be worth adding an SSE4.1 wcsrchr implementation. */ > +#ifdef USE_AS_WCSRCHR > + movaps %xmm5, %xmm6 > + pxor %xmm8, %xmm8 > + > + PCMPEQ %xmm8, %xmm5 > + PCMPEQ %xmm4, %xmm8 > + por %xmm5, %xmm8 > +#else > + movaps %xmm5, %xmm6 > + PMINU %xmm4, %xmm5 > +#endif > + > + movaps %xmm4, %xmm9 > + PCMPEQ %xmm0, %xmm4 > + PCMPEQ %xmm0, %xmm6 > + movaps %xmm6, %xmm7 > + por %xmm4, %xmm6 > +#ifndef USE_AS_WCSRCHR > + pxor %xmm8, %xmm8 > + PCMPEQ %xmm5, %xmm8 > #endif > > -#include "../strrchr.S" > + pmovmskb %xmm8, %ecx > + pmovmskb %xmm6, %eax > + > + addq $(VEC_SIZE * 2), %rdi > + /* Either null term or new occurence of CHAR. */ > + addl %ecx, %eax > + jz L(second_loop) > + > + /* No null term so much be new occurence of CHAR. 
*/ > + testl %ecx, %ecx > + jz L(second_loop_match) > + > + > + subl %ecx, %eax > + jnz L(second_loop_new_match) > + > +L(second_loop_old_match): > + pmovmskb %xmm2, %ecx > + pmovmskb %xmm3, %eax > + sall $16, %eax > + orl %ecx, %eax > + bsrl %eax, %eax > + addq %rsi, %rax > +#ifdef USE_AS_WCSRCHR > + andq $-CHAR_SIZE, %rax > +#endif > + ret > + > + .p2align 4 > +L(second_loop_new_match): > + pxor %xmm6, %xmm6 > + PCMPEQ %xmm9, %xmm6 > + pmovmskb %xmm6, %eax > + sall $16, %ecx > + orl %eax, %ecx > + > + /* We can't reuse either of the old comparisons as since we mask > + of zeros after first zero (instead of using the full > + comparison) we can't gurantee no interference between match > + after end of string and valid match. */ > + pmovmskb %xmm4, %eax > + pmovmskb %xmm7, %edx > + sall $16, %edx > + orl %edx, %eax > + > + leal -1(%ecx), %edx > + xorl %edx, %ecx > + andl %ecx, %eax > + jz L(second_loop_old_match) > + bsrl %eax, %eax > + addq %rdi, %rax > +#ifdef USE_AS_WCSRCHR > + andq $-CHAR_SIZE, %rax > +#endif > + ret > + > + .p2align 4,, 4 > +L(cross_page): > + movq %rdi, %rsi > + andq $-VEC_SIZE, %rsi > + movaps (%rsi), %xmm1 > + pxor %xmm2, %xmm2 > + PCMPEQ %xmm1, %xmm2 > + pmovmskb %xmm2, %edx > + movl %edi, %ecx > + andl $(VEC_SIZE - 1), %ecx > + sarl %cl, %edx > + jz L(cross_page_continue) > + PCMPEQ %xmm0, %xmm1 > + pmovmskb %xmm1, %eax > + sarl %cl, %eax > + leal -1(%rdx), %ecx > + xorl %edx, %ecx > + andl %ecx, %eax > + jz L(ret1) > + bsrl %eax, %eax > + addq %rdi, %rax > +#ifdef USE_AS_WCSRCHR > + andq $-CHAR_SIZE, %rax > +#endif > +L(ret1): > + ret > +END(STRRCHR) > diff --git a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S > index 69d2f3cdb1..d9259720f8 100644 > --- a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S > +++ b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S > @@ -17,6 +17,12 @@ > . */ > > #if IS_IN (libc) > -# define STRRCHR __wcsrchr_sse2 > +# ifndef STRRCHR > +# define STRRCHR __wcsrchr_sse2 > +# endif > #endif > -#include "../wcsrchr.S" > + > +#define USE_AS_WCSRCHR 1 > +#define NO_PMINU 1 > + > +#include "strrchr-sse2.S" > diff --git a/sysdeps/x86_64/strrchr.S b/sysdeps/x86_64/strrchr.S > index 4d7ba4ceb2..f39da60454 100644 > --- a/sysdeps/x86_64/strrchr.S > +++ b/sysdeps/x86_64/strrchr.S > @@ -16,363 +16,7 @@ > License along with the GNU C Library; if not, see > . */ > > - > -#include > - > -#ifndef STRRCHR > -# define STRRCHR strrchr > -#endif > - > -#ifdef USE_AS_WCSRCHR > -# define PCMPEQ pcmpeqd > -# define CHAR_SIZE 4 > -# define PMINU pminud > -#else > -# define PCMPEQ pcmpeqb > -# define CHAR_SIZE 1 > -# define PMINU pminub > -#endif > - > -#define PAGE_SIZE 4096 > -#define VEC_SIZE 16 > - > - .text > -ENTRY(STRRCHR) > - movd %esi, %xmm0 > - movq %rdi, %rax > - andl $(PAGE_SIZE - 1), %eax > -#ifndef USE_AS_WCSRCHR > - punpcklbw %xmm0, %xmm0 > - punpcklwd %xmm0, %xmm0 > -#endif > - pshufd $0, %xmm0, %xmm0 > - cmpl $(PAGE_SIZE - VEC_SIZE), %eax > - ja L(cross_page) > - > -L(cross_page_continue): > - movups (%rdi), %xmm1 > - pxor %xmm2, %xmm2 > - PCMPEQ %xmm1, %xmm2 > - pmovmskb %xmm2, %ecx > - testl %ecx, %ecx > - jz L(aligned_more) > - > - PCMPEQ %xmm0, %xmm1 > - pmovmskb %xmm1, %eax > - leal -1(%rcx), %edx > - xorl %edx, %ecx > - andl %ecx, %eax > - jz L(ret0) > - bsrl %eax, %eax > - addq %rdi, %rax > - /* We are off by 3 for wcsrchr if search CHAR is non-zero. If > - search CHAR is zero we are correct. Either way `andq > - -CHAR_SIZE, %rax` gets the correct result. 
*/ > -#ifdef USE_AS_WCSRCHR > - andq $-CHAR_SIZE, %rax > -#endif > -L(ret0): > - ret > - > - /* Returns for first vec x1/x2 have hard coded backward search > - path for earlier matches. */ > - .p2align 4 > -L(first_vec_x0_test): > - PCMPEQ %xmm0, %xmm1 > - pmovmskb %xmm1, %eax > - testl %eax, %eax > - jz L(ret0) > - bsrl %eax, %eax > - addq %r8, %rax > -#ifdef USE_AS_WCSRCHR > - andq $-CHAR_SIZE, %rax > -#endif > - ret > - > - .p2align 4 > -L(first_vec_x1): > - PCMPEQ %xmm0, %xmm2 > - pmovmskb %xmm2, %eax > - leal -1(%rcx), %edx > - xorl %edx, %ecx > - andl %ecx, %eax > - jz L(first_vec_x0_test) > - bsrl %eax, %eax > - leaq (VEC_SIZE)(%rdi, %rax), %rax > -#ifdef USE_AS_WCSRCHR > - andq $-CHAR_SIZE, %rax > -#endif > - ret > - > - .p2align 4 > -L(first_vec_x1_test): > - PCMPEQ %xmm0, %xmm2 > - pmovmskb %xmm2, %eax > - testl %eax, %eax > - jz L(first_vec_x0_test) > - bsrl %eax, %eax > - leaq (VEC_SIZE)(%rdi, %rax), %rax > -#ifdef USE_AS_WCSRCHR > - andq $-CHAR_SIZE, %rax > -#endif > - ret > - > - .p2align 4 > -L(first_vec_x2): > - PCMPEQ %xmm0, %xmm3 > - pmovmskb %xmm3, %eax > - leal -1(%rcx), %edx > - xorl %edx, %ecx > - andl %ecx, %eax > - jz L(first_vec_x1_test) > - bsrl %eax, %eax > - leaq (VEC_SIZE * 2)(%rdi, %rax), %rax > -#ifdef USE_AS_WCSRCHR > - andq $-CHAR_SIZE, %rax > -#endif > - ret > - > - .p2align 4 > -L(aligned_more): > - /* Save original pointer if match was in VEC 0. */ > - movq %rdi, %r8 > - andq $-VEC_SIZE, %rdi > - > - movaps VEC_SIZE(%rdi), %xmm2 > - pxor %xmm3, %xmm3 > - PCMPEQ %xmm2, %xmm3 > - pmovmskb %xmm3, %ecx > - testl %ecx, %ecx > - jnz L(first_vec_x1) > - > - movaps (VEC_SIZE * 2)(%rdi), %xmm3 > - pxor %xmm4, %xmm4 > - PCMPEQ %xmm3, %xmm4 > - pmovmskb %xmm4, %ecx > - testl %ecx, %ecx > - jnz L(first_vec_x2) > - > - addq $VEC_SIZE, %rdi > - /* Save pointer again before realigning. */ > - movq %rdi, %rsi > - andq $-(VEC_SIZE * 2), %rdi > - .p2align 4 > -L(first_loop): > - /* Do 2x VEC at a time. */ > - movaps (VEC_SIZE * 2)(%rdi), %xmm4 > - movaps (VEC_SIZE * 3)(%rdi), %xmm5 > - /* Since SSE2 no pminud so wcsrchr needs seperate logic for > - detecting zero. Note if this is found to be a bottleneck it > - may be worth adding an SSE4.1 wcsrchr implementation. */ > -#ifdef USE_AS_WCSRCHR > - movaps %xmm5, %xmm6 > - pxor %xmm8, %xmm8 > - > - PCMPEQ %xmm8, %xmm5 > - PCMPEQ %xmm4, %xmm8 > - por %xmm5, %xmm8 > -#else > - movaps %xmm5, %xmm6 > - PMINU %xmm4, %xmm5 > -#endif > - > - movaps %xmm4, %xmm9 > - PCMPEQ %xmm0, %xmm4 > - PCMPEQ %xmm0, %xmm6 > - movaps %xmm6, %xmm7 > - por %xmm4, %xmm6 > -#ifndef USE_AS_WCSRCHR > - pxor %xmm8, %xmm8 > - PCMPEQ %xmm5, %xmm8 > -#endif > - pmovmskb %xmm8, %ecx > - pmovmskb %xmm6, %eax > - > - addq $(VEC_SIZE * 2), %rdi > - /* Use `addl` 1) so we can undo it with `subl` and 2) it can > - macro-fuse with `jz`. */ > - addl %ecx, %eax > - jz L(first_loop) > - > - /* Check if there is zero match. */ > - testl %ecx, %ecx > - jz L(second_loop_match) > - > - /* Check if there was a match in last iteration. */ > - subl %ecx, %eax > - jnz L(new_match) > - > -L(first_loop_old_match): > - PCMPEQ %xmm0, %xmm2 > - PCMPEQ %xmm0, %xmm3 > - pmovmskb %xmm2, %ecx > - pmovmskb %xmm3, %eax > - addl %eax, %ecx > - jz L(first_vec_x0_test) > - /* NB: We could move this shift to before the branch and save a > - bit of code size / performance on the fall through. The > - branch leads to the null case which generally seems hotter > - than char in first 3x VEC. 
*/ > - sall $16, %eax > - orl %ecx, %eax > - > - bsrl %eax, %eax > - addq %rsi, %rax > -#ifdef USE_AS_WCSRCHR > - andq $-CHAR_SIZE, %rax > -#endif > - ret > - > - .p2align 4 > -L(new_match): > - pxor %xmm6, %xmm6 > - PCMPEQ %xmm9, %xmm6 > - pmovmskb %xmm6, %eax > - sall $16, %ecx > - orl %eax, %ecx > - > - /* We can't reuse either of the old comparisons as since we mask > - of zeros after first zero (instead of using the full > - comparison) we can't gurantee no interference between match > - after end of string and valid match. */ > - pmovmskb %xmm4, %eax > - pmovmskb %xmm7, %edx > - sall $16, %edx > - orl %edx, %eax > - > - leal -1(%ecx), %edx > - xorl %edx, %ecx > - andl %ecx, %eax > - jz L(first_loop_old_match) > - bsrl %eax, %eax > - addq %rdi, %rax > -#ifdef USE_AS_WCSRCHR > - andq $-CHAR_SIZE, %rax > -#endif > - ret > - > - /* Save minimum state for getting most recent match. We can > - throw out all previous work. */ > - .p2align 4 > -L(second_loop_match): > - movq %rdi, %rsi > - movaps %xmm4, %xmm2 > - movaps %xmm7, %xmm3 > - > - .p2align 4 > -L(second_loop): > - movaps (VEC_SIZE * 2)(%rdi), %xmm4 > - movaps (VEC_SIZE * 3)(%rdi), %xmm5 > - /* Since SSE2 no pminud so wcsrchr needs seperate logic for > - detecting zero. Note if this is found to be a bottleneck it > - may be worth adding an SSE4.1 wcsrchr implementation. */ > -#ifdef USE_AS_WCSRCHR > - movaps %xmm5, %xmm6 > - pxor %xmm8, %xmm8 > - > - PCMPEQ %xmm8, %xmm5 > - PCMPEQ %xmm4, %xmm8 > - por %xmm5, %xmm8 > -#else > - movaps %xmm5, %xmm6 > - PMINU %xmm4, %xmm5 > -#endif > - > - movaps %xmm4, %xmm9 > - PCMPEQ %xmm0, %xmm4 > - PCMPEQ %xmm0, %xmm6 > - movaps %xmm6, %xmm7 > - por %xmm4, %xmm6 > -#ifndef USE_AS_WCSRCHR > - pxor %xmm8, %xmm8 > - PCMPEQ %xmm5, %xmm8 > -#endif > - > - pmovmskb %xmm8, %ecx > - pmovmskb %xmm6, %eax > - > - addq $(VEC_SIZE * 2), %rdi > - /* Either null term or new occurence of CHAR. */ > - addl %ecx, %eax > - jz L(second_loop) > - > - /* No null term so much be new occurence of CHAR. */ > - testl %ecx, %ecx > - jz L(second_loop_match) > - > - > - subl %ecx, %eax > - jnz L(second_loop_new_match) > - > -L(second_loop_old_match): > - pmovmskb %xmm2, %ecx > - pmovmskb %xmm3, %eax > - sall $16, %eax > - orl %ecx, %eax > - bsrl %eax, %eax > - addq %rsi, %rax > -#ifdef USE_AS_WCSRCHR > - andq $-CHAR_SIZE, %rax > -#endif > - ret > - > - .p2align 4 > -L(second_loop_new_match): > - pxor %xmm6, %xmm6 > - PCMPEQ %xmm9, %xmm6 > - pmovmskb %xmm6, %eax > - sall $16, %ecx > - orl %eax, %ecx > - > - /* We can't reuse either of the old comparisons as since we mask > - of zeros after first zero (instead of using the full > - comparison) we can't gurantee no interference between match > - after end of string and valid match. 
*/
> -	pmovmskb %xmm4, %eax
> -	pmovmskb %xmm7, %edx
> -	sall	$16, %edx
> -	orl	%edx, %eax
> -
> -	leal	-1(%ecx), %edx
> -	xorl	%edx, %ecx
> -	andl	%ecx, %eax
> -	jz	L(second_loop_old_match)
> -	bsrl	%eax, %eax
> -	addq	%rdi, %rax
> -#ifdef USE_AS_WCSRCHR
> -	andq	$-CHAR_SIZE, %rax
> -#endif
> -	ret
> -
> -	.p2align 4,, 4
> -L(cross_page):
> -	movq	%rdi, %rsi
> -	andq	$-VEC_SIZE, %rsi
> -	movaps	(%rsi), %xmm1
> -	pxor	%xmm2, %xmm2
> -	PCMPEQ	%xmm1, %xmm2
> -	pmovmskb %xmm2, %edx
> -	movl	%edi, %ecx
> -	andl	$(VEC_SIZE - 1), %ecx
> -	sarl	%cl, %edx
> -	jz	L(cross_page_continue)
> -	PCMPEQ	%xmm0, %xmm1
> -	pmovmskb %xmm1, %eax
> -	sarl	%cl, %eax
> -	leal	-1(%rdx), %ecx
> -	xorl	%edx, %ecx
> -	andl	%ecx, %eax
> -	jz	L(ret1)
> -	bsrl	%eax, %eax
> -	addq	%rdi, %rax
> -#ifdef USE_AS_WCSRCHR
> -	andq	$-CHAR_SIZE, %rax
> -#endif
> -L(ret1):
> -	ret
> -END(STRRCHR)
> -
> -#ifndef USE_AS_WCSRCHR
> -	weak_alias (STRRCHR, rindex)
> -	libc_hidden_builtin_def (STRRCHR)
> -#endif
> +#define STRRCHR	strrchr
> +#include "multiarch/strrchr-sse2.S"
> +weak_alias (strrchr, rindex)
> +libc_hidden_builtin_def (strrchr)
> diff --git a/sysdeps/x86_64/wcsrchr.S b/sysdeps/x86_64/wcsrchr.S
> index 2b80efc5ef..1d4b1eb21c 100644
> --- a/sysdeps/x86_64/wcsrchr.S
> +++ b/sysdeps/x86_64/wcsrchr.S
> @@ -16,12 +16,5 @@
>     License along with the GNU C Library; if not, see
>     .  */
>
> -
> -#define USE_AS_WCSRCHR	1
> -#define NO_PMINU	1
> -
> -#ifndef STRRCHR
> -# define STRRCHR	wcsrchr
> -#endif
> -
> -#include "../strrchr.S"
> +#define STRRCHR	wcsrchr
> +#include "multiarch/wcsrchr-sse2.S"
> --
> 2.34.1
>

LGTM.

Thanks.

-- 
H.J.