From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from mail-oa1-x2a.google.com (mail-oa1-x2a.google.com [IPv6:2001:4860:4864:20::2a]) by sourceware.org (Postfix) with ESMTPS id E51E13857410 for ; Tue, 12 Jul 2022 20:56:30 +0000 (GMT) DMARC-Filter: OpenDMARC Filter v1.4.1 sourceware.org E51E13857410 Received: by mail-oa1-x2a.google.com with SMTP id 586e51a60fabf-10c8e8d973eso9992203fac.5 for ; Tue, 12 Jul 2022 13:56:30 -0700 (PDT) X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20210112; h=x-gm-message-state:mime-version:references:in-reply-to:from:date :message-id:subject:to:cc; bh=ySGdy2riU++ZYBe7ab2BEJj+KCS+x3sNU/eQ0YQCRc4=; b=pPsnFV87QRj6VtEsUv/iCJ6Ee9mcqCuhVCy2vq2lUZ6kR6PP+Y7X6VjlNf13R/cIJo GrBWHhUf4gyi43d8VSgA34ei8nst5iP+YMzDOFSk9wYpNr5K9FtrQuJ8M+4Ua74F0aJe U2O34PTI0fLgVYMWz16/Xq/DQiI4kvweZqg1QXrQ4m4iTCiMejzNGMgk91oSluqQrcQX Sx1QYlKC+dNvv1MGtxIdq9PlnVGMZmpZESAXeuAJrnT7iRaCpzw5hM2gHQDbS+RzdKMo bPcELJofu4W7P9oCumouWtS8m8Agz1U89trPJ0cq+rrwk+J1RqUw2bejOfN0779OY74l sXwA== X-Gm-Message-State: AJIora9rRbb+p2MqhDq46yHL8TsgXZbR6etnz+WNfxUPy32wh04Gtn/J CssegRbmn13ts0WT3BP7LRlA97DDE6ZWSuwSgUE= X-Google-Smtp-Source: AGRyM1tetV7rZNwQR17huMExizeFpK+JsyEC+G65OoE58KE8toONE0rFWz8odJ0IX9i0DiYzbAQ5LWjbHuah4isn+A0= X-Received: by 2002:a05:6870:fbaa:b0:10b:fafd:4a91 with SMTP id kv42-20020a056870fbaa00b0010bfafd4a91mr49939oab.94.1657659390244; Tue, 12 Jul 2022 13:56:30 -0700 (PDT) MIME-Version: 1.0 References: <20220712192910.351121-1-goldstein.w.n@gmail.com> <20220712192910.351121-7-goldstein.w.n@gmail.com> In-Reply-To: <20220712192910.351121-7-goldstein.w.n@gmail.com> From: "H.J. 
Lu" Date: Tue, 12 Jul 2022 13:55:54 -0700 Message-ID: Subject: Re: [PATCH v1] x86: Move wcschr SSE2 implementation to multiarch/wcschr-sse2.S To: Noah Goldstein Cc: GNU C Library , "Carlos O'Donell" Content-Type: text/plain; charset="UTF-8" X-Spam-Status: No, score=-3024.1 required=5.0 tests=BAYES_00, DKIM_SIGNED, DKIM_VALID, DKIM_VALID_AU, DKIM_VALID_EF, FREEMAIL_FROM, GIT_PATCH_0, KAM_SHORT, RCVD_IN_DNSWL_NONE, SPF_HELO_NONE, SPF_PASS, TXREP, T_SCC_BODY_TEXT_LINE autolearn=ham autolearn_force=no version=3.4.6 X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on server2.sourceware.org X-BeenThere: libc-alpha@sourceware.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Libc-alpha mailing list List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Tue, 12 Jul 2022 20:56:33 -0000 On Tue, Jul 12, 2022 at 12:29 PM Noah Goldstein wrote: > > This commit doesn't affect libc.so.6, it's just housekeeping to prepare > for adding explicit ISA level support. > > Tested build on x86_64 and x86_32 with/without multiarch. > --- > sysdeps/x86_64/multiarch/wcschr-sse2.S | 145 +++++++++++++++++++++++-- > sysdeps/x86_64/wcschr.S | 135 +---------------------- > 2 files changed, 138 insertions(+), 142 deletions(-) > > diff --git a/sysdeps/x86_64/multiarch/wcschr-sse2.S b/sysdeps/x86_64/multiarch/wcschr-sse2.S > index 218ea609b9..c872926ba9 100644 > --- a/sysdeps/x86_64/multiarch/wcschr-sse2.S > +++ b/sysdeps/x86_64/multiarch/wcschr-sse2.S > @@ -17,14 +17,141 @@ > . 
*/ > > #if IS_IN (libc) > -# define __wcschr __wcschr_sse2 > - > -# undef weak_alias > -# define weak_alias(__wcschr, wcschr) > -# undef libc_hidden_def > -# define libc_hidden_def(__wcschr) > -# undef libc_hidden_weak > -# define libc_hidden_weak(wcschr) > +# ifndef WCSCHR > +# define WCSCHR __wcschr_sse2 > +# endif > #endif > > -#include "../wcschr.S" > +#include > + > + .text > +ENTRY (WCSCHR) > + > + movd %rsi, %xmm1 > + pxor %xmm2, %xmm2 > + mov %rdi, %rcx > + punpckldq %xmm1, %xmm1 > + punpckldq %xmm1, %xmm1 > + > + and $63, %rcx > + cmp $48, %rcx > + ja L(cross_cache) > + > + movdqu (%rdi), %xmm0 > + pcmpeqd %xmm0, %xmm2 > + add $16, %rdi > + pcmpeqd %xmm1, %xmm0 > + pmovmskb %xmm2, %rdx > + pmovmskb %xmm0, %rax > + or %rax, %rdx > + jnz L(matches) > + > + and $-16, %rdi > + > + movdqa (%rdi), %xmm0 > + pcmpeqd %xmm0, %xmm2 > + add $16, %rdi > + pcmpeqd %xmm1, %xmm0 > + pmovmskb %xmm2, %rdx > + pmovmskb %xmm0, %rax > + or %rax, %rdx > + jnz L(matches) > + > + jmp L(loop) > + > +L(cross_cache): > + and $15, %rcx > + and $-16, %rdi > + movdqa (%rdi), %xmm0 > + pcmpeqd %xmm0, %xmm2 > + pcmpeqd %xmm1, %xmm0 > + pmovmskb %xmm2, %rdx > + pmovmskb %xmm0, %rax > + > + sar %cl, %rdx > + sar %cl, %rax > + test %rax, %rax > + je L(unaligned_no_match) > + > + bsf %rax, %rax > + test %rdx, %rdx > + je L(unaligned_match) > + bsf %rdx, %rdx > + cmp %rdx, %rax > + ja L(return_null) > + > +L(unaligned_match): > + add %rdi, %rax > + add %rcx, %rax > + ret > + > + .p2align 4 > +L(unaligned_no_match): > + test %rdx, %rdx > + jne L(return_null) > + pxor %xmm2, %xmm2 > + > + add $16, %rdi > + > + .p2align 4 > +/* Loop start on aligned string. 
*/ > +L(loop): > + movdqa (%rdi), %xmm0 > + pcmpeqd %xmm0, %xmm2 > + add $16, %rdi > + pcmpeqd %xmm1, %xmm0 > + pmovmskb %xmm2, %rdx > + pmovmskb %xmm0, %rax > + or %rax, %rdx > + jnz L(matches) > + > + movdqa (%rdi), %xmm0 > + pcmpeqd %xmm0, %xmm2 > + add $16, %rdi > + pcmpeqd %xmm1, %xmm0 > + pmovmskb %xmm2, %rdx > + pmovmskb %xmm0, %rax > + or %rax, %rdx > + jnz L(matches) > + > + movdqa (%rdi), %xmm0 > + pcmpeqd %xmm0, %xmm2 > + add $16, %rdi > + pcmpeqd %xmm1, %xmm0 > + pmovmskb %xmm2, %rdx > + pmovmskb %xmm0, %rax > + or %rax, %rdx > + jnz L(matches) > + > + movdqa (%rdi), %xmm0 > + pcmpeqd %xmm0, %xmm2 > + add $16, %rdi > + pcmpeqd %xmm1, %xmm0 > + pmovmskb %xmm2, %rdx > + pmovmskb %xmm0, %rax > + or %rax, %rdx > + jnz L(matches) > + jmp L(loop) > + > + .p2align 4 > +L(matches): > + pmovmskb %xmm2, %rdx > + test %rax, %rax > + jz L(return_null) > + bsf %rax, %rax > + test %rdx, %rdx > + je L(match) > + bsf %rdx, %rcx > + cmp %rcx, %rax > + ja L(return_null) > +L(match): > + sub $16, %rdi > + add %rdi, %rax > + ret > + > + .p2align 4 > +L(return_null): > + xor %rax, %rax > + ret > + > +END (WCSCHR) > diff --git a/sysdeps/x86_64/wcschr.S b/sysdeps/x86_64/wcschr.S > index 2131220382..80b12c4286 100644 > --- a/sysdeps/x86_64/wcschr.S > +++ b/sysdeps/x86_64/wcschr.S > @@ -16,140 +16,9 @@ > License along with the GNU C Library; if not, see > . 
*/ > > -#include > - > - .text > -ENTRY (__wcschr) > - > - movd %rsi, %xmm1 > - pxor %xmm2, %xmm2 > - mov %rdi, %rcx > - punpckldq %xmm1, %xmm1 > - punpckldq %xmm1, %xmm1 > - > - and $63, %rcx > - cmp $48, %rcx > - ja L(cross_cache) > - > - movdqu (%rdi), %xmm0 > - pcmpeqd %xmm0, %xmm2 > - add $16, %rdi > - pcmpeqd %xmm1, %xmm0 > - pmovmskb %xmm2, %rdx > - pmovmskb %xmm0, %rax > - or %rax, %rdx > - jnz L(matches) > - > - and $-16, %rdi > - > - movdqa (%rdi), %xmm0 > - pcmpeqd %xmm0, %xmm2 > - add $16, %rdi > - pcmpeqd %xmm1, %xmm0 > - pmovmskb %xmm2, %rdx > - pmovmskb %xmm0, %rax > - or %rax, %rdx > - jnz L(matches) > - > - jmp L(loop) > - > -L(cross_cache): > - and $15, %rcx > - and $-16, %rdi > - movdqa (%rdi), %xmm0 > - pcmpeqd %xmm0, %xmm2 > - pcmpeqd %xmm1, %xmm0 > - pmovmskb %xmm2, %rdx > - pmovmskb %xmm0, %rax > - > - sar %cl, %rdx > - sar %cl, %rax > - test %rax, %rax > - je L(unaligned_no_match) > - > - bsf %rax, %rax > - test %rdx, %rdx > - je L(unaligned_match) > - bsf %rdx, %rdx > - cmp %rdx, %rax > - ja L(return_null) > - > -L(unaligned_match): > - add %rdi, %rax > - add %rcx, %rax > - ret > - > - .p2align 4 > -L(unaligned_no_match): > - test %rdx, %rdx > - jne L(return_null) > - pxor %xmm2, %xmm2 > - > - add $16, %rdi > - > - .p2align 4 > -/* Loop start on aligned string. 
*/ > -L(loop): > - movdqa (%rdi), %xmm0 > - pcmpeqd %xmm0, %xmm2 > - add $16, %rdi > - pcmpeqd %xmm1, %xmm0 > - pmovmskb %xmm2, %rdx > - pmovmskb %xmm0, %rax > - or %rax, %rdx > - jnz L(matches) > - > - movdqa (%rdi), %xmm0 > - pcmpeqd %xmm0, %xmm2 > - add $16, %rdi > - pcmpeqd %xmm1, %xmm0 > - pmovmskb %xmm2, %rdx > - pmovmskb %xmm0, %rax > - or %rax, %rdx > - jnz L(matches) > - > - movdqa (%rdi), %xmm0 > - pcmpeqd %xmm0, %xmm2 > - add $16, %rdi > - pcmpeqd %xmm1, %xmm0 > - pmovmskb %xmm2, %rdx > - pmovmskb %xmm0, %rax > - or %rax, %rdx > - jnz L(matches) > - > - movdqa (%rdi), %xmm0 > - pcmpeqd %xmm0, %xmm2 > - add $16, %rdi > - pcmpeqd %xmm1, %xmm0 > - pmovmskb %xmm2, %rdx > - pmovmskb %xmm0, %rax > - or %rax, %rdx > - jnz L(matches) > - jmp L(loop) > - > - .p2align 4 > -L(matches): > - pmovmskb %xmm2, %rdx > - test %rax, %rax > - jz L(return_null) > - bsf %rax, %rax > - test %rdx, %rdx > - je L(match) > - bsf %rdx, %rcx > - cmp %rcx, %rax > - ja L(return_null) > -L(match): > - sub $16, %rdi > - add %rdi, %rax > - ret > - > - .p2align 4 > -L(return_null): > - xor %rax, %rax > - ret > - > -END (__wcschr) > > +#define WCSCHR __wcschr > +#include "multiarch/wcschr-sse2.S" > libc_hidden_def(__wcschr) > weak_alias (__wcschr, wcschr) > libc_hidden_weak (wcschr) > -- > 2.34.1 > LGTM. Thanks. -- H.J.