From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: by sourceware.org (Postfix, from userid 7844) id AC4BD385801D; Wed, 13 Jul 2022 22:55:08 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org AC4BD385801D Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: Noah Goldstein To: glibc-cvs@sourceware.org Subject: [glibc] x86: Move wcschr SSE2 implementation to multiarch/wcschr-sse2.S X-Act-Checkin: glibc X-Git-Author: Noah Goldstein X-Git-Refname: refs/heads/master X-Git-Oldrev: 72a48ec0f78c7fd948fe476eb41f69c071f48964 X-Git-Newrev: 64479f11b721fa33d17d623db31d047a11f363a1 Message-Id: <20220713225508.AC4BD385801D@sourceware.org> Date: Wed, 13 Jul 2022 22:55:08 +0000 (GMT) X-BeenThere: glibc-cvs@sourceware.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Glibc-cvs mailing list List-Unsubscribe: , List-Archive: List-Help: List-Subscribe: , X-List-Received-Date: Wed, 13 Jul 2022 22:55:08 -0000 https://sourceware.org/git/gitweb.cgi?p=glibc.git;h=64479f11b721fa33d17d623db31d047a11f363a1 commit 64479f11b721fa33d17d623db31d047a11f363a1 Author: Noah Goldstein Date: Tue Jul 12 12:29:07 2022 -0700 x86: Move wcschr SSE2 implementation to multiarch/wcschr-sse2.S This commit doesn't affect libc.so.6, its just housekeeping to prepare for adding explicit ISA level support. Tested build on x86_64 and x86_32 with/without multiarch. Diff: --- sysdeps/x86_64/multiarch/wcschr-sse2.S | 145 +++++++++++++++++++++++++++++++-- sysdeps/x86_64/wcschr.S | 135 +----------------------------- 2 files changed, 138 insertions(+), 142 deletions(-) diff --git a/sysdeps/x86_64/multiarch/wcschr-sse2.S b/sysdeps/x86_64/multiarch/wcschr-sse2.S index 218ea609b9..c872926ba9 100644 --- a/sysdeps/x86_64/multiarch/wcschr-sse2.S +++ b/sysdeps/x86_64/multiarch/wcschr-sse2.S @@ -17,14 +17,141 @@ . */ #if IS_IN (libc) -# define __wcschr __wcschr_sse2 - -# undef weak_alias -# define weak_alias(__wcschr, wcschr) -# undef libc_hidden_def -# define libc_hidden_def(__wcschr) -# undef libc_hidden_weak -# define libc_hidden_weak(wcschr) +# ifndef WCSCHR +# define WCSCHR __wcschr_sse2 +# endif #endif -#include "../wcschr.S" +#include + + .text +ENTRY (WCSCHR) + + movd %rsi, %xmm1 + pxor %xmm2, %xmm2 + mov %rdi, %rcx + punpckldq %xmm1, %xmm1 + punpckldq %xmm1, %xmm1 + + and $63, %rcx + cmp $48, %rcx + ja L(cross_cache) + + movdqu (%rdi), %xmm0 + pcmpeqd %xmm0, %xmm2 + add $16, %rdi + pcmpeqd %xmm1, %xmm0 + pmovmskb %xmm2, %rdx + pmovmskb %xmm0, %rax + or %rax, %rdx + jnz L(matches) + + and $-16, %rdi + + movdqa (%rdi), %xmm0 + pcmpeqd %xmm0, %xmm2 + add $16, %rdi + pcmpeqd %xmm1, %xmm0 + pmovmskb %xmm2, %rdx + pmovmskb %xmm0, %rax + or %rax, %rdx + jnz L(matches) + + jmp L(loop) + +L(cross_cache): + and $15, %rcx + and $-16, %rdi + movdqa (%rdi), %xmm0 + pcmpeqd %xmm0, %xmm2 + pcmpeqd %xmm1, %xmm0 + pmovmskb %xmm2, %rdx + pmovmskb %xmm0, %rax + + sar %cl, %rdx + sar %cl, %rax + test %rax, %rax + je L(unaligned_no_match) + + bsf %rax, %rax + test %rdx, %rdx + je L(unaligned_match) + bsf %rdx, %rdx + cmp %rdx, %rax + ja L(return_null) + +L(unaligned_match): + add %rdi, %rax + add %rcx, %rax + ret + + .p2align 4 +L(unaligned_no_match): + test %rdx, %rdx + jne L(return_null) + pxor %xmm2, %xmm2 + + add $16, %rdi + + .p2align 4 +/* Loop start on aligned string. */ +L(loop): + movdqa (%rdi), %xmm0 + pcmpeqd %xmm0, %xmm2 + add $16, %rdi + pcmpeqd %xmm1, %xmm0 + pmovmskb %xmm2, %rdx + pmovmskb %xmm0, %rax + or %rax, %rdx + jnz L(matches) + + movdqa (%rdi), %xmm0 + pcmpeqd %xmm0, %xmm2 + add $16, %rdi + pcmpeqd %xmm1, %xmm0 + pmovmskb %xmm2, %rdx + pmovmskb %xmm0, %rax + or %rax, %rdx + jnz L(matches) + + movdqa (%rdi), %xmm0 + pcmpeqd %xmm0, %xmm2 + add $16, %rdi + pcmpeqd %xmm1, %xmm0 + pmovmskb %xmm2, %rdx + pmovmskb %xmm0, %rax + or %rax, %rdx + jnz L(matches) + + movdqa (%rdi), %xmm0 + pcmpeqd %xmm0, %xmm2 + add $16, %rdi + pcmpeqd %xmm1, %xmm0 + pmovmskb %xmm2, %rdx + pmovmskb %xmm0, %rax + or %rax, %rdx + jnz L(matches) + jmp L(loop) + + .p2align 4 +L(matches): + pmovmskb %xmm2, %rdx + test %rax, %rax + jz L(return_null) + bsf %rax, %rax + test %rdx, %rdx + je L(match) + bsf %rdx, %rcx + cmp %rcx, %rax + ja L(return_null) +L(match): + sub $16, %rdi + add %rdi, %rax + ret + + .p2align 4 +L(return_null): + xor %rax, %rax + ret + +END (WCSCHR) diff --git a/sysdeps/x86_64/wcschr.S b/sysdeps/x86_64/wcschr.S index 2131220382..80b12c4286 100644 --- a/sysdeps/x86_64/wcschr.S +++ b/sysdeps/x86_64/wcschr.S @@ -16,140 +16,9 @@ License along with the GNU C Library; if not, see . */ -#include - - .text -ENTRY (__wcschr) - - movd %rsi, %xmm1 - pxor %xmm2, %xmm2 - mov %rdi, %rcx - punpckldq %xmm1, %xmm1 - punpckldq %xmm1, %xmm1 - - and $63, %rcx - cmp $48, %rcx - ja L(cross_cache) - - movdqu (%rdi), %xmm0 - pcmpeqd %xmm0, %xmm2 - add $16, %rdi - pcmpeqd %xmm1, %xmm0 - pmovmskb %xmm2, %rdx - pmovmskb %xmm0, %rax - or %rax, %rdx - jnz L(matches) - - and $-16, %rdi - - movdqa (%rdi), %xmm0 - pcmpeqd %xmm0, %xmm2 - add $16, %rdi - pcmpeqd %xmm1, %xmm0 - pmovmskb %xmm2, %rdx - pmovmskb %xmm0, %rax - or %rax, %rdx - jnz L(matches) - - jmp L(loop) - -L(cross_cache): - and $15, %rcx - and $-16, %rdi - movdqa (%rdi), %xmm0 - pcmpeqd %xmm0, %xmm2 - pcmpeqd %xmm1, %xmm0 - pmovmskb %xmm2, %rdx - pmovmskb %xmm0, %rax - - sar %cl, %rdx - sar %cl, %rax - test %rax, %rax - je L(unaligned_no_match) - - bsf %rax, %rax - test %rdx, %rdx - je L(unaligned_match) - bsf %rdx, %rdx - cmp %rdx, %rax - ja L(return_null) - -L(unaligned_match): - add %rdi, %rax - add %rcx, %rax - ret - - .p2align 4 -L(unaligned_no_match): - test %rdx, %rdx - jne L(return_null) - pxor %xmm2, %xmm2 - - add $16, %rdi - - .p2align 4 -/* Loop start on aligned string. */ -L(loop): - movdqa (%rdi), %xmm0 - pcmpeqd %xmm0, %xmm2 - add $16, %rdi - pcmpeqd %xmm1, %xmm0 - pmovmskb %xmm2, %rdx - pmovmskb %xmm0, %rax - or %rax, %rdx - jnz L(matches) - - movdqa (%rdi), %xmm0 - pcmpeqd %xmm0, %xmm2 - add $16, %rdi - pcmpeqd %xmm1, %xmm0 - pmovmskb %xmm2, %rdx - pmovmskb %xmm0, %rax - or %rax, %rdx - jnz L(matches) - - movdqa (%rdi), %xmm0 - pcmpeqd %xmm0, %xmm2 - add $16, %rdi - pcmpeqd %xmm1, %xmm0 - pmovmskb %xmm2, %rdx - pmovmskb %xmm0, %rax - or %rax, %rdx - jnz L(matches) - - movdqa (%rdi), %xmm0 - pcmpeqd %xmm0, %xmm2 - add $16, %rdi - pcmpeqd %xmm1, %xmm0 - pmovmskb %xmm2, %rdx - pmovmskb %xmm0, %rax - or %rax, %rdx - jnz L(matches) - jmp L(loop) - - .p2align 4 -L(matches): - pmovmskb %xmm2, %rdx - test %rax, %rax - jz L(return_null) - bsf %rax, %rax - test %rdx, %rdx - je L(match) - bsf %rdx, %rcx - cmp %rcx, %rax - ja L(return_null) -L(match): - sub $16, %rdi - add %rdi, %rax - ret - - .p2align 4 -L(return_null): - xor %rax, %rax - ret - -END (__wcschr) +#define WCSCHR __wcschr +#include "multiarch/wcschr-sse2.S" libc_hidden_def(__wcschr) weak_alias (__wcschr, wcschr) libc_hidden_weak (wcschr)