From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: by sourceware.org (Postfix, from userid 7844) id C992E3858429; Wed, 13 Jul 2022 22:55:13 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org C992E3858429 Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: Noah Goldstein To: glibc-cvs@sourceware.org Subject: [glibc] x86: Move wcslen SSE2 implementation to multiarch/wcslen-sse2.S X-Act-Checkin: glibc X-Git-Author: Noah Goldstein X-Git-Refname: refs/heads/master X-Git-Oldrev: 64479f11b721fa33d17d623db31d047a11f363a1 X-Git-Newrev: e19bb87c97a3a109c418f68cebbea27ebc2808f9 Message-Id: <20220713225513.C992E3858429@sourceware.org> Date: Wed, 13 Jul 2022 22:55:13 +0000 (GMT) X-BeenThere: glibc-cvs@sourceware.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Glibc-cvs mailing list List-Unsubscribe: , List-Archive: List-Help: List-Subscribe: , X-List-Received-Date: Wed, 13 Jul 2022 22:55:13 -0000 https://sourceware.org/git/gitweb.cgi?p=glibc.git;h=e19bb87c97a3a109c418f68cebbea27ebc2808f9 commit e19bb87c97a3a109c418f68cebbea27ebc2808f9 Author: Noah Goldstein Date: Tue Jul 12 12:29:08 2022 -0700 x86: Move wcslen SSE2 implementation to multiarch/wcslen-sse2.S This commit doesn't affect libc.so.6, it's just housekeeping to prepare for adding explicit ISA level support. Tested build on x86_64 and x86_32 with/without multiarch. Diff: --- sysdeps/x86_64/multiarch/wcslen-sse2.S | 221 ++++++++++++++++++++++++++++++++- sysdeps/x86_64/wcslen.S | 216 +------------------------------- 2 files changed, 218 insertions(+), 219 deletions(-) diff --git a/sysdeps/x86_64/multiarch/wcslen-sse2.S b/sysdeps/x86_64/multiarch/wcslen-sse2.S index 2b3a9efd64..944c3bd9c6 100644 --- a/sysdeps/x86_64/multiarch/wcslen-sse2.S +++ b/sysdeps/x86_64/multiarch/wcslen-sse2.S @@ -17,10 +17,221 @@ <https://www.gnu.org/licenses/>. 
*/ #if IS_IN (libc) -# define __wcslen __wcslen_sse2 - -# undef weak_alias -# define weak_alias(__wcslen, wcslen) +# ifndef WCSLEN +# define WCSLEN __wcslen_sse2 +# endif #endif -#include "../wcslen.S" +#include <sysdep.h> + + .text +ENTRY (WCSLEN) + cmpl $0, (%rdi) + jz L(exit_tail0) + cmpl $0, 4(%rdi) + jz L(exit_tail1) + cmpl $0, 8(%rdi) + jz L(exit_tail2) + cmpl $0, 12(%rdi) + jz L(exit_tail3) + cmpl $0, 16(%rdi) + jz L(exit_tail4) + cmpl $0, 20(%rdi) + jz L(exit_tail5) + cmpl $0, 24(%rdi) + jz L(exit_tail6) + cmpl $0, 28(%rdi) + jz L(exit_tail7) + + pxor %xmm0, %xmm0 + + lea 32(%rdi), %rax + addq $16, %rdi + and $-16, %rax + + pcmpeqd (%rax), %xmm0 + pmovmskb %xmm0, %edx + pxor %xmm1, %xmm1 + addq $16, %rax + test %edx, %edx + jnz L(exit) + + pcmpeqd (%rax), %xmm1 + pmovmskb %xmm1, %edx + pxor %xmm2, %xmm2 + addq $16, %rax + test %edx, %edx + jnz L(exit) + + pcmpeqd (%rax), %xmm2 + pmovmskb %xmm2, %edx + pxor %xmm3, %xmm3 + addq $16, %rax + test %edx, %edx + jnz L(exit) + + pcmpeqd (%rax), %xmm3 + pmovmskb %xmm3, %edx + addq $16, %rax + test %edx, %edx + jnz L(exit) + + pcmpeqd (%rax), %xmm0 + pmovmskb %xmm0, %edx + addq $16, %rax + test %edx, %edx + jnz L(exit) + + pcmpeqd (%rax), %xmm1 + pmovmskb %xmm1, %edx + addq $16, %rax + test %edx, %edx + jnz L(exit) + + pcmpeqd (%rax), %xmm2 + pmovmskb %xmm2, %edx + addq $16, %rax + test %edx, %edx + jnz L(exit) + + pcmpeqd (%rax), %xmm3 + pmovmskb %xmm3, %edx + addq $16, %rax + test %edx, %edx + jnz L(exit) + + pcmpeqd (%rax), %xmm0 + pmovmskb %xmm0, %edx + addq $16, %rax + test %edx, %edx + jnz L(exit) + + pcmpeqd (%rax), %xmm1 + pmovmskb %xmm1, %edx + addq $16, %rax + test %edx, %edx + jnz L(exit) + + pcmpeqd (%rax), %xmm2 + pmovmskb %xmm2, %edx + addq $16, %rax + test %edx, %edx + jnz L(exit) + + pcmpeqd (%rax), %xmm3 + pmovmskb %xmm3, %edx + addq $16, %rax + test %edx, %edx + jnz L(exit) + + and $-0x40, %rax + + .p2align 4 +L(aligned_64_loop): + movaps (%rax), %xmm0 + movaps 16(%rax), %xmm1 + movaps 32(%rax), %xmm2 + movaps 
48(%rax), %xmm6 + + pminub %xmm1, %xmm0 + pminub %xmm6, %xmm2 + pminub %xmm0, %xmm2 + pcmpeqd %xmm3, %xmm2 + pmovmskb %xmm2, %edx + addq $64, %rax + test %edx, %edx + jz L(aligned_64_loop) + + pcmpeqd -64(%rax), %xmm3 + pmovmskb %xmm3, %edx + addq $48, %rdi + test %edx, %edx + jnz L(exit) + + pcmpeqd %xmm1, %xmm3 + pmovmskb %xmm3, %edx + addq $-16, %rdi + test %edx, %edx + jnz L(exit) + + pcmpeqd -32(%rax), %xmm3 + pmovmskb %xmm3, %edx + addq $-16, %rdi + test %edx, %edx + jnz L(exit) + + pcmpeqd %xmm6, %xmm3 + pmovmskb %xmm3, %edx + addq $-16, %rdi + test %edx, %edx + jz L(aligned_64_loop) + + .p2align 4 +L(exit): + sub %rdi, %rax + shr $2, %rax + test %dl, %dl + jz L(exit_high) + + andl $15, %edx + jz L(exit_1) + ret + + /* No align here. Naturally aligned % 16 == 1. */ +L(exit_high): + andl $(15 << 8), %edx + jz L(exit_3) + add $2, %rax + ret + + .p2align 3 +L(exit_1): + add $1, %rax + ret + + .p2align 3 +L(exit_3): + add $3, %rax + ret + + .p2align 3 +L(exit_tail0): + xorl %eax, %eax + ret + + .p2align 3 +L(exit_tail1): + movl $1, %eax + ret + + .p2align 3 +L(exit_tail2): + movl $2, %eax + ret + + .p2align 3 +L(exit_tail3): + movl $3, %eax + ret + + .p2align 3 +L(exit_tail4): + movl $4, %eax + ret + + .p2align 3 +L(exit_tail5): + movl $5, %eax + ret + + .p2align 3 +L(exit_tail6): + movl $6, %eax + ret + + .p2align 3 +L(exit_tail7): + movl $7, %eax + ret + +END (WCSLEN) diff --git a/sysdeps/x86_64/wcslen.S b/sysdeps/x86_64/wcslen.S index d641141d75..588a0fbe01 100644 --- a/sysdeps/x86_64/wcslen.S +++ b/sysdeps/x86_64/wcslen.S @@ -16,218 +16,6 @@ License along with the GNU C Library; if not, see <https://www.gnu.org/licenses/>. 
*/ -#include <sysdep.h> - - .text -ENTRY (__wcslen) - cmpl $0, (%rdi) - jz L(exit_tail0) - cmpl $0, 4(%rdi) - jz L(exit_tail1) - cmpl $0, 8(%rdi) - jz L(exit_tail2) - cmpl $0, 12(%rdi) - jz L(exit_tail3) - cmpl $0, 16(%rdi) - jz L(exit_tail4) - cmpl $0, 20(%rdi) - jz L(exit_tail5) - cmpl $0, 24(%rdi) - jz L(exit_tail6) - cmpl $0, 28(%rdi) - jz L(exit_tail7) - - pxor %xmm0, %xmm0 - - lea 32(%rdi), %rax - addq $16, %rdi - and $-16, %rax - - pcmpeqd (%rax), %xmm0 - pmovmskb %xmm0, %edx - pxor %xmm1, %xmm1 - addq $16, %rax - test %edx, %edx - jnz L(exit) - - pcmpeqd (%rax), %xmm1 - pmovmskb %xmm1, %edx - pxor %xmm2, %xmm2 - addq $16, %rax - test %edx, %edx - jnz L(exit) - - pcmpeqd (%rax), %xmm2 - pmovmskb %xmm2, %edx - pxor %xmm3, %xmm3 - addq $16, %rax - test %edx, %edx - jnz L(exit) - - pcmpeqd (%rax), %xmm3 - pmovmskb %xmm3, %edx - addq $16, %rax - test %edx, %edx - jnz L(exit) - - pcmpeqd (%rax), %xmm0 - pmovmskb %xmm0, %edx - addq $16, %rax - test %edx, %edx - jnz L(exit) - - pcmpeqd (%rax), %xmm1 - pmovmskb %xmm1, %edx - addq $16, %rax - test %edx, %edx - jnz L(exit) - - pcmpeqd (%rax), %xmm2 - pmovmskb %xmm2, %edx - addq $16, %rax - test %edx, %edx - jnz L(exit) - - pcmpeqd (%rax), %xmm3 - pmovmskb %xmm3, %edx - addq $16, %rax - test %edx, %edx - jnz L(exit) - - pcmpeqd (%rax), %xmm0 - pmovmskb %xmm0, %edx - addq $16, %rax - test %edx, %edx - jnz L(exit) - - pcmpeqd (%rax), %xmm1 - pmovmskb %xmm1, %edx - addq $16, %rax - test %edx, %edx - jnz L(exit) - - pcmpeqd (%rax), %xmm2 - pmovmskb %xmm2, %edx - addq $16, %rax - test %edx, %edx - jnz L(exit) - - pcmpeqd (%rax), %xmm3 - pmovmskb %xmm3, %edx - addq $16, %rax - test %edx, %edx - jnz L(exit) - - and $-0x40, %rax - - .p2align 4 -L(aligned_64_loop): - movaps (%rax), %xmm0 - movaps 16(%rax), %xmm1 - movaps 32(%rax), %xmm2 - movaps 48(%rax), %xmm6 - - pminub %xmm1, %xmm0 - pminub %xmm6, %xmm2 - pminub %xmm0, %xmm2 - pcmpeqd %xmm3, %xmm2 - pmovmskb %xmm2, %edx - addq $64, %rax - test %edx, %edx - jz L(aligned_64_loop) - - 
pcmpeqd -64(%rax), %xmm3 - pmovmskb %xmm3, %edx - addq $48, %rdi - test %edx, %edx - jnz L(exit) - - pcmpeqd %xmm1, %xmm3 - pmovmskb %xmm3, %edx - addq $-16, %rdi - test %edx, %edx - jnz L(exit) - - pcmpeqd -32(%rax), %xmm3 - pmovmskb %xmm3, %edx - addq $-16, %rdi - test %edx, %edx - jnz L(exit) - - pcmpeqd %xmm6, %xmm3 - pmovmskb %xmm3, %edx - addq $-16, %rdi - test %edx, %edx - jz L(aligned_64_loop) - - .p2align 4 -L(exit): - sub %rdi, %rax - shr $2, %rax - test %dl, %dl - jz L(exit_high) - - andl $15, %edx - jz L(exit_1) - ret - - /* No align here. Naturally aligned % 16 == 1. */ -L(exit_high): - andl $(15 << 8), %edx - jz L(exit_3) - add $2, %rax - ret - - .p2align 3 -L(exit_1): - add $1, %rax - ret - - .p2align 3 -L(exit_3): - add $3, %rax - ret - - .p2align 3 -L(exit_tail0): - xorl %eax, %eax - ret - - .p2align 3 -L(exit_tail1): - movl $1, %eax - ret - - .p2align 3 -L(exit_tail2): - movl $2, %eax - ret - - .p2align 3 -L(exit_tail3): - movl $3, %eax - ret - - .p2align 3 -L(exit_tail4): - movl $4, %eax - ret - - .p2align 3 -L(exit_tail5): - movl $5, %eax - ret - - .p2align 3 -L(exit_tail6): - movl $6, %eax - ret - - .p2align 3 -L(exit_tail7): - movl $7, %eax - ret - -END (__wcslen) - +#define WCSLEN __wcslen +#include "multiarch/wcslen-sse2.S" weak_alias(__wcslen, wcslen)