public inbox for glibc-cvs@sourceware.org help / color / mirror / Atom feed
From: Noah Goldstein <nwg@sourceware.org> To: glibc-cvs@sourceware.org Subject: [glibc] x86: Small improvements for wcslen Date: Mon, 28 Mar 2022 20:55:49 +0000 (GMT) [thread overview] Message-ID: <20220328205549.42C443857811@sourceware.org> (raw) https://sourceware.org/git/gitweb.cgi?p=glibc.git;h=244b415d386487521882debb845a040a4758cb18 commit 244b415d386487521882debb845a040a4758cb18 Author: Noah Goldstein <goldstein.w.n@gmail.com> Date: Fri Mar 25 17:13:33 2022 -0500 x86: Small improvements for wcslen Just a few QOL changes. 1. Prefer `add` over `lea`, as it can run on more execution units. 2. Don't break macro-fusion between `test` and `jcc`. 3. Reduce code size by removing gratuitous padding bytes (-90 bytes). geometric_mean(N=20) of all benchmarks New / Original: 0.959 All string/memory tests pass. Reviewed-by: H.J. Lu <hjl.tools@gmail.com> Diff: --- sysdeps/x86_64/wcslen.S | 86 +++++++++++++++++++++++-------------------------- 1 file changed, 41 insertions(+), 45 deletions(-) diff --git a/sysdeps/x86_64/wcslen.S b/sysdeps/x86_64/wcslen.S index c9165dbf03..d641141d75 100644 --- a/sysdeps/x86_64/wcslen.S +++ b/sysdeps/x86_64/wcslen.S @@ -40,82 +40,82 @@ ENTRY (__wcslen) pxor %xmm0, %xmm0 lea 32(%rdi), %rax - lea 16(%rdi), %rcx + addq $16, %rdi and $-16, %rax pcmpeqd (%rax), %xmm0 pmovmskb %xmm0, %edx pxor %xmm1, %xmm1 + addq $16, %rax test %edx, %edx - lea 16(%rax), %rax jnz L(exit) pcmpeqd (%rax), %xmm1 pmovmskb %xmm1, %edx pxor %xmm2, %xmm2 + addq $16, %rax test %edx, %edx - lea 16(%rax), %rax jnz L(exit) pcmpeqd (%rax), %xmm2 pmovmskb %xmm2, %edx pxor %xmm3, %xmm3 + addq $16, %rax test %edx, %edx - lea 16(%rax), %rax jnz L(exit) pcmpeqd (%rax), %xmm3 pmovmskb %xmm3, %edx + addq $16, %rax test %edx, %edx - lea 16(%rax), %rax jnz L(exit) pcmpeqd (%rax), %xmm0 pmovmskb %xmm0, %edx + addq $16, %rax test %edx, %edx - lea 16(%rax), %rax jnz L(exit) pcmpeqd (%rax), %xmm1 pmovmskb %xmm1, %edx + addq $16, %rax test %edx, %edx - lea 16(%rax), %rax jnz L(exit) 
pcmpeqd (%rax), %xmm2 pmovmskb %xmm2, %edx + addq $16, %rax test %edx, %edx - lea 16(%rax), %rax jnz L(exit) pcmpeqd (%rax), %xmm3 pmovmskb %xmm3, %edx + addq $16, %rax test %edx, %edx - lea 16(%rax), %rax jnz L(exit) pcmpeqd (%rax), %xmm0 pmovmskb %xmm0, %edx + addq $16, %rax test %edx, %edx - lea 16(%rax), %rax jnz L(exit) pcmpeqd (%rax), %xmm1 pmovmskb %xmm1, %edx + addq $16, %rax test %edx, %edx - lea 16(%rax), %rax jnz L(exit) pcmpeqd (%rax), %xmm2 pmovmskb %xmm2, %edx + addq $16, %rax test %edx, %edx - lea 16(%rax), %rax jnz L(exit) pcmpeqd (%rax), %xmm3 pmovmskb %xmm3, %edx + addq $16, %rax test %edx, %edx - lea 16(%rax), %rax jnz L(exit) and $-0x40, %rax @@ -132,104 +132,100 @@ L(aligned_64_loop): pminub %xmm0, %xmm2 pcmpeqd %xmm3, %xmm2 pmovmskb %xmm2, %edx + addq $64, %rax test %edx, %edx - lea 64(%rax), %rax jz L(aligned_64_loop) pcmpeqd -64(%rax), %xmm3 pmovmskb %xmm3, %edx + addq $48, %rdi test %edx, %edx - lea 48(%rcx), %rcx jnz L(exit) pcmpeqd %xmm1, %xmm3 pmovmskb %xmm3, %edx + addq $-16, %rdi test %edx, %edx - lea -16(%rcx), %rcx jnz L(exit) pcmpeqd -32(%rax), %xmm3 pmovmskb %xmm3, %edx + addq $-16, %rdi test %edx, %edx - lea -16(%rcx), %rcx jnz L(exit) pcmpeqd %xmm6, %xmm3 pmovmskb %xmm3, %edx + addq $-16, %rdi test %edx, %edx - lea -16(%rcx), %rcx - jnz L(exit) - - jmp L(aligned_64_loop) + jz L(aligned_64_loop) .p2align 4 L(exit): - sub %rcx, %rax + sub %rdi, %rax shr $2, %rax test %dl, %dl jz L(exit_high) - mov %dl, %cl - and $15, %cl + andl $15, %edx jz L(exit_1) ret - .p2align 4 + /* No align here. Naturally aligned % 16 == 1. 
*/ L(exit_high): - mov %dh, %ch - and $15, %ch + andl $(15 << 8), %edx jz L(exit_3) add $2, %rax ret - .p2align 4 + .p2align 3 L(exit_1): add $1, %rax ret - .p2align 4 + .p2align 3 L(exit_3): add $3, %rax ret - .p2align 4 + .p2align 3 L(exit_tail0): - xor %rax, %rax + xorl %eax, %eax ret - .p2align 4 + .p2align 3 L(exit_tail1): - mov $1, %rax + movl $1, %eax ret - .p2align 4 + .p2align 3 L(exit_tail2): - mov $2, %rax + movl $2, %eax ret - .p2align 4 + .p2align 3 L(exit_tail3): - mov $3, %rax + movl $3, %eax ret - .p2align 4 + .p2align 3 L(exit_tail4): - mov $4, %rax + movl $4, %eax ret - .p2align 4 + .p2align 3 L(exit_tail5): - mov $5, %rax + movl $5, %eax ret - .p2align 4 + .p2align 3 L(exit_tail6): - mov $6, %rax + movl $6, %eax ret - .p2align 4 + .p2align 3 L(exit_tail7): - mov $7, %rax + movl $7, %eax ret END (__wcslen)
reply other threads:[~2022-03-28 20:55 UTC|newest] Thread overview: [no followups] expand[flat|nested] mbox.gz Atom feed
Reply instructions: You may reply publicly to this message via plain-text email using any one of the following methods: * Save the following mbox file, import it into your mail client, and reply-to-all from there: mbox Avoid top-posting and favor interleaved quoting: https://en.wikipedia.org/wiki/Posting_style#Interleaved_style * Reply using the --to, --cc, and --in-reply-to switches of git-send-email(1): git send-email \ --in-reply-to=20220328205549.42C443857811@sourceware.org \ --to=nwg@sourceware.org \ --cc=glibc-cvs@sourceware.org \ /path/to/YOUR_REPLY https://kernel.org/pub/software/scm/git/docs/git-send-email.html * If your mail client supports setting the In-Reply-To header via mailto: links, try the mailto: link. Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).