From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: by sourceware.org (Postfix, from userid 7844) id 265B53857C71; Mon, 28 Mar 2022 20:55:44 +0000 (GMT) DKIM-Filter: OpenDKIM Filter v2.11.0 sourceware.org 265B53857C71 Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: Noah Goldstein To: glibc-cvs@sourceware.org Subject: [glibc] x86: Small improvements for wcscpy-ssse3 X-Act-Checkin: glibc X-Git-Author: Noah Goldstein X-Git-Refname: refs/heads/master X-Git-Oldrev: 811c635dbae42a0ced67d2bffa8ad68b58d6e44e X-Git-Newrev: f5bff979d02cf115be94c0c0c6f1a1a505964772 Message-Id: <20220328205544.265B53857C71@sourceware.org> Date: Mon, 28 Mar 2022 20:55:44 +0000 (GMT) X-BeenThere: glibc-cvs@sourceware.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Glibc-cvs mailing list List-Unsubscribe: , List-Archive: List-Help: List-Subscribe: , X-List-Received-Date: Mon, 28 Mar 2022 20:55:44 -0000 https://sourceware.org/git/gitweb.cgi?p=glibc.git;h=f5bff979d02cf115be94c0c0c6f1a1a505964772 commit f5bff979d02cf115be94c0c0c6f1a1a505964772 Author: Noah Goldstein Date: Fri Mar 25 17:13:32 2022 -0500 x86: Small improvements for wcscpy-ssse3 Just a few small QOL changes. 1. Prefer `add` over `lea`, as it can run on more execution units. 2. Don't break macro-fusion between `test` and `jcc`. geometric_mean(N=20) of all benchmarks New / Original: 0.973 All string/memory tests pass. Reviewed-by: H.J. 
Lu Diff: --- sysdeps/x86_64/multiarch/wcscpy-ssse3.S | 194 ++++++++++++++++---------------- 1 file changed, 97 insertions(+), 97 deletions(-) diff --git a/sysdeps/x86_64/multiarch/wcscpy-ssse3.S b/sysdeps/x86_64/multiarch/wcscpy-ssse3.S index 34b09af327..aa2b9d030f 100644 --- a/sysdeps/x86_64/multiarch/wcscpy-ssse3.S +++ b/sysdeps/x86_64/multiarch/wcscpy-ssse3.S @@ -52,7 +52,7 @@ ENTRY (__wcscpy_ssse3) jnz L(CopyFrom1To16Bytes) mov %rdx, %rax - lea 16(%rdx), %rdx + addq $16, %rdx and $-16, %rdx sub %rdx, %rax sub %rax, %rcx @@ -75,55 +75,55 @@ L(Align16Both): movaps 16(%rcx), %xmm2 movaps %xmm1, (%rdx) pcmpeqd %xmm2, %xmm0 - pmovmskb %xmm0, %rax - lea 16(%rsi), %rsi + pmovmskb %xmm0, %eax + addq $16, %rsi - test %rax, %rax + test %eax, %eax jnz L(CopyFrom1To16Bytes) movaps 16(%rcx, %rsi), %xmm3 movaps %xmm2, (%rdx, %rsi) pcmpeqd %xmm3, %xmm0 - pmovmskb %xmm0, %rax - lea 16(%rsi), %rsi + pmovmskb %xmm0, %eax + addq $16, %rsi - test %rax, %rax + test %eax, %eax jnz L(CopyFrom1To16Bytes) movaps 16(%rcx, %rsi), %xmm4 movaps %xmm3, (%rdx, %rsi) pcmpeqd %xmm4, %xmm0 - pmovmskb %xmm0, %rax - lea 16(%rsi), %rsi + pmovmskb %xmm0, %eax + addq $16, %rsi - test %rax, %rax + test %eax, %eax jnz L(CopyFrom1To16Bytes) movaps 16(%rcx, %rsi), %xmm1 movaps %xmm4, (%rdx, %rsi) pcmpeqd %xmm1, %xmm0 - pmovmskb %xmm0, %rax - lea 16(%rsi), %rsi + pmovmskb %xmm0, %eax + addq $16, %rsi - test %rax, %rax + test %eax, %eax jnz L(CopyFrom1To16Bytes) movaps 16(%rcx, %rsi), %xmm2 movaps %xmm1, (%rdx, %rsi) pcmpeqd %xmm2, %xmm0 - pmovmskb %xmm0, %rax - lea 16(%rsi), %rsi + pmovmskb %xmm0, %eax + addq $16, %rsi - test %rax, %rax + test %eax, %eax jnz L(CopyFrom1To16Bytes) movaps 16(%rcx, %rsi), %xmm3 movaps %xmm2, (%rdx, %rsi) pcmpeqd %xmm3, %xmm0 - pmovmskb %xmm0, %rax - lea 16(%rsi), %rsi + pmovmskb %xmm0, %eax + addq $16, %rsi - test %rax, %rax + test %eax, %eax jnz L(CopyFrom1To16Bytes) movaps %xmm3, (%rdx, %rsi) @@ -147,10 +147,10 @@ L(Aligned64Loop): pminub %xmm7, %xmm3 pminub %xmm2, %xmm3 
pcmpeqd %xmm0, %xmm3 - pmovmskb %xmm3, %rax - lea 64(%rdx), %rdx - lea 64(%rcx), %rcx - test %rax, %rax + pmovmskb %xmm3, %eax + addq $64, %rdx + addq $64, %rcx + testl %eax, %eax jnz L(Aligned64Leave) movaps %xmm4, -64(%rdx) movaps %xmm5, -48(%rdx) @@ -160,32 +160,32 @@ L(Aligned64Loop): L(Aligned64Leave): pcmpeqd %xmm4, %xmm0 - pmovmskb %xmm0, %rax - test %rax, %rax + pmovmskb %xmm0, %eax + test %eax, %eax jnz L(CopyFrom1To16Bytes) pcmpeqd %xmm5, %xmm0 - pmovmskb %xmm0, %rax + pmovmskb %xmm0, %eax movaps %xmm4, -64(%rdx) - test %rax, %rax - lea 16(%rsi), %rsi + addq $16, %rsi + test %eax, %eax jnz L(CopyFrom1To16Bytes) pcmpeqd %xmm6, %xmm0 - pmovmskb %xmm0, %rax + pmovmskb %xmm0, %eax movaps %xmm5, -48(%rdx) - test %rax, %rax - lea 16(%rsi), %rsi + addq $16, %rsi + test %eax, %eax jnz L(CopyFrom1To16Bytes) movaps %xmm6, -32(%rdx) pcmpeqd %xmm7, %xmm0 - pmovmskb %xmm0, %rax - lea 16(%rsi), %rsi - test %rax, %rax + pmovmskb %xmm0, %eax + addq $16, %rsi + test %eax, %eax jnz L(CopyFrom1To16Bytes) mov $-0x40, %rsi @@ -198,10 +198,10 @@ L(Shl4): movaps 12(%rcx), %xmm2 L(Shl4Start): pcmpeqd %xmm2, %xmm0 - pmovmskb %xmm0, %rax + pmovmskb %xmm0, %eax movaps %xmm2, %xmm3 - test %rax, %rax + test %eax, %eax jnz L(Shl4LoopExit) palignr $4, %xmm1, %xmm2 @@ -209,12 +209,12 @@ L(Shl4Start): movaps 28(%rcx), %xmm2 pcmpeqd %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx + addq $16, %rdx + pmovmskb %xmm0, %eax + addq $16, %rcx movaps %xmm2, %xmm1 - test %rax, %rax + test %eax, %eax jnz L(Shl4LoopExit) palignr $4, %xmm3, %xmm2 @@ -222,12 +222,12 @@ L(Shl4Start): movaps 28(%rcx), %xmm2 pcmpeqd %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx + addq $16, %rdx + pmovmskb %xmm0, %eax + addq $16, %rcx movaps %xmm2, %xmm3 - test %rax, %rax + test %eax, %eax jnz L(Shl4LoopExit) palignr $4, %xmm1, %xmm2 @@ -235,22 +235,22 @@ L(Shl4Start): movaps 28(%rcx), %xmm2 pcmpeqd %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 
16(%rcx), %rcx + addq $16, %rdx + pmovmskb %xmm0, %eax + addq $16, %rcx - test %rax, %rax + test %eax, %eax jnz L(Shl4LoopExit) palignr $4, %xmm3, %xmm2 movaps %xmm2, (%rdx) - lea 28(%rcx), %rcx - lea 16(%rdx), %rdx + addq $28, %rcx + addq $16, %rdx mov %rcx, %rax and $-0x40, %rcx sub %rcx, %rax - lea -12(%rcx), %rcx + addq $-12, %rcx sub %rax, %rdx movaps -4(%rcx), %xmm1 @@ -267,22 +267,22 @@ L(Shl4LoopStart): pminub %xmm5, %xmm7 pminub %xmm6, %xmm7 pcmpeqd %xmm0, %xmm7 - pmovmskb %xmm7, %rax + pmovmskb %xmm7, %eax movaps %xmm5, %xmm7 palignr $4, %xmm4, %xmm5 - test %rax, %rax palignr $4, %xmm3, %xmm4 + test %eax, %eax jnz L(Shl4Start) palignr $4, %xmm2, %xmm3 - lea 64(%rcx), %rcx + addq $64, %rcx palignr $4, %xmm1, %xmm2 movaps %xmm7, %xmm1 movaps %xmm5, 48(%rdx) movaps %xmm4, 32(%rdx) movaps %xmm3, 16(%rdx) movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx + addq $64, %rdx jmp L(Shl4LoopStart) L(Shl4LoopExit): @@ -297,10 +297,10 @@ L(Shl8): movaps 8(%rcx), %xmm2 L(Shl8Start): pcmpeqd %xmm2, %xmm0 - pmovmskb %xmm0, %rax + pmovmskb %xmm0, %eax movaps %xmm2, %xmm3 - test %rax, %rax + test %eax, %eax jnz L(Shl8LoopExit) palignr $8, %xmm1, %xmm2 @@ -308,12 +308,12 @@ L(Shl8Start): movaps 24(%rcx), %xmm2 pcmpeqd %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx + addq $16, %rdx + pmovmskb %xmm0, %eax + addq $16, %rcx movaps %xmm2, %xmm1 - test %rax, %rax + test %eax, %eax jnz L(Shl8LoopExit) palignr $8, %xmm3, %xmm2 @@ -321,12 +321,12 @@ L(Shl8Start): movaps 24(%rcx), %xmm2 pcmpeqd %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx + addq $16, %rdx + pmovmskb %xmm0, %eax + addq $16, %rcx movaps %xmm2, %xmm3 - test %rax, %rax + test %eax, %eax jnz L(Shl8LoopExit) palignr $8, %xmm1, %xmm2 @@ -334,22 +334,22 @@ L(Shl8Start): movaps 24(%rcx), %xmm2 pcmpeqd %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx + addq $16, %rdx + pmovmskb %xmm0, %eax + addq $16, %rcx - test %rax, %rax + test %eax, %eax 
jnz L(Shl8LoopExit) palignr $8, %xmm3, %xmm2 movaps %xmm2, (%rdx) - lea 24(%rcx), %rcx - lea 16(%rdx), %rdx + addq $24, %rcx + addq $16, %rdx mov %rcx, %rax and $-0x40, %rcx sub %rcx, %rax - lea -8(%rcx), %rcx + addq $-8, %rcx sub %rax, %rdx movaps -8(%rcx), %xmm1 @@ -366,22 +366,22 @@ L(Shl8LoopStart): pminub %xmm5, %xmm7 pminub %xmm6, %xmm7 pcmpeqd %xmm0, %xmm7 - pmovmskb %xmm7, %rax + pmovmskb %xmm7, %eax movaps %xmm5, %xmm7 palignr $8, %xmm4, %xmm5 - test %rax, %rax palignr $8, %xmm3, %xmm4 + test %eax, %eax jnz L(Shl8Start) palignr $8, %xmm2, %xmm3 - lea 64(%rcx), %rcx + addq $64, %rcx palignr $8, %xmm1, %xmm2 movaps %xmm7, %xmm1 movaps %xmm5, 48(%rdx) movaps %xmm4, 32(%rdx) movaps %xmm3, 16(%rdx) movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx + addq $64, %rdx jmp L(Shl8LoopStart) L(Shl8LoopExit): @@ -396,10 +396,10 @@ L(Shl12): movaps 4(%rcx), %xmm2 L(Shl12Start): pcmpeqd %xmm2, %xmm0 - pmovmskb %xmm0, %rax + pmovmskb %xmm0, %eax movaps %xmm2, %xmm3 - test %rax, %rax + test %eax, %eax jnz L(Shl12LoopExit) palignr $12, %xmm1, %xmm2 @@ -407,12 +407,12 @@ L(Shl12Start): movaps 20(%rcx), %xmm2 pcmpeqd %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx + addq $16, %rdx + pmovmskb %xmm0, %eax + addq $16, %rcx movaps %xmm2, %xmm1 - test %rax, %rax + test %eax, %eax jnz L(Shl12LoopExit) palignr $12, %xmm3, %xmm2 @@ -420,12 +420,12 @@ L(Shl12Start): movaps 20(%rcx), %xmm2 pcmpeqd %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx + addq $16, %rdx + pmovmskb %xmm0, %eax + addq $16, %rcx movaps %xmm2, %xmm3 - test %rax, %rax + test %eax, %eax jnz L(Shl12LoopExit) palignr $12, %xmm1, %xmm2 @@ -433,22 +433,22 @@ L(Shl12Start): movaps 20(%rcx), %xmm2 pcmpeqd %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx + addq $16, %rdx + pmovmskb %xmm0, %eax + addq $16, %rcx - test %rax, %rax + test %eax, %eax jnz L(Shl12LoopExit) palignr $12, %xmm3, %xmm2 movaps %xmm2, (%rdx) - lea 20(%rcx), %rcx - lea 
16(%rdx), %rdx + addq $20, %rcx + addq $16, %rdx mov %rcx, %rax and $-0x40, %rcx sub %rcx, %rax - lea -4(%rcx), %rcx + addq $-4, %rcx sub %rax, %rdx movaps -12(%rcx), %xmm1 @@ -465,21 +465,21 @@ L(Shl12LoopStart): pminub %xmm5, %xmm7 pminub %xmm6, %xmm7 pcmpeqd %xmm0, %xmm7 - pmovmskb %xmm7, %rax + pmovmskb %xmm7, %eax movaps %xmm5, %xmm7 palignr $12, %xmm4, %xmm5 - test %rax, %rax palignr $12, %xmm3, %xmm4 + test %eax, %eax jnz L(Shl12Start) palignr $12, %xmm2, %xmm3 - lea 64(%rcx), %rcx + addq $64, %rcx palignr $12, %xmm1, %xmm2 movaps %xmm7, %xmm1 movaps %xmm5, 48(%rdx) movaps %xmm4, 32(%rdx) movaps %xmm3, 16(%rdx) movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx + addq $64, %rdx jmp L(Shl12LoopStart) L(Shl12LoopExit):