* [PATCH v1 1/2] x86: Small improvements for wcscpy-ssse3
@ 2022-03-25 22:13 Noah Goldstein
2022-03-25 22:13 ` [PATCH v1 2/2] x86: Small improvements for wcslen Noah Goldstein
2022-03-28 18:51 ` [PATCH v1 1/2] x86: Small improvements for wcscpy-ssse3 H.J. Lu
0 siblings, 2 replies; 5+ messages in thread
From: Noah Goldstein @ 2022-03-25 22:13 UTC (permalink / raw)
To: libc-alpha
Just a few small QOL changes.
1. Prefer `add` over `lea`, as `add` can run on more execution
   units.
2. Don't break macro-fusion between `test` and `jcc`
geometric_mean(N=20) of all benchmarks New / Original: 0.973
All string/memory tests pass.
---
sysdeps/x86_64/multiarch/wcscpy-ssse3.S | 194 ++++++++++++------------
1 file changed, 97 insertions(+), 97 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/wcscpy-ssse3.S b/sysdeps/x86_64/multiarch/wcscpy-ssse3.S
index 34b09af327..aa2b9d030f 100644
--- a/sysdeps/x86_64/multiarch/wcscpy-ssse3.S
+++ b/sysdeps/x86_64/multiarch/wcscpy-ssse3.S
@@ -52,7 +52,7 @@ ENTRY (__wcscpy_ssse3)
jnz L(CopyFrom1To16Bytes)
mov %rdx, %rax
- lea 16(%rdx), %rdx
+ addq $16, %rdx
and $-16, %rdx
sub %rdx, %rax
sub %rax, %rcx
@@ -75,55 +75,55 @@ L(Align16Both):
movaps 16(%rcx), %xmm2
movaps %xmm1, (%rdx)
pcmpeqd %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- lea 16(%rsi), %rsi
+ pmovmskb %xmm0, %eax
+ addq $16, %rsi
- test %rax, %rax
+ test %eax, %eax
jnz L(CopyFrom1To16Bytes)
movaps 16(%rcx, %rsi), %xmm3
movaps %xmm2, (%rdx, %rsi)
pcmpeqd %xmm3, %xmm0
- pmovmskb %xmm0, %rax
- lea 16(%rsi), %rsi
+ pmovmskb %xmm0, %eax
+ addq $16, %rsi
- test %rax, %rax
+ test %eax, %eax
jnz L(CopyFrom1To16Bytes)
movaps 16(%rcx, %rsi), %xmm4
movaps %xmm3, (%rdx, %rsi)
pcmpeqd %xmm4, %xmm0
- pmovmskb %xmm0, %rax
- lea 16(%rsi), %rsi
+ pmovmskb %xmm0, %eax
+ addq $16, %rsi
- test %rax, %rax
+ test %eax, %eax
jnz L(CopyFrom1To16Bytes)
movaps 16(%rcx, %rsi), %xmm1
movaps %xmm4, (%rdx, %rsi)
pcmpeqd %xmm1, %xmm0
- pmovmskb %xmm0, %rax
- lea 16(%rsi), %rsi
+ pmovmskb %xmm0, %eax
+ addq $16, %rsi
- test %rax, %rax
+ test %eax, %eax
jnz L(CopyFrom1To16Bytes)
movaps 16(%rcx, %rsi), %xmm2
movaps %xmm1, (%rdx, %rsi)
pcmpeqd %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- lea 16(%rsi), %rsi
+ pmovmskb %xmm0, %eax
+ addq $16, %rsi
- test %rax, %rax
+ test %eax, %eax
jnz L(CopyFrom1To16Bytes)
movaps 16(%rcx, %rsi), %xmm3
movaps %xmm2, (%rdx, %rsi)
pcmpeqd %xmm3, %xmm0
- pmovmskb %xmm0, %rax
- lea 16(%rsi), %rsi
+ pmovmskb %xmm0, %eax
+ addq $16, %rsi
- test %rax, %rax
+ test %eax, %eax
jnz L(CopyFrom1To16Bytes)
movaps %xmm3, (%rdx, %rsi)
@@ -147,10 +147,10 @@ L(Aligned64Loop):
pminub %xmm7, %xmm3
pminub %xmm2, %xmm3
pcmpeqd %xmm0, %xmm3
- pmovmskb %xmm3, %rax
- lea 64(%rdx), %rdx
- lea 64(%rcx), %rcx
- test %rax, %rax
+ pmovmskb %xmm3, %eax
+ addq $64, %rdx
+ addq $64, %rcx
+ testl %eax, %eax
jnz L(Aligned64Leave)
movaps %xmm4, -64(%rdx)
movaps %xmm5, -48(%rdx)
@@ -160,32 +160,32 @@ L(Aligned64Loop):
L(Aligned64Leave):
pcmpeqd %xmm4, %xmm0
- pmovmskb %xmm0, %rax
- test %rax, %rax
+ pmovmskb %xmm0, %eax
+ test %eax, %eax
jnz L(CopyFrom1To16Bytes)
pcmpeqd %xmm5, %xmm0
- pmovmskb %xmm0, %rax
+ pmovmskb %xmm0, %eax
movaps %xmm4, -64(%rdx)
- test %rax, %rax
- lea 16(%rsi), %rsi
+ addq $16, %rsi
+ test %eax, %eax
jnz L(CopyFrom1To16Bytes)
pcmpeqd %xmm6, %xmm0
- pmovmskb %xmm0, %rax
+ pmovmskb %xmm0, %eax
movaps %xmm5, -48(%rdx)
- test %rax, %rax
- lea 16(%rsi), %rsi
+ addq $16, %rsi
+ test %eax, %eax
jnz L(CopyFrom1To16Bytes)
movaps %xmm6, -32(%rdx)
pcmpeqd %xmm7, %xmm0
- pmovmskb %xmm0, %rax
- lea 16(%rsi), %rsi
- test %rax, %rax
+ pmovmskb %xmm0, %eax
+ addq $16, %rsi
+ test %eax, %eax
jnz L(CopyFrom1To16Bytes)
mov $-0x40, %rsi
@@ -198,10 +198,10 @@ L(Shl4):
movaps 12(%rcx), %xmm2
L(Shl4Start):
pcmpeqd %xmm2, %xmm0
- pmovmskb %xmm0, %rax
+ pmovmskb %xmm0, %eax
movaps %xmm2, %xmm3
- test %rax, %rax
+ test %eax, %eax
jnz L(Shl4LoopExit)
palignr $4, %xmm1, %xmm2
@@ -209,12 +209,12 @@ L(Shl4Start):
movaps 28(%rcx), %xmm2
pcmpeqd %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
+ addq $16, %rdx
+ pmovmskb %xmm0, %eax
+ addq $16, %rcx
movaps %xmm2, %xmm1
- test %rax, %rax
+ test %eax, %eax
jnz L(Shl4LoopExit)
palignr $4, %xmm3, %xmm2
@@ -222,12 +222,12 @@ L(Shl4Start):
movaps 28(%rcx), %xmm2
pcmpeqd %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
+ addq $16, %rdx
+ pmovmskb %xmm0, %eax
+ addq $16, %rcx
movaps %xmm2, %xmm3
- test %rax, %rax
+ test %eax, %eax
jnz L(Shl4LoopExit)
palignr $4, %xmm1, %xmm2
@@ -235,22 +235,22 @@ L(Shl4Start):
movaps 28(%rcx), %xmm2
pcmpeqd %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
+ addq $16, %rdx
+ pmovmskb %xmm0, %eax
+ addq $16, %rcx
- test %rax, %rax
+ test %eax, %eax
jnz L(Shl4LoopExit)
palignr $4, %xmm3, %xmm2
movaps %xmm2, (%rdx)
- lea 28(%rcx), %rcx
- lea 16(%rdx), %rdx
+ addq $28, %rcx
+ addq $16, %rdx
mov %rcx, %rax
and $-0x40, %rcx
sub %rcx, %rax
- lea -12(%rcx), %rcx
+ addq $-12, %rcx
sub %rax, %rdx
movaps -4(%rcx), %xmm1
@@ -267,22 +267,22 @@ L(Shl4LoopStart):
pminub %xmm5, %xmm7
pminub %xmm6, %xmm7
pcmpeqd %xmm0, %xmm7
- pmovmskb %xmm7, %rax
+ pmovmskb %xmm7, %eax
movaps %xmm5, %xmm7
palignr $4, %xmm4, %xmm5
- test %rax, %rax
palignr $4, %xmm3, %xmm4
+ test %eax, %eax
jnz L(Shl4Start)
palignr $4, %xmm2, %xmm3
- lea 64(%rcx), %rcx
+ addq $64, %rcx
palignr $4, %xmm1, %xmm2
movaps %xmm7, %xmm1
movaps %xmm5, 48(%rdx)
movaps %xmm4, 32(%rdx)
movaps %xmm3, 16(%rdx)
movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
+ addq $64, %rdx
jmp L(Shl4LoopStart)
L(Shl4LoopExit):
@@ -297,10 +297,10 @@ L(Shl8):
movaps 8(%rcx), %xmm2
L(Shl8Start):
pcmpeqd %xmm2, %xmm0
- pmovmskb %xmm0, %rax
+ pmovmskb %xmm0, %eax
movaps %xmm2, %xmm3
- test %rax, %rax
+ test %eax, %eax
jnz L(Shl8LoopExit)
palignr $8, %xmm1, %xmm2
@@ -308,12 +308,12 @@ L(Shl8Start):
movaps 24(%rcx), %xmm2
pcmpeqd %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
+ addq $16, %rdx
+ pmovmskb %xmm0, %eax
+ addq $16, %rcx
movaps %xmm2, %xmm1
- test %rax, %rax
+ test %eax, %eax
jnz L(Shl8LoopExit)
palignr $8, %xmm3, %xmm2
@@ -321,12 +321,12 @@ L(Shl8Start):
movaps 24(%rcx), %xmm2
pcmpeqd %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
+ addq $16, %rdx
+ pmovmskb %xmm0, %eax
+ addq $16, %rcx
movaps %xmm2, %xmm3
- test %rax, %rax
+ test %eax, %eax
jnz L(Shl8LoopExit)
palignr $8, %xmm1, %xmm2
@@ -334,22 +334,22 @@ L(Shl8Start):
movaps 24(%rcx), %xmm2
pcmpeqd %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
+ addq $16, %rdx
+ pmovmskb %xmm0, %eax
+ addq $16, %rcx
- test %rax, %rax
+ test %eax, %eax
jnz L(Shl8LoopExit)
palignr $8, %xmm3, %xmm2
movaps %xmm2, (%rdx)
- lea 24(%rcx), %rcx
- lea 16(%rdx), %rdx
+ addq $24, %rcx
+ addq $16, %rdx
mov %rcx, %rax
and $-0x40, %rcx
sub %rcx, %rax
- lea -8(%rcx), %rcx
+ addq $-8, %rcx
sub %rax, %rdx
movaps -8(%rcx), %xmm1
@@ -366,22 +366,22 @@ L(Shl8LoopStart):
pminub %xmm5, %xmm7
pminub %xmm6, %xmm7
pcmpeqd %xmm0, %xmm7
- pmovmskb %xmm7, %rax
+ pmovmskb %xmm7, %eax
movaps %xmm5, %xmm7
palignr $8, %xmm4, %xmm5
- test %rax, %rax
palignr $8, %xmm3, %xmm4
+ test %eax, %eax
jnz L(Shl8Start)
palignr $8, %xmm2, %xmm3
- lea 64(%rcx), %rcx
+ addq $64, %rcx
palignr $8, %xmm1, %xmm2
movaps %xmm7, %xmm1
movaps %xmm5, 48(%rdx)
movaps %xmm4, 32(%rdx)
movaps %xmm3, 16(%rdx)
movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
+ addq $64, %rdx
jmp L(Shl8LoopStart)
L(Shl8LoopExit):
@@ -396,10 +396,10 @@ L(Shl12):
movaps 4(%rcx), %xmm2
L(Shl12Start):
pcmpeqd %xmm2, %xmm0
- pmovmskb %xmm0, %rax
+ pmovmskb %xmm0, %eax
movaps %xmm2, %xmm3
- test %rax, %rax
+ test %eax, %eax
jnz L(Shl12LoopExit)
palignr $12, %xmm1, %xmm2
@@ -407,12 +407,12 @@ L(Shl12Start):
movaps 20(%rcx), %xmm2
pcmpeqd %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
+ addq $16, %rdx
+ pmovmskb %xmm0, %eax
+ addq $16, %rcx
movaps %xmm2, %xmm1
- test %rax, %rax
+ test %eax, %eax
jnz L(Shl12LoopExit)
palignr $12, %xmm3, %xmm2
@@ -420,12 +420,12 @@ L(Shl12Start):
movaps 20(%rcx), %xmm2
pcmpeqd %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
+ addq $16, %rdx
+ pmovmskb %xmm0, %eax
+ addq $16, %rcx
movaps %xmm2, %xmm3
- test %rax, %rax
+ test %eax, %eax
jnz L(Shl12LoopExit)
palignr $12, %xmm1, %xmm2
@@ -433,22 +433,22 @@ L(Shl12Start):
movaps 20(%rcx), %xmm2
pcmpeqd %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
+ addq $16, %rdx
+ pmovmskb %xmm0, %eax
+ addq $16, %rcx
- test %rax, %rax
+ test %eax, %eax
jnz L(Shl12LoopExit)
palignr $12, %xmm3, %xmm2
movaps %xmm2, (%rdx)
- lea 20(%rcx), %rcx
- lea 16(%rdx), %rdx
+ addq $20, %rcx
+ addq $16, %rdx
mov %rcx, %rax
and $-0x40, %rcx
sub %rcx, %rax
- lea -4(%rcx), %rcx
+ addq $-4, %rcx
sub %rax, %rdx
movaps -12(%rcx), %xmm1
@@ -465,21 +465,21 @@ L(Shl12LoopStart):
pminub %xmm5, %xmm7
pminub %xmm6, %xmm7
pcmpeqd %xmm0, %xmm7
- pmovmskb %xmm7, %rax
+ pmovmskb %xmm7, %eax
movaps %xmm5, %xmm7
palignr $12, %xmm4, %xmm5
- test %rax, %rax
palignr $12, %xmm3, %xmm4
+ test %eax, %eax
jnz L(Shl12Start)
palignr $12, %xmm2, %xmm3
- lea 64(%rcx), %rcx
+ addq $64, %rcx
palignr $12, %xmm1, %xmm2
movaps %xmm7, %xmm1
movaps %xmm5, 48(%rdx)
movaps %xmm4, 32(%rdx)
movaps %xmm3, 16(%rdx)
movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
+ addq $64, %rdx
jmp L(Shl12LoopStart)
L(Shl12LoopExit):
--
2.25.1
^ permalink raw reply [flat|nested] 5+ messages in thread
* [PATCH v1 2/2] x86: Small improvements for wcslen
2022-03-25 22:13 [PATCH v1 1/2] x86: Small improvements for wcscpy-ssse3 Noah Goldstein
@ 2022-03-25 22:13 ` Noah Goldstein
2022-03-28 18:51 ` H.J. Lu
2022-03-28 18:51 ` [PATCH v1 1/2] x86: Small improvements for wcscpy-ssse3 H.J. Lu
1 sibling, 1 reply; 5+ messages in thread
From: Noah Goldstein @ 2022-03-25 22:13 UTC (permalink / raw)
To: libc-alpha
Just a few QOL changes.
1. Prefer `add` over `lea`, as `add` can run on more execution
   units.
2. Don't break macro-fusion between `test` and `jcc`
3. Reduce code size by removing gratuitous padding bytes (-90
bytes).
geometric_mean(N=20) of all benchmarks New / Original: 0.959
All string/memory tests pass.
---
sysdeps/x86_64/wcslen.S | 86 ++++++++++++++++++++---------------------
1 file changed, 41 insertions(+), 45 deletions(-)
diff --git a/sysdeps/x86_64/wcslen.S b/sysdeps/x86_64/wcslen.S
index c9165dbf03..d641141d75 100644
--- a/sysdeps/x86_64/wcslen.S
+++ b/sysdeps/x86_64/wcslen.S
@@ -40,82 +40,82 @@ ENTRY (__wcslen)
pxor %xmm0, %xmm0
lea 32(%rdi), %rax
- lea 16(%rdi), %rcx
+ addq $16, %rdi
and $-16, %rax
pcmpeqd (%rax), %xmm0
pmovmskb %xmm0, %edx
pxor %xmm1, %xmm1
+ addq $16, %rax
test %edx, %edx
- lea 16(%rax), %rax
jnz L(exit)
pcmpeqd (%rax), %xmm1
pmovmskb %xmm1, %edx
pxor %xmm2, %xmm2
+ addq $16, %rax
test %edx, %edx
- lea 16(%rax), %rax
jnz L(exit)
pcmpeqd (%rax), %xmm2
pmovmskb %xmm2, %edx
pxor %xmm3, %xmm3
+ addq $16, %rax
test %edx, %edx
- lea 16(%rax), %rax
jnz L(exit)
pcmpeqd (%rax), %xmm3
pmovmskb %xmm3, %edx
+ addq $16, %rax
test %edx, %edx
- lea 16(%rax), %rax
jnz L(exit)
pcmpeqd (%rax), %xmm0
pmovmskb %xmm0, %edx
+ addq $16, %rax
test %edx, %edx
- lea 16(%rax), %rax
jnz L(exit)
pcmpeqd (%rax), %xmm1
pmovmskb %xmm1, %edx
+ addq $16, %rax
test %edx, %edx
- lea 16(%rax), %rax
jnz L(exit)
pcmpeqd (%rax), %xmm2
pmovmskb %xmm2, %edx
+ addq $16, %rax
test %edx, %edx
- lea 16(%rax), %rax
jnz L(exit)
pcmpeqd (%rax), %xmm3
pmovmskb %xmm3, %edx
+ addq $16, %rax
test %edx, %edx
- lea 16(%rax), %rax
jnz L(exit)
pcmpeqd (%rax), %xmm0
pmovmskb %xmm0, %edx
+ addq $16, %rax
test %edx, %edx
- lea 16(%rax), %rax
jnz L(exit)
pcmpeqd (%rax), %xmm1
pmovmskb %xmm1, %edx
+ addq $16, %rax
test %edx, %edx
- lea 16(%rax), %rax
jnz L(exit)
pcmpeqd (%rax), %xmm2
pmovmskb %xmm2, %edx
+ addq $16, %rax
test %edx, %edx
- lea 16(%rax), %rax
jnz L(exit)
pcmpeqd (%rax), %xmm3
pmovmskb %xmm3, %edx
+ addq $16, %rax
test %edx, %edx
- lea 16(%rax), %rax
jnz L(exit)
and $-0x40, %rax
@@ -132,104 +132,100 @@ L(aligned_64_loop):
pminub %xmm0, %xmm2
pcmpeqd %xmm3, %xmm2
pmovmskb %xmm2, %edx
+ addq $64, %rax
test %edx, %edx
- lea 64(%rax), %rax
jz L(aligned_64_loop)
pcmpeqd -64(%rax), %xmm3
pmovmskb %xmm3, %edx
+ addq $48, %rdi
test %edx, %edx
- lea 48(%rcx), %rcx
jnz L(exit)
pcmpeqd %xmm1, %xmm3
pmovmskb %xmm3, %edx
+ addq $-16, %rdi
test %edx, %edx
- lea -16(%rcx), %rcx
jnz L(exit)
pcmpeqd -32(%rax), %xmm3
pmovmskb %xmm3, %edx
+ addq $-16, %rdi
test %edx, %edx
- lea -16(%rcx), %rcx
jnz L(exit)
pcmpeqd %xmm6, %xmm3
pmovmskb %xmm3, %edx
+ addq $-16, %rdi
test %edx, %edx
- lea -16(%rcx), %rcx
- jnz L(exit)
-
- jmp L(aligned_64_loop)
+ jz L(aligned_64_loop)
.p2align 4
L(exit):
- sub %rcx, %rax
+ sub %rdi, %rax
shr $2, %rax
test %dl, %dl
jz L(exit_high)
- mov %dl, %cl
- and $15, %cl
+ andl $15, %edx
jz L(exit_1)
ret
- .p2align 4
+ /* No align here. Naturally aligned % 16 == 1. */
L(exit_high):
- mov %dh, %ch
- and $15, %ch
+ andl $(15 << 8), %edx
jz L(exit_3)
add $2, %rax
ret
- .p2align 4
+ .p2align 3
L(exit_1):
add $1, %rax
ret
- .p2align 4
+ .p2align 3
L(exit_3):
add $3, %rax
ret
- .p2align 4
+ .p2align 3
L(exit_tail0):
- xor %rax, %rax
+ xorl %eax, %eax
ret
- .p2align 4
+ .p2align 3
L(exit_tail1):
- mov $1, %rax
+ movl $1, %eax
ret
- .p2align 4
+ .p2align 3
L(exit_tail2):
- mov $2, %rax
+ movl $2, %eax
ret
- .p2align 4
+ .p2align 3
L(exit_tail3):
- mov $3, %rax
+ movl $3, %eax
ret
- .p2align 4
+ .p2align 3
L(exit_tail4):
- mov $4, %rax
+ movl $4, %eax
ret
- .p2align 4
+ .p2align 3
L(exit_tail5):
- mov $5, %rax
+ movl $5, %eax
ret
- .p2align 4
+ .p2align 3
L(exit_tail6):
- mov $6, %rax
+ movl $6, %eax
ret
- .p2align 4
+ .p2align 3
L(exit_tail7):
- mov $7, %rax
+ movl $7, %eax
ret
END (__wcslen)
--
2.25.1
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH v1 1/2] x86: Small improvements for wcscpy-ssse3
2022-03-25 22:13 [PATCH v1 1/2] x86: Small improvements for wcscpy-ssse3 Noah Goldstein
2022-03-25 22:13 ` [PATCH v1 2/2] x86: Small improvements for wcslen Noah Goldstein
@ 2022-03-28 18:51 ` H.J. Lu
1 sibling, 0 replies; 5+ messages in thread
From: H.J. Lu @ 2022-03-28 18:51 UTC (permalink / raw)
To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell
On Fri, Mar 25, 2022 at 3:13 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Just a few small QOL changes.
> 1. Prefer `add` > `lea` as it has high execution units it can run
> on.
> 2. Don't break macro-fusion between `test` and `jcc`
>
> geometric_mean(N=20) of all benchmarks New / Original: 0.973
>
> All string/memory tests pass.
> ---
> sysdeps/x86_64/multiarch/wcscpy-ssse3.S | 194 ++++++++++++------------
> 1 file changed, 97 insertions(+), 97 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/wcscpy-ssse3.S b/sysdeps/x86_64/multiarch/wcscpy-ssse3.S
> index 34b09af327..aa2b9d030f 100644
> --- a/sysdeps/x86_64/multiarch/wcscpy-ssse3.S
> +++ b/sysdeps/x86_64/multiarch/wcscpy-ssse3.S
> @@ -52,7 +52,7 @@ ENTRY (__wcscpy_ssse3)
> jnz L(CopyFrom1To16Bytes)
>
> mov %rdx, %rax
> - lea 16(%rdx), %rdx
> + addq $16, %rdx
> and $-16, %rdx
> sub %rdx, %rax
> sub %rax, %rcx
> @@ -75,55 +75,55 @@ L(Align16Both):
> movaps 16(%rcx), %xmm2
> movaps %xmm1, (%rdx)
> pcmpeqd %xmm2, %xmm0
> - pmovmskb %xmm0, %rax
> - lea 16(%rsi), %rsi
> + pmovmskb %xmm0, %eax
> + addq $16, %rsi
>
> - test %rax, %rax
> + test %eax, %eax
> jnz L(CopyFrom1To16Bytes)
>
> movaps 16(%rcx, %rsi), %xmm3
> movaps %xmm2, (%rdx, %rsi)
> pcmpeqd %xmm3, %xmm0
> - pmovmskb %xmm0, %rax
> - lea 16(%rsi), %rsi
> + pmovmskb %xmm0, %eax
> + addq $16, %rsi
>
> - test %rax, %rax
> + test %eax, %eax
> jnz L(CopyFrom1To16Bytes)
>
> movaps 16(%rcx, %rsi), %xmm4
> movaps %xmm3, (%rdx, %rsi)
> pcmpeqd %xmm4, %xmm0
> - pmovmskb %xmm0, %rax
> - lea 16(%rsi), %rsi
> + pmovmskb %xmm0, %eax
> + addq $16, %rsi
>
> - test %rax, %rax
> + test %eax, %eax
> jnz L(CopyFrom1To16Bytes)
>
> movaps 16(%rcx, %rsi), %xmm1
> movaps %xmm4, (%rdx, %rsi)
> pcmpeqd %xmm1, %xmm0
> - pmovmskb %xmm0, %rax
> - lea 16(%rsi), %rsi
> + pmovmskb %xmm0, %eax
> + addq $16, %rsi
>
> - test %rax, %rax
> + test %eax, %eax
> jnz L(CopyFrom1To16Bytes)
>
> movaps 16(%rcx, %rsi), %xmm2
> movaps %xmm1, (%rdx, %rsi)
> pcmpeqd %xmm2, %xmm0
> - pmovmskb %xmm0, %rax
> - lea 16(%rsi), %rsi
> + pmovmskb %xmm0, %eax
> + addq $16, %rsi
>
> - test %rax, %rax
> + test %eax, %eax
> jnz L(CopyFrom1To16Bytes)
>
> movaps 16(%rcx, %rsi), %xmm3
> movaps %xmm2, (%rdx, %rsi)
> pcmpeqd %xmm3, %xmm0
> - pmovmskb %xmm0, %rax
> - lea 16(%rsi), %rsi
> + pmovmskb %xmm0, %eax
> + addq $16, %rsi
>
> - test %rax, %rax
> + test %eax, %eax
> jnz L(CopyFrom1To16Bytes)
>
> movaps %xmm3, (%rdx, %rsi)
> @@ -147,10 +147,10 @@ L(Aligned64Loop):
> pminub %xmm7, %xmm3
> pminub %xmm2, %xmm3
> pcmpeqd %xmm0, %xmm3
> - pmovmskb %xmm3, %rax
> - lea 64(%rdx), %rdx
> - lea 64(%rcx), %rcx
> - test %rax, %rax
> + pmovmskb %xmm3, %eax
> + addq $64, %rdx
> + addq $64, %rcx
> + testl %eax, %eax
> jnz L(Aligned64Leave)
> movaps %xmm4, -64(%rdx)
> movaps %xmm5, -48(%rdx)
> @@ -160,32 +160,32 @@ L(Aligned64Loop):
>
> L(Aligned64Leave):
> pcmpeqd %xmm4, %xmm0
> - pmovmskb %xmm0, %rax
> - test %rax, %rax
> + pmovmskb %xmm0, %eax
> + test %eax, %eax
> jnz L(CopyFrom1To16Bytes)
>
> pcmpeqd %xmm5, %xmm0
>
> - pmovmskb %xmm0, %rax
> + pmovmskb %xmm0, %eax
> movaps %xmm4, -64(%rdx)
> - test %rax, %rax
> - lea 16(%rsi), %rsi
> + addq $16, %rsi
> + test %eax, %eax
> jnz L(CopyFrom1To16Bytes)
>
> pcmpeqd %xmm6, %xmm0
>
> - pmovmskb %xmm0, %rax
> + pmovmskb %xmm0, %eax
> movaps %xmm5, -48(%rdx)
> - test %rax, %rax
> - lea 16(%rsi), %rsi
> + addq $16, %rsi
> + test %eax, %eax
> jnz L(CopyFrom1To16Bytes)
>
> movaps %xmm6, -32(%rdx)
> pcmpeqd %xmm7, %xmm0
>
> - pmovmskb %xmm0, %rax
> - lea 16(%rsi), %rsi
> - test %rax, %rax
> + pmovmskb %xmm0, %eax
> + addq $16, %rsi
> + test %eax, %eax
> jnz L(CopyFrom1To16Bytes)
>
> mov $-0x40, %rsi
> @@ -198,10 +198,10 @@ L(Shl4):
> movaps 12(%rcx), %xmm2
> L(Shl4Start):
> pcmpeqd %xmm2, %xmm0
> - pmovmskb %xmm0, %rax
> + pmovmskb %xmm0, %eax
> movaps %xmm2, %xmm3
>
> - test %rax, %rax
> + test %eax, %eax
> jnz L(Shl4LoopExit)
>
> palignr $4, %xmm1, %xmm2
> @@ -209,12 +209,12 @@ L(Shl4Start):
> movaps 28(%rcx), %xmm2
>
> pcmpeqd %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> + addq $16, %rdx
> + pmovmskb %xmm0, %eax
> + addq $16, %rcx
> movaps %xmm2, %xmm1
>
> - test %rax, %rax
> + test %eax, %eax
> jnz L(Shl4LoopExit)
>
> palignr $4, %xmm3, %xmm2
> @@ -222,12 +222,12 @@ L(Shl4Start):
> movaps 28(%rcx), %xmm2
>
> pcmpeqd %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> + addq $16, %rdx
> + pmovmskb %xmm0, %eax
> + addq $16, %rcx
> movaps %xmm2, %xmm3
>
> - test %rax, %rax
> + test %eax, %eax
> jnz L(Shl4LoopExit)
>
> palignr $4, %xmm1, %xmm2
> @@ -235,22 +235,22 @@ L(Shl4Start):
> movaps 28(%rcx), %xmm2
>
> pcmpeqd %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> + addq $16, %rdx
> + pmovmskb %xmm0, %eax
> + addq $16, %rcx
>
> - test %rax, %rax
> + test %eax, %eax
> jnz L(Shl4LoopExit)
>
> palignr $4, %xmm3, %xmm2
> movaps %xmm2, (%rdx)
> - lea 28(%rcx), %rcx
> - lea 16(%rdx), %rdx
> + addq $28, %rcx
> + addq $16, %rdx
>
> mov %rcx, %rax
> and $-0x40, %rcx
> sub %rcx, %rax
> - lea -12(%rcx), %rcx
> + addq $-12, %rcx
> sub %rax, %rdx
>
> movaps -4(%rcx), %xmm1
> @@ -267,22 +267,22 @@ L(Shl4LoopStart):
> pminub %xmm5, %xmm7
> pminub %xmm6, %xmm7
> pcmpeqd %xmm0, %xmm7
> - pmovmskb %xmm7, %rax
> + pmovmskb %xmm7, %eax
> movaps %xmm5, %xmm7
> palignr $4, %xmm4, %xmm5
> - test %rax, %rax
> palignr $4, %xmm3, %xmm4
> + test %eax, %eax
> jnz L(Shl4Start)
>
> palignr $4, %xmm2, %xmm3
> - lea 64(%rcx), %rcx
> + addq $64, %rcx
> palignr $4, %xmm1, %xmm2
> movaps %xmm7, %xmm1
> movaps %xmm5, 48(%rdx)
> movaps %xmm4, 32(%rdx)
> movaps %xmm3, 16(%rdx)
> movaps %xmm2, (%rdx)
> - lea 64(%rdx), %rdx
> + addq $64, %rdx
> jmp L(Shl4LoopStart)
>
> L(Shl4LoopExit):
> @@ -297,10 +297,10 @@ L(Shl8):
> movaps 8(%rcx), %xmm2
> L(Shl8Start):
> pcmpeqd %xmm2, %xmm0
> - pmovmskb %xmm0, %rax
> + pmovmskb %xmm0, %eax
> movaps %xmm2, %xmm3
>
> - test %rax, %rax
> + test %eax, %eax
> jnz L(Shl8LoopExit)
>
> palignr $8, %xmm1, %xmm2
> @@ -308,12 +308,12 @@ L(Shl8Start):
> movaps 24(%rcx), %xmm2
>
> pcmpeqd %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> + addq $16, %rdx
> + pmovmskb %xmm0, %eax
> + addq $16, %rcx
> movaps %xmm2, %xmm1
>
> - test %rax, %rax
> + test %eax, %eax
> jnz L(Shl8LoopExit)
>
> palignr $8, %xmm3, %xmm2
> @@ -321,12 +321,12 @@ L(Shl8Start):
> movaps 24(%rcx), %xmm2
>
> pcmpeqd %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> + addq $16, %rdx
> + pmovmskb %xmm0, %eax
> + addq $16, %rcx
> movaps %xmm2, %xmm3
>
> - test %rax, %rax
> + test %eax, %eax
> jnz L(Shl8LoopExit)
>
> palignr $8, %xmm1, %xmm2
> @@ -334,22 +334,22 @@ L(Shl8Start):
> movaps 24(%rcx), %xmm2
>
> pcmpeqd %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> + addq $16, %rdx
> + pmovmskb %xmm0, %eax
> + addq $16, %rcx
>
> - test %rax, %rax
> + test %eax, %eax
> jnz L(Shl8LoopExit)
>
> palignr $8, %xmm3, %xmm2
> movaps %xmm2, (%rdx)
> - lea 24(%rcx), %rcx
> - lea 16(%rdx), %rdx
> + addq $24, %rcx
> + addq $16, %rdx
>
> mov %rcx, %rax
> and $-0x40, %rcx
> sub %rcx, %rax
> - lea -8(%rcx), %rcx
> + addq $-8, %rcx
> sub %rax, %rdx
>
> movaps -8(%rcx), %xmm1
> @@ -366,22 +366,22 @@ L(Shl8LoopStart):
> pminub %xmm5, %xmm7
> pminub %xmm6, %xmm7
> pcmpeqd %xmm0, %xmm7
> - pmovmskb %xmm7, %rax
> + pmovmskb %xmm7, %eax
> movaps %xmm5, %xmm7
> palignr $8, %xmm4, %xmm5
> - test %rax, %rax
> palignr $8, %xmm3, %xmm4
> + test %eax, %eax
> jnz L(Shl8Start)
>
> palignr $8, %xmm2, %xmm3
> - lea 64(%rcx), %rcx
> + addq $64, %rcx
> palignr $8, %xmm1, %xmm2
> movaps %xmm7, %xmm1
> movaps %xmm5, 48(%rdx)
> movaps %xmm4, 32(%rdx)
> movaps %xmm3, 16(%rdx)
> movaps %xmm2, (%rdx)
> - lea 64(%rdx), %rdx
> + addq $64, %rdx
> jmp L(Shl8LoopStart)
>
> L(Shl8LoopExit):
> @@ -396,10 +396,10 @@ L(Shl12):
> movaps 4(%rcx), %xmm2
> L(Shl12Start):
> pcmpeqd %xmm2, %xmm0
> - pmovmskb %xmm0, %rax
> + pmovmskb %xmm0, %eax
> movaps %xmm2, %xmm3
>
> - test %rax, %rax
> + test %eax, %eax
> jnz L(Shl12LoopExit)
>
> palignr $12, %xmm1, %xmm2
> @@ -407,12 +407,12 @@ L(Shl12Start):
> movaps 20(%rcx), %xmm2
>
> pcmpeqd %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> + addq $16, %rdx
> + pmovmskb %xmm0, %eax
> + addq $16, %rcx
> movaps %xmm2, %xmm1
>
> - test %rax, %rax
> + test %eax, %eax
> jnz L(Shl12LoopExit)
>
> palignr $12, %xmm3, %xmm2
> @@ -420,12 +420,12 @@ L(Shl12Start):
> movaps 20(%rcx), %xmm2
>
> pcmpeqd %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> + addq $16, %rdx
> + pmovmskb %xmm0, %eax
> + addq $16, %rcx
> movaps %xmm2, %xmm3
>
> - test %rax, %rax
> + test %eax, %eax
> jnz L(Shl12LoopExit)
>
> palignr $12, %xmm1, %xmm2
> @@ -433,22 +433,22 @@ L(Shl12Start):
> movaps 20(%rcx), %xmm2
>
> pcmpeqd %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> + addq $16, %rdx
> + pmovmskb %xmm0, %eax
> + addq $16, %rcx
>
> - test %rax, %rax
> + test %eax, %eax
> jnz L(Shl12LoopExit)
>
> palignr $12, %xmm3, %xmm2
> movaps %xmm2, (%rdx)
> - lea 20(%rcx), %rcx
> - lea 16(%rdx), %rdx
> + addq $20, %rcx
> + addq $16, %rdx
>
> mov %rcx, %rax
> and $-0x40, %rcx
> sub %rcx, %rax
> - lea -4(%rcx), %rcx
> + addq $-4, %rcx
> sub %rax, %rdx
>
> movaps -12(%rcx), %xmm1
> @@ -465,21 +465,21 @@ L(Shl12LoopStart):
> pminub %xmm5, %xmm7
> pminub %xmm6, %xmm7
> pcmpeqd %xmm0, %xmm7
> - pmovmskb %xmm7, %rax
> + pmovmskb %xmm7, %eax
> movaps %xmm5, %xmm7
> palignr $12, %xmm4, %xmm5
> - test %rax, %rax
> palignr $12, %xmm3, %xmm4
> + test %eax, %eax
> jnz L(Shl12Start)
> palignr $12, %xmm2, %xmm3
> - lea 64(%rcx), %rcx
> + addq $64, %rcx
> palignr $12, %xmm1, %xmm2
> movaps %xmm7, %xmm1
> movaps %xmm5, 48(%rdx)
> movaps %xmm4, 32(%rdx)
> movaps %xmm3, 16(%rdx)
> movaps %xmm2, (%rdx)
> - lea 64(%rdx), %rdx
> + addq $64, %rdx
> jmp L(Shl12LoopStart)
>
> L(Shl12LoopExit):
> --
> 2.25.1
>
LGTM.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Thanks.
--
H.J.
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH v1 2/2] x86: Small improvements for wcslen
2022-03-25 22:13 ` [PATCH v1 2/2] x86: Small improvements for wcslen Noah Goldstein
@ 2022-03-28 18:51 ` H.J. Lu
2022-05-12 19:55 ` Sunil Pandey
0 siblings, 1 reply; 5+ messages in thread
From: H.J. Lu @ 2022-03-28 18:51 UTC (permalink / raw)
To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell
On Fri, Mar 25, 2022 at 3:13 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Just a few QOL changes.
> 1. Prefer `add` > `lea` as it has high execution units it can run
> on.
> 2. Don't break macro-fusion between `test` and `jcc`
> 3. Reduce code size by removing gratuitous padding bytes (-90
> bytes).
>
> geometric_mean(N=20) of all benchmarks New / Original: 0.959
>
> All string/memory tests pass.
> ---
> sysdeps/x86_64/wcslen.S | 86 ++++++++++++++++++++---------------------
> 1 file changed, 41 insertions(+), 45 deletions(-)
>
> diff --git a/sysdeps/x86_64/wcslen.S b/sysdeps/x86_64/wcslen.S
> index c9165dbf03..d641141d75 100644
> --- a/sysdeps/x86_64/wcslen.S
> +++ b/sysdeps/x86_64/wcslen.S
> @@ -40,82 +40,82 @@ ENTRY (__wcslen)
> pxor %xmm0, %xmm0
>
> lea 32(%rdi), %rax
> - lea 16(%rdi), %rcx
> + addq $16, %rdi
> and $-16, %rax
>
> pcmpeqd (%rax), %xmm0
> pmovmskb %xmm0, %edx
> pxor %xmm1, %xmm1
> + addq $16, %rax
> test %edx, %edx
> - lea 16(%rax), %rax
> jnz L(exit)
>
> pcmpeqd (%rax), %xmm1
> pmovmskb %xmm1, %edx
> pxor %xmm2, %xmm2
> + addq $16, %rax
> test %edx, %edx
> - lea 16(%rax), %rax
> jnz L(exit)
>
> pcmpeqd (%rax), %xmm2
> pmovmskb %xmm2, %edx
> pxor %xmm3, %xmm3
> + addq $16, %rax
> test %edx, %edx
> - lea 16(%rax), %rax
> jnz L(exit)
>
> pcmpeqd (%rax), %xmm3
> pmovmskb %xmm3, %edx
> + addq $16, %rax
> test %edx, %edx
> - lea 16(%rax), %rax
> jnz L(exit)
>
> pcmpeqd (%rax), %xmm0
> pmovmskb %xmm0, %edx
> + addq $16, %rax
> test %edx, %edx
> - lea 16(%rax), %rax
> jnz L(exit)
>
> pcmpeqd (%rax), %xmm1
> pmovmskb %xmm1, %edx
> + addq $16, %rax
> test %edx, %edx
> - lea 16(%rax), %rax
> jnz L(exit)
>
> pcmpeqd (%rax), %xmm2
> pmovmskb %xmm2, %edx
> + addq $16, %rax
> test %edx, %edx
> - lea 16(%rax), %rax
> jnz L(exit)
>
> pcmpeqd (%rax), %xmm3
> pmovmskb %xmm3, %edx
> + addq $16, %rax
> test %edx, %edx
> - lea 16(%rax), %rax
> jnz L(exit)
>
> pcmpeqd (%rax), %xmm0
> pmovmskb %xmm0, %edx
> + addq $16, %rax
> test %edx, %edx
> - lea 16(%rax), %rax
> jnz L(exit)
>
> pcmpeqd (%rax), %xmm1
> pmovmskb %xmm1, %edx
> + addq $16, %rax
> test %edx, %edx
> - lea 16(%rax), %rax
> jnz L(exit)
>
> pcmpeqd (%rax), %xmm2
> pmovmskb %xmm2, %edx
> + addq $16, %rax
> test %edx, %edx
> - lea 16(%rax), %rax
> jnz L(exit)
>
> pcmpeqd (%rax), %xmm3
> pmovmskb %xmm3, %edx
> + addq $16, %rax
> test %edx, %edx
> - lea 16(%rax), %rax
> jnz L(exit)
>
> and $-0x40, %rax
> @@ -132,104 +132,100 @@ L(aligned_64_loop):
> pminub %xmm0, %xmm2
> pcmpeqd %xmm3, %xmm2
> pmovmskb %xmm2, %edx
> + addq $64, %rax
> test %edx, %edx
> - lea 64(%rax), %rax
> jz L(aligned_64_loop)
>
> pcmpeqd -64(%rax), %xmm3
> pmovmskb %xmm3, %edx
> + addq $48, %rdi
> test %edx, %edx
> - lea 48(%rcx), %rcx
> jnz L(exit)
>
> pcmpeqd %xmm1, %xmm3
> pmovmskb %xmm3, %edx
> + addq $-16, %rdi
> test %edx, %edx
> - lea -16(%rcx), %rcx
> jnz L(exit)
>
> pcmpeqd -32(%rax), %xmm3
> pmovmskb %xmm3, %edx
> + addq $-16, %rdi
> test %edx, %edx
> - lea -16(%rcx), %rcx
> jnz L(exit)
>
> pcmpeqd %xmm6, %xmm3
> pmovmskb %xmm3, %edx
> + addq $-16, %rdi
> test %edx, %edx
> - lea -16(%rcx), %rcx
> - jnz L(exit)
> -
> - jmp L(aligned_64_loop)
> + jz L(aligned_64_loop)
>
> .p2align 4
> L(exit):
> - sub %rcx, %rax
> + sub %rdi, %rax
> shr $2, %rax
> test %dl, %dl
> jz L(exit_high)
>
> - mov %dl, %cl
> - and $15, %cl
> + andl $15, %edx
> jz L(exit_1)
> ret
>
> - .p2align 4
> + /* No align here. Naturally aligned % 16 == 1. */
> L(exit_high):
> - mov %dh, %ch
> - and $15, %ch
> + andl $(15 << 8), %edx
> jz L(exit_3)
> add $2, %rax
> ret
>
> - .p2align 4
> + .p2align 3
> L(exit_1):
> add $1, %rax
> ret
>
> - .p2align 4
> + .p2align 3
> L(exit_3):
> add $3, %rax
> ret
>
> - .p2align 4
> + .p2align 3
> L(exit_tail0):
> - xor %rax, %rax
> + xorl %eax, %eax
> ret
>
> - .p2align 4
> + .p2align 3
> L(exit_tail1):
> - mov $1, %rax
> + movl $1, %eax
> ret
>
> - .p2align 4
> + .p2align 3
> L(exit_tail2):
> - mov $2, %rax
> + movl $2, %eax
> ret
>
> - .p2align 4
> + .p2align 3
> L(exit_tail3):
> - mov $3, %rax
> + movl $3, %eax
> ret
>
> - .p2align 4
> + .p2align 3
> L(exit_tail4):
> - mov $4, %rax
> + movl $4, %eax
> ret
>
> - .p2align 4
> + .p2align 3
> L(exit_tail5):
> - mov $5, %rax
> + movl $5, %eax
> ret
>
> - .p2align 4
> + .p2align 3
> L(exit_tail6):
> - mov $6, %rax
> + movl $6, %eax
> ret
>
> - .p2align 4
> + .p2align 3
> L(exit_tail7):
> - mov $7, %rax
> + movl $7, %eax
> ret
>
> END (__wcslen)
> --
> 2.25.1
>
LGTM.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Thanks.
--
H.J.
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [PATCH v1 2/2] x86: Small improvements for wcslen
2022-03-28 18:51 ` H.J. Lu
@ 2022-05-12 19:55 ` Sunil Pandey
0 siblings, 0 replies; 5+ messages in thread
From: Sunil Pandey @ 2022-05-12 19:55 UTC (permalink / raw)
To: H.J. Lu, Libc-stable Mailing List; +Cc: Noah Goldstein, GNU C Library
On Mon, Mar 28, 2022 at 11:53 AM H.J. Lu via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> On Fri, Mar 25, 2022 at 3:13 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > Just a few QOL changes.
> > 1. Prefer `add` > `lea` as it has high execution units it can run
> > on.
> > 2. Don't break macro-fusion between `test` and `jcc`
> > 3. Reduce code size by removing gratuitous padding bytes (-90
> > bytes).
> >
> > geometric_mean(N=20) of all benchmarks New / Original: 0.959
> >
> > All string/memory tests pass.
> > ---
> > sysdeps/x86_64/wcslen.S | 86 ++++++++++++++++++++---------------------
> > 1 file changed, 41 insertions(+), 45 deletions(-)
> >
> > diff --git a/sysdeps/x86_64/wcslen.S b/sysdeps/x86_64/wcslen.S
> > index c9165dbf03..d641141d75 100644
> > --- a/sysdeps/x86_64/wcslen.S
> > +++ b/sysdeps/x86_64/wcslen.S
> > @@ -40,82 +40,82 @@ ENTRY (__wcslen)
> > pxor %xmm0, %xmm0
> >
> > lea 32(%rdi), %rax
> > - lea 16(%rdi), %rcx
> > + addq $16, %rdi
> > and $-16, %rax
> >
> > pcmpeqd (%rax), %xmm0
> > pmovmskb %xmm0, %edx
> > pxor %xmm1, %xmm1
> > + addq $16, %rax
> > test %edx, %edx
> > - lea 16(%rax), %rax
> > jnz L(exit)
> >
> > pcmpeqd (%rax), %xmm1
> > pmovmskb %xmm1, %edx
> > pxor %xmm2, %xmm2
> > + addq $16, %rax
> > test %edx, %edx
> > - lea 16(%rax), %rax
> > jnz L(exit)
> >
> > pcmpeqd (%rax), %xmm2
> > pmovmskb %xmm2, %edx
> > pxor %xmm3, %xmm3
> > + addq $16, %rax
> > test %edx, %edx
> > - lea 16(%rax), %rax
> > jnz L(exit)
> >
> > pcmpeqd (%rax), %xmm3
> > pmovmskb %xmm3, %edx
> > + addq $16, %rax
> > test %edx, %edx
> > - lea 16(%rax), %rax
> > jnz L(exit)
> >
> > pcmpeqd (%rax), %xmm0
> > pmovmskb %xmm0, %edx
> > + addq $16, %rax
> > test %edx, %edx
> > - lea 16(%rax), %rax
> > jnz L(exit)
> >
> > pcmpeqd (%rax), %xmm1
> > pmovmskb %xmm1, %edx
> > + addq $16, %rax
> > test %edx, %edx
> > - lea 16(%rax), %rax
> > jnz L(exit)
> >
> > pcmpeqd (%rax), %xmm2
> > pmovmskb %xmm2, %edx
> > + addq $16, %rax
> > test %edx, %edx
> > - lea 16(%rax), %rax
> > jnz L(exit)
> >
> > pcmpeqd (%rax), %xmm3
> > pmovmskb %xmm3, %edx
> > + addq $16, %rax
> > test %edx, %edx
> > - lea 16(%rax), %rax
> > jnz L(exit)
> >
> > pcmpeqd (%rax), %xmm0
> > pmovmskb %xmm0, %edx
> > + addq $16, %rax
> > test %edx, %edx
> > - lea 16(%rax), %rax
> > jnz L(exit)
> >
> > pcmpeqd (%rax), %xmm1
> > pmovmskb %xmm1, %edx
> > + addq $16, %rax
> > test %edx, %edx
> > - lea 16(%rax), %rax
> > jnz L(exit)
> >
> > pcmpeqd (%rax), %xmm2
> > pmovmskb %xmm2, %edx
> > + addq $16, %rax
> > test %edx, %edx
> > - lea 16(%rax), %rax
> > jnz L(exit)
> >
> > pcmpeqd (%rax), %xmm3
> > pmovmskb %xmm3, %edx
> > + addq $16, %rax
> > test %edx, %edx
> > - lea 16(%rax), %rax
> > jnz L(exit)
> >
> > and $-0x40, %rax
> > @@ -132,104 +132,100 @@ L(aligned_64_loop):
> > pminub %xmm0, %xmm2
> > pcmpeqd %xmm3, %xmm2
> > pmovmskb %xmm2, %edx
> > + addq $64, %rax
> > test %edx, %edx
> > - lea 64(%rax), %rax
> > jz L(aligned_64_loop)
> >
> > pcmpeqd -64(%rax), %xmm3
> > pmovmskb %xmm3, %edx
> > + addq $48, %rdi
> > test %edx, %edx
> > - lea 48(%rcx), %rcx
> > jnz L(exit)
> >
> > pcmpeqd %xmm1, %xmm3
> > pmovmskb %xmm3, %edx
> > + addq $-16, %rdi
> > test %edx, %edx
> > - lea -16(%rcx), %rcx
> > jnz L(exit)
> >
> > pcmpeqd -32(%rax), %xmm3
> > pmovmskb %xmm3, %edx
> > + addq $-16, %rdi
> > test %edx, %edx
> > - lea -16(%rcx), %rcx
> > jnz L(exit)
> >
> > pcmpeqd %xmm6, %xmm3
> > pmovmskb %xmm3, %edx
> > + addq $-16, %rdi
> > test %edx, %edx
> > - lea -16(%rcx), %rcx
> > - jnz L(exit)
> > -
> > - jmp L(aligned_64_loop)
> > + jz L(aligned_64_loop)
> >
> > .p2align 4
> > L(exit):
> > - sub %rcx, %rax
> > + sub %rdi, %rax
> > shr $2, %rax
> > test %dl, %dl
> > jz L(exit_high)
> >
> > - mov %dl, %cl
> > - and $15, %cl
> > + andl $15, %edx
> > jz L(exit_1)
> > ret
> >
> > - .p2align 4
> > + /* No align here. Naturally aligned % 16 == 1. */
> > L(exit_high):
> > - mov %dh, %ch
> > - and $15, %ch
> > + andl $(15 << 8), %edx
> > jz L(exit_3)
> > add $2, %rax
> > ret
> >
> > - .p2align 4
> > + .p2align 3
> > L(exit_1):
> > add $1, %rax
> > ret
> >
> > - .p2align 4
> > + .p2align 3
> > L(exit_3):
> > add $3, %rax
> > ret
> >
> > - .p2align 4
> > + .p2align 3
> > L(exit_tail0):
> > - xor %rax, %rax
> > + xorl %eax, %eax
> > ret
> >
> > - .p2align 4
> > + .p2align 3
> > L(exit_tail1):
> > - mov $1, %rax
> > + movl $1, %eax
> > ret
> >
> > - .p2align 4
> > + .p2align 3
> > L(exit_tail2):
> > - mov $2, %rax
> > + movl $2, %eax
> > ret
> >
> > - .p2align 4
> > + .p2align 3
> > L(exit_tail3):
> > - mov $3, %rax
> > + movl $3, %eax
> > ret
> >
> > - .p2align 4
> > + .p2align 3
> > L(exit_tail4):
> > - mov $4, %rax
> > + movl $4, %eax
> > ret
> >
> > - .p2align 4
> > + .p2align 3
> > L(exit_tail5):
> > - mov $5, %rax
> > + movl $5, %eax
> > ret
> >
> > - .p2align 4
> > + .p2align 3
> > L(exit_tail6):
> > - mov $6, %rax
> > + movl $6, %eax
> > ret
> >
> > - .p2align 4
> > + .p2align 3
> > L(exit_tail7):
> > - mov $7, %rax
> > + movl $7, %eax
> > ret
> >
> > END (__wcslen)
> > --
> > 2.25.1
> >
>
> LGTM.
>
> Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
>
> Thanks.
>
> --
> H.J.
I would like to backport this patch to release branches.
Any comments or objections?
--Sunil
^ permalink raw reply [flat|nested] 5+ messages in thread
end of thread, other threads:[~2022-05-12 19:55 UTC | newest]
Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-03-25 22:13 [PATCH v1 1/2] x86: Small improvements for wcscpy-ssse3 Noah Goldstein
2022-03-25 22:13 ` [PATCH v1 2/2] x86: Small improvements for wcslen Noah Goldstein
2022-03-28 18:51 ` H.J. Lu
2022-05-12 19:55 ` Sunil Pandey
2022-03-28 18:51 ` [PATCH v1 1/2] x86: Small improvements for wcscpy-ssse3 H.J. Lu
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).