* [PATCH] AArch64: Improve strlen_asimd
@ 2023-01-12 15:51 Wilco Dijkstra
2023-01-13 12:25 ` Szabolcs Nagy
0 siblings, 1 reply; 4+ messages in thread
From: Wilco Dijkstra @ 2023-01-12 15:51 UTC (permalink / raw)
To: 'GNU C Library'; +Cc: Szabolcs Nagy
Use shrn for the mask, merge tst+bne into cbnz, and tweak code alignment.
Performance improves slightly as a result. Passes regress.
---
diff --git a/sysdeps/aarch64/multiarch/strlen_asimd.S b/sysdeps/aarch64/multiarch/strlen_asimd.S
index ca6ab96ecf2de45def79539facd8e0b86f4edc95..490439491d19c3f14b0228f42248bc8aa6e9e8bd 100644
--- a/sysdeps/aarch64/multiarch/strlen_asimd.S
+++ b/sysdeps/aarch64/multiarch/strlen_asimd.S
@@ -48,6 +48,7 @@
#define tmp x2
#define tmpw w2
#define synd x3
+#define syndw w3
#define shift x4
/* For the first 32 bytes, NUL detection works on the principle that
@@ -87,7 +88,6 @@
ENTRY (__strlen_asimd)
PTR_ARG (0)
-
and tmp1, srcin, MIN_PAGE_SIZE - 1
cmp tmp1, MIN_PAGE_SIZE - 32
b.hi L(page_cross)
@@ -123,7 +123,6 @@ ENTRY (__strlen_asimd)
add len, len, tmp1, lsr 3
ret
- .p2align 3
/* Look for a NUL byte at offset 16..31 in the string. */
L(bytes16_31):
ldp data1, data2, [srcin, 16]
@@ -151,6 +150,7 @@ L(bytes16_31):
add len, len, tmp1, lsr 3
ret
+ nop
L(loop_entry):
bic src, srcin, 31
@@ -166,18 +166,12 @@ L(loop):
/* Low 32 bits of synd are non-zero if a NUL was found in datav1. */
cmeq maskv.16b, datav1.16b, 0
sub len, src, srcin
- tst synd, 0xffffffff
- b.ne 1f
+ cbnz syndw, 1f
cmeq maskv.16b, datav2.16b, 0
add len, len, 16
1:
/* Generate a bitmask and compute correct byte offset. */
-#ifdef __AARCH64EB__
- bic maskv.8h, 0xf0
-#else
- bic maskv.8h, 0x0f, lsl 8
-#endif
- umaxp maskv.16b, maskv.16b, maskv.16b
+ shrn maskv.8b, maskv.8h, 4
fmov synd, maskd
#ifndef __AARCH64EB__
rbit synd, synd
@@ -186,8 +180,6 @@ L(loop):
add len, len, tmp, lsr 2
ret
- .p2align 4
-
L(page_cross):
bic src, srcin, 31
mov tmpw, 0x0c03
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [PATCH] AArch64: Improve strlen_asimd
2023-01-12 15:51 [PATCH] AArch64: Improve strlen_asimd Wilco Dijkstra
@ 2023-01-13 12:25 ` Szabolcs Nagy
2023-01-16 22:16 ` Carlos O'Donell
0 siblings, 1 reply; 4+ messages in thread
From: Szabolcs Nagy @ 2023-01-13 12:25 UTC (permalink / raw)
To: Wilco Dijkstra; +Cc: 'GNU C Library'
The 01/12/2023 15:51, Wilco Dijkstra wrote:
> Use shrn for the mask, merge tst+bne into cbnz, and tweak code alignment.
> Performance improves slightly as a result. Passes regress.
>
I prefer to commit this and the other string function optimization
patches and not delay to next release so start using and widely
testing them sooner (we can fix and backport perf regressions).
please commit it, thanks.
Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
> ---
>
> diff --git a/sysdeps/aarch64/multiarch/strlen_asimd.S b/sysdeps/aarch64/multiarch/strlen_asimd.S
> index ca6ab96ecf2de45def79539facd8e0b86f4edc95..490439491d19c3f14b0228f42248bc8aa6e9e8bd 100644
> --- a/sysdeps/aarch64/multiarch/strlen_asimd.S
> +++ b/sysdeps/aarch64/multiarch/strlen_asimd.S
> @@ -48,6 +48,7 @@
> #define tmp x2
> #define tmpw w2
> #define synd x3
> +#define syndw w3
> #define shift x4
>
> /* For the first 32 bytes, NUL detection works on the principle that
> @@ -87,7 +88,6 @@
>
> ENTRY (__strlen_asimd)
> PTR_ARG (0)
> -
> and tmp1, srcin, MIN_PAGE_SIZE - 1
> cmp tmp1, MIN_PAGE_SIZE - 32
> b.hi L(page_cross)
> @@ -123,7 +123,6 @@ ENTRY (__strlen_asimd)
> add len, len, tmp1, lsr 3
> ret
>
> - .p2align 3
> /* Look for a NUL byte at offset 16..31 in the string. */
> L(bytes16_31):
> ldp data1, data2, [srcin, 16]
> @@ -151,6 +150,7 @@ L(bytes16_31):
> add len, len, tmp1, lsr 3
> ret
>
> + nop
> L(loop_entry):
> bic src, srcin, 31
>
> @@ -166,18 +166,12 @@ L(loop):
> /* Low 32 bits of synd are non-zero if a NUL was found in datav1. */
> cmeq maskv.16b, datav1.16b, 0
> sub len, src, srcin
> - tst synd, 0xffffffff
> - b.ne 1f
> + cbnz syndw, 1f
> cmeq maskv.16b, datav2.16b, 0
> add len, len, 16
> 1:
> /* Generate a bitmask and compute correct byte offset. */
> -#ifdef __AARCH64EB__
> - bic maskv.8h, 0xf0
> -#else
> - bic maskv.8h, 0x0f, lsl 8
> -#endif
> - umaxp maskv.16b, maskv.16b, maskv.16b
> + shrn maskv.8b, maskv.8h, 4
> fmov synd, maskd
> #ifndef __AARCH64EB__
> rbit synd, synd
> @@ -186,8 +180,6 @@ L(loop):
> add len, len, tmp, lsr 2
> ret
>
> - .p2align 4
> -
> L(page_cross):
> bic src, srcin, 31
> mov tmpw, 0x0c03
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [PATCH] AArch64: Improve strlen_asimd
2023-01-13 12:25 ` Szabolcs Nagy
@ 2023-01-16 22:16 ` Carlos O'Donell
2023-01-17 16:37 ` Wilco Dijkstra
0 siblings, 1 reply; 4+ messages in thread
From: Carlos O'Donell @ 2023-01-16 22:16 UTC (permalink / raw)
To: Szabolcs Nagy, Wilco Dijkstra; +Cc: 'GNU C Library'
On 1/13/23 07:25, Szabolcs Nagy via Libc-alpha wrote:
> The 01/12/2023 15:51, Wilco Dijkstra wrote:
>> Use shrn for the mask, merge tst+bne into cbnz, and tweak code alignment.
>> Performance improves slightly as a result. Passes regress.
>>
>
> I prefer to commit this and the other string function optimization
> patches and not delay to next release so start using and widely
> testing them sooner (we can fix and backport perf regressions).
>
> please commit it, thanks.
>
> Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
Yes, please commit this ASAP.
As RM I would like to see machine testing start next week and so I'd like to
see such changes committed early this week.
--
Cheers,
Carlos.
^ permalink raw reply [flat|nested] 4+ messages in thread
* Re: [PATCH] AArch64: Improve strlen_asimd
2023-01-16 22:16 ` Carlos O'Donell
@ 2023-01-17 16:37 ` Wilco Dijkstra
0 siblings, 0 replies; 4+ messages in thread
From: Wilco Dijkstra @ 2023-01-17 16:37 UTC (permalink / raw)
To: Carlos O'Donell, Szabolcs Nagy; +Cc: 'GNU C Library'
Hi Carlos,
>> I prefer to commit this and the other string function optimization
>> patches and not delay to next release so start using and widely
>> testing them sooner (we can fix and backport perf regressions).
>>
>> please commit it, thanks.
>>
>> Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
>
> Yes, please commit this ASAP.
>
> As RM I would like to see machine testing start next week and so I'd like to
> see such changes committed early this week.
Thanks, I've pushed it.
Cheers,
Wilco
^ permalink raw reply [flat|nested] 4+ messages in thread
end of thread, other threads:[~2023-01-17 16:37 UTC | newest]
Thread overview: 4+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-01-12 15:51 [PATCH] AArch64: Improve strlen_asimd Wilco Dijkstra
2023-01-13 12:25 ` Szabolcs Nagy
2023-01-16 22:16 ` Carlos O'Donell
2023-01-17 16:37 ` Wilco Dijkstra
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).