* [PATCH v2] aarch64: Optimized implementation of memcmp
From: Xuelei Zhang @ 2019-10-22 9:38 UTC
To: libc-alpha, siddhesh, Szabolcs.Nagy, Wilco.Dijkstra, jiangyikun,
yikunkero
The loop body is expanded from a 16-byte comparison to a 64-byte
comparison, and the ldp addressing is changed from post-index mode
to base plus offset mode. As a result, memcmp is around 18% faster
for sizes above 128 bytes.
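For example, inside the unrolled loop each pair of loads changes from
post-index addressing, which updates the base register as a side effect
of every ldp, to base plus offset addressing with a single pointer
update per 64-byte block (illustrative sketch, register names as in
the patch below):

	/* post-index: src1 is advanced by every load.  */
	ldp	data1, data1h, [src1], 16
	/* base plus offset: src1 is left unchanged by the load...  */
	ldp	data1, data1h, [src1, 16]
	/* ...and advanced once per 64-byte block instead.  */
	add	src1, src1, 64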
---
sysdeps/aarch64/memcmp.S | 135 ++++++++++++++++++++++++++++-------------------
1 file changed, 82 insertions(+), 53 deletions(-)
diff --git a/sysdeps/aarch64/memcmp.S b/sysdeps/aarch64/memcmp.S
index f330154c7a0..390336b58f5 100644
--- a/sysdeps/aarch64/memcmp.S
+++ b/sysdeps/aarch64/memcmp.S
@@ -46,75 +46,83 @@ ENTRY_ALIGN (memcmp, 6)
DELOUSE (1)
DELOUSE (2)
- subs limit, limit, 8
- b.lo L(less8)
+ subs limit, limit, 16
+ b.lo L(less16)
- ldr data1, [src1], 8
- ldr data2, [src2], 8
- cmp data1, data2
- b.ne L(return)
-
- subs limit, limit, 8
- b.gt L(more16)
-
- ldr data1, [src1, limit]
- ldr data2, [src2, limit]
- b L(return)
-
-L(more16):
- ldr data1, [src1], 8
- ldr data2, [src2], 8
- cmp data1, data2
- bne L(return)
-
- /* Jump directly to comparing the last 16 bytes for 32 byte (or less)
- strings. */
- subs limit, limit, 16
- b.ls L(last_bytes)
+ ldp data1, data1h, [src1], 16
+ ldp data2, data2h, [src2], 16
+ ccmp data1, data2, 0, ne
+ ccmp data1h, data2h, 0, eq
+ b.ne L(return64)
- /* We overlap loads between 0-32 bytes at either side of SRC1 when we
- try to align, so limit it only to strings larger than 128 bytes. */
- cmp limit, 96
- b.ls L(loop16)
+ subs limit, limit, 16
+ b.ls L(last_bytes)
+ cmp limit, 112
+ b.lo L(loop16)
- /* Align src1 and adjust src2 with bytes not yet done. */
and tmp1, src1, 15
add limit, limit, tmp1
sub src1, src1, tmp1
sub src2, src2, tmp1
+ subs limit, limit, 48
- /* Loop performing 16 bytes per iteration using aligned src1.
- Limit is pre-decremented by 16 and must be larger than zero.
- Exit if <= 16 bytes left to do or if the data is not equal. */
+ /* Compare 128 up bytes using aligned access. */
.p2align 4
+L(loop64):
+ ldp data1, data1h, [src1]
+ ldp data2, data2h, [src2]
+ cmp data1, data2
+ ccmp data1h, data2h, 0, eq
+ b.ne L(return64)
+
+ ldp data1, data1h, [src1, 16]
+ ldp data2, data2h, [src2, 16]
+ cmp data1, data2
+ ccmp data1h, data2h, 0, eq
+ b.ne L(return64)
+
+ ldp data1, data1h, [src1, 32]
+ ldp data2, data2h, [src2, 32]
+ cmp data1, data2
+ ccmp data1h, data2h, 0, eq
+ b.ne L(return64)
+
+ ldp data1, data1h, [src1, 48]
+ ldp data2, data2h, [src2, 48]
+ cmp data1, data2
+ ccmp data1h, data2h, 0, eq
+ b.ne L(return64)
+
+ subs limit, limit, 64
+ add src1, src1, 64
+ add src2, src2, 64
+ b.pl L(loop64)
+ adds limit, limit, 48
+ b.lo L(last_bytes)
+
L(loop16):
ldp data1, data1h, [src1], 16
ldp data2, data2h, [src2], 16
- subs limit, limit, 16
- ccmp data1, data2, 0, hi
+ cmp data1, data2
ccmp data1h, data2h, 0, eq
- b.eq L(loop16)
-
- cmp data1, data2
- bne L(return)
- mov data1, data1h
- mov data2, data2h
- cmp data1, data2
- bne L(return)
+ b.ne L(return64)
+ subs limit, limit, 16
+ b.hi L(loop16)
/* Compare last 1-16 bytes using unaligned access. */
L(last_bytes):
add src1, src1, limit
add src2, src2, limit
ldp data1, data1h, [src1]
ldp data2, data2h, [src2]
- cmp data1, data2
+
+ /* Compare data bytes and set return value to 0, -1 or 1. */
+L(return64):
+ cmp data1, data2
bne L(return)
+L(return_pre):
mov data1, data1h
mov data2, data2h
- cmp data1, data2
-
- /* Compare data bytes and set return value to 0, -1 or 1. */
L(return):
#ifndef __AARCH64EB__
rev data1, data1
@@ -127,25 +135,46 @@ L(ret_eq):
ret
.p2align 4
- /* Compare up to 8 bytes. Limit is [-8..-1]. */
+L(less16):
+ adds limit, limit, 8
+ b.lo L(less8) //lo:<
+ ldr data1, [src1]
+ ldr data2, [src2]
+ /* equal 8 optimized */
+ ccmp data1, data2, 0, ne
+ b.ne L(return)
+
+ ldr data1, [src1, limit]
+ ldr data2, [src2, limit]
+ b L(return)
+
+ .p2align 4
L(less8):
adds limit, limit, 4
b.lo L(less4)
- ldr data1w, [src1], 4
- ldr data2w, [src2], 4
- cmp data1w, data2w
+ ldr data1w, [src1]
+ ldr data2w, [src2]
+ ccmp data1, data2, 0, ne
b.ne L(return)
- sub limit, limit, 4
+ ldr data1w, [src1, limit]
+ ldr data2w, [src2, limit]
+ b L(return)
+
+ .p2align 4
L(less4):
adds limit, limit, 4
- beq L(ret_eq)
+ beq L(ret_0)
+
L(byte_loop):
ldrb data1w, [src1], 1
ldrb data2w, [src2], 1
subs limit, limit, 1
ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */
b.eq L(byte_loop)
- sub result, data1w, data2w
+ sub result, data1w, data2w
+ ret
+L(ret_0):
+ mov result, 0
ret
END (memcmp)
--
2.14.1.windows.1
* Re: [PATCH v2] aarch64: Optimized implementation of memcmp
From: Wilco Dijkstra @ 2019-10-22 18:06 UTC
To: Xuelei Zhang, libc-alpha, siddhesh, Szabolcs Nagy, jiangyikun, yikunkero
Cc: nd
Hi Xuelei,
> The loop body is expanded from a 16-byte comparison to a 64-byte
> comparison, and the ldp addressing is changed from post-index mode
> to base plus offset mode. As a result, memcmp is around 18% faster
> for sizes above 128 bytes.
This looks quite good - I can reproduce significant gains for large sizes
on various microarchitectures. It seems there are some regressions in
the 8-16 byte range, presumably due to handling these sizes differently.
A few comments inline:
+ /* Compare data bytes and set return value to 0, -1 or 1. */
+L(return64):
+ cmp data1, data2
bne L(return)
+L(return_pre):
mov data1, data1h
mov data2, data2h
- cmp data1, data2
L(return):
The label return_pre is unused. So why not use 2xCSEL rather than a branch across
the moves? That's going to be faster since the branch will be hard to predict.
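Something along these lines (untested sketch, same register names):

	/* Select the differing 8-byte pair without a branch: keep
	   data1/data2 if they differ, otherwise fall back to the
	   high words.  */
	cmp	data1, data2
	csel	data1, data1, data1h, ne
	csel	data2, data2, data2h, ne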
L(less8):
adds limit, limit, 4
b.lo L(less4)
- ldr data1w, [src1], 4
- ldr data2w, [src2], 4
- cmp data1w, data2w
+ ldr data1w, [src1]
+ ldr data2w, [src2]
+ ccmp data1, data2, 0, ne
Using data1w and data2w would be better here.
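I.e. an untested sketch:

	/* Use the w-register form to match the 32-bit loads above.  */
	ccmp	data1w, data2w, 0, ne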
b.eq L(byte_loop)
- sub result, data1w, data2w
+ sub result, data1w, data2w
The formatting has gone wrong...
+ ret
+L(ret_0):
+ mov result, 0
ret
* Re: [PATCH v2] aarch64: Optimized implementation of memcmp
From: Zhangxuelei (Derek) @ 2019-10-23 14:31 UTC
To: Wilco Dijkstra, libc-alpha, siddhesh, Szabolcs Nagy, jiangyikun,
yikunkero
Cc: nd
Hi Wilco,
> It seems there are some regressions in the 8-16 byte range,
> presumably due to handling these sizes differently.
Yep, we now check against 16 bytes rather than 8 at the start of the
function, so sizes in the 8-16 byte range take one extra compare and
branch. The impact on those small sizes is minor, though, and the
gains for medium and large sizes are much bigger.
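For a size-8 call the entry path is now roughly:

	subs	limit, limit, 16	/* limit = -8 */
	b.lo	L(less16)		/* taken: the extra branch */
L(less16):
	adds	limit, limit, 8		/* limit = 0, Z set */
	b.lo	L(less8)		/* not taken */
	ldr	data1, [src1]
	ldr	data2, [src2]
	ccmp	data1, data2, 0, ne	/* ne fails, so NZCV := 0 */
	b.ne	L(return)		/* taken: compare in L(return) */

whereas v1 compared the first 8 bytes directly at function entry.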
> So why not use 2xCSEL rather than a branch across the moves?
> That's going to be faster since the branch will be hard to predict.
Great! This removes one hard-to-predict branch, and I have modified
the code as suggested. The other problems, the unused label and the
broken formatting, are also fixed.
Here is the v3 patch: https://sourceware.org/ml/libc-alpha/2019-10/msg00684.html
Xuelei