* [2.26 COMMITTED][AArch64] Backport memcmp improvements
From: Wilco Dijkstra @ 2019-01-01 0:00 UTC
To: libc-stable; +Cc: nd
commit ec4512194f035856b8a231476c9139d72f47c58f
Author: Siddhesh Poyarekar <siddhesh@sourceware.org>
Date: Tue Mar 6 19:22:39 2018 +0530
aarch64: Optimized memcmp for medium to large sizes
This improved memcmp provides a fast path for compares of up to 16 bytes
and then compares 16 bytes at a time, thus optimizing loads from both
sources. The glibc memcmp microbenchmark retains performance (within an
error of ~1ns) for smaller compare sizes and reduces execution time by up
to 31% for compares up to 4K on the APM Mustang. On Qualcomm Falkor this
improves to almost 48%, i.e. an almost 2x improvement for sizes of 2K
and above.
* sysdeps/aarch64/memcmp.S: Widen comparison to 16 bytes at a
time.
(cherry picked from commit 30a81dae5b752f8aa5f96e7f7c341ec57cba3585)
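For readers less familiar with AArch64 assembly, the widened main loop can
be sketched in C roughly as follows (hypothetical helper, not part of the
patch; the real code uses ldp to load both halves of each 16-byte block in
one instruction and ccmp to fuse the comparisons):

    #include <stdint.h>
    #include <string.h>

    /* Sketch only: report whether two buffers differ, comparing 16
       bytes per iteration via two 64-bit words from each source.  The
       tail and the ordering of a mismatch are handled separately.  */
    static int
    blocks_differ (const unsigned char *s1, const unsigned char *s2,
                   size_t n)
    {
      while (n >= 16)
        {
          uint64_t a0, a1, b0, b1;
          memcpy (&a0, s1, 8);        /* ldp data1, data1h, [src1], 16 */
          memcpy (&a1, s1 + 8, 8);
          memcpy (&b0, s2, 8);        /* ldp data2, data2h, [src2], 16 */
          memcpy (&b1, s2 + 8, 8);
          if ((a0 ^ b0) | (a1 ^ b1))  /* ccmp folds both compares */
            return 1;
          s1 += 16; s2 += 16; n -= 16;
        }
      return 0;
    }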
commit 600e4e866c4de0cc0b16aec482c65da732960367
Author: Siddhesh Poyarekar <siddhesh@sourceware.org>
Date: Fri Feb 2 10:15:20 2018 +0530
aarch64: Use the L() macro for labels in memcmp
The L() macro makes the assembly a bit more readable.
* sysdeps/aarch64/memcmp.S: Use L() macro for labels.
(cherry picked from commit 84c94d2fd90d84ae7e67657ee8e22c2d1b796f63)
commit 1896de3d926d299a1ed5c9f0a4f03f5a81969200
Author: Wilco Dijkstra <wdijkstr@arm.com>
Date: Thu Aug 10 17:00:38 2017 +0100
[AArch64] Optimized memcmp.
This is an optimized memcmp for AArch64, a complete rewrite using a
different algorithm. The previous version split into separate cases for
inputs that were both aligned, inputs that were mutually aligned, and
unaligned inputs handled by a byte loop. The new version combines all
these cases, while small inputs of less than 8 bytes are handled
separately.
This allows the main code to be sped up using unaligned loads since
there are now at least 8 bytes to be compared. After the first 8 bytes,
align the first input. This ensures each iteration does at most one
unaligned access and mutually aligned inputs behave as aligned.
After the main loop, process the last 8 bytes using unaligned accesses.
This improves performance of (mutually) aligned cases by 25% and
unaligned by >500% (yes >6 times faster) on large inputs.
* sysdeps/aarch64/memcmp.S (memcmp):
Rewrite of optimized memcmp.
(cherry picked from commit 922369032c604b4dcfd535e1bcddd4687e7126a5)
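A rough C model of the alignment step described above (hypothetical names,
not part of the patch): after the first unaligned chunk has been compared,
src1 is rounded down to an aligned boundary and src2 is moved back by the
same amount, so a few bytes are simply compared twice.  From then on the
src1 load is always aligned, only the src2 load may be unaligned, and
mutually aligned inputs end up with both loads aligned:

    #include <stddef.h>
    #include <stdint.h>

    /* Sketch only: align *src1 down to 'align' bytes and adjust *src2
       and the remaining length by the same skew.  */
    static void
    align_first_input (const unsigned char **src1,
                       const unsigned char **src2,
                       size_t *remaining, size_t align)
    {
      size_t skew = (uintptr_t) *src1 & (align - 1); /* and tmp1, src1, 15 */
      *src1 -= skew;                       /* sub src1, src1, tmp1 */
      *src2 -= skew;                       /* sub src2, src2, tmp1 */
      *remaining += skew;                  /* add limit, limit, tmp1 */
    }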
diff --git a/ChangeLog b/ChangeLog
index 4cd747a..204d047 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,17 @@
+2019-09-06 Siddhesh Poyarekar <siddhesh@sourceware.org>
+
+ * sysdeps/aarch64/memcmp.S: Widen comparison to 16 bytes at a
+ time.
+
+2019-09-06 Siddhesh Poyarekar <siddhesh@sourceware.org>
+
+ * sysdeps/aarch64/memcmp.S: Use L() macro for labels.
+
+2019-09-06 Wilco Dijkstra <wdijkstr@arm.com>
+
+ * sysdeps/aarch64/memcmp.S (memcmp):
+ Rewrite of optimized memcmp.
+
2019-07-12 Adhemerval Zanella <adhemerval.zanella@linaro.org>
[BZ #24699]
diff --git a/sysdeps/aarch64/memcmp.S b/sysdeps/aarch64/memcmp.S
index 4cfcb89..d074c98 100644
--- a/sysdeps/aarch64/memcmp.S
+++ b/sysdeps/aarch64/memcmp.S
@@ -22,132 +22,132 @@
/* Assumptions:
*
- * ARMv8-a, AArch64
+ * ARMv8-a, AArch64, unaligned accesses.
*/
/* Parameters and result. */
#define src1 x0
#define src2 x1
#define limit x2
-#define result x0
+#define result w0
/* Internal variables. */
#define data1 x3
#define data1w w3
-#define data2 x4
-#define data2w w4
-#define has_nul x5
-#define diff x6
-#define endloop x7
-#define tmp1 x8
-#define tmp2 x9
-#define tmp3 x10
-#define pos x11
-#define limit_wd x12
-#define mask x13
+#define data1h x4
+#define data2 x5
+#define data2w w5
+#define data2h x6
+#define tmp1 x7
+#define tmp2 x8
ENTRY_ALIGN (memcmp, 6)
DELOUSE (0)
DELOUSE (1)
DELOUSE (2)
- cbz limit, L(ret0)
- eor tmp1, src1, src2
- tst tmp1, #7
- b.ne L(misaligned8)
- ands tmp1, src1, #7
- b.ne L(mutual_align)
- add limit_wd, limit, #7
- lsr limit_wd, limit_wd, #3
- /* Start of performance-critical section -- one 64B cache line. */
-L(loop_aligned):
- ldr data1, [src1], #8
- ldr data2, [src2], #8
-L(start_realigned):
- subs limit_wd, limit_wd, #1
- eor diff, data1, data2 /* Non-zero if differences found. */
- csinv endloop, diff, xzr, ne /* Last Dword or differences. */
- cbz endloop, L(loop_aligned)
- /* End of performance-critical section -- one 64B cache line. */
-
- /* Not reached the limit, must have found a diff. */
- cbnz limit_wd, L(not_limit)
-
- /* Limit % 8 == 0 => all bytes significant. */
- ands limit, limit, #7
- b.eq L(not_limit)
-
- lsl limit, limit, #3 /* Bits -> bytes. */
- mov mask, #~0
-#ifdef __AARCH64EB__
- lsr mask, mask, limit
-#else
- lsl mask, mask, limit
-#endif
- bic data1, data1, mask
- bic data2, data2, mask
-
- orr diff, diff, mask
-L(not_limit):
-#ifndef __AARCH64EB__
- rev diff, diff
+ subs limit, limit, 8
+ b.lo L(less8)
+
+ ldr data1, [src1], 8
+ ldr data2, [src2], 8
+ cmp data1, data2
+ b.ne L(return)
+
+ subs limit, limit, 8
+ b.gt L(more16)
+
+ ldr data1, [src1, limit]
+ ldr data2, [src2, limit]
+ b L(return)
+
+L(more16):
+ ldr data1, [src1], 8
+ ldr data2, [src2], 8
+ cmp data1, data2
+ bne L(return)
+
+ /* Jump directly to comparing the last 16 bytes for 32 byte (or less)
+ strings. */
+ subs limit, limit, 16
+ b.ls L(last_bytes)
+
+ /* We overlap loads between 0-32 bytes at either side of SRC1 when we
+ try to align, so limit it only to strings larger than 128 bytes. */
+ cmp limit, 96
+ b.ls L(loop8)
+
+ /* Align src1 and adjust src2 with bytes not yet done. */
+ and tmp1, src1, 15
+ add limit, limit, tmp1
+ sub src1, src1, tmp1
+ sub src2, src2, tmp1
+
+ /* Loop performing 16 bytes per iteration using aligned src1.
+ Limit is pre-decremented by 16 and must be larger than zero.
+ Exit if <= 16 bytes left to do or if the data is not equal. */
+ .p2align 4
+L(loop16):
+ ldp data1, data1h, [src1], 16
+ ldp data2, data2h, [src2], 16
+ subs limit, limit, 16
+ ccmp data1, data2, 0, hi
+ ccmp data1h, data2h, 0, eq
+ b.eq L(loop16)
+
+ cmp data1, data2
+ bne L(return)
+ mov data1, data1h
+ mov data2, data2h
+ cmp data1, data2
+ bne L(return)
+
+ /* Compare last 1-16 bytes using unaligned access. */
+L(last_bytes):
+ add src1, src1, limit
+ add src2, src2, limit
+ ldp data1, data1h, [src1]
+ ldp data2, data2h, [src2]
+ cmp data1, data2
+ bne L(return)
+ mov data1, data1h
+ mov data2, data2h
+ cmp data1, data2
+
+ /* Compare data bytes and set return value to 0, -1 or 1. */
+L(return):
+#ifndef __AARCH64EB__
rev data1, data1
rev data2, data2
#endif
- /* The MS-non-zero bit of DIFF marks either the first bit
- that is different, or the end of the significant data.
- Shifting left now will bring the critical information into the
- top bits. */
- clz pos, diff
- lsl data1, data1, pos
- lsl data2, data2, pos
- /* But we need to zero-extend (char is unsigned) the value and then
- perform a signed 32-bit subtraction. */
- lsr data1, data1, #56
- sub result, data1, data2, lsr #56
- RET
-
-L(mutual_align):
- /* Sources are mutually aligned, but are not currently at an
- alignment boundary. Round down the addresses and then mask off
- the bytes that precede the start point. */
- bic src1, src1, #7
- bic src2, src2, #7
- add limit, limit, tmp1 /* Adjust the limit for the extra. */
- lsl tmp1, tmp1, #3 /* Bytes beyond alignment -> bits. */
- ldr data1, [src1], #8
- neg tmp1, tmp1 /* Bits to alignment -64. */
- ldr data2, [src2], #8
- mov tmp2, #~0
-#ifdef __AARCH64EB__
- /* Big-endian. Early bytes are at MSB. */
- lsl tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
-#else
- /* Little-endian. Early bytes are at LSB. */
- lsr tmp2, tmp2, tmp1 /* Shift (tmp1 & 63). */
-#endif
- add limit_wd, limit, #7
- orr data1, data1, tmp2
- orr data2, data2, tmp2
- lsr limit_wd, limit_wd, #3
- b L(start_realigned)
-
-L(ret0):
- mov result, #0
- RET
-
- .p2align 6
-L(misaligned8):
- sub limit, limit, #1
-1:
- /* Perhaps we can do better than this. */
- ldrb data1w, [src1], #1
- ldrb data2w, [src2], #1
- subs limit, limit, #1
- ccmp data1w, data2w, #0, cs /* NZCV = 0b0000. */
- b.eq 1b
- sub result, data1, data2
- RET
+ cmp data1, data2
+L(ret_eq):
+ cset result, ne
+ cneg result, result, lo
+ ret
+
+ .p2align 4
+ /* Compare up to 8 bytes. Limit is [-8..-1]. */
+L(less8):
+ adds limit, limit, 4
+ b.lo L(less4)
+ ldr data1w, [src1], 4
+ ldr data2w, [src2], 4
+ cmp data1w, data2w
+ b.ne L(return)
+ sub limit, limit, 4
+L(less4):
+ adds limit, limit, 4
+ beq L(ret_eq)
+L(byte_loop):
+ ldrb data1w, [src1], 1
+ ldrb data2w, [src2], 1
+ subs limit, limit, 1
+ ccmp data1w, data2w, 0, ne /* NZCV = 0b0000. */
+ b.eq L(byte_loop)
+ sub result, data1w, data2w
+ ret
+
END (memcmp)
#undef bcmp
weak_alias (memcmp, bcmp)
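As a side note on the L(return) logic in the patch above: on little-endian,
byte-reversing both 64-bit words puts the first (lowest-addressed) byte in
the most significant position, so a single unsigned comparison of the
reversed words orders the inputs by their first differing byte.  A
little-endian C sketch of the same idea (hypothetical helper, not part of
the patch):

    #include <stdint.h>
    #include <string.h>

    /* Sketch only: order two 8-byte blocks the way memcmp would order
       them, assuming a little-endian host.  */
    static int
    order_words (const unsigned char *p1, const unsigned char *p2)
    {
      uint64_t d1, d2;
      memcpy (&d1, p1, 8);
      memcpy (&d2, p2, 8);
      d1 = __builtin_bswap64 (d1);     /* rev data1, data1 */
      d2 = __builtin_bswap64 (d2);     /* rev data2, data2 */
      if (d1 == d2)
        return 0;                      /* cset/cneg yield -1, 0 or 1 */
      return d1 < d2 ? -1 : 1;
    }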