* [2.27 COMMITTED][AArch64] Backport memcpy_falkor improvements
@ 2019-01-01 0:00 Wilco Dijkstra
From: Wilco Dijkstra @ 2019-01-01 0:00 UTC (permalink / raw)
To: libc-stable; +Cc: nd
commit ad64510e5c74729108a02a6c22f03aa8ee07a8d3
Author: Siddhesh Poyarekar <siddhesh@sourceware.org>
Date: Fri Jun 29 22:45:59 2018 +0530
aarch64,falkor: Use vector registers for memcpy
Vector registers perform better than scalar register pairs for copying
data, so prefer them. This results in a time reduction of over
50% (i.e. a 2x speed improvement) for some smaller sizes in memcpy-walk.
Larger sizes show improvements of around 1% to 2%. memcpy-random shows
a very small improvement, in the range of 1-2%.
* sysdeps/aarch64/multiarch/memcpy_falkor.S (__memcpy_falkor):
Use vector registers.
(cherry picked from commit 0aec4c1d1801e8016ebe89281d16597e0557b8be)
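At the C level the change amounts to moving each 16-byte block through one
128-bit register instead of two 64-bit halves. A minimal sketch of the two
schemes, assuming GCC/Clang where __uint128_t stands in for a 128-bit q
register (hypothetical helper names, not the actual implementation):

  #include <stdint.h>
  #include <string.h>

  /* Old scheme: 16 bytes via a scalar register pair (A_l/A_h, ldp/stp).  */
  static void copy16_pair (char *dst, const char *src)
  {
    uint64_t lo, hi;
    memcpy (&lo, src, 8);
    memcpy (&hi, src + 8, 8);
    memcpy (dst, &lo, 8);
    memcpy (dst + 8, &hi, 8);
  }

  /* New scheme: the same 16 bytes via one vector register (A_q, ldr/str).  */
  static void copy16_vec (char *dst, const char *src)
  {
    __uint128_t a;
    memcpy (&a, src, 16);
    memcpy (dst, &a, 16);
  }

The diff below makes exactly this substitution throughout: ldp/stp of
A_l/A_h become ldr/str of A_q through G_q.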
commit d3c05bfffa65b39ba64d62e24ed4a6c118fab3ef
Author: Siddhesh Poyarekar <siddhesh@sourceware.org>
Date: Fri May 11 00:11:52 2018 +0530
aarch64,falkor: Ignore prefetcher tagging for smaller copies
For smaller and medium sized copies, the effects of hardware
prefetching are not as dominant as instruction level parallelism.
Hence it makes more sense to load data into multiple registers than to
try to route them to the same prefetch unit. This is also the case
at the loop exit, where we are unable to latch on to the same prefetch
unit anyway, so it makes more sense to have the data loaded in parallel.
The performance results are a bit mixed with memcpy-random, with
numbers jumping between -1% and +3%, i.e. the numbers don't seem
repeatable. memcpy-walk sees a 70% time reduction (i.e. > 2x speedup)
for 128 bytes, and that improvement tapers off as the impact of the
tail copy shrinks relative to the loop.
* sysdeps/aarch64/multiarch/memcpy_falkor.S (__memcpy_falkor):
Use multiple registers to copy data in loop tail.
(cherry picked from commit db725a458e1cb0e17204daa543744faf08bb2e06)
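The idea behind this patch, as a hedged C sketch (hypothetical function
name, __uint128_t again standing in for a q register): the main copy loop
deliberately reuses a single temporary so that all loads feed one prefetch
stream, while the loop tail loads into several independent temporaries so
the loads carry no false dependences and can issue in parallel.

  #include <stdint.h>
  #include <string.h>

  /* Sketch of the loop tail (L(last64) in the diff below): four
     independent temporaries, so all four loads can be in flight at
     once.  Assumes count >= 64 and non-overlapping buffers.  */
  static void tail64 (char *dstend, const char *srcend)
  {
    __uint128_t e, d, c, b;    /* E_q, D_q, C_q, B_q in the assembly.  */
    memcpy (&e, srcend - 64, 16);
    memcpy (&d, srcend - 48, 16);
    memcpy (&c, srcend - 32, 16);
    memcpy (&b, srcend - 16, 16);
    memcpy (dstend - 64, &e, 16);
    memcpy (dstend - 48, &d, 16);
    memcpy (dstend - 32, &c, 16);
    memcpy (dstend - 16, &b, 16);
  }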
diff --git a/ChangeLog b/ChangeLog
index e9557b8..0482b0c 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,15 @@
2019-09-06 Siddhesh Poyarekar <siddhesh@sourceware.org>
+ * sysdeps/aarch64/multiarch/memcpy_falkor.S (__memcpy_falkor):
+ Use vector registers.
+
+2019-09-06 Siddhesh Poyarekar <siddhesh@sourceware.org>
+
+ * sysdeps/aarch64/multiarch/memcpy_falkor.S (__memcpy_falkor):
+ Use multiple registers to copy data in loop tail.
+
+2019-09-06 Siddhesh Poyarekar <siddhesh@sourceware.org>
+
* sysdeps/aarch64/strncmp.S (strncmp): Use lsr instead of
mov + lsr.
diff --git a/sysdeps/aarch64/multiarch/memcpy_falkor.S b/sysdeps/aarch64/multiarch/memcpy_falkor.S
index dea4f22..9cde8dc 100644
--- a/sysdeps/aarch64/multiarch/memcpy_falkor.S
+++ b/sysdeps/aarch64/multiarch/memcpy_falkor.S
@@ -29,11 +29,19 @@
#define dst x3
#define srcend x4
#define dstend x5
-#define A_l x6
-#define A_lw w6
-#define A_h x7
-#define A_hw w7
#define tmp1 x14
+#define A_x x6
+#define B_x x7
+#define A_w w6
+#define B_w w7
+
+#define A_q q0
+#define B_q q1
+#define C_q q2
+#define D_q q3
+#define E_q q4
+#define F_q q5
+#define G_q q6
/* Copies are split into 3 main cases:
@@ -53,9 +61,9 @@
bumping up the small copies up to 32 bytes allows us to do that without
cost and also allows us to reduce the size of the prep code before loop64.
- All copies are done only via two registers r6 and r7. This is to ensure
- that all loads hit a single hardware prefetcher which can get correctly
- trained to prefetch a single stream.
+ The copy loop uses only one register q0. This is to ensure that all loads
+ hit a single hardware prefetcher which can get correctly trained to prefetch
+ a single stream.
The non-temporal stores help optimize cache utilization. */
@@ -66,29 +74,29 @@ ENTRY_ALIGN (__memcpy_falkor, 6)
add srcend, src, count
add dstend, dstin, count
b.ls L(copy32)
- ldp A_l, A_h, [src]
+ ldr A_q, [src]
cmp count, 128
- stp A_l, A_h, [dstin]
+ str A_q, [dstin]
b.hi L(copy_long)
/* Medium copies: 33..128 bytes. */
sub tmp1, count, 1
- ldp A_l, A_h, [src, 16]
- stp A_l, A_h, [dstin, 16]
+ ldr A_q, [src, 16]
+ ldr B_q, [srcend, -32]
+ ldr C_q, [srcend, -16]
tbz tmp1, 6, 1f
- ldp A_l, A_h, [src, 32]
- stp A_l, A_h, [dstin, 32]
- ldp A_l, A_h, [src, 48]
- stp A_l, A_h, [dstin, 48]
- ldp A_l, A_h, [srcend, -64]
- stp A_l, A_h, [dstend, -64]
- ldp A_l, A_h, [srcend, -48]
- stp A_l, A_h, [dstend, -48]
+ ldr D_q, [src, 32]
+ ldr E_q, [src, 48]
+ str D_q, [dstin, 32]
+ str E_q, [dstin, 48]
+ ldr F_q, [srcend, -64]
+ ldr G_q, [srcend, -48]
+ str F_q, [dstend, -64]
+ str G_q, [dstend, -48]
1:
- ldp A_l, A_h, [srcend, -32]
- stp A_l, A_h, [dstend, -32]
- ldp A_l, A_h, [srcend, -16]
- stp A_l, A_h, [dstend, -16]
+ str A_q, [dstin, 16]
+ str B_q, [dstend, -32]
+ str C_q, [dstend, -16]
ret
.p2align 4
@@ -97,44 +105,44 @@ L(copy32):
/* 16-32 */
cmp count, 16
b.lo 1f
- ldp A_l, A_h, [src]
- stp A_l, A_h, [dstin]
- ldp A_l, A_h, [srcend, -16]
- stp A_l, A_h, [dstend, -16]
+ ldr A_q, [src]
+ ldr B_q, [srcend, -16]
+ str A_q, [dstin]
+ str B_q, [dstend, -16]
ret
.p2align 4
1:
/* 8-15 */
tbz count, 3, 1f
- ldr A_l, [src]
- str A_l, [dstin]
- ldr A_l, [srcend, -8]
- str A_l, [dstend, -8]
+ ldr A_x, [src]
+ ldr B_x, [srcend, -8]
+ str A_x, [dstin]
+ str B_x, [dstend, -8]
ret
.p2align 4
1:
/* 4-7 */
tbz count, 2, 1f
- ldr A_lw, [src]
- str A_lw, [dstin]
- ldr A_lw, [srcend, -4]
- str A_lw, [dstend, -4]
+ ldr A_w, [src]
+ ldr B_w, [srcend, -4]
+ str A_w, [dstin]
+ str B_w, [dstend, -4]
ret
.p2align 4
1:
/* 2-3 */
tbz count, 1, 1f
- ldrh A_lw, [src]
- strh A_lw, [dstin]
- ldrh A_lw, [srcend, -2]
- strh A_lw, [dstend, -2]
+ ldrh A_w, [src]
+ ldrh B_w, [srcend, -2]
+ strh A_w, [dstin]
+ strh B_w, [dstend, -2]
ret
.p2align 4
1:
/* 0-1 */
tbz count, 0, 1f
- ldrb A_lw, [src]
- strb A_lw, [dstin]
+ ldrb A_w, [src]
+ strb A_w, [dstin]
1:
ret
@@ -153,30 +161,29 @@ L(copy_long):
add count, count, tmp1
L(loop64):
- ldp A_l, A_h, [src, 16]!
- stnp A_l, A_h, [dst, 16]
- ldp A_l, A_h, [src, 16]!
+ ldr A_q, [src, 16]!
+ str A_q, [dst, 16]
+ ldr A_q, [src, 16]!
subs count, count, 64
- stnp A_l, A_h, [dst, 32]
- ldp A_l, A_h, [src, 16]!
- stnp A_l, A_h, [dst, 48]
- ldp A_l, A_h, [src, 16]!
- stnp A_l, A_h, [dst, 64]
- add dst, dst, 64
+ str A_q, [dst, 32]
+ ldr A_q, [src, 16]!
+ str A_q, [dst, 48]
+ ldr A_q, [src, 16]!
+ str A_q, [dst, 64]!
b.hi L(loop64)
/* Write the last full set of 64 bytes. The remainder is at most 64
bytes, so it is safe to always copy 64 bytes from the end even if
there is just 1 byte left. */
L(last64):
- ldp A_l, A_h, [srcend, -64]
- stnp A_l, A_h, [dstend, -64]
- ldp A_l, A_h, [srcend, -48]
- stnp A_l, A_h, [dstend, -48]
- ldp A_l, A_h, [srcend, -32]
- stnp A_l, A_h, [dstend, -32]
- ldp A_l, A_h, [srcend, -16]
- stnp A_l, A_h, [dstend, -16]
+ ldr E_q, [srcend, -64]
+ str E_q, [dstend, -64]
+ ldr D_q, [srcend, -48]
+ str D_q, [dstend, -48]
+ ldr C_q, [srcend, -32]
+ str C_q, [dstend, -32]
+ ldr B_q, [srcend, -16]
+ str B_q, [dstend, -16]
ret
END (__memcpy_falkor)
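For reference, the overall structure the patched routine implements can be
modelled in portable C. This is a hedged sketch of the control flow only
(hypothetical name; the real code additionally aligns src, uses non-temporal
stores for the large case, and is tuned for the Falkor prefetcher). Copies
issued from both ends may overlap in the middle, which is harmless because
every byte is stored with the correct data:

  #include <stddef.h>
  #include <string.h>

  void *memcpy_model (void *dstin, const void *srcin, size_t count)
  {
    char *dst = dstin;
    const char *src = srcin;
    size_t i;

    if (count <= 32)              /* Small: a byte loop stands in for
                                     the tbz ladder.  */
      {
        for (i = 0; i < count; i++)
          dst[i] = src[i];
      }
    else if (count <= 128)        /* Medium: 32-byte blocks from both ends. */
      {
        memcpy (dst, src, 32);
        memcpy (dst + count - 32, src + count - 32, 32);
        if (count > 64)
          {
            memcpy (dst + 32, src + 32, 32);
            memcpy (dst + count - 64, src + count - 64, 32);
          }
      }
    else                          /* Large: 64-byte loop plus 64-byte tail. */
      {
        for (i = 0; count - i > 64; i += 64)
          memcpy (dst + i, src + i, 64);
        /* Always copy the final 64 bytes from the end: safe even with
           one byte left, since the overlap only re-stores identical
           bytes (see L(last64) above).  */
        memcpy (dst + count - 64, src + count - 64, 64);
      }
    return dstin;
  }

For example, memcpy_model (out, in, 100) takes the medium path and covers
bytes 36..63 twice, once from each end.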
* [2.27 COMMITTED][AArch64] Backport memcpy_falkor improvements
@ 2019-01-01 0:00 Wilco Dijkstra
From: Wilco Dijkstra @ 2019-01-01 0:00 UTC (permalink / raw)
To: libc-stable; +Cc: nd
commit e6b7252040755cc965e71622084b9b5ee05345ff
Author: Siddhesh Poyarekar <siddhesh@sourceware.org>
Date: Fri Jun 29 22:45:59 2018 +0530
aarch64,falkor: Use vector registers for memcpy
Vector registers perform better than scalar register pairs for copying
data, so prefer them. This results in a time reduction of over
50% (i.e. a 2x speed improvement) for some smaller sizes in memcpy-walk.
Larger sizes show improvements of around 1% to 2%. memcpy-random shows
a very small improvement, in the range of 1-2%.
* sysdeps/aarch64/multiarch/memcpy_falkor.S (__memcpy_falkor):
Use vector registers.
(cherry picked from commit 0aec4c1d1801e8016ebe89281d16597e0557b8be)
commit c74b884f705aa54998c4b94ac8b098b3ac40e465
Author: Siddhesh Poyarekar <siddhesh@sourceware.org>
Date: Fri May 11 00:11:52 2018 +0530
aarch64,falkor: Ignore prefetcher tagging for smaller copies
For smaller and medium sized copies, the effects of hardware
prefetching are not as dominant as instruction level parallelism.
Hence it makes more sense to load data into multiple registers than to
try to route them to the same prefetch unit. This is also the case
at the loop exit, where we are unable to latch on to the same prefetch
unit anyway, so it makes more sense to have the data loaded in parallel.
The performance results are a bit mixed with memcpy-random, with
numbers jumping between -1% and +3%, i.e. the numbers don't seem
repeatable. memcpy-walk sees a 70% time reduction (i.e. > 2x speedup)
for 128 bytes, and that improvement tapers off as the impact of the
tail copy shrinks relative to the loop.
* sysdeps/aarch64/multiarch/memcpy_falkor.S (__memcpy_falkor):
Use multiple registers to copy data in loop tail.
(cherry picked from commit db725a458e1cb0e17204daa543744faf08bb2e06)
diff --git a/ChangeLog b/ChangeLog
index 99b6180..dd2106c 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,15 @@
2019-09-06 Siddhesh Poyarekar <siddhesh@sourceware.org>
+ * sysdeps/aarch64/multiarch/memcpy_falkor.S (__memcpy_falkor):
+ Use vector registers.
+
+2019-09-06 Siddhesh Poyarekar <siddhesh@sourceware.org>
+
+ * sysdeps/aarch64/multiarch/memcpy_falkor.S (__memcpy_falkor):
+ Use multiple registers to copy data in loop tail.
+
+2019-09-06 Siddhesh Poyarekar <siddhesh@sourceware.org>
+
* sysdeps/aarch64/strncmp.S (strncmp): Use lsr instead of
mov + lsr.
diff --git a/sysdeps/aarch64/multiarch/memcpy_falkor.S b/sysdeps/aarch64/multiarch/memcpy_falkor.S
index 8dd8c1e..cdc2de4 100644
--- a/sysdeps/aarch64/multiarch/memcpy_falkor.S
+++ b/sysdeps/aarch64/multiarch/memcpy_falkor.S
@@ -29,11 +29,19 @@
#define dst x3
#define srcend x4
#define dstend x5
-#define A_l x6
-#define A_lw w6
-#define A_h x7
-#define A_hw w7
#define tmp1 x14
+#define A_x x6
+#define B_x x7
+#define A_w w6
+#define B_w w7
+
+#define A_q q0
+#define B_q q1
+#define C_q q2
+#define D_q q3
+#define E_q q4
+#define F_q q5
+#define G_q q6
/* Copies are split into 3 main cases:
@@ -53,9 +61,9 @@
bumping up the small copies up to 32 bytes allows us to do that without
cost and also allows us to reduce the size of the prep code before loop64.
- All copies are done only via two registers r6 and r7. This is to ensure
- that all loads hit a single hardware prefetcher which can get correctly
- trained to prefetch a single stream.
+ The copy loop uses only one register q0. This is to ensure that all loads
+ hit a single hardware prefetcher which can get correctly trained to prefetch
+ a single stream.
The non-temporal stores help optimize cache utilization. */
@@ -66,29 +74,29 @@ ENTRY_ALIGN (__memcpy_falkor, 6)
add srcend, src, count
add dstend, dstin, count
b.ls L(copy32)
- ldp A_l, A_h, [src]
+ ldr A_q, [src]
cmp count, 128
- stp A_l, A_h, [dstin]
+ str A_q, [dstin]
b.hi L(copy_long)
/* Medium copies: 33..128 bytes. */
sub tmp1, count, 1
- ldp A_l, A_h, [src, 16]
- stp A_l, A_h, [dstin, 16]
+ ldr A_q, [src, 16]
+ ldr B_q, [srcend, -32]
+ ldr C_q, [srcend, -16]
tbz tmp1, 6, 1f
- ldp A_l, A_h, [src, 32]
- stp A_l, A_h, [dstin, 32]
- ldp A_l, A_h, [src, 48]
- stp A_l, A_h, [dstin, 48]
- ldp A_l, A_h, [srcend, -64]
- stp A_l, A_h, [dstend, -64]
- ldp A_l, A_h, [srcend, -48]
- stp A_l, A_h, [dstend, -48]
+ ldr D_q, [src, 32]
+ ldr E_q, [src, 48]
+ str D_q, [dstin, 32]
+ str E_q, [dstin, 48]
+ ldr F_q, [srcend, -64]
+ ldr G_q, [srcend, -48]
+ str F_q, [dstend, -64]
+ str G_q, [dstend, -48]
1:
- ldp A_l, A_h, [srcend, -32]
- stp A_l, A_h, [dstend, -32]
- ldp A_l, A_h, [srcend, -16]
- stp A_l, A_h, [dstend, -16]
+ str A_q, [dstin, 16]
+ str B_q, [dstend, -32]
+ str C_q, [dstend, -16]
ret
.p2align 4
@@ -97,44 +105,44 @@ L(copy32):
/* 16-32 */
cmp count, 16
b.lo 1f
- ldp A_l, A_h, [src]
- stp A_l, A_h, [dstin]
- ldp A_l, A_h, [srcend, -16]
- stp A_l, A_h, [dstend, -16]
+ ldr A_q, [src]
+ ldr B_q, [srcend, -16]
+ str A_q, [dstin]
+ str B_q, [dstend, -16]
ret
.p2align 4
1:
/* 8-15 */
tbz count, 3, 1f
- ldr A_l, [src]
- str A_l, [dstin]
- ldr A_l, [srcend, -8]
- str A_l, [dstend, -8]
+ ldr A_x, [src]
+ ldr B_x, [srcend, -8]
+ str A_x, [dstin]
+ str B_x, [dstend, -8]
ret
.p2align 4
1:
/* 4-7 */
tbz count, 2, 1f
- ldr A_lw, [src]
- str A_lw, [dstin]
- ldr A_lw, [srcend, -4]
- str A_lw, [dstend, -4]
+ ldr A_w, [src]
+ ldr B_w, [srcend, -4]
+ str A_w, [dstin]
+ str B_w, [dstend, -4]
ret
.p2align 4
1:
/* 2-3 */
tbz count, 1, 1f
- ldrh A_lw, [src]
- strh A_lw, [dstin]
- ldrh A_lw, [srcend, -2]
- strh A_lw, [dstend, -2]
+ ldrh A_w, [src]
+ ldrh B_w, [srcend, -2]
+ strh A_w, [dstin]
+ strh B_w, [dstend, -2]
ret
.p2align 4
1:
/* 0-1 */
tbz count, 0, 1f
- ldrb A_lw, [src]
- strb A_lw, [dstin]
+ ldrb A_w, [src]
+ strb A_w, [dstin]
1:
ret
@@ -153,30 +161,29 @@ L(copy_long):
add count, count, tmp1
L(loop64):
- ldp A_l, A_h, [src, 16]!
- stnp A_l, A_h, [dst, 16]
- ldp A_l, A_h, [src, 16]!
+ ldr A_q, [src, 16]!
+ str A_q, [dst, 16]
+ ldr A_q, [src, 16]!
subs count, count, 64
- stnp A_l, A_h, [dst, 32]
- ldp A_l, A_h, [src, 16]!
- stnp A_l, A_h, [dst, 48]
- ldp A_l, A_h, [src, 16]!
- stnp A_l, A_h, [dst, 64]
- add dst, dst, 64
+ str A_q, [dst, 32]
+ ldr A_q, [src, 16]!
+ str A_q, [dst, 48]
+ ldr A_q, [src, 16]!
+ str A_q, [dst, 64]!
b.hi L(loop64)
/* Write the last full set of 64 bytes. The remainder is at most 64
bytes, so it is safe to always copy 64 bytes from the end even if
there is just 1 byte left. */
L(last64):
- ldp A_l, A_h, [srcend, -64]
- stnp A_l, A_h, [dstend, -64]
- ldp A_l, A_h, [srcend, -48]
- stnp A_l, A_h, [dstend, -48]
- ldp A_l, A_h, [srcend, -32]
- stnp A_l, A_h, [dstend, -32]
- ldp A_l, A_h, [srcend, -16]
- stnp A_l, A_h, [dstend, -16]
+ ldr E_q, [srcend, -64]
+ str E_q, [dstend, -64]
+ ldr D_q, [srcend, -48]
+ str D_q, [dstend, -48]
+ ldr C_q, [srcend, -32]
+ str C_q, [dstend, -32]
+ ldr B_q, [srcend, -16]
+ str B_q, [dstend, -16]
ret
END (__memcpy_falkor)