* Re: [PATCH][AArch64] Tune memcpy
@ 2015-12-15 16:43 Wilco Dijkstra
2016-04-15 12:44 ` Wilco Dijkstra
2016-05-12 14:02 ` Wilco Dijkstra
0 siblings, 2 replies; 8+ messages in thread
From: Wilco Dijkstra @ 2015-12-15 16:43 UTC (permalink / raw)
To: 'GNU C Library'; +Cc: nd
Ping
-----Original Message-----
From: Wilco Dijkstra [mailto:Wilco.Dijkstra@arm.com]
Sent: 19 November 2015 12:35
To: 'GNU C Library'
Subject: [PATCH][AArch64] Tune memcpy
This patch further tunes memcpy - avoid one branch for sizes 1-3, add a prefetch and improve small copies that are exact powers of 2.
OK for commit? (depends on https://sourceware.org/ml/libc-alpha/2015-09/msg00633.html )
ChangeLog:
2015-11-19 Wilco Dijkstra <wdijkstr@arm.com>
* sysdeps/aarch64/memcpy.S (memcpy):
Further tuning for performance.
---
sysdeps/aarch64/memcpy.S | 56 +++++++++++++++++++++++++++---------------------
1 file changed, 32 insertions(+), 24 deletions(-)
diff --git a/sysdeps/aarch64/memcpy.S b/sysdeps/aarch64/memcpy.S
index 51e7268..6b8610e 100644
--- a/sysdeps/aarch64/memcpy.S
+++ b/sysdeps/aarch64/memcpy.S
@@ -35,6 +35,7 @@
#define A_h x7
#define A_hw w7
#define B_l x8
+#define B_lw w8
#define B_h x9
#define C_l x10
#define C_h x11
@@ -70,21 +71,40 @@ END (memmove)
libc_hidden_builtin_def (memmove)
ENTRY (memcpy)
+ prfm PLDL1KEEP, [src]
add srcend, src, count
add dstend, dstin, count
+ cmp count, 16
+ b.ls L(copy16)
cmp count, 96
b.hi L(copy_long)
- cmp count, 16
- b.hs L(copy_medium)
+ /* Medium copies: 17..96 bytes. */
+ sub tmp1, count, 1
+ ldp A_l, A_h, [src]
+ tbnz tmp1, 6, L(copy96)
+ ldp D_l, D_h, [srcend, -16]
+ tbz tmp1, 5, 1f
+ ldp B_l, B_h, [src, 16]
+ ldp C_l, C_h, [srcend, -32]
+ stp B_l, B_h, [dstin, 16]
+ stp C_l, C_h, [dstend, -32]
+1:
+ stp A_l, A_h, [dstin]
+ stp D_l, D_h, [dstend, -16]
+ ret
+
+ .p2align 4
/* Small copies: 0..16 bytes. */
L(copy16):
- tbz count, 3, 1f
+ cmp count, 8
+ b.lo 1f
ldr A_l, [src]
ldr A_h, [srcend, -8]
str A_l, [dstin]
str A_h, [dstend, -8]
ret
+ .p2align 4
1:
tbz count, 2, 1f
ldr A_lw, [src]
@@ -92,33 +112,21 @@ L(copy16):
str A_lw, [dstin]
str A_hw, [dstend, -4]
ret
- .p2align 4
+
+ /* Copy 0..3 bytes. Use a branchless sequence that copies the same
+ byte 3 times if count==1, or the 2nd byte twice if count==2. */
1:
cbz count, 2f
+ lsr tmp1, count, 1
ldrb A_lw, [src]
- tbz count, 1, 1f
- ldrh A_hw, [srcend, -2]
- strh A_hw, [dstend, -2]
-1: strb A_lw, [dstin]
+ ldrb A_hw, [srcend, -1]
+ ldrb B_lw, [src, tmp1]
+ strb A_lw, [dstin]
+ strb B_lw, [dstin, tmp1]
+ strb A_hw, [dstend, -1]
2: ret
.p2align 4
- /* Medium copies: 17..96 bytes. */
-L(copy_medium):
- ldp A_l, A_h, [src]
- tbnz count, 6, L(copy96)
- ldp D_l, D_h, [srcend, -16]
- tbz count, 5, 1f
- ldp B_l, B_h, [src, 16]
- ldp C_l, C_h, [srcend, -32]
- stp B_l, B_h, [dstin, 16]
- stp C_l, C_h, [dstend, -32]
-1:
- stp A_l, A_h, [dstin]
- stp D_l, D_h, [dstend, -16]
- ret
-
- .p2align 4
/* Copy 64..96 bytes. Copy 64 bytes from the start and
32 bytes from the end. */
L(copy96):
--
1.9.1
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH][AArch64] Tune memcpy
2015-12-15 16:43 [PATCH][AArch64] Tune memcpy Wilco Dijkstra
@ 2016-04-15 12:44 ` Wilco Dijkstra
2016-05-12 14:02 ` Wilco Dijkstra
1 sibling, 0 replies; 8+ messages in thread
From: Wilco Dijkstra @ 2016-04-15 12:44 UTC (permalink / raw)
To: 'GNU C Library'; +Cc: nd, Richard Earnshaw, Marcus Shawcroft
ping
________________________________________
From: Wilco Dijkstra
Sent: 15 December 2015 16:42
To: 'GNU C Library'
Cc: nd
Subject: Re: [PATCH][AArch64] Tune memcpy
Ping
-----Original Message-----
From: Wilco Dijkstra [mailto:Wilco.Dijkstra@arm.com]
Sent: 19 November 2015 12:35
To: 'GNU C Library'
Subject: [PATCH][AArch64] Tune memcpy
This patch further tunes memcpy - avoid one branch for sizes 1-3, add a prefetch and improve small copies that are exact powers of 2.
OK for commit? (depends on https://sourceware.org/ml/libc-alpha/2015-09/msg00633.html )
ChangeLog:
2015-11-19 Wilco Dijkstra <wdijkstr@arm.com>
* sysdeps/aarch64/memcpy.S (memcpy):
Further tuning for performance.
---
sysdeps/aarch64/memcpy.S | 56 +++++++++++++++++++++++++++---------------------
1 file changed, 32 insertions(+), 24 deletions(-)
diff --git a/sysdeps/aarch64/memcpy.S b/sysdeps/aarch64/memcpy.S
index 51e7268..6b8610e 100644
--- a/sysdeps/aarch64/memcpy.S
+++ b/sysdeps/aarch64/memcpy.S
@@ -35,6 +35,7 @@
#define A_h x7
#define A_hw w7
#define B_l x8
+#define B_lw w8
#define B_h x9
#define C_l x10
#define C_h x11
@@ -70,21 +71,40 @@ END (memmove)
libc_hidden_builtin_def (memmove)
ENTRY (memcpy)
+ prfm PLDL1KEEP, [src]
add srcend, src, count
add dstend, dstin, count
+ cmp count, 16
+ b.ls L(copy16)
cmp count, 96
b.hi L(copy_long)
- cmp count, 16
- b.hs L(copy_medium)
+ /* Medium copies: 17..96 bytes. */
+ sub tmp1, count, 1
+ ldp A_l, A_h, [src]
+ tbnz tmp1, 6, L(copy96)
+ ldp D_l, D_h, [srcend, -16]
+ tbz tmp1, 5, 1f
+ ldp B_l, B_h, [src, 16]
+ ldp C_l, C_h, [srcend, -32]
+ stp B_l, B_h, [dstin, 16]
+ stp C_l, C_h, [dstend, -32]
+1:
+ stp A_l, A_h, [dstin]
+ stp D_l, D_h, [dstend, -16]
+ ret
+
+ .p2align 4
/* Small copies: 0..16 bytes. */
L(copy16):
- tbz count, 3, 1f
+ cmp count, 8
+ b.lo 1f
ldr A_l, [src]
ldr A_h, [srcend, -8]
str A_l, [dstin]
str A_h, [dstend, -8]
ret
+ .p2align 4
1:
tbz count, 2, 1f
ldr A_lw, [src]
@@ -92,33 +112,21 @@ L(copy16):
str A_lw, [dstin]
str A_hw, [dstend, -4]
ret
- .p2align 4
+
+ /* Copy 0..3 bytes. Use a branchless sequence that copies the same
+ byte 3 times if count==1, or the 2nd byte twice if count==2. */
1:
cbz count, 2f
+ lsr tmp1, count, 1
ldrb A_lw, [src]
- tbz count, 1, 1f
- ldrh A_hw, [srcend, -2]
- strh A_hw, [dstend, -2]
-1: strb A_lw, [dstin]
+ ldrb A_hw, [srcend, -1]
+ ldrb B_lw, [src, tmp1]
+ strb A_lw, [dstin]
+ strb B_lw, [dstin, tmp1]
+ strb A_hw, [dstend, -1]
2: ret
.p2align 4
- /* Medium copies: 17..96 bytes. */
-L(copy_medium):
- ldp A_l, A_h, [src]
- tbnz count, 6, L(copy96)
- ldp D_l, D_h, [srcend, -16]
- tbz count, 5, 1f
- ldp B_l, B_h, [src, 16]
- ldp C_l, C_h, [srcend, -32]
- stp B_l, B_h, [dstin, 16]
- stp C_l, C_h, [dstend, -32]
-1:
- stp A_l, A_h, [dstin]
- stp D_l, D_h, [dstend, -16]
- ret
-
- .p2align 4
/* Copy 64..96 bytes. Copy 64 bytes from the start and
32 bytes from the end. */
L(copy96):
--
1.9.1
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH][AArch64] Tune memcpy
2015-12-15 16:43 [PATCH][AArch64] Tune memcpy Wilco Dijkstra
2016-04-15 12:44 ` Wilco Dijkstra
@ 2016-05-12 14:02 ` Wilco Dijkstra
2016-05-19 11:34 ` Wilco Dijkstra
1 sibling, 1 reply; 8+ messages in thread
From: Wilco Dijkstra @ 2016-05-12 14:02 UTC (permalink / raw)
To: 'GNU C Library', Marcus Shawcroft; +Cc: nd
ping
-----Original Message-----
From: Wilco Dijkstra [mailto:Wilco.Dijkstra@arm.com]
Sent: 19 November 2015 12:35
To: 'GNU C Library'
Subject: [PATCH][AArch64] Tune memcpy
This patch further tunes memcpy - avoid one branch for sizes 1-3, add a prefetch and improve small copies that are exact powers of 2.
OK for commit? (depends on https://sourceware.org/ml/libc-alpha/2015-09/msg00633.html )
ChangeLog:
2015-11-19 Wilco Dijkstra <wdijkstr@arm.com>
* sysdeps/aarch64/memcpy.S (memcpy):
Further tuning for performance.
---
sysdeps/aarch64/memcpy.S | 56 +++++++++++++++++++++++++++---------------------
1 file changed, 32 insertions(+), 24 deletions(-)
diff --git a/sysdeps/aarch64/memcpy.S b/sysdeps/aarch64/memcpy.S
index 51e7268..6b8610e 100644
--- a/sysdeps/aarch64/memcpy.S
+++ b/sysdeps/aarch64/memcpy.S
@@ -35,6 +35,7 @@
#define A_h x7
#define A_hw w7
#define B_l x8
+#define B_lw w8
#define B_h x9
#define C_l x10
#define C_h x11
@@ -70,21 +71,40 @@ END (memmove)
libc_hidden_builtin_def (memmove)
ENTRY (memcpy)
+ prfm PLDL1KEEP, [src]
add srcend, src, count
add dstend, dstin, count
+ cmp count, 16
+ b.ls L(copy16)
cmp count, 96
b.hi L(copy_long)
- cmp count, 16
- b.hs L(copy_medium)
+ /* Medium copies: 17..96 bytes. */
+ sub tmp1, count, 1
+ ldp A_l, A_h, [src]
+ tbnz tmp1, 6, L(copy96)
+ ldp D_l, D_h, [srcend, -16]
+ tbz tmp1, 5, 1f
+ ldp B_l, B_h, [src, 16]
+ ldp C_l, C_h, [srcend, -32]
+ stp B_l, B_h, [dstin, 16]
+ stp C_l, C_h, [dstend, -32]
+1:
+ stp A_l, A_h, [dstin]
+ stp D_l, D_h, [dstend, -16]
+ ret
+
+ .p2align 4
/* Small copies: 0..16 bytes. */
L(copy16):
- tbz count, 3, 1f
+ cmp count, 8
+ b.lo 1f
ldr A_l, [src]
ldr A_h, [srcend, -8]
str A_l, [dstin]
str A_h, [dstend, -8]
ret
+ .p2align 4
1:
tbz count, 2, 1f
ldr A_lw, [src]
@@ -92,33 +112,21 @@ L(copy16):
str A_lw, [dstin]
str A_hw, [dstend, -4]
ret
- .p2align 4
+
+ /* Copy 0..3 bytes. Use a branchless sequence that copies the same
+ byte 3 times if count==1, or the 2nd byte twice if count==2. */
1:
cbz count, 2f
+ lsr tmp1, count, 1
ldrb A_lw, [src]
- tbz count, 1, 1f
- ldrh A_hw, [srcend, -2]
- strh A_hw, [dstend, -2]
-1: strb A_lw, [dstin]
+ ldrb A_hw, [srcend, -1]
+ ldrb B_lw, [src, tmp1]
+ strb A_lw, [dstin]
+ strb B_lw, [dstin, tmp1]
+ strb A_hw, [dstend, -1]
2: ret
.p2align 4
- /* Medium copies: 17..96 bytes. */
-L(copy_medium):
- ldp A_l, A_h, [src]
- tbnz count, 6, L(copy96)
- ldp D_l, D_h, [srcend, -16]
- tbz count, 5, 1f
- ldp B_l, B_h, [src, 16]
- ldp C_l, C_h, [srcend, -32]
- stp B_l, B_h, [dstin, 16]
- stp C_l, C_h, [dstend, -32]
-1:
- stp A_l, A_h, [dstin]
- stp D_l, D_h, [dstend, -16]
- ret
-
- .p2align 4
/* Copy 64..96 bytes. Copy 64 bytes from the start and
32 bytes from the end. */
L(copy96):
--
1.9.1
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH][AArch64] Tune memcpy
2016-05-12 14:02 ` Wilco Dijkstra
@ 2016-05-19 11:34 ` Wilco Dijkstra
2016-06-03 12:37 ` Fw: " Wilco Dijkstra
0 siblings, 1 reply; 8+ messages in thread
From: Wilco Dijkstra @ 2016-05-19 11:34 UTC (permalink / raw)
To: 'GNU C Library', Marcus Shawcroft; +Cc: nd
ping
________________________________________
From: Wilco Dijkstra
Sent: 12 May 2016 15:01
To: 'GNU C Library'; Marcus Shawcroft
Cc: nd
Subject: Re: [PATCH][AArch64] Tune memcpy
ping
-----Original Message-----
From: Wilco Dijkstra [mailto:Wilco.Dijkstra@arm.com]
Sent: 19 November 2015 12:35
To: 'GNU C Library'
Subject: [PATCH][AArch64] Tune memcpy
This patch further tunes memcpy - avoid one branch for sizes 1-3, add a prefetch and improve small copies that are exact powers of 2.
OK for commit? (depends on https://sourceware.org/ml/libc-alpha/2015-09/msg00633.html )
ChangeLog:
2015-11-19 Wilco Dijkstra <wdijkstr@arm.com>
* sysdeps/aarch64/memcpy.S (memcpy):
Further tuning for performance.
---
sysdeps/aarch64/memcpy.S | 56 +++++++++++++++++++++++++++---------------------
1 file changed, 32 insertions(+), 24 deletions(-)
diff --git a/sysdeps/aarch64/memcpy.S b/sysdeps/aarch64/memcpy.S
index 51e7268..6b8610e 100644
--- a/sysdeps/aarch64/memcpy.S
+++ b/sysdeps/aarch64/memcpy.S
@@ -35,6 +35,7 @@
#define A_h x7
#define A_hw w7
#define B_l x8
+#define B_lw w8
#define B_h x9
#define C_l x10
#define C_h x11
@@ -70,21 +71,40 @@ END (memmove)
libc_hidden_builtin_def (memmove)
ENTRY (memcpy)
+ prfm PLDL1KEEP, [src]
add srcend, src, count
add dstend, dstin, count
+ cmp count, 16
+ b.ls L(copy16)
cmp count, 96
b.hi L(copy_long)
- cmp count, 16
- b.hs L(copy_medium)
+ /* Medium copies: 17..96 bytes. */
+ sub tmp1, count, 1
+ ldp A_l, A_h, [src]
+ tbnz tmp1, 6, L(copy96)
+ ldp D_l, D_h, [srcend, -16]
+ tbz tmp1, 5, 1f
+ ldp B_l, B_h, [src, 16]
+ ldp C_l, C_h, [srcend, -32]
+ stp B_l, B_h, [dstin, 16]
+ stp C_l, C_h, [dstend, -32]
+1:
+ stp A_l, A_h, [dstin]
+ stp D_l, D_h, [dstend, -16]
+ ret
+
+ .p2align 4
/* Small copies: 0..16 bytes. */
L(copy16):
- tbz count, 3, 1f
+ cmp count, 8
+ b.lo 1f
ldr A_l, [src]
ldr A_h, [srcend, -8]
str A_l, [dstin]
str A_h, [dstend, -8]
ret
+ .p2align 4
1:
tbz count, 2, 1f
ldr A_lw, [src]
@@ -92,33 +112,21 @@ L(copy16):
str A_lw, [dstin]
str A_hw, [dstend, -4]
ret
- .p2align 4
+
+ /* Copy 0..3 bytes. Use a branchless sequence that copies the same
+ byte 3 times if count==1, or the 2nd byte twice if count==2. */
1:
cbz count, 2f
+ lsr tmp1, count, 1
ldrb A_lw, [src]
- tbz count, 1, 1f
- ldrh A_hw, [srcend, -2]
- strh A_hw, [dstend, -2]
-1: strb A_lw, [dstin]
+ ldrb A_hw, [srcend, -1]
+ ldrb B_lw, [src, tmp1]
+ strb A_lw, [dstin]
+ strb B_lw, [dstin, tmp1]
+ strb A_hw, [dstend, -1]
2: ret
.p2align 4
- /* Medium copies: 17..96 bytes. */
-L(copy_medium):
- ldp A_l, A_h, [src]
- tbnz count, 6, L(copy96)
- ldp D_l, D_h, [srcend, -16]
- tbz count, 5, 1f
- ldp B_l, B_h, [src, 16]
- ldp C_l, C_h, [srcend, -32]
- stp B_l, B_h, [dstin, 16]
- stp C_l, C_h, [dstend, -32]
-1:
- stp A_l, A_h, [dstin]
- stp D_l, D_h, [dstend, -16]
- ret
-
- .p2align 4
/* Copy 64..96 bytes. Copy 64 bytes from the start and
32 bytes from the end. */
L(copy96):
--
1.9.1
^ permalink raw reply [flat|nested] 8+ messages in thread
* Fw: [PATCH][AArch64] Tune memcpy
2016-05-19 11:34 ` Wilco Dijkstra
@ 2016-06-03 12:37 ` Wilco Dijkstra
2016-06-21 13:34 ` Wilco Dijkstra
0 siblings, 1 reply; 8+ messages in thread
From: Wilco Dijkstra @ 2016-06-03 12:37 UTC (permalink / raw)
To: Marcus Shawcroft; +Cc: nd, 'GNU C Library'
ping
________________________________________
From: Wilco Dijkstra
Sent: 12 May 2016 15:01
To: 'GNU C Library'; Marcus Shawcroft
Cc: nd
Subject: Re: [PATCH][AArch64] Tune memcpy
ping
-----Original Message-----
From: Wilco Dijkstra [mailto:Wilco.Dijkstra@arm.com]
Sent: 19 November 2015 12:35
To: 'GNU C Library'
Subject: [PATCH][AArch64] Tune memcpy
This patch further tunes memcpy - avoid one branch for sizes 1-3, add a prefetch and improve small copies that are exact powers of 2.
OK for commit? (depends on https://sourceware.org/ml/libc-alpha/2015-09/msg00633.html )
ChangeLog:
2015-11-19 Wilco Dijkstra <wdijkstr@arm.com>
* sysdeps/aarch64/memcpy.S (memcpy):
Further tuning for performance.
---
sysdeps/aarch64/memcpy.S | 56 +++++++++++++++++++++++++++---------------------
1 file changed, 32 insertions(+), 24 deletions(-)
diff --git a/sysdeps/aarch64/memcpy.S b/sysdeps/aarch64/memcpy.S
index 51e7268..6b8610e 100644
--- a/sysdeps/aarch64/memcpy.S
+++ b/sysdeps/aarch64/memcpy.S
@@ -35,6 +35,7 @@
#define A_h x7
#define A_hw w7
#define B_l x8
+#define B_lw w8
#define B_h x9
#define C_l x10
#define C_h x11
@@ -70,21 +71,40 @@ END (memmove)
libc_hidden_builtin_def (memmove)
ENTRY (memcpy)
+ prfm PLDL1KEEP, [src]
add srcend, src, count
add dstend, dstin, count
+ cmp count, 16
+ b.ls L(copy16)
cmp count, 96
b.hi L(copy_long)
- cmp count, 16
- b.hs L(copy_medium)
+ /* Medium copies: 17..96 bytes. */
+ sub tmp1, count, 1
+ ldp A_l, A_h, [src]
+ tbnz tmp1, 6, L(copy96)
+ ldp D_l, D_h, [srcend, -16]
+ tbz tmp1, 5, 1f
+ ldp B_l, B_h, [src, 16]
+ ldp C_l, C_h, [srcend, -32]
+ stp B_l, B_h, [dstin, 16]
+ stp C_l, C_h, [dstend, -32]
+1:
+ stp A_l, A_h, [dstin]
+ stp D_l, D_h, [dstend, -16]
+ ret
+
+ .p2align 4
/* Small copies: 0..16 bytes. */
L(copy16):
- tbz count, 3, 1f
+ cmp count, 8
+ b.lo 1f
ldr A_l, [src]
ldr A_h, [srcend, -8]
str A_l, [dstin]
str A_h, [dstend, -8]
ret
+ .p2align 4
1:
tbz count, 2, 1f
ldr A_lw, [src]
@@ -92,33 +112,21 @@ L(copy16):
str A_lw, [dstin]
str A_hw, [dstend, -4]
ret
- .p2align 4
+
+ /* Copy 0..3 bytes. Use a branchless sequence that copies the same
+ byte 3 times if count==1, or the 2nd byte twice if count==2. */
1:
cbz count, 2f
+ lsr tmp1, count, 1
ldrb A_lw, [src]
- tbz count, 1, 1f
- ldrh A_hw, [srcend, -2]
- strh A_hw, [dstend, -2]
-1: strb A_lw, [dstin]
+ ldrb A_hw, [srcend, -1]
+ ldrb B_lw, [src, tmp1]
+ strb A_lw, [dstin]
+ strb B_lw, [dstin, tmp1]
+ strb A_hw, [dstend, -1]
2: ret
.p2align 4
- /* Medium copies: 17..96 bytes. */
-L(copy_medium):
- ldp A_l, A_h, [src]
- tbnz count, 6, L(copy96)
- ldp D_l, D_h, [srcend, -16]
- tbz count, 5, 1f
- ldp B_l, B_h, [src, 16]
- ldp C_l, C_h, [srcend, -32]
- stp B_l, B_h, [dstin, 16]
- stp C_l, C_h, [dstend, -32]
-1:
- stp A_l, A_h, [dstin]
- stp D_l, D_h, [dstend, -16]
- ret
-
- .p2align 4
/* Copy 64..96 bytes. Copy 64 bytes from the start and
32 bytes from the end. */
L(copy96):
--
1.9.1
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH][AArch64] Tune memcpy
2016-06-03 12:37 ` Fw: " Wilco Dijkstra
@ 2016-06-21 13:34 ` Wilco Dijkstra
0 siblings, 0 replies; 8+ messages in thread
From: Wilco Dijkstra @ 2016-06-21 13:34 UTC (permalink / raw)
To: Marcus Shawcroft; +Cc: nd, 'GNU C Library'
ping
________________________________________
From: Wilco Dijkstra
Sent: 12 May 2016 15:01
To: 'GNU C Library'; Marcus Shawcroft
Cc: nd
Subject: Re: [PATCH][AArch64] Tune memcpy
ping
-----Original Message-----
From: Wilco Dijkstra [mailto:Wilco.Dijkstra@arm.com]
Sent: 19 November 2015 12:35
To: 'GNU C Library'
Subject: [PATCH][AArch64] Tune memcpy
This patch further tunes memcpy - avoid one branch for sizes 1-3, add a prefetch and improve small copies that are exact powers of 2.
OK for commit? (depends on https://sourceware.org/ml/libc-alpha/2015-09/msg00633.html )
ChangeLog:
2015-11-19 Wilco Dijkstra <wdijkstr@arm.com>
* sysdeps/aarch64/memcpy.S (memcpy):
Further tuning for performance.
---
sysdeps/aarch64/memcpy.S | 56 +++++++++++++++++++++++++++---------------------
1 file changed, 32 insertions(+), 24 deletions(-)
diff --git a/sysdeps/aarch64/memcpy.S b/sysdeps/aarch64/memcpy.S
index 51e7268..6b8610e 100644
--- a/sysdeps/aarch64/memcpy.S
+++ b/sysdeps/aarch64/memcpy.S
@@ -35,6 +35,7 @@
#define A_h x7
#define A_hw w7
#define B_l x8
+#define B_lw w8
#define B_h x9
#define C_l x10
#define C_h x11
@@ -70,21 +71,40 @@ END (memmove)
libc_hidden_builtin_def (memmove)
ENTRY (memcpy)
+ prfm PLDL1KEEP, [src]
add srcend, src, count
add dstend, dstin, count
+ cmp count, 16
+ b.ls L(copy16)
cmp count, 96
b.hi L(copy_long)
- cmp count, 16
- b.hs L(copy_medium)
+ /* Medium copies: 17..96 bytes. */
+ sub tmp1, count, 1
+ ldp A_l, A_h, [src]
+ tbnz tmp1, 6, L(copy96)
+ ldp D_l, D_h, [srcend, -16]
+ tbz tmp1, 5, 1f
+ ldp B_l, B_h, [src, 16]
+ ldp C_l, C_h, [srcend, -32]
+ stp B_l, B_h, [dstin, 16]
+ stp C_l, C_h, [dstend, -32]
+1:
+ stp A_l, A_h, [dstin]
+ stp D_l, D_h, [dstend, -16]
+ ret
+
+ .p2align 4
/* Small copies: 0..16 bytes. */
L(copy16):
- tbz count, 3, 1f
+ cmp count, 8
+ b.lo 1f
ldr A_l, [src]
ldr A_h, [srcend, -8]
str A_l, [dstin]
str A_h, [dstend, -8]
ret
+ .p2align 4
1:
tbz count, 2, 1f
ldr A_lw, [src]
@@ -92,33 +112,21 @@ L(copy16):
str A_lw, [dstin]
str A_hw, [dstend, -4]
ret
- .p2align 4
+
+ /* Copy 0..3 bytes. Use a branchless sequence that copies the same
+ byte 3 times if count==1, or the 2nd byte twice if count==2. */
1:
cbz count, 2f
+ lsr tmp1, count, 1
ldrb A_lw, [src]
- tbz count, 1, 1f
- ldrh A_hw, [srcend, -2]
- strh A_hw, [dstend, -2]
-1: strb A_lw, [dstin]
+ ldrb A_hw, [srcend, -1]
+ ldrb B_lw, [src, tmp1]
+ strb A_lw, [dstin]
+ strb B_lw, [dstin, tmp1]
+ strb A_hw, [dstend, -1]
2: ret
.p2align 4
- /* Medium copies: 17..96 bytes. */
-L(copy_medium):
- ldp A_l, A_h, [src]
- tbnz count, 6, L(copy96)
- ldp D_l, D_h, [srcend, -16]
- tbz count, 5, 1f
- ldp B_l, B_h, [src, 16]
- ldp C_l, C_h, [srcend, -32]
- stp B_l, B_h, [dstin, 16]
- stp C_l, C_h, [dstend, -32]
-1:
- stp A_l, A_h, [dstin]
- stp D_l, D_h, [dstend, -16]
- ret
-
- .p2align 4
/* Copy 64..96 bytes. Copy 64 bytes from the start and
32 bytes from the end. */
L(copy96):
--
1.9.1
^ permalink raw reply [flat|nested] 8+ messages in thread
* Re: [PATCH][AArch64] Tune memcpy
2015-11-19 12:35 Wilco Dijkstra
@ 2016-06-22 7:47 ` Marcus Shawcroft
0 siblings, 0 replies; 8+ messages in thread
From: Marcus Shawcroft @ 2016-06-22 7:47 UTC (permalink / raw)
To: Wilco Dijkstra; +Cc: GNU C Library
On 19 November 2015 at 12:34, Wilco Dijkstra <Wilco.Dijkstra@arm.com> wrote:
> This patch further tunes memcpy - avoid one branch for sizes 1-3, add a
> prefetch and improve small copies that are exact powers of 2.
>
> OK for commit? (depends on
> https://sourceware.org/ml/libc-alpha/2015-09/msg00633.html )
>
> ChangeLog:
> 2015-11-19 Wilco Dijkstra <wdijkstr@arm.com>
>
> * sysdeps/aarch64/memcpy.S (memcpy):
> Further tuning for performance.
OK /Marcus
^ permalink raw reply [flat|nested] 8+ messages in thread
* [PATCH][AArch64] Tune memcpy
@ 2015-11-19 12:35 Wilco Dijkstra
2016-06-22 7:47 ` Marcus Shawcroft
0 siblings, 1 reply; 8+ messages in thread
From: Wilco Dijkstra @ 2015-11-19 12:35 UTC (permalink / raw)
To: 'GNU C Library'
This patch further tunes memcpy: it avoids one branch for sizes 1-3, adds a
prefetch, and improves small copies whose sizes are exact powers of 2.
OK for commit? (depends on
https://sourceware.org/ml/libc-alpha/2015-09/msg00633.html )
ChangeLog:
2015-11-19 Wilco Dijkstra <wdijkstr@arm.com>
* sysdeps/aarch64/memcpy.S (memcpy):
Further tuning for performance.
---
sysdeps/aarch64/memcpy.S | 56 +++++++++++++++++++++++++++---------------------
1 file changed, 32 insertions(+), 24 deletions(-)
diff --git a/sysdeps/aarch64/memcpy.S b/sysdeps/aarch64/memcpy.S
index 51e7268..6b8610e 100644
--- a/sysdeps/aarch64/memcpy.S
+++ b/sysdeps/aarch64/memcpy.S
@@ -35,6 +35,7 @@
#define A_h x7
#define A_hw w7
#define B_l x8
+#define B_lw w8
#define B_h x9
#define C_l x10
#define C_h x11
@@ -70,21 +71,40 @@ END (memmove)
libc_hidden_builtin_def (memmove)
ENTRY (memcpy)
+ prfm PLDL1KEEP, [src]
add srcend, src, count
add dstend, dstin, count
+ cmp count, 16
+ b.ls L(copy16)
cmp count, 96
b.hi L(copy_long)
- cmp count, 16
- b.hs L(copy_medium)
+ /* Medium copies: 17..96 bytes. */
+ sub tmp1, count, 1
+ ldp A_l, A_h, [src]
+ tbnz tmp1, 6, L(copy96)
+ ldp D_l, D_h, [srcend, -16]
+ tbz tmp1, 5, 1f
+ ldp B_l, B_h, [src, 16]
+ ldp C_l, C_h, [srcend, -32]
+ stp B_l, B_h, [dstin, 16]
+ stp C_l, C_h, [dstend, -32]
+1:
+ stp A_l, A_h, [dstin]
+ stp D_l, D_h, [dstend, -16]
+ ret
+
+ .p2align 4
/* Small copies: 0..16 bytes. */
L(copy16):
- tbz count, 3, 1f
+ cmp count, 8
+ b.lo 1f
ldr A_l, [src]
ldr A_h, [srcend, -8]
str A_l, [dstin]
str A_h, [dstend, -8]
ret
+ .p2align 4
1:
tbz count, 2, 1f
ldr A_lw, [src]
@@ -92,33 +112,21 @@ L(copy16):
str A_lw, [dstin]
str A_hw, [dstend, -4]
ret
- .p2align 4
+
+ /* Copy 0..3 bytes. Use a branchless sequence that copies the same
+ byte 3 times if count==1, or the 2nd byte twice if count==2. */
1:
cbz count, 2f
+ lsr tmp1, count, 1
ldrb A_lw, [src]
- tbz count, 1, 1f
- ldrh A_hw, [srcend, -2]
- strh A_hw, [dstend, -2]
-1: strb A_lw, [dstin]
+ ldrb A_hw, [srcend, -1]
+ ldrb B_lw, [src, tmp1]
+ strb A_lw, [dstin]
+ strb B_lw, [dstin, tmp1]
+ strb A_hw, [dstend, -1]
2: ret
.p2align 4
- /* Medium copies: 17..96 bytes. */
-L(copy_medium):
- ldp A_l, A_h, [src]
- tbnz count, 6, L(copy96)
- ldp D_l, D_h, [srcend, -16]
- tbz count, 5, 1f
- ldp B_l, B_h, [src, 16]
- ldp C_l, C_h, [srcend, -32]
- stp B_l, B_h, [dstin, 16]
- stp C_l, C_h, [dstend, -32]
-1:
- stp A_l, A_h, [dstin]
- stp D_l, D_h, [dstend, -16]
- ret
-
- .p2align 4
/* Copy 64..96 bytes. Copy 64 bytes from the start and
32 bytes from the end. */
L(copy96):
--
1.9.1
^ permalink raw reply [flat|nested] 8+ messages in thread
end of thread, other threads:[~2016-06-22 7:47 UTC | newest]
Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-12-15 16:43 [PATCH][AArch64] Tune memcpy Wilco Dijkstra
2016-04-15 12:44 ` Wilco Dijkstra
2016-05-12 14:02 ` Wilco Dijkstra
2016-05-19 11:34 ` Wilco Dijkstra
2016-06-03 12:37 ` Fw: " Wilco Dijkstra
2016-06-21 13:34 ` Wilco Dijkstra
-- strict thread matches above, loose matches on Subject: below --
2015-11-19 12:35 Wilco Dijkstra
2016-06-22 7:47 ` Marcus Shawcroft
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).