public inbox for libc-alpha@sourceware.org
 help / color / mirror / Atom feed
* Re: [PATCH][AArch64] Tune memcpy
@ 2015-12-15 16:43 Wilco Dijkstra
  2016-04-15 12:44 ` Wilco Dijkstra
  2016-05-12 14:02 ` Wilco Dijkstra
  0 siblings, 2 replies; 8+ messages in thread
From: Wilco Dijkstra @ 2015-12-15 16:43 UTC (permalink / raw)
  To: 'GNU C Library'; +Cc: nd

Ping

-----Original Message-----
From: Wilco Dijkstra [mailto:Wilco.Dijkstra@arm.com] 
Sent: 19 November 2015 12:35
To: 'GNU C Library'
Subject: [PATCH][AArch64] Tune memcpy

This patch further tunes memcpy - avoid one branch for sizes 1-3, add a prefetch and improve small copies that are exact powers of 2.

OK for commit? (depends on https://sourceware.org/ml/libc-alpha/2015-09/msg00633.html )

ChangeLog:
2015-11-19  Wilco Dijkstra  <wdijkstr@arm.com>

	* sysdeps/aarch64/memcpy.S (memcpy):
	Further tuning for performance.


---
 sysdeps/aarch64/memcpy.S | 56 +++++++++++++++++++++++++++---------------------
 1 file changed, 32 insertions(+), 24 deletions(-)

diff --git a/sysdeps/aarch64/memcpy.S b/sysdeps/aarch64/memcpy.S
index 51e7268..6b8610e 100644
--- a/sysdeps/aarch64/memcpy.S
+++ b/sysdeps/aarch64/memcpy.S
@@ -35,6 +35,7 @@
 #define A_h	x7
 #define A_hw	w7
 #define B_l	x8
+#define B_lw	w8
 #define B_h	x9
 #define C_l	x10
 #define C_h	x11
@@ -70,21 +71,40 @@ END (memmove)
 libc_hidden_builtin_def (memmove)
 ENTRY (memcpy)
 
+	prfm	PLDL1KEEP, [src]
 	add	srcend, src, count
 	add	dstend, dstin, count
+	cmp	count, 16
+	b.ls	L(copy16)
 	cmp	count, 96
 	b.hi	L(copy_long)
-	cmp	count, 16
-	b.hs	L(copy_medium)
 
+	/* Medium copies: 17..96 bytes.  */
+	sub	tmp1, count, 1
+	ldp	A_l, A_h, [src]
+	tbnz	tmp1, 6, L(copy96)
+	ldp	D_l, D_h, [srcend, -16]
+	tbz	tmp1, 5, 1f
+	ldp	B_l, B_h, [src, 16]
+	ldp	C_l, C_h, [srcend, -32]
+	stp	B_l, B_h, [dstin, 16]
+	stp	C_l, C_h, [dstend, -32]
+1:
+	stp	A_l, A_h, [dstin]
+	stp	D_l, D_h, [dstend, -16]
+	ret
+
+	.p2align 4
 	/* Small copies: 0..16 bytes.  */
 L(copy16):
-	tbz	count, 3, 1f
+	cmp	count, 8
+	b.lo	1f
 	ldr	A_l, [src]
 	ldr	A_h, [srcend, -8]
 	str	A_l, [dstin]
 	str	A_h, [dstend, -8]
 	ret
+	.p2align 4
 1:
 	tbz	count, 2, 1f
 	ldr	A_lw, [src]
@@ -92,33 +112,21 @@ L(copy16):
 	str	A_lw, [dstin]
 	str	A_hw, [dstend, -4]
 	ret
-	.p2align 4
+
+	/* Copy 0..3 bytes.  Use a branchless sequence that copies the same
+	   byte 3 times if count==1, or the 2nd byte twice if count==2.  */
 1:
 	cbz	count, 2f
+	lsr	tmp1, count, 1
 	ldrb	A_lw, [src]
-	tbz	count, 1, 1f
-	ldrh	A_hw, [srcend, -2]
-	strh	A_hw, [dstend, -2]
-1:	strb	A_lw, [dstin]
+	ldrb	A_hw, [srcend, -1]
+	ldrb	B_lw, [src, tmp1]
+	strb	A_lw, [dstin]
+	strb	B_lw, [dstin, tmp1]
+	strb	A_hw, [dstend, -1]
 2:	ret
 
 	.p2align 4
-	/* Medium copies: 17..96 bytes.	 */
-L(copy_medium):
-	ldp	A_l, A_h, [src]
-	tbnz	count, 6, L(copy96)
-	ldp	D_l, D_h, [srcend, -16]
-	tbz	count, 5, 1f
-	ldp	B_l, B_h, [src, 16]
-	ldp	C_l, C_h, [srcend, -32]
-	stp	B_l, B_h, [dstin, 16]
-	stp	C_l, C_h, [dstend, -32]
-1:
-	stp	A_l, A_h, [dstin]
-	stp	D_l, D_h, [dstend, -16]
-	ret
-
-	.p2align 4
 	/* Copy 64..96 bytes.  Copy 64 bytes from the start and
 	   32 bytes from the end.  */
 L(copy96):
-- 
1.9.1

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH][AArch64] Tune memcpy
  2015-12-15 16:43 [PATCH][AArch64] Tune memcpy Wilco Dijkstra
@ 2016-04-15 12:44 ` Wilco Dijkstra
  2016-05-12 14:02 ` Wilco Dijkstra
  1 sibling, 0 replies; 8+ messages in thread
From: Wilco Dijkstra @ 2016-04-15 12:44 UTC (permalink / raw)
  To: 'GNU C Library'; +Cc: nd, Richard Earnshaw, Marcus Shawcroft

ping

________________________________________
From: Wilco Dijkstra
Sent: 15 December 2015 16:42
To: 'GNU C Library'
Cc: nd
Subject: Re: [PATCH][AArch64] Tune memcpy

Ping

-----Original Message-----
From: Wilco Dijkstra [mailto:Wilco.Dijkstra@arm.com]
Sent: 19 November 2015 12:35
To: 'GNU C Library'
Subject: [PATCH][AArch64] Tune memcpy

This patch further tunes memcpy - avoid one branch for sizes 1-3, add a prefetch and improve small copies that are exact powers of 2.

OK for commit? (depends on https://sourceware.org/ml/libc-alpha/2015-09/msg00633.html )

ChangeLog:
2015-11-19  Wilco Dijkstra  <wdijkstr@arm.com>

        * sysdeps/aarch64/memcpy.S (memcpy):
        Further tuning for performance.


---
 sysdeps/aarch64/memcpy.S | 56 +++++++++++++++++++++++++++---------------------
 1 file changed, 32 insertions(+), 24 deletions(-)

diff --git a/sysdeps/aarch64/memcpy.S b/sysdeps/aarch64/memcpy.S
index 51e7268..6b8610e 100644
--- a/sysdeps/aarch64/memcpy.S
+++ b/sysdeps/aarch64/memcpy.S
@@ -35,6 +35,7 @@
 #define A_h    x7
 #define A_hw   w7
 #define B_l    x8
+#define B_lw   w8
 #define B_h    x9
 #define C_l    x10
 #define C_h    x11
@@ -70,21 +71,40 @@ END (memmove)
 libc_hidden_builtin_def (memmove)
 ENTRY (memcpy)

+       prfm    PLDL1KEEP, [src]
        add     srcend, src, count
        add     dstend, dstin, count
+       cmp     count, 16
+       b.ls    L(copy16)
        cmp     count, 96
        b.hi    L(copy_long)
-       cmp     count, 16
-       b.hs    L(copy_medium)

+       /* Medium copies: 17..96 bytes.  */
+       sub     tmp1, count, 1
+       ldp     A_l, A_h, [src]
+       tbnz    tmp1, 6, L(copy96)
+       ldp     D_l, D_h, [srcend, -16]
+       tbz     tmp1, 5, 1f
+       ldp     B_l, B_h, [src, 16]
+       ldp     C_l, C_h, [srcend, -32]
+       stp     B_l, B_h, [dstin, 16]
+       stp     C_l, C_h, [dstend, -32]
+1:
+       stp     A_l, A_h, [dstin]
+       stp     D_l, D_h, [dstend, -16]
+       ret
+
+       .p2align 4
        /* Small copies: 0..16 bytes.  */
 L(copy16):
-       tbz     count, 3, 1f
+       cmp     count, 8
+       b.lo    1f
        ldr     A_l, [src]
        ldr     A_h, [srcend, -8]
        str     A_l, [dstin]
        str     A_h, [dstend, -8]
        ret
+       .p2align 4
 1:
        tbz     count, 2, 1f
        ldr     A_lw, [src]
@@ -92,33 +112,21 @@ L(copy16):
        str     A_lw, [dstin]
        str     A_hw, [dstend, -4]
        ret
-       .p2align 4
+
+       /* Copy 0..3 bytes.  Use a branchless sequence that copies the same
+          byte 3 times if count==1, or the 2nd byte twice if count==2.  */
 1:
        cbz     count, 2f
+       lsr     tmp1, count, 1
        ldrb    A_lw, [src]
-       tbz     count, 1, 1f
-       ldrh    A_hw, [srcend, -2]
-       strh    A_hw, [dstend, -2]
-1:     strb    A_lw, [dstin]
+       ldrb    A_hw, [srcend, -1]
+       ldrb    B_lw, [src, tmp1]
+       strb    A_lw, [dstin]
+       strb    B_lw, [dstin, tmp1]
+       strb    A_hw, [dstend, -1]
 2:     ret

        .p2align 4
-       /* Medium copies: 17..96 bytes.  */
-L(copy_medium):
-       ldp     A_l, A_h, [src]
-       tbnz    count, 6, L(copy96)
-       ldp     D_l, D_h, [srcend, -16]
-       tbz     count, 5, 1f
-       ldp     B_l, B_h, [src, 16]
-       ldp     C_l, C_h, [srcend, -32]
-       stp     B_l, B_h, [dstin, 16]
-       stp     C_l, C_h, [dstend, -32]
-1:
-       stp     A_l, A_h, [dstin]
-       stp     D_l, D_h, [dstend, -16]
-       ret
-
-       .p2align 4
        /* Copy 64..96 bytes.  Copy 64 bytes from the start and
           32 bytes from the end.  */
 L(copy96):
--
1.9.1

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH][AArch64] Tune memcpy
  2015-12-15 16:43 [PATCH][AArch64] Tune memcpy Wilco Dijkstra
  2016-04-15 12:44 ` Wilco Dijkstra
@ 2016-05-12 14:02 ` Wilco Dijkstra
  2016-05-19 11:34   ` Wilco Dijkstra
  1 sibling, 1 reply; 8+ messages in thread
From: Wilco Dijkstra @ 2016-05-12 14:02 UTC (permalink / raw)
  To: 'GNU C Library', Marcus Shawcroft; +Cc: nd


ping


-----Original Message-----
From: Wilco Dijkstra [mailto:Wilco.Dijkstra@arm.com]
Sent: 19 November 2015 12:35
To: 'GNU C Library'
Subject: [PATCH][AArch64] Tune memcpy

This patch further tunes memcpy - avoid one branch for sizes 1-3, add a prefetch and improve small copies that are exact powers of 2.

OK for commit? (depends on https://sourceware.org/ml/libc-alpha/2015-09/msg00633.html )

ChangeLog:
2015-11-19  Wilco Dijkstra  <wdijkstr@arm.com>

        * sysdeps/aarch64/memcpy.S (memcpy):
        Further tuning for performance.


---
 sysdeps/aarch64/memcpy.S | 56 +++++++++++++++++++++++++++---------------------
 1 file changed, 32 insertions(+), 24 deletions(-)

diff --git a/sysdeps/aarch64/memcpy.S b/sysdeps/aarch64/memcpy.S
index 51e7268..6b8610e 100644
--- a/sysdeps/aarch64/memcpy.S
+++ b/sysdeps/aarch64/memcpy.S
@@ -35,6 +35,7 @@
 #define A_h    x7
 #define A_hw   w7
 #define B_l    x8
+#define B_lw   w8
 #define B_h    x9
 #define C_l    x10
 #define C_h    x11
@@ -70,21 +71,40 @@ END (memmove)
 libc_hidden_builtin_def (memmove)
 ENTRY (memcpy)

+       prfm    PLDL1KEEP, [src]
        add     srcend, src, count
        add     dstend, dstin, count
+       cmp     count, 16
+       b.ls    L(copy16)
        cmp     count, 96
        b.hi    L(copy_long)
-       cmp     count, 16
-       b.hs    L(copy_medium)

+       /* Medium copies: 17..96 bytes.  */
+       sub     tmp1, count, 1
+       ldp     A_l, A_h, [src]
+       tbnz    tmp1, 6, L(copy96)
+       ldp     D_l, D_h, [srcend, -16]
+       tbz     tmp1, 5, 1f
+       ldp     B_l, B_h, [src, 16]
+       ldp     C_l, C_h, [srcend, -32]
+       stp     B_l, B_h, [dstin, 16]
+       stp     C_l, C_h, [dstend, -32]
+1:
+       stp     A_l, A_h, [dstin]
+       stp     D_l, D_h, [dstend, -16]
+       ret
+
+       .p2align 4
        /* Small copies: 0..16 bytes.  */
 L(copy16):
-       tbz     count, 3, 1f
+       cmp     count, 8
+       b.lo    1f
        ldr     A_l, [src]
        ldr     A_h, [srcend, -8]
        str     A_l, [dstin]
        str     A_h, [dstend, -8]
        ret
+       .p2align 4
 1:
        tbz     count, 2, 1f
        ldr     A_lw, [src]
@@ -92,33 +112,21 @@ L(copy16):
        str     A_lw, [dstin]
        str     A_hw, [dstend, -4]
        ret
-       .p2align 4
+
+       /* Copy 0..3 bytes.  Use a branchless sequence that copies the same
+          byte 3 times if count==1, or the 2nd byte twice if count==2.  */
 1:
        cbz     count, 2f
+       lsr     tmp1, count, 1
        ldrb    A_lw, [src]
-       tbz     count, 1, 1f
-       ldrh    A_hw, [srcend, -2]
-       strh    A_hw, [dstend, -2]
-1:     strb    A_lw, [dstin]
+       ldrb    A_hw, [srcend, -1]
+       ldrb    B_lw, [src, tmp1]
+       strb    A_lw, [dstin]
+       strb    B_lw, [dstin, tmp1]
+       strb    A_hw, [dstend, -1]
 2:     ret

        .p2align 4
-       /* Medium copies: 17..96 bytes.  */
-L(copy_medium):
-       ldp     A_l, A_h, [src]
-       tbnz    count, 6, L(copy96)
-       ldp     D_l, D_h, [srcend, -16]
-       tbz     count, 5, 1f
-       ldp     B_l, B_h, [src, 16]
-       ldp     C_l, C_h, [srcend, -32]
-       stp     B_l, B_h, [dstin, 16]
-       stp     C_l, C_h, [dstend, -32]
-1:
-       stp     A_l, A_h, [dstin]
-       stp     D_l, D_h, [dstend, -16]
-       ret
-
-       .p2align 4
        /* Copy 64..96 bytes.  Copy 64 bytes from the start and
           32 bytes from the end.  */
 L(copy96):
--
1.9.1

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH][AArch64] Tune memcpy
  2016-05-12 14:02 ` Wilco Dijkstra
@ 2016-05-19 11:34   ` Wilco Dijkstra
  2016-06-03 12:37     ` Fw: " Wilco Dijkstra
  0 siblings, 1 reply; 8+ messages in thread
From: Wilco Dijkstra @ 2016-05-19 11:34 UTC (permalink / raw)
  To: 'GNU C Library', Marcus Shawcroft; +Cc: nd

ping

________________________________________
From: Wilco Dijkstra
Sent: 12 May 2016 15:01
To: 'GNU C Library'; Marcus Shawcroft
Cc: nd
Subject: Re: [PATCH][AArch64] Tune memcpy

ping


-----Original Message-----
From: Wilco Dijkstra [mailto:Wilco.Dijkstra@arm.com]
Sent: 19 November 2015 12:35
To: 'GNU C Library'
Subject: [PATCH][AArch64] Tune memcpy

This patch further tunes memcpy - avoid one branch for sizes 1-3, add a prefetch and improve small copies that are exact powers of 2.

OK for commit? (depends on https://sourceware.org/ml/libc-alpha/2015-09/msg00633.html )

ChangeLog:
2015-11-19  Wilco Dijkstra  <wdijkstr@arm.com>

        * sysdeps/aarch64/memcpy.S (memcpy):
        Further tuning for performance.


---
 sysdeps/aarch64/memcpy.S | 56 +++++++++++++++++++++++++++---------------------
 1 file changed, 32 insertions(+), 24 deletions(-)

diff --git a/sysdeps/aarch64/memcpy.S b/sysdeps/aarch64/memcpy.S
index 51e7268..6b8610e 100644
--- a/sysdeps/aarch64/memcpy.S
+++ b/sysdeps/aarch64/memcpy.S
@@ -35,6 +35,7 @@
 #define A_h	x7
 #define A_hw	w7
 #define B_l	x8
+#define B_lw	w8
 #define B_h	x9
 #define C_l	x10
 #define C_h	x11
@@ -70,21 +71,40 @@ END (memmove)
 libc_hidden_builtin_def (memmove)
 ENTRY (memcpy)
 
+	prfm	PLDL1KEEP, [src]
 	add	srcend, src, count
 	add	dstend, dstin, count
+	cmp	count, 16
+	b.ls	L(copy16)
 	cmp	count, 96
 	b.hi	L(copy_long)
-	cmp	count, 16
-	b.hs	L(copy_medium)
 
+	/* Medium copies: 17..96 bytes.  */
+	sub	tmp1, count, 1
+	ldp	A_l, A_h, [src]
+	tbnz	tmp1, 6, L(copy96)
+	ldp	D_l, D_h, [srcend, -16]
+	tbz	tmp1, 5, 1f
+	ldp	B_l, B_h, [src, 16]
+	ldp	C_l, C_h, [srcend, -32]
+	stp	B_l, B_h, [dstin, 16]
+	stp	C_l, C_h, [dstend, -32]
+1:
+	stp	A_l, A_h, [dstin]
+	stp	D_l, D_h, [dstend, -16]
+	ret
+
+	.p2align 4
 	/* Small copies: 0..16 bytes.  */
 L(copy16):
-	tbz	count, 3, 1f
+	cmp	count, 8
+	b.lo	1f
 	ldr	A_l, [src]
 	ldr	A_h, [srcend, -8]
 	str	A_l, [dstin]
 	str	A_h, [dstend, -8]
 	ret
+	.p2align 4
 1:
 	tbz	count, 2, 1f
 	ldr	A_lw, [src]
@@ -92,33 +112,21 @@ L(copy16):
 	str	A_lw, [dstin]
 	str	A_hw, [dstend, -4]
 	ret
-	.p2align 4
+
+	/* Copy 0..3 bytes.  Use a branchless sequence that copies the same
+	   byte 3 times if count==1, or the 2nd byte twice if count==2.  */
 1:
 	cbz	count, 2f
+	lsr	tmp1, count, 1
 	ldrb	A_lw, [src]
-	tbz	count, 1, 1f
-	ldrh	A_hw, [srcend, -2]
-	strh	A_hw, [dstend, -2]
-1:	strb	A_lw, [dstin]
+	ldrb	A_hw, [srcend, -1]
+	ldrb	B_lw, [src, tmp1]
+	strb	A_lw, [dstin]
+	strb	B_lw, [dstin, tmp1]
+	strb	A_hw, [dstend, -1]
 2:	ret
 
 	.p2align 4
-	/* Medium copies: 17..96 bytes.	 */
-L(copy_medium):
-	ldp	A_l, A_h, [src]
-	tbnz	count, 6, L(copy96)
-	ldp	D_l, D_h, [srcend, -16]
-	tbz	count, 5, 1f
-	ldp	B_l, B_h, [src, 16]
-	ldp	C_l, C_h, [srcend, -32]
-	stp	B_l, B_h, [dstin, 16]
-	stp	C_l, C_h, [dstend, -32]
-1:
-	stp	A_l, A_h, [dstin]
-	stp	D_l, D_h, [dstend, -16]
-	ret
-
-	.p2align 4
 	/* Copy 64..96 bytes.  Copy 64 bytes from the start and
 	   32 bytes from the end.  */
 L(copy96):
-- 
1.9.1

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Fw: [PATCH][AArch64] Tune memcpy
  2016-05-19 11:34   ` Wilco Dijkstra
@ 2016-06-03 12:37     ` Wilco Dijkstra
  2016-06-21 13:34       ` Wilco Dijkstra
  0 siblings, 1 reply; 8+ messages in thread
From: Wilco Dijkstra @ 2016-06-03 12:37 UTC (permalink / raw)
  To: Marcus Shawcroft; +Cc: nd, 'GNU C Library'



ping

________________________________________
From: Wilco Dijkstra
Sent: 12 May 2016 15:01
To: 'GNU C Library'; Marcus Shawcroft
Cc: nd
Subject: Re: [PATCH][AArch64] Tune memcpy

ping


-----Original Message-----
From: Wilco Dijkstra [mailto:Wilco.Dijkstra@arm.com]
Sent: 19 November 2015 12:35
To: 'GNU C Library'
Subject: [PATCH][AArch64] Tune memcpy

This patch further tunes memcpy - avoid one branch for sizes 1-3, add a prefetch and improve small copies that are exact powers of 2.

OK for commit? (depends on https://sourceware.org/ml/libc-alpha/2015-09/msg00633.html )

ChangeLog:
2015-11-19  Wilco Dijkstra  <wdijkstr@arm.com>

        * sysdeps/aarch64/memcpy.S (memcpy):
        Further tuning for performance.


---
 sysdeps/aarch64/memcpy.S | 56 +++++++++++++++++++++++++++---------------------
 1 file changed, 32 insertions(+), 24 deletions(-)

diff --git a/sysdeps/aarch64/memcpy.S b/sysdeps/aarch64/memcpy.S
index 51e7268..6b8610e 100644
--- a/sysdeps/aarch64/memcpy.S
+++ b/sysdeps/aarch64/memcpy.S
@@ -35,6 +35,7 @@
 #define A_h    x7
 #define A_hw   w7
 #define B_l    x8
+#define B_lw   w8
 #define B_h    x9
 #define C_l    x10
 #define C_h    x11
@@ -70,21 +71,40 @@ END (memmove)
 libc_hidden_builtin_def (memmove)
 ENTRY (memcpy)

+       prfm    PLDL1KEEP, [src]
        add     srcend, src, count
        add     dstend, dstin, count
+       cmp     count, 16
+       b.ls    L(copy16)
        cmp     count, 96
        b.hi    L(copy_long)
-       cmp     count, 16
-       b.hs    L(copy_medium)

+       /* Medium copies: 17..96 bytes.  */
+       sub     tmp1, count, 1
+       ldp     A_l, A_h, [src]
+       tbnz    tmp1, 6, L(copy96)
+       ldp     D_l, D_h, [srcend, -16]
+       tbz     tmp1, 5, 1f
+       ldp     B_l, B_h, [src, 16]
+       ldp     C_l, C_h, [srcend, -32]
+       stp     B_l, B_h, [dstin, 16]
+       stp     C_l, C_h, [dstend, -32]
+1:
+       stp     A_l, A_h, [dstin]
+       stp     D_l, D_h, [dstend, -16]
+       ret
+
+       .p2align 4
        /* Small copies: 0..16 bytes.  */
 L(copy16):
-       tbz     count, 3, 1f
+       cmp     count, 8
+       b.lo    1f
        ldr     A_l, [src]
        ldr     A_h, [srcend, -8]
        str     A_l, [dstin]
        str     A_h, [dstend, -8]
        ret
+       .p2align 4
 1:
        tbz     count, 2, 1f
        ldr     A_lw, [src]
@@ -92,33 +112,21 @@ L(copy16):
        str     A_lw, [dstin]
        str     A_hw, [dstend, -4]
        ret
-       .p2align 4
+
+       /* Copy 0..3 bytes.  Use a branchless sequence that copies the same
+          byte 3 times if count==1, or the 2nd byte twice if count==2.  */
 1:
        cbz     count, 2f
+       lsr     tmp1, count, 1
        ldrb    A_lw, [src]
-       tbz     count, 1, 1f
-       ldrh    A_hw, [srcend, -2]
-       strh    A_hw, [dstend, -2]
-1:     strb    A_lw, [dstin]
+       ldrb    A_hw, [srcend, -1]
+       ldrb    B_lw, [src, tmp1]
+       strb    A_lw, [dstin]
+       strb    B_lw, [dstin, tmp1]
+       strb    A_hw, [dstend, -1]
 2:     ret

        .p2align 4
-       /* Medium copies: 17..96 bytes.  */
-L(copy_medium):
-       ldp     A_l, A_h, [src]
-       tbnz    count, 6, L(copy96)
-       ldp     D_l, D_h, [srcend, -16]
-       tbz     count, 5, 1f
-       ldp     B_l, B_h, [src, 16]
-       ldp     C_l, C_h, [srcend, -32]
-       stp     B_l, B_h, [dstin, 16]
-       stp     C_l, C_h, [dstend, -32]
-1:
-       stp     A_l, A_h, [dstin]
-       stp     D_l, D_h, [dstend, -16]
-       ret
-
-       .p2align 4
        /* Copy 64..96 bytes.  Copy 64 bytes from the start and
           32 bytes from the end.  */
 L(copy96):
--
1.9.1

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH][AArch64] Tune memcpy
  2016-06-03 12:37     ` Fw: " Wilco Dijkstra
@ 2016-06-21 13:34       ` Wilco Dijkstra
  0 siblings, 0 replies; 8+ messages in thread
From: Wilco Dijkstra @ 2016-06-21 13:34 UTC (permalink / raw)
  To: Marcus Shawcroft; +Cc: nd, 'GNU C Library'

 ping

________________________________________
From: Wilco Dijkstra
Sent: 12 May 2016 15:01
To: 'GNU C Library'; Marcus Shawcroft
Cc: nd
Subject: Re: [PATCH][AArch64] Tune memcpy

ping


-----Original Message-----
From: Wilco Dijkstra [mailto:Wilco.Dijkstra@arm.com]
Sent: 19 November 2015 12:35
To: 'GNU C Library'
Subject: [PATCH][AArch64] Tune memcpy

This patch further tunes memcpy - avoid one branch for sizes 1-3, add a prefetch and improve small copies that are exact powers of 2.

OK for commit? (depends on  https://sourceware.org/ml/libc-alpha/2015-09/msg00633.html )

ChangeLog:
2015-11-19  Wilco Dijkstra  <wdijkstr@arm.com>

        * sysdeps/aarch64/memcpy.S (memcpy):
        Further tuning for performance.


---
 sysdeps/aarch64/memcpy.S | 56 +++++++++++++++++++++++++++---------------------
 1 file changed, 32 insertions(+), 24 deletions(-)

diff --git a/sysdeps/aarch64/memcpy.S b/sysdeps/aarch64/memcpy.S
index 51e7268..6b8610e 100644
--- a/sysdeps/aarch64/memcpy.S
+++ b/sysdeps/aarch64/memcpy.S
@@ -35,6 +35,7 @@
 #define A_h    x7
 #define A_hw   w7
 #define B_l    x8
+#define B_lw   w8
 #define B_h    x9
 #define C_l    x10
 #define C_h    x11
@@ -70,21 +71,40 @@ END (memmove)
 libc_hidden_builtin_def (memmove)
 ENTRY (memcpy)

+       prfm    PLDL1KEEP, [src]
        add     srcend, src, count
        add     dstend, dstin, count
+       cmp     count, 16
+       b.ls    L(copy16)
        cmp     count, 96
        b.hi    L(copy_long)
-       cmp     count, 16
-       b.hs    L(copy_medium)

+       /* Medium copies: 17..96 bytes.  */
+       sub     tmp1, count, 1
+       ldp     A_l, A_h, [src]
+       tbnz    tmp1, 6, L(copy96)
+       ldp     D_l, D_h, [srcend, -16]
+       tbz     tmp1, 5, 1f
+       ldp     B_l, B_h, [src, 16]
+       ldp     C_l, C_h, [srcend, -32]
+       stp     B_l, B_h, [dstin, 16]
+       stp     C_l, C_h, [dstend, -32]
+1:
+       stp     A_l, A_h, [dstin]
+       stp     D_l, D_h, [dstend, -16]
+       ret
+
+       .p2align 4
        /* Small copies: 0..16 bytes.  */
 L(copy16):
-       tbz     count, 3, 1f
+       cmp     count, 8
+       b.lo    1f
        ldr     A_l, [src]
        ldr     A_h, [srcend, -8]
        str     A_l, [dstin]
        str     A_h, [dstend, -8]
        ret
+       .p2align 4
 1:
        tbz     count, 2, 1f
        ldr     A_lw, [src]
@@ -92,33 +112,21 @@ L(copy16):
        str     A_lw, [dstin]
        str     A_hw, [dstend, -4]
        ret
-       .p2align 4
+
+       /* Copy 0..3 bytes.  Use a branchless sequence that copies the same
+          byte 3 times if count==1, or the 2nd byte twice if count==2.  */
 1:
        cbz     count, 2f
+       lsr     tmp1, count, 1
        ldrb    A_lw, [src]
-       tbz     count, 1, 1f
-       ldrh    A_hw, [srcend, -2]
-       strh    A_hw, [dstend, -2]
-1:     strb    A_lw, [dstin]
+       ldrb    A_hw, [srcend, -1]
+       ldrb    B_lw, [src, tmp1]
+       strb    A_lw, [dstin]
+       strb    B_lw, [dstin, tmp1]
+       strb    A_hw, [dstend, -1]
 2:     ret

        .p2align 4
-       /* Medium copies: 17..96 bytes.  */
-L(copy_medium):
-       ldp     A_l, A_h, [src]
-       tbnz    count, 6, L(copy96)
-       ldp     D_l, D_h, [srcend, -16]
-       tbz     count, 5, 1f
-       ldp     B_l, B_h, [src, 16]
-       ldp     C_l, C_h, [srcend, -32]
-       stp     B_l, B_h, [dstin, 16]
-       stp     C_l, C_h, [dstend, -32]
-1:
-       stp     A_l, A_h, [dstin]
-       stp     D_l, D_h, [dstend, -16]
-       ret
-
-       .p2align 4
        /* Copy 64..96 bytes.  Copy 64 bytes from the start and
           32 bytes from the end.  */
 L(copy96):
--
1.9.1



^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH][AArch64] Tune memcpy
  2015-11-19 12:35 Wilco Dijkstra
@ 2016-06-22  7:47 ` Marcus Shawcroft
  0 siblings, 0 replies; 8+ messages in thread
From: Marcus Shawcroft @ 2016-06-22  7:47 UTC (permalink / raw)
  To: Wilco Dijkstra; +Cc: GNU C Library

On 19 November 2015 at 12:34, Wilco Dijkstra <Wilco.Dijkstra@arm.com> wrote:
> This patch further tunes memcpy - avoid one branch for sizes 1-3, add a
> prefetch and improve small copies that are exact powers of 2.
>
> OK for commit? (depends on
> https://sourceware.org/ml/libc-alpha/2015-09/msg00633.html )
>
> ChangeLog:
> 2015-11-19  Wilco Dijkstra  <wdijkstr@arm.com>
>
>         * sysdeps/aarch64/memcpy.S (memcpy):
>         Further tuning for performance.

OK /Marcus

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PATCH][AArch64] Tune memcpy
@ 2015-11-19 12:35 Wilco Dijkstra
  2016-06-22  7:47 ` Marcus Shawcroft
  0 siblings, 1 reply; 8+ messages in thread
From: Wilco Dijkstra @ 2015-11-19 12:35 UTC (permalink / raw)
  To: 'GNU C Library'

This patch further tunes memcpy: it avoids one branch for sizes 1-3, adds a
prefetch, and improves small copies whose size is an exact power of 2.

OK for commit? (depends on
https://sourceware.org/ml/libc-alpha/2015-09/msg00633.html )

ChangeLog:
2015-11-19  Wilco Dijkstra  <wdijkstr@arm.com>

	* sysdeps/aarch64/memcpy.S (memcpy):
	Further tuning for performance.


---
 sysdeps/aarch64/memcpy.S | 56 +++++++++++++++++++++++++++---------------------
 1 file changed, 32 insertions(+), 24 deletions(-)

diff --git a/sysdeps/aarch64/memcpy.S b/sysdeps/aarch64/memcpy.S
index 51e7268..6b8610e 100644
--- a/sysdeps/aarch64/memcpy.S
+++ b/sysdeps/aarch64/memcpy.S
@@ -35,6 +35,7 @@
 #define A_h	x7
 #define A_hw	w7
 #define B_l	x8
+#define B_lw	w8
 #define B_h	x9
 #define C_l	x10
 #define C_h	x11
@@ -70,21 +71,40 @@ END (memmove)
 libc_hidden_builtin_def (memmove)
 ENTRY (memcpy)
 
+	prfm	PLDL1KEEP, [src]
 	add	srcend, src, count
 	add	dstend, dstin, count
+	cmp	count, 16
+	b.ls	L(copy16)
 	cmp	count, 96
 	b.hi	L(copy_long)
-	cmp	count, 16
-	b.hs	L(copy_medium)
 
+	/* Medium copies: 17..96 bytes.  */
+	sub	tmp1, count, 1
+	ldp	A_l, A_h, [src]
+	tbnz	tmp1, 6, L(copy96)
+	ldp	D_l, D_h, [srcend, -16]
+	tbz	tmp1, 5, 1f
+	ldp	B_l, B_h, [src, 16]
+	ldp	C_l, C_h, [srcend, -32]
+	stp	B_l, B_h, [dstin, 16]
+	stp	C_l, C_h, [dstend, -32]
+1:
+	stp	A_l, A_h, [dstin]
+	stp	D_l, D_h, [dstend, -16]
+	ret
+
+	.p2align 4
 	/* Small copies: 0..16 bytes.  */
 L(copy16):
-	tbz	count, 3, 1f
+	cmp	count, 8
+	b.lo	1f
 	ldr	A_l, [src]
 	ldr	A_h, [srcend, -8]
 	str	A_l, [dstin]
 	str	A_h, [dstend, -8]
 	ret
+	.p2align 4
 1:
 	tbz	count, 2, 1f
 	ldr	A_lw, [src]
@@ -92,33 +112,21 @@ L(copy16):
 	str	A_lw, [dstin]
 	str	A_hw, [dstend, -4]
 	ret
-	.p2align 4
+
+	/* Copy 0..3 bytes.  Use a branchless sequence that copies the same
+	   byte 3 times if count==1, or the 2nd byte twice if count==2.  */
 1:
 	cbz	count, 2f
+	lsr	tmp1, count, 1
 	ldrb	A_lw, [src]
-	tbz	count, 1, 1f
-	ldrh	A_hw, [srcend, -2]
-	strh	A_hw, [dstend, -2]
-1:	strb	A_lw, [dstin]
+	ldrb	A_hw, [srcend, -1]
+	ldrb	B_lw, [src, tmp1]
+	strb	A_lw, [dstin]
+	strb	B_lw, [dstin, tmp1]
+	strb	A_hw, [dstend, -1]
 2:	ret
 
 	.p2align 4
-	/* Medium copies: 17..96 bytes.	 */
-L(copy_medium):
-	ldp	A_l, A_h, [src]
-	tbnz	count, 6, L(copy96)
-	ldp	D_l, D_h, [srcend, -16]
-	tbz	count, 5, 1f
-	ldp	B_l, B_h, [src, 16]
-	ldp	C_l, C_h, [srcend, -32]
-	stp	B_l, B_h, [dstin, 16]
-	stp	C_l, C_h, [dstend, -32]
-1:
-	stp	A_l, A_h, [dstin]
-	stp	D_l, D_h, [dstend, -16]
-	ret
-
-	.p2align 4
 	/* Copy 64..96 bytes.  Copy 64 bytes from the start and
 	   32 bytes from the end.  */
 L(copy96):
-- 
1.9.1



^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2016-06-22  7:47 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-12-15 16:43 [PATCH][AArch64] Tune memcpy Wilco Dijkstra
2016-04-15 12:44 ` Wilco Dijkstra
2016-05-12 14:02 ` Wilco Dijkstra
2016-05-19 11:34   ` Wilco Dijkstra
2016-06-03 12:37     ` Fw: " Wilco Dijkstra
2016-06-21 13:34       ` Wilco Dijkstra
  -- strict thread matches above, loose matches on Subject: below --
2015-11-19 12:35 Wilco Dijkstra
2016-06-22  7:47 ` Marcus Shawcroft

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).