public inbox for libc-ports@sourceware.org
 help / color / mirror / Atom feed
* [PATCH] Optimize MIPS memcpy
@ 2012-09-01  6:16 Maxim Kuvyrkov
  2012-09-01 16:37 ` Joseph S. Myers
  2012-09-03  9:12 ` Andrew T Pinski
  0 siblings, 2 replies; 50+ messages in thread
From: Maxim Kuvyrkov @ 2012-09-01  6:16 UTC (permalink / raw)
  To: libc-ports; +Cc: Joseph S. Myers

[-- Attachment #1: Type: text/plain, Size: 1027 bytes --]

This patch improves the MIPS assembly implementations of memcpy.  Two optimizations are added: prefetching of data for subsequent iterations of the memcpy loop, and pipelined expansion of the unaligned memcpy loop.  These optimizations speed up MIPS memcpy by about 10%.

The prefetching part is straightforward: it adds prefetching of a cache line (32 bytes) one iteration ahead (+1) for the unaligned case and two iterations ahead (+2) for the aligned case.  The rationale is that a prefetch takes about as long to acquire its data as 1 iteration of the unaligned loop or 2 iterations of the aligned loop.  Values for these parameters were tuned on a modern MIPS processor.

The pipelined expansion of the unaligned loop is implemented in a fashion similar to the expansion of the aligned loop.  The assembly is tricky, but it works.

These changes are almost 3 years old, and have been thoroughly tested in CodeSourcery MIPS toolchains.  Retested with current trunk with no regressions for n32, n64 and o32 ABIs.

OK to apply?

--
Maxim Kuvyrkov
Mentor Graphics



[-- Attachment #2: 0001-Optimize-MIPS-memcpy.patch --]
[-- Type: application/octet-stream, Size: 7992 bytes --]

From 689030542b798d0ac711b55b1363a37729be9ad4 Mon Sep 17 00:00:00 2001
From: Maxim Kuvyrkov <maxim@codesourcery.com>
Date: Fri, 31 Aug 2012 21:45:41 -0700
Subject: [PATCH 1/2] Optimize MIPS memcpy

	* sysdeps/mips/memcpy.S, sysdeps/mips/mips64/memcpy.S: Add prefetching
	and optimize unaligned case.
	(PREFETCH_ITERATION_OFFSET_ALIGNED)
	(PREFETCH_ITERATION_OFFSET_MISALIGNED): Define.
	* sysdeps/mips/sys/asm.h (PREFETCH): New macro.
---
 ports/sysdeps/mips/memcpy.S        |  105 ++++++++++++++++++++++++++++++++---
 ports/sysdeps/mips/mips64/memcpy.S |  107 +++++++++++++++++++++++++++++++++---
 ports/sysdeps/mips/sys/asm.h       |    8 +++
 3 files changed, 203 insertions(+), 17 deletions(-)

diff --git a/ports/sysdeps/mips/memcpy.S b/ports/sysdeps/mips/memcpy.S
index 753f67c..55b000d 100644
--- a/ports/sysdeps/mips/memcpy.S
+++ b/ports/sysdeps/mips/memcpy.S
@@ -17,6 +17,7 @@
    <http://www.gnu.org/licenses/>.  */
 
 #include <sysdep.h>
+#include <sys/asm.h>
 
 
 /* void *memcpy(void *s1, const void *s2, size_t n);  */
@@ -33,6 +34,12 @@
 #  define SWLO	swl		/* low part is left in little-endian	*/
 #endif
 
+/* 32-byte prefetch size assumed.  */
+/* Prefetch data for (current iteration + PREFETCH_ITERATION_OFFSET) during
+   current iteration.  */
+#define PREFETCH_ITERATION_OFFSET_ALIGNED 2 /* TUNING KNOB */
+#define PREFETCH_ITERATION_OFFSET_MISALIGNED 1 /* TUNING KNOB */
+
 ENTRY (memcpy)
 	.set	noreorder
 
@@ -67,6 +74,7 @@ L(lop8w):
 	lw	t5, 20(a1)
 	lw	t6, 24(a1)
 	lw	t7, 28(a1)
+	PREFETCH (4, 32+PREFETCH_ITERATION_OFFSET_ALIGNED*32, a1)
 	addiu	a0, 32
 	addiu	a1, 32
 	sw	t0, -32(a0)
@@ -76,8 +84,9 @@ L(lop8w):
 	sw	t4, -16(a0)
 	sw	t5, -12(a0)
 	sw	t6,  -8(a0)
-	bne	a1, a3, L(lop8w)
 	sw	t7,  -4(a0)
+	bne	a1, a3, L(lop8w)
+	PREFETCH (5, 0+PREFETCH_ITERATION_OFFSET_ALIGNED*32, a0)
 
 L(chk1w):	
 	andi	t0, a2, 0x3		# 4 or more bytes left?
@@ -116,19 +125,97 @@ L(shift):
 	SWHI	t0, 0(a0)
 	addu	a0, a3
 L(shft1):	
-	andi	t0, a2, 0x3
+	andi	t0, a2, 0x1f
+	beq	t0, a2, L(shfth_last)
 	subu	a3, a2, t0
 	addu	a3, a1
-L(shfth):	
-	LWHI	t1, 0(a1)		# Limp through, word by word
-	LWLO	t1, 3(a1)
-	addiu	a0, 4
-	addiu	a1, 4
+	move	a2, t0
+L(shfth):
+	LWHI	t0,  0(a1)
+	LWHI	t1,  4(a1)
+	LWHI	t2,  8(a1)
+	LWHI	t3, 12(a1)
+	LWHI	t4, 16(a1)
+	LWHI	t5, 20(a1)
+	LWHI	t6, 24(a1)
+	LWHI	t7, 28(a1)
+	LWLO	t0,  3(a1)
+	LWLO	t1,  7(a1)
+	LWLO	t2, 11(a1)
+	LWLO	t3, 15(a1)
+	LWLO	t4, 19(a1)
+	LWLO	t5, 23(a1)
+	LWLO	t6, 27(a1)
+	LWLO	t7, 31(a1)
+	PREFETCH (4, 32+PREFETCH_ITERATION_OFFSET_MISALIGNED*32, a1)
+	addiu	a0, 32
+	addiu	a1, 32
+	sw	t0, -32(a0)
+	sw	t1, -28(a0)
+	sw	t2, -24(a0)
+	sw	t3, -20(a0)
+	sw	t4, -16(a0)
+	sw	t5, -12(a0)
+	sw	t6,  -8(a0)
+	sw	t7,  -4(a0)
 	bne	a1, a3, L(shfth)
-	sw	t1, -4(a0)
-	b	L(last8)		# Handle anything which may be left
+	PREFETCH (5, PREFETCH_ITERATION_OFFSET_MISALIGNED*32, a0)
+
+L(shfth_last):
+	andi	t0, a2, 0x3
+	beq	t0, a2, L(last8)
+	subu	t8, a2, t0
+	move	t7, ra
+
+	bal	1f
+	addiu	a3, t8, 1f - L(shfth_lwhi)
+1:	subu	a3, ra, a3
+	jr	a3
 	move	a2, t0
 
+	LWHI	t6, 24(a1)
+	LWHI	t5, 20(a1)
+	LWHI	t4, 16(a1)
+	LWHI	t3, 12(a1)
+	LWHI	t2,  8(a1)
+	LWHI	t1,  4(a1)
+	LWHI	t0,  0(a1)
+L(shfth_lwhi):
+
+	bal	1f
+	addiu	a3, t8, 1f - L(shfth_lwlo)
+1:	subu	a3, ra, a3
+	jr	a3
+	nop
+
+	LWLO	t6, 27(a1)
+	LWLO	t5, 23(a1)
+	LWLO	t4, 19(a1)
+	LWLO	t3, 15(a1)
+	LWLO	t2, 11(a1)
+	LWLO	t1,  7(a1)
+	LWLO	t0,  3(a1)
+L(shfth_lwlo):
+
+	bal	1f
+	addiu	a3, t8, 1f - L(shfth_sw)
+1:	subu	a3, ra, a3
+	jr	a3
+	addu	a1, t8
+
+	sw	t6, 24(a0)
+	sw	t5, 20(a0)
+	sw	t4, 16(a0)
+	sw	t3, 12(a0)
+	sw	t2,  8(a0)
+	sw	t1,  4(a0)
+	sw	t0,  0(a0)
+L(shfth_sw):
+
+	move	ra, t7
+	b	L(last8)		# Handle last 3 bytes
+	addu	a0, t8
+
 	.set	reorder
 END (memcpy)
 libc_hidden_builtin_def (memcpy)
diff --git a/ports/sysdeps/mips/mips64/memcpy.S b/ports/sysdeps/mips/mips64/memcpy.S
index 49ef34d..3a5b33c 100644
--- a/ports/sysdeps/mips/mips64/memcpy.S
+++ b/ports/sysdeps/mips/mips64/memcpy.S
@@ -37,6 +37,12 @@
 #  define SDLO	sdl		/* low part is left in little-endian	*/
 #endif
 
+/* 32-byte prefetch size assumed.  */
+/* Prefetch data for (current iteration + PREFETCH_ITERATION_OFFSET) during
+   current iteration.  */
+#define PREFETCH_ITERATION_OFFSET_ALIGNED 2 /* TUNING KNOB */
+#define PREFETCH_ITERATION_OFFSET_MISALIGNED 1 /* TUNING KNOB */
+
 ENTRY (memcpy)
 	.set	noreorder
 
@@ -67,21 +73,25 @@ L(lop8w):
 	ld	t1,  8(a1)
 	ld	t2, 16(a1)
 	ld	t3, 24(a1)
+	PREFETCH (4, 32+PREFETCH_ITERATION_OFFSET_ALIGNED*32, a1)
 	ld	ta0, 32(a1)
 	ld	ta1, 40(a1)
 	ld	ta2, 48(a1)
 	ld	ta3, 56(a1)
+	PREFETCH (4, 64+PREFETCH_ITERATION_OFFSET_ALIGNED*32, a1)
 	PTR_ADDIU a0, 64
 	PTR_ADDIU a1, 64
 	sd	t0, -64(a0)
 	sd	t1, -56(a0)
 	sd	t2, -48(a0)
 	sd	t3, -40(a0)
+	PREFETCH (5, -32+PREFETCH_ITERATION_OFFSET_ALIGNED*32, a0)
 	sd	ta0, -32(a0)
 	sd	ta1, -24(a0)
 	sd	ta2, -16(a0)
-	bne	a1, a3, L(lop8w)
 	sd	ta3,  -8(a0)
+	bne	a1, a3, L(lop8w)
+	PREFETCH (5, 0+PREFETCH_ITERATION_OFFSET_ALIGNED*32, a0)
 
 L(chk1w):
 	andi	t0, a2, 0x7		# 8 or more bytes left?
@@ -120,19 +130,100 @@ L(shift):
 	SDHI	t0, 0(a0)
 	PTR_ADDU a0, a3
 L(shft1):
-	andi	t0, a2, 0x7
+	andi	t0, a2, 0x3f
+	beq	t0, a2, L(shfth_last)
 	PTR_SUBU a3, a2, t0
 	PTR_ADDU a3, a1
+	move	a2, t0
 L(shfth):
-	LDHI	t1, 0(a1)		# Limp through, dword by dword
-	LDLO	t1, 7(a1)
-	PTR_ADDIU a0, 8
-	PTR_ADDIU a1, 8
+	LDHI	t0,  0(a1)
+	LDHI	t1,  8(a1)
+	LDHI	t2, 16(a1)
+	LDHI	t3, 24(a1)
+	LDHI	ta0, 32(a1)
+	LDHI	ta1, 40(a1)
+	LDHI	ta2, 48(a1)
+	LDHI	ta3, 56(a1)
+	PREFETCH (4, 32+PREFETCH_ITERATION_OFFSET_MISALIGNED*32, a1)
+	LDLO	t0,  7(a1)
+	LDLO	t1, 15(a1)
+	LDLO	t2, 23(a1)
+	LDLO	t3, 31(a1)
+	LDLO	ta0, 39(a1)
+	LDLO	ta1, 47(a1)
+	LDLO	ta2, 55(a1)
+	LDLO	ta3, 63(a1)
+	PREFETCH (4, 64+PREFETCH_ITERATION_OFFSET_MISALIGNED*32, a1)
+	PTR_ADDIU a0, 64
+	PTR_ADDIU a1, 64
+	sd	t0, -64(a0)
+	sd	t1, -56(a0)
+	sd	t2, -48(a0)
+	sd	t3, -40(a0)
+	PREFETCH (5, -32+PREFETCH_ITERATION_OFFSET_MISALIGNED*32, a0)
+	sd	ta0, -32(a0)
+	sd	ta1, -24(a0)
+	sd	ta2, -16(a0)
+	sd	ta3,  -8(a0)
 	bne	a1, a3, L(shfth)
-	sd	t1, -8(a0)
-	b	L(last16)		# Handle anything which may be left
+	PREFETCH (5, 0+PREFETCH_ITERATION_OFFSET_MISALIGNED*32, a0)
+
+L(shfth_last):
+	andi	t0, a2, 0x7
+	beq	t0, a2, L(last16)
+	PTR_SUBU t8, a2, t0
+	PTR_SRL t9, t8, 1
+	move	ta3, ra
+
+	bal	1f
+	PTR_ADDIU a3, t9, 1f - L(shfth_ldhi)
+1:	PTR_SUBU a3, ra, a3
+	jr	a3
 	move	a2, t0
 
+	LDHI	ta2, 48(a1)
+	LDHI	ta1, 40(a1)
+	LDHI	ta0, 32(a1)
+	LDHI	t3, 24(a1)
+	LDHI	t2, 16(a1)
+	LDHI	t1,  8(a1)
+	LDHI	t0,  0(a1)
+L(shfth_ldhi):
+
+	bal	1f
+	PTR_ADDIU a3, t9, 1f - L(shfth_ldlo)
+1:	PTR_SUBU a3, ra, a3
+	jr	a3
+	nop
+
+	LDLO	ta2, 55(a1)
+	LDLO	ta1, 47(a1)
+	LDLO	ta0, 39(a1)
+	LDLO	t3, 31(a1)
+	LDLO	t2, 23(a1)
+	LDLO	t1, 15(a1)
+	LDLO	t0,  7(a1)
+L(shfth_ldlo):
+
+	bal	1f
+	PTR_ADDIU a3, t9, 1f - L(shfth_sd)
+1:	PTR_SUBU a3, ra, a3
+	jr	a3
+	PTR_ADDU a1, t8
+
+	sd	ta2, 48(a0)
+	sd	ta1, 40(a0)
+	sd	ta0, 32(a0)
+	sd	t3, 24(a0)
+	sd	t2, 16(a0)
+	sd	t1,  8(a0)
+	sd	t0,  0(a0)
+L(shfth_sd):
+
+	move	ra, ta3
+	b	L(last16)		# Handle last 7 bytes
+	PTR_ADDU a0, t8
+
 	.set	reorder
 END (memcpy)
 libc_hidden_builtin_def (memcpy)
diff --git a/ports/sysdeps/mips/sys/asm.h b/ports/sysdeps/mips/sys/asm.h
index 0f5edf9..e4057e5 100644
--- a/ports/sysdeps/mips/sys/asm.h
+++ b/ports/sysdeps/mips/sys/asm.h
@@ -482,4 +482,12 @@ symbol		=	value
 # define MIPS_SYNC	sync
 #endif
 
+#if _MIPS_ISA == _MIPS_ISA_MIPS32 || _MIPS_ISA == _MIPS_ISA_MIPS64
+# define PREFETCH(hint, offset, reg) pref hint, offset(reg)
+#else
+/* Don't leave assembler prefetch undefined to avoid surprises from
+   delay slot placement.  */
+# define PREFETCH(hint, offset, reg) nop
+#endif
+
 #endif /* sys/asm.h */
-- 
1.7.0.4


^ permalink raw reply	[flat|nested] 50+ messages in thread

end of thread, other threads:[~2012-12-19 16:59 UTC | newest]

Thread overview: 50+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-09-01  6:16 [PATCH] Optimize MIPS memcpy Maxim Kuvyrkov
2012-09-01 16:37 ` Joseph S. Myers
2012-09-03  9:12 ` Andrew T Pinski
2012-09-03 17:12   ` Maxim Kuvyrkov
2012-09-04 15:09   ` Steve Ellcey
2012-09-04 15:14     ` Carlos O'Donell
2012-09-04 17:03       ` Steve Ellcey
2012-09-04 17:28         ` Carlos O'Donell
2012-09-05  0:43     ` Maxim Kuvyrkov
2012-09-06 16:25       ` Steve Ellcey
2012-09-06 18:43         ` Roland McGrath
2012-09-06 19:37           ` Steve Ellcey
2012-09-07 21:24         ` Maxim Kuvyrkov
2012-09-11  4:35         ` Maxim Kuvyrkov
2012-09-11 15:18           ` Steve Ellcey
2012-09-20  9:05             ` Maxim Kuvyrkov
2012-09-20 18:38               ` Steve Ellcey
2012-09-28  3:48                 ` Maxim Kuvyrkov
2012-10-06  4:43                   ` Maxim Kuvyrkov
2012-10-08 17:04                     ` Steve Ellcey
2012-10-08 22:31                       ` Maxim Kuvyrkov
2012-10-09 20:50                         ` Steve Ellcey
2012-10-15 17:49                         ` Steve Ellcey
2012-10-15 20:20                           ` Andrew Pinski
2012-10-15 20:34                             ` Steve Ellcey
2012-10-15 20:42                               ` Andrew Pinski
2012-10-15 20:50                                 ` Andrew Pinski
2012-10-15 21:36                                   ` Steve Ellcey
2012-10-15 21:47                                     ` Maxim Kuvyrkov
2012-10-17 17:30                                       ` Steve Ellcey
2012-10-29 18:00                                         ` Steve Ellcey
2012-10-29 18:03                                           ` Maxim Kuvyrkov
2012-10-30  7:16                                           ` Maxim Kuvyrkov
2012-10-30  7:19                                             ` Maxim Kuvyrkov
2012-10-30 17:46                                             ` Steve Ellcey
2012-10-30 21:56                                               ` Maxim Kuvyrkov
2012-10-30 22:19                                                 ` Steve Ellcey
2012-12-19  1:51                                                   ` Maxim Kuvyrkov
2012-12-19 16:59                                                     ` Steve Ellcey
2012-10-31 19:27                                         ` Andreas Jaeger
2012-10-31 20:04                                           ` Steve Ellcey
2012-10-15 22:10                                     ` Joseph S. Myers
2012-10-15 21:29                               ` Maciej W. Rozycki
2012-10-15 22:05                           ` Maxim Kuvyrkov
2012-09-21 18:47               ` Steve Ellcey
2012-09-21 18:57                 ` Joseph S. Myers
2012-09-21 20:41                   ` [PATCH] Optimize MIPS memcpy (mips glibc test results) Steve Ellcey
2012-09-21 20:49                     ` Joseph S. Myers
2012-09-21 20:56                       ` Steve Ellcey
2012-09-21 19:12                 ` [PATCH] Optimize MIPS memcpy Maxim Kuvyrkov

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).