public inbox for libc-ports@sourceware.org
* [PATCH] Optimize MIPS memcpy
@ 2012-09-01  6:16 Maxim Kuvyrkov
  2012-09-01 16:37 ` Joseph S. Myers
  2012-09-03  9:12 ` Andrew T Pinski
  0 siblings, 2 replies; 50+ messages in thread
From: Maxim Kuvyrkov @ 2012-09-01  6:16 UTC (permalink / raw)
  To: libc-ports; +Cc: Joseph S. Myers

[-- Attachment #1: Type: text/plain, Size: 1027 bytes --]

This patch improves MIPS assembly implementations of memcpy.  Two optimizations are added: prefetching of data for subsequent iterations of memcpy loop and pipelined expansion of unaligned memcpy.  These optimizations speed up MIPS memcpy by about 10%.

The prefetching part is straightforward: it adds prefetching of a cache line (32 bytes) for the +1 iteration in the unaligned case and the +2 iteration in the aligned case.  The rationale is that a prefetch takes about the same time to acquire its data as 1 iteration of the unaligned loop or 2 iterations of the aligned loop.  The values of these parameters were tuned on a modern MIPS processor.
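
In rough C terms the aligned-loop scheme looks like the sketch below
(illustration only, not part of the patch; it uses GCC's
__builtin_prefetch and a made-up copy_aligned helper, with
PREFETCH_AHEAD playing the role of PREFETCH_ITERATION_OFFSET_ALIGNED
in the patch):

#include <stddef.h>

/* Illustrative sketch: copy 32 bytes per iteration and prefetch the
   source and destination lines that will be needed PREFETCH_AHEAD
   iterations from now.  */
#define LINE 32
#define PREFETCH_AHEAD 2	/* +2 iterations for the aligned loop */

static void copy_aligned (char *dst, const char *src, size_t n)
{
  size_t i, j;
  for (i = 0; i + LINE <= n; i += LINE)
    {
      __builtin_prefetch (src + i + PREFETCH_AHEAD * LINE, 0); /* read */
      __builtin_prefetch (dst + i + PREFETCH_AHEAD * LINE, 1); /* write */
      for (j = 0; j < LINE; j++)	/* stands in for the 8 lw/sw pairs */
	dst[i + j] = src[i + j];
    }
  for (j = i; j < n; j++)		/* copy the tail */
    dst[j] = src[j];
}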

The pipelined expansion of the unaligned loop is implemented in a fashion similar to the expansion of the aligned loop.  The assembly is tricky, but it works.

These changes are almost 3 years old, and have been thoroughly tested in CodeSourcery MIPS toolchains.  Retested with current trunk with no regressions for n32, n64 and o32 ABIs.

OK to apply?

--
Maxim Kuvyrkov
Mentor Graphics



[-- Attachment #2: 0001-Optimize-MIPS-memcpy.patch --]
[-- Type: application/octet-stream, Size: 7992 bytes --]

From 689030542b798d0ac711b55b1363a37729be9ad4 Mon Sep 17 00:00:00 2001
From: Maxim Kuvyrkov <maxim@codesourcery.com>
Date: Fri, 31 Aug 2012 21:45:41 -0700
Subject: [PATCH 1/2] Optimize MIPS memcpy

	* sysdeps/mips/memcpy.S, sysdeps/mips/mips64/memcpy.S: Add prefetching and
	optimize unaligned case.
	(PREFETCH_ITERATION_OFFSET_ALIGNED)
	(PREFETCH_ITERATION_OFFSET_MISALIGNED): Define.
	* sysdeps/mips/sys/asm.h (PREFETCH): New macro.
---
 ports/sysdeps/mips/memcpy.S        |  105 ++++++++++++++++++++++++++++++++---
 ports/sysdeps/mips/mips64/memcpy.S |  107 +++++++++++++++++++++++++++++++++---
 ports/sysdeps/mips/sys/asm.h       |    8 +++
 3 files changed, 203 insertions(+), 17 deletions(-)

diff --git a/ports/sysdeps/mips/memcpy.S b/ports/sysdeps/mips/memcpy.S
index 753f67c..55b000d 100644
--- a/ports/sysdeps/mips/memcpy.S
+++ b/ports/sysdeps/mips/memcpy.S
@@ -17,6 +17,7 @@
    <http://www.gnu.org/licenses/>.  */
 
 #include <sysdep.h>
+#include <sys/asm.h>
 
 
 /* void *memcpy(void *s1, const void *s2, size_t n);  */
@@ -33,6 +34,12 @@
 #  define SWLO	swl		/* low part is left in little-endian	*/
 #endif
 
+/* 32-byte prefetch size assumed.  */
+/* Prefetch data for (current iteration + PREFETCH_ITERATION_OFFSET) during
+   current iteration.  */
+#define PREFETCH_ITERATION_OFFSET_ALIGNED 2 /* TUNING KNOB */
+#define PREFETCH_ITERATION_OFFSET_MISALIGNED 1 /* TUNING KNOB */
+
 ENTRY (memcpy)
 	.set	noreorder
 
@@ -67,6 +74,7 @@ L(lop8w):
 	lw	t5, 20(a1)
 	lw	t6, 24(a1)
 	lw	t7, 28(a1)
+	PREFETCH (4, 32+PREFETCH_ITERATION_OFFSET_ALIGNED*32, a1)
 	addiu	a0, 32
 	addiu	a1, 32
 	sw	t0, -32(a0)
@@ -76,8 +84,9 @@ L(lop8w):
 	sw	t4, -16(a0)
 	sw	t5, -12(a0)
 	sw	t6,  -8(a0)
-	bne	a1, a3, L(lop8w)
 	sw	t7,  -4(a0)
+	bne	a1, a3, L(lop8w)
+	PREFETCH (5, 0+PREFETCH_ITERATION_OFFSET_ALIGNED*32, a0)
 
 L(chk1w):	
 	andi	t0, a2, 0x3		# 4 or more bytes left?
@@ -116,19 +125,97 @@ L(shift):
 	SWHI	t0, 0(a0)
 	addu	a0, a3
 L(shft1):	
-	andi	t0, a2, 0x3
+	andi	t0, a2, 0x1f
+	beq	t0, a2, L(shfth_last)
 	subu	a3, a2, t0
 	addu	a3, a1
-L(shfth):	
-	LWHI	t1, 0(a1)		# Limp through, word by word
-	LWLO	t1, 3(a1)
-	addiu	a0, 4
-	addiu	a1, 4
+	move	a2, t0
+L(shfth):
+	LWHI	t0,  0(a1)
+	LWHI	t1,  4(a1)
+	LWHI	t2,  8(a1)
+	LWHI	t3, 12(a1)
+	LWHI	t4, 16(a1)
+	LWHI	t5, 20(a1)
+	LWHI	t6, 24(a1)
+	LWHI	t7, 28(a1)
+	LWLO	t0,  3(a1)
+	LWLO	t1,  7(a1)
+	LWLO	t2, 11(a1)
+	LWLO	t3, 15(a1)
+	LWLO	t4, 19(a1)
+	LWLO	t5, 23(a1)
+	LWLO	t6, 27(a1)
+	LWLO	t7, 31(a1)
+	PREFETCH (4, 32+PREFETCH_ITERATION_OFFSET_MISALIGNED*32, a1)
+	addiu	a0, 32
+	addiu	a1, 32
+	sw	t0, -32(a0)
+	sw	t1, -28(a0)
+	sw	t2, -24(a0)
+	sw	t3, -20(a0)
+	sw	t4, -16(a0)
+	sw	t5, -12(a0)
+	sw	t6,  -8(a0)
+	sw	t7,  -4(a0)
 	bne	a1, a3, L(shfth)
-	sw	t1, -4(a0)
-	b	L(last8)		# Handle anything which may be left
+	PREFETCH (5, PREFETCH_ITERATION_OFFSET_MISALIGNED*32, a0)
+
+L(shfth_last):
+	andi	t0, a2, 0x3
+	beq	t0, a2, L(last8)
+	subu	t8, a2, t0
+	move	t7, ra
+
+	bal	1f
+	addiu	a3, t8, 1f - L(shfth_lwhi)
+1:	subu	a3, ra, a3
+	jr	a3
 	move	a2, t0
 
+	LWHI	t6, 24(a1)
+	LWHI	t5, 20(a1)
+	LWHI	t4, 16(a1)
+	LWHI	t3, 12(a1)
+	LWHI	t2,  8(a1)
+	LWHI	t1,  4(a1)
+	LWHI	t0,  0(a1)
+L(shfth_lwhi):
+
+	bal	1f
+	addiu	a3, t8, 1f - L(shfth_lwlo)
+1:	subu	a3, ra, a3
+	jr	a3
+	nop
+
+	LWLO	t6, 27(a1)
+	LWLO	t5, 23(a1)
+	LWLO	t4, 19(a1)
+	LWLO	t3, 15(a1)
+	LWLO	t2, 11(a1)
+	LWLO	t1,  7(a1)
+	LWLO	t0,  3(a1)
+L(shfth_lwlo):
+
+	bal	1f
+	addiu	a3, t8, 1f - L(shfth_sw)
+1:	subu	a3, ra, a3
+	jr	a3
+	addu	a1, t8
+
+	sw	t6, 24(a0)
+	sw	t5, 20(a0)
+	sw	t4, 16(a0)
+	sw	t3, 12(a0)
+	sw	t2,  8(a0)
+	sw	t1,  4(a0)
+	sw	t0,  0(a0)
+L(shfth_sw):
+
+	move	ra, t7
+	b	L(last8)		# Handle last 3 bytes
+	addu	a0, t8
+
 	.set	reorder
 END (memcpy)
 libc_hidden_builtin_def (memcpy)
diff --git a/ports/sysdeps/mips/mips64/memcpy.S b/ports/sysdeps/mips/mips64/memcpy.S
index 49ef34d..3a5b33c 100644
--- a/ports/sysdeps/mips/mips64/memcpy.S
+++ b/ports/sysdeps/mips/mips64/memcpy.S
@@ -37,6 +37,12 @@
 #  define SDLO	sdl		/* low part is left in little-endian	*/
 #endif
 
+/* 32-byte prefetch size assumed.  */
+/* Prefetch data for (current iteration + PREFETCH_ITERATION_OFFSET) during
+   current iteration.  */
+#define PREFETCH_ITERATION_OFFSET_ALIGNED 2 /* TUNING KNOB */
+#define PREFETCH_ITERATION_OFFSET_MISALIGNED 1 /* TUNING KNOB */
+
 ENTRY (memcpy)
 	.set	noreorder
 
@@ -67,21 +73,25 @@ L(lop8w):
 	ld	t1,  8(a1)
 	ld	t2, 16(a1)
 	ld	t3, 24(a1)
+	PREFETCH (4, 32+PREFETCH_ITERATION_OFFSET_ALIGNED*32, a1)
 	ld	ta0, 32(a1)
 	ld	ta1, 40(a1)
 	ld	ta2, 48(a1)
 	ld	ta3, 56(a1)
+	PREFETCH (4, 64+PREFETCH_ITERATION_OFFSET_ALIGNED*32, a1)
 	PTR_ADDIU a0, 64
 	PTR_ADDIU a1, 64
 	sd	t0, -64(a0)
 	sd	t1, -56(a0)
 	sd	t2, -48(a0)
 	sd	t3, -40(a0)
+	PREFETCH (5, -32+PREFETCH_ITERATION_OFFSET_ALIGNED*32, a0)
 	sd	ta0, -32(a0)
 	sd	ta1, -24(a0)
 	sd	ta2, -16(a0)
-	bne	a1, a3, L(lop8w)
 	sd	ta3,  -8(a0)
+	bne	a1, a3, L(lop8w)
+	PREFETCH (5, 0+PREFETCH_ITERATION_OFFSET_ALIGNED*32, a0)
 
 L(chk1w):
 	andi	t0, a2, 0x7		# 8 or more bytes left?
@@ -120,19 +130,100 @@ L(shift):
 	SDHI	t0, 0(a0)
 	PTR_ADDU a0, a3
 L(shft1):
-	andi	t0, a2, 0x7
+	andi	t0, a2, 0x3f
+	beq	t0, a2, L(shfth_last)
 	PTR_SUBU a3, a2, t0
 	PTR_ADDU a3, a1
+	move	a2, t0
 L(shfth):
-	LDHI	t1, 0(a1)		# Limp through, dword by dword
-	LDLO	t1, 7(a1)
-	PTR_ADDIU a0, 8
-	PTR_ADDIU a1, 8
+	LDHI	t0,  0(a1)
+	LDHI	t1,  8(a1)
+	LDHI	t2, 16(a1)
+	LDHI	t3, 24(a1)
+	LDHI	ta0, 32(a1)
+	LDHI	ta1, 40(a1)
+	LDHI	ta2, 48(a1)
+	LDHI	ta3, 56(a1)
+	PREFETCH (4, 32+PREFETCH_ITERATION_OFFSET_MISALIGNED*32, a1)
+	LDLO	t0,  7(a1)
+	LDLO	t1, 15(a1)
+	LDLO	t2, 23(a1)
+	LDLO	t3, 31(a1)
+	LDLO	ta0, 39(a1)
+	LDLO	ta1, 47(a1)
+	LDLO	ta2, 55(a1)
+	LDLO	ta3, 63(a1)
+	PREFETCH (4, 64+PREFETCH_ITERATION_OFFSET_MISALIGNED*32, a1)
+	PTR_ADDIU a0, 64
+	PTR_ADDIU a1, 64
+	sd	t0, -64(a0)
+	sd	t1, -56(a0)
+	sd	t2, -48(a0)
+	sd	t3, -40(a0)
+	PREFETCH (5, -32+PREFETCH_ITERATION_OFFSET_MISALIGNED*32, a0)
+	sd	ta0, -32(a0)
+	sd	ta1, -24(a0)
+	sd	ta2, -16(a0)
+	sd	ta3,  -8(a0)
 	bne	a1, a3, L(shfth)
-	sd	t1, -8(a0)
-	b	L(last16)		# Handle anything which may be left
+	PREFETCH (5, 0+PREFETCH_ITERATION_OFFSET_MISALIGNED*32, a0)
+
+L(shfth_last):
+	andi	t0, a2, 0x7
+	beq	t0, a2, L(last16)
+	PTR_SUBU t8, a2, t0
+	PTR_SRL t9, t8, 1
+	move	ta3, ra
+
+	bal	1f
+	PTR_ADDIU a3, t9, 1f - L(shfth_ldhi)
+1:	PTR_SUBU a3, ra, a3
+	jr	a3
 	move	a2, t0
 
+	LDHI	ta2, 48(a1)
+	LDHI	ta1, 40(a1)
+	LDHI	ta0, 32(a1)
+	LDHI	t3, 24(a1)
+	LDHI	t2, 16(a1)
+	LDHI	t1,  8(a1)
+	LDHI	t0,  0(a1)
+L(shfth_ldhi):
+
+	bal	1f
+	PTR_ADDIU a3, t9, 1f - L(shfth_ldlo)
+1:	PTR_SUBU a3, ra, a3
+	jr	a3
+	nop
+
+	LDLO	ta2, 55(a1)
+	LDLO	ta1, 47(a1)
+	LDLO	ta0, 39(a1)
+	LDLO	t3, 31(a1)
+	LDLO	t2, 23(a1)
+	LDLO	t1, 15(a1)
+	LDLO	t0,  7(a1)
+L(shfth_ldlo):
+
+	bal	1f
+	PTR_ADDIU a3, t9, 1f - L(shfth_sd)
+1:	PTR_SUBU a3, ra, a3
+	jr	a3
+	PTR_ADDU a1, t8
+
+	sd	ta2, 48(a0)
+	sd	ta1, 40(a0)
+	sd	ta0, 32(a0)
+	sd	t3, 24(a0)
+	sd	t2, 16(a0)
+	sd	t1,  8(a0)
+	sd	t0,  0(a0)
+L(shfth_sd):
+
+	move	ra, ta3
+	b	L(last16)		# Handle last 7 bytes
+	PTR_ADDU a0, t8
+
 	.set	reorder
 END (memcpy)
 libc_hidden_builtin_def (memcpy)
diff --git a/ports/sysdeps/mips/sys/asm.h b/ports/sysdeps/mips/sys/asm.h
index 0f5edf9..e4057e5 100644
--- a/ports/sysdeps/mips/sys/asm.h
+++ b/ports/sysdeps/mips/sys/asm.h
@@ -482,4 +482,12 @@ symbol		=	value
 # define MIPS_SYNC	sync
 #endif
 
+#if _MIPS_ISA == _MIPS_ISA_MIPS32 || _MIPS_ISA == _MIPS_ISA_MIPS64
+# define PREFETCH(hint, offset, reg) pref hint, offset(reg)
+#else
+/* Expand PREFETCH to a nop rather than to nothing, so that a PREFETCH
+   placed in a branch delay slot still occupies the slot.  */
+# define PREFETCH(hint, offset, reg) nop
+#endif
+
 #endif /* sys/asm.h */
-- 
1.7.0.4


^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Optimize MIPS memcpy
  2012-09-01  6:16 [PATCH] Optimize MIPS memcpy Maxim Kuvyrkov
@ 2012-09-01 16:37 ` Joseph S. Myers
  2012-09-03  9:12 ` Andrew T Pinski
  1 sibling, 0 replies; 50+ messages in thread
From: Joseph S. Myers @ 2012-09-01 16:37 UTC (permalink / raw)
  To: Maxim Kuvyrkov; +Cc: libc-ports

On Sat, 1 Sep 2012, Maxim Kuvyrkov wrote:

> Retested with current trunk with no regressions for n32, n64 and o32 
> ABIs.
> 
> OK to apply?

OK if this testing has been done for both big and little endian (so six 
endian / ABI combinations).

-- 
Joseph S. Myers
joseph@codesourcery.com

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Optimize MIPS memcpy
  2012-09-01  6:16 [PATCH] Optimize MIPS memcpy Maxim Kuvyrkov
  2012-09-01 16:37 ` Joseph S. Myers
@ 2012-09-03  9:12 ` Andrew T Pinski
  2012-09-03 17:12   ` Maxim Kuvyrkov
  2012-09-04 15:09   ` Steve Ellcey
  1 sibling, 2 replies; 50+ messages in thread
From: Andrew T Pinski @ 2012-09-03  9:12 UTC (permalink / raw)
  To: Maxim Kuvyrkov; +Cc: Joseph S.  Myers, libc-ports

Forgot to CC libc-ports@ .
On Sat, 2012-09-01 at 18:15 +1200, Maxim Kuvyrkov wrote:
> This patch improves MIPS assembly implementations of memcpy.  Two optimizations are added: prefetching of data for subsequent iterations of memcpy loop and pipelined expansion of unaligned memcpy.  These optimizations speed up MIPS memcpy by about 10%.
> 
> The prefetching part is straightforward: it adds prefetching of a cache line (32 bytes) for +1 iteration for unaligned case and +2 iteration for aligned case.  The rationale here is that it will take prefetch to acquire data about same time as 1 iteration of unaligned loop or 2 iterations of aligned loop.  Values for these parameters were tuned on a modern MIPS processor.
> 

This might hurt Octeon, as the cache line size there is 128 bytes.  Can
you say which modern MIPS processor this has been tuned on?  And is
there a way to avoid hard-coding 32 in the assembly and use a macro
instead?

Thanks,
Andrew Pinski


> The pipelined expansion of unaligned loop is implemented in a similar fashion as expansion of the aligned loop.  The assembly is tricky, but it works.
> 
> These changes are almost 3 years old, and have been thoroughly tested in CodeSourcery MIPS toolchains.  Retested with current trunk with no regressions for n32, n64 and o32 ABIs.
> 
> OK to apply?
> 
> --
> Maxim Kuvyrkov
> Mentor Graphics
> 
> 



^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Optimize MIPS memcpy
  2012-09-03  9:12 ` Andrew T Pinski
@ 2012-09-03 17:12   ` Maxim Kuvyrkov
  2012-09-04 15:09   ` Steve Ellcey
  1 sibling, 0 replies; 50+ messages in thread
From: Maxim Kuvyrkov @ 2012-09-03 17:12 UTC (permalink / raw)
  To: Andrew T Pinski; +Cc: Joseph S.  Myers, libc-ports

On 3/09/2012, at 9:12 PM, Andrew T Pinski wrote:

> Forgot to CC libc-ports@ .
> On Sat, 2012-09-01 at 18:15 +1200, Maxim Kuvyrkov wrote:
>> This patch improves MIPS assembly implementations of memcpy.  Two optimizations are added: prefetching of data for subsequent iterations of memcpy loop and pipelined expansion of unaligned memcpy.  These optimizations speed up MIPS memcpy by about 10%.
>> 
>> The prefetching part is straightforward: it adds prefetching of a cache line (32 bytes) for +1 iteration for unaligned case and +2 iteration for aligned case.  The rationale here is that it will take prefetch to acquire data about same time as 1 iteration of unaligned loop or 2 iterations of aligned loop.  Values for these parameters were tuned on a modern MIPS processor.
>> 
> 
> This might hurt Octeon as the cache line size there is 128 bytes.  Can
> you say which modern MIPS processor which this has been tuned with?  And
> is there a way to not hard code 32 in the assembly but in a macro
> instead.

This was implemented with NetLogic XLR/XLP in mind.

The above description that I wrote was not completely accurate with regard to why we are assuming 32-byte prefetch (as I mentioned, this patch was developed almost 3 years ago).  For 32-bit ABIs one iteration of the loop processes 32 bytes of data -- that's how much fits into the 8 available registers at once.  Therefore we choose to prefetch in 32-byte blocks and have 1 prefetch instruction per iteration (well, 2 prefetches actually -- one for read and one for write).  It is possible to issue prefetch instructions only every Nth iteration, but the overhead of doing so would likely be greater than the benefit.

For 64-bit ABIs we process 64 bytes per iteration, so we could deal with just a single 64-byte-or-wider prefetch per iteration.  As it happens, XLR/XLP prefetch 32 bytes at a time, so the current implementation issues 2 prefetches per iteration.

It is feasible to use 2 macros for the 64-bit implementation: PREFETCH32 and PREFETCH64.  XLR/XLP would define both of these macros to "pref", while Octeon would define PREFETCH64 to "pref" and PREFETCH32 to "nop", thus issuing a single prefetch per iteration.
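
Something along these lines, say (hypothetical sketch only, not part
of this patch; it assumes a target-selection macro such as GCC's
__OCTEON__ is available and mirrors the PREFETCH macro added to
sys/asm.h):

/* Hypothetical sketch -- not in the posted patch.  */
#ifdef __OCTEON__			/* 128-byte cache lines */
# define PREFETCH64(hint, offset, reg)	pref hint, offset(reg)
# define PREFETCH32(hint, offset, reg)	nop
#else					/* e.g. XLR/XLP: 32-byte prefetches */
# define PREFETCH64(hint, offset, reg)	pref hint, offset(reg)
# define PREFETCH32(hint, offset, reg)	pref hint, offset(reg)
#endif

The 64-bit loop would then issue one PREFETCH64 per 64-byte block and
one PREFETCH32 for the second half of the block, which becomes a nop
on targets whose single prefetch already covers the whole block.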

However, I doubt that the above improvement is worth the increased complexity of the memcpy implementation.  I would expect most modern CPUs to quickly discard extraneous prefetch instructions.  And the most we can reasonably save here is to remove 1 read and 1 write prefetch instruction per iteration of the 64-bit memcpy.

Andrew, if you still think that issuing as few prefetches as possible would provide a significant performance improvement on Octeon, would you please compare performance between the two approaches (removing the second prefetch from the 64-bit implementation is a trivial change) and get back to the list with the results?

Thank you,

--
Maxim Kuvyrkov
Mentor Graphics

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Optimize MIPS memcpy
  2012-09-03  9:12 ` Andrew T Pinski
  2012-09-03 17:12   ` Maxim Kuvyrkov
@ 2012-09-04 15:09   ` Steve Ellcey
  2012-09-04 15:14     ` Carlos O'Donell
  2012-09-05  0:43     ` Maxim Kuvyrkov
  1 sibling, 2 replies; 50+ messages in thread
From: Steve Ellcey @ 2012-09-04 15:09 UTC (permalink / raw)
  To: Andrew T Pinski; +Cc: Maxim Kuvyrkov, Joseph S.  Myers, libc-ports

[-- Attachment #1: Type: text/plain, Size: 2102 bytes --]

On Mon, 2012-09-03 at 02:12 -0700, Andrew T Pinski wrote:
> Forgot to CC libc-ports@ .
> On Sat, 2012-09-01 at 18:15 +1200, Maxim Kuvyrkov wrote:
> > This patch improves MIPS assembly implementations of memcpy.  Two optimizations are added:
> prefetching of data for subsequent iterations of memcpy loop and pipelined expansion of unaligned
> memcpy.  These optimizations speed up MIPS memcpy by about 10%.
> > 
> > The prefetching part is straightforward: it adds prefetching of a cache line (32 bytes) for +1
> iteration for unaligned case and +2 iteration for aligned case.  The rationale here is that it will
> take prefetch to acquire data about same time as 1 iteration of unaligned loop or 2 iterations of aligned loop.  Values for these parameters were tuned on a modern MIPS processor.
> > 
> 
> This might hurt Octeon as the cache line size there is 128 bytes.  Can
> you say which modern MIPS processor which this has been tuned with?  And
> is there a way to not hard code 32 in the assembly but in a macro
> instead.
> 
> Thanks,
> Andrew Pinski

I've been looking at the MIPS memcpy and was planning on submitting a
new version based on the one that MIPS submitted to Android.  It has
prefetching like Maxim's, though I found that using the 'load' and
'prepare for store' hints instead of the 'load streaming' and 'store
streaming' hints gave me better results on the 74K and 24K that I did
performance testing on.

This version has more unrolling too, and between that and the
difference in hints I got a small performance improvement over Maxim's
version when doing small memcpys and a fairly substantial improvement
on large memcpys.

I also merged the 32- and 64-bit versions together so we would only have
one copy to maintain.  I haven't tried building it as part of glibc yet;
I have been testing it standalone first and was going to try to
integrate it into glibc and submit it this week or next.  I'll attach it
to this email so folks can look at it, and I will see if I can
parameterize the cache line size (one possible shape for that is
sketched below).  This one also assumes a 32-byte cache prefetch.
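
A hypothetical sketch of that parameterization (nothing here is in the
attached file yet; it builds on the PREF_CHUNK and PREFETCH_FOR_LOAD
macros the attachment already defines):

/* Hypothetical sketch: let the prefetch granularity be overridden at
   build time instead of hard-coding 32.  */
#ifndef PREF_LINE_SIZE
# define PREF_LINE_SIZE 32
#endif

#ifdef USE_DOUBLE
# define PREF_CHUNK (2 * PREF_LINE_SIZE)
# define PREFETCH_FOR_LOAD(chunk, reg) \
 pref PREF_LOAD_STREAMED, (chunk)*PREF_LINE_SIZE(reg); \
 pref PREF_LOAD_STREAMED, ((chunk)+1)*PREF_LINE_SIZE(reg)
#else
# define PREF_CHUNK PREF_LINE_SIZE
# define PREFETCH_FOR_LOAD(chunk, reg) \
 pref PREF_LOAD_STREAMED, (chunk)*PREF_LINE_SIZE(reg)
#endif

PREFETCH_FOR_STORE would be parameterized the same way.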

Steve Ellcey
sellcey@mips.com

[-- Attachment #2: memcpy.S --]
[-- Type: text/x-csrc, Size: 16761 bytes --]

/*
 * Copyright (c) 2009-2012
 *      MIPS Technologies, Inc., California.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
 *    contributors may be used to endorse or promote products derived from
 *    this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/************************************************************************
 *
 *  memcpy.S
 *  Version: "043009"
 *
 ************************************************************************/


/************************************************************************
 *  Include files
 ************************************************************************/

#ifdef __BIONIC__
#include "machine/asm.h"
#include "machine/regdef.h"
#define ALLOW_OVERLAP
#define USE_PREFETCH
#else
#ifdef _LIBC
#include <sysdep.h>
#define USE_PREFETCH
#endif
#include <regdef.h>
#include <sys/asm.h>
#if _MIPS_ISA == _MIPS_ISA_MIPS32 || _MIPS_ISA == _MIPS_ISA_MIPS64
#define PREFETCH
#endif
#if _MIPS_SIM == _ABI64
#define USE_DOUBLE
#endif
#endif



/* Some asm.h files do not have the L macro definition.  */
#ifndef L
#if _MIPS_SIM == _ABIO32
# define L(label) $L ## label
#else
# define L(label) .L ## label
#endif
#endif

/* Some regdef.h files do not have the PTR_ADDIU macro definition.  */
#ifndef PTR_ADDIU
#ifdef USE_DOUBLE
#define PTR_ADDIU	daddiu
#else
#define PTR_ADDIU	addiu
#endif
#endif


/*
 * Using PREF_LOAD_STREAMED instead of PREF_LOAD on load prefetches offers
 * a slight performance advantage, using PREF_PREPAREFORSTORE instead of
 * PREF_STORE_STREAMED or PREF_STORE offers a large performance advantage.
 */

#ifdef USE_PREFETCH
# define PREF_LOAD		0
# define PREF_STORE		1
# define PREF_LOAD_STREAMED	4
# define PREF_STORE_STREAMED	5
# define PREF_LOAD_RETAINED	6
# define PREF_STORE_RETAINED	7
# define PREF_WRITEBACK_INVAL	25
# define PREF_PREPAREFORSTORE	30

/*
 * We double everything when USE_DOUBLE is true so we do 2 prefetches to
 * get 64 bytes in that case.  The assumption is that each individual 
 * prefetch brings in 32 bytes.
 */
#ifdef USE_DOUBLE
# define PREF_CHUNK 64
# define PREFETCH_FOR_LOAD(chunk, reg) \
 pref PREF_LOAD_STREAMED, (chunk)*32(reg); \
 pref PREF_LOAD_STREAMED, ((chunk)+1)*32(reg)
# define PREFETCH_FOR_STORE(chunk, reg) \
 pref PREF_PREPAREFORSTORE, (chunk)*32(reg); \
 pref PREF_PREPAREFORSTORE, ((chunk)+1)*32(reg)
#else
# define PREF_CHUNK 32
# define PREFETCH_FOR_LOAD(chunk, reg) \
 pref PREF_LOAD_STREAMED, (chunk)*32(reg)
# define PREFETCH_FOR_STORE(chunk, reg) \
 pref PREF_PREPAREFORSTORE, (chunk)*32(reg)
#endif
#define PREF_LIMIT (5 * PREF_CHUNK)
#else
# define PREFETCH_FOR_LOAD(offset, reg)
# define PREFETCH_FOR_STORE(offset, reg)
#endif

/* Allow the routine to be named something else if desired.  */
#ifndef MEMCPY_NAME
#define MEMCPY_NAME memcpy
#endif

/* We use these 32/64 bit registers as temporaries to do the copying.  */
#define REG0 t0
#define REG1 t1
#define REG2 t2
#define REG3 t3
#ifdef USE_DOUBLE
#  define REG4 ta0
#  define REG5 ta1
#  define REG6 ta2
#  define REG7 ta3
#else
#  define REG4 t4
#  define REG5 t5
#  define REG6 t6
#  define REG7 t7
#endif

/* We load/store 64 bits at a time when USE_DOUBLE is true.  */
#ifdef USE_DOUBLE
#  define ST	sd
#  define LD	ld
#if __MIPSEB
#  define LDHI	ldl		/* high part is left in big-endian	*/
#  define STHI	sdl		/* high part is left in big-endian	*/
#  define LDLO	ldr		/* low part is right in big-endian	*/
#  define STLO	sdr		/* low part is right in big-endian	*/
#else
#  define LDHI	ldr		/* high part is right in little-endian	*/
#  define STHI	sdr		/* high part is right in little-endian	*/
#  define LDLO	ldl		/* low part is left in little-endian	*/
#  define STLO	sdl		/* low part is left in little-endian	*/
#endif
#else
#  define ST	sw
#  define LD	lw
#if __MIPSEB
#  define LDHI	lwl		/* high part is left in big-endian	*/
#  define STHI	swl		/* high part is left in big-endian	*/
#  define LDLO	lwr		/* low part is right in big-endian	*/
#  define STLO	swr		/* low part is right in big-endian	*/
#else
#  define LDHI	lwr		/* high part is right in little-endian	*/
#  define STHI	swr		/* high part is right in little-endian	*/
#  define LDLO	lwl		/* low part is left in little-endian	*/
#  define STLO	swl		/* low part is left in little-endian	*/
#endif
#endif

/* Bookkeeping values for 32 vs. 64 bit mode.  */
#ifdef USE_DOUBLE
#  define NSIZE 8
#  define NSIZEMASK 0x3f
#  define NSIZEDMASK 0x7f
#else
#  define NSIZE 4
#  define NSIZEMASK 0x1f
#  define NSIZEDMASK 0x3f
#endif
#define UNIT(unit) ((unit)*NSIZE)
#define UNITM1(unit) (((unit)*NSIZE)-1)

#ifdef __BIONIC__
LEAF(MEMCPY_NAME, 0)
#else
LEAF(MEMCPY_NAME)
#endif
	.set	nomips16
	.set	noreorder
/*
 * Below we handle the case where memcpy is called with overlapping src and dst.
 * Although memcpy is not required to handle this case, some parts of Android
 * like Skia rely on such usage. We call memmove to handle such cases.
 */
#ifdef ALLOW_OVERLAP
	PTR_SUBU t0,a0,a1
	PTR_SRA	t2,t0,31
	xor	t1,t0,t2
	PTR_SUBU t0,t1,t2
	sltu	t2,t0,a2
	beq	t2,zero,L(memcpy)
	la	t9,memmove
	jr	t9
	 nop
L(memcpy):
#endif
/*
 * If the size is less than 2*NSIZE (8 or 16), go to L(lastb).  Regardless of
 * size, copy dst pointer to v0 for the return value.
 */
	slti	t2,a2,(2 * NSIZE)
	bne	t2,zero,L(lastb)
	move	v0,a0
/*
 * If src and dst have different alignments, go to L(unaligned), if they
 * have the same alignment (but are not actually aligned) do a partial
 * load/store to make them aligned.  If they are both already aligned
 * we can start copying at L(aligned).
 */
	xor	t8,a1,a0
	andi	t8,t8,(NSIZE-1)		/* t8 is a0/a1 word-displacement */
	bne	t8,zero,L(unaligned)
	PTR_SUBU a3, zero, a0

	andi	a3,a3,(NSIZE-1)		/* copy a3 bytes to align a0/a1	  */
	beq	a3,zero,L(aligned)	/* if a3=0, it is already aligned */
	PTR_SUBU a2,a2,a3		/* a2 is the remaining bytes count */

	LDHI	t8,0(a1)
	PTR_ADDU a1,a1,a3
	STHI	t8,0(a0)
	PTR_ADDU a0,a0,a3

/*
 * Now dst/src are both aligned to (word or double word) aligned addresses
 * Set a2 to count how many bytes we have to copy after all the 64/128 byte
 * chunks are copied and a3 to the dst pointer after all the 64/128 byte 
 * chunks have been copied.  We will loop, incrementing a0 and a1 until a0
 * equals a3.
 */

L(aligned):
	andi	t8,a2,NSIZEDMASK /* any whole 64-byte/128-byte chunks? */
	beq	a2,t8,L(chkw)	 /* if a2==t8, no 64-byte/128-byte chunks */
	PTR_SUBU a3,a2,t8	 /* subtract from a2 the remainder */
	PTR_ADDU a3,a0,a3	 /* Now a3 is the final dst after loop */

/* When in the loop we exercise "pref 30,x(a0)", the a0+x should not be past
 * the "t0-32" address.  This means: for x=128 the last "safe" a0 address is
 * "t0-160".  Alternatively, for x=64 the last "safe" a0 address is "t0-96"
 * In the current version we will use "pref 30,128(a0)", so "t0-160" is the
 * limit
 */
#ifdef USE_PREFETCH
	PTR_ADDU t0,a0,a2		/* t0 is the "past the end" address */
	PTR_SUBU t9,t0,PREF_LIMIT	/* t9 is the "last safe pref" address */
	PREFETCH_FOR_LOAD  (0, a1)
	PREFETCH_FOR_LOAD  (1, a1)
	PREFETCH_FOR_LOAD  (2, a1)
	PREFETCH_FOR_STORE (1, a0)
	sltu	v1,t9,a0		/* If a0 > t9 don't use next prefetch */
	bgtz	v1,L(loop16w)
	nop
#endif
	PREFETCH_FOR_STORE (2, a0)
L(loop16w):
	PREFETCH_FOR_LOAD  (3, a1)
	LD	t0,UNIT(0)(a1)
#ifdef USE_PREFETCH
	bgtz	v1,L(skip_pref30_96)
#endif
	LD	t1,UNIT(1)(a1)
	PREFETCH_FOR_STORE (3, a0)
L(skip_pref30_96):
	LD	REG2,UNIT(2)(a1)
	LD	REG3,UNIT(3)(a1)
	LD	REG4,UNIT(4)(a1)
	LD	REG5,UNIT(5)(a1)
	LD	REG6,UNIT(6)(a1)
	LD	REG7,UNIT(7)(a1)
        PREFETCH_FOR_LOAD (4, a1)

	ST	t0,UNIT(0)(a0)
	ST	t1,UNIT(1)(a0)
	ST	REG2,UNIT(2)(a0)
	ST	REG3,UNIT(3)(a0)
	ST	REG4,UNIT(4)(a0)
	ST	REG5,UNIT(5)(a0)
	ST	REG6,UNIT(6)(a0)
	ST	REG7,UNIT(7)(a0)

	LD	t0,UNIT(8)(a1)
#ifdef USE_PREFETCH
	bgtz	v1,L(skip_pref30_128)
#endif
	LD	t1,UNIT(9)(a1)
	PREFETCH_FOR_STORE (4, a0)
L(skip_pref30_128):
	LD	REG2,UNIT(10)(a1)
	LD	REG3,UNIT(11)(a1)
	LD	REG4,UNIT(12)(a1)
	LD	REG5,UNIT(13)(a1)
	LD	REG6,UNIT(14)(a1)
	LD	REG7,UNIT(15)(a1)
        PREFETCH_FOR_LOAD (5, a1)
	ST	t0,UNIT(8)(a0)
	ST	t1,UNIT(9)(a0)
	ST	REG2,UNIT(10)(a0)
	ST	REG3,UNIT(11)(a0)
	ST	REG4,UNIT(12)(a0)
	ST	REG5,UNIT(13)(a0)
	ST	REG6,UNIT(14)(a0)
	ST	REG7,UNIT(15)(a0)
	PTR_ADDIU a0,a0,UNIT(16)	/* adding 64/128 to dest */
#ifdef USE_PREFETCH
	sltu	v1,t9,a0
#endif
	bne	a0,a3,L(loop16w)
	PTR_ADDIU a1,a1,UNIT(16)	/* adding 64/128 to src */
	move	a2,t8

/* Here we have src and dest word-aligned but less than 64-bytes or
 * 128 bytes to go.  Check for a 32(64) byte chunk and copy it if there
 * is one.  Otherwise jump down to L(chk1w) to handle the tail end of
 * the copy.
 */

L(chkw):
	PREFETCH_FOR_LOAD (0, a1)
	andi	t8,a2,NSIZEMASK	/* Is there a 32-byte/64-byte chunk.  */
				/* t8 is the remainder count past 32 bytes */
	beq	a2,t8,L(chk1w)	/* When a2=t8, no 32-byte chunk  */
	nop
	LD	t0,UNIT(0)(a1)
	LD	t1,UNIT(1)(a1)
	LD	REG2,UNIT(2)(a1)
	LD	REG3,UNIT(3)(a1)
	LD	REG4,UNIT(4)(a1)
	LD	REG5,UNIT(5)(a1)
	LD	REG6,UNIT(6)(a1)
	LD	REG7,UNIT(7)(a1)
	PTR_ADDIU a1,a1,UNIT(8)
	ST	t0,UNIT(0)(a0)
	ST	t1,UNIT(1)(a0)
	ST	REG2,UNIT(2)(a0)
	ST	REG3,UNIT(3)(a0)
	ST	REG4,UNIT(4)(a0)
	ST	REG5,UNIT(5)(a0)
	ST	REG6,UNIT(6)(a0)
	ST	REG7,UNIT(7)(a0)
	PTR_ADDIU a0,a0,UNIT(8)

/*
 * Here we have less than 32(64) bytes to copy.  Set up for a loop to
 * copy one word (or double word) at a time.  Set a2 to count how many
 * bytes we have to copy after all the word (or double word) chunks are
 * copied and a3 to the dst pointer after all the (d)word chunks have
 * been copied.  We will loop, incrementing a0 and a1 until a0 equals a3.
 */
L(chk1w):
	andi	a2,t8,(NSIZE-1)	/* a2 is the remainder past one (d)word chunks */
	beq	a2,t8,L(lastb)
	PTR_SUBU a3,t8,a2	/* a3 is count of bytes in one (d)word chunks */
	PTR_ADDU a3,a0,a3	/* a3 is the dst address after loop */

/* copying in words (4-byte or 8-byte chunks) */
L(wordCopy_loop):
	LD	REG3,UNIT(0)(a1)
	PTR_ADDIU a1,a1,UNIT(1)
	PTR_ADDIU a0,a0,UNIT(1)
	bne	a0,a3,L(wordCopy_loop)
	ST	REG3,UNIT(-1)(a0)

/* Copy the last 8 (or 16) bytes */
L(lastb):
	blez	a2,L(leave)
	PTR_ADDU a3,a0,a2	/* a3 is the last dst address */
L(lastbloop):
	lb	v1,0(a1)
	PTR_ADDIU a1,a1,1
	PTR_ADDIU a0,a0,1
	bne	a0,a3,L(lastbloop)
	sb	v1,-1(a0)
L(leave):
	j	ra
	nop
/*
 * UNALIGNED case, got here with a3 = "negu a0"
 * This code is nearly identical to the aligned code above
 * but only the destination (not the source) gets aligned
 * so we need to do partial loads of the source followed
 * by normal stores to the destination (once we have aligned
 * the destination).
 */

L(unaligned):
	andi	a3,a3,(NSIZE-1)	/* copy a3 bytes to align a0/a1 */
	beqz	a3,L(ua_chk16w) /* if a3=0, it is already aligned */
	PTR_SUBU a2,a2,a3	/* a2 is the remaining bytes count */

	LDHI	v1,UNIT(0)(a1)
	LDLO	v1,UNITM1(1)(a1)
	PTR_ADDU a1,a1,a3
	STHI	v1,UNIT(0)(a0)
	PTR_ADDU a0,a0,a3

/*
 *  Now the destination (but not the source) is aligned
 * Set a2 to count how many bytes we have to copy after all the 64/128 byte
 * chunks are copied and a3 to the dst pointer after all the 64/128 byte
 * chunks have been copied.  We will loop, incrementing a0 and a1 until a0
 * equals a3.
 */

L(ua_chk16w):
	andi	t8,a2,NSIZEDMASK /* any whole 64-byte/128-byte chunks? */
	beq	a2,t8,L(ua_chkw) /* if a2==t8, no 64-byte/128-byte chunks */
	PTR_SUBU a3,a2,t8	 /* subtract from a2 the remainder */
	PTR_ADDU a3,a0,a3	 /* Now a3 is the final dst after loop */

#ifdef USE_PREFETCH
	PTR_ADDU t0,a0,a2	  /* t0 is the "past the end" address */
	PTR_SUBU t9,t0,PREF_LIMIT /* t9 is the "last safe pref" address */
	PREFETCH_FOR_LOAD  (0, a1)
	PREFETCH_FOR_LOAD  (1, a1)
	PREFETCH_FOR_LOAD  (2, a1)
	PREFETCH_FOR_STORE (1, a0)
	sltu	v1,t9,a0
	bgtz	v1,L(ua_loop16w)  /* skip prefetch for too short arrays */
	nop
#endif
	PREFETCH_FOR_STORE (2, a0)
L(ua_loop16w):
	PREFETCH_FOR_LOAD  (3, a1)
	LDHI	t0,UNIT(0)(a1)
	LDLO	t0,UNITM1(1)(a1)
	LDHI	t1,UNIT(1)(a1)
#ifdef USE_PREFETCH
	bgtz	v1,L(ua_skip_pref30_96)
#endif
	LDLO	t1,UNITM1(2)(a1)
	PREFETCH_FOR_STORE (3, a0)
L(ua_skip_pref30_96):
	LDHI	REG2,UNIT(2)(a1)
	LDLO	REG2,UNITM1(3)(a1)
	LDHI	REG3,UNIT(3)(a1)
	LDLO	REG3,UNITM1(4)(a1)
	LDHI	REG4,UNIT(4)(a1)
	LDLO	REG4,UNITM1(5)(a1)
	LDHI	REG5,UNIT(5)(a1)
	LDLO	REG5,UNITM1(6)(a1)
	LDHI	REG6,UNIT(6)(a1)
	LDLO	REG6,UNITM1(7)(a1)
	LDHI	REG7,UNIT(7)(a1)
	LDLO	REG7,UNITM1(8)(a1)
        PREFETCH_FOR_LOAD (4, a1)
	ST	t0,UNIT(0)(a0)
	ST	t1,UNIT(1)(a0)
	ST	REG2,UNIT(2)(a0)
	ST	REG3,UNIT(3)(a0)
	ST	REG4,UNIT(4)(a0)
	ST	REG5,UNIT(5)(a0)
	ST	REG6,UNIT(6)(a0)
	ST	REG7,UNIT(7)(a0)
	LDHI	t0,UNIT(8)(a1)
	LDLO	t0,UNITM1(9)(a1)
	LDHI	t1,UNIT(9)(a1)
#ifdef USE_PREFETCH
	bgtz	v1,L(ua_skip_pref30_128)
#endif
	LDLO	t1,UNITM1(10)(a1)
	PREFETCH_FOR_STORE (4, a0)
L(ua_skip_pref30_128):
	LDHI	REG2,UNIT(10)(a1)
	LDLO	REG2,UNITM1(11)(a1)
	LDHI	REG3,UNIT(11)(a1)
	LDLO	REG3,UNITM1(12)(a1)
	LDHI	REG4,UNIT(12)(a1)
	LDLO	REG4,UNITM1(13)(a1)
	LDHI	REG5,UNIT(13)(a1)
	LDLO	REG5,UNITM1(14)(a1)
	LDHI	REG6,UNIT(14)(a1)
	LDLO	REG6,UNITM1(15)(a1)
	LDHI	REG7,UNIT(15)(a1)
	LDLO	REG7,UNITM1(16)(a1)
        PREFETCH_FOR_LOAD (5, a1)
	ST	t0,UNIT(8)(a0)
	ST	t1,UNIT(9)(a0)
	ST	REG2,UNIT(10)(a0)
	ST	REG3,UNIT(11)(a0)
	ST	REG4,UNIT(12)(a0)
	ST	REG5,UNIT(13)(a0)
	ST	REG6,UNIT(14)(a0)
	ST	REG7,UNIT(15)(a0)
	PTR_ADDIU a0,a0,UNIT(16)	/* adding 64/128 to dest */
#ifdef USE_PREFETCH
	sltu	v1,t9,a0
#endif
	bne	a0,a3,L(ua_loop16w)
	PTR_ADDIU a1,a1,UNIT(16)	/* adding 64/128 to src */
	move	a2,t8

/* Here we have src and dest word-aligned but less than 64-bytes or
 * 128 bytes to go.  Check for a 32(64) byte chunk and copy it if there
 * is one.  Otherwise jump down to L(ua_chk1w) to handle the tail end of
 * the copy.  */

L(ua_chkw):
	PREFETCH_FOR_LOAD (0, a1)
	andi	t8,a2,NSIZEMASK	  /* Is there a 32-byte/64-byte chunk.  */
				  /* t8 is the remainder count past 32 bytes */
	beq	a2,t8,L(ua_chk1w) /* When a2=t8, no 32-byte chunk */
	nop
	LDHI	t0,UNIT(0)(a1)
	LDLO	t0,UNITM1(1)(a1)
	LDHI	t1,UNIT(1)(a1)
	LDLO	t1,UNITM1(2)(a1)
	LDHI	REG2,UNIT(2)(a1)
	LDLO	REG2,UNITM1(3)(a1)
	LDHI	REG3,UNIT(3)(a1)
	LDLO	REG3,UNITM1(4)(a1)
	LDHI	REG4,UNIT(4)(a1)
	LDLO	REG4,UNITM1(5)(a1)
	LDHI	REG5,UNIT(5)(a1)
	LDLO	REG5,UNITM1(6)(a1)
	LDHI	REG6,UNIT(6)(a1)
	LDLO	REG6,UNITM1(7)(a1)
	LDHI	REG7,UNIT(7)(a1)
	LDLO	REG7,UNITM1(8)(a1)
	PTR_ADDIU a1,a1,UNIT(8)
	ST	t0,UNIT(0)(a0)
	ST	t1,UNIT(1)(a0)
	ST	REG2,UNIT(2)(a0)
	ST	REG3,UNIT(3)(a0)
	ST	REG4,UNIT(4)(a0)
	ST	REG5,UNIT(5)(a0)
	ST	REG6,UNIT(6)(a0)
	ST	REG7,UNIT(7)(a0)
	PTR_ADDIU a0,a0,UNIT(8)
/*
 * Here we have less than 32(64) bytes to copy.  Set up for a loop to
 * copy one word (or double word) at a time.
 */
L(ua_chk1w):
	andi	a2,t8,(NSIZE-1)	/* a2 is the remainder past one (d)word chunks */
	beq	a2,t8,L(ua_smallCopy)
	PTR_SUBU a3,t8,a2	/* a3 is count of bytes in one (d)word chunks */
	PTR_ADDU a3,a0,a3	/* a3 is the dst address after loop */

/* copying in words (4-byte or 8-byte chunks) */
L(ua_wordCopy_loop):
	LDHI	v1,UNIT(0)(a1)
	LDLO	v1,UNITM1(1)(a1)
	PTR_ADDIU a1,a1,UNIT(1)
	PTR_ADDIU a0,a0,UNIT(1)
	bne	a0,a3,L(ua_wordCopy_loop)
	ST	v1,UNIT(-1)(a0)

/* Copy the last 8 (or 16) bytes */
L(ua_smallCopy):
	beqz	a2,L(leave)
	PTR_ADDU a3,a0,a2	/* a3 is the last dst address */
L(ua_smallCopy_loop):
	lb	v1,0(a1)
	PTR_ADDIU a1,a1,1
	PTR_ADDIU a0,a0,1
	bne	a0,a3,L(ua_smallCopy_loop)
	sb	v1,-1(a0)

	j	ra
	nop

	.set	at
	.set	reorder
END(MEMCPY_NAME)


/************************************************************************
 *  Implementation : Static functions
 ************************************************************************/

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Optimize MIPS memcpy
  2012-09-04 15:09   ` Steve Ellcey
@ 2012-09-04 15:14     ` Carlos O'Donell
  2012-09-04 17:03       ` Steve Ellcey
  2012-09-05  0:43     ` Maxim Kuvyrkov
  1 sibling, 1 reply; 50+ messages in thread
From: Carlos O'Donell @ 2012-09-04 15:14 UTC (permalink / raw)
  To: Steve Ellcey; +Cc: Andrew T Pinski, Maxim Kuvyrkov, Joseph S. Myers, libc-ports

On 9/4/2012 11:09 AM, Steve Ellcey wrote:
> On Mon, 2012-09-03 at 02:12 -0700, Andrew T Pinski wrote:
>> Forgot to CC libc-ports@ .
>> On Sat, 2012-09-01 at 18:15 +1200, Maxim Kuvyrkov wrote:
>>> This patch improves MIPS assembly implementations of memcpy.  Two optimizations are added:
>> prefetching of data for subsequent iterations of memcpy loop and pipelined expansion of unaligned
>> memcpy.  These optimizations speed up MIPS memcpy by about 10%.
>>>
>>> The prefetching part is straightforward: it adds prefetching of a cache line (32 bytes) for +1
>> iteration for unaligned case and +2 iteration for aligned case.  The rationale here is that it will
>> take prefetch to acquire data about same time as 1 iteration of unaligned loop or 2 iterations of aligned loop.  Values for these parameters were tuned on a modern MIPS processor.
>>>
>>
>> This might hurt Octeon as the cache line size there is 128 bytes.  Can
>> you say which modern MIPS processor which this has been tuned with?  And
>> is there a way to not hard code 32 in the assembly but in a macro
>> instead.
>>
>> Thanks,
>> Andrew Pinski
> 
> I've been looking at the MIPS memcpy and was planning on submitting a
> new version based on the one that MIPS submitted to Android.  It has
> prefetching like Maxim's though I found that using the load and 'prepare
> for store' hints instead of 'load streaming' and 'store streaming' hints
> gave me better results on the 74k and 24k that I did performance testing
> on.
> 
> This version has more unrolling too and between that and the hints
> difference I got a small performance improvement over Maxim's version
> when doing small memcpy's and a fairly substantial improvement on large
> memcpy's.
> 
> I also merged the 32 and 64 bit versions together so we would only have
> one copy to maintain.  I haven't tried building it as part of glibc yet,
> I have been testing it standalone first and was going to try and
> integrate it into glibc and submit it this week or next.  I'll attach it
> to this email so folks can look at it and I will see if I can
> parameterize the cache line size.  This one also assumes a 32 byte cache
> prefetch.

Exactly what benchmarks did you run to verify the performance gains?

The one thing I'd like to continue seeing is strong rationalization for
performance patches such that we have reproducible data in the event that
someone else comes along and wants to make a change.

For example see:
http://sourceware.org/glibc/wiki/benchmarking/results_2_17

and:
http://sourceware.org/glibc/wiki/benchmarking/benchmarks

Cheers,
Carlos.
-- 
Carlos O'Donell
Mentor Graphics / CodeSourcery
carlos_odonell@mentor.com
carlos@codesourcery.com
+1 (613) 963 1026

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Optimize MIPS memcpy
  2012-09-04 15:14     ` Carlos O'Donell
@ 2012-09-04 17:03       ` Steve Ellcey
  2012-09-04 17:28         ` Carlos O'Donell
  0 siblings, 1 reply; 50+ messages in thread
From: Steve Ellcey @ 2012-09-04 17:03 UTC (permalink / raw)
  To: Carlos O'Donell
  Cc: Andrew T Pinski, Maxim Kuvyrkov, Joseph S. Myers, libc-ports

[-- Attachment #1: Type: text/plain, Size: 865 bytes --]


> Exactly what benchmarks did you run to verify the performance gains?
> 
> The one thing I'd like to continue seeing is strong rationalization for
> performance patches such that we have reproducible data in the event that
> someone else comes along and wants to make a change.
> 
> For example see:
> http://sourceware.org/glibc/wiki/benchmarking/results_2_17
> 
> and:
> http://sourceware.org/glibc/wiki/benchmarking/benchmarks
> 
> Cheers,
> Carlos.

We had a few tests around here that I used, and I wrote one of my own
too.  I have attached my test; when using it with -UVERIFY and testing
on a 74K system I got the following timings (32-bit, little-endian):

The FSF memcpy: 3m9.34s
Maxim's memcpy: 2m0.41s
My memcpy:      1m22.20s

If there are any official or recommended memcpy benchmarks I'd be happy
to try them as well.

Steve Ellcey
sellcey@mips.com

[-- Attachment #2: test_memcpy.c --]
[-- Type: text/x-csrc, Size: 1379 bytes --]

#include <string.h>
#include <stdio.h>

#define SIZE 1024*100
#define MAXCOPYSIZE 1024*50
#define MAXSRCOFFSET 13
#define MAXDSTOFFSET 18
#define SRCVAL(N) ((N+10000) % 13)
#define DSTVAL(N) ((N+20001) % 17)
char src[SIZE], dst[SIZE];

#ifndef MEMCPY_NAME
#define MEMCPY_NAME memcpy
#endif

extern void *MEMCPY_NAME(void *, const void *, size_t);

static void test(int src_offset, int dst_offset, int size)
{
  int i;
  char *x, *y;
  for (i = 0; i < SIZE; i++) {
    src[i] = SRCVAL(i);
    dst[i] = DSTVAL(i);
  }
  x = src + src_offset;
  y = dst + dst_offset;
  MEMCPY_NAME(y, x, size);
#ifdef VERIFY
  for (i = 0; i < SIZE; i++) {
    if (src[i] != SRCVAL(i)) printf("FAIL, src got changed\n");
    if (i < dst_offset) {
      if (dst[i] != DSTVAL(i))
	printf("FAIL, dst got changed before it should be\n");
    } else if (i >= (dst_offset+size)) {
      if (dst[i] != DSTVAL(i))
	printf("FAIL, dst got changed after it should be (%d %d %d %d)\n", src_offset, dst_offset, size, i);
    } else {
      if (dst[i] != SRCVAL(i-dst_offset+src_offset)) {
	printf("FAIL, dst was not changed when it should be (%d %d %d %d)\n", src_offset, dst_offset, size, i);
      }
    }
  }
#endif
}

int main(void)
{
  int i, j, k;
  for (i = 8; i < MAXDSTOFFSET; i++)
    for (j = 8; j < MAXSRCOFFSET; j++)
      for (k = MAXCOPYSIZE-20; k < MAXCOPYSIZE; k++)
      test(i, j, k);
}

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Optimize MIPS memcpy
  2012-09-04 17:03       ` Steve Ellcey
@ 2012-09-04 17:28         ` Carlos O'Donell
  0 siblings, 0 replies; 50+ messages in thread
From: Carlos O'Donell @ 2012-09-04 17:28 UTC (permalink / raw)
  To: Steve Ellcey; +Cc: Andrew T Pinski, Maxim Kuvyrkov, Joseph S. Myers, libc-ports

On 9/4/2012 1:02 PM, Steve Ellcey wrote:
> 
>> Exactly what benchmarks did you run to verify the performance gains?
>>
>> The one thing I'd like to continue seeing is strong rationalization for
>> performance patches such that we have reproducible data in the event that
>> someone else comes along and wants to make a change.
>>
>> For example see:
>> http://sourceware.org/glibc/wiki/benchmarking/results_2_17
>>
>> and:
>> http://sourceware.org/glibc/wiki/benchmarking/benchmarks
>>
>> Cheers,
>> Carlos.
> 
> We had a few tests around here that I used and I wrote one of my own
> too.  I have attached my test, when using it with -UVERIFY and testing
> on a 74K system I got the following timings (32 bits, little-endian):
> 
> The FSF memcpy: 3m9.34s
> Maxim's memcpy: 2m0.41s
> My memcpy:      1m22.20s
> 
> If there are any official or recommended memcpy benchmarks I'd be happy
> to try them as well.

There are none, but that's the kind of consensus we're trying to build by
documenting exactly which tests were used to benchmark which functions.

Thanks for posting the test sources!

Cheers,
Carlos.
-- 
Carlos O'Donell
Mentor Graphics / CodeSourcery
carlos_odonell@mentor.com
carlos@codesourcery.com
+1 (613) 963 1026

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Optimize MIPS memcpy
  2012-09-04 15:09   ` Steve Ellcey
  2012-09-04 15:14     ` Carlos O'Donell
@ 2012-09-05  0:43     ` Maxim Kuvyrkov
  2012-09-06 16:25       ` Steve Ellcey
  1 sibling, 1 reply; 50+ messages in thread
From: Maxim Kuvyrkov @ 2012-09-05  0:43 UTC (permalink / raw)
  To: Steve Ellcey; +Cc: Andrew T Pinski, Joseph S.  Myers, libc-ports

On 5/09/2012, at 3:09 AM, Steve Ellcey wrote:

> On Mon, 2012-09-03 at 02:12 -0700, Andrew T Pinski wrote:
>> Forgot to CC libc-ports@ .
>> On Sat, 2012-09-01 at 18:15 +1200, Maxim Kuvyrkov wrote:
>>> This patch improves MIPS assembly implementations of memcpy.  Two optimizations are added:
>> prefetching of data for subsequent iterations of memcpy loop and pipelined expansion of unaligned
>> memcpy.  These optimizations speed up MIPS memcpy by about 10%.
>>> 
>>> The prefetching part is straightforward: it adds prefetching of a cache line (32 bytes) for +1
>> iteration for unaligned case and +2 iteration for aligned case.  The rationale here is that it will
>> take prefetch to acquire data about same time as 1 iteration of unaligned loop or 2 iterations of aligned loop.  Values for these parameters were tuned on a modern MIPS processor.
>>> 
>> 
>> This might hurt Octeon as the cache line size there is 128 bytes.  Can
>> you say which modern MIPS processor which this has been tuned with?  And
>> is there a way to not hard code 32 in the assembly but in a macro
>> instead.
>> 
>> Thanks,
>> Andrew Pinski
> 
> I've been looking at the MIPS memcpy and was planning on submitting a
> new version based on the one that MIPS submitted to Android.  It has
> prefetching like Maxim's though I found that using the load and 'prepare
> for store' hints instead of 'load streaming' and 'store streaming' hints
> gave me better results on the 74k and 24k that I did performance testing
> on.

I didn't experiment with various prefetching hints, so this very well may be the case.

> 
> This version has more unrolling too and between that and the hints
> difference I got a small performance improvement over Maxim's version
> when doing small memcpy's and a fairly substantial improvement on large
> memcpy's.
> 
> I also merged the 32 and 64 bit versions together so we would only have
> one copy to maintain.  I haven't tried building it as part of glibc yet,
> I have been testing it standalone first and was going to try and
> integrate it into glibc and submit it this week or next.  I'll attach it
> to this email so folks can look at it and I will see if I can
> parameterize the cache line size.  This one also assumes a 32 byte cache
> prefetch.
> 

Your version looks quite good.  If you could wrap it up into a glibc patch I would test it on our setup to confirm that it indeed provides better performance.

Thanks,

--
Maxim Kuvyrkov
Mentor Graphics

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Optimize MIPS memcpy
  2012-09-05  0:43     ` Maxim Kuvyrkov
@ 2012-09-06 16:25       ` Steve Ellcey
  2012-09-06 18:43         ` Roland McGrath
                           ` (2 more replies)
  0 siblings, 3 replies; 50+ messages in thread
From: Steve Ellcey @ 2012-09-06 16:25 UTC (permalink / raw)
  To: Maxim Kuvyrkov; +Cc: Andrew T Pinski, Joseph S.  Myers, libc-ports

[-- Attachment #1: Type: text/plain, Size: 553 bytes --]

On Wed, 2012-09-05 at 12:43 +1200, Maxim Kuvyrkov wrote:

> Your version looks quite good.  If you could wrap it up into a glibc patch I would test it on our
> setup to confirm that it indeed provides better performance.
> 
> Thanks,
> 
> --
> Maxim Kuvyrkov
> Mentor Graphics

I have attached a glibc patch for my version of memcpy.


2012-09-06  Steve Ellcey  <sellcey@mips.com>

	* sysdeps/mips/memcpy.S: Add prefetching and more unrolling; make
	it work in both 32- and 64-bit modes.
	* sysdeps/mips/mips64/memcpy.S: Remove.

Steve Ellcey
sellcey@mips.com

[-- Attachment #2: memcpy.patch --]
[-- Type: text/x-patch, Size: 24714 bytes --]

diff --git a/ports/sysdeps/mips/memcpy.S b/ports/sysdeps/mips/memcpy.S
index 753f67c..58d6e46 100644
--- a/ports/sysdeps/mips/memcpy.S
+++ b/ports/sysdeps/mips/memcpy.S
@@ -1,134 +1,573 @@
-/* Copyright (C) 2002-2012 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-   Contributed by Hartvig Ekner <hartvige@mips.com>, 2002.
+/* Copyright (C) 2012 Free Software Foundation, Inc.
+ *  This file is part of the GNU C Library.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ *    contributors may be used to endorse or promote products derived from
+ *    this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
 
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
+#ifdef __BIONIC__
+#include "machine/asm.h"
+#include "machine/regdef.h"
+#define ALLOW_OVERLAP
+#define USE_PREFETCH
+#else
+#ifdef _LIBC
+#include <sysdep.h>
+#define USE_PREFETCH
+#endif
+#include <regdef.h>
+#include <sys/asm.h>
+#if _MIPS_ISA == _MIPS_ISA_MIPS32 || _MIPS_ISA == _MIPS_ISA_MIPS64
+#define PREFETCH
+#endif
+#if _MIPS_SIM == _ABI64
+#define USE_DOUBLE
+#endif
+#endif
 
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
 
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library.  If not, see
-   <http://www.gnu.org/licenses/>.  */
 
-#include <sysdep.h>
+/* Some asm.h files do not have the L macro definition.  */
+#ifndef L
+#if _MIPS_SIM == _ABIO32
+# define L(label) $L ## label
+#else
+# define L(label) .L ## label
+#endif
+#endif
+
+/* Some regdef.h files do not have the PTR_ADDIU macro definition.  */
+#ifndef PTR_ADDIU
+#ifdef USE_DOUBLE
+#define PTR_ADDIU	daddiu
+#else
+#define PTR_ADDIU	addiu
+#endif
+#endif
 
 
-/* void *memcpy(void *s1, const void *s2, size_t n);  */
+/*
+ * Using PREF_LOAD_STREAMED instead of PREF_LOAD on load prefetches offers
+ * a slight performance advantage, using PREF_PREPAREFORSTORE instead of
+ * PREF_STORE_STREAMED or PREF_STORE offers a large performance advantage.
+ */
 
+#ifdef USE_PREFETCH
+# define PREF_LOAD		0
+# define PREF_STORE		1
+# define PREF_LOAD_STREAMED	4
+# define PREF_STORE_STREAMED	5
+# define PREF_LOAD_RETAINED	6
+# define PREF_STORE_RETAINED	7
+# define PREF_WRITEBACK_INVAL	25
+# define PREF_PREPAREFORSTORE	30
+
+/*
+ * We double everything when USE_DOUBLE is true so we do 2 prefetches to
+ * get 64 bytes in that case.  The assumption is that each individual 
+ * prefetch brings in 32 bytes.
+ */
+#ifdef USE_DOUBLE
+# define PREF_CHUNK 64
+# define PREFETCH_FOR_LOAD(chunk, reg) \
+ pref PREF_LOAD_STREAMED, (chunk)*32(reg); \
+ pref PREF_LOAD_STREAMED, ((chunk)+1)*32(reg)
+# define PREFETCH_FOR_STORE(chunk, reg) \
+ pref PREF_PREPAREFORSTORE, (chunk)*32(reg); \
+ pref PREF_PREPAREFORSTORE, ((chunk)+1)*32(reg)
+#else
+# define PREF_CHUNK 32
+# define PREFETCH_FOR_LOAD(chunk, reg) \
+ pref PREF_LOAD_STREAMED, (chunk)*32(reg)
+# define PREFETCH_FOR_STORE(chunk, reg) \
+ pref PREF_PREPAREFORSTORE, (chunk)*32(reg)
+#endif
+#define PREF_LIMIT (5 * PREF_CHUNK)
+#else
+# define PREFETCH_FOR_LOAD(offset, reg)
+# define PREFETCH_FOR_STORE(offset, reg)
+#endif
+
+/* Allow the routine to be named something else if desired.  */
+#ifndef MEMCPY_NAME
+#define MEMCPY_NAME memcpy
+#endif
+
+/* We use these 32/64 bit registers as temporaries to do the copying.  */
+#define REG0 t0
+#define REG1 t1
+#define REG2 t2
+#define REG3 t3
+#ifdef USE_DOUBLE
+#  define REG4 ta0
+#  define REG5 ta1
+#  define REG6 ta2
+#  define REG7 ta3
+#else
+#  define REG4 t4
+#  define REG5 t5
+#  define REG6 t6
+#  define REG7 t7
+#endif
+
+/* We load/store 64 bits at a time when USE_DOUBLE is true.  */
+#ifdef USE_DOUBLE
+#  define ST	sd
+#  define LD	ld
 #if __MIPSEB
-#  define LWHI	lwl		/* high part is left in big-endian	*/
-#  define SWHI	swl		/* high part is left in big-endian	*/
-#  define LWLO	lwr		/* low part is right in big-endian	*/
-#  define SWLO	swr		/* low part is right in big-endian	*/
+#  define LDHI	ldl		/* high part is left in big-endian	*/
+#  define STHI	sdl		/* high part is left in big-endian	*/
+#  define LDLO	ldr		/* low part is right in big-endian	*/
+#  define STLO	sdr		/* low part is right in big-endian	*/
 #else
-#  define LWHI	lwr		/* high part is right in little-endian	*/
-#  define SWHI	swr		/* high part is right in little-endian	*/
-#  define LWLO	lwl		/* low part is left in little-endian	*/
-#  define SWLO	swl		/* low part is left in little-endian	*/
+#  define LDHI	ldr		/* high part is right in little-endian	*/
+#  define STHI	sdr		/* high part is right in little-endian	*/
+#  define LDLO	ldl		/* low part is left in little-endian	*/
+#  define STLO	sdl		/* low part is left in little-endian	*/
 #endif
+#else
+#  define ST	sw
+#  define LD	lw
+#if __MIPSEB
+#  define LDHI	lwl		/* high part is left in big-endian	*/
+#  define STHI	swl		/* high part is left in big-endian	*/
+#  define LDLO	lwr		/* low part is right in big-endian	*/
+#  define STLO	swr		/* low part is right in big-endian	*/
+#else
+#  define LDHI	lwr		/* high part is right in little-endian	*/
+#  define STHI	swr		/* high part is right in little-endian	*/
+#  define LDLO	lwl		/* low part is left in little-endian	*/
+#  define STLO	swl		/* low part is left in little-endian	*/
+#endif
+#endif
+
+/* Bookkeeping values for 32 vs. 64 bit mode.  */
+#ifdef USE_DOUBLE
+#  define NSIZE 8
+#  define NSIZEMASK 0x3f
+#  define NSIZEDMASK 0x7f
+#else
+#  define NSIZE 4
+#  define NSIZEMASK 0x1f
+#  define NSIZEDMASK 0x3f
+#endif
+#define UNIT(unit) ((unit)*NSIZE)
+#define UNITM1(unit) (((unit)*NSIZE)-1)
 
-ENTRY (memcpy)
+#ifdef __BIONIC__
+LEAF(MEMCPY_NAME, 0)
+#else
+LEAF(MEMCPY_NAME)
+#endif
+	.set	nomips16
 	.set	noreorder
+/*
+ * Below we handle the case where memcpy is called with overlapping src and dst.
+ * Although memcpy is not required to handle this case, some parts of Android
+ * like Skia rely on such usage. We call memmove to handle such cases.
+ */
+#ifdef ALLOW_OVERLAP
+	PTR_SUBU t0,a0,a1
+	PTR_SRA	t2,t0,31
+	xor	t1,t0,t2
+	PTR_SUBU t0,t1,t2
+	sltu	t2,t0,a2
+	beq	t2,zero,L(memcpy)
+	la	t9,memmove
+	jr	t9
+	 nop
+L(memcpy):
+#endif
+/*
+ * If the size is less than 2*NSIZE (8 or 16), go to L(lastb).  Regardless of
+ * size, copy dst pointer to v0 for the return value.
+ */
+	slti	t2,a2,(2 * NSIZE)
+	bne	t2,zero,L(lastb)
+	move	v0,a0
+/*
+ * If src and dst have different alignments, go to L(unaligned), if they
+ * have the same alignment (but are not actually aligned) do a partial
+ * load/store to make them aligned.  If they are both already aligned
+ * we can start copying at L(aligned).
+ */
+	xor	t8,a1,a0
+	andi	t8,t8,(NSIZE-1)		/* t8 is a0/a1 word-displacement */
+	bne	t8,zero,L(unaligned)
+	PTR_SUBU a3, zero, a0
+
+	andi	a3,a3,(NSIZE-1)		/* copy a3 bytes to align a0/a1	  */
+	beq	a3,zero,L(aligned)	/* if a3=0, it is already aligned */
+	PTR_SUBU a2,a2,a3		/* a2 is the remaining bytes count */
+
+	LDHI	t8,0(a1)
+	PTR_ADDU a1,a1,a3
+	STHI	t8,0(a0)
+	PTR_ADDU a0,a0,a3
+
+/*
+ * Now dst/src are both aligned to (word or double word) aligned addresses
+ * Set a2 to count how many bytes we have to copy after all the 64/128 byte
+ * chunks are copied and a3 to the dst pointer after all the 64/128 byte 
+ * chunks have been copied.  We will loop, incrementing a0 and a1 until a0
+ * equals a3.
+ */
+
+L(aligned):
+	andi	t8,a2,NSIZEDMASK /* any whole 64-byte/128-byte chunks? */
+	beq	a2,t8,L(chkw)	 /* if a2==t8, no 64-byte/128-byte chunks */
+	PTR_SUBU a3,a2,t8	 /* subtract from a2 the remainder */
+	PTR_ADDU a3,a0,a3	 /* Now a3 is the final dst after loop */
+
+/* When in the loop we exercise "pref 30,x(a0)", the a0+x should not be past
+ * the "t0-32" address.  This means: for x=128 the last "safe" a0 address is
+ * "t0-160".  Alternatively, for x=64 the last "safe" a0 address is "t0-96"
+ * In the current version we will use "pref 30,128(a0)", so "t0-160" is the
+ * limit
+ */
+#ifdef USE_PREFETCH
+	PTR_ADDU t0,a0,a2		/* t0 is the "past the end" address */
+	PTR_SUBU t9,t0,PREF_LIMIT	/* t9 is the "last safe pref" address */
+	PREFETCH_FOR_LOAD  (0, a1)
+	PREFETCH_FOR_LOAD  (1, a1)
+	PREFETCH_FOR_LOAD  (2, a1)
+	PREFETCH_FOR_STORE (1, a0)
+	sltu	v1,t9,a0		/* If a0 > t9 don't use next prefetch */
+	bgtz	v1,L(loop16w)
+	nop
+#endif
+	PREFETCH_FOR_STORE (2, a0)
+L(loop16w):
+	PREFETCH_FOR_LOAD  (3, a1)
+	LD	t0,UNIT(0)(a1)
+#ifdef USE_PREFETCH
+	bgtz	v1,L(skip_pref30_96)
+#endif
+	LD	t1,UNIT(1)(a1)
+	PREFETCH_FOR_STORE (3, a0)
+L(skip_pref30_96):
+	LD	REG2,UNIT(2)(a1)
+	LD	REG3,UNIT(3)(a1)
+	LD	REG4,UNIT(4)(a1)
+	LD	REG5,UNIT(5)(a1)
+	LD	REG6,UNIT(6)(a1)
+	LD	REG7,UNIT(7)(a1)
+        PREFETCH_FOR_LOAD (4, a1)
+
+	ST	t0,UNIT(0)(a0)
+	ST	t1,UNIT(1)(a0)
+	ST	REG2,UNIT(2)(a0)
+	ST	REG3,UNIT(3)(a0)
+	ST	REG4,UNIT(4)(a0)
+	ST	REG5,UNIT(5)(a0)
+	ST	REG6,UNIT(6)(a0)
+	ST	REG7,UNIT(7)(a0)
+
+	LD	t0,UNIT(8)(a1)
+#ifdef USE_PREFETCH
+	bgtz	v1,L(skip_pref30_128)
+#endif
+	LD	t1,UNIT(9)(a1)
+	PREFETCH_FOR_STORE (4, a0)
+L(skip_pref30_128):
+	LD	REG2,UNIT(10)(a1)
+	LD	REG3,UNIT(11)(a1)
+	LD	REG4,UNIT(12)(a1)
+	LD	REG5,UNIT(13)(a1)
+	LD	REG6,UNIT(14)(a1)
+	LD	REG7,UNIT(15)(a1)
+        PREFETCH_FOR_LOAD (5, a1)
+	ST	t0,UNIT(8)(a0)
+	ST	t1,UNIT(9)(a0)
+	ST	REG2,UNIT(10)(a0)
+	ST	REG3,UNIT(11)(a0)
+	ST	REG4,UNIT(12)(a0)
+	ST	REG5,UNIT(13)(a0)
+	ST	REG6,UNIT(14)(a0)
+	ST	REG7,UNIT(15)(a0)
+	PTR_ADDIU a0,a0,UNIT(16)	/* adding 64/128 to dest */
+#ifdef USE_PREFETCH
+	sltu	v1,t9,a0
+#endif
+	bne	a0,a3,L(loop16w)
+	PTR_ADDIU a1,a1,UNIT(16)	/* adding 64/128 to src */
+	move	a2,t8
+
+/* Here we have src and dest word-aligned but less than 64 or
+ * 128 bytes to go.  Check for a 32 (or 64) byte chunk and copy it if there
+ * is one.  Otherwise jump down to L(chk1w) to handle the tail end of
+ * the copy.
+ */
+
+L(chkw):
+	PREFETCH_FOR_LOAD (0, a1)
+	andi	t8,a2,NSIZEMASK	/* Is there a 32-byte/64-byte chunk.  */
+				/* t8 is the remainder count past 32 bytes */
+	beq	a2,t8,L(chk1w)	/* When a2=t8, no 32-byte chunk  */
+	nop
+	LD	t0,UNIT(0)(a1)
+	LD	t1,UNIT(1)(a1)
+	LD	REG2,UNIT(2)(a1)
+	LD	REG3,UNIT(3)(a1)
+	LD	REG4,UNIT(4)(a1)
+	LD	REG5,UNIT(5)(a1)
+	LD	REG6,UNIT(6)(a1)
+	LD	REG7,UNIT(7)(a1)
+	PTR_ADDIU a1,a1,UNIT(8)
+	ST	t0,UNIT(0)(a0)
+	ST	t1,UNIT(1)(a0)
+	ST	REG2,UNIT(2)(a0)
+	ST	REG3,UNIT(3)(a0)
+	ST	REG4,UNIT(4)(a0)
+	ST	REG5,UNIT(5)(a0)
+	ST	REG6,UNIT(6)(a0)
+	ST	REG7,UNIT(7)(a0)
+	PTR_ADDIU a0,a0,UNIT(8)
+
+/*
+ * Here we have less than 32 (or 64) bytes to copy.  Set up for a loop to
+ * copy one word (or double word) at a time.  Set a2 to count how many
+ * bytes we have to copy after all the word (or double word) chunks are
+ * copied and a3 to the dst pointer after all the (d)word chunks have
+ * been copied.  We will loop, incrementing a0 and a1 until a0 equals a3.
+ */
+L(chk1w):
+	andi	a2,t8,(NSIZE-1)	/* a2 is the remainder past the (d)word chunks */
+	beq	a2,t8,L(lastb)
+	PTR_SUBU a3,t8,a2	/* a3 is count of bytes in one (d)word chunks */
+	PTR_ADDU a3,a0,a3	/* a3 is the dst address after loop */
+
+/* copying in words (4-byte or 8-byte chunks) */
+L(wordCopy_loop):
+	LD	REG3,UNIT(0)(a1)
+	PTR_ADDIU a1,a1,UNIT(1)
+	PTR_ADDIU a0,a0,UNIT(1)
+	bne	a0,a3,L(wordCopy_loop)
+	ST	REG3,UNIT(-1)(a0)
 
-	slti	t0, a2, 8		# Less than 8?
-	bne	t0, zero, L(last8)
-	move	v0, a0			# Setup exit value before too late
-
-	xor	t0, a1, a0		# Find a0/a1 displacement
-	andi	t0, 0x3
-	bne	t0, zero, L(shift)	# Go handle the unaligned case
-	subu	t1, zero, a1
-	andi	t1, 0x3			# a0/a1 are aligned, but are we
-	beq	t1, zero, L(chk8w)	#  starting in the middle of a word?
-	subu	a2, t1
-	LWHI	t0, 0(a1)		# Yes we are... take care of that
-	addu	a1, t1
-	SWHI	t0, 0(a0)
-	addu	a0, t1
-
-L(chk8w):	
-	andi	t0, a2, 0x1f		# 32 or more bytes left?
-	beq	t0, a2, L(chk1w)
-	subu	a3, a2, t0		# Yes
-	addu	a3, a1			# a3 = end address of loop
-	move	a2, t0			# a2 = what will be left after loop
-L(lop8w):	
-	lw	t0,  0(a1)		# Loop taking 8 words at a time
-	lw	t1,  4(a1)
-	lw	t2,  8(a1)
-	lw	t3, 12(a1)
-	lw	t4, 16(a1)
-	lw	t5, 20(a1)
-	lw	t6, 24(a1)
-	lw	t7, 28(a1)
-	addiu	a0, 32
-	addiu	a1, 32
-	sw	t0, -32(a0)
-	sw	t1, -28(a0)
-	sw	t2, -24(a0)
-	sw	t3, -20(a0)
-	sw	t4, -16(a0)
-	sw	t5, -12(a0)
-	sw	t6,  -8(a0)
-	bne	a1, a3, L(lop8w)
-	sw	t7,  -4(a0)
-
-L(chk1w):	
-	andi	t0, a2, 0x3		# 4 or more bytes left?
-	beq	t0, a2, L(last8)
-	subu	a3, a2, t0		# Yes, handle them one word at a time
-	addu	a3, a1			# a3 again end address
-	move	a2, t0
-L(lop1w):	
-	lw	t0, 0(a1)
-	addiu	a0, 4
-	addiu	a1, 4
-	bne	a1, a3, L(lop1w)
-	sw	t0, -4(a0)
-
-L(last8):	
-	blez	a2, L(lst8e)		# Handle last 8 bytes, one at a time
-	addu	a3, a2, a1
-L(lst8l):	
-	lb	t0, 0(a1)
-	addiu	a0, 1
-	addiu	a1, 1
-	bne	a1, a3, L(lst8l)
-	sb	t0, -1(a0)
-L(lst8e):	
-	jr	ra			# Bye, bye
+/* Copy the last 8 (or 16) bytes */
+L(lastb):
+	blez	a2,L(leave)
+	PTR_ADDU a3,a0,a2	/* a3 is the last dst address */
+L(lastbloop):
+	lb	v1,0(a1)
+	PTR_ADDIU a1,a1,1
+	PTR_ADDIU a0,a0,1
+	bne	a0,a3,L(lastbloop)
+	sb	v1,-1(a0)
+L(leave):
+	j	ra
 	nop
+/*
+ * UNALIGNED case, got here with a3 = "negu a0"
+ * This code is nearly identical to the aligned code above
+ * but only the destination (not the source) gets aligned
+ * so we need to do partial loads of the source followed
+ * by normal stores to the destination (once we have aligned
+ * the destination).
+ */
 
-L(shift):	
-	subu	a3, zero, a0		# Src and Dest unaligned 
-	andi	a3, 0x3			#  (unoptimized case...)
-	beq	a3, zero, L(shft1)
-	subu	a2, a3			# a2 = bytes left
-	LWHI	t0, 0(a1)		# Take care of first odd part
-	LWLO	t0, 3(a1)
-	addu	a1, a3
-	SWHI	t0, 0(a0)
-	addu	a0, a3
-L(shft1):	
-	andi	t0, a2, 0x3
-	subu	a3, a2, t0
-	addu	a3, a1
-L(shfth):	
-	LWHI	t1, 0(a1)		# Limp through, word by word
-	LWLO	t1, 3(a1)
-	addiu	a0, 4
-	addiu	a1, 4
-	bne	a1, a3, L(shfth)
-	sw	t1, -4(a0)
-	b	L(last8)		# Handle anything which may be left
-	move	a2, t0
+L(unaligned):
+	andi	a3,a3,(NSIZE-1)	/* copy a3 bytes to align a0/a1 */
+	beqz	a3,L(ua_chk16w) /* if a3=0, it is already aligned */
+	PTR_SUBU a2,a2,a3	/* a2 is the remaining bytes count */
 
+	LDHI	v1,UNIT(0)(a1)
+	LDLO	v1,UNITM1(1)(a1)
+	PTR_ADDU a1,a1,a3
+	STHI	v1,UNIT(0)(a0)
+	PTR_ADDU a0,a0,a3
+
+/*
+ *  Now the destination (but not the source) is aligned
+ * Set a2 to count how many bytes we have to copy after all the 64/128 byte
+ * chunks are copied and a3 to the dst pointer after all the 64/128 byte
+ * chunks have been copied.  We will loop, incrementing a0 and a1 until a0
+ * equals a3.
+ */
+
+L(ua_chk16w):
+	andi	t8,a2,NSIZEDMASK /* any whole 64-byte/128-byte chunks? */
+	beq	a2,t8,L(ua_chkw) /* if a2==t8, no 64-byte/128-byte chunks */
+	PTR_SUBU a3,a2,t8	 /* subtract from a2 the remainder */
+	PTR_ADDU a3,a0,a3	 /* Now a3 is the final dst after loop */
+
+#ifdef USE_PREFETCH
+	PTR_ADDU t0,a0,a2	  /* t0 is the "past the end" address */
+	PTR_SUBU t9,t0,PREF_LIMIT /* t9 is the "last safe pref" address */
+	PREFETCH_FOR_LOAD  (0, a1)
+	PREFETCH_FOR_LOAD  (1, a1)
+	PREFETCH_FOR_LOAD  (2, a1)
+	PREFETCH_FOR_STORE (1, a0)
+	sltu	v1,t9,a0
+	bgtz	v1,L(ua_loop16w)  /* skip prefetch for too short arrays */
+	nop
+#endif
+	PREFETCH_FOR_STORE (2, a0)
+L(ua_loop16w):
+	PREFETCH_FOR_LOAD  (3, a1)
+	LDHI	t0,UNIT(0)(a1)
+	LDLO	t0,UNITM1(1)(a1)
+	LDHI	t1,UNIT(1)(a1)
+#ifdef USE_PREFETCH
+	bgtz	v1,L(ua_skip_pref30_96)
+#endif
+	LDLO	t1,UNITM1(2)(a1)
+	PREFETCH_FOR_STORE (3, a0)
+L(ua_skip_pref30_96):
+	LDHI	REG2,UNIT(2)(a1)
+	LDLO	REG2,UNITM1(3)(a1)
+	LDHI	REG3,UNIT(3)(a1)
+	LDLO	REG3,UNITM1(4)(a1)
+	LDHI	REG4,UNIT(4)(a1)
+	LDLO	REG4,UNITM1(5)(a1)
+	LDHI	REG5,UNIT(5)(a1)
+	LDLO	REG5,UNITM1(6)(a1)
+	LDHI	REG6,UNIT(6)(a1)
+	LDLO	REG6,UNITM1(7)(a1)
+	LDHI	REG7,UNIT(7)(a1)
+	LDLO	REG7,UNITM1(8)(a1)
+        PREFETCH_FOR_LOAD (4, a1)
+	ST	t0,UNIT(0)(a0)
+	ST	t1,UNIT(1)(a0)
+	ST	REG2,UNIT(2)(a0)
+	ST	REG3,UNIT(3)(a0)
+	ST	REG4,UNIT(4)(a0)
+	ST	REG5,UNIT(5)(a0)
+	ST	REG6,UNIT(6)(a0)
+	ST	REG7,UNIT(7)(a0)
+	LDHI	t0,UNIT(8)(a1)
+	LDLO	t0,UNITM1(9)(a1)
+	LDHI	t1,UNIT(9)(a1)
+#ifdef USE_PREFETCH
+	bgtz	v1,L(ua_skip_pref30_128)
+#endif
+	LDLO	t1,UNITM1(10)(a1)
+	PREFETCH_FOR_STORE (4, a0)
+L(ua_skip_pref30_128):
+	LDHI	REG2,UNIT(10)(a1)
+	LDLO	REG2,UNITM1(11)(a1)
+	LDHI	REG3,UNIT(11)(a1)
+	LDLO	REG3,UNITM1(12)(a1)
+	LDHI	REG4,UNIT(12)(a1)
+	LDLO	REG4,UNITM1(13)(a1)
+	LDHI	REG5,UNIT(13)(a1)
+	LDLO	REG5,UNITM1(14)(a1)
+	LDHI	REG6,UNIT(14)(a1)
+	LDLO	REG6,UNITM1(15)(a1)
+	LDHI	REG7,UNIT(15)(a1)
+	LDLO	REG7,UNITM1(16)(a1)
+        PREFETCH_FOR_LOAD (5, a1)
+	ST	t0,UNIT(8)(a0)
+	ST	t1,UNIT(9)(a0)
+	ST	REG2,UNIT(10)(a0)
+	ST	REG3,UNIT(11)(a0)
+	ST	REG4,UNIT(12)(a0)
+	ST	REG5,UNIT(13)(a0)
+	ST	REG6,UNIT(14)(a0)
+	ST	REG7,UNIT(15)(a0)
+	PTR_ADDIU a0,a0,UNIT(16)	/* adding 64/128 to dest */
+#ifdef USE_PREFETCH
+	sltu	v1,t9,a0
+#endif
+	bne	a0,a3,L(ua_loop16w)
+	PTR_ADDIU a1,a1,UNIT(16)	/* adding 64/128 to src */
+	move	a2,t8
+
+/* Here we have the dst word-aligned (the src may not be) and less than 64
+ * or 128 bytes to go.  Check for a 32 (or 64) byte chunk and copy it if
+ * there is one.  Otherwise jump down to L(ua_chk1w) to handle the tail end
+ * of the copy.  */
+
+L(ua_chkw):
+	PREFETCH_FOR_LOAD (0, a1)
+	andi	t8,a2,NSIZEMASK	  /* Is there a 32-byte/64-byte chunk.  */
+				  /* t8 is the remainder count past 32 bytes */
+	beq	a2,t8,L(ua_chk1w) /* When a2=t8, no 32-byte chunk */
+	nop
+	LDHI	t0,UNIT(0)(a1)
+	LDLO	t0,UNITM1(1)(a1)
+	LDHI	t1,UNIT(1)(a1)
+	LDLO	t1,UNITM1(2)(a1)
+	LDHI	REG2,UNIT(2)(a1)
+	LDLO	REG2,UNITM1(3)(a1)
+	LDHI	REG3,UNIT(3)(a1)
+	LDLO	REG3,UNITM1(4)(a1)
+	LDHI	REG4,UNIT(4)(a1)
+	LDLO	REG4,UNITM1(5)(a1)
+	LDHI	REG5,UNIT(5)(a1)
+	LDLO	REG5,UNITM1(6)(a1)
+	LDHI	REG6,UNIT(6)(a1)
+	LDLO	REG6,UNITM1(7)(a1)
+	LDHI	REG7,UNIT(7)(a1)
+	LDLO	REG7,UNITM1(8)(a1)
+	PTR_ADDIU a1,a1,UNIT(8)
+	ST	t0,UNIT(0)(a0)
+	ST	t1,UNIT(1)(a0)
+	ST	REG2,UNIT(2)(a0)
+	ST	REG3,UNIT(3)(a0)
+	ST	REG4,UNIT(4)(a0)
+	ST	REG5,UNIT(5)(a0)
+	ST	REG6,UNIT(6)(a0)
+	ST	REG7,UNIT(7)(a0)
+	PTR_ADDIU a0,a0,UNIT(8)
+/*
+ * Here we have less than 32 (or 64) bytes to copy.  Set up for a loop to
+ * copy one word (or double word) at a time.
+ */
+L(ua_chk1w):
+	andi	a2,t8,(NSIZE-1)	/* a2 is the remainder past the (d)word chunks */
+	beq	a2,t8,L(ua_smallCopy)
+	PTR_SUBU a3,t8,a2	/* a3 is count of bytes in one (d)word chunks */
+	PTR_ADDU a3,a0,a3	/* a3 is the dst address after loop */
+
+/* copying in words (4-byte or 8-byte chunks) */
+L(ua_wordCopy_loop):
+	LDHI	v1,UNIT(0)(a1)
+	LDLO	v1,UNITM1(1)(a1)
+	PTR_ADDIU a1,a1,UNIT(1)
+	PTR_ADDIU a0,a0,UNIT(1)
+	bne	a0,a3,L(ua_wordCopy_loop)
+	ST	v1,UNIT(-1)(a0)
+
+/* Copy the last 8 (or 16) bytes */
+L(ua_smallCopy):
+	beqz	a2,L(leave)
+	PTR_ADDU a3,a0,a2	/* a3 is the last dst address */
+L(ua_smallCopy_loop):
+	lb	v1,0(a1)
+	PTR_ADDIU a1,a1,1
+	PTR_ADDIU a0,a0,1
+	bne	a0,a3,L(ua_smallCopy_loop)
+	sb	v1,-1(a0)
+
+	j	ra
+	nop
+
+	.set	at
 	.set	reorder
-END (memcpy)
-libc_hidden_builtin_def (memcpy)
+END(MEMCPY_NAME)
+#ifdef _LIBC
+libc_hidden_builtin_def (MEMCPY_NAME)
+#endif
diff --git a/ports/sysdeps/mips/mips64/memcpy.S b/ports/sysdeps/mips/mips64/memcpy.S
deleted file mode 100644
index 49ef34d..0000000
--- a/ports/sysdeps/mips/mips64/memcpy.S
+++ /dev/null
@@ -1,138 +0,0 @@
-/* Copyright (C) 2002-2012 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-   Contributed by Hartvig Ekner <hartvige@mips.com>, 2002.
-   Ported to mips3 n32/n64 by Alexandre Oliva <aoliva@redhat.com>
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include <sysdep.h>
-#include <sys/asm.h>
-
-
-/* void *memcpy(void *s1, const void *s2, size_t n);
-	
-   This could probably be optimized further.  */
-
-#if __MIPSEB
-#  define LDHI	ldl		/* high part is left in big-endian	*/
-#  define SDHI	sdl		/* high part is left in big-endian	*/
-#  define LDLO	ldr		/* low part is right in big-endian	*/
-#  define SDLO	sdr		/* low part is right in big-endian	*/
-#else
-#  define LDHI	ldr		/* high part is right in little-endian	*/
-#  define SDHI	sdr		/* high part is right in little-endian	*/
-#  define LDLO	ldl		/* low part is left in little-endian	*/
-#  define SDLO	sdl		/* low part is left in little-endian	*/
-#endif
-
-ENTRY (memcpy)
-	.set	noreorder
-
-	slti	t0, a2, 16		# Less than 16?
-	bne	t0, zero, L(last16)
-	move	v0, a0			# Setup exit value before too late
-
-	xor	t0, a1, a0		# Find a0/a1 displacement
-	andi	t0, 0x7
-	bne	t0, zero, L(shift)	# Go handle the unaligned case
-	PTR_SUBU t1, zero, a1
-	andi	t1, 0x7			# a0/a1 are aligned, but are we
-	beq	t1, zero, L(chk8w)	#  starting in the middle of a word?
-	PTR_SUBU a2, t1
-	LDHI	t0, 0(a1)		# Yes we are... take care of that
-	PTR_ADDU a1, t1
-	SDHI	t0, 0(a0)
-	PTR_ADDU a0, t1
-
-L(chk8w):
-	andi	t0, a2, 0x3f		# 64 or more bytes left?
-	beq	t0, a2, L(chk1w)
-	PTR_SUBU a3, a2, t0		# Yes
-	PTR_ADDU a3, a1			# a3 = end address of loop
-	move	a2, t0			# a2 = what will be left after loop
-L(lop8w):	
-	ld	t0,  0(a1)		# Loop taking 8 words at a time
-	ld	t1,  8(a1)
-	ld	t2, 16(a1)
-	ld	t3, 24(a1)
-	ld	ta0, 32(a1)
-	ld	ta1, 40(a1)
-	ld	ta2, 48(a1)
-	ld	ta3, 56(a1)
-	PTR_ADDIU a0, 64
-	PTR_ADDIU a1, 64
-	sd	t0, -64(a0)
-	sd	t1, -56(a0)
-	sd	t2, -48(a0)
-	sd	t3, -40(a0)
-	sd	ta0, -32(a0)
-	sd	ta1, -24(a0)
-	sd	ta2, -16(a0)
-	bne	a1, a3, L(lop8w)
-	sd	ta3,  -8(a0)
-
-L(chk1w):
-	andi	t0, a2, 0x7		# 8 or more bytes left?
-	beq	t0, a2, L(last16)
-	PTR_SUBU a3, a2, t0		# Yes, handle them one dword at a time
-	PTR_ADDU a3, a1			# a3 again end address
-	move	a2, t0
-L(lop1w):
-	ld	t0, 0(a1)
-	PTR_ADDIU a0, 8
-	PTR_ADDIU a1, 8
-	bne	a1, a3, L(lop1w)
-	sd	t0, -8(a0)
-
-L(last16):
-	blez	a2, L(lst16e)		# Handle last 16 bytes, one at a time
-	PTR_ADDU a3, a2, a1
-L(lst16l):
-	lb	t0, 0(a1)
-	PTR_ADDIU a0, 1
-	PTR_ADDIU a1, 1
-	bne	a1, a3, L(lst16l)
-	sb	t0, -1(a0)
-L(lst16e):
-	jr	ra			# Bye, bye
-	nop
-
-L(shift):
-	PTR_SUBU a3, zero, a0		# Src and Dest unaligned 
-	andi	a3, 0x7			#  (unoptimized case...)
-	beq	a3, zero, L(shft1)
-	PTR_SUBU a2, a3			# a2 = bytes left
-	LDHI	t0, 0(a1)		# Take care of first odd part
-	LDLO	t0, 7(a1)
-	PTR_ADDU a1, a3
-	SDHI	t0, 0(a0)
-	PTR_ADDU a0, a3
-L(shft1):
-	andi	t0, a2, 0x7
-	PTR_SUBU a3, a2, t0
-	PTR_ADDU a3, a1
-L(shfth):
-	LDHI	t1, 0(a1)		# Limp through, dword by dword
-	LDLO	t1, 7(a1)
-	PTR_ADDIU a0, 8
-	PTR_ADDIU a1, 8
-	bne	a1, a3, L(shfth)
-	sd	t1, -8(a0)
-	b	L(last16)		# Handle anything which may be left
-	move	a2, t0
-
-	.set	reorder
-END (memcpy)
-libc_hidden_builtin_def (memcpy)

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Optimize MIPS memcpy
  2012-09-06 16:25       ` Steve Ellcey
@ 2012-09-06 18:43         ` Roland McGrath
  2012-09-06 19:37           ` Steve Ellcey
  2012-09-07 21:24         ` Maxim Kuvyrkov
  2012-09-11  4:35         ` Maxim Kuvyrkov
  2 siblings, 1 reply; 50+ messages in thread
From: Roland McGrath @ 2012-09-06 18:43 UTC (permalink / raw)
  To: Steve Ellcey
  Cc: Maxim Kuvyrkov, Andrew T Pinski, Joseph S.  Myers, libc-ports

If you are contributing code to the GNU C library, then its copyright
terms must not be changed.  Your patch left FSF as the copyright owner
but changed the terms, which doesn't make sense at all.  We cannot
accept code that has not had its copyright assigned to the FSF.  If
you and your employer have not already done the assignment paperwork,
we need that first.  As the copyright owner, FSF will choose the exact
copyright terms, which will be the same ones used for the rest of the
library code.


Thanks,
Roland

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Optimize MIPS memcpy
  2012-09-06 18:43         ` Roland McGrath
@ 2012-09-06 19:37           ` Steve Ellcey
  0 siblings, 0 replies; 50+ messages in thread
From: Steve Ellcey @ 2012-09-06 19:37 UTC (permalink / raw)
  To: Roland McGrath
  Cc: Maxim Kuvyrkov, Andrew T Pinski, Joseph S.  Myers, libc-ports

On Thu, 2012-09-06 at 11:43 -0700, Roland McGrath wrote:
> If you are contributing code to the GNU C library, then its copyright
> terms must not be changed.  Your patch left FSF as the copyright owner
> but changed the terms, which doesn't make sense at all.  We cannot
> accept code that has not had its copyright assigned to the FSF.  If
> you and your employer have not already done the assignment paperwork,
> we need that first.  As the copyright owner, FSF will choose the exact
> copyright terms, which will be the same ones used for the rest of the
> library code.
> 
> 
> Thanks,
> Roland

Sorry about that.  I guess if I had thought about it I would have
realized that just changing the owner to FSF and not changing the
actual notice was the wrong thing to do.  I do have a copyright
assignment on file already so that shouldn't be a problem.

Steve Ellcey
sellcey@mips.com

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Optimize MIPS memcpy
  2012-09-06 16:25       ` Steve Ellcey
  2012-09-06 18:43         ` Roland McGrath
@ 2012-09-07 21:24         ` Maxim Kuvyrkov
  2012-09-11  4:35         ` Maxim Kuvyrkov
  2 siblings, 0 replies; 50+ messages in thread
From: Maxim Kuvyrkov @ 2012-09-07 21:24 UTC (permalink / raw)
  To: Steve Ellcey; +Cc: Andrew T Pinski, Joseph S.  Myers, libc-ports

On 7/09/2012, at 4:25 AM, Steve Ellcey wrote:

> On Wed, 2012-09-05 at 12:43 +1200, Maxim Kuvyrkov wrote:
> 
>> Your version looks quite good.  If you could wrap it up into a glibc patch I would test it on our
>> setup to confirm that it indeed provides better performance.
>> 
>> Thanks,
>> 
>> --
>> Maxim Kuvyrkov
>> Mentor Graphics
> 
> I have attached a glibc patch for my version of memcpy.
> 
> 
> 2012-09-06  Steve Ellcey  <sellcey@mips.com>
> 
> 	* sysdeps/mips/memcpy.S: Add prefetching and more unrolling, make
> 	it work in 32 or 64 bit modes.
> 	* sysdeps/mips/mips64/memcpy.S: Remove.

Thanks, I will benchmark it shortly.

Meanwhile, would you please test your memcpy implementation together with the patch in http://sourceware.org/ml/libc-alpha/2012-09/msg00197.html to make sure your memcpy can also be used as memmove.

Thanks,

--
Maxim Kuvyrkov
CodeSourcery / Mentor Graphics


^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Optimize MIPS memcpy
  2012-09-06 16:25       ` Steve Ellcey
  2012-09-06 18:43         ` Roland McGrath
  2012-09-07 21:24         ` Maxim Kuvyrkov
@ 2012-09-11  4:35         ` Maxim Kuvyrkov
  2012-09-11 15:18           ` Steve Ellcey
  2 siblings, 1 reply; 50+ messages in thread
From: Maxim Kuvyrkov @ 2012-09-11  4:35 UTC (permalink / raw)
  To: Steve Ellcey; +Cc: Andrew T Pinski, Joseph S.  Myers, libc-ports

On 7/09/2012, at 4:25 AM, Steve Ellcey wrote:

> On Wed, 2012-09-05 at 12:43 +1200, Maxim Kuvyrkov wrote:
> 
>> Your version looks quite good.  If you could wrap it up into a glibc patch I would test it on our
>> setup to confirm that it indeed provides better performance.
>> 
>> Thanks,
>> 
>> --
>> Maxim Kuvyrkov
>> Mentor Graphics
> 
> I have attached a glibc patch for my version of memcpy.
> 
> 
> 2012-09-06  Steve Ellcey  <sellcey@mips.com>
> 
> 	* sysdeps/mips/memcpy.S: Add prefetching and more unrolling, make
> 	it work in 32 or 64 bit modes.
> 	* sysdeps/mips/mips64/memcpy.S: Remove.

This fails to build for me, at least for the N32 ABI.

../ports/sysdeps/mips/memcpy.S: Assembler messages:
../ports/sysdeps/mips/memcpy.S:272: Error: Illegal operands `lw t4,((4)*4)($5)'
../ports/sysdeps/mips/memcpy.S:273: Error: Illegal operands `lw t5,((5)*4)($5)'

I guess the extra parentheses screw up the assembler syntax.

--
Maxim Kuvyrkov
CodeSourcery / Mentor Graphics

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Optimize MIPS memcpy
  2012-09-11  4:35         ` Maxim Kuvyrkov
@ 2012-09-11 15:18           ` Steve Ellcey
  2012-09-20  9:05             ` Maxim Kuvyrkov
  0 siblings, 1 reply; 50+ messages in thread
From: Steve Ellcey @ 2012-09-11 15:18 UTC (permalink / raw)
  To: Maxim Kuvyrkov; +Cc: Andrew T Pinski, Joseph S.  Myers, libc-ports

[-- Attachment #1: Type: text/plain, Size: 985 bytes --]

On Tue, 2012-09-11 at 16:34 +1200, Maxim Kuvyrkov wrote:
 
> > 2012-09-06  Steve Ellcey  <sellcey@mips.com>
> > 
> > 	* sysdeps/mips/memcpy.S: Add prefetching and more unrolling, make
> > 	it work in 32 or 64 bit modes.
> > 	* sysdeps/mips/mips64/memcpy.S: Remove.
> 
> This fails to build for me, at least for the N32 ABI.
> 
> ../ports/sysdeps/mips/memcpy.S: Assembler messages:
> ../ports/sysdeps/mips/memcpy.S:272: Error: Illegal operands `lw t4,((4)*4)($5)'
> ../ports/sysdeps/mips/memcpy.S:273: Error: Illegal operands `lw t5,((5)*4)($5)'
> 
> I guess the extra parentheses screw up the assembler syntax.
> 
> --
> Maxim Kuvyrkov
> CodeSourcery / Mentor Graphics

It is not the parentheses, it is the use of t4, t5, t6, and t7 instead
of ta0, ta1, ta2, and ta3.  I use t[4567] for 32-bit mode, but I guess
I want to use ta[0123] for the N32 ABI as well as for 64-bit mode.
Here is a new version with this change and with a fixed copyright
notice.
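
Concretely, the ABI-dependent temporaries boil down to something like this
(a sketch; the REG4-REG7 names are the attached patch's own, and the
register spellings are the ones regdef.h provides per ABI):

/* Under o32, $12-$15 are spelled t4-t7; under n32/n64 those spellings do
   not exist, and $8-$11 (the extra argument registers a4-a7) are available
   as ta0-ta3 instead, so the temporaries are picked by ABI.  */
#if _MIPS_SIM == _ABIO32
#  define REG4 t4
#  define REG5 t5
#  define REG6 t6
#  define REG7 t7
#else
#  define REG4 ta0
#  define REG5 ta1
#  define REG6 ta2
#  define REG7 ta3
#endif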

Steve Ellcey
sellcey@mips.com


[-- Attachment #2: memcpy.patch --]
[-- Type: text/x-patch, Size: 22793 bytes --]

diff --git a/ports/sysdeps/mips/memcpy.S b/ports/sysdeps/mips/memcpy.S
index 753f67c..2e31946 100644
--- a/ports/sysdeps/mips/memcpy.S
+++ b/ports/sysdeps/mips/memcpy.S
@@ -1,7 +1,8 @@
-/* Copyright (C) 2002-2012 Free Software Foundation, Inc.
+/* Copyright (C) 2012 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
-   Contributed by Hartvig Ekner <hartvige@mips.com>, 2002.
-
+  
+   Contributed by MIPS Technologies, Inc.
+  
    The GNU C Library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Lesser General Public
    License as published by the Free Software Foundation; either
@@ -16,119 +17,548 @@
    License along with the GNU C Library.  If not, see
    <http://www.gnu.org/licenses/>.  */
 
+#ifdef __BIONIC__
+#include "machine/asm.h"
+#include "machine/regdef.h"
+#define ALLOW_OVERLAP
+#define USE_PREFETCH
+#else
+#ifdef _LIBC
 #include <sysdep.h>
+#define USE_PREFETCH
+#endif
+#include <regdef.h>
+#include <sys/asm.h>
+#if _MIPS_ISA == _MIPS_ISA_MIPS32 || _MIPS_ISA == _MIPS_ISA_MIPS64
+#define PREFETCH
+#endif
+#if _MIPS_SIM == _ABI64
+#define USE_DOUBLE
+#endif
+#endif
+
+
+
+/* Some asm.h files do not have the L macro definition.  */
+#ifndef L
+#if _MIPS_SIM == _ABIO32
+# define L(label) $L ## label
+#else
+# define L(label) .L ## label
+#endif
+#endif
+
+/* Some regdef.h files do not have the PTR_ADDIU macro definition.  */
+#ifndef PTR_ADDIU
+#ifdef USE_DOUBLE
+#define PTR_ADDIU	daddiu
+#else
+#define PTR_ADDIU	addiu
+#endif
+#endif
+
+
+/*
+ * Using PREF_LOAD_STREAMED instead of PREF_LOAD on load prefetches offers
+ * a slight performance advantage, while using PREF_PREPAREFORSTORE instead
+ * of PREF_STORE_STREAMED or PREF_STORE offers a large performance advantage.
+ */
 
+#ifdef USE_PREFETCH
+# define PREF_LOAD		0
+# define PREF_STORE		1
+# define PREF_LOAD_STREAMED	4
+# define PREF_STORE_STREAMED	5
+# define PREF_LOAD_RETAINED	6
+# define PREF_STORE_RETAINED	7
+# define PREF_WRITEBACK_INVAL	25
+# define PREF_PREPAREFORSTORE	30
 
-/* void *memcpy(void *s1, const void *s2, size_t n);  */
+/*
+ * We double everything when USE_DOUBLE is true so we do 2 prefetches to
+ * get 64 bytes in that case.  The assumption is that each individual 
+ * prefetch brings in 32 bytes.
+ */
+#ifdef USE_DOUBLE
+# define PREF_CHUNK 64
+# define PREFETCH_FOR_LOAD(chunk, reg) \
+ pref PREF_LOAD_STREAMED, (chunk)*32(reg); \
+ pref PREF_LOAD_STREAMED, ((chunk)+1)*32(reg)
+# define PREFETCH_FOR_STORE(chunk, reg) \
+ pref PREF_PREPAREFORSTORE, (chunk)*32(reg); \
+ pref PREF_PREPAREFORSTORE, ((chunk)+1)*32(reg)
+#else
+# define PREF_CHUNK 32
+# define PREFETCH_FOR_LOAD(chunk, reg) \
+ pref PREF_LOAD_STREAMED, (chunk)*32(reg)
+# define PREFETCH_FOR_STORE(chunk, reg) \
+ pref PREF_PREPAREFORSTORE, (chunk)*32(reg)
+#endif
+#define PREF_LIMIT (5 * PREF_CHUNK)
+#else
+# define PREFETCH_FOR_LOAD(offset, reg)
+# define PREFETCH_FOR_STORE(offset, reg)
+#endif
+
+/* Allow the routine to be named something else if desired.  */
+#ifndef MEMCPY_NAME
+#define MEMCPY_NAME memcpy
+#endif
+
+/* We use these 32/64 bit registers as temporaries to do the copying.  */
+#define REG0 t0
+#define REG1 t1
+#define REG2 t2
+#define REG3 t3
+#if _MIPS_SIM == _ABIO32
+#  define REG4 t4
+#  define REG5 t5
+#  define REG6 t6
+#  define REG7 t7
+#else
+#  define REG4 ta0
+#  define REG5 ta1
+#  define REG6 ta2
+#  define REG7 ta3
+#endif
 
+/* We load/store 64 bits at a time when USE_DOUBLE is true.  */
+#ifdef USE_DOUBLE
+#  define ST	sd
+#  define LD	ld
 #if __MIPSEB
-#  define LWHI	lwl		/* high part is left in big-endian	*/
-#  define SWHI	swl		/* high part is left in big-endian	*/
-#  define LWLO	lwr		/* low part is right in big-endian	*/
-#  define SWLO	swr		/* low part is right in big-endian	*/
+#  define LDHI	ldl		/* high part is left in big-endian	*/
+#  define STHI	sdl		/* high part is left in big-endian	*/
+#  define LDLO	ldr		/* low part is right in big-endian	*/
+#  define STLO	sdr		/* low part is right in big-endian	*/
 #else
-#  define LWHI	lwr		/* high part is right in little-endian	*/
-#  define SWHI	swr		/* high part is right in little-endian	*/
-#  define LWLO	lwl		/* low part is left in little-endian	*/
-#  define SWLO	swl		/* low part is left in little-endian	*/
+#  define LDHI	ldr		/* high part is right in little-endian	*/
+#  define STHI	sdr		/* high part is right in little-endian	*/
+#  define LDLO	ldl		/* low part is left in little-endian	*/
+#  define STLO	sdl		/* low part is left in little-endian	*/
+#endif
+#else
+#  define ST	sw
+#  define LD	lw
+#if __MIPSEB
+#  define LDHI	lwl		/* high part is left in big-endian	*/
+#  define STHI	swl		/* high part is left in big-endian	*/
+#  define LDLO	lwr		/* low part is right in big-endian	*/
+#  define STLO	swr		/* low part is right in big-endian	*/
+#else
+#  define LDHI	lwr		/* high part is right in little-endian	*/
+#  define STHI	swr		/* high part is right in little-endian	*/
+#  define LDLO	lwl		/* low part is left in little-endian	*/
+#  define STLO	swl		/* low part is left in little-endian	*/
+#endif
 #endif
 
-ENTRY (memcpy)
+/* Bookkeeping values for 32 vs. 64 bit mode.  */
+#ifdef USE_DOUBLE
+#  define NSIZE 8
+#  define NSIZEMASK 0x3f
+#  define NSIZEDMASK 0x7f
+#else
+#  define NSIZE 4
+#  define NSIZEMASK 0x1f
+#  define NSIZEDMASK 0x3f
+#endif
+#define UNIT(unit) ((unit)*NSIZE)
+#define UNITM1(unit) (((unit)*NSIZE)-1)
+
+#ifdef __BIONIC__
+LEAF(MEMCPY_NAME, 0)
+#else
+LEAF(MEMCPY_NAME)
+#endif
+	.set	nomips16
 	.set	noreorder
+/*
+ * Below we handle the case where memcpy is called with overlapping src and dst.
+ * Although memcpy is not required to handle this case, some parts of Android
+ * like Skia rely on such usage. We call memmove to handle such cases.
+ */
+#ifdef ALLOW_OVERLAP
+	PTR_SUBU t0,a0,a1
+	PTR_SRA	t2,t0,31
+	xor	t1,t0,t2
+	PTR_SUBU t0,t1,t2
+	sltu	t2,t0,a2
+	beq	t2,zero,L(memcpy)
+	la	t9,memmove
+	jr	t9
+	 nop
+L(memcpy):
+#endif
+/*
+ * If the size is less than 2*NSIZE (8 or 16), go to L(lastb).  Regardless of
+ * size, copy dst pointer to v0 for the return value.
+ */
+	slti	t2,a2,(2 * NSIZE)
+	bne	t2,zero,L(lastb)
+	move	v0,a0
+/*
+ * If src and dst have different alignments, go to L(unaligned); if they
+ * have the same alignment (but are not actually aligned) do a partial
+ * load/store to make them aligned.  If they are both already aligned
+ * we can start copying at L(aligned).
+ */
+	xor	t8,a1,a0
+	andi	t8,t8,(NSIZE-1)		/* t8 is a0/a1 word-displacement */
+	bne	t8,zero,L(unaligned)
+	PTR_SUBU a3, zero, a0
+
+	andi	a3,a3,(NSIZE-1)		/* copy a3 bytes to align a0/a1	  */
+	beq	a3,zero,L(aligned)	/* if a3=0, it is already aligned */
+	PTR_SUBU a2,a2,a3		/* a2 is the remaining bytes count */
+
+	LDHI	t8,0(a1)
+	PTR_ADDU a1,a1,a3
+	STHI	t8,0(a0)
+	PTR_ADDU a0,a0,a3
+
+/*
+ * Now dst/src are both aligned to (word or double word) boundaries.
+ * Set a2 to count how many bytes we have to copy after all the 64/128 byte
+ * chunks are copied and a3 to the dst pointer after all the 64/128 byte 
+ * chunks have been copied.  We will loop, incrementing a0 and a1 until a0
+ * equals a3.
+ */
+
+L(aligned):
+	andi	t8,a2,NSIZEDMASK /* any whole 64-byte/128-byte chunks? */
+	beq	a2,t8,L(chkw)	 /* if a2==t8, no 64-byte/128-byte chunks */
+	PTR_SUBU a3,a2,t8	 /* subtract from a2 the remainder */
+	PTR_ADDU a3,a0,a3	 /* Now a3 is the final dst after loop */
+
+/* When the loop issues "pref 30,x(a0)", a0+x must not be past the
+ * "t0-32" address.  This means: for x=128 the last "safe" a0 address
+ * is "t0-160"; for x=64 the last "safe" a0 address is "t0-96".
+ * The current version uses "pref 30,128(a0)", so "t0-160" is the
+ * limit.
+ */
+#ifdef USE_PREFETCH
+	PTR_ADDU t0,a0,a2		/* t0 is the "past the end" address */
+	PTR_SUBU t9,t0,PREF_LIMIT	/* t9 is the "last safe pref" address */
+	PREFETCH_FOR_LOAD  (0, a1)
+	PREFETCH_FOR_LOAD  (1, a1)
+	PREFETCH_FOR_LOAD  (2, a1)
+	PREFETCH_FOR_STORE (1, a0)
+	sltu	v1,t9,a0		/* If a0 > t9 don't use next prefetch */
+	bgtz	v1,L(loop16w)
+	nop
+#endif
+	PREFETCH_FOR_STORE (2, a0)
+L(loop16w):
+	PREFETCH_FOR_LOAD  (3, a1)
+	LD	t0,UNIT(0)(a1)
+#ifdef USE_PREFETCH
+	bgtz	v1,L(skip_pref30_96)
+#endif
+	LD	t1,UNIT(1)(a1)
+	PREFETCH_FOR_STORE (3, a0)
+L(skip_pref30_96):
+	LD	REG2,UNIT(2)(a1)
+	LD	REG3,UNIT(3)(a1)
+	LD	REG4,UNIT(4)(a1)
+	LD	REG5,UNIT(5)(a1)
+	LD	REG6,UNIT(6)(a1)
+	LD	REG7,UNIT(7)(a1)
+        PREFETCH_FOR_LOAD (4, a1)
+
+	ST	t0,UNIT(0)(a0)
+	ST	t1,UNIT(1)(a0)
+	ST	REG2,UNIT(2)(a0)
+	ST	REG3,UNIT(3)(a0)
+	ST	REG4,UNIT(4)(a0)
+	ST	REG5,UNIT(5)(a0)
+	ST	REG6,UNIT(6)(a0)
+	ST	REG7,UNIT(7)(a0)
+
+	LD	t0,UNIT(8)(a1)
+#ifdef USE_PREFETCH
+	bgtz	v1,L(skip_pref30_128)
+#endif
+	LD	t1,UNIT(9)(a1)
+	PREFETCH_FOR_STORE (4, a0)
+L(skip_pref30_128):
+	LD	REG2,UNIT(10)(a1)
+	LD	REG3,UNIT(11)(a1)
+	LD	REG4,UNIT(12)(a1)
+	LD	REG5,UNIT(13)(a1)
+	LD	REG6,UNIT(14)(a1)
+	LD	REG7,UNIT(15)(a1)
+        PREFETCH_FOR_LOAD (5, a1)
+	ST	t0,UNIT(8)(a0)
+	ST	t1,UNIT(9)(a0)
+	ST	REG2,UNIT(10)(a0)
+	ST	REG3,UNIT(11)(a0)
+	ST	REG4,UNIT(12)(a0)
+	ST	REG5,UNIT(13)(a0)
+	ST	REG6,UNIT(14)(a0)
+	ST	REG7,UNIT(15)(a0)
+	PTR_ADDIU a0,a0,UNIT(16)	/* adding 64/128 to dest */
+#ifdef USE_PREFETCH
+	sltu	v1,t9,a0
+#endif
+	bne	a0,a3,L(loop16w)
+	PTR_ADDIU a1,a1,UNIT(16)	/* adding 64/128 to src */
+	move	a2,t8
+
+/* Here we have src and dest word-aligned but less than 64 or
+ * 128 bytes to go.  Check for a 32 (or 64) byte chunk and copy it if there
+ * is one.  Otherwise jump down to L(chk1w) to handle the tail end of
+ * the copy.
+ */
+
+L(chkw):
+	PREFETCH_FOR_LOAD (0, a1)
+	andi	t8,a2,NSIZEMASK	/* Is there a 32-byte/64-byte chunk.  */
+				/* t8 is the remainder count past 32 bytes */
+	beq	a2,t8,L(chk1w)	/* When a2=t8, no 32-byte chunk  */
+	nop
+	LD	t0,UNIT(0)(a1)
+	LD	t1,UNIT(1)(a1)
+	LD	REG2,UNIT(2)(a1)
+	LD	REG3,UNIT(3)(a1)
+	LD	REG4,UNIT(4)(a1)
+	LD	REG5,UNIT(5)(a1)
+	LD	REG6,UNIT(6)(a1)
+	LD	REG7,UNIT(7)(a1)
+	PTR_ADDIU a1,a1,UNIT(8)
+	ST	t0,UNIT(0)(a0)
+	ST	t1,UNIT(1)(a0)
+	ST	REG2,UNIT(2)(a0)
+	ST	REG3,UNIT(3)(a0)
+	ST	REG4,UNIT(4)(a0)
+	ST	REG5,UNIT(5)(a0)
+	ST	REG6,UNIT(6)(a0)
+	ST	REG7,UNIT(7)(a0)
+	PTR_ADDIU a0,a0,UNIT(8)
+
+/*
+ * Here we have less than 32 (or 64) bytes to copy.  Set up for a loop to
+ * copy one word (or double word) at a time.  Set a2 to count how many
+ * bytes we have to copy after all the word (or double word) chunks are
+ * copied and a3 to the dst pointer after all the (d)word chunks have
+ * been copied.  We will loop, incrementing a0 and a1 until a0 equals a3.
+ */
+L(chk1w):
+	andi	a2,t8,(NSIZE-1)	/* a2 is the remainder past the (d)word chunks */
+	beq	a2,t8,L(lastb)
+	PTR_SUBU a3,t8,a2	/* a3 is count of bytes in one (d)word chunks */
+	PTR_ADDU a3,a0,a3	/* a3 is the dst address after loop */
 
-	slti	t0, a2, 8		# Less than 8?
-	bne	t0, zero, L(last8)
-	move	v0, a0			# Setup exit value before too late
-
-	xor	t0, a1, a0		# Find a0/a1 displacement
-	andi	t0, 0x3
-	bne	t0, zero, L(shift)	# Go handle the unaligned case
-	subu	t1, zero, a1
-	andi	t1, 0x3			# a0/a1 are aligned, but are we
-	beq	t1, zero, L(chk8w)	#  starting in the middle of a word?
-	subu	a2, t1
-	LWHI	t0, 0(a1)		# Yes we are... take care of that
-	addu	a1, t1
-	SWHI	t0, 0(a0)
-	addu	a0, t1
-
-L(chk8w):	
-	andi	t0, a2, 0x1f		# 32 or more bytes left?
-	beq	t0, a2, L(chk1w)
-	subu	a3, a2, t0		# Yes
-	addu	a3, a1			# a3 = end address of loop
-	move	a2, t0			# a2 = what will be left after loop
-L(lop8w):	
-	lw	t0,  0(a1)		# Loop taking 8 words at a time
-	lw	t1,  4(a1)
-	lw	t2,  8(a1)
-	lw	t3, 12(a1)
-	lw	t4, 16(a1)
-	lw	t5, 20(a1)
-	lw	t6, 24(a1)
-	lw	t7, 28(a1)
-	addiu	a0, 32
-	addiu	a1, 32
-	sw	t0, -32(a0)
-	sw	t1, -28(a0)
-	sw	t2, -24(a0)
-	sw	t3, -20(a0)
-	sw	t4, -16(a0)
-	sw	t5, -12(a0)
-	sw	t6,  -8(a0)
-	bne	a1, a3, L(lop8w)
-	sw	t7,  -4(a0)
-
-L(chk1w):	
-	andi	t0, a2, 0x3		# 4 or more bytes left?
-	beq	t0, a2, L(last8)
-	subu	a3, a2, t0		# Yes, handle them one word at a time
-	addu	a3, a1			# a3 again end address
-	move	a2, t0
-L(lop1w):	
-	lw	t0, 0(a1)
-	addiu	a0, 4
-	addiu	a1, 4
-	bne	a1, a3, L(lop1w)
-	sw	t0, -4(a0)
-
-L(last8):	
-	blez	a2, L(lst8e)		# Handle last 8 bytes, one at a time
-	addu	a3, a2, a1
-L(lst8l):	
-	lb	t0, 0(a1)
-	addiu	a0, 1
-	addiu	a1, 1
-	bne	a1, a3, L(lst8l)
-	sb	t0, -1(a0)
-L(lst8e):	
-	jr	ra			# Bye, bye
+/* copying in words (4-byte or 8-byte chunks) */
+L(wordCopy_loop):
+	LD	REG3,UNIT(0)(a1)
+	PTR_ADDIU a1,a1,UNIT(1)
+	PTR_ADDIU a0,a0,UNIT(1)
+	bne	a0,a3,L(wordCopy_loop)
+	ST	REG3,UNIT(-1)(a0)
+
+/* Copy the last 8 (or 16) bytes */
+L(lastb):
+	blez	a2,L(leave)
+	PTR_ADDU a3,a0,a2	/* a3 is the last dst address */
+L(lastbloop):
+	lb	v1,0(a1)
+	PTR_ADDIU a1,a1,1
+	PTR_ADDIU a0,a0,1
+	bne	a0,a3,L(lastbloop)
+	sb	v1,-1(a0)
+L(leave):
+	j	ra
+	nop
+/*
+ * UNALIGNED case, got here with a3 = "negu a0"
+ * This code is nearly identical to the aligned code above
+ * but only the destination (not the source) gets aligned
+ * so we need to do partial loads of the source followed
+ * by normal stores to the destination (once we have aligned
+ * the destination).
+ */
+
+L(unaligned):
+	andi	a3,a3,(NSIZE-1)	/* copy a3 bytes to align a0/a1 */
+	beqz	a3,L(ua_chk16w) /* if a3=0, it is already aligned */
+	PTR_SUBU a2,a2,a3	/* a2 is the remaining bytes count */
+
+	LDHI	v1,UNIT(0)(a1)
+	LDLO	v1,UNITM1(1)(a1)
+	PTR_ADDU a1,a1,a3
+	STHI	v1,UNIT(0)(a0)
+	PTR_ADDU a0,a0,a3
+
+/*
+ *  Now the destination (but not the source) is aligned
+ * Set a2 to count how many bytes we have to copy after all the 64/128 byte
+ * chunks are copied and a3 to the dst pointer after all the 64/128 byte
+ * chunks have been copied.  We will loop, incrementing a0 and a1 until a0
+ * equals a3.
+ */
+
+L(ua_chk16w):
+	andi	t8,a2,NSIZEDMASK /* any whole 64-byte/128-byte chunks? */
+	beq	a2,t8,L(ua_chkw) /* if a2==t8, no 64-byte/128-byte chunks */
+	PTR_SUBU a3,a2,t8	 /* subtract from a2 the remainder */
+	PTR_ADDU a3,a0,a3	 /* Now a3 is the final dst after loop */
+
+#ifdef USE_PREFETCH
+	PTR_ADDU t0,a0,a2	  /* t0 is the "past the end" address */
+	PTR_SUBU t9,t0,PREF_LIMIT /* t9 is the "last safe pref" address */
+	PREFETCH_FOR_LOAD  (0, a1)
+	PREFETCH_FOR_LOAD  (1, a1)
+	PREFETCH_FOR_LOAD  (2, a1)
+	PREFETCH_FOR_STORE (1, a0)
+	sltu	v1,t9,a0
+	bgtz	v1,L(ua_loop16w)  /* skip prefetch for too short arrays */
+	nop
+#endif
+	PREFETCH_FOR_STORE (2, a0)
+L(ua_loop16w):
+	PREFETCH_FOR_LOAD  (3, a1)
+	LDHI	t0,UNIT(0)(a1)
+	LDLO	t0,UNITM1(1)(a1)
+	LDHI	t1,UNIT(1)(a1)
+#ifdef USE_PREFETCH
+	bgtz	v1,L(ua_skip_pref30_96)
+#endif
+	LDLO	t1,UNITM1(2)(a1)
+	PREFETCH_FOR_STORE (3, a0)
+L(ua_skip_pref30_96):
+	LDHI	REG2,UNIT(2)(a1)
+	LDLO	REG2,UNITM1(3)(a1)
+	LDHI	REG3,UNIT(3)(a1)
+	LDLO	REG3,UNITM1(4)(a1)
+	LDHI	REG4,UNIT(4)(a1)
+	LDLO	REG4,UNITM1(5)(a1)
+	LDHI	REG5,UNIT(5)(a1)
+	LDLO	REG5,UNITM1(6)(a1)
+	LDHI	REG6,UNIT(6)(a1)
+	LDLO	REG6,UNITM1(7)(a1)
+	LDHI	REG7,UNIT(7)(a1)
+	LDLO	REG7,UNITM1(8)(a1)
+        PREFETCH_FOR_LOAD (4, a1)
+	ST	t0,UNIT(0)(a0)
+	ST	t1,UNIT(1)(a0)
+	ST	REG2,UNIT(2)(a0)
+	ST	REG3,UNIT(3)(a0)
+	ST	REG4,UNIT(4)(a0)
+	ST	REG5,UNIT(5)(a0)
+	ST	REG6,UNIT(6)(a0)
+	ST	REG7,UNIT(7)(a0)
+	LDHI	t0,UNIT(8)(a1)
+	LDLO	t0,UNITM1(9)(a1)
+	LDHI	t1,UNIT(9)(a1)
+#ifdef USE_PREFETCH
+	bgtz	v1,L(ua_skip_pref30_128)
+#endif
+	LDLO	t1,UNITM1(10)(a1)
+	PREFETCH_FOR_STORE (4, a0)
+L(ua_skip_pref30_128):
+	LDHI	REG2,UNIT(10)(a1)
+	LDLO	REG2,UNITM1(11)(a1)
+	LDHI	REG3,UNIT(11)(a1)
+	LDLO	REG3,UNITM1(12)(a1)
+	LDHI	REG4,UNIT(12)(a1)
+	LDLO	REG4,UNITM1(13)(a1)
+	LDHI	REG5,UNIT(13)(a1)
+	LDLO	REG5,UNITM1(14)(a1)
+	LDHI	REG6,UNIT(14)(a1)
+	LDLO	REG6,UNITM1(15)(a1)
+	LDHI	REG7,UNIT(15)(a1)
+	LDLO	REG7,UNITM1(16)(a1)
+        PREFETCH_FOR_LOAD (5, a1)
+	ST	t0,UNIT(8)(a0)
+	ST	t1,UNIT(9)(a0)
+	ST	REG2,UNIT(10)(a0)
+	ST	REG3,UNIT(11)(a0)
+	ST	REG4,UNIT(12)(a0)
+	ST	REG5,UNIT(13)(a0)
+	ST	REG6,UNIT(14)(a0)
+	ST	REG7,UNIT(15)(a0)
+	PTR_ADDIU a0,a0,UNIT(16)	/* adding 64/128 to dest */
+#ifdef USE_PREFETCH
+	sltu	v1,t9,a0
+#endif
+	bne	a0,a3,L(ua_loop16w)
+	PTR_ADDIU a1,a1,UNIT(16)	/* adding 64/128 to src */
+	move	a2,t8
+
+/* Here we have the dst word-aligned (the src may not be) and less than 64
+ * or 128 bytes to go.  Check for a 32 (or 64) byte chunk and copy it if
+ * there is one.  Otherwise jump down to L(ua_chk1w) to handle the tail end
+ * of the copy.  */
+
+L(ua_chkw):
+	PREFETCH_FOR_LOAD (0, a1)
+	andi	t8,a2,NSIZEMASK	  /* Is there a 32-byte/64-byte chunk.  */
+				  /* t8 is the remainder count past 32 bytes */
+	beq	a2,t8,L(ua_chk1w) /* When a2=t8, no 32-byte chunk */
 	nop
+	LDHI	t0,UNIT(0)(a1)
+	LDLO	t0,UNITM1(1)(a1)
+	LDHI	t1,UNIT(1)(a1)
+	LDLO	t1,UNITM1(2)(a1)
+	LDHI	REG2,UNIT(2)(a1)
+	LDLO	REG2,UNITM1(3)(a1)
+	LDHI	REG3,UNIT(3)(a1)
+	LDLO	REG3,UNITM1(4)(a1)
+	LDHI	REG4,UNIT(4)(a1)
+	LDLO	REG4,UNITM1(5)(a1)
+	LDHI	REG5,UNIT(5)(a1)
+	LDLO	REG5,UNITM1(6)(a1)
+	LDHI	REG6,UNIT(6)(a1)
+	LDLO	REG6,UNITM1(7)(a1)
+	LDHI	REG7,UNIT(7)(a1)
+	LDLO	REG7,UNITM1(8)(a1)
+	PTR_ADDIU a1,a1,UNIT(8)
+	ST	t0,UNIT(0)(a0)
+	ST	t1,UNIT(1)(a0)
+	ST	REG2,UNIT(2)(a0)
+	ST	REG3,UNIT(3)(a0)
+	ST	REG4,UNIT(4)(a0)
+	ST	REG5,UNIT(5)(a0)
+	ST	REG6,UNIT(6)(a0)
+	ST	REG7,UNIT(7)(a0)
+	PTR_ADDIU a0,a0,UNIT(8)
+/*
+ * Here we have less than 32 (or 64) bytes to copy.  Set up for a loop to
+ * copy one word (or double word) at a time.
+ */
+L(ua_chk1w):
+	andi	a2,t8,(NSIZE-1)	/* a2 is the remainder past the (d)word chunks */
+	beq	a2,t8,L(ua_smallCopy)
+	PTR_SUBU a3,t8,a2	/* a3 is count of bytes in one (d)word chunks */
+	PTR_ADDU a3,a0,a3	/* a3 is the dst address after loop */
 
-L(shift):	
-	subu	a3, zero, a0		# Src and Dest unaligned 
-	andi	a3, 0x3			#  (unoptimized case...)
-	beq	a3, zero, L(shft1)
-	subu	a2, a3			# a2 = bytes left
-	LWHI	t0, 0(a1)		# Take care of first odd part
-	LWLO	t0, 3(a1)
-	addu	a1, a3
-	SWHI	t0, 0(a0)
-	addu	a0, a3
-L(shft1):	
-	andi	t0, a2, 0x3
-	subu	a3, a2, t0
-	addu	a3, a1
-L(shfth):	
-	LWHI	t1, 0(a1)		# Limp through, word by word
-	LWLO	t1, 3(a1)
-	addiu	a0, 4
-	addiu	a1, 4
-	bne	a1, a3, L(shfth)
-	sw	t1, -4(a0)
-	b	L(last8)		# Handle anything which may be left
-	move	a2, t0
+/* copying in words (4-byte or 8-byte chunks) */
+L(ua_wordCopy_loop):
+	LDHI	v1,UNIT(0)(a1)
+	LDLO	v1,UNITM1(1)(a1)
+	PTR_ADDIU a1,a1,UNIT(1)
+	PTR_ADDIU a0,a0,UNIT(1)
+	bne	a0,a3,L(ua_wordCopy_loop)
+	ST	v1,UNIT(-1)(a0)
 
+/* Copy the last 8 (or 16) bytes */
+L(ua_smallCopy):
+	beqz	a2,L(leave)
+	PTR_ADDU a3,a0,a2	/* a3 is the last dst address */
+L(ua_smallCopy_loop):
+	lb	v1,0(a1)
+	PTR_ADDIU a1,a1,1
+	PTR_ADDIU a0,a0,1
+	bne	a0,a3,L(ua_smallCopy_loop)
+	sb	v1,-1(a0)
+
+	j	ra
+	nop
+
+	.set	at
 	.set	reorder
-END (memcpy)
-libc_hidden_builtin_def (memcpy)
+END(MEMCPY_NAME)
+#ifdef _LIBC
+libc_hidden_builtin_def (MEMCPY_NAME)
+#endif
diff --git a/ports/sysdeps/mips/mips64/memcpy.S b/ports/sysdeps/mips/mips64/memcpy.S
deleted file mode 100644
index 49ef34d..0000000
--- a/ports/sysdeps/mips/mips64/memcpy.S
+++ /dev/null
@@ -1,138 +0,0 @@
-/* Copyright (C) 2002-2012 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-   Contributed by Hartvig Ekner <hartvige@mips.com>, 2002.
-   Ported to mips3 n32/n64 by Alexandre Oliva <aoliva@redhat.com>
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library.  If not, see
-   <http://www.gnu.org/licenses/>.  */
-
-#include <sysdep.h>
-#include <sys/asm.h>
-
-
-/* void *memcpy(void *s1, const void *s2, size_t n);
-	
-   This could probably be optimized further.  */
-
-#if __MIPSEB
-#  define LDHI	ldl		/* high part is left in big-endian	*/
-#  define SDHI	sdl		/* high part is left in big-endian	*/
-#  define LDLO	ldr		/* low part is right in big-endian	*/
-#  define SDLO	sdr		/* low part is right in big-endian	*/
-#else
-#  define LDHI	ldr		/* high part is right in little-endian	*/
-#  define SDHI	sdr		/* high part is right in little-endian	*/
-#  define LDLO	ldl		/* low part is left in little-endian	*/
-#  define SDLO	sdl		/* low part is left in little-endian	*/
-#endif
-
-ENTRY (memcpy)
-	.set	noreorder
-
-	slti	t0, a2, 16		# Less than 16?
-	bne	t0, zero, L(last16)
-	move	v0, a0			# Setup exit value before too late
-
-	xor	t0, a1, a0		# Find a0/a1 displacement
-	andi	t0, 0x7
-	bne	t0, zero, L(shift)	# Go handle the unaligned case
-	PTR_SUBU t1, zero, a1
-	andi	t1, 0x7			# a0/a1 are aligned, but are we
-	beq	t1, zero, L(chk8w)	#  starting in the middle of a word?
-	PTR_SUBU a2, t1
-	LDHI	t0, 0(a1)		# Yes we are... take care of that
-	PTR_ADDU a1, t1
-	SDHI	t0, 0(a0)
-	PTR_ADDU a0, t1
-
-L(chk8w):
-	andi	t0, a2, 0x3f		# 64 or more bytes left?
-	beq	t0, a2, L(chk1w)
-	PTR_SUBU a3, a2, t0		# Yes
-	PTR_ADDU a3, a1			# a3 = end address of loop
-	move	a2, t0			# a2 = what will be left after loop
-L(lop8w):	
-	ld	t0,  0(a1)		# Loop taking 8 words at a time
-	ld	t1,  8(a1)
-	ld	t2, 16(a1)
-	ld	t3, 24(a1)
-	ld	ta0, 32(a1)
-	ld	ta1, 40(a1)
-	ld	ta2, 48(a1)
-	ld	ta3, 56(a1)
-	PTR_ADDIU a0, 64
-	PTR_ADDIU a1, 64
-	sd	t0, -64(a0)
-	sd	t1, -56(a0)
-	sd	t2, -48(a0)
-	sd	t3, -40(a0)
-	sd	ta0, -32(a0)
-	sd	ta1, -24(a0)
-	sd	ta2, -16(a0)
-	bne	a1, a3, L(lop8w)
-	sd	ta3,  -8(a0)
-
-L(chk1w):
-	andi	t0, a2, 0x7		# 8 or more bytes left?
-	beq	t0, a2, L(last16)
-	PTR_SUBU a3, a2, t0		# Yes, handle them one dword at a time
-	PTR_ADDU a3, a1			# a3 again end address
-	move	a2, t0
-L(lop1w):
-	ld	t0, 0(a1)
-	PTR_ADDIU a0, 8
-	PTR_ADDIU a1, 8
-	bne	a1, a3, L(lop1w)
-	sd	t0, -8(a0)
-
-L(last16):
-	blez	a2, L(lst16e)		# Handle last 16 bytes, one at a time
-	PTR_ADDU a3, a2, a1
-L(lst16l):
-	lb	t0, 0(a1)
-	PTR_ADDIU a0, 1
-	PTR_ADDIU a1, 1
-	bne	a1, a3, L(lst16l)
-	sb	t0, -1(a0)
-L(lst16e):
-	jr	ra			# Bye, bye
-	nop
-
-L(shift):
-	PTR_SUBU a3, zero, a0		# Src and Dest unaligned 
-	andi	a3, 0x7			#  (unoptimized case...)
-	beq	a3, zero, L(shft1)
-	PTR_SUBU a2, a3			# a2 = bytes left
-	LDHI	t0, 0(a1)		# Take care of first odd part
-	LDLO	t0, 7(a1)
-	PTR_ADDU a1, a3
-	SDHI	t0, 0(a0)
-	PTR_ADDU a0, a3
-L(shft1):
-	andi	t0, a2, 0x7
-	PTR_SUBU a3, a2, t0
-	PTR_ADDU a3, a1
-L(shfth):
-	LDHI	t1, 0(a1)		# Limp through, dword by dword
-	LDLO	t1, 7(a1)
-	PTR_ADDIU a0, 8
-	PTR_ADDIU a1, 8
-	bne	a1, a3, L(shfth)
-	sd	t1, -8(a0)
-	b	L(last16)		# Handle anything which may be left
-	move	a2, t0
-
-	.set	reorder
-END (memcpy)
-libc_hidden_builtin_def (memcpy)

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Optimize MIPS memcpy
  2012-09-11 15:18           ` Steve Ellcey
@ 2012-09-20  9:05             ` Maxim Kuvyrkov
  2012-09-20 18:38               ` Steve Ellcey
  2012-09-21 18:47               ` Steve Ellcey
  0 siblings, 2 replies; 50+ messages in thread
From: Maxim Kuvyrkov @ 2012-09-20  9:05 UTC (permalink / raw)
  To: Steve Ellcey; +Cc: Andrew T Pinski, Joseph S.  Myers, libc-ports

On 12/09/2012, at 3:17 AM, Steve Ellcey wrote:

> On Tue, 2012-09-11 at 16:34 +1200, Maxim Kuvyrkov wrote:
> 
>>> 2012-09-06  Steve Ellcey  <sellcey@mips.com>
>>> 
>>> 	* sysdeps/mips/memcpy.S: Add prefetching and more unrolling, make
>>> 	it work in 32 or 64 bit modes.
>>> 	* sysdeps/mips/mips64/memcpy.S: Remove.
>> 
>> This fails to build for me, at least for the N32 ABI.
>> 
>> ../ports/sysdeps/mips/memcpy.S: Assembler messages:
>> ../ports/sysdeps/mips/memcpy.S:272: Error: Illegal operands `lw t4,((4)*4)($5)'
>> ../ports/sysdeps/mips/memcpy.S:273: Error: Illegal operands `lw t5,((5)*4)($5)'
>> 
>> I guess the extra parentheses screw up the assembler syntax.
>> 
>> --
>> Maxim Kuvyrkov
>> CodeSourcery / Mentor Graphics
> 
> It is not the parentheses, it is the use of t4, t5, t6, and t7 instead
> of ta0, ta1, ta2, and ta3.  I use t[4567] for 32-bit mode, but I guess
> I want to use ta[0123] for the N32 ABI as well as for 64-bit mode.
> Here is a new version with this change and with a fixed copyright
> notice.

What testing was done for this patch, does it pass glibc testsuite?

I have a benchmark that exercises various string and mem* routines failing with it.

Thank you,

--
Maxim Kuvyrkov
CodeSourcery / Mentor Graphics

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Optimize MIPS memcpy
  2012-09-20  9:05             ` Maxim Kuvyrkov
@ 2012-09-20 18:38               ` Steve Ellcey
  2012-09-28  3:48                 ` Maxim Kuvyrkov
  2012-09-21 18:47               ` Steve Ellcey
  1 sibling, 1 reply; 50+ messages in thread
From: Steve Ellcey @ 2012-09-20 18:38 UTC (permalink / raw)
  To: Maxim Kuvyrkov; +Cc: Andrew T Pinski, Joseph S.  Myers, libc-ports

On Thu, 2012-09-20 at 21:05 +1200, Maxim Kuvyrkov wrote:

> What testing was done for this patch, does it pass glibc testsuite?
> 
> I have a benchmark that exercises various string and mem* routines failing with it.
> 
> Thank you,
> 
> --
> Maxim Kuvyrkov
> CodeSourcery / Mentor Graphics

Is the benchmark anything you can share?  I ran the glibc testsuite and
got some failures but I don't think they are due to the new memcpy.  I
am going back now and running the glibc testsuite with no changes to get
a baseline so I can verify that.  Hopefully I will have an answer later
today.

I ran some other tests like the gcc testsuite using a glibc with this
change in it and that didn't have any problems and there is the one I
sent to the list
http://sourceware.org/ml/libc-ports/2012-09/msg00007.html that also ran
with no problems.

Steve Ellcey
sellcey@mips.com



^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Optimize MIPS memcpy
  2012-09-20  9:05             ` Maxim Kuvyrkov
  2012-09-20 18:38               ` Steve Ellcey
@ 2012-09-21 18:47               ` Steve Ellcey
  2012-09-21 18:57                 ` Joseph S. Myers
  2012-09-21 19:12                 ` [PATCH] Optimize MIPS memcpy Maxim Kuvyrkov
  1 sibling, 2 replies; 50+ messages in thread
From: Steve Ellcey @ 2012-09-21 18:47 UTC (permalink / raw)
  To: Maxim Kuvyrkov; +Cc: Andrew T Pinski, Joseph S.  Myers, libc-ports

On Thu, 2012-09-20 at 21:05 +1200, Maxim Kuvyrkov wrote:

> What testing was done for this patch, does it pass glibc testsuite?

Maxim,  I get the same failures with this memcpy as without it when
running the glibc testsuite:

glibc/localedata/tst-fmon.out
glibc/math/test-float.out
glibc/math/test-double.out
glibc/math/test-ifloat.out
glibc/math/test-idouble.out
glibc/stdlib/tst-strtod-overflow.out
glibc/stdio-common/bug22.out
glibc/malloc/tst-trim1.out
glibc/nptl/tst-cancel7.out
glibc/nptl/tst-cancelx7.out

I did find a build problem: the version I sent you sets USE_PREFETCH for
any libc build.  If building for something like MIPS1 that is not right,
so I need to change how that macro is set.

Steve Ellcey
sellcey@mips.com

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Optimize MIPS memcpy
  2012-09-21 18:47               ` Steve Ellcey
@ 2012-09-21 18:57                 ` Joseph S. Myers
  2012-09-21 20:41                   ` [PATCH] Optimize MIPS memcpy (mips glibc test results) Steve Ellcey
  2012-09-21 19:12                 ` [PATCH] Optimize MIPS memcpy Maxim Kuvyrkov
  1 sibling, 1 reply; 50+ messages in thread
From: Joseph S. Myers @ 2012-09-21 18:57 UTC (permalink / raw)
  To: Steve Ellcey; +Cc: Maxim Kuvyrkov, Andrew T Pinski, libc-ports

On Fri, 21 Sep 2012, Steve Ellcey wrote:

> glibc/localedata/tst-fmon.out

You should investigate why you see this failure, it's not one of the known 
issues listed at <http://sourceware.org/glibc/wiki/Release/2.16>

> glibc/math/test-float.out
> glibc/math/test-double.out
> glibc/math/test-ifloat.out
> glibc/math/test-idouble.out

Expected for soft-float configurations.

> glibc/stdlib/tst-strtod-overflow.out

Not expected, should investigate.  (The test should exit cleanly on memory 
allocation failure.)

> glibc/stdio-common/bug22.out

Expected for low memory, bug 14231.

> glibc/malloc/tst-trim1.out

Not expected, should investigate.

> glibc/nptl/tst-cancel7.out
> glibc/nptl/tst-cancelx7.out

Known race condition, bug 14232.

-- 
Joseph S. Myers
joseph@codesourcery.com

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Optimize MIPS memcpy
  2012-09-21 18:47               ` Steve Ellcey
  2012-09-21 18:57                 ` Joseph S. Myers
@ 2012-09-21 19:12                 ` Maxim Kuvyrkov
  1 sibling, 0 replies; 50+ messages in thread
From: Maxim Kuvyrkov @ 2012-09-21 19:12 UTC (permalink / raw)
  To: Steve Ellcey; +Cc: Andrew T Pinski, Joseph S.  Myers, libc-ports

On 22/09/2012, at 6:46 AM, Steve Ellcey wrote:

> On Thu, 2012-09-20 at 21:05 +1200, Maxim Kuvyrkov wrote:
> 
>> What testing was done for this patch, does it pass glibc testsuite?
> 
> Maxim,  I get the same failures with this memcpy as without it when
> running the glibc testsuite:

Thanks Steve.  I'll look into the failures that I'm seeing and try to get a testcase for you.  Also, it's possible that I flunked testing the first time; we'll see.

--
Maxim Kuvyrkov
CodeSourcery / Mentor Graphics


^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Optimize MIPS memcpy (mips glibc test results)
  2012-09-21 18:57                 ` Joseph S. Myers
@ 2012-09-21 20:41                   ` Steve Ellcey
  2012-09-21 20:49                     ` Joseph S. Myers
  0 siblings, 1 reply; 50+ messages in thread
From: Steve Ellcey @ 2012-09-21 20:41 UTC (permalink / raw)
  To: Joseph S. Myers; +Cc: Maxim Kuvyrkov, Andrew T Pinski, libc-ports

On Fri, 2012-09-21 at 18:55 +0000, Joseph S. Myers wrote:
> On Fri, 21 Sep 2012, Steve Ellcey wrote:
> 
> > glibc/localedata/tst-fmon.out
> 
> You should investigate why you see this failure, it's not one of the known 
> issues listed at <http://sourceware.org/glibc/wiki/Release/2.16>

Hm, the out file has a bunch of lines like this:

Locale: "de_DE.ISO-8859-1" Format: "%n" Value: "1.23" Received: "1,23 EUR" Expected: "	1,23 EUR" => false

I am not sure why this would be MIPS-specific or where to look for the problem.
I guess it expects more spaces in the output.

> 
> > glibc/math/test-float.out
> > glibc/math/test-double.out
> > glibc/math/test-ifloat.out
> > glibc/math/test-idouble.out
> 
> Expected for soft-float configurations.

But I thought I was doing a hard float build.  It looks like the
failures are with cos, sincos, clog10, clog, ctan*.  Maybe not having
those instructions in hardware counts as soft-float?


> > glibc/stdlib/tst-strtod-overflow.out
> 
> Not expected, should investigate.  (The test should exit cleanly on memory 
> allocation failure.)

Hm, the out file is completely empty.  But I do see:

Timed out: killed the child process

In the output.

> 
> > glibc/stdio-common/bug22.out
> 
> Expected for low memory, bug 14231.
> 
> > glibc/malloc/tst-trim1.out
> 
> Not expected, should investigate.

Also has the Timed out message like tst-strtod-overflow.out.
> 
> > glibc/nptl/tst-cancel7.out
> > glibc/nptl/tst-cancelx7.out
> 
> Known race condition, bug 14232.

Thanks for the info on known failures.

Steve Ellcey
sellcey@mips.com

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Optimize MIPS memcpy (mips glibc test results)
  2012-09-21 20:41                   ` [PATCH] Optimize MIPS memcpy (mips glibc test results) Steve Ellcey
@ 2012-09-21 20:49                     ` Joseph S. Myers
  2012-09-21 20:56                       ` Steve Ellcey
  0 siblings, 1 reply; 50+ messages in thread
From: Joseph S. Myers @ 2012-09-21 20:49 UTC (permalink / raw)
  To: Steve Ellcey; +Cc: Maxim Kuvyrkov, Andrew T Pinski, libc-ports

On Fri, 21 Sep 2012, Steve Ellcey wrote:

> > > glibc/math/test-float.out
> > > glibc/math/test-double.out
> > > glibc/math/test-ifloat.out
> > > glibc/math/test-idouble.out
> > 
> > Expected for soft-float configurations.
> 
> But I thought I was doing a hard float build.  It looks like the
> failures are with cos, sincos, clog10, clog, ctan*.  Maybe not having
> those instructions in hardware counts as soft-float?

Those have had tests added since I updated libm-test-ulps for 2.16, so if 
it's just small ulps values for new tests then it's also fine and will go 
away when the ulps are updated again.

> > > glibc/stdlib/tst-strtod-overflow.out
> > 
> > Not expected, should investigate.  (The test should exit cleanly on memory 
> > allocation failure.)
> 
> Hm, the out file is completely empty.  But I do see:
> 
> Timed out: killed the child process
> 
> In the output.

You may need to increase your TIMEOUTFACTOR (or depending on how long it 
takes with a sufficiently long timeout, propose an increase of the TIMEOUT 
value in the test itself on libc-alpha).
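
For reference, the in-test change has roughly this shape (a sketch; the
timeout value is made up, and TIMEOUTFACTOR stays a run-time environment
variable applied on top of whatever the test declares):

/* Sketch of a glibc-style test that wants a longer timeout: define TIMEOUT
   before including the test skeleton.  */
#include <string.h>

static int
do_test (void)
{
  char buf[256];
  memset (buf, 0, sizeof buf);
  return buf[0];			/* 0 == PASS */
}

#define TEST_FUNCTION do_test ()
#define TIMEOUT 100			/* seconds; hypothetical value */
#include "../test-skeleton.c"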

> > > glibc/malloc/tst-trim1.out
> > 
> > Not expected, should investigate.
> 
> Also has the Timed out message like tst-strtod-overflow.out.

Again, maybe the test should set an increased TIMEOUT value, depending on
how long it takes.

-- 
Joseph S. Myers
joseph@codesourcery.com

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Optimize MIPS memcpy (mips glibc test results)
  2012-09-21 20:49                     ` Joseph S. Myers
@ 2012-09-21 20:56                       ` Steve Ellcey
  0 siblings, 0 replies; 50+ messages in thread
From: Steve Ellcey @ 2012-09-21 20:56 UTC (permalink / raw)
  To: Joseph S. Myers; +Cc: Maxim Kuvyrkov, Andrew T Pinski, libc-ports

On Fri, 2012-09-21 at 20:48 +0000, Joseph S. Myers wrote:
> On Fri, 21 Sep 2012, Steve Ellcey wrote:
> 
> > > > glibc/math/test-float.out
> > > > glibc/math/test-double.out
> > > > glibc/math/test-ifloat.out
> > > > glibc/math/test-idouble.out
> > > 
> > > Expected for soft-float configurations.
> > 
> > But I thought I was doing a hard float build.  It looks like the
> > failures are with cos, sincos, clog10, clog, ctan*.  Maybe not having
> > those instructions in hardware counts as soft-float?
> 
> Those have had tests added since I updated libm-test-ulps for 2.16, so if 
> it's just small ulps values for new tests then it's also fine and will go 
> away when the ulps are updated again.

Yes, it is mostly 1-ulp differences, with a couple of 2-ulp diffs.

Steve Ellcey
sellcey@mips.com

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Optimize MIPS memcpy
  2012-09-20 18:38               ` Steve Ellcey
@ 2012-09-28  3:48                 ` Maxim Kuvyrkov
  2012-10-06  4:43                   ` Maxim Kuvyrkov
  0 siblings, 1 reply; 50+ messages in thread
From: Maxim Kuvyrkov @ 2012-09-28  3:48 UTC (permalink / raw)
  To: Steve Ellcey; +Cc: Andrew T Pinski, Joseph S.  Myers, libc-ports

On 21/09/2012, at 6:38 AM, Steve Ellcey wrote:

> On Thu, 2012-09-20 at 21:05 +1200, Maxim Kuvyrkov wrote:
> 
>> What testing was done for this patch, does it pass glibc testsuite?
>> 
>> I have a benchmark that exercises various string and mem* routines failing with it.
>> 
>> Thank you,
>> 
>> --
>> Maxim Kuvyrkov
>> CodeSourcery / Mentor Graphics
> 
> Is the benchmark anything you can share?  I ran the glibc testsuite and
> got some failures but I don't think they are due to the new memcpy.  I
> am going back now and running the glibc testsuite with no changes to get
> a baseline so I can verify that.  Hopefully I will have an answer later
> today.
> 
> I ran some other tests like the gcc testsuite using a glibc with this
> change in it and that didn't have any problems and there is the one I
> sent to the list
> http://sourceware.org/ml/libc-ports/2012-09/msg00007.html that also ran
> with no problems.

As I mentioned in a different email, I can't share the benchmark, but I think I've got a testcase of sorts for you to investigate.  It appears your memcpy clobbers a couple of bytes just before DEST in certain cases.

In particular when ABI=N32, DEST=0x10060008, SRC=0x1002c088, N=0x172 it clobbers DEST[-1] and DEST[-2] bytes.
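
A guard-byte harness along these lines is one way to check for that kind of
clobber (a sketch only: the 0x8/0x88 buffer offsets and the 0x172 length
mimic the reported alignments, not the exact addresses, which are
process-dependent):

#include <stdio.h>
#include <string.h>

/* Check whether memcpy writes below its destination.  The offsets and
   length mimic the reported case (dst ending in ...08, src ending in ...88,
   n = 0x172).  */

#define LEN   0x172
#define GUARD 8

static unsigned char dbuf[0x1000] __attribute__ ((aligned (0x1000)));
static unsigned char sbuf[0x1000] __attribute__ ((aligned (0x1000)));

int
main (void)
{
  unsigned char *dst = dbuf + 0x8;	/* low address bits ...008 */
  unsigned char *src = sbuf + 0x88;	/* low address bits ...088 */
  int bad = 0;

  memset (dbuf, 0xa5, sizeof dbuf);	/* guard pattern everywhere */
  for (unsigned int i = 0; i < LEN; i++)
    src[i] = (unsigned char) i;

  memcpy (dst, src, LEN);

  /* The bytes just below dst must still hold the guard pattern.  */
  for (int i = 1; i <= GUARD; i++)
    if (dst[-i] != 0xa5)
      {
	printf ("clobbered dst[-%d]: 0x%02x\n", i, dst[-i]);
	bad = 1;
      }

  /* And the copy itself must be intact.  */
  for (unsigned int i = 0; i < LEN; i++)
    if (dst[i] != (unsigned char) i)
      {
	printf ("bad copy at offset %u\n", i);
	bad = 1;
      }

  puts (bad ? "FAIL" : "OK");
  return bad;
}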

--
Maxim Kuvyrkov
CodeSourcery / Mentor Graphics


^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Optimize MIPS memcpy
  2012-09-28  3:48                 ` Maxim Kuvyrkov
@ 2012-10-06  4:43                   ` Maxim Kuvyrkov
  2012-10-08 17:04                     ` Steve Ellcey
  0 siblings, 1 reply; 50+ messages in thread
From: Maxim Kuvyrkov @ 2012-10-06  4:43 UTC (permalink / raw)
  To: Steve Ellcey; +Cc: Andrew T Pinski, Joseph S. Myers, libc-ports

On 28/09/2012, at 3:47 PM, Maxim Kuvyrkov wrote:

> On 21/09/2012, at 6:38 AM, Steve Ellcey wrote:
> 
>> On Thu, 2012-09-20 at 21:05 +1200, Maxim Kuvyrkov wrote:
>> 
>>> What testing was done for this patch, does it pass glibc testsuite?
>>> 
>>> I have a benchmark that exercises various string and mem* routines failing with it.
>>> 
>>> Thank you,
>>> 
>>> --
>>> Maxim Kuvyrkov
>>> CodeSourcery / Mentor Graphics
>> 
>> Is the benchmark anything you can share?  I ran the glibc testsuite and
>> got some failures but I don't think they are due to the new memcpy.  I
>> am going back now and running the glibc testsuite with no changes to get
>> a baseline so I can verify that.  Hopefully I will have an answer later
>> today.
>> 
>> I ran some other tests like the gcc testsuite using a glibc with this
>> change in it and that didn't have any problems and there is the one I
>> sent to the list
>> http://sourceware.org/ml/libc-ports/2012-09/msg00007.html that also ran
>> with no problems.
> 
> As I mentioned in a different email, I can't share the benchmark, but I think I've got a testcase of sorts for you to investigate.  It appears your memcpy clobbers a couple of bytes just before DEST in certain cases.
> 
> In particular when ABI=N32, DEST=0x10060008, SRC=0x1002c088, N=0x172 it clobbers DEST[-1] and DEST[-2] bytes.

Steve and I have debugged these failures and they now seem to be resolved.  I'll let Steve follow up with analysis and a new patch.

Meanwhile, I've benchmarked Steve's patch against mine.  On the benchmark that I use both implementations provide equal performance for the N64 ABI, but on the N32 ABI Steve's patch is only half as fast.  This is probably due to using 4-byte operations instead of 8-byte operations for the N32 ABI:

#if _MIPS_SIM == _ABI64
#define USE_DOUBLE
#endif

It should be easy to improve Steve's patch for N32 ABI.  Steve, will you look into that?
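
Something along these lines in the preprocessor setup of your patch should
be enough to enable the 8-byte operations for n32 as well (a sketch, using
the _ABIN32 value of _MIPS_SIM):

#if (_MIPS_SIM == _ABI64) || (_MIPS_SIM == _ABIN32)
#define USE_DOUBLE
#endif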

I would also appreciate it if you could look into making your version of memcpy memmove-safe, if it is not already.

Thank you,

--
Maxim Kuvyrkov
CodeSourcery / Mentor Graphics


^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Optimize MIPS memcpy
  2012-10-06  4:43                   ` Maxim Kuvyrkov
@ 2012-10-08 17:04                     ` Steve Ellcey
  2012-10-08 22:31                       ` Maxim Kuvyrkov
  0 siblings, 1 reply; 50+ messages in thread
From: Steve Ellcey @ 2012-10-08 17:04 UTC (permalink / raw)
  To: Maxim Kuvyrkov; +Cc: Andrew T Pinski, Joseph S. Myers, libc-ports

On Sat, 2012-10-06 at 17:43 +1300, Maxim Kuvyrkov wrote:

> Steve and I have debugged these failures and they now seem to be resolved.  I'll let Steve to followup with analysis and a new patch.
> 
> Meanwhile, I've benchmarked Steve's patch against mine.  On the benchmark that I use both implementations provide equal performance for N64 ABI, but on N32 ABI Steve's patch is only half as fast.  This is, probably, due to using 4-byte operations instead of 8-byte operations for N32 ABI:
> 
> #if _MIPS_SIM == _ABI64
> #define USE_DOUBLE
> #endif
> 
> It should be easy to improve Steve's patch for N32 ABI.  Steve, will you look into that?
> 
> I would also appreciate if you look into making your version of memcpy memmove-safe, if it is not already.
> 
> Thank you,
> 
> --
> Maxim Kuvyrkov
> CodeSourcery / Mentor Graphics

Maxim, do you know if your test is doing a memcpy on overlapping memory?
While our analysis showed that the problem was due to the use of the
'prepare to store' prefetch hint, the code I sent earlier should have
worked fine for any code that was not doing an overlapping memcpy.

For anyone who may be interested, the 'prepare for store' prefetch hint
is different from other 'safe' prefetches, which can be executed or
ignored without affecting the results of the code being executed.

Instead of bringing a chunk of memory into the cache, it simply
allocates a line of the cache and zeros it out.  If you write to
every byte of that cache line, you are OK.  But if you use the
'prepare to store' hint and do not write to the entire cache line,
then the bytes you do not write get written back to memory as zeros,
overwriting whatever was there before.  The code in my memcpy routine
accounts for this by checking the length of the buffer before doing the
'prepare to store' prefetches and only using them when it knows that it
is going to write to the entire cache line.
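
In C terms the guard amounts to the check below (a model of the idea, not
the actual assembly; the 32-byte line size is the assumption the code
makes):

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

#define CACHE_LINE 32u

/* A prepare-for-store prefetch of the cache line starting at LINE is only
   safe when the copy into [DST, DST + N) will overwrite every byte of that
   line; otherwise the unwritten bytes come back from the cache as zeros.  */
static int
pfs_line_is_safe (uintptr_t dst, size_t n, uintptr_t line)
{
  return line >= dst && line + CACHE_LINE <= dst + n;
}

int
main (void)
{
  printf ("%d\n", pfs_line_is_safe (0x1000, 200, 0x1080));	/* 1: line fully written */
  printf ("%d\n", pfs_line_is_safe (0x1000, 200, 0x10c0));	/* 0: line ends past the copy */
  return 0;
}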

The other issue is that if the source and destination of the memcpy
overlap, and you use the 'prepare to store' prefetch on a memory address
that is also part of the source of the memcpy, you will get incorrect
results.  That means that if we want memcpy to be 'memmove-safe'
we cannot use the 'prepare to store' hint.
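
For what it is worth, the USE_MEMMOVE_FOR_OVERLAP path in the patch already
handles that case by bouncing to memmove; in C terms the overlap test it
performs is just this (a sketch, not the actual assembly):

#include <stddef.h>
#include <string.h>

static void *
memcpy_or_memmove (void *dst, const void *src, size_t n)
{
  ptrdiff_t d = (char *) dst - (const char *) src;

  if ((size_t) (d < 0 ? -d : d) < n)
    return memmove (dst, src, n);	/* buffers overlap */
  return memcpy (dst, src, n);		/* distance >= n: plain copy is safe */
}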

I will fix the code to use double loads and stores with the N32 ABI
and add comments about the 'prepare to store' hint.  I hate to give up
on using the 'prepare for store' prefetch hint, since it does result in
the best performance, but given the various issues maybe it is not the
best idea for glibc.

Steve Ellcey
sellcey@mips.com



^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Optimize MIPS memcpy
  2012-10-08 17:04                     ` Steve Ellcey
@ 2012-10-08 22:31                       ` Maxim Kuvyrkov
  2012-10-09 20:50                         ` Steve Ellcey
  2012-10-15 17:49                         ` Steve Ellcey
  0 siblings, 2 replies; 50+ messages in thread
From: Maxim Kuvyrkov @ 2012-10-08 22:31 UTC (permalink / raw)
  To: Steve Ellcey; +Cc: Andrew T Pinski, Joseph S. Myers, libc-ports

On 9/10/2012, at 6:03 AM, Steve Ellcey wrote:

> On Sat, 2012-10-06 at 17:43 +1300, Maxim Kuvyrkov wrote:
> 
>> Steve and I have debugged these failures and they now seem to be resolved.  I'll let Steve to followup with analysis and a new patch.
>> 
>> Meanwhile, I've benchmarked Steve's patch against mine.  On the benchmark that I use both implementations provide equal performance for N64 ABI, but on N32 ABI Steve's patch is only half as fast.  This is, probably, due to using 4-byte operations instead of 8-byte operations for N32 ABI:
>> 
>> #if _MIPS_SIM == _ABI64
>> #define USE_DOUBLE
>> #endif
>> 
>> It should be easy to improve Steve's patch for N32 ABI.  Steve, will you look into that?
>> 
>> I would also appreciate if you look into making your version of memcpy memmove-safe, if it is not already.
>> 
>> Thank you,
>> 
>> --
>> Maxim Kuvyrkov
>> CodeSourcery / Mentor Graphics
> 
> Maxim, do you know if your test is doing a memcpy on overlapping memory?
> While our analysis showed that the problem was due to the use of the
> 'prepare to store' prefetch hint, the code I sent earlier should have
> worked fine for any code that was not doing an overlapping memcpy.

The test does not use overlapping memcpy.

> 
> For anyone who may be interested, the 'prepare for store' prefetch hint
> is different then other 'safe' prefetches which can be executed or
> ignored without affecting the results of the code being executed. 
> 
> Instead of bringing a chunk of memory into the cache, it simply
> allocates a line of cache for use and zeros it out.  If you write to
> every byte of that line of cache, you are OK.  But if you use the
> 'prepare to store' cache hint and do not write to the entire cache line
> then the bytes you don't write to get written back to memory as zeros,
> overwriting whatever was there before.  The code in my memcpy routine
> accounts for this, by checking the length of the buffer before doing the
> 'prepare to store' prefetches and only using them when it knows that it
> is going to write to the entire cache line.

Can there be a bug in the logic that decides that a prepare-for-store prefetch is safe?

I've checked documentation for XLP (which is the target I'm using for testing) and it specifies 32-byte prefetch.

> 
> The other issue though is if the source and destination of the memcpy
> overlap and if you use the prepare to store prefetch on a memory address
> that is also part of the source of the memcpy you will get incorrect
> results.  That means that if we want to have memcpy be 'memmove-safe'
> we cannot use the 'prepare to store' hint.

I don't think this is a concern.  Memmove will use memcpy only if the memory locations don't overlap.  And for the record, I'm testing without the memcpy-in-memmove patch.

> 
> I will fix the code to use double loads and stores with the N32 ABI
> and add comments about the 'prepare to store' hint.  I hate to give up
> on using the 'prepare for store' prefetch hint, since it does result in
> the best peformance,  but given the various issues maybe it is not the
> best idea for glibc.

I too want to keep prepare-for-store prefetches if possible.  For debugging purposes you could amend the prepare-for-store prefetch macros to trigger a loop that would unconditionally clobber the memory locations that prepare-for-store is expected to zero out.  Or add some other assertions to help out with debugging.

Thanks,

--
Maxim Kuvyrkov
CodeSourcery / Mentor Graphics


^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Optimize MIPS memcpy
  2012-10-08 22:31                       ` Maxim Kuvyrkov
@ 2012-10-09 20:50                         ` Steve Ellcey
  2012-10-15 17:49                         ` Steve Ellcey
  1 sibling, 0 replies; 50+ messages in thread
From: Steve Ellcey @ 2012-10-09 20:50 UTC (permalink / raw)
  To: Maxim Kuvyrkov; +Cc: Andrew T Pinski, Joseph S. Myers, libc-ports

[-- Attachment #1: Type: text/plain, Size: 1207 bytes --]

On Tue, 2012-10-09 at 11:30 +1300, Maxim Kuvyrkov wrote:


> Can there be a bug in logic that decides that a prepare-for-store prefetch is safe?
> 
> I've checked documentation for XLP (which is the target I'm using for testing) and it specifies 32-byte prefetch.

It's possible, but I haven't found one yet if there is.  One thought I had
was endianness: when you see the problem, are you running in big-endian
or little-endian mode?

> 
> I too want to keep prepare-for-store prefetches if possible.  For debugging purposes you could amend
> prepare-for-store prefetch macros to trigger a loop that would unconditionally clobber memory locations
> that prepare-for-store is expected to zero-out.  Or add some other assertions to help out with debugging.

That is an interesting idea.  Here is a new copy of memcpy; it has a
macro 'DEBUG_PREFETCH' which, if you set it, will replace the
prepare-for-store prefetch with a set of writes that store 32 bytes of
zeros.  I still can't find any problems using this macro, though.  I also
fixed the N32 version to use double registers.  Could you try this
version (with DEBUG_PREFETCH set) and see if you still get the problem?

Steve Ellcey
sellcey@mips.com

[-- Attachment #2: memcpy.patch --]
[-- Type: text/x-patch, Size: 23390 bytes --]

diff --git a/ports/sysdeps/mips/memcpy.S b/ports/sysdeps/mips/memcpy.S
index 753f67c..49e22e5 100644
--- a/ports/sysdeps/mips/memcpy.S
+++ b/ports/sysdeps/mips/memcpy.S
@@ -1,7 +1,8 @@
-/* Copyright (C) 2002-2012 Free Software Foundation, Inc.
+/* Copyright (C) 2012 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
-   Contributed by Hartvig Ekner <hartvige@mips.com>, 2002.
-
+  
+   Contributed by MIPS Technologies, Inc.
+  
    The GNU C Library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Lesser General Public
    License as published by the Free Software Foundation; either
@@ -16,119 +17,648 @@
    License along with the GNU C Library.  If not, see
    <http://www.gnu.org/licenses/>.  */
 
+#ifdef ANDROID_CHANGES
+#include "machine/asm.h"
+#include "machine/regdef.h"
+#define USE_MEMMOVE_FOR_OVERLAP
+#define PREFETCH_LOAD_HINT PREFETCH_HINT_LOAD_STREAMED
+#define PREFETCH_STORE_HINT PREFETCH_HINT_PREPAREFORSTORE
+#elif _LIBC
 #include <sysdep.h>
+#include <regdef.h>
+#include <sys/asm.h>
+#define PREFETCH_LOAD_HINT PREFETCH_HINT_LOAD_STREAMED
+#define PREFETCH_STORE_HINT PREFETCH_HINT_PREPAREFORSTORE
+#elif _NEWLIB
+#include "machine/asm.h"
+#include "machine/regdef.h"
+#define PREFETCH_LOAD_HINT PREFETCH_HINT_LOAD_STREAMED
+#define PREFETCH_STORE_HINT PREFETCH_HINT_PREPAREFORSTORE
+#else
+#include <regdef.h>
+#include <sys/asm.h>
+#endif
+
+#if (_MIPS_ISA == _MIPS_ISA_MIPS4) || (_MIPS_ISA == _MIPS_ISA_MIPS5) || \
+    (_MIPS_ISA == _MIPS_ISA_MIPS32) || (_MIPS_ISA == _MIPS_ISA_MIPS64)
+#ifndef DISABLE_PREFETCH
+#define USE_PREFETCH
+#endif
+#endif
+
+#if (_MIPS_SIM == _ABI64) || (_MIPS_SIM == _ABIN32)
+#ifndef DISABLE_DOUBLE
+#define USE_DOUBLE
+#endif
+#endif
+
+
+
+/* Some asm.h files do not have the L macro definition.  */
+#ifndef L
+#if _MIPS_SIM == _ABIO32
+# define L(label) $L ## label
+#else
+# define L(label) .L ## label
+#endif
+#endif
+
+/* Some asm.h files do not have the PTR_ADDIU macro definition.  */
+#ifndef PTR_ADDIU
+#ifdef USE_DOUBLE
+#define PTR_ADDIU	daddiu
+#else
+#define PTR_ADDIU	addiu
+#endif
+#endif
+
+/* Some asm.h files do not have the PTR_SRA macro definition.  */
+#ifndef PTR_SRA
+#ifdef USE_DOUBLE
+#define PTR_SRA		dsra
+#else
+#define PTR_SRA		sra
+#endif
+#endif
 
 
-/* void *memcpy(void *s1, const void *s2, size_t n);  */
+/*
+ * Using PREFETCH_HINT_LOAD_STREAMED instead of PREFETCH_LOAD on load
+ * prefetches appears to offer a slight performance advantage.
+ *
+ * Using PREFETCH_HINT_PREPAREFORSTORE instead of PREFETCH_STORE
+ * or PREFETCH_STORE_STREAMED offers a large performance advantage
+ * but PREPAREFORSTORE has some special restrictions to consider.
+ *
+ * Prefetch with the 'prepare for store' hint does not copy a memory
+ * location into the cache, it just allocates a cache line and zeros
+ * it out.  This means that if you do not write to the entire cache
+ * line before writing it out to memory some data will get zero'ed out
+ * when the cache line is written back to memory and data will be lost.
+ *
+ * Also if you are using this memcpy to copy overlapping buffers it may
+ * not behave correctly when using the 'prepare for store' hint.  If you
+ * use the 'prepare for store' prefetch on a memory area that is in the
+ * memcpy source (as well as the memcpy destination), then you will get
+ * some data zero'ed out before you have a chance to read it and data will
+ * be lost.
+ *
+ * If you are going to use this memcpy routine with the 'prepare for store'
+ * prefetch you may want to set USE_MEMMOVE_FOR_OVERLAP in order to avoid
+ * the problem of running memcpy on overlapping buffers.
+ *
+ * There are ifdef'ed sections of this memcpy to make sure that it does not
+ * do prefetches on cache lines that are not going to be completely written.
+ * This code is only needed and only used when PREFETCH_STORE_HINT is set to 
+ * PREFETCH_HINT_PREPAREFORSTORE.  This code assumes that cache lines are
+ * 32 bytes.
+ */
 
+#ifdef USE_PREFETCH
+# define PREFETCH_HINT_LOAD		0
+# define PREFETCH_HINT_STORE		1
+# define PREFETCH_HINT_LOAD_STREAMED	4
+# define PREFETCH_HINT_STORE_STREAMED	5
+# define PREFETCH_HINT_LOAD_RETAINED	6
+# define PREFETCH_HINT_STORE_RETAINED	7
+# define PREFETCH_HINT_WRITEBACK_INVAL	25
+# define PREFETCH_HINT_PREPAREFORSTORE	30
+
+/*
+ * If we have not picked out what hints to use at this point use the
+ * standard load and store prefetch hints.
+ */
+#ifndef PREFETCH_STORE_HINT
+#define PREFETCH_STORE_HINT PREFETCH_HINT_STORE
+#endif
+#ifndef PREFETCH_LOAD_HINT
+#define PREFETCH_LOAD_HINT PREFETCH_HINT_LOAD
+#endif
+
+/*
+ * We double everything when USE_DOUBLE is true so we do 2 prefetches to
+ * get 64 bytes in that case.  The assumption is that each individual 
+ * prefetch brings in 32 bytes.
+ *
+ * You can try defining DEBUG_PREFETCH if you are having problems with
+ * the 'prepare for store' prefetch hint, if set the code will zero out
+ * the 32 bytes that are part of the prefetch instead of doing the actual
+ * prefetch.  If this causes problems then you know that you are prefetching
+ * memory that you are not writing to.
+ */
+#ifdef USE_DOUBLE
+# define PREFETCH_CHUNK 64
+# define PREFETCH_FOR_LOAD(chunk, reg) \
+ pref PREFETCH_LOAD_HINT, (chunk)*32(reg); \
+ pref PREFETCH_LOAD_HINT, ((chunk)+1)*32(reg)
+#if defined(DEBUG_PREFETCH) \
+    && (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
+# define PREFETCH_FOR_STORE(chunk, reg) \
+  sd zero, (((chunk)*32)+0)(reg); \
+  sd zero, (((chunk)*32)+8)(reg); \
+  sd zero, (((chunk)*32)+16)(reg); \
+  sd zero, (((chunk)*32)+24)(reg); \
+  sd zero, (((chunk)*32)+32)(reg); \
+  sd zero, (((chunk)*32)+40)(reg); \
+  sd zero, (((chunk)*32)+48)(reg); \
+  sd zero, (((chunk)*32)+56)(reg)
+#else
+# define PREFETCH_FOR_STORE(chunk, reg) \
+ pref PREFETCH_STORE_HINT, (chunk)*32(reg); \
+ pref PREFETCH_STORE_HINT, ((chunk)+1)*32(reg)
+#endif
+#else
+# define PREFETCH_CHUNK 32
+# define PREFETCH_FOR_LOAD(chunk, reg) \
+ pref PREFETCH_LOAD_HINT, (chunk)*32(reg)
+#if defined(DEBUG_PREFETCH) \
+    && (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
+# define PREFETCH_FOR_STORE(chunk, reg) \
+  sw zero, (((chunk)*32)+0)(reg); \
+  sw zero, (((chunk)*32)+4)(reg); \
+  sw zero, (((chunk)*32)+8)(reg); \
+  sw zero, (((chunk)*32)+12)(reg); \
+  sw zero, (((chunk)*32)+16)(reg); \
+  sw zero, (((chunk)*32)+20)(reg); \
+  sw zero, (((chunk)*32)+24)(reg); \
+  sw zero, (((chunk)*32)+28)(reg)
+#else
+# define PREFETCH_FOR_STORE(chunk, reg) \
+ pref PREFETCH_STORE_HINT, (chunk)*32(reg)
+#endif
+#endif
+#define PREFETCH_LIMIT (5 * PREFETCH_CHUNK)
+#else
+# define PREFETCH_FOR_LOAD(offset, reg)
+# define PREFETCH_FOR_STORE(offset, reg)
+#endif
+
+/* Allow the routine to be named something else if desired.  */
+#ifndef MEMCPY_NAME
+#define MEMCPY_NAME memcpy
+#endif
+
+/* We use these 32/64 bit registers as temporaries to do the copying.  */
+#define REG0 t0
+#define REG1 t1
+#define REG2 t2
+#define REG3 t3
+#if _MIPS_SIM == _ABIO32
+#  define REG4 t4
+#  define REG5 t5
+#  define REG6 t6
+#  define REG7 t7
+#else
+#  define REG4 ta0
+#  define REG5 ta1
+#  define REG6 ta2
+#  define REG7 ta3
+#endif
+
+/* We load/store 64 bits at a time when USE_DOUBLE is true.
+ * The C_ prefix stands for CHUNK and is used to avoid macro name
+ * conflicts with system header files.  */
+
+#ifdef USE_DOUBLE
+#  define C_ST	sd
+#  define C_LD	ld
 #if __MIPSEB
-#  define LWHI	lwl		/* high part is left in big-endian	*/
-#  define SWHI	swl		/* high part is left in big-endian	*/
-#  define LWLO	lwr		/* low part is right in big-endian	*/
-#  define SWLO	swr		/* low part is right in big-endian	*/
+#  define C_LDHI	ldl	/* high part is left in big-endian	*/
+#  define C_STHI	sdl	/* high part is left in big-endian	*/
+#  define C_LDLO	ldr	/* low part is right in big-endian	*/
+#  define C_STLO	sdr	/* low part is right in big-endian	*/
 #else
-#  define LWHI	lwr		/* high part is right in little-endian	*/
-#  define SWHI	swr		/* high part is right in little-endian	*/
-#  define LWLO	lwl		/* low part is left in little-endian	*/
-#  define SWLO	swl		/* low part is left in little-endian	*/
+#  define C_LDHI	ldr	/* high part is right in little-endian	*/
+#  define C_STHI	sdr	/* high part is right in little-endian	*/
+#  define C_LDLO	ldl	/* low part is left in little-endian	*/
+#  define C_STLO	sdl	/* low part is left in little-endian	*/
 #endif
+#else
+#  define C_ST	sw
+#  define C_LD	lw
+#if __MIPSEB
+#  define C_LDHI	lwl	/* high part is left in big-endian	*/
+#  define C_STHI	swl	/* high part is left in big-endian	*/
+#  define C_LDLO	lwr	/* low part is right in big-endian	*/
+#  define C_STLO	swr	/* low part is right in big-endian	*/
+#else
+#  define C_LDHI	lwr	/* high part is right in little-endian	*/
+#  define C_STHI	swr	/* high part is right in little-endian	*/
+#  define C_LDLO	lwl	/* low part is left in little-endian	*/
+#  define C_STLO	swl	/* low part is left in little-endian	*/
+#endif
+#endif
+
+/* Bookkeeping values for 32 vs. 64 bit mode.  */
+#ifdef USE_DOUBLE
+#  define NSIZE 8
+#  define NSIZEMASK 0x3f
+#  define NSIZEDMASK 0x7f
+#else
+#  define NSIZE 4
+#  define NSIZEMASK 0x1f
+#  define NSIZEDMASK 0x3f
+#endif
+#define UNIT(unit) ((unit)*NSIZE)
+#define UNITM1(unit) (((unit)*NSIZE)-1)
 
-ENTRY (memcpy)
+#ifdef ANDROID_CHANGES
+LEAF(MEMCPY_NAME, 0)
+#else
+LEAF(MEMCPY_NAME)
+#endif
+	.set	nomips16
 	.set	noreorder
+/*
+ * Below we handle the case where memcpy is called with overlapping src and dst.
+ * Although memcpy is not required to handle this case, some parts of Android
+ * like Skia rely on such usage. We call memmove to handle such cases.
+ */
+#ifdef USE_MEMMOVE_FOR_OVERLAP
+	PTR_SUBU t0,a0,a1
+	PTR_SRA	t2,t0,31
+	xor	t1,t0,t2
+	PTR_SUBU t0,t1,t2
+	sltu	t2,t0,a2
+	beq	t2,zero,L(memcpy)
+	la	t9,memmove
+	jr	t9
+	 nop
+L(memcpy):
+#endif
+/*
+ * If the size is less than 2*NSIZE (8 or 16), go to L(lastb).  Regardless of
+ * size, copy dst pointer to v0 for the return value.
+ */
+	slti	t2,a2,(2 * NSIZE)
+	bne	t2,zero,L(lastb)
+	move	v0,a0
+/*
+ * If src and dst have different alignments, go to L(unaligned), if they
+ * have the same alignment (but are not actually aligned) do a partial
+ * load/store to make them aligned.  If they are both already aligned
+ * we can start copying at L(aligned).
+ */
+	xor	t8,a1,a0
+	andi	t8,t8,(NSIZE-1)		/* t8 is a0/a1 word-displacement */
+	bne	t8,zero,L(unaligned)
+	PTR_SUBU a3, zero, a0
+
+	andi	a3,a3,(NSIZE-1)		/* copy a3 bytes to align a0/a1	  */
+	beq	a3,zero,L(aligned)	/* if a3=0, it is already aligned */
+	PTR_SUBU a2,a2,a3		/* a2 is the remaining bytes count */
+
+	C_LDHI	t8,0(a1)
+	PTR_ADDU a1,a1,a3
+	C_STHI	t8,0(a0)
+	PTR_ADDU a0,a0,a3
+
+/*
+ * Now dst/src are both aligned to (word or double word) aligned addresses
+ * Set a2 to count how many bytes we have to copy after all the 64/128 byte
+ * chunks are copied and a3 to the dst pointer after all the 64/128 byte 
+ * chunks have been copied.  We will loop, incrementing a0 and a1 until a0
+ * equals a3.
+ */
+
+L(aligned):
+	andi	t8,a2,NSIZEDMASK /* any whole 64-byte/128-byte chunks? */
+	beq	a2,t8,L(chkw)	 /* if a2==t8, no 64-byte/128-byte chunks */
+	PTR_SUBU a3,a2,t8	 /* subtract from a2 the remainder */
+	PTR_ADDU a3,a0,a3	 /* Now a3 is the final dst after loop */
+
+/* When in the loop we may prefetch with the 'prepare to store' hint,
+ * in this case the a0+x should not be past the "t0-32" address.  This
+ * means: for x=128 the last "safe" a0 address is "t0-160".  Alternatively,
+ * for x=64 the last "safe" a0 address is "t0-96".  In the current version we
+ * will use "prefetch hint,128(a0)", so "t0-160" is the limit.
+ */
+#if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
+	PTR_ADDU t0,a0,a2		/* t0 is the "past the end" address */
+	PTR_SUBU t9,t0,PREFETCH_LIMIT	/* t9 is the "last safe pref" address */
+#endif
+	PREFETCH_FOR_LOAD  (0, a1)
+	PREFETCH_FOR_LOAD  (1, a1)
+	PREFETCH_FOR_LOAD  (2, a1)
+	PREFETCH_FOR_STORE (1, a0)
+#if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
+	sltu	v1,t9,a0		/* If a0 > t9 don't use next prefetch */
+	bgtz	v1,L(loop16w)
+	nop
+#endif
+	PREFETCH_FOR_STORE (2, a0)
+L(loop16w):
+	PREFETCH_FOR_LOAD  (3, a1)
+	C_LD	t0,UNIT(0)(a1)
+#if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
+	bgtz	v1,L(skip_pref30_96)
+#endif
+	C_LD	t1,UNIT(1)(a1)
+	PREFETCH_FOR_STORE (3, a0)
+L(skip_pref30_96):
+	C_LD	REG2,UNIT(2)(a1)
+	C_LD	REG3,UNIT(3)(a1)
+	C_LD	REG4,UNIT(4)(a1)
+	C_LD	REG5,UNIT(5)(a1)
+	C_LD	REG6,UNIT(6)(a1)
+	C_LD	REG7,UNIT(7)(a1)
+        PREFETCH_FOR_LOAD (4, a1)
+
+	C_ST	t0,UNIT(0)(a0)
+	C_ST	t1,UNIT(1)(a0)
+	C_ST	REG2,UNIT(2)(a0)
+	C_ST	REG3,UNIT(3)(a0)
+	C_ST	REG4,UNIT(4)(a0)
+	C_ST	REG5,UNIT(5)(a0)
+	C_ST	REG6,UNIT(6)(a0)
+	C_ST	REG7,UNIT(7)(a0)
+
+	C_LD	t0,UNIT(8)(a1)
+#if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
+	bgtz	v1,L(skip_pref30_128)
+#endif
+	C_LD	t1,UNIT(9)(a1)
+	PREFETCH_FOR_STORE (4, a0)
+L(skip_pref30_128):
+	C_LD	REG2,UNIT(10)(a1)
+	C_LD	REG3,UNIT(11)(a1)
+	C_LD	REG4,UNIT(12)(a1)
+	C_LD	REG5,UNIT(13)(a1)
+	C_LD	REG6,UNIT(14)(a1)
+	C_LD	REG7,UNIT(15)(a1)
+        PREFETCH_FOR_LOAD (5, a1)
+	C_ST	t0,UNIT(8)(a0)
+	C_ST	t1,UNIT(9)(a0)
+	C_ST	REG2,UNIT(10)(a0)
+	C_ST	REG3,UNIT(11)(a0)
+	C_ST	REG4,UNIT(12)(a0)
+	C_ST	REG5,UNIT(13)(a0)
+	C_ST	REG6,UNIT(14)(a0)
+	C_ST	REG7,UNIT(15)(a0)
+	PTR_ADDIU a0,a0,UNIT(16)	/* adding 64/128 to dest */
+#if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
+	sltu	v1,t9,a0
+#endif
+	bne	a0,a3,L(loop16w)
+	PTR_ADDIU a1,a1,UNIT(16)	/* adding 64/128 to src */
+	move	a2,t8
+
+/* Here we have src and dest word-aligned but less than 64-bytes or
+ * 128 bytes to go.  Check for a 32(64) byte chunk and copy it if there
+ * is one.  Otherwise jump down to L(chk1w) to handle the tail end of
+ * the copy.
+ */
 
-	slti	t0, a2, 8		# Less than 8?
-	bne	t0, zero, L(last8)
-	move	v0, a0			# Setup exit value before too late
-
-	xor	t0, a1, a0		# Find a0/a1 displacement
-	andi	t0, 0x3
-	bne	t0, zero, L(shift)	# Go handle the unaligned case
-	subu	t1, zero, a1
-	andi	t1, 0x3			# a0/a1 are aligned, but are we
-	beq	t1, zero, L(chk8w)	#  starting in the middle of a word?
-	subu	a2, t1
-	LWHI	t0, 0(a1)		# Yes we are... take care of that
-	addu	a1, t1
-	SWHI	t0, 0(a0)
-	addu	a0, t1
-
-L(chk8w):	
-	andi	t0, a2, 0x1f		# 32 or more bytes left?
-	beq	t0, a2, L(chk1w)
-	subu	a3, a2, t0		# Yes
-	addu	a3, a1			# a3 = end address of loop
-	move	a2, t0			# a2 = what will be left after loop
-L(lop8w):	
-	lw	t0,  0(a1)		# Loop taking 8 words at a time
-	lw	t1,  4(a1)
-	lw	t2,  8(a1)
-	lw	t3, 12(a1)
-	lw	t4, 16(a1)
-	lw	t5, 20(a1)
-	lw	t6, 24(a1)
-	lw	t7, 28(a1)
-	addiu	a0, 32
-	addiu	a1, 32
-	sw	t0, -32(a0)
-	sw	t1, -28(a0)
-	sw	t2, -24(a0)
-	sw	t3, -20(a0)
-	sw	t4, -16(a0)
-	sw	t5, -12(a0)
-	sw	t6,  -8(a0)
-	bne	a1, a3, L(lop8w)
-	sw	t7,  -4(a0)
-
-L(chk1w):	
-	andi	t0, a2, 0x3		# 4 or more bytes left?
-	beq	t0, a2, L(last8)
-	subu	a3, a2, t0		# Yes, handle them one word at a time
-	addu	a3, a1			# a3 again end address
-	move	a2, t0
-L(lop1w):	
-	lw	t0, 0(a1)
-	addiu	a0, 4
-	addiu	a1, 4
-	bne	a1, a3, L(lop1w)
-	sw	t0, -4(a0)
-
-L(last8):	
-	blez	a2, L(lst8e)		# Handle last 8 bytes, one at a time
-	addu	a3, a2, a1
-L(lst8l):	
-	lb	t0, 0(a1)
-	addiu	a0, 1
-	addiu	a1, 1
-	bne	a1, a3, L(lst8l)
-	sb	t0, -1(a0)
-L(lst8e):	
-	jr	ra			# Bye, bye
+L(chkw):
+	PREFETCH_FOR_LOAD (0, a1)
+	andi	t8,a2,NSIZEMASK	/* Is there a 32-byte/64-byte chunk.  */
+				/* t8 is the remainder count past 32 bytes */
+	beq	a2,t8,L(chk1w)	/* When a2=t8, no 32-byte chunk  */
 	nop
+	C_LD	t0,UNIT(0)(a1)
+	C_LD	t1,UNIT(1)(a1)
+	C_LD	REG2,UNIT(2)(a1)
+	C_LD	REG3,UNIT(3)(a1)
+	C_LD	REG4,UNIT(4)(a1)
+	C_LD	REG5,UNIT(5)(a1)
+	C_LD	REG6,UNIT(6)(a1)
+	C_LD	REG7,UNIT(7)(a1)
+	PTR_ADDIU a1,a1,UNIT(8)
+	C_ST	t0,UNIT(0)(a0)
+	C_ST	t1,UNIT(1)(a0)
+	C_ST	REG2,UNIT(2)(a0)
+	C_ST	REG3,UNIT(3)(a0)
+	C_ST	REG4,UNIT(4)(a0)
+	C_ST	REG5,UNIT(5)(a0)
+	C_ST	REG6,UNIT(6)(a0)
+	C_ST	REG7,UNIT(7)(a0)
+	PTR_ADDIU a0,a0,UNIT(8)
 
-L(shift):	
-	subu	a3, zero, a0		# Src and Dest unaligned 
-	andi	a3, 0x3			#  (unoptimized case...)
-	beq	a3, zero, L(shft1)
-	subu	a2, a3			# a2 = bytes left
-	LWHI	t0, 0(a1)		# Take care of first odd part
-	LWLO	t0, 3(a1)
-	addu	a1, a3
-	SWHI	t0, 0(a0)
-	addu	a0, a3
-L(shft1):	
-	andi	t0, a2, 0x3
-	subu	a3, a2, t0
-	addu	a3, a1
-L(shfth):	
-	LWHI	t1, 0(a1)		# Limp through, word by word
-	LWLO	t1, 3(a1)
-	addiu	a0, 4
-	addiu	a1, 4
-	bne	a1, a3, L(shfth)
-	sw	t1, -4(a0)
-	b	L(last8)		# Handle anything which may be left
-	move	a2, t0
+/*
+ * Here we have less than 32(64) bytes to copy.  Set up for a loop to
+ * copy one word (or double word) at a time.  Set a2 to count how many
+ * bytes we have to copy after all the word (or double word) chunks are
+ * copied and a3 to the dst pointer after all the (d)word chunks have
+ * been copied.  We will loop, incrementing a0 and a1 until a0 equals a3.
+ */
+L(chk1w):
+	andi	a2,t8,(NSIZE-1)	/* a2 is the remainder past one (d)word chunks */
+	beq	a2,t8,L(lastb)
+	PTR_SUBU a3,t8,a2	/* a3 is count of bytes in one (d)word chunks */
+	PTR_ADDU a3,a0,a3	/* a3 is the dst address after loop */
 
+/* copying in words (4-byte or 8-byte chunks) */
+L(wordCopy_loop):
+	C_LD	REG3,UNIT(0)(a1)
+	PTR_ADDIU a1,a1,UNIT(1)
+	PTR_ADDIU a0,a0,UNIT(1)
+	bne	a0,a3,L(wordCopy_loop)
+	C_ST	REG3,UNIT(-1)(a0)
+
+/* Copy the last 8 (or 16) bytes */
+L(lastb):
+	blez	a2,L(leave)
+	PTR_ADDU a3,a0,a2	/* a3 is the last dst address */
+L(lastbloop):
+	lb	v1,0(a1)
+	PTR_ADDIU a1,a1,1
+	PTR_ADDIU a0,a0,1
+	bne	a0,a3,L(lastbloop)
+	sb	v1,-1(a0)
+L(leave):
+	j	ra
+	nop
+/*
+ * UNALIGNED case, got here with a3 = "negu a0"
+ * This code is nearly identical to the aligned code above
+ * but only the destination (not the source) gets aligned
+ * so we need to do partial loads of the source followed
+ * by normal stores to the destination (once we have aligned
+ * the destination).
+ */
+
+L(unaligned):
+	andi	a3,a3,(NSIZE-1)	/* copy a3 bytes to align a0/a1 */
+	beqz	a3,L(ua_chk16w) /* if a3=0, it is already aligned */
+	PTR_SUBU a2,a2,a3	/* a2 is the remaining bytes count */
+
+	C_LDHI	v1,UNIT(0)(a1)
+	C_LDLO	v1,UNITM1(1)(a1)
+	PTR_ADDU a1,a1,a3
+	C_STHI	v1,UNIT(0)(a0)
+	PTR_ADDU a0,a0,a3
+
+/*
+ *  Now the destination (but not the source) is aligned
+ * Set a2 to count how many bytes we have to copy after all the 64/128 byte
+ * chunks are copied and a3 to the dst pointer after all the 64/128 byte
+ * chunks have been copied.  We will loop, incrementing a0 and a1 until a0
+ * equals a3.
+ */
+
+L(ua_chk16w):
+	andi	t8,a2,NSIZEDMASK /* any whole 64-byte/128-byte chunks? */
+	beq	a2,t8,L(ua_chkw) /* if a2==t8, no 64-byte/128-byte chunks */
+	PTR_SUBU a3,a2,t8	 /* subtract from a2 the remainder */
+	PTR_ADDU a3,a0,a3	 /* Now a3 is the final dst after loop */
+
+#if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
+	PTR_ADDU t0,a0,a2	  /* t0 is the "past the end" address */
+	PTR_SUBU t9,t0,PREFETCH_LIMIT /* t9 is the "last safe pref" address */
+#endif
+	PREFETCH_FOR_LOAD  (0, a1)
+	PREFETCH_FOR_LOAD  (1, a1)
+	PREFETCH_FOR_LOAD  (2, a1)
+	PREFETCH_FOR_STORE (1, a0)
+#if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
+	sltu	v1,t9,a0
+	bgtz	v1,L(ua_loop16w)  /* skip prefetch for too short arrays */
+	nop
+#endif
+	PREFETCH_FOR_STORE (2, a0)
+L(ua_loop16w):
+	PREFETCH_FOR_LOAD  (3, a1)
+	C_LDHI	t0,UNIT(0)(a1)
+	C_LDLO	t0,UNITM1(1)(a1)
+	C_LDHI	t1,UNIT(1)(a1)
+#if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
+	bgtz	v1,L(ua_skip_pref30_96)
+#endif
+	C_LDLO	t1,UNITM1(2)(a1)
+	PREFETCH_FOR_STORE (3, a0)
+L(ua_skip_pref30_96):
+	C_LDHI	REG2,UNIT(2)(a1)
+	C_LDLO	REG2,UNITM1(3)(a1)
+	C_LDHI	REG3,UNIT(3)(a1)
+	C_LDLO	REG3,UNITM1(4)(a1)
+	C_LDHI	REG4,UNIT(4)(a1)
+	C_LDLO	REG4,UNITM1(5)(a1)
+	C_LDHI	REG5,UNIT(5)(a1)
+	C_LDLO	REG5,UNITM1(6)(a1)
+	C_LDHI	REG6,UNIT(6)(a1)
+	C_LDLO	REG6,UNITM1(7)(a1)
+	C_LDHI	REG7,UNIT(7)(a1)
+	C_LDLO	REG7,UNITM1(8)(a1)
+        PREFETCH_FOR_LOAD (4, a1)
+	C_ST	t0,UNIT(0)(a0)
+	C_ST	t1,UNIT(1)(a0)
+	C_ST	REG2,UNIT(2)(a0)
+	C_ST	REG3,UNIT(3)(a0)
+	C_ST	REG4,UNIT(4)(a0)
+	C_ST	REG5,UNIT(5)(a0)
+	C_ST	REG6,UNIT(6)(a0)
+	C_ST	REG7,UNIT(7)(a0)
+	C_LDHI	t0,UNIT(8)(a1)
+	C_LDLO	t0,UNITM1(9)(a1)
+	C_LDHI	t1,UNIT(9)(a1)
+#if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
+	bgtz	v1,L(ua_skip_pref30_128)
+#endif
+	C_LDLO	t1,UNITM1(10)(a1)
+	PREFETCH_FOR_STORE (4, a0)
+L(ua_skip_pref30_128):
+	C_LDHI	REG2,UNIT(10)(a1)
+	C_LDLO	REG2,UNITM1(11)(a1)
+	C_LDHI	REG3,UNIT(11)(a1)
+	C_LDLO	REG3,UNITM1(12)(a1)
+	C_LDHI	REG4,UNIT(12)(a1)
+	C_LDLO	REG4,UNITM1(13)(a1)
+	C_LDHI	REG5,UNIT(13)(a1)
+	C_LDLO	REG5,UNITM1(14)(a1)
+	C_LDHI	REG6,UNIT(14)(a1)
+	C_LDLO	REG6,UNITM1(15)(a1)
+	C_LDHI	REG7,UNIT(15)(a1)
+	C_LDLO	REG7,UNITM1(16)(a1)
+        PREFETCH_FOR_LOAD (5, a1)
+	C_ST	t0,UNIT(8)(a0)
+	C_ST	t1,UNIT(9)(a0)
+	C_ST	REG2,UNIT(10)(a0)
+	C_ST	REG3,UNIT(11)(a0)
+	C_ST	REG4,UNIT(12)(a0)
+	C_ST	REG5,UNIT(13)(a0)
+	C_ST	REG6,UNIT(14)(a0)
+	C_ST	REG7,UNIT(15)(a0)
+	PTR_ADDIU a0,a0,UNIT(16)	/* adding 64/128 to dest */
+#if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
+	sltu	v1,t9,a0
+#endif
+	bne	a0,a3,L(ua_loop16w)
+	PTR_ADDIU a1,a1,UNIT(16)	/* adding 64/128 to src */
+	move	a2,t8
+
+/* Here we have src and dest word-aligned but less than 64-bytes or
+ * 128 bytes to go.  Check for a 32(64) byte chunk and copy it if there
+ * is one.  Otherwise jump down to L(ua_chk1w) to handle the tail end of
+ * the copy.  */
+
+L(ua_chkw):
+	PREFETCH_FOR_LOAD (0, a1)
+	andi	t8,a2,NSIZEMASK	  /* Is there a 32-byte/64-byte chunk.  */
+				  /* t8 is the remainder count past 32 bytes */
+	beq	a2,t8,L(ua_chk1w) /* When a2=t8, no 32-byte chunk */
+	nop
+	C_LDHI	t0,UNIT(0)(a1)
+	C_LDLO	t0,UNITM1(1)(a1)
+	C_LDHI	t1,UNIT(1)(a1)
+	C_LDLO	t1,UNITM1(2)(a1)
+	C_LDHI	REG2,UNIT(2)(a1)
+	C_LDLO	REG2,UNITM1(3)(a1)
+	C_LDHI	REG3,UNIT(3)(a1)
+	C_LDLO	REG3,UNITM1(4)(a1)
+	C_LDHI	REG4,UNIT(4)(a1)
+	C_LDLO	REG4,UNITM1(5)(a1)
+	C_LDHI	REG5,UNIT(5)(a1)
+	C_LDLO	REG5,UNITM1(6)(a1)
+	C_LDHI	REG6,UNIT(6)(a1)
+	C_LDLO	REG6,UNITM1(7)(a1)
+	C_LDHI	REG7,UNIT(7)(a1)
+	C_LDLO	REG7,UNITM1(8)(a1)
+	PTR_ADDIU a1,a1,UNIT(8)
+	C_ST	t0,UNIT(0)(a0)
+	C_ST	t1,UNIT(1)(a0)
+	C_ST	REG2,UNIT(2)(a0)
+	C_ST	REG3,UNIT(3)(a0)
+	C_ST	REG4,UNIT(4)(a0)
+	C_ST	REG5,UNIT(5)(a0)
+	C_ST	REG6,UNIT(6)(a0)
+	C_ST	REG7,UNIT(7)(a0)
+	PTR_ADDIU a0,a0,UNIT(8)
+/*
+ * Here we have less than 32(64) bytes to copy.  Set up for a loop to
+ * copy one word (or double word) at a time.
+ */
+L(ua_chk1w):
+	andi	a2,t8,(NSIZE-1)	/* a2 is the remainder past one (d)word chunks */
+	beq	a2,t8,L(ua_smallCopy)
+	PTR_SUBU a3,t8,a2	/* a3 is count of bytes in one (d)word chunks */
+	PTR_ADDU a3,a0,a3	/* a3 is the dst address after loop */
+
+/* copying in words (4-byte or 8-byte chunks) */
+L(ua_wordCopy_loop):
+	C_LDHI	v1,UNIT(0)(a1)
+	C_LDLO	v1,UNITM1(1)(a1)
+	PTR_ADDIU a1,a1,UNIT(1)
+	PTR_ADDIU a0,a0,UNIT(1)
+	bne	a0,a3,L(ua_wordCopy_loop)
+	C_ST	v1,UNIT(-1)(a0)
+
+/* Copy the last 8 (or 16) bytes */
+L(ua_smallCopy):
+	beqz	a2,L(leave)
+	PTR_ADDU a3,a0,a2	/* a3 is the last dst address */
+L(ua_smallCopy_loop):
+	lb	v1,0(a1)
+	PTR_ADDIU a1,a1,1
+	PTR_ADDIU a0,a0,1
+	bne	a0,a3,L(ua_smallCopy_loop)
+	sb	v1,-1(a0)
+
+	j	ra
+	nop
+
+	.set	at
 	.set	reorder
-END (memcpy)
-libc_hidden_builtin_def (memcpy)
+END(MEMCPY_NAME)
+#ifdef _LIBC
+libc_hidden_builtin_def (MEMCPY_NAME)
+#endif

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Optimize MIPS memcpy
  2012-10-08 22:31                       ` Maxim Kuvyrkov
  2012-10-09 20:50                         ` Steve Ellcey
@ 2012-10-15 17:49                         ` Steve Ellcey
  2012-10-15 20:20                           ` Andrew Pinski
  2012-10-15 22:05                           ` Maxim Kuvyrkov
  1 sibling, 2 replies; 50+ messages in thread
From: Steve Ellcey @ 2012-10-15 17:49 UTC (permalink / raw)
  To: Maxim Kuvyrkov; +Cc: Andrew T Pinski, Joseph S. Myers, libc-ports

[-- Attachment #1: Type: text/plain, Size: 1284 bytes --]

On Tue, 2012-10-09 at 11:30 +1300, Maxim Kuvyrkov wrote:

> I too want to keep prepare-for-store prefetches if possible.  For debugging purposes you could amend
> prepare-for-store prefetch macros to trigger a loop that would unconditionally clobber memory locations
> that prepare-for-store is expected to zero-out.  Or add some other assertions to help out with debugging.
> 
> Thanks,
> 
> --
> Maxim Kuvyrkov
> CodeSourcery / Mentor Graphics

Maxim,

Could you try running this test program on your system?  I want to see
whether it verifies that your machine is doing 32-byte prefetches.  The
output I get looks like:


0x004754a0, (0x004754a0 to 0x004754c0, 32 byte prefetch)
0x004754a1, (0x004754a0 to 0x004754c0, 32 byte prefetch)
0x004754a2, (0x004754a0 to 0x004754c0, 32 byte prefetch)
0x004754a3, (0x004754a0 to 0x004754c0, 32 byte prefetch)
0x004754a4, (0x004754a0 to 0x004754c0, 32 byte prefetch)
0x004754a5, (0x004754a0 to 0x004754c0, 32 byte prefetch)
.
.
.
0x0047589b, (0x00475880 to 0x004758a0, 32 byte prefetch)
0x0047589c, (0x00475880 to 0x004758a0, 32 byte prefetch)
0x0047589d, (0x00475880 to 0x004758a0, 32 byte prefetch)
0x0047589e, (0x00475880 to 0x004758a0, 32 byte prefetch)
0x0047589f, (0x00475880 to 0x004758a0, 32 byte prefetch)

Steve Ellcey
sellcey@mips.com

[-- Attachment #2: check_prefetch_size.c --]
[-- Type: text/x-csrc, Size: 1066 bytes --]

#include <stdio.h>

char dummy[409600];
char buffer[3072];


void check_buffer(char *p)
{
	int i, zero_start, zero_stop;
	/* Initialize buffer to non-zero data */
	for (i = 0; i < 2048; i++)
		buffer[i] = 1;

	/* Clear buffer out of cache */
	for (i = 0; i < 409600; i++)
		dummy[i] = 9;

#if 1
	__asm__ __volatile__ ("pref 30, 0x0(%0)" : : "r" (p) : "memory");
#endif

	/* Check contents for single block of zeros */
	zero_start = 0;
	while ((buffer[zero_start] == 1) && (zero_start < 2048)) zero_start++;
	zero_stop = zero_start;
	while ((buffer[zero_stop] == 0) && (zero_stop < 2048)) zero_stop++;
	for (i = zero_stop; i < 2048; i++)
		if (buffer[i] == 0) printf("Error, extra set of zeros\n");

	if (zero_start >= 2048)
		printf("0x%8.8lx, (no zeros)\n", (unsigned long) p);
	else
		printf("0x%8.8lx, (0x%8.8lx to 0x%8.8lx, %d byte prefetch)\n",
		       (unsigned long) p, (unsigned long) &buffer[zero_start],
		       (unsigned long) &buffer[zero_stop], (zero_stop - zero_start));

#if 0
	/* Dump buffer contents */
	for (i = 0; i < 2048; i++)
		printf("%1d", buffer[i]);
	printf("\n");
#endif
}

int main(void)
{
	int i;
	for (i = 1024; i < 2048; i++)
		check_buffer(&buffer[i]);
	return 0;
}

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Optimize MIPS memcpy
  2012-10-15 17:49                         ` Steve Ellcey
@ 2012-10-15 20:20                           ` Andrew Pinski
  2012-10-15 20:34                             ` Steve Ellcey
  2012-10-15 22:05                           ` Maxim Kuvyrkov
  1 sibling, 1 reply; 50+ messages in thread
From: Andrew Pinski @ 2012-10-15 20:20 UTC (permalink / raw)
  To: Steve Ellcey; +Cc: Maxim Kuvyrkov, Joseph S. Myers, libc-ports

On Mon, Oct 15, 2012 at 10:49 AM, Steve Ellcey <sellcey@mips.com> wrote:
> On Tue, 2012-10-09 at 11:30 +1300, Maxim Kuvyrkov wrote:
>
>> I too want to keep prepare-for-store prefetches if possible.  For debugging purposes you could amend
>> prepare-for-store prefetch macros to trigger a loop that would unconditionally clobber memory locations
>> that prepare-for-store is expected to zero-out.  Or add some other assertions to help out with debugging.
>>
>> Thanks,
>>
>> --
>> Maxim Kuvyrkov
>> CodeSourcery / Mentor Graphics
>
> Maxim,
>
> Could you try running this test program on your system.  I want to see
> if it verifies that your machine is doing 32 byte prefetches.  The
> output I get looks like:
>
>
> 0x004754a0, (0x004754a0 to 0x004754c0, 32 byte prefetch)
> 0x004754a1, (0x004754a0 to 0x004754c0, 32 byte prefetch)
> 0x004754a2, (0x004754a0 to 0x004754c0, 32 byte prefetch)
> 0x004754a3, (0x004754a0 to 0x004754c0, 32 byte prefetch)
> 0x004754a4, (0x004754a0 to 0x004754c0, 32 byte prefetch)
> 0x004754a5, (0x004754a0 to 0x004754c0, 32 byte prefetch)
> .
> .
> .
> 0x0047589b, (0x00475880 to 0x004758a0, 32 byte prefetch)
> 0x0047589c, (0x00475880 to 0x004758a0, 32 byte prefetch)
> 0x0047589d, (0x00475880 to 0x004758a0, 32 byte prefetch)
> 0x0047589e, (0x00475880 to 0x004758a0, 32 byte prefetch)
> 0x0047589f, (0x00475880 to 0x004758a0, 32 byte prefetch)


On:
system type             : EBB6300 (CN6335p2.1-1500-AAP)
processor               : 0
cpu model               : Cavium Octeon II V0.9
BogoMIPS                : 3000.00
wait instruction        : yes
microsecond timers      : yes
tlb_entries             : 128
extra interrupt vector  : yes
hardware watchpoint     : yes, count: 2, address/irw mask: [0x0ffc, 0x0ffb]
ASEs implemented        :
shadow register sets    : 1
kscratch registers      : 3
core                    : 0
VCED exceptions         : not available
VCEI exceptions         : not available

I get:
...
0x200757cb, (no zeros)
0x200757cc, (no zeros)
0x200757cd, (no zeros)
0x200757ce, (no zeros)
0x200757cf, (0x20075780 to 0x20075800, 128 byte prefetch)
.....
0x2007587a, (no zeros)
0x2007587b, (no zeros)
0x2007587c, (no zeros)
0x2007587d, (no zeros)
0x2007587e, (no zeros)
0x2007587f, (no zeros)
0x20075880, (0x20075880 to 0x20075900, 128 byte prefetch)

Thanks,
Andrew Pinski


>
> Steve Ellcey
> sellcey@mips.com

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Optimize MIPS memcpy
  2012-10-15 20:20                           ` Andrew Pinski
@ 2012-10-15 20:34                             ` Steve Ellcey
  2012-10-15 20:42                               ` Andrew Pinski
  2012-10-15 21:29                               ` Maciej W. Rozycki
  0 siblings, 2 replies; 50+ messages in thread
From: Steve Ellcey @ 2012-10-15 20:34 UTC (permalink / raw)
  To: Andrew Pinski; +Cc: Maxim Kuvyrkov, Joseph S. Myers, libc-ports

On Mon, 2012-10-15 at 13:20 -0700, Andrew Pinski wrote:

> On:
> system type             : EBB6300 (CN6335p2.1-1500-AAP)
> processor               : 0
> cpu model               : Cavium Octeon II V0.9
> 
> I get:
> ...
> 0x200757cb, (no zeros)
> 0x200757cc, (no zeros)
> 0x200757cd, (no zeros)
> 0x200757ce, (no zeros)
> 0x200757cf, (0x20075780 to 0x20075800, 128 byte prefetch)
> 
> Thanks,
> Andrew Pinski

Andrew,

Is there a macro I can/should use when building glibc/memcpy to know
that it should assume a Cavium Octeon with 128 byte prefetch instead of
the 32 byte prefetch?

Steve Ellcey
sellcey@mips.com


^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Optimize MIPS memcpy
  2012-10-15 20:34                             ` Steve Ellcey
@ 2012-10-15 20:42                               ` Andrew Pinski
  2012-10-15 20:50                                 ` Andrew Pinski
  2012-10-15 21:29                               ` Maciej W. Rozycki
  1 sibling, 1 reply; 50+ messages in thread
From: Andrew Pinski @ 2012-10-15 20:42 UTC (permalink / raw)
  To: Steve Ellcey; +Cc: Maxim Kuvyrkov, Joseph S. Myers, libc-ports

On Mon, Oct 15, 2012 at 1:34 PM, Steve Ellcey <sellcey@mips.com> wrote:
> On Mon, 2012-10-15 at 13:20 -0700, Andrew Pinski wrote:
>
>> On:
>> system type             : EBB6300 (CN6335p2.1-1500-AAP)
>> processor               : 0
>> cpu model               : Cavium Octeon II V0.9
>>
>> I get:
>> ...
>> 0x200757cb, (no zeros)
>> 0x200757cc, (no zeros)
>> 0x200757cd, (no zeros)
>> 0x200757ce, (no zeros)
>> 0x200757cf, (0x20075780 to 0x20075800, 128 byte prefetch)
>>
>> Thanks,
>> Andrew Pinski
>
> Andrew,
>
> Is there a macro I can/should use when building glibc/memcpy to know
> that it should assume a Cavium Octeon with 128 byte prefetch instead of
> the 32 byte prefetch?


When building, you could use __OCTEON__, but that does not change the fact
that you could build glibc for the generic mips32/mips64 targets and then
not get a working glibc if it defaults to a 32-byte prefetch.
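
Roughly, all a compile-time check buys you is something like the following
(PREFETCH_LINE_SIZE is a made-up name for illustration, not something in
the patch), and a generic mips32/mips64 build still ends up with the
unsafe 32-byte assumption on Octeon:

#ifdef __OCTEON__
# define PREFETCH_LINE_SIZE 128	/* what the run above shows for Octeon II */
#else
# define PREFETCH_LINE_SIZE 32	/* the generic assumption in the patch */
#endif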

Thanks,
Andrew Pinski

>
> Steve Ellcey
> sellcey@mips.com
>
>

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Optimize MIPS memcpy
  2012-10-15 20:42                               ` Andrew Pinski
@ 2012-10-15 20:50                                 ` Andrew Pinski
  2012-10-15 21:36                                   ` Steve Ellcey
  0 siblings, 1 reply; 50+ messages in thread
From: Andrew Pinski @ 2012-10-15 20:50 UTC (permalink / raw)
  To: Steve Ellcey; +Cc: Maxim Kuvyrkov, Joseph S. Myers, libc-ports

On Mon, Oct 15, 2012 at 1:42 PM, Andrew Pinski <pinskia@gmail.com> wrote:
> On Mon, Oct 15, 2012 at 1:34 PM, Steve Ellcey <sellcey@mips.com> wrote:
>> On Mon, 2012-10-15 at 13:20 -0700, Andrew Pinski wrote:
>>
>>> On:
>>> system type             : EBB6300 (CN6335p2.1-1500-AAP)
>>> processor               : 0
>>> cpu model               : Cavium Octeon II V0.9
>>>
>>> I get:
>>> ...
>>> 0x200757cb, (no zeros)
>>> 0x200757cc, (no zeros)
>>> 0x200757cd, (no zeros)
>>> 0x200757ce, (no zeros)
>>> 0x200757cf, (0x20075780 to 0x20075800, 128 byte prefetch)
>>>
>>> Thanks,
>>> Andrew Pinski
>>
>> Andrew,
>>
>> Is there a macro I can/should use when building glibc/memcpy to know
>> that it should assume a Cavium Octeon with 128 byte prefetch instead of
>> the 32 byte prefetch?
>
>
> Building you could use __OCTEON__ but that does not change the fact
> you could build glibc for the standard mips32/mips64 and then not get
> a working glibc if it defaults to 32bytes prefetch.

Also, it would be nice to use ifuncs the way they are used on x86_64 (and
I think PPC as well) so we can compile one generic version of glibc and
still get the optimized version of memcpy.  Though ifuncs have their own
issue in that they don't currently work on MIPS (they cause internal linker
errors).

Thanks,
Andrew


>
> Thanks,
> Andrew Pinski
>
>>
>> Steve Ellcey
>> sellcey@mips.com
>>
>>

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Optimize MIPS memcpy
  2012-10-15 20:34                             ` Steve Ellcey
  2012-10-15 20:42                               ` Andrew Pinski
@ 2012-10-15 21:29                               ` Maciej W. Rozycki
  1 sibling, 0 replies; 50+ messages in thread
From: Maciej W. Rozycki @ 2012-10-15 21:29 UTC (permalink / raw)
  To: Steve Ellcey; +Cc: Andrew Pinski, Maxim Kuvyrkov, Joseph S. Myers, libc-ports

On Mon, 15 Oct 2012, Steve Ellcey wrote:

> > On:
> > system type             : EBB6300 (CN6335p2.1-1500-AAP)
> > processor               : 0
> > cpu model               : Cavium Octeon II V0.9
> > 
> > I get:
> > ...
> > 0x200757cb, (no zeros)
> > 0x200757cc, (no zeros)
> > 0x200757cd, (no zeros)
> > 0x200757ce, (no zeros)
> > 0x200757cf, (0x20075780 to 0x20075800, 128 byte prefetch)
> > 
> > Thanks,
> > Andrew Pinski
> 
> Andrew,
> 
> Is there a macro I can/should use when building glibc/memcpy to know
> that it should assume a Cavium Octeon with 128 byte prefetch instead of
> the 32 byte prefetch?

 FWIW I don't think hardcoding the cache line size for individual
processor types is going to scale, not to mention that it may not serve
its purpose at all given that the cache line size may be boot-mode or even
run-time configurable in a vendor-specific way (some MTI cores, for example,
use CP0.Config.WC for cache topology reconfiguration, although the
currently available implementations do not seem to include the line sizes
among the reconfigurable parameters).

 This looks to me like a case for multiple copies of the memcpy binary
code, each tuned for a particular cache line size and then selected via the
IFUNC feature -- there should be no run-time penalty for doing that in
dynamic executables/libraries (except from libc itself perhaps) as the
call is going to be made through the GOT anyway.  Of course the line size
needs to be determined somehow at the first invocation -- perhaps the
appropriate bits from the CP0 Config1/2 registers could be exported by the
kernel.
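
 As a sketch of what I mean (nothing here exists today: MIPS has no working
IFUNC support yet, and dcache_line_size () is a placeholder for whatever
kernel-exported interface would provide the value):

#include <stddef.h>

typedef void *(*memcpy_fn) (void *, const void *, size_t);

/* One copy of the assembly memcpy per supported line size, built from the
   same source with different prefetch parameters.  */
extern void *memcpy_line32 (void *, const void *, size_t);
extern void *memcpy_line64 (void *, const void *, size_t);
extern void *memcpy_line128 (void *, const void *, size_t);

extern unsigned int dcache_line_size (void);	/* hypothetical helper */

static memcpy_fn
memcpy_resolver (void)
{
  unsigned int line = dcache_line_size ();

  if (line >= 128)
    return memcpy_line128;
  if (line >= 64)
    return memcpy_line64;
  return memcpy_line32;
}

void *memcpy (void *, const void *, size_t)
  __attribute__ ((ifunc ("memcpy_resolver")));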

 If storage/memory footprint is of concern, then perhaps for -Os builds 
(is that supported for glibc these days anyway?) only a single copy of 
memcpy could be built.

 BTW, the M14Kc only has a 16-byte cache line size, so it will need 
another arrangement.

 Thoughts?

  Maciej

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Optimize MIPS memcpy
  2012-10-15 20:50                                 ` Andrew Pinski
@ 2012-10-15 21:36                                   ` Steve Ellcey
  2012-10-15 21:47                                     ` Maxim Kuvyrkov
  2012-10-15 22:10                                     ` Joseph S. Myers
  0 siblings, 2 replies; 50+ messages in thread
From: Steve Ellcey @ 2012-10-15 21:36 UTC (permalink / raw)
  To: Andrew Pinski; +Cc: Maxim Kuvyrkov, Joseph S. Myers, libc-ports

On Mon, 2012-10-15 at 13:50 -0700, Andrew Pinski wrote:

> Building you could use __OCTEON__ but that does not change the fact
> you could build glibc for the standard mips32/mips64 and then not get
> a working glibc if it defaults to 32bytes prefetch.

So are you saying that we shouldn't use the 'prepare to store' prefetch
in the glibc memcpy then?  We could use one of the other prefetches
without having to worry about bad code on machines with different size
prefetches, but it would not be as fast as using 'prepare to store'.
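
Concretely, that is a one-line change in the _LIBC case of the patch,
replacing the PREFETCH_HINT_PREPAREFORSTORE setting with (sketch):

#define PREFETCH_STORE_HINT PREFETCH_HINT_STORE_STREAMED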

> Also it would be nice to use ifunc's like they are used on x86_64 (and
> I think PPC also) so we can compile one generic version of glibc and
> get the optimized version of memcpy.  Though ifunc's have their own
> issue as they don't currently work on MIPS (they cause internal linker
> errors).

I'll have to look at that; I am not familiar with ifuncs except in
very general terms.

Steve Ellcey
sellcey@mips.com



^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Optimize MIPS memcpy
  2012-10-15 21:36                                   ` Steve Ellcey
@ 2012-10-15 21:47                                     ` Maxim Kuvyrkov
  2012-10-17 17:30                                       ` Steve Ellcey
  2012-10-15 22:10                                     ` Joseph S. Myers
  1 sibling, 1 reply; 50+ messages in thread
From: Maxim Kuvyrkov @ 2012-10-15 21:47 UTC (permalink / raw)
  To: Steve Ellcey; +Cc: Andrew Pinski, Joseph S. Myers, libc-ports

On 16/10/2012, at 10:36 AM, Steve Ellcey wrote:

> On Mon, 2012-10-15 at 13:50 -0700, Andrew Pinski wrote:
> 
>> Building you could use __OCTEON__ but that does not change the fact
>> you could build glibc for the standard mips32/mips64 and then not get
>> a working glibc if it defaults to 32bytes prefetch.
> 
> So are you saying that we shouldn't use the 'prepare to store' prefetch
> in the glibc memcpy then?  We could use one of the other prefetches
> without having to worry about bad code on machines with different size
> prefetches, but it would not be as fast as using 'prepare to store'.
> 
>> Also it would be nice to use ifunc's like they are used on x86_64 (and
>> I think PPC also) so we can compile one generic version of glibc and
>> get the optimized version of memcpy.  Though ifunc's have their own
>> issue as they don't currently work on MIPS (they cause internal linker
>> errors).
> 
> I'll have to look at that, I am not familiar with the ifunc's except in
> very general terms.

I suggest you move support for prepare-to-store prefetches into the TODO category, blocked on working IFUNC support for MIPS.  Without IFUNCs, and with all the different cache line sizes on MIPS, you can't make prepare-to-store work reliably.

Once IFUNC support for MIPS is there, we can revisit using prepare-to-store prefetches.

--
Maxim Kuvyrkov
CodeSourcery / Mentor Graphics

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Optimize MIPS memcpy
  2012-10-15 17:49                         ` Steve Ellcey
  2012-10-15 20:20                           ` Andrew Pinski
@ 2012-10-15 22:05                           ` Maxim Kuvyrkov
  1 sibling, 0 replies; 50+ messages in thread
From: Maxim Kuvyrkov @ 2012-10-15 22:05 UTC (permalink / raw)
  To: Steve Ellcey; +Cc: Andrew T Pinski, Joseph S. Myers, libc-ports

On 16/10/2012, at 6:49 AM, Steve Ellcey wrote:

> On Tue, 2012-10-09 at 11:30 +1300, Maxim Kuvyrkov wrote:
> 
>> I too want to keep prepare-for-store prefetches if possible.  For debugging purposes you could amend
>> prepare-for-store prefetch macros to trigger a loop that would unconditionally clobber memory locations
>> that prepare-for-store is expected to zero-out.  Or add some other assertions to help out with debugging.
>> 
>> Thanks,
>> 
>> --
>> Maxim Kuvyrkov
>> CodeSourcery / Mentor Graphics
> 
> Maxim,
> 
> Could you try running this test program on your system.  I want to see
> if it verifies that your machine is doing 32 byte prefetches.  The
> output I get looks like:
> 
> 
> 0x004754a0, (0x004754a0 to 0x004754c0, 32 byte prefetch)
> 0x004754a1, (0x004754a0 to 0x004754c0, 32 byte prefetch)
> 0x004754a2, (0x004754a0 to 0x004754c0, 32 byte prefetch)
> 0x004754a3, (0x004754a0 to 0x004754c0, 32 byte prefetch)
> 0x004754a4, (0x004754a0 to 0x004754c0, 32 byte prefetch)
> 0x004754a5, (0x004754a0 to 0x004754c0, 32 byte prefetch)
> .
> .
> .
> 0x0047589b, (0x00475880 to 0x004758a0, 32 byte prefetch)
> 0x0047589c, (0x00475880 to 0x004758a0, 32 byte prefetch)
> 0x0047589d, (0x00475880 to 0x004758a0, 32 byte prefetch)
> 0x0047589e, (0x00475880 to 0x004758a0, 32 byte prefetch)
> 0x0047589f, (0x00475880 to 0x004758a0, 32 byte prefetch)

This is a big-endian target and I get 64-byte prefetches (for n32, o32 and n64 ABIs).

Now that I have checked the XLP documentation once again, it says that the L1 cache line is 32 bytes, but L2 and L3 are 64 bytes.  Still, the documentation for the prefetch instruction insists that the expected result is to prefetch 32 bytes.

0x200756d0, (no zeros)
0x200756d1, (0x200756c0 to 0x20075700, 64 byte prefetch)
0x200756d2, (0x200756c0 to 0x20075700, 64 byte prefetch)
0x200756d3, (0x200756c0 to 0x20075700, 64 byte prefetch)
0x200756d4, (0x200756c0 to 0x20075700, 64 byte prefetch)
0x200756d5, (0x200756c0 to 0x20075700, 64 byte prefetch)
0x200756d6, (0x200756c0 to 0x20075700, 64 byte prefetch)

--
Maxim Kuvyrkov
CodeSourcery / Mentor Graphics

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Optimize MIPS memcpy
  2012-10-15 21:36                                   ` Steve Ellcey
  2012-10-15 21:47                                     ` Maxim Kuvyrkov
@ 2012-10-15 22:10                                     ` Joseph S. Myers
  1 sibling, 0 replies; 50+ messages in thread
From: Joseph S. Myers @ 2012-10-15 22:10 UTC (permalink / raw)
  To: Steve Ellcey; +Cc: Andrew Pinski, Maxim Kuvyrkov, libc-ports

On Mon, 15 Oct 2012, Steve Ellcey wrote:

> > Also it would be nice to use ifunc's like they are used on x86_64 (and
> > I think PPC also) so we can compile one generic version of glibc and
> > get the optimized version of memcpy.  Though ifunc's have their own
> > issue as they don't currently work on MIPS (they cause internal linker
> > errors).
> 
> I'll have to look at that, I am not familiar with the ifunc's except in
> very general terms.

IFUNC support would first require defining the associated psABI pieces 
(for all three ABIs), before implementing in ld and glibc.

-- 
Joseph S. Myers
joseph@codesourcery.com

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Optimize MIPS memcpy
  2012-10-15 21:47                                     ` Maxim Kuvyrkov
@ 2012-10-17 17:30                                       ` Steve Ellcey
  2012-10-29 18:00                                         ` Steve Ellcey
  2012-10-31 19:27                                         ` Andreas Jaeger
  0 siblings, 2 replies; 50+ messages in thread
From: Steve Ellcey @ 2012-10-17 17:30 UTC (permalink / raw)
  To: Maxim Kuvyrkov; +Cc: Andrew Pinski, Joseph S. Myers, libc-ports

On Tue, 2012-10-16 at 10:47 +1300, Maxim Kuvyrkov wrote:

> I suggest you move support for prepare-to-store prefetches into TODO category blocked on working 
> IFUNC support for MIPS.  Without IFUNCs and having all the different-sized cache lines on MIPS you
> can't make prepare-to-store work reliably.
> 
> Once IFUNC support for MIPS is there, we can revisit using prepare-to-store prefetches.
> 
> --
> Maxim Kuvyrkov
> CodeSourcery / Mentor Graphics

OK, here is a version of memcpy that uses the STORE_STREAMED prefetch hint.
While it is optimized for a 32-byte prefetch, it will work correctly
regardless of the size of the prefetch.

Is this version OK to check in?

Steve Ellcey
sellcey@mips.com



2012-10-17  Steve Ellcey  <sellcey@mips.com>

	* sysdeps/mips/memcpy.S: Add prefetching and more unrolling, make
	it work in 32 or 64 bit modes.
	* sysdeps/mips/mips64/memcpy.S: Remove.

diff --git a/ports/sysdeps/mips/memcpy.S b/ports/sysdeps/mips/memcpy.S
index 753f67c..71474e9 100644
--- a/ports/sysdeps/mips/memcpy.S
+++ b/ports/sysdeps/mips/memcpy.S
@@ -1,7 +1,8 @@
-/* Copyright (C) 2002-2012 Free Software Foundation, Inc.
+/* Copyright (C) 2012 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
-   Contributed by Hartvig Ekner <hartvige@mips.com>, 2002.
-
+  
+   Contributed by MIPS Technologies, Inc.
+  
    The GNU C Library is free software; you can redistribute it and/or
    modify it under the terms of the GNU Lesser General Public
    License as published by the Free Software Foundation; either
@@ -16,119 +17,616 @@
    License along with the GNU C Library.  If not, see
    <http://www.gnu.org/licenses/>.  */
 
+#ifdef ANDROID_CHANGES
+#include "machine/asm.h"
+#include "machine/regdef.h"
+#define USE_MEMMOVE_FOR_OVERLAP
+#define PREFETCH_LOAD_HINT PREFETCH_HINT_LOAD_STREAMED
+#define PREFETCH_STORE_HINT PREFETCH_HINT_PREPAREFORSTORE
+#elif _LIBC
 #include <sysdep.h>
+#include <regdef.h>
+#include <sys/asm.h>
+#define PREFETCH_LOAD_HINT PREFETCH_HINT_LOAD_STREAMED
+#define PREFETCH_STORE_HINT PREFETCH_HINT_STORE_STREAMED
+#elif _COMPILING_NEWLIB
+#include "machine/asm.h"
+#include "machine/regdef.h"
+#define PREFETCH_LOAD_HINT PREFETCH_HINT_LOAD_STREAMED
+#define PREFETCH_STORE_HINT PREFETCH_HINT_STORE_STREAMED
+#else
+#include <regdef.h>
+#include <sys/asm.h>
+#endif
+
+#if (_MIPS_ISA == _MIPS_ISA_MIPS4) || (_MIPS_ISA == _MIPS_ISA_MIPS5) || \
+    (_MIPS_ISA == _MIPS_ISA_MIPS32) || (_MIPS_ISA == _MIPS_ISA_MIPS64)
+#ifndef DISABLE_PREFETCH
+#define USE_PREFETCH
+#endif
+#endif
+
+#if (_MIPS_SIM == _ABI64) || (_MIPS_SIM == _ABIN32)
+#ifndef DISABLE_DOUBLE
+#define USE_DOUBLE
+#endif
+#endif
+
+
+
+/* Some asm.h files do not have the L macro definition.  */
+#ifndef L
+#if _MIPS_SIM == _ABIO32
+# define L(label) $L ## label
+#else
+# define L(label) .L ## label
+#endif
+#endif
+
+/* Some asm.h files do not have the PTR_ADDIU macro definition.  */
+#ifndef PTR_ADDIU
+#ifdef USE_DOUBLE
+#define PTR_ADDIU	daddiu
+#else
+#define PTR_ADDIU	addiu
+#endif
+#endif
+
+/* Some asm.h files do not have the PTR_SRA macro definition.  */
+#ifndef PTR_SRA
+#ifdef USE_DOUBLE
+#define PTR_SRA		dsra
+#else
+#define PTR_SRA		sra
+#endif
+#endif
+
 
+/*
+ * Using PREFETCH_HINT_LOAD_STREAMED instead of PREFETCH_LOAD on load
+ * prefetches appears to offer a slight performance advantage.
+ *
+ * Using PREFETCH_HINT_PREPAREFORSTORE instead of PREFETCH_STORE
+ * or PREFETCH_STORE_STREAMED offers a large performance advantage
+ * but PREPAREFORSTORE has some special restrictions to consider.
+ *
+ * Prefetch with the 'prepare for store' hint does not copy a memory
+ * location into the cache, it just allocates a cache line and zeros
+ * it out.  This means that if you do not write to the entire cache
+ * line before writing it out to memory some data will get zero'ed out
+ * when the cache line is written back to memory and data will be lost.
+ *
+ * Also if you are using this memcpy to copy overlapping buffers it may
+ * not behave correctly when using the 'prepare for store' hint.  If you
+ * use the 'prepare for store' prefetch on a memory area that is in the
+ * memcpy source (as well as the memcpy destination), then you will get
+ * some data zero'ed out before you have a chance to read it and data will
+ * be lost.
+ *
+ * If you are going to use this memcpy routine with the 'prepare for store'
+ * prefetch you may want to set USE_MEMMOVE_FOR_OVERLAP in order to avoid
+ * the problem of running memcpy on overlapping buffers.
+ *
+ * There are ifdef'ed sections of this memcpy to make sure that it does not
+ * do prefetches on cache lines that are not going to be completely written.
+ * This code is only needed and only used when PREFETCH_STORE_HINT is set to 
+ * PREFETCH_HINT_PREPAREFORSTORE.  This code assumes that cache lines are
+ * 32 bytes and if the cache line is larger it will not work correctly.
+ */
 
-/* void *memcpy(void *s1, const void *s2, size_t n);  */
+#ifdef USE_PREFETCH
+# define PREFETCH_HINT_LOAD		0
+# define PREFETCH_HINT_STORE		1
+# define PREFETCH_HINT_LOAD_STREAMED	4
+# define PREFETCH_HINT_STORE_STREAMED	5
+# define PREFETCH_HINT_LOAD_RETAINED	6
+# define PREFETCH_HINT_STORE_RETAINED	7
+# define PREFETCH_HINT_WRITEBACK_INVAL	25
+# define PREFETCH_HINT_PREPAREFORSTORE	30
+
+/*
+ * If we have not picked out what hints to use at this point use the
+ * standard load and store prefetch hints.
+ */
+#ifndef PREFETCH_STORE_HINT
+# define PREFETCH_STORE_HINT PREFETCH_HINT_STORE
+#endif
+#ifndef PREFETCH_LOAD_HINT
+# define PREFETCH_LOAD_HINT PREFETCH_HINT_LOAD
+#endif
 
+/*
+ * We double everything when USE_DOUBLE is true so we do 2 prefetches to
+ * get 64 bytes in that case.  The assumption is that each individual 
+ * prefetch brings in 32 bytes.
+ */
+#ifdef USE_DOUBLE
+# define PREFETCH_CHUNK 64
+# define PREFETCH_FOR_LOAD(chunk, reg) \
+ pref PREFETCH_LOAD_HINT, (chunk)*32(reg); \
+ pref PREFETCH_LOAD_HINT, ((chunk)+1)*32(reg)
+# define PREFETCH_FOR_STORE(chunk, reg) \
+ pref PREFETCH_STORE_HINT, (chunk)*32(reg); \
+ pref PREFETCH_STORE_HINT, ((chunk)+1)*32(reg)
+#else
+# define PREFETCH_CHUNK 32
+# define PREFETCH_FOR_LOAD(chunk, reg) \
+ pref PREFETCH_LOAD_HINT, (chunk)*32(reg)
+# define PREFETCH_FOR_STORE(chunk, reg) \
+ pref PREFETCH_STORE_HINT, (chunk)*32(reg)
+#endif
+# define PREFETCH_LIMIT (5 * PREFETCH_CHUNK)
+#else /* USE_PREFETCH not defined */
+# define PREFETCH_FOR_LOAD(offset, reg)
+# define PREFETCH_FOR_STORE(offset, reg)
+#endif
+
+/* Allow the routine to be named something else if desired.  */
+#ifndef MEMCPY_NAME
+#define MEMCPY_NAME memcpy
+#endif
+
+/* We use these 32/64 bit registers as temporaries to do the copying.  */
+#define REG0 t0
+#define REG1 t1
+#define REG2 t2
+#define REG3 t3
+#if _MIPS_SIM == _ABIO32
+#  define REG4 t4
+#  define REG5 t5
+#  define REG6 t6
+#  define REG7 t7
+#else
+#  define REG4 ta0
+#  define REG5 ta1
+#  define REG6 ta2
+#  define REG7 ta3
+#endif
+
+/* We load/store 64 bits at a time when USE_DOUBLE is true.
+ * The C_ prefix stands for CHUNK and is used to avoid macro name
+ * conflicts with system header files.  */
+
+#ifdef USE_DOUBLE
+#  define C_ST	sd
+#  define C_LD	ld
 #if __MIPSEB
-#  define LWHI	lwl		/* high part is left in big-endian	*/
-#  define SWHI	swl		/* high part is left in big-endian	*/
-#  define LWLO	lwr		/* low part is right in big-endian	*/
-#  define SWLO	swr		/* low part is right in big-endian	*/
+#  define C_LDHI	ldl	/* high part is left in big-endian	*/
+#  define C_STHI	sdl	/* high part is left in big-endian	*/
+#  define C_LDLO	ldr	/* low part is right in big-endian	*/
+#  define C_STLO	sdr	/* low part is right in big-endian	*/
+#else
+#  define C_LDHI	ldr	/* high part is right in little-endian	*/
+#  define C_STHI	sdr	/* high part is right in little-endian	*/
+#  define C_LDLO	ldl	/* low part is left in little-endian	*/
+#  define C_STLO	sdl	/* low part is left in little-endian	*/
+#endif
+#else
+#  define C_ST	sw
+#  define C_LD	lw
+#if __MIPSEB
+#  define C_LDHI	lwl	/* high part is left in big-endian	*/
+#  define C_STHI	swl	/* high part is left in big-endian	*/
+#  define C_LDLO	lwr	/* low part is right in big-endian	*/
+#  define C_STLO	swr	/* low part is right in big-endian	*/
+#else
+#  define C_LDHI	lwr	/* high part is right in little-endian	*/
+#  define C_STHI	swr	/* high part is right in little-endian	*/
+#  define C_LDLO	lwl	/* low part is left in little-endian	*/
+#  define C_STLO	swl	/* low part is left in little-endian	*/
+#endif
+#endif
+
+/* Bookkeeping values for 32 vs. 64 bit mode.  */
+#ifdef USE_DOUBLE
+#  define NSIZE 8
+#  define NSIZEMASK 0x3f
+#  define NSIZEDMASK 0x7f
 #else
-#  define LWHI	lwr		/* high part is right in little-endian	*/
-#  define SWHI	swr		/* high part is right in little-endian	*/
-#  define LWLO	lwl		/* low part is left in little-endian	*/
-#  define SWLO	swl		/* low part is left in little-endian	*/
+#  define NSIZE 4
+#  define NSIZEMASK 0x1f
+#  define NSIZEDMASK 0x3f
 #endif
+#define UNIT(unit) ((unit)*NSIZE)
+#define UNITM1(unit) (((unit)*NSIZE)-1)
 
-ENTRY (memcpy)
+#ifdef ANDROID_CHANGES
+LEAF(MEMCPY_NAME, 0)
+#else
+LEAF(MEMCPY_NAME)
+#endif
+	.set	nomips16
 	.set	noreorder
+/*
+ * Below we handle the case where memcpy is called with overlapping src and dst.
+ * Although memcpy is not required to handle this case, some parts of Android
+ * like Skia rely on such usage. We call memmove to handle such cases.
+ */
+#ifdef USE_MEMMOVE_FOR_OVERLAP
+	PTR_SUBU t0,a0,a1
+	PTR_SRA	t2,t0,31
+	xor	t1,t0,t2
+	PTR_SUBU t0,t1,t2
+	sltu	t2,t0,a2
+	beq	t2,zero,L(memcpy)
+	la	t9,memmove
+	jr	t9
+	 nop
+L(memcpy):
+#endif
+/*
+ * If the size is less than 2*NSIZE (8 or 16), go to L(lastb).  Regardless of
+ * size, copy dst pointer to v0 for the return value.
+ */
+	slti	t2,a2,(2 * NSIZE)
+	bne	t2,zero,L(lastb)
+	move	v0,a0
+/*
+ * If src and dst have different alignments, go to L(unaligned), if they
+ * have the same alignment (but are not actually aligned) do a partial
+ * load/store to make them aligned.  If they are both already aligned
+ * we can start copying at L(aligned).
+ */
+	xor	t8,a1,a0
+	andi	t8,t8,(NSIZE-1)		/* t8 is a0/a1 word-displacement */
+	bne	t8,zero,L(unaligned)
+	PTR_SUBU a3, zero, a0
+
+	andi	a3,a3,(NSIZE-1)		/* copy a3 bytes to align a0/a1	  */
+	beq	a3,zero,L(aligned)	/* if a3=0, it is already aligned */
+	PTR_SUBU a2,a2,a3		/* a2 is the remaining bytes count */
+
+	C_LDHI	t8,0(a1)
+	PTR_ADDU a1,a1,a3
+	C_STHI	t8,0(a0)
+	PTR_ADDU a0,a0,a3
+
+/*
+ * Now dst/src are both aligned to word (or double word) addresses.
+ * Set a2 to count how many bytes we have to copy after all the 64/128 byte
+ * chunks are copied and a3 to the dst pointer after all the 64/128 byte 
+ * chunks have been copied.  We will loop, incrementing a0 and a1 until a0
+ * equals a3.
+ */
+
+L(aligned):
+	andi	t8,a2,NSIZEDMASK /* any whole 64-byte/128-byte chunks? */
+	beq	a2,t8,L(chkw)	 /* if a2==t8, no 64-byte/128-byte chunks */
+	PTR_SUBU a3,a2,t8	 /* subtract from a2 the remainder */
+	PTR_ADDU a3,a0,a3	 /* Now a3 is the final dst after loop */
+
+/* When in the loop we may prefetch with the 'prepare to store' hint,
+ * in this case the a0+x should not be past the "t0-32" address.  This
+ * means: for x=128 the last "safe" a0 address is "t0-160".  Alternatively,
+ * for x=64 the last "safe" a0 address is "t0-96".  In the current version we
+ * will use "prefetch hint,128(a0)", so "t0-160" is the limit.
+ */
+#if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
+	PTR_ADDU t0,a0,a2		/* t0 is the "past the end" address */
+	PTR_SUBU t9,t0,PREFETCH_LIMIT	/* t9 is the "last safe pref" address */
+#endif
+	PREFETCH_FOR_LOAD  (0, a1)
+	PREFETCH_FOR_LOAD  (1, a1)
+	PREFETCH_FOR_LOAD  (2, a1)
+	PREFETCH_FOR_STORE (1, a0)
+#if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
+	sltu	v1,t9,a0		/* If a0 > t9 don't use next prefetch */
+	bgtz	v1,L(loop16w)
+	nop
+#endif
+	PREFETCH_FOR_STORE (2, a0)
+L(loop16w):
+	PREFETCH_FOR_LOAD  (3, a1)
+	C_LD	t0,UNIT(0)(a1)
+#if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
+	bgtz	v1,L(skip_pref30_96)
+#endif
+	C_LD	t1,UNIT(1)(a1)
+	PREFETCH_FOR_STORE (3, a0)
+L(skip_pref30_96):
+	C_LD	REG2,UNIT(2)(a1)
+	C_LD	REG3,UNIT(3)(a1)
+	C_LD	REG4,UNIT(4)(a1)
+	C_LD	REG5,UNIT(5)(a1)
+	C_LD	REG6,UNIT(6)(a1)
+	C_LD	REG7,UNIT(7)(a1)
+        PREFETCH_FOR_LOAD (4, a1)
+
+	C_ST	t0,UNIT(0)(a0)
+	C_ST	t1,UNIT(1)(a0)
+	C_ST	REG2,UNIT(2)(a0)
+	C_ST	REG3,UNIT(3)(a0)
+	C_ST	REG4,UNIT(4)(a0)
+	C_ST	REG5,UNIT(5)(a0)
+	C_ST	REG6,UNIT(6)(a0)
+	C_ST	REG7,UNIT(7)(a0)
+
+	C_LD	t0,UNIT(8)(a1)
+#if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
+	bgtz	v1,L(skip_pref30_128)
+#endif
+	C_LD	t1,UNIT(9)(a1)
+	PREFETCH_FOR_STORE (4, a0)
+L(skip_pref30_128):
+	C_LD	REG2,UNIT(10)(a1)
+	C_LD	REG3,UNIT(11)(a1)
+	C_LD	REG4,UNIT(12)(a1)
+	C_LD	REG5,UNIT(13)(a1)
+	C_LD	REG6,UNIT(14)(a1)
+	C_LD	REG7,UNIT(15)(a1)
+        PREFETCH_FOR_LOAD (5, a1)
+	C_ST	t0,UNIT(8)(a0)
+	C_ST	t1,UNIT(9)(a0)
+	C_ST	REG2,UNIT(10)(a0)
+	C_ST	REG3,UNIT(11)(a0)
+	C_ST	REG4,UNIT(12)(a0)
+	C_ST	REG5,UNIT(13)(a0)
+	C_ST	REG6,UNIT(14)(a0)
+	C_ST	REG7,UNIT(15)(a0)
+	PTR_ADDIU a0,a0,UNIT(16)	/* adding 64/128 to dest */
+#if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
+	sltu	v1,t9,a0
+#endif
+	bne	a0,a3,L(loop16w)
+	PTR_ADDIU a1,a1,UNIT(16)	/* adding 64/128 to src */
+	move	a2,t8
+
+/* Here we have src and dest word-aligned but less than 64-bytes or
+ * 128 bytes to go.  Check for a 32(64) byte chunk and copy if there
+ * is one.  Otherwise jump down to L(chk1w) to handle the tail end of
+ * the copy.
+ */
+
+L(chkw):
+	PREFETCH_FOR_LOAD (0, a1)
+	andi	t8,a2,NSIZEMASK	/* Is there a 32-byte/64-byte chunk.  */
+				/* t8 is the remainder count past 32 bytes */
+	beq	a2,t8,L(chk1w)	/* When a2=t8, no 32-byte chunk  */
+	nop
+	C_LD	t0,UNIT(0)(a1)
+	C_LD	t1,UNIT(1)(a1)
+	C_LD	REG2,UNIT(2)(a1)
+	C_LD	REG3,UNIT(3)(a1)
+	C_LD	REG4,UNIT(4)(a1)
+	C_LD	REG5,UNIT(5)(a1)
+	C_LD	REG6,UNIT(6)(a1)
+	C_LD	REG7,UNIT(7)(a1)
+	PTR_ADDIU a1,a1,UNIT(8)
+	C_ST	t0,UNIT(0)(a0)
+	C_ST	t1,UNIT(1)(a0)
+	C_ST	REG2,UNIT(2)(a0)
+	C_ST	REG3,UNIT(3)(a0)
+	C_ST	REG4,UNIT(4)(a0)
+	C_ST	REG5,UNIT(5)(a0)
+	C_ST	REG6,UNIT(6)(a0)
+	C_ST	REG7,UNIT(7)(a0)
+	PTR_ADDIU a0,a0,UNIT(8)
+
+/*
+ * Here we have less than 32(64) bytes to copy.  Set up for a loop to
+ * copy one word (or double word) at a time.  Set a2 to count how many
+ * bytes we have to copy after all the word (or double word) chunks are
+ * copied and a3 to the dst pointer after all the (d)word chunks have
+ * been copied.  We will loop, incrementing a0 and a1 until a0 equals a3.
+ */
+L(chk1w):
+	andi	a2,t8,(NSIZE-1)	/* a2 is the remainder past one (d)word chunks */
+	beq	a2,t8,L(lastb)
+	PTR_SUBU a3,t8,a2	/* a3 is count of bytes in one (d)word chunks */
+	PTR_ADDU a3,a0,a3	/* a3 is the dst address after loop */
+
+/* copying in words (4-byte or 8-byte chunks) */
+L(wordCopy_loop):
+	C_LD	REG3,UNIT(0)(a1)
+	PTR_ADDIU a1,a1,UNIT(1)
+	PTR_ADDIU a0,a0,UNIT(1)
+	bne	a0,a3,L(wordCopy_loop)
+	C_ST	REG3,UNIT(-1)(a0)
 
-	slti	t0, a2, 8		# Less than 8?
-	bne	t0, zero, L(last8)
-	move	v0, a0			# Setup exit value before too late
-
-	xor	t0, a1, a0		# Find a0/a1 displacement
-	andi	t0, 0x3
-	bne	t0, zero, L(shift)	# Go handle the unaligned case
-	subu	t1, zero, a1
-	andi	t1, 0x3			# a0/a1 are aligned, but are we
-	beq	t1, zero, L(chk8w)	#  starting in the middle of a word?
-	subu	a2, t1
-	LWHI	t0, 0(a1)		# Yes we are... take care of that
-	addu	a1, t1
-	SWHI	t0, 0(a0)
-	addu	a0, t1
-
-L(chk8w):	
-	andi	t0, a2, 0x1f		# 32 or more bytes left?
-	beq	t0, a2, L(chk1w)
-	subu	a3, a2, t0		# Yes
-	addu	a3, a1			# a3 = end address of loop
-	move	a2, t0			# a2 = what will be left after loop
-L(lop8w):	
-	lw	t0,  0(a1)		# Loop taking 8 words at a time
-	lw	t1,  4(a1)
-	lw	t2,  8(a1)
-	lw	t3, 12(a1)
-	lw	t4, 16(a1)
-	lw	t5, 20(a1)
-	lw	t6, 24(a1)
-	lw	t7, 28(a1)
-	addiu	a0, 32
-	addiu	a1, 32
-	sw	t0, -32(a0)
-	sw	t1, -28(a0)
-	sw	t2, -24(a0)
-	sw	t3, -20(a0)
-	sw	t4, -16(a0)
-	sw	t5, -12(a0)
-	sw	t6,  -8(a0)
-	bne	a1, a3, L(lop8w)
-	sw	t7,  -4(a0)
-
-L(chk1w):	
-	andi	t0, a2, 0x3		# 4 or more bytes left?
-	beq	t0, a2, L(last8)
-	subu	a3, a2, t0		# Yes, handle them one word at a time
-	addu	a3, a1			# a3 again end address
-	move	a2, t0
-L(lop1w):	
-	lw	t0, 0(a1)
-	addiu	a0, 4
-	addiu	a1, 4
-	bne	a1, a3, L(lop1w)
-	sw	t0, -4(a0)
-
-L(last8):	
-	blez	a2, L(lst8e)		# Handle last 8 bytes, one at a time
-	addu	a3, a2, a1
-L(lst8l):	
-	lb	t0, 0(a1)
-	addiu	a0, 1
-	addiu	a1, 1
-	bne	a1, a3, L(lst8l)
-	sb	t0, -1(a0)
-L(lst8e):	
-	jr	ra			# Bye, bye
+/* Copy the last 8 (or 16) bytes */
+L(lastb):
+	blez	a2,L(leave)
+	PTR_ADDU a3,a0,a2	/* a3 is the last dst address */
+L(lastbloop):
+	lb	v1,0(a1)
+	PTR_ADDIU a1,a1,1
+	PTR_ADDIU a0,a0,1
+	bne	a0,a3,L(lastbloop)
+	sb	v1,-1(a0)
+L(leave):
+	j	ra
 	nop
+/*
+ * UNALIGNED case, got here with a3 = "negu a0"
+ * This code is nearly identical to the aligned code above
+ * but only the destination (not the source) gets aligned
+ * so we need to do partial loads of the source followed
+ * by normal stores to the destination (once we have aligned
+ * the destination).
+ */
+
+L(unaligned):
+	andi	a3,a3,(NSIZE-1)	/* copy a3 bytes to align a0/a1 */
+	beqz	a3,L(ua_chk16w) /* if a3=0, it is already aligned */
+	PTR_SUBU a2,a2,a3	/* a2 is the remaining bytes count */
+
+	C_LDHI	v1,UNIT(0)(a1)
+	C_LDLO	v1,UNITM1(1)(a1)
+	PTR_ADDU a1,a1,a3
+	C_STHI	v1,UNIT(0)(a0)
+	PTR_ADDU a0,a0,a3
+
+/*
+ *  Now the destination (but not the source) is aligned
+ * Set a2 to count how many bytes we have to copy after all the 64/128 byte
+ * chunks are copied and a3 to the dst pointer after all the 64/128 byte
+ * chunks have been copied.  We will loop, incrementing a0 and a1 until a0
+ * equals a3.
+ */
 
-L(shift):	
-	subu	a3, zero, a0		# Src and Dest unaligned 
-	andi	a3, 0x3			#  (unoptimized case...)
-	beq	a3, zero, L(shft1)
-	subu	a2, a3			# a2 = bytes left
-	LWHI	t0, 0(a1)		# Take care of first odd part
-	LWLO	t0, 3(a1)
-	addu	a1, a3
-	SWHI	t0, 0(a0)
-	addu	a0, a3
-L(shft1):	
-	andi	t0, a2, 0x3
-	subu	a3, a2, t0
-	addu	a3, a1
-L(shfth):	
-	LWHI	t1, 0(a1)		# Limp through, word by word
-	LWLO	t1, 3(a1)
-	addiu	a0, 4
-	addiu	a1, 4
-	bne	a1, a3, L(shfth)
-	sw	t1, -4(a0)
-	b	L(last8)		# Handle anything which may be left
-	move	a2, t0
+L(ua_chk16w):
+	andi	t8,a2,NSIZEDMASK /* any whole 64-byte/128-byte chunks? */
+	beq	a2,t8,L(ua_chkw) /* if a2==t8, no 64-byte/128-byte chunks */
+	PTR_SUBU a3,a2,t8	 /* subtract from a2 the remainder */
+	PTR_ADDU a3,a0,a3	 /* Now a3 is the final dst after loop */
 
+#if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
+	PTR_ADDU t0,a0,a2	  /* t0 is the "past the end" address */
+	PTR_SUBU t9,t0,PREFETCH_LIMIT /* t9 is the "last safe pref" address */
+#endif
+	PREFETCH_FOR_LOAD  (0, a1)
+	PREFETCH_FOR_LOAD  (1, a1)
+	PREFETCH_FOR_LOAD  (2, a1)
+	PREFETCH_FOR_STORE (1, a0)
+#if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
+	sltu	v1,t9,a0
+	bgtz	v1,L(ua_loop16w)  /* skip prefetch for too short arrays */
+	nop
+#endif
+	PREFETCH_FOR_STORE (2, a0)
+L(ua_loop16w):
+	PREFETCH_FOR_LOAD  (3, a1)
+	C_LDHI	t0,UNIT(0)(a1)
+	C_LDLO	t0,UNITM1(1)(a1)
+	C_LDHI	t1,UNIT(1)(a1)
+#if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
+	bgtz	v1,L(ua_skip_pref30_96)
+#endif
+	C_LDLO	t1,UNITM1(2)(a1)
+	PREFETCH_FOR_STORE (3, a0)
+L(ua_skip_pref30_96):
+	C_LDHI	REG2,UNIT(2)(a1)
+	C_LDLO	REG2,UNITM1(3)(a1)
+	C_LDHI	REG3,UNIT(3)(a1)
+	C_LDLO	REG3,UNITM1(4)(a1)
+	C_LDHI	REG4,UNIT(4)(a1)
+	C_LDLO	REG4,UNITM1(5)(a1)
+	C_LDHI	REG5,UNIT(5)(a1)
+	C_LDLO	REG5,UNITM1(6)(a1)
+	C_LDHI	REG6,UNIT(6)(a1)
+	C_LDLO	REG6,UNITM1(7)(a1)
+	C_LDHI	REG7,UNIT(7)(a1)
+	C_LDLO	REG7,UNITM1(8)(a1)
+        PREFETCH_FOR_LOAD (4, a1)
+	C_ST	t0,UNIT(0)(a0)
+	C_ST	t1,UNIT(1)(a0)
+	C_ST	REG2,UNIT(2)(a0)
+	C_ST	REG3,UNIT(3)(a0)
+	C_ST	REG4,UNIT(4)(a0)
+	C_ST	REG5,UNIT(5)(a0)
+	C_ST	REG6,UNIT(6)(a0)
+	C_ST	REG7,UNIT(7)(a0)
+	C_LDHI	t0,UNIT(8)(a1)
+	C_LDLO	t0,UNITM1(9)(a1)
+	C_LDHI	t1,UNIT(9)(a1)
+#if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
+	bgtz	v1,L(ua_skip_pref30_128)
+#endif
+	C_LDLO	t1,UNITM1(10)(a1)
+	PREFETCH_FOR_STORE (4, a0)
+L(ua_skip_pref30_128):
+	C_LDHI	REG2,UNIT(10)(a1)
+	C_LDLO	REG2,UNITM1(11)(a1)
+	C_LDHI	REG3,UNIT(11)(a1)
+	C_LDLO	REG3,UNITM1(12)(a1)
+	C_LDHI	REG4,UNIT(12)(a1)
+	C_LDLO	REG4,UNITM1(13)(a1)
+	C_LDHI	REG5,UNIT(13)(a1)
+	C_LDLO	REG5,UNITM1(14)(a1)
+	C_LDHI	REG6,UNIT(14)(a1)
+	C_LDLO	REG6,UNITM1(15)(a1)
+	C_LDHI	REG7,UNIT(15)(a1)
+	C_LDLO	REG7,UNITM1(16)(a1)
+        PREFETCH_FOR_LOAD (5, a1)
+	C_ST	t0,UNIT(8)(a0)
+	C_ST	t1,UNIT(9)(a0)
+	C_ST	REG2,UNIT(10)(a0)
+	C_ST	REG3,UNIT(11)(a0)
+	C_ST	REG4,UNIT(12)(a0)
+	C_ST	REG5,UNIT(13)(a0)
+	C_ST	REG6,UNIT(14)(a0)
+	C_ST	REG7,UNIT(15)(a0)
+	PTR_ADDIU a0,a0,UNIT(16)	/* adding 64/128 to dest */
+#if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
+	sltu	v1,t9,a0
+#endif
+	bne	a0,a3,L(ua_loop16w)
+	PTR_ADDIU a1,a1,UNIT(16)	/* adding 64/128 to src */
+	move	a2,t8
+
+/* Here we have the destination (but not the source) word-aligned and less
+ * than 64 or 128 bytes to go.  Check for a 32(64) byte chunk and copy if there
+ * is one.  Otherwise jump down to L(ua_chk1w) to handle the tail end of
+ * the copy.  */
+
+L(ua_chkw):
+	PREFETCH_FOR_LOAD (0, a1)
+	andi	t8,a2,NSIZEMASK	  /* Is there a 32-byte/64-byte chunk.  */
+				  /* t8 is the remainder count past 32 bytes */
+	beq	a2,t8,L(ua_chk1w) /* When a2=t8, no 32-byte chunk */
+	nop
+	C_LDHI	t0,UNIT(0)(a1)
+	C_LDLO	t0,UNITM1(1)(a1)
+	C_LDHI	t1,UNIT(1)(a1)
+	C_LDLO	t1,UNITM1(2)(a1)
+	C_LDHI	REG2,UNIT(2)(a1)
+	C_LDLO	REG2,UNITM1(3)(a1)
+	C_LDHI	REG3,UNIT(3)(a1)
+	C_LDLO	REG3,UNITM1(4)(a1)
+	C_LDHI	REG4,UNIT(4)(a1)
+	C_LDLO	REG4,UNITM1(5)(a1)
+	C_LDHI	REG5,UNIT(5)(a1)
+	C_LDLO	REG5,UNITM1(6)(a1)
+	C_LDHI	REG6,UNIT(6)(a1)
+	C_LDLO	REG6,UNITM1(7)(a1)
+	C_LDHI	REG7,UNIT(7)(a1)
+	C_LDLO	REG7,UNITM1(8)(a1)
+	PTR_ADDIU a1,a1,UNIT(8)
+	C_ST	t0,UNIT(0)(a0)
+	C_ST	t1,UNIT(1)(a0)
+	C_ST	REG2,UNIT(2)(a0)
+	C_ST	REG3,UNIT(3)(a0)
+	C_ST	REG4,UNIT(4)(a0)
+	C_ST	REG5,UNIT(5)(a0)
+	C_ST	REG6,UNIT(6)(a0)
+	C_ST	REG7,UNIT(7)(a0)
+	PTR_ADDIU a0,a0,UNIT(8)
+/*
+ * Here we have less than 32(64) bytes to copy.  Set up for a loop to
+ * copy one word (or double word) at a time.
+ */
+L(ua_chk1w):
+	andi	a2,t8,(NSIZE-1)	/* a2 is the remainder past one (d)word chunks */
+	beq	a2,t8,L(ua_smallCopy)
+	PTR_SUBU a3,t8,a2	/* a3 is count of bytes in one (d)word chunks */
+	PTR_ADDU a3,a0,a3	/* a3 is the dst address after loop */
+
+/* copying in words (4-byte or 8-byte chunks) */
+L(ua_wordCopy_loop):
+	C_LDHI	v1,UNIT(0)(a1)
+	C_LDLO	v1,UNITM1(1)(a1)
+	PTR_ADDIU a1,a1,UNIT(1)
+	PTR_ADDIU a0,a0,UNIT(1)
+	bne	a0,a3,L(ua_wordCopy_loop)
+	C_ST	v1,UNIT(-1)(a0)
+
+/* Copy the last 8 (or 16) bytes */
+L(ua_smallCopy):
+	beqz	a2,L(leave)
+	PTR_ADDU a3,a0,a2	/* a3 is the last dst address */
+L(ua_smallCopy_loop):
+	lb	v1,0(a1)
+	PTR_ADDIU a1,a1,1
+	PTR_ADDIU a0,a0,1
+	bne	a0,a3,L(ua_smallCopy_loop)
+	sb	v1,-1(a0)
+
+	j	ra
+	nop
+
+	.set	at
 	.set	reorder
-END (memcpy)
-libc_hidden_builtin_def (memcpy)
+END(MEMCPY_NAME)
+#ifdef _LIBC
+libc_hidden_builtin_def (MEMCPY_NAME)
+#endif

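(For illustration only, not part of the patch: a rough C rendering of two of
the guard computations done by the assembly above.  The function and macro
names here are invented, and the constants mirror the 32-bit, non-USE_DOUBLE
configuration.)

/* Illustrative sketch only; see the assembly above for the real code.  */
#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define SKETCH_PREFETCH_LIMIT (5 * 32)	/* 5 * PREFETCH_CHUNK for 32-bit mode */

void *memcpy_sketch (void *dst, const void *src, size_t n)
{
  uintptr_t d = (uintptr_t) dst, s = (uintptr_t) src;

  /* USE_MEMMOVE_FOR_OVERLAP: the PTR_SUBU/PTR_SRA/xor sequence computes
     |dst - src|; if that distance is smaller than n the buffers overlap
     and the code tail-calls memmove instead.  */
  uintptr_t dist = d > s ? d - s : s - d;
  if (dist < n)
    return memmove (dst, src, n);

  /* Prepare-for-store gating: t9 is the last destination address at which
     a 'prepare for store' prefetch is still safe, i.e. the whole prefetched
     line is guaranteed to be overwritten.  sltu v1,t9,a0 sets v1 once a0 is
     past that point, and the bgtz branches then skip the store prefetches.  */
  uintptr_t past_end = d + n;					/* t0 */
  uintptr_t last_safe = past_end - SKETCH_PREFETCH_LIMIT;	/* t9 */
  int skip_store_prefetch = d > last_safe;			/* v1 */
  (void) skip_store_prefetch;

  /* The unrolled 64/128-byte copy loops themselves live in the assembly.  */
  return memcpy (dst, src, n);
}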

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Optimize MIPS memcpy
  2012-10-17 17:30                                       ` Steve Ellcey
@ 2012-10-29 18:00                                         ` Steve Ellcey
  2012-10-29 18:03                                           ` Maxim Kuvyrkov
  2012-10-30  7:16                                           ` Maxim Kuvyrkov
  2012-10-31 19:27                                         ` Andreas Jaeger
  1 sibling, 2 replies; 50+ messages in thread
From: Steve Ellcey @ 2012-10-29 18:00 UTC (permalink / raw)
  To: Maxim Kuvyrkov; +Cc: Andrew Pinski, Joseph S. Myers, libc-ports

On Wed, 2012-10-17 at 10:29 -0700, Steve Ellcey wrote:

> OK, Here is a version of memcpy that uses the STORE_STREAMING prefetch.
> While it is optimized for a 32 byte prefetch, it will work correctly
> regardless of the size of the prefetch.
> 
> Is this version OK to checkin?
> 
> Steve Ellcey
> sellcey@mips.com

Maxim,  have you had a chance to test this version of memcpy for MIPS?

Steve Ellcey
sellcey@mips.com


^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Optimize MIPS memcpy
  2012-10-29 18:00                                         ` Steve Ellcey
@ 2012-10-29 18:03                                           ` Maxim Kuvyrkov
  2012-10-30  7:16                                           ` Maxim Kuvyrkov
  1 sibling, 0 replies; 50+ messages in thread
From: Maxim Kuvyrkov @ 2012-10-29 18:03 UTC (permalink / raw)
  To: Steve Ellcey; +Cc: Andrew Pinski, Joseph S. Myers, libc-ports

On 30/10/2012, at 7:00 AM, Steve Ellcey wrote:

> On Wed, 2012-10-17 at 10:29 -0700, Steve Ellcey wrote:
> 
>> OK, Here is a version of memcpy that uses the STORE_STREAMING prefetch.
>> While it is optimized for a 32 byte prefetch, it will work correctly
>> regardless of the size of the prefetch.
>> 
>> Is this version OK to checkin?
>> 
>> Steve Ellcey
>> sellcey@mips.com
> 
> Maxim,  have you had a chance to test this version of memcpy for MIPS?

On my list for today/tomorrow.  It's been a hectic couple of weeks.

--
Maxim Kuvyrkov
CodeSourcery / Mentor Graphics

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Optimize MIPS memcpy
  2012-10-29 18:00                                         ` Steve Ellcey
  2012-10-29 18:03                                           ` Maxim Kuvyrkov
@ 2012-10-30  7:16                                           ` Maxim Kuvyrkov
  2012-10-30  7:19                                             ` Maxim Kuvyrkov
  2012-10-30 17:46                                             ` Steve Ellcey
  1 sibling, 2 replies; 50+ messages in thread
From: Maxim Kuvyrkov @ 2012-10-30  7:16 UTC (permalink / raw)
  To: Steve Ellcey; +Cc: Andrew Pinski, Joseph S. Myers, libc-ports

On 30/10/2012, at 7:00 AM, Steve Ellcey wrote:

> On Wed, 2012-10-17 at 10:29 -0700, Steve Ellcey wrote:
> 
>> OK, Here is a version of memcpy that uses the STORE_STREAMING prefetch.
>> While it is optimized for a 32 byte prefetch, it will work correctly
>> regardless of the size of the prefetch.
>> 
>> Is this version OK to checkin?
>> 
>> Steve Ellcey
>> sellcey@mips.com
> 
> Maxim,  have you had a chance to test this version of memcpy for MIPS?

I have tested your latest version.  Good news: there are no correctness issues.  Bad news: it underperforms compared to my patch by 2-3 times on both N32 and N64 (didn't test O32) on the benchmark that I used.  I've run the benchmark several times and results are consistent.  I use oprofile on libc.so to determine how much time is spent in memcpy.
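
(The benchmark itself is not posted in this thread; for illustration, a
minimal throughput loop of the kind one might profile this way is sketched
below.  The buffer size, iteration count and alignment offsets are arbitrary
knobs to sweep, not the values actually used.)

/* Minimal memcpy throughput loop for profiling; an illustration only.  */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

int main (void)
{
  size_t size = 64 * 1024;		/* one buffer size; sweep in practice */
  size_t iters = 100000;
  char *src = malloc (size + 16);
  char *dst = malloc (size + 16);
  if (src == NULL || dst == NULL)
    return 1;
  memset (src, 0xa5, size + 16);

  struct timespec t0, t1;
  clock_gettime (CLOCK_MONOTONIC, &t0);
  for (size_t i = 0; i < iters; i++)
    memcpy (dst + 1, src + 3, size);	/* misaligned case; use 0/0 for aligned */
  clock_gettime (CLOCK_MONOTONIC, &t1);

  double sec = (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1e9;
  printf ("%zu bytes x %zu iterations: %.1f MB/s\n",
	  size, iters, (double) size * iters / sec / 1e6);
  free (src);
  free (dst);
  return 0;
}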

Would you please confirm that your current implementation is faster on YOUR benchmark than my patch in http://sourceware.org/ml/libc-ports/2012-09/msg00000.html ?  Please make sure that PREFETCH macro in ports/sysdeps/mips/sys/asm.h gets defined to "pref", not "nop", in your build.

Thanks,

--
Maxim Kuvyrkov
CodeSourcery / Mentor Graphics

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Optimize MIPS memcpy
  2012-10-30  7:16                                           ` Maxim Kuvyrkov
@ 2012-10-30  7:19                                             ` Maxim Kuvyrkov
  2012-10-30 17:46                                             ` Steve Ellcey
  1 sibling, 0 replies; 50+ messages in thread
From: Maxim Kuvyrkov @ 2012-10-30  7:19 UTC (permalink / raw)
  To: Andrew Pinski; +Cc: Steve Ellcey, Joseph S. Myers, libc-ports

On 30/10/2012, at 8:16 PM, Maxim Kuvyrkov wrote:

> On 30/10/2012, at 7:00 AM, Steve Ellcey wrote:
> 
>> On Wed, 2012-10-17 at 10:29 -0700, Steve Ellcey wrote:
>> 
>>> OK, Here is a version of memcpy that uses the STORE_STREAMING prefetch.
>>> While it is optimized for a 32 byte prefetch, it will work correctly
>>> regardless of the size of the prefetch.
>>> 
>>> Is this version OK to checkin?
>>> 
>>> Steve Ellcey
>>> sellcey@mips.com
>> 
>> Maxim,  have you had a chance to test this version of memcpy for MIPS?
> 
> I have tested your latest version.  Good news: there are no correctness issues.  Bad news: it underperforms compared to my patch by 2-3 times on both N32 and N64 (didn't test O32) on the benchmark that I used.  I've run the benchmark several times and results are consistent.  I use oprofile on libc.so to determine how much time is spent in memcpy.
> 
> Would you please confirm that your current implementation is faster on YOUR benchmark than my patch in http://sourceware.org/ml/libc-ports/2012-09/msg00000.html ?  Please make sure that PREFETCH macro in ports/sysdeps/mips/sys/asm.h gets defined to "pref", not "nop", in your build.

Andrew,

You should also have a stake in this.  Would you please benchmark Steve's patch in http://sourceware.org/ml/libc-ports/2012-10/msg00037.html vs my patch in http://sourceware.org/ml/libc-ports/2012-09/msg00000.html on your favorite benchmark and report the results?

Thank you,

--
Maxim Kuvyrkov
CodeSourcery / Mentor Graphics


^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Optimize MIPS memcpy
  2012-10-30  7:16                                           ` Maxim Kuvyrkov
  2012-10-30  7:19                                             ` Maxim Kuvyrkov
@ 2012-10-30 17:46                                             ` Steve Ellcey
  2012-10-30 21:56                                               ` Maxim Kuvyrkov
  1 sibling, 1 reply; 50+ messages in thread
From: Steve Ellcey @ 2012-10-30 17:46 UTC (permalink / raw)
  To: Maxim Kuvyrkov; +Cc: Andrew Pinski, Joseph S. Myers, libc-ports

On Tue, 2012-10-30 at 20:16 +1300, Maxim Kuvyrkov wrote:
> On 30/10/2012, at 7:00 AM, Steve Ellcey wrote:
> 
> > On Wed, 2012-10-17 at 10:29 -0700, Steve Ellcey wrote:
> > 
> >> OK, Here is a version of memcpy that uses the STORE_STREAMING prefetch.
> >> While it is optimized for a 32 byte prefetch, it will work correctly
> >> regardless of the size of the prefetch.
> >> 
> >> Is this version OK to checkin?
> >> 
> >> Steve Ellcey
> >> sellcey@mips.com
> > 
> > Maxim,  have you had a chance to test this version of memcpy for MIPS?
> 
> I have tested your latest version.  Good news: there are no correctness issues.  Bad news: it underperforms compared to my patch by 2-3 times on both N32 and N64 (didn't test O32) on the benchmark that I used.  I've run the benchmark several times and results are consistent.  I use oprofile on libc.so to determine how much time is spent in memcpy.
> 
> Would you please confirm that your current implementation is faster on YOUR benchmark than my patch in http://sourceware.org/ml/libc-ports/2012-09/msg00000.html ?  Please make sure that PREFETCH macro in ports/sysdeps/mips/sys/asm.h gets defined to "pref", not "nop", in your build.
> 
> Thanks,
> 
> --
> Maxim Kuvyrkov
> CodeSourcery / Mentor Graphics

Maxim, with the O32 ABI I am seeing my version as slightly faster for large
copies and slightly slower for small copies compared to yours.

With the N32 and N64 ABIs I see my version as slightly faster across the
board (a couple of percentage points).  I am definitely not seeing
anything like a 2X difference.  Are you sure prefetch is defined when
you tested my version?  How about using double loads and stores?  They
should both get set by default.

Steve Ellcey
sellcey@mips.com

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Optimize MIPS memcpy
  2012-10-30 17:46                                             ` Steve Ellcey
@ 2012-10-30 21:56                                               ` Maxim Kuvyrkov
  2012-10-30 22:19                                                 ` Steve Ellcey
  0 siblings, 1 reply; 50+ messages in thread
From: Maxim Kuvyrkov @ 2012-10-30 21:56 UTC (permalink / raw)
  To: Steve Ellcey; +Cc: Andrew Pinski, Joseph S. Myers, libc-ports

On 31/10/2012, at 6:45 AM, Steve Ellcey wrote:

> On Tue, 2012-10-30 at 20:16 +1300, Maxim Kuvyrkov wrote:
>> 
...
>> I have tested your latest version.  Good news: there are no correctness issues.  Bad news: it underperforms compared to my patch by 2-3 times on both N32 and N64 (didn't test O32) on the benchmark that I used.  I've run the benchmark several times and results are consistent.  I use oprofile on libc.so to determine how much time is spent in memcpy.
>> 
>> Would you please confirm that your current implementation is faster on YOUR benchmark than my patch in http://sourceware.org/ml/libc-ports/2012-09/msg00000.html ?  Please make sure that PREFETCH macro in ports/sysdeps/mips/sys/asm.h gets defined to "pref", not "nop", in your build.
>> 
>> Thanks,
>> 
>> --
>> Maxim Kuvyrkov
>> CodeSourcery / Mentor Graphics
> 
> Maxim, with the O32 ABI I am seeing my version as slightly faster for large
> copies and slightly slower for small copies compared to yours.
> 
> With the N32 and N64 ABIs I see my version as slightly faster across the
> board (a couple of percentage points).  I am definitely not seeing
> anything like a 2X difference.  Are you sure prefetch is defined when
> you tested my version?  How about using double loads and stores?  They
> should both get set by default.

It turns out I was benchmarking my patch against the original glibc implementation, not yours (patched files in ports/ instead of libc/ports).  With the patch applied correctly, the performance is virtually the same on my benchmark.  I've also checked the assembly dump of libc.so and confirmed that prefetch instructions and 8-byte loads/stores are used where appropriate.

Given that your patch provides on par or better performance than mine, and it also unifies MIPS memcpy for all ABIs (as well as between glibc and Bionic!) -- I am all for your patch.

I've reviewed your patch -- the code is clean and well-documented.  Please apply the patch if sufficient testing has been done: big- and little-endian for o32/n32/n64 ABIs.  I've tested your patch for all big-endian ABIs, so you just need to cover little-endian (which, I think, you may have done already).

Thanks for bearing with me through all the debugging process!

--
Maxim Kuvyrkov
CodeSourcery / Mentor Graphics

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Optimize MIPS memcpy
  2012-10-30 21:56                                               ` Maxim Kuvyrkov
@ 2012-10-30 22:19                                                 ` Steve Ellcey
  2012-12-19  1:51                                                   ` Maxim Kuvyrkov
  0 siblings, 1 reply; 50+ messages in thread
From: Steve Ellcey @ 2012-10-30 22:19 UTC (permalink / raw)
  To: Maxim Kuvyrkov; +Cc: Andrew Pinski, Joseph S. Myers, libc-ports

On Wed, 2012-10-31 at 10:56 +1300, Maxim Kuvyrkov wrote:

> It turns out I was benchmarking my patch against the original glibc implementation, not yours (patched files in ports/ instead of libc/ports).  With the patch applied correctly, the performance is virtually the same on my benchmark.  I've also checked the assembly dump of libc.so and confirmed that prefetch instructions and 8-byte loads/stores are used where appropriate.
> 
> Given that your patch provides on par or better performance than mine, and it also unifies MIPS memcpy for all ABIs (as well as between glibc and Bionic!) -- I am all for your patch.
> 
> I've reviewed you patch -- code is clean and well-documented.  Please apply the patch if sufficient testing has been done: big- and little-endian for o32/n32/n64 ABIs.  I've tested your patch for all big-endian ABIs, so you just need to cover little-endian (which, I think, you may have done already).
> 
> Thanks for bearing with me through all the debugging process!
> 
> --
> Maxim Kuvyrkov
> CodeSourcery / Mentor Graphics

Sounds good.  I will run one more round of testing tonight and then
check it in tomorrow if it all goes OK.

Steve Ellcey
sellcey@mips.com


^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Optimize MIPS memcpy
  2012-10-17 17:30                                       ` Steve Ellcey
  2012-10-29 18:00                                         ` Steve Ellcey
@ 2012-10-31 19:27                                         ` Andreas Jaeger
  2012-10-31 20:04                                           ` Steve Ellcey
  1 sibling, 1 reply; 50+ messages in thread
From: Andreas Jaeger @ 2012-10-31 19:27 UTC (permalink / raw)
  To: Steve Ellcey; +Cc: libc-ports

On 10/17/2012 07:29 PM, Steve Ellcey wrote:
> [...]
> diff --git a/ports/sysdeps/mips/memcpy.S b/ports/sysdeps/mips/memcpy.S
> index 753f67c..71474e9 100644
> --- a/ports/sysdeps/mips/memcpy.S
> +++ b/ports/sysdeps/mips/memcpy.S
> @@ -1,7 +1,8 @@
> -/* Copyright (C) 2002-2012 Free Software Foundation, Inc.
> +/* Copyright (C) 2012 Free Software Foundation, Inc.
>      This file is part of the GNU C Library.
> -   Contributed by Hartvig Ekner <hartvige@mips.com>, 2002.
> -
> +
> +   Contributed by MIPS Technologies, Inc.
> +

Steve, we're not adding any new "Contributed" lines; could you remove
that one, please?

Thanks,
Andreas
-- 
  Andreas Jaeger aj@{suse.com,opensuse.org} Twitter/Identica: jaegerandi
   SUSE LINUX Products GmbH, Maxfeldstr. 5, 90409 Nürnberg, Germany
    GF: Jeff Hawn,Jennifer Guild,Felix Imendörffer,HRB16746 (AG Nürnberg)
     GPG fingerprint = 93A3 365E CE47 B889 DF7F  FED1 389A 563C C272 A126

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Optimize MIPS memcpy
  2012-10-31 19:27                                         ` Andreas Jaeger
@ 2012-10-31 20:04                                           ` Steve Ellcey
  0 siblings, 0 replies; 50+ messages in thread
From: Steve Ellcey @ 2012-10-31 20:04 UTC (permalink / raw)
  To: Andreas Jaeger; +Cc: libc-ports

On Wed, 2012-10-31 at 20:26 +0100, Andreas Jaeger wrote:
> On 10/17/2012 07:29 PM, Steve Ellcey wrote:

> > +
> > +   Contributed by MIPS Technologies, Inc.
> > +
> 
> > Steve, we're not adding any new "Contributed" lines; could you remove
> > that one, please?
> 
> Thanks,
> Andreas

Done.

Steve Ellcey
sellcey@mips.com


^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Optimize MIPS memcpy
  2012-10-30 22:19                                                 ` Steve Ellcey
@ 2012-12-19  1:51                                                   ` Maxim Kuvyrkov
  2012-12-19 16:59                                                     ` Steve Ellcey
  0 siblings, 1 reply; 50+ messages in thread
From: Maxim Kuvyrkov @ 2012-12-19  1:51 UTC (permalink / raw)
  To: Steve Ellcey; +Cc: Andrew Pinski, Joseph S. Myers, libc-ports

On 31/10/2012, at 11:19 AM, Steve Ellcey wrote:

> On Wed, 2012-10-31 at 10:56 +1300, Maxim Kuvyrkov wrote:
> 
...
> Sounds good.  I will run one more round of testing tonight and then
> check it in tomorrow if it all goes OK.


Steve,

Would you please add a NEWS entry for improved MIPS memcpy?  It's a significant performance improvement, and deserves a note.

Thanks,

--
Maxim Kuvyrkov
CodeSourcery / Mentor Graphics

^ permalink raw reply	[flat|nested] 50+ messages in thread

* Re: [PATCH] Optimize MIPS memcpy
  2012-12-19  1:51                                                   ` Maxim Kuvyrkov
@ 2012-12-19 16:59                                                     ` Steve Ellcey
  0 siblings, 0 replies; 50+ messages in thread
From: Steve Ellcey @ 2012-12-19 16:59 UTC (permalink / raw)
  To: Maxim Kuvyrkov; +Cc: Andrew Pinski, Joseph S. Myers, libc-ports

On Wed, 2012-12-19 at 14:51 +1300, Maxim Kuvyrkov wrote:

> Steve,
> 
> Would you please add a NEWS entry for improved MIPS memcpy?  It's a significant performance improvement, and deserves a note.
> 
> Thanks,
> 
> --
> Maxim Kuvyrkov
> CodeSourcery / Mentor Graphics


OK, I have added this:



2012-12-19  Steve Ellcey  <sellcey@mips.com>

	* NEWS: Mention new memcpy for MIPS.



diff --git a/NEWS b/NEWS
index e58fabe..3a15555 100644
--- a/NEWS
+++ b/NEWS
@@ -25,6 +25,8 @@ Version 2.17
   14838, 14856, 14863, 14865, 14866, 14868, 14869, 14871, 14872, 14879,
   14889, 14893, 14898, 14914.
 
+* Optimization of memcpy for MIPS.
+
 * CVE-2011-4609 svc_run() produces high cpu usage when accept fails with
   EMFILE has been fixed (Bugzilla #14889).

^ permalink raw reply	[flat|nested] 50+ messages in thread

end of thread, other threads:[~2012-12-19 16:59 UTC | newest]

Thread overview: 50+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-09-01  6:16 [PATCH] Optimize MIPS memcpy Maxim Kuvyrkov
2012-09-01 16:37 ` Joseph S. Myers
2012-09-03  9:12 ` Andrew T Pinski
2012-09-03 17:12   ` Maxim Kuvyrkov
2012-09-04 15:09   ` Steve Ellcey
2012-09-04 15:14     ` Carlos O'Donell
2012-09-04 17:03       ` Steve Ellcey
2012-09-04 17:28         ` Carlos O'Donell
2012-09-05  0:43     ` Maxim Kuvyrkov
2012-09-06 16:25       ` Steve Ellcey
2012-09-06 18:43         ` Roland McGrath
2012-09-06 19:37           ` Steve Ellcey
2012-09-07 21:24         ` Maxim Kuvyrkov
2012-09-11  4:35         ` Maxim Kuvyrkov
2012-09-11 15:18           ` Steve Ellcey
2012-09-20  9:05             ` Maxim Kuvyrkov
2012-09-20 18:38               ` Steve Ellcey
2012-09-28  3:48                 ` Maxim Kuvyrkov
2012-10-06  4:43                   ` Maxim Kuvyrkov
2012-10-08 17:04                     ` Steve Ellcey
2012-10-08 22:31                       ` Maxim Kuvyrkov
2012-10-09 20:50                         ` Steve Ellcey
2012-10-15 17:49                         ` Steve Ellcey
2012-10-15 20:20                           ` Andrew Pinski
2012-10-15 20:34                             ` Steve Ellcey
2012-10-15 20:42                               ` Andrew Pinski
2012-10-15 20:50                                 ` Andrew Pinski
2012-10-15 21:36                                   ` Steve Ellcey
2012-10-15 21:47                                     ` Maxim Kuvyrkov
2012-10-17 17:30                                       ` Steve Ellcey
2012-10-29 18:00                                         ` Steve Ellcey
2012-10-29 18:03                                           ` Maxim Kuvyrkov
2012-10-30  7:16                                           ` Maxim Kuvyrkov
2012-10-30  7:19                                             ` Maxim Kuvyrkov
2012-10-30 17:46                                             ` Steve Ellcey
2012-10-30 21:56                                               ` Maxim Kuvyrkov
2012-10-30 22:19                                                 ` Steve Ellcey
2012-12-19  1:51                                                   ` Maxim Kuvyrkov
2012-12-19 16:59                                                     ` Steve Ellcey
2012-10-31 19:27                                         ` Andreas Jaeger
2012-10-31 20:04                                           ` Steve Ellcey
2012-10-15 22:10                                     ` Joseph S. Myers
2012-10-15 21:29                               ` Maciej W. Rozycki
2012-10-15 22:05                           ` Maxim Kuvyrkov
2012-09-21 18:47               ` Steve Ellcey
2012-09-21 18:57                 ` Joseph S. Myers
2012-09-21 20:41                   ` [PATCH] Optimize MIPS memcpy (mips glibc test results) Steve Ellcey
2012-09-21 20:49                     ` Joseph S. Myers
2012-09-21 20:56                       ` Steve Ellcey
2012-09-21 19:12                 ` [PATCH] Optimize MIPS memcpy Maxim Kuvyrkov
