* [PATCH] Optimize MIPS memcpy
@ 2012-09-01 6:16 Maxim Kuvyrkov
2012-09-01 16:37 ` Joseph S. Myers
2012-09-03 9:12 ` Andrew T Pinski
0 siblings, 2 replies; 50+ messages in thread
From: Maxim Kuvyrkov @ 2012-09-01 6:16 UTC (permalink / raw)
To: libc-ports; +Cc: Joseph S. Myers
[-- Attachment #1: Type: text/plain, Size: 1027 bytes --]
This patch improves MIPS assembly implementations of memcpy. Two optimizations are added: prefetching of data for subsequent iterations of memcpy loop and pipelined expansion of unaligned memcpy. These optimizations speed up MIPS memcpy by about 10%.
The prefetching part is straightforward: it adds prefetching of a cache line (32 bytes) for +1 iteration for unaligned case and +2 iteration for aligned case. The rationale here is that it will take prefetch to acquire data about same time as 1 iteration of unaligned loop or 2 iterations of aligned loop. Values for these parameters were tuned on a modern MIPS processor.
The pipelined expansion of unaligned loop is implemented in a similar fashion as expansion of the aligned loop. The assembly is tricky, but it works.
These changes are almost 3 years old, and have been thoroughly tested in CodeSourcery MIPS toolchains. Retested with current trunk with no regressions for n32, n64 and o32 ABIs.
OK to apply?
--
Maxim Kuvyrkov
Mentor Graphics
[-- Attachment #2: 0001-Optimize-MIPS-memcpy.patch --]
[-- Type: application/octet-stream, Size: 7992 bytes --]
From 689030542b798d0ac711b55b1363a37729be9ad4 Mon Sep 17 00:00:00 2001
From: Maxim Kuvyrkov <maxim@codesourcery.com>
Date: Fri, 31 Aug 2012 21:45:41 -0700
Subject: [PATCH 1/2] Optimize MIPS memcpy
* sysdeps/mips/memcpy.S, sysdeps/mips64/memcpy.S: Add prefetching and
optimize unaligned case.
(PREFETCH_ITERATION_OFFSET_ALIGNED)
(PREFETCH_ITERATION_OFFSET_MISALIGNED): Define.
* sysdeps/mips/sys/asm.h (PREFETCH): New macro.
---
ports/sysdeps/mips/memcpy.S | 105 ++++++++++++++++++++++++++++++++---
ports/sysdeps/mips/mips64/memcpy.S | 107 +++++++++++++++++++++++++++++++++---
ports/sysdeps/mips/sys/asm.h | 8 +++
3 files changed, 203 insertions(+), 17 deletions(-)
diff --git a/ports/sysdeps/mips/memcpy.S b/ports/sysdeps/mips/memcpy.S
index 753f67c..55b000d 100644
--- a/ports/sysdeps/mips/memcpy.S
+++ b/ports/sysdeps/mips/memcpy.S
@@ -17,6 +17,7 @@
<http://www.gnu.org/licenses/>. */
#include <sysdep.h>
+#include <sys/asm.h>
/* void *memcpy(void *s1, const void *s2, size_t n); */
@@ -33,6 +34,12 @@
# define SWLO swl /* low part is left in little-endian */
#endif
+/* 32-byte prefetch size assumed. */
+/* Prefetch data for (current iteration + PREFETCH_ITERATION_OFFSET) during
+ current iteration. */
+#define PREFETCH_ITERATION_OFFSET_ALIGNED 2 /* TUNING KNOB */
+#define PREFETCH_ITERATION_OFFSET_MISALIGNED 1 /* TUNING KNOB */
+
ENTRY (memcpy)
.set noreorder
@@ -67,6 +74,7 @@ L(lop8w):
lw t5, 20(a1)
lw t6, 24(a1)
lw t7, 28(a1)
+ PREFETCH (4, 32+PREFETCH_ITERATION_OFFSET_ALIGNED*32, a1)
addiu a0, 32
addiu a1, 32
sw t0, -32(a0)
@@ -76,8 +84,9 @@ L(lop8w):
sw t4, -16(a0)
sw t5, -12(a0)
sw t6, -8(a0)
- bne a1, a3, L(lop8w)
sw t7, -4(a0)
+ bne a1, a3, L(lop8w)
+ PREFETCH (5, 0+PREFETCH_ITERATION_OFFSET_ALIGNED*32, a0)
L(chk1w):
andi t0, a2, 0x3 # 4 or more bytes left?
@@ -116,19 +125,97 @@ L(shift):
SWHI t0, 0(a0)
addu a0, a3
L(shft1):
- andi t0, a2, 0x3
+ andi t0, a2, 0x1f
+ beq t0, a2, L(shfth_last)
subu a3, a2, t0
addu a3, a1
-L(shfth):
- LWHI t1, 0(a1) # Limp through, word by word
- LWLO t1, 3(a1)
- addiu a0, 4
- addiu a1, 4
+ move a2, t0
+L(shfth):
+ LWHI t0, 0(a1)
+ LWHI t1, 4(a1)
+ LWHI t2, 8(a1)
+ LWHI t3, 12(a1)
+ LWHI t4, 16(a1)
+ LWHI t5, 20(a1)
+ LWHI t6, 24(a1)
+ LWHI t7, 28(a1)
+ LWLO t0, 3(a1)
+ LWLO t1, 7(a1)
+ LWLO t2, 11(a1)
+ LWLO t3, 15(a1)
+ LWLO t4, 19(a1)
+ LWLO t5, 23(a1)
+ LWLO t6, 27(a1)
+ LWLO t7, 31(a1)
+ PREFETCH (4, 32+PREFETCH_ITERATION_OFFSET_MISALIGNED*32, a1)
+ addiu a0, 32
+ addiu a1, 32
+ sw t0, -32(a0)
+ sw t1, -28(a0)
+ sw t2, -24(a0)
+ sw t3, -20(a0)
+ sw t4, -16(a0)
+ sw t5, -12(a0)
+ sw t6, -8(a0)
+ sw t7, -4(a0)
bne a1, a3, L(shfth)
- sw t1, -4(a0)
- b L(last8) # Handle anything which may be left
+ PREFETCH (5, PREFETCH_ITERATION_OFFSET_MISALIGNED*32, a0)
+
+L(shfth_last):
+ andi t0, a2, 0x3
+ beq t0, a2, L(last8)
+ subu t8, a2, t0
+ move t7, ra
+
+ bal 1f
+ addiu a3, t8, 1f - L(shfth_lwhi)
+1: subu a3, ra, a3
+ jr a3
move a2, t0
+ LWHI t6, 24(a1)
+ LWHI t5, 20(a1)
+ LWHI t4, 16(a1)
+ LWHI t3, 12(a1)
+ LWHI t2, 8(a1)
+ LWHI t1, 4(a1)
+ LWHI t0, 0(a1)
+L(shfth_lwhi):
+
+ bal 1f
+ addiu a3, t8, 1f - L(shfth_lwlo)
+1: subu a3, ra, a3
+ jr a3
+ nop
+
+ LWLO t6, 27(a1)
+ LWLO t5, 23(a1)
+ LWLO t4, 19(a1)
+ LWLO t3, 15(a1)
+ LWLO t2, 11(a1)
+ LWLO t1, 7(a1)
+ LWLO t0, 3(a1)
+L(shfth_lwlo):
+
+ bal 1f
+ addiu a3, t8, 1f - L(shfth_sw)
+1: subu a3, ra, a3
+ jr a3
+ addu a1, t8
+
+ sw t6, 24(a0)
+ sw t5, 20(a0)
+ sw t4, 16(a0)
+ sw t3, 12(a0)
+ sw t2, 8(a0)
+ sw t1, 4(a0)
+ sw t0, 0(a0)
+L(shfth_sw):
+
+ move ra, t7
+ b L(last8) # Handle last 3 bytes
+ addu a0, t8
+
.set reorder
END (memcpy)
libc_hidden_builtin_def (memcpy)
diff --git a/ports/sysdeps/mips/mips64/memcpy.S b/ports/sysdeps/mips/mips64/memcpy.S
index 49ef34d..3a5b33c 100644
--- a/ports/sysdeps/mips/mips64/memcpy.S
+++ b/ports/sysdeps/mips/mips64/memcpy.S
@@ -37,6 +37,12 @@
# define SDLO sdl /* low part is left in little-endian */
#endif
+/* 32-byte prefetch size assumed. */
+/* Prefetch data for (current iteration + PREFETCH_ITERATION_OFFSET) during
+ current iteration. */
+#define PREFETCH_ITERATION_OFFSET_ALIGNED 2 /* TUNING KNOB */
+#define PREFETCH_ITERATION_OFFSET_MISALIGNED 1 /* TUNING KNOB */
+
ENTRY (memcpy)
.set noreorder
@@ -67,21 +73,25 @@ L(lop8w):
ld t1, 8(a1)
ld t2, 16(a1)
ld t3, 24(a1)
+ PREFETCH (4, 32+PREFETCH_ITERATION_OFFSET_ALIGNED*32, a1)
ld ta0, 32(a1)
ld ta1, 40(a1)
ld ta2, 48(a1)
ld ta3, 56(a1)
+ PREFETCH (4, 64+PREFETCH_ITERATION_OFFSET_ALIGNED*32, a1)
PTR_ADDIU a0, 64
PTR_ADDIU a1, 64
sd t0, -64(a0)
sd t1, -56(a0)
sd t2, -48(a0)
sd t3, -40(a0)
+ PREFETCH (5, -32+PREFETCH_ITERATION_OFFSET_ALIGNED*32, a0)
sd ta0, -32(a0)
sd ta1, -24(a0)
sd ta2, -16(a0)
- bne a1, a3, L(lop8w)
sd ta3, -8(a0)
+ bne a1, a3, L(lop8w)
+ PREFETCH (5, 0+PREFETCH_ITERATION_OFFSET_ALIGNED*32, a0)
L(chk1w):
andi t0, a2, 0x7 # 8 or more bytes left?
@@ -120,19 +130,100 @@ L(shift):
SDHI t0, 0(a0)
PTR_ADDU a0, a3
L(shft1):
- andi t0, a2, 0x7
+ andi t0, a2, 0x3f
+ beq t0, a2, L(shfth_last)
PTR_SUBU a3, a2, t0
PTR_ADDU a3, a1
+ move a2, t0
L(shfth):
- LDHI t1, 0(a1) # Limp through, dword by dword
- LDLO t1, 7(a1)
- PTR_ADDIU a0, 8
- PTR_ADDIU a1, 8
+ LDHI t0, 0(a1)
+ LDHI t1, 8(a1)
+ LDHI t2, 16(a1)
+ LDHI t3, 24(a1)
+ LDHI ta0, 32(a1)
+ LDHI ta1, 40(a1)
+ LDHI ta2, 48(a1)
+ LDHI ta3, 56(a1)
+ PREFETCH (4, 32+PREFETCH_ITERATION_OFFSET_MISALIGNED*32, a1)
+ LDLO t0, 7(a1)
+ LDLO t1, 15(a1)
+ LDLO t2, 23(a1)
+ LDLO t3, 31(a1)
+ LDLO ta0, 39(a1)
+ LDLO ta1, 47(a1)
+ LDLO ta2, 55(a1)
+ LDLO ta3, 63(a1)
+ PREFETCH (4, 64+PREFETCH_ITERATION_OFFSET_MISALIGNED*32, a1)
+ PTR_ADDIU a0, 64
+ PTR_ADDIU a1, 64
+ sd t0, -64(a0)
+ sd t1, -56(a0)
+ sd t2, -48(a0)
+ sd t3, -40(a0)
+ PREFETCH (5, -32+PREFETCH_ITERATION_OFFSET_MISALIGNED*32, a0)
+ sd ta0, -32(a0)
+ sd ta1, -24(a0)
+ sd ta2, -16(a0)
+ sd ta3, -8(a0)
bne a1, a3, L(shfth)
- sd t1, -8(a0)
- b L(last16) # Handle anything which may be left
+ PREFETCH (5, 0+PREFETCH_ITERATION_OFFSET_MISALIGNED*32, a0)
+
+L(shfth_last):
+ andi t0, a2, 0x7
+ beq t0, a2, L(last16)
+ PTR_SUBU t8, a2, t0
+ PTR_SRL t9, t8, 1
+ move ta3, ra
+
+ bal 1f
+ PTR_ADDIU a3, t9, 1f - L(shfth_ldhi)
+1: PTR_SUBU a3, ra, a3
+ jr a3
move a2, t0
+ LDHI ta2, 48(a1)
+ LDHI ta1, 40(a1)
+ LDHI ta0, 32(a1)
+ LDHI t3, 24(a1)
+ LDHI t2, 16(a1)
+ LDHI t1, 8(a1)
+ LDHI t0, 0(a1)
+L(shfth_ldhi):
+
+ bal 1f
+ PTR_ADDIU a3, t9, 1f - L(shfth_ldlo)
+1: PTR_SUBU a3, ra, a3
+ jr a3
+ nop
+
+ LDLO ta2, 55(a1)
+ LDLO ta1, 47(a1)
+ LDLO ta0, 39(a1)
+ LDLO t3, 31(a1)
+ LDLO t2, 23(a1)
+ LDLO t1, 15(a1)
+ LDLO t0, 7(a1)
+L(shfth_ldlo):
+
+ bal 1f
+ PTR_ADDIU a3, t9, 1f - L(shfth_sd)
+1: PTR_SUBU a3, ra, a3
+ jr a3
+ PTR_ADDU a1, t8
+
+ sd ta2, 48(a0)
+ sd ta1, 40(a0)
+ sd ta0, 32(a0)
+ sd t3, 24(a0)
+ sd t2, 16(a0)
+ sd t1, 8(a0)
+ sd t0, 0(a0)
+L(shfth_sd):
+
+ move ra, ta3
+ b L(last16) # Handle last 7 bytes
+ PTR_ADDU a0, t8
+
.set reorder
END (memcpy)
libc_hidden_builtin_def (memcpy)
diff --git a/ports/sysdeps/mips/sys/asm.h b/ports/sysdeps/mips/sys/asm.h
index 0f5edf9..e4057e5 100644
--- a/ports/sysdeps/mips/sys/asm.h
+++ b/ports/sysdeps/mips/sys/asm.h
@@ -482,4 +482,12 @@ symbol = value
# define MIPS_SYNC sync
#endif
+#if _MIPS_ISA == _MIPS_ISA_MIPS32 || _MIPS_ISA == _MIPS_ISA_MIPS64
+# define PREFETCH(hint, offset, reg) pref hint, offset(reg)
+#else
+/* Don't leave assembler prefetch undefined to avoid surprises from
+ delay slot placement. */
+# define PREFETCH(hint, offset, reg) nop
+#endif
+
#endif /* sys/asm.h */
--
1.7.0.4
^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [PATCH] Optimize MIPS memcpy
2012-09-01 6:16 [PATCH] Optimize MIPS memcpy Maxim Kuvyrkov
@ 2012-09-01 16:37 ` Joseph S. Myers
2012-09-03 9:12 ` Andrew T Pinski
1 sibling, 0 replies; 50+ messages in thread
From: Joseph S. Myers @ 2012-09-01 16:37 UTC (permalink / raw)
To: Maxim Kuvyrkov; +Cc: libc-ports
On Sat, 1 Sep 2012, Maxim Kuvyrkov wrote:
> Retested with current trunk with no regressions for n32, n64 and o32
> ABIs.
>
> OK to apply?
OK if this testing has been done for both big and little endian (so six
endian / ABI combinations).
--
Joseph S. Myers
joseph@codesourcery.com
^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [PATCH] Optimize MIPS memcpy
2012-09-01 6:16 [PATCH] Optimize MIPS memcpy Maxim Kuvyrkov
2012-09-01 16:37 ` Joseph S. Myers
@ 2012-09-03 9:12 ` Andrew T Pinski
2012-09-03 17:12 ` Maxim Kuvyrkov
2012-09-04 15:09 ` Steve Ellcey
1 sibling, 2 replies; 50+ messages in thread
From: Andrew T Pinski @ 2012-09-03 9:12 UTC (permalink / raw)
To: Maxim Kuvyrkov; +Cc: Joseph S. Myers, libc-ports
Forgot to CC libc-ports@ .
On Sat, 2012-09-01 at 18:15 +1200, Maxim Kuvyrkov wrote:
> This patch improves MIPS assembly implementations of memcpy. Two optimizations are added: prefetching of data for subsequent iterations of memcpy loop and pipelined expansion of unaligned memcpy. These optimizations speed up MIPS memcpy by about 10%.
>
> The prefetching part is straightforward: it adds prefetching of a cache line (32 bytes) for +1 iteration for unaligned case and +2 iteration for aligned case. The rationale here is that it will take prefetch to acquire data about same time as 1 iteration of unaligned loop or 2 iterations of aligned loop. Values for these parameters were tuned on a modern MIPS processor.
>
This might hurt Octeon, as the cache line size there is 128 bytes. Can
you say which modern MIPS processor this has been tuned with? And
is there a way to not hard-code 32 in the assembly but to use a macro
instead.
Thanks,
Andrew Pinski
> The pipelined expansion of unaligned loop is implemented in a similar fashion as expansion of the aligned loop. The assembly is tricky, but it works.
>
> These changes are almost 3 years old, and have been thoroughly tested in CodeSourcery MIPS toolchains. Retested with current trunk with no regressions for n32, n64 and o32 ABIs.
>
> OK to apply?
>
> --
> Maxim Kuvyrkov
> Mentor Graphics
>
>
^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [PATCH] Optimize MIPS memcpy
2012-09-03 9:12 ` Andrew T Pinski
@ 2012-09-03 17:12 ` Maxim Kuvyrkov
2012-09-04 15:09 ` Steve Ellcey
1 sibling, 0 replies; 50+ messages in thread
From: Maxim Kuvyrkov @ 2012-09-03 17:12 UTC (permalink / raw)
To: Andrew T Pinski; +Cc: Joseph S. Myers, libc-ports
On 3/09/2012, at 9:12 PM, Andrew T Pinski wrote:
> Forgot to CC libc-ports@ .
> On Sat, 2012-09-01 at 18:15 +1200, Maxim Kuvyrkov wrote:
>> This patch improves MIPS assembly implementations of memcpy. Two optimizations are added: prefetching of data for subsequent iterations of memcpy loop and pipelined expansion of unaligned memcpy. These optimizations speed up MIPS memcpy by about 10%.
>>
>> The prefetching part is straightforward: it adds prefetching of a cache line (32 bytes) for +1 iteration for unaligned case and +2 iteration for aligned case. The rationale here is that it will take prefetch to acquire data about same time as 1 iteration of unaligned loop or 2 iterations of aligned loop. Values for these parameters were tuned on a modern MIPS processor.
>>
>
> This might hurt Octeon as the cache line size there is 128 bytes. Can
> you say which modern MIPS processor which this has been tuned with? And
> is there a way to not hard code 32 in the assembly but in a macro
> instead.
This was implemented with NetLogic XLR/XLP in mind.
The above description that I wrote was not completely accurate with regards to why we are assuming 32-byte prefetch (as I mentioned, this patch was developed almost 3 years ago). For 32-bit ABIs one iteration of the loop processes 32-bytes of data -- that's how much can fit into available 8 registers at once. Therefore we are choosing to prefetch in 32-byte blocks and have 1 prefetch instruction per iteration (well, 2 prefetches actually -- one for read and one for write). It is possible to issue prefetch instructions only every Nth iteration, but the overhead of doing so will likely be greater than the benefit.
For 64-bit ABIs we process 64 bytes per iteration, so we could deal with just a single 64-byte-or-wider prefetch per iteration. As it happens, XLR/XLP prefetch 32 bytes at a time, so the current implementation issues 2 prefetches per iteration.
It is feasible to use 2 macros for 64-bit implementation: PREFETCH32 and PREFETCH64. XLR/XLP would define both these macros to "pref", while Octeon would define PREFETCH64 to "pref" and PREFETCH32 to "nop", thus issuing a single prefetch per iteration.
However, I doubt that the above improvement is worth the increased complexity of the memcpy implementation. I would expect most modern CPUs to quickly discard extraneous prefetch instructions. And the most we can reasonably save here is to remove 1 read and 1 write prefetch instruction for 64-bit memcpy.
Andrew, if you still think that it would provide significant performance improvement to Octeon to issue as few prefetches as possible, would you please compare performance between the two approaches (removing the second prefetch from 64-bit implementation is a trivial change) and get back to the list with the results?
Thank you,
--
Maxim Kuvyrkov
Mentor Graphics
^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [PATCH] Optimize MIPS memcpy
2012-09-03 9:12 ` Andrew T Pinski
2012-09-03 17:12 ` Maxim Kuvyrkov
@ 2012-09-04 15:09 ` Steve Ellcey
2012-09-04 15:14 ` Carlos O'Donell
2012-09-05 0:43 ` Maxim Kuvyrkov
1 sibling, 2 replies; 50+ messages in thread
From: Steve Ellcey @ 2012-09-04 15:09 UTC (permalink / raw)
To: Andrew T Pinski; +Cc: Maxim Kuvyrkov, Joseph S. Myers, libc-ports
[-- Attachment #1: Type: text/plain, Size: 2102 bytes --]
On Mon, 2012-09-03 at 02:12 -0700, Andrew T Pinski wrote:
> Forgot to CC libc-ports@ .
> On Sat, 2012-09-01 at 18:15 +1200, Maxim Kuvyrkov wrote:
> > This patch improves MIPS assembly implementations of memcpy. Two optimizations are added:
> prefetching of data for subsequent iterations of memcpy loop and pipelined expansion of unaligned
> memcpy. These optimizations speed up MIPS memcpy by about 10%.
> >
> > The prefetching part is straightforward: it adds prefetching of a cache line (32 bytes) for +1
> iteration for unaligned case and +2 iteration for aligned case. The rationale here is that it will
> take prefetch to acquire data about same time as 1 iteration of unaligned loop or 2 iterations of aligned loop. Values for these parameters were tuned on a modern MIPS processor.
> >
>
> This might hurt Octeon as the cache line size there is 128 bytes. Can
> you say which modern MIPS processor which this has been tuned with? And
> is there a way to not hard code 32 in the assembly but in a macro
> instead.
>
> Thanks,
> Andrew Pinski
I've been looking at the MIPS memcpy and was planning on submitting a
new version based on the one that MIPS submitted to Android. It has
prefetching like Maxim's though I found that using the load and 'prepare
for store' hints instead of 'load streaming' and 'store streaming' hints
gave me better results on the 74k and 24k that I did performance testing
on.
This version has more unrolling too, and between that and the difference
in hints I got a small performance improvement over Maxim's version
when doing small memcpy's and a fairly substantial improvement on large
memcpy's.
I also merged the 32 and 64 bit versions together so we would only have
one copy to maintain. I haven't tried building it as part of glibc yet,
I have been testing it standalone first and was going to try and
integrate it into glibc and submit it this week or next. I'll attach it
to this email so folks can look at it and I will see if I can
parameterize the cache line size. This one also assumes a 32 byte cache
prefetch.
Steve Ellcey
sellcey@mips.com
[-- Attachment #2: memcpy.S --]
[-- Type: text/x-csrc, Size: 16761 bytes --]
/*
* Copyright (c) 2009-2012
* MIPS Technologies, Inc., California.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
* 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
* contributors may be used to endorse or promote products derived from
* this software without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
* ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
* HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*/
/************************************************************************
*
* memcpy.S
* Version: "043009"
*
************************************************************************/
/************************************************************************
* Include files
************************************************************************/
/* Configuration preamble: pick headers and feature macros depending on
   whether we build for Bionic (Android) or glibc/standalone.  */
#ifdef __BIONIC__
#include "machine/asm.h"
#include "machine/regdef.h"
#define ALLOW_OVERLAP		/* Android callers (e.g. Skia) rely on overlap -> memmove.  */
#define USE_PREFETCH
#else
#ifdef _LIBC
#include <sysdep.h>
#define USE_PREFETCH
#endif
#include <regdef.h>
#include <sys/asm.h>
#if _MIPS_ISA == _MIPS_ISA_MIPS32 || _MIPS_ISA == _MIPS_ISA_MIPS64
/* NOTE(review): PREFETCH appears unused below (PREFETCH_FOR_LOAD/STORE
   are used instead) -- confirm before relying on it.  */
#define PREFETCH
#endif
#if _MIPS_SIM == _ABI64
/* 64-bit ABI: copy 8 bytes per register instead of 4.  */
#define USE_DOUBLE
#endif
#endif

/* Some asm.h files do not have the L macro definition.  */
#ifndef L
#if _MIPS_SIM == _ABIO32
# define L(label) $L ## label
#else
# define L(label) .L ## label
#endif
#endif

/* Some regdef.h files do not have the PTR_ADDIU macro definition.  */
#ifndef PTR_ADDIU
#ifdef USE_DOUBLE
#define PTR_ADDIU daddiu
#else
#define PTR_ADDIU addiu
#endif
#endif

/*
 * Using PREF_LOAD_STREAMED instead of PREF_LOAD on load prefetches offers
 * a slight performance advantage, using PREF_PREPAREFORSTORE instead of
 * PREF_STORE_STREAMED or PREF_STORE offers a large performance advantage.
 */
#ifdef USE_PREFETCH
/* Hint values for the MIPS32/MIPS64 `pref' instruction.  */
# define PREF_LOAD		0
# define PREF_STORE		1
# define PREF_LOAD_STREAMED	4
# define PREF_STORE_STREAMED	5
# define PREF_LOAD_RETAINED	6
# define PREF_STORE_RETAINED	7
# define PREF_WRITEBACK_INVAL	25
# define PREF_PREPAREFORSTORE	30

/*
 * We double everything when USE_DOUBLE is true so we do 2 prefetches to
 * get 64 bytes in that case. The assumption is that each individual
 * prefetch brings in 32 bytes.
 */
#ifdef USE_DOUBLE
# define PREF_CHUNK 64
# define PREFETCH_FOR_LOAD(chunk, reg) \
	pref PREF_LOAD_STREAMED, (chunk)*32(reg); \
	pref PREF_LOAD_STREAMED, ((chunk)+1)*32(reg)
# define PREFETCH_FOR_STORE(chunk, reg) \
	pref PREF_PREPAREFORSTORE, (chunk)*32(reg); \
	pref PREF_PREPAREFORSTORE, ((chunk)+1)*32(reg)
#else
# define PREF_CHUNK 32
# define PREFETCH_FOR_LOAD(chunk, reg) \
	pref PREF_LOAD_STREAMED, (chunk)*32(reg)
# define PREFETCH_FOR_STORE(chunk, reg) \
	pref PREF_PREPAREFORSTORE, (chunk)*32(reg)
#endif
/* Furthest byte offset any prefetch reaches ahead of its base register;
   used to stop PrepareForStore prefetches before the destination end.  */
#define PREF_LIMIT (5 * PREF_CHUNK)
#else
# define PREFETCH_FOR_LOAD(offset, reg)
# define PREFETCH_FOR_STORE(offset, reg)
#endif

/* Allow the routine to be named something else if desired.  */
#ifndef MEMCPY_NAME
#define MEMCPY_NAME memcpy
#endif

/* We use these 32/64 bit registers as temporaries to do the copying.  */
#define REG0 t0
#define REG1 t1
#define REG2 t2
#define REG3 t3
#ifdef USE_DOUBLE
# define REG4 ta0
# define REG5 ta1
# define REG6 ta2
# define REG7 ta3
#else
# define REG4 t4
# define REG5 t5
# define REG6 t6
# define REG7 t7
#endif

/* We load/store 64 bits at a time when USE_DOUBLE is true.  */
#ifdef USE_DOUBLE
# define ST sd
# define LD ld
#if __MIPSEB
# define LDHI ldl /* high part is left in big-endian */
# define STHI sdl /* high part is left in big-endian */
# define LDLO ldr /* low part is right in big-endian */
# define STLO sdr /* low part is right in big-endian */
#else
# define LDHI ldr /* high part is right in little-endian */
# define STHI sdr /* high part is right in little-endian */
# define LDLO ldl /* low part is left in little-endian */
# define STLO sdl /* low part is left in little-endian */
#endif
#else
# define ST sw
# define LD lw
#if __MIPSEB
# define LDHI lwl /* high part is left in big-endian */
# define STHI swl /* high part is left in big-endian */
# define LDLO lwr /* low part is right in big-endian */
# define STLO swr /* low part is right in big-endian */
#else
# define LDHI lwr /* high part is right in little-endian */
# define STHI swr /* high part is right in little-endian */
# define LDLO lwl /* low part is left in little-endian */
# define STLO swl /* low part is left in little-endian */
#endif
#endif

/* Bookkeeping values for 32 vs. 64 bit mode.  */
#ifdef USE_DOUBLE
# define NSIZE 8		/* bytes per copy register */
# define NSIZEMASK 0x3f		/* one 8-register chunk (64 bytes) */
# define NSIZEDMASK 0x7f	/* one unrolled iteration (128 bytes) */
#else
# define NSIZE 4		/* bytes per copy register */
# define NSIZEMASK 0x1f		/* one 8-register chunk (32 bytes) */
# define NSIZEDMASK 0x3f	/* one unrolled iteration (64 bytes) */
#endif
#define UNIT(unit) ((unit)*NSIZE)	/* byte offset of (d)word #unit */
#define UNITM1(unit) (((unit)*NSIZE)-1)	/* last byte of (d)word #unit */
/*
 * void *MEMCPY_NAME (void *dst [a0], const void *src [a1], size_t n [a2])
 *
 * Returns dst in v0.  Copies NSIZE (4 or 8, see USE_DOUBLE) bytes per
 * register, 16 registers (64 or 128 bytes) per unrolled loop iteration.
 *
 * Register roles throughout:
 *   a0/a1  - running dst/src pointers        a2 - bytes remaining
 *   a3     - loop end address (dst or src)   t8 - remainder byte counts
 *   t0,t9,v1 - prefetch-limit bookkeeping (USE_PREFETCH only)
 *
 * The whole body runs with .set noreorder: the instruction after every
 * branch is its delay slot and executes before the branch takes effect.
 */
#ifdef __BIONIC__
LEAF(MEMCPY_NAME, 0)
#else
LEAF(MEMCPY_NAME)
#endif
	.set	nomips16
	.set	noreorder
/*
 * Below we handle the case where memcpy is called with overlapping src and dst.
 * Although memcpy is not required to handle this case, some parts of Android
 * like Skia rely on such usage. We call memmove to handle such cases.
 */
#ifdef ALLOW_OVERLAP
	PTR_SUBU t0,a0,a1		/* t0 = dst - src */
	PTR_SRA	t2,t0,31		/* t2 = sign mask of the difference */
	xor	t1,t0,t2
	PTR_SUBU t0,t1,t2		/* t0 = abs(dst - src) */
	sltu	t2,t0,a2		/* overlap iff abs(dst-src) < n */
	beq	t2,zero,L(memcpy)
	la	t9,memmove		/* delay slot; harmless when not taken */
	jr	t9			/* tail-call memmove for overlap */
	nop
L(memcpy):
#endif
/*
 * If the size is less than 2*NSIZE (8 or 16), go to L(lastb).  Regardless
 * of size, copy dst pointer to v0 for the return value.
 */
	slti	t2,a2,(2 * NSIZE)
	bne	t2,zero,L(lastb)
	move	v0,a0			/* delay slot: v0 = dst (return value) */
/*
 * If src and dst have different alignments, go to L(unaligned), if they
 * have the same alignment (but are not actually aligned) do a partial
 * load/store to make them aligned.  If they are both already aligned
 * we can start copying at L(aligned).
 */
	xor	t8,a1,a0
	andi	t8,t8,(NSIZE-1)		/* t8 is a0/a1 word-displacement */
	bne	t8,zero,L(unaligned)
	PTR_SUBU a3, zero, a0		/* delay slot: a3 = -dst */
	andi	a3,a3,(NSIZE-1)		/* copy a3 bytes to align a0/a1 */
	beq	a3,zero,L(aligned)	/* if a3=0, it is already aligned */
	PTR_SUBU a2,a2,a3		/* a2 is the remaining bytes count */
	LDHI	t8,0(a1)		/* unaligned partial (d)word copy */
	PTR_ADDU a1,a1,a3
	STHI	t8,0(a0)
	PTR_ADDU a0,a0,a3
/*
 * Now dst/src are both aligned to (word or double word) aligned addresses
 * Set a2 to count how many bytes we have to copy after all the 64/128 byte
 * chunks are copied and a3 to the dst pointer after all the 64/128 byte
 * chunks have been copied.  We will loop, incrementing a0 and a1 until a0
 * equals a3.
 */
L(aligned):
	andi	t8,a2,NSIZEDMASK	/* any whole 64-byte/128-byte chunks? */
	beq	a2,t8,L(chkw)		/* if a2==t8, no 64-byte/128-byte chunks */
	PTR_SUBU a3,a2,t8		/* subtract from a2 the remainder */
	PTR_ADDU a3,a0,a3		/* Now a3 is the final dst after loop */
/* When in the loop we exercise "pref 30,x(a0)", the a0+x should not be past
 * the "t0-32" address.  This means: for x=128 the last "safe" a0 address is
 * "t0-160".  Alternatively, for x=64 the last "safe" a0 address is "t0-96"
 * In the current version we will use "pref 30,128(a0)", so "t0-160" is the
 * limit.  (PrepareForStore zeroes whole cache lines, so prefetching past
 * the destination end would clobber memory we do not own.)
 */
#ifdef USE_PREFETCH
	PTR_ADDU t0,a0,a2		/* t0 is the "past the end" address */
	PTR_SUBU t9,t0,PREF_LIMIT	/* t9 is the "last safe pref" address */
	PREFETCH_FOR_LOAD (0, a1)
	PREFETCH_FOR_LOAD (1, a1)
	PREFETCH_FOR_LOAD (2, a1)
	PREFETCH_FOR_STORE (1, a0)
	sltu	v1,t9,a0		/* If a0 > t9 don't use next prefetch */
	bgtz	v1,L(loop16w)
	nop
#endif
	PREFETCH_FOR_STORE (2, a0)
L(loop16w):
	PREFETCH_FOR_LOAD (3, a1)
	LD	t0,UNIT(0)(a1)
#ifdef USE_PREFETCH
	bgtz	v1,L(skip_pref30_96)	/* too close to the end for pref 30 */
#endif
	LD	t1,UNIT(1)(a1)
	PREFETCH_FOR_STORE (3, a0)
L(skip_pref30_96):
	LD	REG2,UNIT(2)(a1)
	LD	REG3,UNIT(3)(a1)
	LD	REG4,UNIT(4)(a1)
	LD	REG5,UNIT(5)(a1)
	LD	REG6,UNIT(6)(a1)
	LD	REG7,UNIT(7)(a1)
	PREFETCH_FOR_LOAD (4, a1)
	ST	t0,UNIT(0)(a0)
	ST	t1,UNIT(1)(a0)
	ST	REG2,UNIT(2)(a0)
	ST	REG3,UNIT(3)(a0)
	ST	REG4,UNIT(4)(a0)
	ST	REG5,UNIT(5)(a0)
	ST	REG6,UNIT(6)(a0)
	ST	REG7,UNIT(7)(a0)
	LD	t0,UNIT(8)(a1)		/* second half of the iteration */
#ifdef USE_PREFETCH
	bgtz	v1,L(skip_pref30_128)
#endif
	LD	t1,UNIT(9)(a1)
	PREFETCH_FOR_STORE (4, a0)
L(skip_pref30_128):
	LD	REG2,UNIT(10)(a1)
	LD	REG3,UNIT(11)(a1)
	LD	REG4,UNIT(12)(a1)
	LD	REG5,UNIT(13)(a1)
	LD	REG6,UNIT(14)(a1)
	LD	REG7,UNIT(15)(a1)
	PREFETCH_FOR_LOAD (5, a1)
	ST	t0,UNIT(8)(a0)
	ST	t1,UNIT(9)(a0)
	ST	REG2,UNIT(10)(a0)
	ST	REG3,UNIT(11)(a0)
	ST	REG4,UNIT(12)(a0)
	ST	REG5,UNIT(13)(a0)
	ST	REG6,UNIT(14)(a0)
	ST	REG7,UNIT(15)(a0)
	PTR_ADDIU a0,a0,UNIT(16)	/* adding 64/128 to dest */
#ifdef USE_PREFETCH
	sltu	v1,t9,a0		/* recompute pref-30 safety for next pass */
#endif
	bne	a0,a3,L(loop16w)
	PTR_ADDIU a1,a1,UNIT(16)	/* delay slot: adding 64/128 to src */
	move	a2,t8			/* a2 = remainder past full iterations */
/* Here we have src and dest word-aligned but less than 64-bytes or
 * 128 bytes to go.  Check for a 32(64) byte chunk and copy it if there
 * is one.  Otherwise jump down to L(chk1w) to handle the tail end of
 * the copy.
 */
L(chkw):
	PREFETCH_FOR_LOAD (0, a1)
	andi	t8,a2,NSIZEMASK		/* Is there a 32-byte/64-byte chunk.  */
					/* The t8 is the remainder count past 32-bytes */
	beq	a2,t8,L(chk1w)		/* When a2=t8, no 32-byte chunk  */
	nop
	LD	t0,UNIT(0)(a1)
	LD	t1,UNIT(1)(a1)
	LD	REG2,UNIT(2)(a1)
	LD	REG3,UNIT(3)(a1)
	LD	REG4,UNIT(4)(a1)
	LD	REG5,UNIT(5)(a1)
	LD	REG6,UNIT(6)(a1)
	LD	REG7,UNIT(7)(a1)
	PTR_ADDIU a1,a1,UNIT(8)
	ST	t0,UNIT(0)(a0)
	ST	t1,UNIT(1)(a0)
	ST	REG2,UNIT(2)(a0)
	ST	REG3,UNIT(3)(a0)
	ST	REG4,UNIT(4)(a0)
	ST	REG5,UNIT(5)(a0)
	ST	REG6,UNIT(6)(a0)
	ST	REG7,UNIT(7)(a0)
	PTR_ADDIU a0,a0,UNIT(8)
/*
 * Here we have less than 32(64) bytes to copy.  Set up for a loop to
 * copy one word (or double word) at a time.  Set a2 to count how many
 * bytes we have to copy after all the word (or double word) chunks are
 * copied and a3 to the dst pointer after all the (d)word chunks have
 * been copied.  We will loop, incrementing a0 and a1 until a0 equals a3.
 */
L(chk1w):
	andi	a2,t8,(NSIZE-1)		/* a2 is the remainder past one (d)word chunks */
	beq	a2,t8,L(lastb)
	PTR_SUBU a3,t8,a2		/* a3 is count of bytes in one (d)word chunks */
	PTR_ADDU a3,a0,a3		/* a3 is the dst address after loop */

/* copying in words (4-byte or 8-byte chunks) */
L(wordCopy_loop):
	LD	REG3,UNIT(0)(a1)
	PTR_ADDIU a1,a1,UNIT(1)
	PTR_ADDIU a0,a0,UNIT(1)
	bne	a0,a3,L(wordCopy_loop)
	ST	REG3,UNIT(-1)(a0)	/* delay slot: store the (d)word */

/* Copy the last 8 (or 16) bytes */
L(lastb):
	blez	a2,L(leave)
	PTR_ADDU a3,a0,a2		/* a3 is the last dst address */
L(lastbloop):
	lb	v1,0(a1)		/* byte-at-a-time tail copy */
	PTR_ADDIU a1,a1,1
	PTR_ADDIU a0,a0,1
	bne	a0,a3,L(lastbloop)
	sb	v1,-1(a0)		/* delay slot: store the byte */
L(leave):
	j	ra
	nop
/*
 * UNALIGNED case, got here with a3 = "negu a0"
 * This code is nearly identical to the aligned code above
 * but only the destination (not the source) gets aligned
 * so we need to do partial loads of the source followed
 * by normal stores to the destination (once we have aligned
 * the destination).
 */
L(unaligned):
	andi	a3,a3,(NSIZE-1)		/* copy a3 bytes to align a0/a1 */
	beqz	a3,L(ua_chk16w)		/* if a3=0, it is already aligned */
	PTR_SUBU a2,a2,a3		/* a2 is the remaining bytes count */
	LDHI	v1,UNIT(0)(a1)		/* merge both halves of one (d)word */
	LDLO	v1,UNITM1(1)(a1)
	PTR_ADDU a1,a1,a3
	STHI	v1,UNIT(0)(a0)
	PTR_ADDU a0,a0,a3
/*
 *  Now the destination (but not the source) is aligned
 * Set a2 to count how many bytes we have to copy after all the 64/128 byte
 * chunks are copied and a3 to the dst pointer after all the 64/128 byte
 * chunks have been copied.  We will loop, incrementing a0 and a1 until a0
 * equals a3.
 */
L(ua_chk16w):
	andi	t8,a2,NSIZEDMASK	/* any whole 64-byte/128-byte chunks? */
	beq	a2,t8,L(ua_chkw)	/* if a2==t8, no 64-byte/128-byte chunks */
	PTR_SUBU a3,a2,t8		/* subtract from a2 the remainder */
	PTR_ADDU a3,a0,a3		/* Now a3 is the final dst after loop */
#ifdef USE_PREFETCH
	PTR_ADDU t0,a0,a2		/* t0 is the "past the end" address */
	PTR_SUBU t9,t0,PREF_LIMIT	/* t9 is the "last safe pref" address */
	PREFETCH_FOR_LOAD (0, a1)
	PREFETCH_FOR_LOAD (1, a1)
	PREFETCH_FOR_LOAD (2, a1)
	PREFETCH_FOR_STORE (1, a0)
	sltu	v1,t9,a0
	bgtz	v1,L(ua_loop16w)	/* skip prefetch for too short arrays */
	nop
#endif
	PREFETCH_FOR_STORE (2, a0)
L(ua_loop16w):
	PREFETCH_FOR_LOAD (3, a1)
	LDHI	t0,UNIT(0)(a1)		/* each value needs an LDHI/LDLO pair */
	LDLO	t0,UNITM1(1)(a1)
	LDHI	t1,UNIT(1)(a1)
#ifdef USE_PREFETCH
	bgtz	v1,L(ua_skip_pref30_96)
#endif
	LDLO	t1,UNITM1(2)(a1)
	PREFETCH_FOR_STORE (3, a0)
L(ua_skip_pref30_96):
	LDHI	REG2,UNIT(2)(a1)
	LDLO	REG2,UNITM1(3)(a1)
	LDHI	REG3,UNIT(3)(a1)
	LDLO	REG3,UNITM1(4)(a1)
	LDHI	REG4,UNIT(4)(a1)
	LDLO	REG4,UNITM1(5)(a1)
	LDHI	REG5,UNIT(5)(a1)
	LDLO	REG5,UNITM1(6)(a1)
	LDHI	REG6,UNIT(6)(a1)
	LDLO	REG6,UNITM1(7)(a1)
	LDHI	REG7,UNIT(7)(a1)
	LDLO	REG7,UNITM1(8)(a1)
	PREFETCH_FOR_LOAD (4, a1)
	ST	t0,UNIT(0)(a0)
	ST	t1,UNIT(1)(a0)
	ST	REG2,UNIT(2)(a0)
	ST	REG3,UNIT(3)(a0)
	ST	REG4,UNIT(4)(a0)
	ST	REG5,UNIT(5)(a0)
	ST	REG6,UNIT(6)(a0)
	ST	REG7,UNIT(7)(a0)
	LDHI	t0,UNIT(8)(a1)		/* second half of the iteration */
	LDLO	t0,UNITM1(9)(a1)
	LDHI	t1,UNIT(9)(a1)
#ifdef USE_PREFETCH
	bgtz	v1,L(ua_skip_pref30_128)
#endif
	LDLO	t1,UNITM1(10)(a1)
	PREFETCH_FOR_STORE (4, a0)
L(ua_skip_pref30_128):
	LDHI	REG2,UNIT(10)(a1)
	LDLO	REG2,UNITM1(11)(a1)
	LDHI	REG3,UNIT(11)(a1)
	LDLO	REG3,UNITM1(12)(a1)
	LDHI	REG4,UNIT(12)(a1)
	LDLO	REG4,UNITM1(13)(a1)
	LDHI	REG5,UNIT(13)(a1)
	LDLO	REG5,UNITM1(14)(a1)
	LDHI	REG6,UNIT(14)(a1)
	LDLO	REG6,UNITM1(15)(a1)
	LDHI	REG7,UNIT(15)(a1)
	LDLO	REG7,UNITM1(16)(a1)
	PREFETCH_FOR_LOAD (5, a1)
	ST	t0,UNIT(8)(a0)
	ST	t1,UNIT(9)(a0)
	ST	REG2,UNIT(10)(a0)
	ST	REG3,UNIT(11)(a0)
	ST	REG4,UNIT(12)(a0)
	ST	REG5,UNIT(13)(a0)
	ST	REG6,UNIT(14)(a0)
	ST	REG7,UNIT(15)(a0)
	PTR_ADDIU a0,a0,UNIT(16)	/* adding 64/128 to dest */
#ifdef USE_PREFETCH
	sltu	v1,t9,a0
#endif
	bne	a0,a3,L(ua_loop16w)
	PTR_ADDIU a1,a1,UNIT(16)	/* delay slot: adding 64/128 to src */
	move	a2,t8			/* a2 = remainder past full iterations */
/* Here we have src and dest word-aligned but less than 64-bytes or
 * 128 bytes to go.  Check for a 32(64) byte chunk and copy it if there
 * is one.  Otherwise jump down to L(ua_chk1w) to handle the tail end of
 * the copy.  */
L(ua_chkw):
	PREFETCH_FOR_LOAD (0, a1)
	andi	t8,a2,NSIZEMASK		/* Is there a 32-byte/64-byte chunk.  */
					/* t8 is the remainder count past 32-bytes */
	beq	a2,t8,L(ua_chk1w)	/* When a2=t8, no 32-byte chunk */
	nop
	LDHI	t0,UNIT(0)(a1)
	LDLO	t0,UNITM1(1)(a1)
	LDHI	t1,UNIT(1)(a1)
	LDLO	t1,UNITM1(2)(a1)
	LDHI	REG2,UNIT(2)(a1)
	LDLO	REG2,UNITM1(3)(a1)
	LDHI	REG3,UNIT(3)(a1)
	LDLO	REG3,UNITM1(4)(a1)
	LDHI	REG4,UNIT(4)(a1)
	LDLO	REG4,UNITM1(5)(a1)
	LDHI	REG5,UNIT(5)(a1)
	LDLO	REG5,UNITM1(6)(a1)
	LDHI	REG6,UNIT(6)(a1)
	LDLO	REG6,UNITM1(7)(a1)
	LDHI	REG7,UNIT(7)(a1)
	LDLO	REG7,UNITM1(8)(a1)
	PTR_ADDIU a1,a1,UNIT(8)
	ST	t0,UNIT(0)(a0)
	ST	t1,UNIT(1)(a0)
	ST	REG2,UNIT(2)(a0)
	ST	REG3,UNIT(3)(a0)
	ST	REG4,UNIT(4)(a0)
	ST	REG5,UNIT(5)(a0)
	ST	REG6,UNIT(6)(a0)
	ST	REG7,UNIT(7)(a0)
	PTR_ADDIU a0,a0,UNIT(8)
/*
 * Here we have less than 32(64) bytes to copy.  Set up for a loop to
 * copy one word (or double word) at a time.
 */
L(ua_chk1w):
	andi	a2,t8,(NSIZE-1)		/* a2 is the remainder past one (d)word chunks */
	beq	a2,t8,L(ua_smallCopy)
	PTR_SUBU a3,t8,a2		/* a3 is count of bytes in one (d)word chunks */
	PTR_ADDU a3,a0,a3		/* a3 is the dst address after loop */

/* copying in words (4-byte or 8-byte chunks) */
L(ua_wordCopy_loop):
	LDHI	v1,UNIT(0)(a1)
	LDLO	v1,UNITM1(1)(a1)
	PTR_ADDIU a1,a1,UNIT(1)
	PTR_ADDIU a0,a0,UNIT(1)
	bne	a0,a3,L(ua_wordCopy_loop)
	ST	v1,UNIT(-1)(a0)		/* delay slot: store the (d)word */

/* Copy the last 8 (or 16) bytes */
L(ua_smallCopy):
	beqz	a2,L(leave)
	PTR_ADDU a3,a0,a2		/* a3 is the last dst address */
L(ua_smallCopy_loop):
	lb	v1,0(a1)		/* byte-at-a-time tail copy */
	PTR_ADDIU a1,a1,1
	PTR_ADDIU a0,a0,1
	bne	a0,a3,L(ua_smallCopy_loop)
	sb	v1,-1(a0)		/* delay slot: store the byte */

	j	ra
	nop
	.set	at
	.set	reorder
END(MEMCPY_NAME)
/************************************************************************
* Implementation : Static functions
************************************************************************/
^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [PATCH] Optimize MIPS memcpy
2012-09-04 15:09 ` Steve Ellcey
@ 2012-09-04 15:14 ` Carlos O'Donell
2012-09-04 17:03 ` Steve Ellcey
2012-09-05 0:43 ` Maxim Kuvyrkov
1 sibling, 1 reply; 50+ messages in thread
From: Carlos O'Donell @ 2012-09-04 15:14 UTC (permalink / raw)
To: Steve Ellcey; +Cc: Andrew T Pinski, Maxim Kuvyrkov, Joseph S. Myers, libc-ports
On 9/4/2012 11:09 AM, Steve Ellcey wrote:
> On Mon, 2012-09-03 at 02:12 -0700, Andrew T Pinski wrote:
>> Forgot to CC libc-ports@ .
>> On Sat, 2012-09-01 at 18:15 +1200, Maxim Kuvyrkov wrote:
>>> This patch improves MIPS assembly implementations of memcpy. Two optimizations are added:
>> prefetching of data for subsequent iterations of memcpy loop and pipelined expansion of unaligned
>> memcpy. These optimizations speed up MIPS memcpy by about 10%.
>>>
>>> The prefetching part is straightforward: it adds prefetching of a cache line (32 bytes) for +1
>> iteration for unaligned case and +2 iteration for aligned case. The rationale here is that it will
>> take prefetch to acquire data about same time as 1 iteration of unaligned loop or 2 iterations of aligned loop. Values for these parameters were tuned on a modern MIPS processor.
>>>
>>
>> This might hurt Octeon as the cache line size there is 128 bytes. Can
>> you say which modern MIPS processor which this has been tuned with? And
>> is there a way to not hard code 32 in the assembly but in a macro
>> instead.
>>
>> Thanks,
>> Andrew Pinski
>
> I've been looking at the MIPS memcpy and was planning on submitting a
> new version based on the one that MIPS submitted to Android. It has
> prefetching like Maxim's though I found that using the load and 'prepare
> for store' hints instead of 'load streaming' and 'store streaming' hints
> gave me better results on the 74k and 24k that I did performance testing
> on.
>
> This version has more unrolling too and between that and the hints
> difference I got a small performance improvement over Maxim's version
> when doing small memcpy's and a fairly substantial improvement on large
> memcpy's.
>
> I also merged the 32 and 64 bit versions together so we would only have
> one copy to maintain. I haven't tried building it as part of glibc yet,
> I have been testing it standalone first and was going to try and
> integrate it into glibc and submit it this week or next. I'll attach it
> to this email so folks can look at it and I will see if I can
> parameterize the cache line size. This one also assumes a 32 byte cache
> prefetch.
Exactly what benchmarks did you run to verify the performance gains?
The one thing I'd like to continue seeing is strong rationalization for
performance patches such that we have reproducible data in the event that
someone else comes along and wants to make a change.
For example see:
http://sourceware.org/glibc/wiki/benchmarking/results_2_17
and:
http://sourceware.org/glibc/wiki/benchmarking/benchmarks
Cheers,
Carlos.
--
Carlos O'Donell
Mentor Graphics / CodeSourcery
carlos_odonell@mentor.com
carlos@codesourcery.com
+1 (613) 963 1026
^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [PATCH] Optimize MIPS memcpy
2012-09-04 15:14 ` Carlos O'Donell
@ 2012-09-04 17:03 ` Steve Ellcey
2012-09-04 17:28 ` Carlos O'Donell
0 siblings, 1 reply; 50+ messages in thread
From: Steve Ellcey @ 2012-09-04 17:03 UTC (permalink / raw)
To: Carlos O'Donell
Cc: Andrew T Pinski, Maxim Kuvyrkov, Joseph S. Myers, libc-ports
[-- Attachment #1: Type: text/plain, Size: 865 bytes --]
> Exactly what benchmarks did you run to verify the performance gains?
>
> The one thing I'd like to continue seeing is strong rationalization for
> performance patches such that we have reproducible data in the event that
> someone else comes along and wants to make a change.
>
> For example see:
> http://sourceware.org/glibc/wiki/benchmarking/results_2_17
>
> and:
> http://sourceware.org/glibc/wiki/benchmarking/benchmarks
>
> Cheers,
> Carlos.
We had a few tests around here that I used and I wrote one of my own
too. I have attached my test; when using it with -UVERIFY and testing
on a 74K system, I got the following timings (32 bits, little-endian):
The FSF memcpy: 3m9.34s
Maxim's memcpy: 2m0.41s
My memcpy: 1m22.20s
If there are any official or recommended memcpy benchmarks I'd be happy
to try them as well.
Steve Ellcey
sellcey@mips.com
[-- Attachment #2: test_memcpy.c --]
[-- Type: text/x-csrc, Size: 1379 bytes --]
#include <string.h>
#include <stdio.h>
#define SIZE 1024*100
#define MAXCOPYSIZE 1024*50
#define MAXSRCOFFSET 13
#define MAXDSTOFFSET 18
#define SRCVAL(N) ((N+10000) % 13)
#define DSTVAL(N) ((N+20001) % 17)
char src[SIZE], dst[SIZE];
#ifndef MEMCPY_NAME
#define MEMCPY_NAME memcpy
#endif
extern void *MEMCPY_NAME(void *, const void *, size_t);
/* Run one memcpy and verify its effects.
   Re-initializes the global src[] and dst[] buffers to known, distinct
   patterns, copies `size` bytes from src[src_offset] to dst[dst_offset]
   via MEMCPY_NAME, then checks that:
     - src is completely unmodified,
     - dst outside [dst_offset, dst_offset + size) is unmodified,
     - dst inside that range matches the corresponding src bytes.
   Any mismatch is reported on stdout; the function returns nothing.  */
void
test(int src_offset, int dst_offset, int size)
{
	int i;

	/* (Re)fill both buffers with their reference patterns.  */
	for (i = 0; i < SIZE; i++) {
		src[i] = SRCVAL(i);
		dst[i] = DSTVAL(i);
	}

	MEMCPY_NAME(&dst[dst_offset], &src[src_offset], size);

	/* Verify every byte of both buffers against expectations.  */
	for (i = 0; i < SIZE; i++) {
		if (src[i] != SRCVAL(i))
			printf("FAIL, src got changed\n");
		if (i < dst_offset) {
			if (dst[i] != DSTVAL(i))
				printf("FAIL, dst got changed before it should be\n");
		} else if (i >= (dst_offset+size)) {
			if (dst[i] != DSTVAL(i))
				printf("FAIL, dst got changed after it should be (%d %d %d %d)\n", src_offset, dst_offset, size, i);
		} else {
			if (dst[i] != SRCVAL(i-dst_offset+src_offset)) {
				printf("FAIL, dst was not changed when it should be (%d %d %d %d)\n", src_offset, dst_offset, size, i);
			}
		}
	}
}
/* Exercise memcpy over a range of src/dst misalignments (offsets
   8 .. MAXSRCOFFSET-1 / MAXDSTOFFSET-1) combined with large copy sizes
   just below MAXCOPYSIZE, so aligned, unaligned and tail paths are all
   hit.  Failures are reported by test(); exit status is always 0.  */
int
main(void)
{
	int i, j, k;

	for (i = 8; i < MAXDSTOFFSET; i++)
		for (j = 8; j < MAXSRCOFFSET; j++)
			for (k = MAXCOPYSIZE-20; k < MAXCOPYSIZE; k++)
				test(i, j, k);

	return 0;
}
^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [PATCH] Optimize MIPS memcpy
2012-09-04 17:03 ` Steve Ellcey
@ 2012-09-04 17:28 ` Carlos O'Donell
0 siblings, 0 replies; 50+ messages in thread
From: Carlos O'Donell @ 2012-09-04 17:28 UTC (permalink / raw)
To: Steve Ellcey; +Cc: Andrew T Pinski, Maxim Kuvyrkov, Joseph S. Myers, libc-ports
On 9/4/2012 1:02 PM, Steve Ellcey wrote:
>
>> Exactly what benchmarks did you run to verify the performance gains?
>>
>> The one thing I'd like to continue seeing is strong rationalization for
>> performance patches such that we have reproducible data in the event that
>> someone else comes along and wants to make a change.
>>
>> For example see:
>> http://sourceware.org/glibc/wiki/benchmarking/results_2_17
>>
>> and:
>> http://sourceware.org/glibc/wiki/benchmarking/benchmarks
>>
>> Cheers,
>> Carlos.
>
> We had a few tests around here that I used and I wrote one of my own
> too. I have attached my test, when using it with -UVERIFY and testing
> on a 74K system I got the following timings (32 bits, little-endian):
>
> The FSF memcpy: 3m9.34s
> Maxim's memcpy: 2m0.41s
> My memcpy: 1m22.20s
>
> If there are any official or recommended memcpy benchmarks I'd be happy
> to try them as well.
There are none, but that's the kind of consensus we're trying to build by
documenting exactly which tests were used to benchmark which functions.
Thanks for posting the test sources!
Cheers,
Carlos.
--
Carlos O'Donell
Mentor Graphics / CodeSourcery
carlos_odonell@mentor.com
carlos@codesourcery.com
+1 (613) 963 1026
^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [PATCH] Optimize MIPS memcpy
2012-09-04 15:09 ` Steve Ellcey
2012-09-04 15:14 ` Carlos O'Donell
@ 2012-09-05 0:43 ` Maxim Kuvyrkov
2012-09-06 16:25 ` Steve Ellcey
1 sibling, 1 reply; 50+ messages in thread
From: Maxim Kuvyrkov @ 2012-09-05 0:43 UTC (permalink / raw)
To: Steve Ellcey; +Cc: Andrew T Pinski, Joseph S. Myers, libc-ports
On 5/09/2012, at 3:09 AM, Steve Ellcey wrote:
> On Mon, 2012-09-03 at 02:12 -0700, Andrew T Pinski wrote:
>> Forgot to CC libc-ports@ .
>> On Sat, 2012-09-01 at 18:15 +1200, Maxim Kuvyrkov wrote:
>>> This patch improves MIPS assembly implementations of memcpy. Two optimizations are added:
>> prefetching of data for subsequent iterations of memcpy loop and pipelined expansion of unaligned
>> memcpy. These optimizations speed up MIPS memcpy by about 10%.
>>>
>>> The prefetching part is straightforward: it adds prefetching of a cache line (32 bytes) for +1
>> iteration for unaligned case and +2 iteration for aligned case. The rationale here is that it will
>> take prefetch to acquire data about same time as 1 iteration of unaligned loop or 2 iterations of aligned loop. Values for these parameters were tuned on a modern MIPS processor.
>>>
>>
>> This might hurt Octeon as the cache line size there is 128 bytes. Can
>> you say which modern MIPS processor which this has been tuned with? And
>> is there a way to not hard code 32 in the assembly but in a macro
>> instead.
>>
>> Thanks,
>> Andrew Pinski
>
> I've been looking at the MIPS memcpy and was planning on submitting a
> new version based on the one that MIPS submitted to Android. It has
> prefetching like Maxim's though I found that using the load and 'prepare
> for store' hints instead of 'load streaming' and 'store streaming' hints
> gave me better results on the 74k and 24k that I did performance testing
> on.
I didn't experiment with various prefetching hints, so this very well may be the case.
>
> This version has more unrolling too and between that and the hints
> difference I got a small performance improvement over Maxim's version
> when doing small memcpy's and a fairly substantial improvement on large
> memcpy's.
>
> I also merged the 32 and 64 bit versions together so we would only have
> one copy to maintain. I haven't tried building it as part of glibc yet,
> I have been testing it standalone first and was going to try and
> integrate it into glibc and submit it this week or next. I'll attach it
> to this email so folks can look at it and I will see if I can
> parameterize the cache line size. This one also assumes a 32 byte cache
> prefetch.
>
Your version looks quite good. If you could wrap it up into a glibc patch I would test it on our setup to confirm that it indeed provides better performance.
Thanks,
--
Maxim Kuvyrkov
Mentor Graphics
^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [PATCH] Optimize MIPS memcpy
2012-09-05 0:43 ` Maxim Kuvyrkov
@ 2012-09-06 16:25 ` Steve Ellcey
2012-09-06 18:43 ` Roland McGrath
` (2 more replies)
0 siblings, 3 replies; 50+ messages in thread
From: Steve Ellcey @ 2012-09-06 16:25 UTC (permalink / raw)
To: Maxim Kuvyrkov; +Cc: Andrew T Pinski, Joseph S. Myers, libc-ports
[-- Attachment #1: Type: text/plain, Size: 553 bytes --]
On Wed, 2012-09-05 at 12:43 +1200, Maxim Kuvyrkov wrote:
> Your version looks quite good. If you could wrap it up into a glibc patch I would test it on our
> setup to confirm that it indeed provides better performance.
>
> Thanks,
>
> --
> Maxim Kuvyrkov
> Mentor Graphics
I have attached a glibc patch for my version of memcpy.
2012-09-06 Steve Ellcey <sellcey@mips.com>
* sysdeps/mips/memcpy.S: Add prefetching and more unrolling, make
it work in 32 or 64 bit modes.
* sysdeps/mips/mips64/memcpy.S: Remove.
Steve Ellcey
sellcey@mips.com
[-- Attachment #2: memcpy.patch --]
[-- Type: text/x-patch, Size: 24714 bytes --]
diff --git a/ports/sysdeps/mips/memcpy.S b/ports/sysdeps/mips/memcpy.S
index 753f67c..58d6e46 100644
--- a/ports/sysdeps/mips/memcpy.S
+++ b/ports/sysdeps/mips/memcpy.S
@@ -1,134 +1,573 @@
-/* Copyright (C) 2002-2012 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
- Contributed by Hartvig Ekner <hartvige@mips.com>, 2002.
+/* Copyright (C) 2012 Free Software Foundation, Inc.
+ * This file is part of the GNU C Library.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
+#ifdef __BIONIC__
+#include "machine/asm.h"
+#include "machine/regdef.h"
+#define ALLOW_OVERLAP
+#define USE_PREFETCH
+#else
+#ifdef _LIBC
+#include <sysdep.h>
+#define USE_PREFETCH
+#endif
+#include <regdef.h>
+#include <sys/asm.h>
+#if _MIPS_ISA == _MIPS_ISA_MIPS32 || _MIPS_ISA == _MIPS_ISA_MIPS64
+#define PREFETCH
+#endif
+#if _MIPS_SIM == _ABI64
+#define USE_DOUBLE
+#endif
+#endif
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library. If not, see
- <http://www.gnu.org/licenses/>. */
-#include <sysdep.h>
+/* Some asm.h files do not have the L macro definition. */
+#ifndef L
+#if _MIPS_SIM == _ABIO32
+# define L(label) $L ## label
+#else
+# define L(label) .L ## label
+#endif
+#endif
+
+/* Some regdef.h files deo not have the PTR_ADDIU macro definition. */
+#ifndef PTR_ADDIU
+#ifdef USE_DOUBLE
+#define PTR_ADDIU daddiu
+#else
+#define PTR_ADDIU addiu
+#endif
+#endif
-/* void *memcpy(void *s1, const void *s2, size_t n); */
+/*
+ * Using PREF_LOAD_STREAMED instead of PREF_LOAD on load prefetches offers
+ * a slight preformance advantage, using PREF_PREPAREFORSTORE instead of
+ * PREF_STORE_STREAMED or PREF_STORE offers a large performance advantage.
+ */
+#ifdef USE_PREFETCH
+# define PREF_LOAD 0
+# define PREF_STORE 1
+# define PREF_LOAD_STREAMED 4
+# define PREF_STORE_STREAMED 5
+# define PREF_LOAD_RETAINED 6
+# define PREF_STORE_RETAINED 7
+# define PREF_WRITEBACK_INVAL 25
+# define PREF_PREPAREFORSTORE 30
+
+/*
+ * We double everything when USE_DOUBLE is true so we do 2 prefetches to
+ * get 64 bytes in that case. The assumption is that each individual
+ * prefetch brings in 32 bytes.
+ */
+#ifdef USE_DOUBLE
+# define PREF_CHUNK 64
+# define PREFETCH_FOR_LOAD(chunk, reg) \
+ pref PREF_LOAD_STREAMED, (chunk)*32(reg); \
+ pref PREF_LOAD_STREAMED, ((chunk)+1)*32(reg)
+# define PREFETCH_FOR_STORE(chunk, reg) \
+ pref PREF_PREPAREFORSTORE, (chunk)*32(reg); \
+ pref PREF_PREPAREFORSTORE, ((chunk)+1)*32(reg)
+#else
+# define PREF_CHUNK 32
+# define PREFETCH_FOR_LOAD(chunk, reg) \
+ pref PREF_LOAD_STREAMED, (chunk)*32(reg)
+# define PREFETCH_FOR_STORE(chunk, reg) \
+ pref PREF_PREPAREFORSTORE, (chunk)*32(reg)
+#endif
+#define PREF_LIMIT (5 * PREF_CHUNK)
+#else
+# define PREFETCH_FOR_LOAD(offset, reg)
+# define PREFETCH_FOR_STORE(offset, reg)
+#endif
+
+/* Allow the routine to be named something else if desired. */
+#ifndef MEMCPY_NAME
+#define MEMCPY_NAME memcpy
+#endif
+
+/* We use these 32/64 bit registers as temporaries to do the copying. */
+#define REG0 t0
+#define REG1 t1
+#define REG2 t2
+#define REG3 t3
+#ifdef USE_DOUBLE
+# define REG4 ta0
+# define REG5 ta1
+# define REG6 ta2
+# define REG7 ta3
+#else
+# define REG4 t4
+# define REG5 t5
+# define REG6 t6
+# define REG7 t7
+#endif
+
+/* We load/store 64 bits at a time when USE_DOUBLE is true. */
+#ifdef USE_DOUBLE
+# define ST sd
+# define LD ld
#if __MIPSEB
-# define LWHI lwl /* high part is left in big-endian */
-# define SWHI swl /* high part is left in big-endian */
-# define LWLO lwr /* low part is right in big-endian */
-# define SWLO swr /* low part is right in big-endian */
+# define LDHI ldl /* high part is left in big-endian */
+# define STHI sdl /* high part is left in big-endian */
+# define LDLO ldr /* low part is right in big-endian */
+# define STLO sdr /* low part is right in big-endian */
#else
-# define LWHI lwr /* high part is right in little-endian */
-# define SWHI swr /* high part is right in little-endian */
-# define LWLO lwl /* low part is left in little-endian */
-# define SWLO swl /* low part is left in little-endian */
+# define LDHI ldr /* high part is right in little-endian */
+# define STHI sdr /* high part is right in little-endian */
+# define LDLO ldl /* low part is left in little-endian */
+# define STLO sdl /* low part is left in little-endian */
#endif
+#else
+# define ST sw
+# define LD lw
+#if __MIPSEB
+# define LDHI lwl /* high part is left in big-endian */
+# define STHI swl /* high part is left in big-endian */
+# define LDLO lwr /* low part is right in big-endian */
+# define STLO swr /* low part is right in big-endian */
+#else
+# define LDHI lwr /* high part is right in little-endian */
+# define STHI swr /* high part is right in little-endian */
+# define LDLO lwl /* low part is left in little-endian */
+# define STLO swl /* low part is left in little-endian */
+#endif
+#endif
+
+/* Bookkeeping values for 32 vs. 64 bit mode. */
+#ifdef USE_DOUBLE
+# define NSIZE 8
+# define NSIZEMASK 0x3f
+# define NSIZEDMASK 0x7f
+#else
+# define NSIZE 4
+# define NSIZEMASK 0x1f
+# define NSIZEDMASK 0x3f
+#endif
+#define UNIT(unit) ((unit)*NSIZE)
+#define UNITM1(unit) (((unit)*NSIZE)-1)
-ENTRY (memcpy)
+#ifdef __BIONIC__
+LEAF(MEMCPY_NAME, 0)
+#else
+LEAF(MEMCPY_NAME)
+#endif
+ .set nomips16
.set noreorder
+/*
+ * Below we handle the case where memcpy is called with overlapping src and dst.
+ * Although memcpy is not required to handle this case, some parts of Android
+ * like Skia rely on such usage. We call memmove to handle such cases.
+ */
+#ifdef ALLOW_OVERLAP
+ PTR_SUBU t0,a0,a1
+ PTR_SRA t2,t0,31
+ xor t1,t0,t2
+ PTR_SUBU t0,t1,t2
+ sltu t2,t0,a2
+ beq t2,zero,L(memcpy)
+ la t9,memmove
+ jr t9
+ nop
+L(memcpy):
+#endif
+/*
+ * If the size is less then 2*NSIZE (8 or 16), go to L(lastb). Regardless of
+ * size, copy dst pointer to v0 for the return value.
+ */
+ slti t2,a2,(2 * NSIZE)
+ bne t2,zero,L(lastb)
+ move v0,a0
+/*
+ * If src and dst have different alignments, go to L(unaligned), if they
+ * have the same alignment (but are not actually aligned) do a partial
+ * load/store to make them aligned. If they are both already aligned
+ * we can start copying at L(aligned).
+ */
+ xor t8,a1,a0
+ andi t8,t8,(NSIZE-1) /* t8 is a0/a1 word-displacement */
+ bne t8,zero,L(unaligned)
+ PTR_SUBU a3, zero, a0
+
+ andi a3,a3,(NSIZE-1) /* copy a3 bytes to align a0/a1 */
+ beq a3,zero,L(aligned) /* if a3=0, it is already aligned */
+ PTR_SUBU a2,a2,a3 /* a2 is the remining bytes count */
+
+ LDHI t8,0(a1)
+ PTR_ADDU a1,a1,a3
+ STHI t8,0(a0)
+ PTR_ADDU a0,a0,a3
+
+/*
+ * Now dst/src are both aligned to (word or double word) aligned addresses
+ * Set a2 to count how many bytes we have to copy after all the 64/128 byte
+ * chunks are copied and a3 to the dst pointer after all the 64/128 byte
+ * chunks have been copied. We will loop, incrementing a0 and a1 until a0
+ * equals a3.
+ */
+
+L(aligned):
+ andi t8,a2,NSIZEDMASK /* any whole 64-byte/128-byte chunks? */
+ beq a2,t8,L(chkw) /* if a2==t8, no 64-byte/128-byte chunks */
+ PTR_SUBU a3,a2,t8 /* subtract from a2 the reminder */
+ PTR_ADDU a3,a0,a3 /* Now a3 is the final dst after loop */
+
+/* When in the loop we exercise "pref 30,x(a0)", the a0+x should not be past
+ * the "t0-32" address. This means: for x=128 the last "safe" a0 address is
+ * "t0-160". Alternatively, for x=64 the last "safe" a0 address is "t0-96"
+ * In the current version we will use "pref 30,128(a0)", so "t0-160" is the
+ * limit
+ */
+#ifdef USE_PREFETCH
+ PTR_ADDU t0,a0,a2 /* t0 is the "past the end" address */
+ PTR_SUBU t9,t0,PREF_LIMIT /* t9 is the "last safe pref" address */
+ PREFETCH_FOR_LOAD (0, a1)
+ PREFETCH_FOR_LOAD (1, a1)
+ PREFETCH_FOR_LOAD (2, a1)
+ PREFETCH_FOR_STORE (1, a0)
+ sltu v1,t9,a0 /* If a0 > t9 don't use next prefetch */
+ bgtz v1,L(loop16w)
+ nop
+#endif
+ PREFETCH_FOR_STORE (2, a0)
+L(loop16w):
+ PREFETCH_FOR_LOAD (3, a1)
+ LD t0,UNIT(0)(a1)
+#ifdef USE_PREFETCH
+ bgtz v1,L(skip_pref30_96)
+#endif
+ LD t1,UNIT(1)(a1)
+ PREFETCH_FOR_STORE (3, a0)
+L(skip_pref30_96):
+ LD REG2,UNIT(2)(a1)
+ LD REG3,UNIT(3)(a1)
+ LD REG4,UNIT(4)(a1)
+ LD REG5,UNIT(5)(a1)
+ LD REG6,UNIT(6)(a1)
+ LD REG7,UNIT(7)(a1)
+ PREFETCH_FOR_LOAD (4, a1)
+
+ ST t0,UNIT(0)(a0)
+ ST t1,UNIT(1)(a0)
+ ST REG2,UNIT(2)(a0)
+ ST REG3,UNIT(3)(a0)
+ ST REG4,UNIT(4)(a0)
+ ST REG5,UNIT(5)(a0)
+ ST REG6,UNIT(6)(a0)
+ ST REG7,UNIT(7)(a0)
+
+ LD t0,UNIT(8)(a1)
+#ifdef USE_PREFETCH
+ bgtz v1,L(skip_pref30_128)
+#endif
+ LD t1,UNIT(9)(a1)
+ PREFETCH_FOR_STORE (4, a0)
+L(skip_pref30_128):
+ LD REG2,UNIT(10)(a1)
+ LD REG3,UNIT(11)(a1)
+ LD REG4,UNIT(12)(a1)
+ LD REG5,UNIT(13)(a1)
+ LD REG6,UNIT(14)(a1)
+ LD REG7,UNIT(15)(a1)
+ PREFETCH_FOR_LOAD (5, a1)
+ ST t0,UNIT(8)(a0)
+ ST t1,UNIT(9)(a0)
+ ST REG2,UNIT(10)(a0)
+ ST REG3,UNIT(11)(a0)
+ ST REG4,UNIT(12)(a0)
+ ST REG5,UNIT(13)(a0)
+ ST REG6,UNIT(14)(a0)
+ ST REG7,UNIT(15)(a0)
+ PTR_ADDIU a0,a0,UNIT(16) /* adding 64/128 to dest */
+#ifdef USE_PREFETCH
+ sltu v1,t9,a0
+#endif
+ bne a0,a3,L(loop16w)
+ PTR_ADDIU a1,a1,UNIT(16) /* adding 64/128 to src */
+ move a2,t8
+
+/* Here we have src and dest word-aligned but less than 64-bytes or
+ * 128 bytes to go. Check for a 32(64) byte chunk and copy if if there
+ * is one. Otherwise jump down to L(chk1w) to handle the tail end of
+ * the copy.
+ */
+
+L(chkw):
+ PREFETCH_FOR_LOAD (0, a1)
+ andi t8,a2,NSIZEMASK /* Is there a 32-byte/64-byte chunk. */
+ /* The t8 is the reminder count past 32-bytes */
+ beq a2,t8,L(chk1w) /* When a2=t8, no 32-byte chunk */
+ nop
+ LD t0,UNIT(0)(a1)
+ LD t1,UNIT(1)(a1)
+ LD REG2,UNIT(2)(a1)
+ LD REG3,UNIT(3)(a1)
+ LD REG4,UNIT(4)(a1)
+ LD REG5,UNIT(5)(a1)
+ LD REG6,UNIT(6)(a1)
+ LD REG7,UNIT(7)(a1)
+ PTR_ADDIU a1,a1,UNIT(8)
+ ST t0,UNIT(0)(a0)
+ ST t1,UNIT(1)(a0)
+ ST REG2,UNIT(2)(a0)
+ ST REG3,UNIT(3)(a0)
+ ST REG4,UNIT(4)(a0)
+ ST REG5,UNIT(5)(a0)
+ ST REG6,UNIT(6)(a0)
+ ST REG7,UNIT(7)(a0)
+ PTR_ADDIU a0,a0,UNIT(8)
+
+/*
+ * Here we have less then 32(64) bytes to copy. Set up for a loop to
+ * copy one word (or double word) at a time. Set a2 to count how many
+ * bytes we have to copy after all the word (or double word) chunks are
+ * copied and a3 to the dst pointer after all the (d)word chunks have
+ * been copied. We will loop, incrementing a0 and a1 until a0 equals a3.
+ */
+L(chk1w):
+ andi a2,t8,(NSIZE-1) /* a2 is the reminder past one (d)word chunks */
+ beq a2,t8,L(lastb)
+ PTR_SUBU a3,t8,a2 /* a3 is count of bytes in one (d)word chunks */
+ PTR_ADDU a3,a0,a3 /* a3 is the dst address after loop */
+
+/* copying in words (4-byte or 8-byte chunks) */
+L(wordCopy_loop):
+ LD REG3,UNIT(0)(a1)
+ PTR_ADDIU a1,a1,UNIT(1)
+ PTR_ADDIU a0,a0,UNIT(1)
+ bne a0,a3,L(wordCopy_loop)
+ ST REG3,UNIT(-1)(a0)
- slti t0, a2, 8 # Less than 8?
- bne t0, zero, L(last8)
- move v0, a0 # Setup exit value before too late
-
- xor t0, a1, a0 # Find a0/a1 displacement
- andi t0, 0x3
- bne t0, zero, L(shift) # Go handle the unaligned case
- subu t1, zero, a1
- andi t1, 0x3 # a0/a1 are aligned, but are we
- beq t1, zero, L(chk8w) # starting in the middle of a word?
- subu a2, t1
- LWHI t0, 0(a1) # Yes we are... take care of that
- addu a1, t1
- SWHI t0, 0(a0)
- addu a0, t1
-
-L(chk8w):
- andi t0, a2, 0x1f # 32 or more bytes left?
- beq t0, a2, L(chk1w)
- subu a3, a2, t0 # Yes
- addu a3, a1 # a3 = end address of loop
- move a2, t0 # a2 = what will be left after loop
-L(lop8w):
- lw t0, 0(a1) # Loop taking 8 words at a time
- lw t1, 4(a1)
- lw t2, 8(a1)
- lw t3, 12(a1)
- lw t4, 16(a1)
- lw t5, 20(a1)
- lw t6, 24(a1)
- lw t7, 28(a1)
- addiu a0, 32
- addiu a1, 32
- sw t0, -32(a0)
- sw t1, -28(a0)
- sw t2, -24(a0)
- sw t3, -20(a0)
- sw t4, -16(a0)
- sw t5, -12(a0)
- sw t6, -8(a0)
- bne a1, a3, L(lop8w)
- sw t7, -4(a0)
-
-L(chk1w):
- andi t0, a2, 0x3 # 4 or more bytes left?
- beq t0, a2, L(last8)
- subu a3, a2, t0 # Yes, handle them one word at a time
- addu a3, a1 # a3 again end address
- move a2, t0
-L(lop1w):
- lw t0, 0(a1)
- addiu a0, 4
- addiu a1, 4
- bne a1, a3, L(lop1w)
- sw t0, -4(a0)
-
-L(last8):
- blez a2, L(lst8e) # Handle last 8 bytes, one at a time
- addu a3, a2, a1
-L(lst8l):
- lb t0, 0(a1)
- addiu a0, 1
- addiu a1, 1
- bne a1, a3, L(lst8l)
- sb t0, -1(a0)
-L(lst8e):
- jr ra # Bye, bye
+/* Copy the last 8 (or 16) bytes */
+L(lastb):
+ blez a2,L(leave)
+ PTR_ADDU a3,a0,a2 /* a3 is the last dst address */
+L(lastbloop):
+ lb v1,0(a1)
+ PTR_ADDIU a1,a1,1
+ PTR_ADDIU a0,a0,1
+ bne a0,a3,L(lastbloop)
+ sb v1,-1(a0)
+L(leave):
+ j ra
nop
+/*
+ * UNALIGNED case, got here with a3 = "negu a0"
+ * This code is nearly identical to the aligned code above
+ * but only the destination (not the source) gets aligned
+ * so we need to do partial loads of the source followed
+ * by normal stores to the destination (once we have aligned
+ * the destination).
+ */
-L(shift):
- subu a3, zero, a0 # Src and Dest unaligned
- andi a3, 0x3 # (unoptimized case...)
- beq a3, zero, L(shft1)
- subu a2, a3 # a2 = bytes left
- LWHI t0, 0(a1) # Take care of first odd part
- LWLO t0, 3(a1)
- addu a1, a3
- SWHI t0, 0(a0)
- addu a0, a3
-L(shft1):
- andi t0, a2, 0x3
- subu a3, a2, t0
- addu a3, a1
-L(shfth):
- LWHI t1, 0(a1) # Limp through, word by word
- LWLO t1, 3(a1)
- addiu a0, 4
- addiu a1, 4
- bne a1, a3, L(shfth)
- sw t1, -4(a0)
- b L(last8) # Handle anything which may be left
- move a2, t0
+L(unaligned):
+ andi a3,a3,(NSIZE-1) /* copy a3 bytes to align a0/a1 */
+ beqz a3,L(ua_chk16w) /* if a3=0, it is already aligned */
+ PTR_SUBU a2,a2,a3 /* a2 is the remining bytes count */
+ LDHI v1,UNIT(0)(a1)
+ LDLO v1,UNITM1(1)(a1)
+ PTR_ADDU a1,a1,a3
+ STHI v1,UNIT(0)(a0)
+ PTR_ADDU a0,a0,a3
+
+/*
+ * Now the destination (but not the source) is aligned
+ * Set a2 to count how many bytes we have to copy after all the 64/128 byte
+ * chunks are copied and a3 to the dst pointer after all the 64/128 byte
+ * chunks have been copied. We will loop, incrementing a0 and a1 until a0
+ * equals a3.
+ */
+
+L(ua_chk16w):
+ andi t8,a2,NSIZEDMASK /* any whole 64-byte/128-byte chunks? */
+ beq a2,t8,L(ua_chkw) /* if a2==t8, no 64-byte/128-byte chunks */
+ PTR_SUBU a3,a2,t8 /* subtract from a2 the reminder */
+ PTR_ADDU a3,a0,a3 /* Now a3 is the final dst after loop */
+
+#ifdef USE_PREFETCH
+ PTR_ADDU t0,a0,a2 /* t0 is the "past the end" address */
+ PTR_SUBU t9,t0,PREF_LIMIT /* t9 is the "last safe pref" address */
+ PREFETCH_FOR_LOAD (0, a1)
+ PREFETCH_FOR_LOAD (1, a1)
+ PREFETCH_FOR_LOAD (2, a1)
+ PREFETCH_FOR_STORE (1, a0)
+ sltu v1,t9,a0
+ bgtz v1,L(ua_loop16w) /* skip prefetch for too short arrays */
+ nop
+#endif
+ PREFETCH_FOR_STORE (2, a0)
+L(ua_loop16w):
+ PREFETCH_FOR_LOAD (3, a1)
+ LDHI t0,UNIT(0)(a1)
+ LDLO t0,UNITM1(1)(a1)
+ LDHI t1,UNIT(1)(a1)
+#ifdef USE_PREFETCH
+ bgtz v1,L(ua_skip_pref30_96)
+#endif
+ LDLO t1,UNITM1(2)(a1)
+ PREFETCH_FOR_STORE (3, a0)
+L(ua_skip_pref30_96):
+ LDHI REG2,UNIT(2)(a1)
+ LDLO REG2,UNITM1(3)(a1)
+ LDHI REG3,UNIT(3)(a1)
+ LDLO REG3,UNITM1(4)(a1)
+ LDHI REG4,UNIT(4)(a1)
+ LDLO REG4,UNITM1(5)(a1)
+ LDHI REG5,UNIT(5)(a1)
+ LDLO REG5,UNITM1(6)(a1)
+ LDHI REG6,UNIT(6)(a1)
+ LDLO REG6,UNITM1(7)(a1)
+ LDHI REG7,UNIT(7)(a1)
+ LDLO REG7,UNITM1(8)(a1)
+ PREFETCH_FOR_LOAD (4, a1)
+ ST t0,UNIT(0)(a0)
+ ST t1,UNIT(1)(a0)
+ ST REG2,UNIT(2)(a0)
+ ST REG3,UNIT(3)(a0)
+ ST REG4,UNIT(4)(a0)
+ ST REG5,UNIT(5)(a0)
+ ST REG6,UNIT(6)(a0)
+ ST REG7,UNIT(7)(a0)
+ LDHI t0,UNIT(8)(a1)
+ LDLO t0,UNITM1(9)(a1)
+ LDHI t1,UNIT(9)(a1)
+#ifdef USE_PREFETCH
+ bgtz v1,L(ua_skip_pref30_128)
+#endif
+ LDLO t1,UNITM1(10)(a1)
+ PREFETCH_FOR_STORE (4, a0)
+L(ua_skip_pref30_128):
+ LDHI REG2,UNIT(10)(a1)
+ LDLO REG2,UNITM1(11)(a1)
+ LDHI REG3,UNIT(11)(a1)
+ LDLO REG3,UNITM1(12)(a1)
+ LDHI REG4,UNIT(12)(a1)
+ LDLO REG4,UNITM1(13)(a1)
+ LDHI REG5,UNIT(13)(a1)
+ LDLO REG5,UNITM1(14)(a1)
+ LDHI REG6,UNIT(14)(a1)
+ LDLO REG6,UNITM1(15)(a1)
+ LDHI REG7,UNIT(15)(a1)
+ LDLO REG7,UNITM1(16)(a1)
+ PREFETCH_FOR_LOAD (5, a1)
+ ST t0,UNIT(8)(a0)
+ ST t1,UNIT(9)(a0)
+ ST REG2,UNIT(10)(a0)
+ ST REG3,UNIT(11)(a0)
+ ST REG4,UNIT(12)(a0)
+ ST REG5,UNIT(13)(a0)
+ ST REG6,UNIT(14)(a0)
+ ST REG7,UNIT(15)(a0)
+ PTR_ADDIU a0,a0,UNIT(16) /* adding 64/128 to dest */
+#ifdef USE_PREFETCH
+ sltu v1,t9,a0
+#endif
+ bne a0,a3,L(ua_loop16w)
+ PTR_ADDIU a1,a1,UNIT(16) /* adding 64/128 to src */
+ move a2,t8
+
+/* Here we have src and dest word-aligned but less than 64-bytes or
+ * 128 bytes to go. Check for a 32(64) byte chunk and copy if if there
+ * is one. Otherwise jump down to L(ua_chk1w) to handle the tail end of
+ * the copy. */
+
+L(ua_chkw):
+ PREFETCH_FOR_LOAD (0, a1)
+ andi t8,a2,NSIZEMASK /* Is there a 32-byte/64-byte chunk. */
+ /* t8 is the reminder count past 32-bytes */
+ beq a2,t8,L(ua_chk1w) /* When a2=t8, no 32-byte chunk */
+ nop
+ LDHI t0,UNIT(0)(a1)
+ LDLO t0,UNITM1(1)(a1)
+ LDHI t1,UNIT(1)(a1)
+ LDLO t1,UNITM1(2)(a1)
+ LDHI REG2,UNIT(2)(a1)
+ LDLO REG2,UNITM1(3)(a1)
+ LDHI REG3,UNIT(3)(a1)
+ LDLO REG3,UNITM1(4)(a1)
+ LDHI REG4,UNIT(4)(a1)
+ LDLO REG4,UNITM1(5)(a1)
+ LDHI REG5,UNIT(5)(a1)
+ LDLO REG5,UNITM1(6)(a1)
+ LDHI REG6,UNIT(6)(a1)
+ LDLO REG6,UNITM1(7)(a1)
+ LDHI REG7,UNIT(7)(a1)
+ LDLO REG7,UNITM1(8)(a1)
+ PTR_ADDIU a1,a1,UNIT(8)
+ ST t0,UNIT(0)(a0)
+ ST t1,UNIT(1)(a0)
+ ST REG2,UNIT(2)(a0)
+ ST REG3,UNIT(3)(a0)
+ ST REG4,UNIT(4)(a0)
+ ST REG5,UNIT(5)(a0)
+ ST REG6,UNIT(6)(a0)
+ ST REG7,UNIT(7)(a0)
+ PTR_ADDIU a0,a0,UNIT(8)
+/*
+ * Here we have less then 32(64) bytes to copy. Set up for a loop to
+ * copy one word (or double word) at a time.
+ */
+L(ua_chk1w):
+ andi a2,t8,(NSIZE-1) /* a2 is the reminder past one (d)word chunks */
+ beq a2,t8,L(ua_smallCopy)
+ PTR_SUBU a3,t8,a2 /* a3 is count of bytes in one (d)word chunks */
+ PTR_ADDU a3,a0,a3 /* a3 is the dst address after loop */
+
+/* copying in words (4-byte or 8-byte chunks) */
+L(ua_wordCopy_loop):
+ LDHI v1,UNIT(0)(a1)
+ LDLO v1,UNITM1(1)(a1)
+ PTR_ADDIU a1,a1,UNIT(1)
+ PTR_ADDIU a0,a0,UNIT(1)
+ bne a0,a3,L(ua_wordCopy_loop)
+ ST v1,UNIT(-1)(a0)
+
+/* Copy the last 8 (or 16) bytes */
+L(ua_smallCopy):
+ beqz a2,L(leave)
+ PTR_ADDU a3,a0,a2 /* a3 is the last dst address */
+L(ua_smallCopy_loop):
+ lb v1,0(a1)
+ PTR_ADDIU a1,a1,1
+ PTR_ADDIU a0,a0,1
+ bne a0,a3,L(ua_smallCopy_loop)
+ sb v1,-1(a0)
+
+ j ra
+ nop
+
+ .set at
.set reorder
-END (memcpy)
-libc_hidden_builtin_def (memcpy)
+END(MEMCPY_NAME)
+#ifdef _LIBC
+libc_hidden_builtin_def (MEMCPY_NAME)
+#endif
diff --git a/ports/sysdeps/mips/mips64/memcpy.S b/ports/sysdeps/mips/mips64/memcpy.S
deleted file mode 100644
index 49ef34d..0000000
--- a/ports/sysdeps/mips/mips64/memcpy.S
+++ /dev/null
@@ -1,138 +0,0 @@
-/* Copyright (C) 2002-2012 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
- Contributed by Hartvig Ekner <hartvige@mips.com>, 2002.
- Ported to mips3 n32/n64 by Alexandre Oliva <aoliva@redhat.com>
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library. If not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-#include <sys/asm.h>
-
-
-/* void *memcpy(void *s1, const void *s2, size_t n);
-
- This could probably be optimized further. */
-
-#if __MIPSEB
-# define LDHI ldl /* high part is left in big-endian */
-# define SDHI sdl /* high part is left in big-endian */
-# define LDLO ldr /* low part is right in big-endian */
-# define SDLO sdr /* low part is right in big-endian */
-#else
-# define LDHI ldr /* high part is right in little-endian */
-# define SDHI sdr /* high part is right in little-endian */
-# define LDLO ldl /* low part is left in little-endian */
-# define SDLO sdl /* low part is left in little-endian */
-#endif
-
-ENTRY (memcpy)
- .set noreorder
-
- slti t0, a2, 16 # Less than 16?
- bne t0, zero, L(last16)
- move v0, a0 # Setup exit value before too late
-
- xor t0, a1, a0 # Find a0/a1 displacement
- andi t0, 0x7
- bne t0, zero, L(shift) # Go handle the unaligned case
- PTR_SUBU t1, zero, a1
- andi t1, 0x7 # a0/a1 are aligned, but are we
- beq t1, zero, L(chk8w) # starting in the middle of a word?
- PTR_SUBU a2, t1
- LDHI t0, 0(a1) # Yes we are... take care of that
- PTR_ADDU a1, t1
- SDHI t0, 0(a0)
- PTR_ADDU a0, t1
-
-L(chk8w):
- andi t0, a2, 0x3f # 64 or more bytes left?
- beq t0, a2, L(chk1w)
- PTR_SUBU a3, a2, t0 # Yes
- PTR_ADDU a3, a1 # a3 = end address of loop
- move a2, t0 # a2 = what will be left after loop
-L(lop8w):
- ld t0, 0(a1) # Loop taking 8 words at a time
- ld t1, 8(a1)
- ld t2, 16(a1)
- ld t3, 24(a1)
- ld ta0, 32(a1)
- ld ta1, 40(a1)
- ld ta2, 48(a1)
- ld ta3, 56(a1)
- PTR_ADDIU a0, 64
- PTR_ADDIU a1, 64
- sd t0, -64(a0)
- sd t1, -56(a0)
- sd t2, -48(a0)
- sd t3, -40(a0)
- sd ta0, -32(a0)
- sd ta1, -24(a0)
- sd ta2, -16(a0)
- bne a1, a3, L(lop8w)
- sd ta3, -8(a0)
-
-L(chk1w):
- andi t0, a2, 0x7 # 8 or more bytes left?
- beq t0, a2, L(last16)
- PTR_SUBU a3, a2, t0 # Yes, handle them one dword at a time
- PTR_ADDU a3, a1 # a3 again end address
- move a2, t0
-L(lop1w):
- ld t0, 0(a1)
- PTR_ADDIU a0, 8
- PTR_ADDIU a1, 8
- bne a1, a3, L(lop1w)
- sd t0, -8(a0)
-
-L(last16):
- blez a2, L(lst16e) # Handle last 16 bytes, one at a time
- PTR_ADDU a3, a2, a1
-L(lst16l):
- lb t0, 0(a1)
- PTR_ADDIU a0, 1
- PTR_ADDIU a1, 1
- bne a1, a3, L(lst16l)
- sb t0, -1(a0)
-L(lst16e):
- jr ra # Bye, bye
- nop
-
-L(shift):
- PTR_SUBU a3, zero, a0 # Src and Dest unaligned
- andi a3, 0x7 # (unoptimized case...)
- beq a3, zero, L(shft1)
- PTR_SUBU a2, a3 # a2 = bytes left
- LDHI t0, 0(a1) # Take care of first odd part
- LDLO t0, 7(a1)
- PTR_ADDU a1, a3
- SDHI t0, 0(a0)
- PTR_ADDU a0, a3
-L(shft1):
- andi t0, a2, 0x7
- PTR_SUBU a3, a2, t0
- PTR_ADDU a3, a1
-L(shfth):
- LDHI t1, 0(a1) # Limp through, dword by dword
- LDLO t1, 7(a1)
- PTR_ADDIU a0, 8
- PTR_ADDIU a1, 8
- bne a1, a3, L(shfth)
- sd t1, -8(a0)
- b L(last16) # Handle anything which may be left
- move a2, t0
-
- .set reorder
-END (memcpy)
-libc_hidden_builtin_def (memcpy)
^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [PATCH] Optimize MIPS memcpy
2012-09-06 16:25 ` Steve Ellcey
@ 2012-09-06 18:43 ` Roland McGrath
2012-09-06 19:37 ` Steve Ellcey
2012-09-07 21:24 ` Maxim Kuvyrkov
2012-09-11 4:35 ` Maxim Kuvyrkov
2 siblings, 1 reply; 50+ messages in thread
From: Roland McGrath @ 2012-09-06 18:43 UTC (permalink / raw)
To: Steve Ellcey
Cc: Maxim Kuvyrkov, Andrew T Pinski, Joseph S. Myers, libc-ports
If you are contributing code to the GNU C library, then its copyright
terms must not be changed. Your patch left FSF as the copyright owner
but changed the terms, which doesn't make sense at all. We cannot
accept code that has not had its copyright assigned to the FSF. If
you and your employer have not already done the assignment paperwork,
we need that first. As the copyright owner, FSF will choose the exact
copyright terms, which will be the same ones used for the rest of the
library code.
Thanks,
Roland
^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [PATCH] Optimize MIPS memcpy
2012-09-06 18:43 ` Roland McGrath
@ 2012-09-06 19:37 ` Steve Ellcey
0 siblings, 0 replies; 50+ messages in thread
From: Steve Ellcey @ 2012-09-06 19:37 UTC (permalink / raw)
To: Roland McGrath
Cc: Maxim Kuvyrkov, Andrew T Pinski, Joseph S. Myers, libc-ports
On Thu, 2012-09-06 at 11:43 -0700, Roland McGrath wrote:
> If you are contributing code to the GNU C library, then its copyright
> terms must not be changed. Your patch left FSF as the copyright owner
> but changed the terms, which doesn't make sense at all. We cannot
> accept code that has not had its copyright assigned to the FSF. If
> you and your employer have not already done the assignment paperwork,
> we need that first. As the copyright owner, FSF will choose the exact
> copyright terms, which will be the same ones used for the rest of the
> library code.
>
>
> Thanks,
> Roland
Sorry about that. I guess if I had thought about it I would have
realized that just changing the owner to FSF and not changing the
actual notice was the wrong thing to do. I do have a copyright
assignment on file already so that shouldn't be a problem.
Steve Ellcey
sellcey@mips.com
^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [PATCH] Optimize MIPS memcpy
2012-09-06 16:25 ` Steve Ellcey
2012-09-06 18:43 ` Roland McGrath
@ 2012-09-07 21:24 ` Maxim Kuvyrkov
2012-09-11 4:35 ` Maxim Kuvyrkov
2 siblings, 0 replies; 50+ messages in thread
From: Maxim Kuvyrkov @ 2012-09-07 21:24 UTC (permalink / raw)
To: Steve Ellcey; +Cc: Andrew T Pinski, Joseph S. Myers, libc-ports
On 7/09/2012, at 4:25 AM, Steve Ellcey wrote:
> On Wed, 2012-09-05 at 12:43 +1200, Maxim Kuvyrkov wrote:
>
>> Your version looks quite good. If you could wrap it up into a glibc patch I would test it on our
>> setup to confirm that it indeed provides better performance.
>>
>> Thanks,
>>
>> --
>> Maxim Kuvyrkov
>> Mentor Graphics
>
> I have attached a glibc patch for my version of memcpy.
>
>
> 2012-09-06 Steve Ellcey <sellcey@mips.com>
>
> * sysdeps/mips/memcpy.S: Add prefetching and more unrolling, make
> it work in 32 or 64 bit modes.
> * sysdeps/mips/mips64/memcpy.S: Remove.
Thanks, I will benchmark it shortly.
Meanwhile, would you please test your memcpy implementation together with patch in http://sourceware.org/ml/libc-alpha/2012-09/msg00197.html to make sure your memcpy also can be used in memmove.
Thanks,
--
Maxim Kuvyrkov
CodeSourcery / Mentor Graphics
^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [PATCH] Optimize MIPS memcpy
2012-09-06 16:25 ` Steve Ellcey
2012-09-06 18:43 ` Roland McGrath
2012-09-07 21:24 ` Maxim Kuvyrkov
@ 2012-09-11 4:35 ` Maxim Kuvyrkov
2012-09-11 15:18 ` Steve Ellcey
2 siblings, 1 reply; 50+ messages in thread
From: Maxim Kuvyrkov @ 2012-09-11 4:35 UTC (permalink / raw)
To: Steve Ellcey; +Cc: Andrew T Pinski, Joseph S. Myers, libc-ports
On 7/09/2012, at 4:25 AM, Steve Ellcey wrote:
> On Wed, 2012-09-05 at 12:43 +1200, Maxim Kuvyrkov wrote:
>
>> Your version looks quite good. If you could wrap it up into a glibc patch I would test it on our
>> setup to confirm that it indeed provides better performance.
>>
>> Thanks,
>>
>> --
>> Maxim Kuvyrkov
>> Mentor Graphics
>
> I have attached a glibc patch for my version of memcpy.
>
>
> 2012-09-06 Steve Ellcey <sellcey@mips.com>
>
> * sysdeps/mips/memcpy.S: Add prefetching and more unrolling, make
> it work in 32 or 64 bit modes.
> * sysdeps/mips/mips64/memcpy.S: Remove.
This fails to build for me at least for N32 ABI.
../ports/sysdeps/mips/memcpy.S: Assembler messages:
../ports/sysdeps/mips/memcpy.S:272: Error: Illegal operands `lw t4,((4)*4)($5)'
../ports/sysdeps/mips/memcpy.S:273: Error: Illegal operands `lw t5,((5)*4)($5)'
I guess the extra parenthesis screw up assembler syntax.
--
Maxim Kuvyrkov
CodeSourcery / Mentor Graphics
^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [PATCH] Optimize MIPS memcpy
2012-09-11 4:35 ` Maxim Kuvyrkov
@ 2012-09-11 15:18 ` Steve Ellcey
2012-09-20 9:05 ` Maxim Kuvyrkov
0 siblings, 1 reply; 50+ messages in thread
From: Steve Ellcey @ 2012-09-11 15:18 UTC (permalink / raw)
To: Maxim Kuvyrkov; +Cc: Andrew T Pinski, Joseph S. Myers, libc-ports
[-- Attachment #1: Type: text/plain, Size: 985 bytes --]
On Tue, 2012-09-11 at 16:34 +1200, Maxim Kuvyrkov wrote:
> > 2012-09-06 Steve Ellcey <sellcey@mips.com>
> >
> > * sysdeps/mips/memcpy.S: Add prefetching and more unrolling, make
> > it work in 32 or 64 bit modes.
> > * sysdeps/mips/mips64/memcpy.S: Remove.
>
> This fails to build for me at least for N32 ABI.
>
> ../ports/sysdeps/mips/memcpy.S: Assembler messages:
> ../ports/sysdeps/mips/memcpy.S:272: Error: Illegal operands `lw t4,((4)*4)($5)'
> ../ports/sysdeps/mips/memcpy.S:273: Error: Illegal operands `lw t5,((5)*4)($5)'
>
> I guess the extra parenthesis screw up assembler syntax.
>
> --
> Maxim Kuvyrkov
> CodeSourcery / Mentor Graphics
It is not the parenthesis, it is the use of t4, t5, t6, and t7 instead
of ta0, ta1, ta2, and ta3. I use the t[4567] for 32 bit mode but I
guess I want to use ta[0123] for N32 ABI mode as well as for the 64 bit
mode. Here is a new version with this change and with a fixed copyright
notice.
Steve Ellcey
sellcey@mips.com
[-- Attachment #2: memcpy.patch --]
[-- Type: text/x-patch, Size: 22793 bytes --]
diff --git a/ports/sysdeps/mips/memcpy.S b/ports/sysdeps/mips/memcpy.S
index 753f67c..2e31946 100644
--- a/ports/sysdeps/mips/memcpy.S
+++ b/ports/sysdeps/mips/memcpy.S
@@ -1,7 +1,8 @@
-/* Copyright (C) 2002-2012 Free Software Foundation, Inc.
+/* Copyright (C) 2012 Free Software Foundation, Inc.
This file is part of the GNU C Library.
- Contributed by Hartvig Ekner <hartvige@mips.com>, 2002.
-
+
+ Contributed by MIPS Technologies, Inc.
+
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
@@ -16,119 +17,548 @@
License along with the GNU C Library. If not, see
<http://www.gnu.org/licenses/>. */
+#ifdef __BIONIC__
+#include "machine/asm.h"
+#include "machine/regdef.h"
+#define ALLOW_OVERLAP
+#define USE_PREFETCH
+#else
+#ifdef _LIBC
#include <sysdep.h>
+#define USE_PREFETCH
+#endif
+#include <regdef.h>
+#include <sys/asm.h>
+#if _MIPS_ISA == _MIPS_ISA_MIPS32 || _MIPS_ISA == _MIPS_ISA_MIPS64
+#define PREFETCH
+#endif
+#if _MIPS_SIM == _ABI64
+#define USE_DOUBLE
+#endif
+#endif
+
+
+
+/* Some asm.h files do not have the L macro definition. */
+#ifndef L
+#if _MIPS_SIM == _ABIO32
+# define L(label) $L ## label
+#else
+# define L(label) .L ## label
+#endif
+#endif
+
+/* Some regdef.h files do not have the PTR_ADDIU macro definition. */
+#ifndef PTR_ADDIU
+#ifdef USE_DOUBLE
+#define PTR_ADDIU daddiu
+#else
+#define PTR_ADDIU addiu
+#endif
+#endif
+
+
+/*
+ * Using PREF_LOAD_STREAMED instead of PREF_LOAD on load prefetches offers
+ * a slight preformance advantage, using PREF_PREPAREFORSTORE instead of
+ * PREF_STORE_STREAMED or PREF_STORE offers a large performance advantage.
+ */
+#ifdef USE_PREFETCH
+# define PREF_LOAD 0
+# define PREF_STORE 1
+# define PREF_LOAD_STREAMED 4
+# define PREF_STORE_STREAMED 5
+# define PREF_LOAD_RETAINED 6
+# define PREF_STORE_RETAINED 7
+# define PREF_WRITEBACK_INVAL 25
+# define PREF_PREPAREFORSTORE 30
-/* void *memcpy(void *s1, const void *s2, size_t n); */
+/*
+ * We double everything when USE_DOUBLE is true so we do 2 prefetches to
+ * get 64 bytes in that case. The assumption is that each individual
+ * prefetch brings in 32 bytes.
+ */
+#ifdef USE_DOUBLE
+# define PREF_CHUNK 64
+# define PREFETCH_FOR_LOAD(chunk, reg) \
+ pref PREF_LOAD_STREAMED, (chunk)*32(reg); \
+ pref PREF_LOAD_STREAMED, ((chunk)+1)*32(reg)
+# define PREFETCH_FOR_STORE(chunk, reg) \
+ pref PREF_PREPAREFORSTORE, (chunk)*32(reg); \
+ pref PREF_PREPAREFORSTORE, ((chunk)+1)*32(reg)
+#else
+# define PREF_CHUNK 32
+# define PREFETCH_FOR_LOAD(chunk, reg) \
+ pref PREF_LOAD_STREAMED, (chunk)*32(reg)
+# define PREFETCH_FOR_STORE(chunk, reg) \
+ pref PREF_PREPAREFORSTORE, (chunk)*32(reg)
+#endif
+#define PREF_LIMIT (5 * PREF_CHUNK)
+#else
+# define PREFETCH_FOR_LOAD(offset, reg)
+# define PREFETCH_FOR_STORE(offset, reg)
+#endif
+
+/* Allow the routine to be named something else if desired. */
+#ifndef MEMCPY_NAME
+#define MEMCPY_NAME memcpy
+#endif
+
+/* We use these 32/64 bit registers as temporaries to do the copying. */
+#define REG0 t0
+#define REG1 t1
+#define REG2 t2
+#define REG3 t3
+#if _MIPS_SIM == _ABIO32
+# define REG4 t4
+# define REG5 t5
+# define REG6 t6
+# define REG7 t7
+#else
+# define REG4 ta0
+# define REG5 ta1
+# define REG6 ta2
+# define REG7 ta3
+#endif
+/* We load/store 64 bits at a time when USE_DOUBLE is true. */
+#ifdef USE_DOUBLE
+# define ST sd
+# define LD ld
#if __MIPSEB
-# define LWHI lwl /* high part is left in big-endian */
-# define SWHI swl /* high part is left in big-endian */
-# define LWLO lwr /* low part is right in big-endian */
-# define SWLO swr /* low part is right in big-endian */
+# define LDHI ldl /* high part is left in big-endian */
+# define STHI sdl /* high part is left in big-endian */
+# define LDLO ldr /* low part is right in big-endian */
+# define STLO sdr /* low part is right in big-endian */
#else
-# define LWHI lwr /* high part is right in little-endian */
-# define SWHI swr /* high part is right in little-endian */
-# define LWLO lwl /* low part is left in little-endian */
-# define SWLO swl /* low part is left in little-endian */
+# define LDHI ldr /* high part is right in little-endian */
+# define STHI sdr /* high part is right in little-endian */
+# define LDLO ldl /* low part is left in little-endian */
+# define STLO sdl /* low part is left in little-endian */
+#endif
+#else
+# define ST sw
+# define LD lw
+#if __MIPSEB
+# define LDHI lwl /* high part is left in big-endian */
+# define STHI swl /* high part is left in big-endian */
+# define LDLO lwr /* low part is right in big-endian */
+# define STLO swr /* low part is right in big-endian */
+#else
+# define LDHI lwr /* high part is right in little-endian */
+# define STHI swr /* high part is right in little-endian */
+# define LDLO lwl /* low part is left in little-endian */
+# define STLO swl /* low part is left in little-endian */
+#endif
#endif
-ENTRY (memcpy)
+/* Bookkeeping values for 32 vs. 64 bit mode. */
+#ifdef USE_DOUBLE
+# define NSIZE 8
+# define NSIZEMASK 0x3f
+# define NSIZEDMASK 0x7f
+#else
+# define NSIZE 4
+# define NSIZEMASK 0x1f
+# define NSIZEDMASK 0x3f
+#endif
+#define UNIT(unit) ((unit)*NSIZE)
+#define UNITM1(unit) (((unit)*NSIZE)-1)
+
+#ifdef __BIONIC__
+LEAF(MEMCPY_NAME, 0)
+#else
+LEAF(MEMCPY_NAME)
+#endif
+ .set nomips16
.set noreorder
+/*
+ * Below we handle the case where memcpy is called with overlapping src and dst.
+ * Although memcpy is not required to handle this case, some parts of Android
+ * like Skia rely on such usage. We call memmove to handle such cases.
+ */
+#ifdef ALLOW_OVERLAP
+ PTR_SUBU t0,a0,a1
+ PTR_SRA t2,t0,31
+ xor t1,t0,t2
+ PTR_SUBU t0,t1,t2
+ sltu t2,t0,a2
+ beq t2,zero,L(memcpy)
+ la t9,memmove
+ jr t9
+ nop
+L(memcpy):
+#endif
+/*
+ * If the size is less than 2*NSIZE (8 or 16), go to L(lastb). Regardless of
+ * size, copy dst pointer to v0 for the return value.
+ */
+ slti t2,a2,(2 * NSIZE)
+ bne t2,zero,L(lastb)
+ move v0,a0
+/*
+ * If src and dst have different alignments, go to L(unaligned), if they
+ * have the same alignment (but are not actually aligned) do a partial
+ * load/store to make them aligned. If they are both already aligned
+ * we can start copying at L(aligned).
+ */
+ xor t8,a1,a0
+ andi t8,t8,(NSIZE-1) /* t8 is a0/a1 word-displacement */
+ bne t8,zero,L(unaligned)
+ PTR_SUBU a3, zero, a0
+
+ andi a3,a3,(NSIZE-1) /* copy a3 bytes to align a0/a1 */
+ beq a3,zero,L(aligned) /* if a3=0, it is already aligned */
+ PTR_SUBU a2,a2,a3 /* a2 is the remaining bytes count */
+
+ LDHI t8,0(a1)
+ PTR_ADDU a1,a1,a3
+ STHI t8,0(a0)
+ PTR_ADDU a0,a0,a3
+
+/*
+ * Now dst/src are both aligned to (word or double word) aligned addresses
+ * Set a2 to count how many bytes we have to copy after all the 64/128 byte
+ * chunks are copied and a3 to the dst pointer after all the 64/128 byte
+ * chunks have been copied. We will loop, incrementing a0 and a1 until a0
+ * equals a3.
+ */
+
+L(aligned):
+ andi t8,a2,NSIZEDMASK /* any whole 64-byte/128-byte chunks? */
+ beq a2,t8,L(chkw) /* if a2==t8, no 64-byte/128-byte chunks */
+ PTR_SUBU a3,a2,t8 /* subtract from a2 the remainder */
+ PTR_ADDU a3,a0,a3 /* Now a3 is the final dst after loop */
+
+/* When in the loop we exercise "pref 30,x(a0)", the a0+x should not be past
+ * the "t0-32" address. This means: for x=128 the last "safe" a0 address is
+ * "t0-160". Alternatively, for x=64 the last "safe" a0 address is "t0-96"
+ * In the current version we will use "pref 30,128(a0)", so "t0-160" is the
+ * limit
+ */
+#ifdef USE_PREFETCH
+ PTR_ADDU t0,a0,a2 /* t0 is the "past the end" address */
+ PTR_SUBU t9,t0,PREF_LIMIT /* t9 is the "last safe pref" address */
+ PREFETCH_FOR_LOAD (0, a1)
+ PREFETCH_FOR_LOAD (1, a1)
+ PREFETCH_FOR_LOAD (2, a1)
+ PREFETCH_FOR_STORE (1, a0)
+ sltu v1,t9,a0 /* If a0 > t9 don't use next prefetch */
+ bgtz v1,L(loop16w)
+ nop
+#endif
+ PREFETCH_FOR_STORE (2, a0)
+L(loop16w):
+ PREFETCH_FOR_LOAD (3, a1)
+ LD t0,UNIT(0)(a1)
+#ifdef USE_PREFETCH
+ bgtz v1,L(skip_pref30_96)
+#endif
+ LD t1,UNIT(1)(a1)
+ PREFETCH_FOR_STORE (3, a0)
+L(skip_pref30_96):
+ LD REG2,UNIT(2)(a1)
+ LD REG3,UNIT(3)(a1)
+ LD REG4,UNIT(4)(a1)
+ LD REG5,UNIT(5)(a1)
+ LD REG6,UNIT(6)(a1)
+ LD REG7,UNIT(7)(a1)
+ PREFETCH_FOR_LOAD (4, a1)
+
+ ST t0,UNIT(0)(a0)
+ ST t1,UNIT(1)(a0)
+ ST REG2,UNIT(2)(a0)
+ ST REG3,UNIT(3)(a0)
+ ST REG4,UNIT(4)(a0)
+ ST REG5,UNIT(5)(a0)
+ ST REG6,UNIT(6)(a0)
+ ST REG7,UNIT(7)(a0)
+
+ LD t0,UNIT(8)(a1)
+#ifdef USE_PREFETCH
+ bgtz v1,L(skip_pref30_128)
+#endif
+ LD t1,UNIT(9)(a1)
+ PREFETCH_FOR_STORE (4, a0)
+L(skip_pref30_128):
+ LD REG2,UNIT(10)(a1)
+ LD REG3,UNIT(11)(a1)
+ LD REG4,UNIT(12)(a1)
+ LD REG5,UNIT(13)(a1)
+ LD REG6,UNIT(14)(a1)
+ LD REG7,UNIT(15)(a1)
+ PREFETCH_FOR_LOAD (5, a1)
+ ST t0,UNIT(8)(a0)
+ ST t1,UNIT(9)(a0)
+ ST REG2,UNIT(10)(a0)
+ ST REG3,UNIT(11)(a0)
+ ST REG4,UNIT(12)(a0)
+ ST REG5,UNIT(13)(a0)
+ ST REG6,UNIT(14)(a0)
+ ST REG7,UNIT(15)(a0)
+ PTR_ADDIU a0,a0,UNIT(16) /* adding 64/128 to dest */
+#ifdef USE_PREFETCH
+ sltu v1,t9,a0
+#endif
+ bne a0,a3,L(loop16w)
+ PTR_ADDIU a1,a1,UNIT(16) /* adding 64/128 to src */
+ move a2,t8
+
+/* Here we have src and dest word-aligned but less than 64-bytes or
+ * 128 bytes to go. Check for a 32(64) byte chunk and copy it if there
+ * is one. Otherwise jump down to L(chk1w) to handle the tail end of
+ * the copy.
+ */
+
+L(chkw):
+ PREFETCH_FOR_LOAD (0, a1)
+ andi t8,a2,NSIZEMASK /* Is there a 32-byte/64-byte chunk. */
+ /* The t8 is the remainder count past 32-bytes */
+ beq a2,t8,L(chk1w) /* When a2=t8, no 32-byte chunk */
+ nop
+ LD t0,UNIT(0)(a1)
+ LD t1,UNIT(1)(a1)
+ LD REG2,UNIT(2)(a1)
+ LD REG3,UNIT(3)(a1)
+ LD REG4,UNIT(4)(a1)
+ LD REG5,UNIT(5)(a1)
+ LD REG6,UNIT(6)(a1)
+ LD REG7,UNIT(7)(a1)
+ PTR_ADDIU a1,a1,UNIT(8)
+ ST t0,UNIT(0)(a0)
+ ST t1,UNIT(1)(a0)
+ ST REG2,UNIT(2)(a0)
+ ST REG3,UNIT(3)(a0)
+ ST REG4,UNIT(4)(a0)
+ ST REG5,UNIT(5)(a0)
+ ST REG6,UNIT(6)(a0)
+ ST REG7,UNIT(7)(a0)
+ PTR_ADDIU a0,a0,UNIT(8)
+
+/*
+ * Here we have less than 32(64) bytes to copy. Set up for a loop to
+ * copy one word (or double word) at a time. Set a2 to count how many
+ * bytes we have to copy after all the word (or double word) chunks are
+ * copied and a3 to the dst pointer after all the (d)word chunks have
+ * been copied. We will loop, incrementing a0 and a1 until a0 equals a3.
+ */
+L(chk1w):
+ andi a2,t8,(NSIZE-1) /* a2 is the remainder past one (d)word chunks */
+ beq a2,t8,L(lastb)
+ PTR_SUBU a3,t8,a2 /* a3 is count of bytes in one (d)word chunks */
+ PTR_ADDU a3,a0,a3 /* a3 is the dst address after loop */
- slti t0, a2, 8 # Less than 8?
- bne t0, zero, L(last8)
- move v0, a0 # Setup exit value before too late
-
- xor t0, a1, a0 # Find a0/a1 displacement
- andi t0, 0x3
- bne t0, zero, L(shift) # Go handle the unaligned case
- subu t1, zero, a1
- andi t1, 0x3 # a0/a1 are aligned, but are we
- beq t1, zero, L(chk8w) # starting in the middle of a word?
- subu a2, t1
- LWHI t0, 0(a1) # Yes we are... take care of that
- addu a1, t1
- SWHI t0, 0(a0)
- addu a0, t1
-
-L(chk8w):
- andi t0, a2, 0x1f # 32 or more bytes left?
- beq t0, a2, L(chk1w)
- subu a3, a2, t0 # Yes
- addu a3, a1 # a3 = end address of loop
- move a2, t0 # a2 = what will be left after loop
-L(lop8w):
- lw t0, 0(a1) # Loop taking 8 words at a time
- lw t1, 4(a1)
- lw t2, 8(a1)
- lw t3, 12(a1)
- lw t4, 16(a1)
- lw t5, 20(a1)
- lw t6, 24(a1)
- lw t7, 28(a1)
- addiu a0, 32
- addiu a1, 32
- sw t0, -32(a0)
- sw t1, -28(a0)
- sw t2, -24(a0)
- sw t3, -20(a0)
- sw t4, -16(a0)
- sw t5, -12(a0)
- sw t6, -8(a0)
- bne a1, a3, L(lop8w)
- sw t7, -4(a0)
-
-L(chk1w):
- andi t0, a2, 0x3 # 4 or more bytes left?
- beq t0, a2, L(last8)
- subu a3, a2, t0 # Yes, handle them one word at a time
- addu a3, a1 # a3 again end address
- move a2, t0
-L(lop1w):
- lw t0, 0(a1)
- addiu a0, 4
- addiu a1, 4
- bne a1, a3, L(lop1w)
- sw t0, -4(a0)
-
-L(last8):
- blez a2, L(lst8e) # Handle last 8 bytes, one at a time
- addu a3, a2, a1
-L(lst8l):
- lb t0, 0(a1)
- addiu a0, 1
- addiu a1, 1
- bne a1, a3, L(lst8l)
- sb t0, -1(a0)
-L(lst8e):
- jr ra # Bye, bye
+/* copying in words (4-byte or 8-byte chunks) */
+L(wordCopy_loop):
+ LD REG3,UNIT(0)(a1)
+ PTR_ADDIU a1,a1,UNIT(1)
+ PTR_ADDIU a0,a0,UNIT(1)
+ bne a0,a3,L(wordCopy_loop)
+ ST REG3,UNIT(-1)(a0)
+
+/* Copy the last 8 (or 16) bytes */
+L(lastb):
+ blez a2,L(leave)
+ PTR_ADDU a3,a0,a2 /* a3 is the last dst address */
+L(lastbloop):
+ lb v1,0(a1)
+ PTR_ADDIU a1,a1,1
+ PTR_ADDIU a0,a0,1
+ bne a0,a3,L(lastbloop)
+ sb v1,-1(a0)
+L(leave):
+ j ra
+ nop
+/*
+ * UNALIGNED case, got here with a3 = "negu a0"
+ * This code is nearly identical to the aligned code above
+ * but only the destination (not the source) gets aligned
+ * so we need to do partial loads of the source followed
+ * by normal stores to the destination (once we have aligned
+ * the destination).
+ */
+
+L(unaligned):
+ andi a3,a3,(NSIZE-1) /* copy a3 bytes to align a0/a1 */
+ beqz a3,L(ua_chk16w) /* if a3=0, it is already aligned */
+ PTR_SUBU a2,a2,a3 /* a2 is the remaining bytes count */
+
+ LDHI v1,UNIT(0)(a1)
+ LDLO v1,UNITM1(1)(a1)
+ PTR_ADDU a1,a1,a3
+ STHI v1,UNIT(0)(a0)
+ PTR_ADDU a0,a0,a3
+
+/*
+ * Now the destination (but not the source) is aligned
+ * Set a2 to count how many bytes we have to copy after all the 64/128 byte
+ * chunks are copied and a3 to the dst pointer after all the 64/128 byte
+ * chunks have been copied. We will loop, incrementing a0 and a1 until a0
+ * equals a3.
+ */
+
+L(ua_chk16w):
+ andi t8,a2,NSIZEDMASK /* any whole 64-byte/128-byte chunks? */
+ beq a2,t8,L(ua_chkw) /* if a2==t8, no 64-byte/128-byte chunks */
+ PTR_SUBU a3,a2,t8 /* subtract from a2 the remainder */
+ PTR_ADDU a3,a0,a3 /* Now a3 is the final dst after loop */
+
+#ifdef USE_PREFETCH
+ PTR_ADDU t0,a0,a2 /* t0 is the "past the end" address */
+ PTR_SUBU t9,t0,PREF_LIMIT /* t9 is the "last safe pref" address */
+ PREFETCH_FOR_LOAD (0, a1)
+ PREFETCH_FOR_LOAD (1, a1)
+ PREFETCH_FOR_LOAD (2, a1)
+ PREFETCH_FOR_STORE (1, a0)
+ sltu v1,t9,a0
+ bgtz v1,L(ua_loop16w) /* skip prefetch for too short arrays */
+ nop
+#endif
+ PREFETCH_FOR_STORE (2, a0)
+L(ua_loop16w):
+ PREFETCH_FOR_LOAD (3, a1)
+ LDHI t0,UNIT(0)(a1)
+ LDLO t0,UNITM1(1)(a1)
+ LDHI t1,UNIT(1)(a1)
+#ifdef USE_PREFETCH
+ bgtz v1,L(ua_skip_pref30_96)
+#endif
+ LDLO t1,UNITM1(2)(a1)
+ PREFETCH_FOR_STORE (3, a0)
+L(ua_skip_pref30_96):
+ LDHI REG2,UNIT(2)(a1)
+ LDLO REG2,UNITM1(3)(a1)
+ LDHI REG3,UNIT(3)(a1)
+ LDLO REG3,UNITM1(4)(a1)
+ LDHI REG4,UNIT(4)(a1)
+ LDLO REG4,UNITM1(5)(a1)
+ LDHI REG5,UNIT(5)(a1)
+ LDLO REG5,UNITM1(6)(a1)
+ LDHI REG6,UNIT(6)(a1)
+ LDLO REG6,UNITM1(7)(a1)
+ LDHI REG7,UNIT(7)(a1)
+ LDLO REG7,UNITM1(8)(a1)
+ PREFETCH_FOR_LOAD (4, a1)
+ ST t0,UNIT(0)(a0)
+ ST t1,UNIT(1)(a0)
+ ST REG2,UNIT(2)(a0)
+ ST REG3,UNIT(3)(a0)
+ ST REG4,UNIT(4)(a0)
+ ST REG5,UNIT(5)(a0)
+ ST REG6,UNIT(6)(a0)
+ ST REG7,UNIT(7)(a0)
+ LDHI t0,UNIT(8)(a1)
+ LDLO t0,UNITM1(9)(a1)
+ LDHI t1,UNIT(9)(a1)
+#ifdef USE_PREFETCH
+ bgtz v1,L(ua_skip_pref30_128)
+#endif
+ LDLO t1,UNITM1(10)(a1)
+ PREFETCH_FOR_STORE (4, a0)
+L(ua_skip_pref30_128):
+ LDHI REG2,UNIT(10)(a1)
+ LDLO REG2,UNITM1(11)(a1)
+ LDHI REG3,UNIT(11)(a1)
+ LDLO REG3,UNITM1(12)(a1)
+ LDHI REG4,UNIT(12)(a1)
+ LDLO REG4,UNITM1(13)(a1)
+ LDHI REG5,UNIT(13)(a1)
+ LDLO REG5,UNITM1(14)(a1)
+ LDHI REG6,UNIT(14)(a1)
+ LDLO REG6,UNITM1(15)(a1)
+ LDHI REG7,UNIT(15)(a1)
+ LDLO REG7,UNITM1(16)(a1)
+ PREFETCH_FOR_LOAD (5, a1)
+ ST t0,UNIT(8)(a0)
+ ST t1,UNIT(9)(a0)
+ ST REG2,UNIT(10)(a0)
+ ST REG3,UNIT(11)(a0)
+ ST REG4,UNIT(12)(a0)
+ ST REG5,UNIT(13)(a0)
+ ST REG6,UNIT(14)(a0)
+ ST REG7,UNIT(15)(a0)
+ PTR_ADDIU a0,a0,UNIT(16) /* adding 64/128 to dest */
+#ifdef USE_PREFETCH
+ sltu v1,t9,a0
+#endif
+ bne a0,a3,L(ua_loop16w)
+ PTR_ADDIU a1,a1,UNIT(16) /* adding 64/128 to src */
+ move a2,t8
+
+/* Here we have src and dest word-aligned but less than 64-bytes or
+ * 128 bytes to go. Check for a 32(64) byte chunk and copy it if there
+ * is one. Otherwise jump down to L(ua_chk1w) to handle the tail end of
+ * the copy. */
+
+L(ua_chkw):
+ PREFETCH_FOR_LOAD (0, a1)
+ andi t8,a2,NSIZEMASK /* Is there a 32-byte/64-byte chunk. */
+ /* t8 is the remainder count past 32-bytes */
+ beq a2,t8,L(ua_chk1w) /* When a2=t8, no 32-byte chunk */
nop
+ LDHI t0,UNIT(0)(a1)
+ LDLO t0,UNITM1(1)(a1)
+ LDHI t1,UNIT(1)(a1)
+ LDLO t1,UNITM1(2)(a1)
+ LDHI REG2,UNIT(2)(a1)
+ LDLO REG2,UNITM1(3)(a1)
+ LDHI REG3,UNIT(3)(a1)
+ LDLO REG3,UNITM1(4)(a1)
+ LDHI REG4,UNIT(4)(a1)
+ LDLO REG4,UNITM1(5)(a1)
+ LDHI REG5,UNIT(5)(a1)
+ LDLO REG5,UNITM1(6)(a1)
+ LDHI REG6,UNIT(6)(a1)
+ LDLO REG6,UNITM1(7)(a1)
+ LDHI REG7,UNIT(7)(a1)
+ LDLO REG7,UNITM1(8)(a1)
+ PTR_ADDIU a1,a1,UNIT(8)
+ ST t0,UNIT(0)(a0)
+ ST t1,UNIT(1)(a0)
+ ST REG2,UNIT(2)(a0)
+ ST REG3,UNIT(3)(a0)
+ ST REG4,UNIT(4)(a0)
+ ST REG5,UNIT(5)(a0)
+ ST REG6,UNIT(6)(a0)
+ ST REG7,UNIT(7)(a0)
+ PTR_ADDIU a0,a0,UNIT(8)
+/*
+ * Here we have less than 32(64) bytes to copy.
+ * copy one word (or double word) at a time.
+ */
+L(ua_chk1w):
+ andi a2,t8,(NSIZE-1) /* a2 is the remainder past one (d)word chunks */
+ beq a2,t8,L(ua_smallCopy)
+ PTR_SUBU a3,t8,a2 /* a3 is count of bytes in one (d)word chunks */
+ PTR_ADDU a3,a0,a3 /* a3 is the dst address after loop */
-L(shift):
- subu a3, zero, a0 # Src and Dest unaligned
- andi a3, 0x3 # (unoptimized case...)
- beq a3, zero, L(shft1)
- subu a2, a3 # a2 = bytes left
- LWHI t0, 0(a1) # Take care of first odd part
- LWLO t0, 3(a1)
- addu a1, a3
- SWHI t0, 0(a0)
- addu a0, a3
-L(shft1):
- andi t0, a2, 0x3
- subu a3, a2, t0
- addu a3, a1
-L(shfth):
- LWHI t1, 0(a1) # Limp through, word by word
- LWLO t1, 3(a1)
- addiu a0, 4
- addiu a1, 4
- bne a1, a3, L(shfth)
- sw t1, -4(a0)
- b L(last8) # Handle anything which may be left
- move a2, t0
+/* copying in words (4-byte or 8-byte chunks) */
+L(ua_wordCopy_loop):
+ LDHI v1,UNIT(0)(a1)
+ LDLO v1,UNITM1(1)(a1)
+ PTR_ADDIU a1,a1,UNIT(1)
+ PTR_ADDIU a0,a0,UNIT(1)
+ bne a0,a3,L(ua_wordCopy_loop)
+ ST v1,UNIT(-1)(a0)
+/* Copy the last 8 (or 16) bytes */
+L(ua_smallCopy):
+ beqz a2,L(leave)
+ PTR_ADDU a3,a0,a2 /* a3 is the last dst address */
+L(ua_smallCopy_loop):
+ lb v1,0(a1)
+ PTR_ADDIU a1,a1,1
+ PTR_ADDIU a0,a0,1
+ bne a0,a3,L(ua_smallCopy_loop)
+ sb v1,-1(a0)
+
+ j ra
+ nop
+
+ .set at
.set reorder
-END (memcpy)
-libc_hidden_builtin_def (memcpy)
+END(MEMCPY_NAME)
+#ifdef _LIBC
+libc_hidden_builtin_def (MEMCPY_NAME)
+#endif
diff --git a/ports/sysdeps/mips/mips64/memcpy.S b/ports/sysdeps/mips/mips64/memcpy.S
deleted file mode 100644
index 49ef34d..0000000
--- a/ports/sysdeps/mips/mips64/memcpy.S
+++ /dev/null
@@ -1,138 +0,0 @@
-/* Copyright (C) 2002-2012 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
- Contributed by Hartvig Ekner <hartvige@mips.com>, 2002.
- Ported to mips3 n32/n64 by Alexandre Oliva <aoliva@redhat.com>
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library. If not, see
- <http://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-#include <sys/asm.h>
-
-
-/* void *memcpy(void *s1, const void *s2, size_t n);
-
- This could probably be optimized further. */
-
-#if __MIPSEB
-# define LDHI ldl /* high part is left in big-endian */
-# define SDHI sdl /* high part is left in big-endian */
-# define LDLO ldr /* low part is right in big-endian */
-# define SDLO sdr /* low part is right in big-endian */
-#else
-# define LDHI ldr /* high part is right in little-endian */
-# define SDHI sdr /* high part is right in little-endian */
-# define LDLO ldl /* low part is left in little-endian */
-# define SDLO sdl /* low part is left in little-endian */
-#endif
-
-ENTRY (memcpy)
- .set noreorder
-
- slti t0, a2, 16 # Less than 16?
- bne t0, zero, L(last16)
- move v0, a0 # Setup exit value before too late
-
- xor t0, a1, a0 # Find a0/a1 displacement
- andi t0, 0x7
- bne t0, zero, L(shift) # Go handle the unaligned case
- PTR_SUBU t1, zero, a1
- andi t1, 0x7 # a0/a1 are aligned, but are we
- beq t1, zero, L(chk8w) # starting in the middle of a word?
- PTR_SUBU a2, t1
- LDHI t0, 0(a1) # Yes we are... take care of that
- PTR_ADDU a1, t1
- SDHI t0, 0(a0)
- PTR_ADDU a0, t1
-
-L(chk8w):
- andi t0, a2, 0x3f # 64 or more bytes left?
- beq t0, a2, L(chk1w)
- PTR_SUBU a3, a2, t0 # Yes
- PTR_ADDU a3, a1 # a3 = end address of loop
- move a2, t0 # a2 = what will be left after loop
-L(lop8w):
- ld t0, 0(a1) # Loop taking 8 words at a time
- ld t1, 8(a1)
- ld t2, 16(a1)
- ld t3, 24(a1)
- ld ta0, 32(a1)
- ld ta1, 40(a1)
- ld ta2, 48(a1)
- ld ta3, 56(a1)
- PTR_ADDIU a0, 64
- PTR_ADDIU a1, 64
- sd t0, -64(a0)
- sd t1, -56(a0)
- sd t2, -48(a0)
- sd t3, -40(a0)
- sd ta0, -32(a0)
- sd ta1, -24(a0)
- sd ta2, -16(a0)
- bne a1, a3, L(lop8w)
- sd ta3, -8(a0)
-
-L(chk1w):
- andi t0, a2, 0x7 # 8 or more bytes left?
- beq t0, a2, L(last16)
- PTR_SUBU a3, a2, t0 # Yes, handle them one dword at a time
- PTR_ADDU a3, a1 # a3 again end address
- move a2, t0
-L(lop1w):
- ld t0, 0(a1)
- PTR_ADDIU a0, 8
- PTR_ADDIU a1, 8
- bne a1, a3, L(lop1w)
- sd t0, -8(a0)
-
-L(last16):
- blez a2, L(lst16e) # Handle last 16 bytes, one at a time
- PTR_ADDU a3, a2, a1
-L(lst16l):
- lb t0, 0(a1)
- PTR_ADDIU a0, 1
- PTR_ADDIU a1, 1
- bne a1, a3, L(lst16l)
- sb t0, -1(a0)
-L(lst16e):
- jr ra # Bye, bye
- nop
-
-L(shift):
- PTR_SUBU a3, zero, a0 # Src and Dest unaligned
- andi a3, 0x7 # (unoptimized case...)
- beq a3, zero, L(shft1)
- PTR_SUBU a2, a3 # a2 = bytes left
- LDHI t0, 0(a1) # Take care of first odd part
- LDLO t0, 7(a1)
- PTR_ADDU a1, a3
- SDHI t0, 0(a0)
- PTR_ADDU a0, a3
-L(shft1):
- andi t0, a2, 0x7
- PTR_SUBU a3, a2, t0
- PTR_ADDU a3, a1
-L(shfth):
- LDHI t1, 0(a1) # Limp through, dword by dword
- LDLO t1, 7(a1)
- PTR_ADDIU a0, 8
- PTR_ADDIU a1, 8
- bne a1, a3, L(shfth)
- sd t1, -8(a0)
- b L(last16) # Handle anything which may be left
- move a2, t0
-
- .set reorder
-END (memcpy)
-libc_hidden_builtin_def (memcpy)
^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [PATCH] Optimize MIPS memcpy
2012-09-11 15:18 ` Steve Ellcey
@ 2012-09-20 9:05 ` Maxim Kuvyrkov
2012-09-20 18:38 ` Steve Ellcey
2012-09-21 18:47 ` Steve Ellcey
0 siblings, 2 replies; 50+ messages in thread
From: Maxim Kuvyrkov @ 2012-09-20 9:05 UTC (permalink / raw)
To: Steve Ellcey; +Cc: Andrew T Pinski, Joseph S. Myers, libc-ports
On 12/09/2012, at 3:17 AM, Steve Ellcey wrote:
> On Tue, 2012-09-11 at 16:34 +1200, Maxim Kuvyrkov wrote:
>
>>> 2012-09-06 Steve Ellcey <sellcey@mips.com>
>>>
>>> * sysdeps/mips/memcpy.S: Add prefetching and more unrolling, make
>>> it work in 32 or 64 bit modes.
>>> * sysdeps/mips/mips64/memcpy.S: Remove.
>>
>> This fails to build for me at least for N32 ABI.
>>
>> ../ports/sysdeps/mips/memcpy.S: Assembler messages:
>> ../ports/sysdeps/mips/memcpy.S:272: Error: Illegal operands `lw t4,((4)*4)($5)'
>> ../ports/sysdeps/mips/memcpy.S:273: Error: Illegal operands `lw t5,((5)*4)($5)'
>>
>> I guess the extra parenthesis screw up assembler syntax.
>>
>> --
>> Maxim Kuvyrkov
>> CodeSourcery / Mentor Graphics
>
> It is not the parentheses, it is the use of t4, t5, t6, and t7 instead
> of ta0, ta1, ta2, and ta3. I use the t[4567] for 32 bit mode but I
> guess I want to use ta[0123] for N32 ABI mode as well as for the 64 bit
> mode. Here is a new version with this change and with a fixed copyright
> notice.
What testing was done for this patch, does it pass glibc testsuite?
I have a benchmark that exercises various string and mem* routines, and it is failing with this patch applied.
Thank you,
--
Maxim Kuvyrkov
CodeSourcery / Mentor Graphics
^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [PATCH] Optimize MIPS memcpy
2012-09-20 9:05 ` Maxim Kuvyrkov
@ 2012-09-20 18:38 ` Steve Ellcey
2012-09-28 3:48 ` Maxim Kuvyrkov
2012-09-21 18:47 ` Steve Ellcey
1 sibling, 1 reply; 50+ messages in thread
From: Steve Ellcey @ 2012-09-20 18:38 UTC (permalink / raw)
To: Maxim Kuvyrkov; +Cc: Andrew T Pinski, Joseph S. Myers, libc-ports
On Thu, 2012-09-20 at 21:05 +1200, Maxim Kuvyrkov wrote:
> What testing was done for this patch, does it pass glibc testsuite?
>
> I have a benchmark that exercises various string and mem* routines failing with it.
>
> Thank you,
>
> --
> Maxim Kuvyrkov
> CodeSourcery / Mentor Graphics
Is the benchmark anything you can share? I ran the glibc testsuite and
got some failures but I don't think they are due to the new memcpy. I
am going back now and running the glibc testsuite with no changes to get
a baseline so I can verify that. Hopefully I will have an answer later
today.
I ran some other tests like the gcc testsuite using a glibc with this
change in it and that didn't have any problems and there is the one I
sent to the list
http://sourceware.org/ml/libc-ports/2012-09/msg00007.html that also ran
with no problems.
Steve Ellcey
sellcey@mips.com
^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [PATCH] Optimize MIPS memcpy
2012-09-20 9:05 ` Maxim Kuvyrkov
2012-09-20 18:38 ` Steve Ellcey
@ 2012-09-21 18:47 ` Steve Ellcey
2012-09-21 18:57 ` Joseph S. Myers
2012-09-21 19:12 ` [PATCH] Optimize MIPS memcpy Maxim Kuvyrkov
1 sibling, 2 replies; 50+ messages in thread
From: Steve Ellcey @ 2012-09-21 18:47 UTC (permalink / raw)
To: Maxim Kuvyrkov; +Cc: Andrew T Pinski, Joseph S. Myers, libc-ports
On Thu, 2012-09-20 at 21:05 +1200, Maxim Kuvyrkov wrote:
> What testing was done for this patch, does it pass glibc testsuite?
Maxim, I get the same failures with this memcpy as without it when
running the glibc testsuite:
glibc/localedata/tst-fmon.out
glibc/math/test-float.out
glibc/math/test-double.out
glibc/math/test-ifloat.out
glibc/math/test-idouble.out
glibc/stdlib/tst-strtod-overflow.out
glibc/stdio-common/bug22.out
glibc/malloc/tst-trim1.out
glibc/nptl/tst-cancel7.out
glibc/nptl/tst-cancelx7.out
I did find a build problem, the version I sent you sets USE_PREFETCH for
any libc build. If building for something like MIPS1 that is not right,
so I need to change how that macro is set.
Steve Ellcey
sellcey@mips.com
^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [PATCH] Optimize MIPS memcpy
2012-09-21 18:47 ` Steve Ellcey
@ 2012-09-21 18:57 ` Joseph S. Myers
2012-09-21 20:41 ` [PATCH] Optimize MIPS memcpy (mips glibc test results) Steve Ellcey
2012-09-21 19:12 ` [PATCH] Optimize MIPS memcpy Maxim Kuvyrkov
1 sibling, 1 reply; 50+ messages in thread
From: Joseph S. Myers @ 2012-09-21 18:57 UTC (permalink / raw)
To: Steve Ellcey; +Cc: Maxim Kuvyrkov, Andrew T Pinski, libc-ports
On Fri, 21 Sep 2012, Steve Ellcey wrote:
> glibc/localedata/tst-fmon.out
You should investigate why you see this failure, it's not one of the known
issues listed at <http://sourceware.org/glibc/wiki/Release/2.16>
> glibc/math/test-float.out
> glibc/math/test-double.out
> glibc/math/test-ifloat.out
> glibc/math/test-idouble.out
Expected for soft-float configurations.
> glibc/stdlib/tst-strtod-overflow.out
Not expected, should investigate. (The test should exit cleanly on memory
allocation failure.)
> glibc/stdio-common/bug22.out
Expected for low memory, bug 14231.
> glibc/malloc/tst-trim1.out
Not expected, should investigate.
> glibc/nptl/tst-cancel7.out
> glibc/nptl/tst-cancelx7.out
Known race condition, bug 14232.
--
Joseph S. Myers
joseph@codesourcery.com
^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [PATCH] Optimize MIPS memcpy
2012-09-21 18:47 ` Steve Ellcey
2012-09-21 18:57 ` Joseph S. Myers
@ 2012-09-21 19:12 ` Maxim Kuvyrkov
1 sibling, 0 replies; 50+ messages in thread
From: Maxim Kuvyrkov @ 2012-09-21 19:12 UTC (permalink / raw)
To: Steve Ellcey; +Cc: Andrew T Pinski, Joseph S. Myers, libc-ports
On 22/09/2012, at 6:46 AM, Steve Ellcey wrote:
> On Thu, 2012-09-20 at 21:05 +1200, Maxim Kuvyrkov wrote:
>
>> What testing was done for this patch, does it pass glibc testsuite?
>
> Maxim, I get the same failures with this memcpy as without it when
> running the glibc testsuite:
Thanks Steve. I'll look into the failures that I'm seeing and try to get a testcase for you. Also, it's possible that I flunked testing the first time, we'll see.
--
Maxim Kuvyrkov
CodeSourcery / Mentor Graphics
^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [PATCH] Optimize MIPS memcpy (mips glibc test results)
2012-09-21 18:57 ` Joseph S. Myers
@ 2012-09-21 20:41 ` Steve Ellcey
2012-09-21 20:49 ` Joseph S. Myers
0 siblings, 1 reply; 50+ messages in thread
From: Steve Ellcey @ 2012-09-21 20:41 UTC (permalink / raw)
To: Joseph S. Myers; +Cc: Maxim Kuvyrkov, Andrew T Pinski, libc-ports
On Fri, 2012-09-21 at 18:55 +0000, Joseph S. Myers wrote:
> On Fri, 21 Sep 2012, Steve Ellcey wrote:
>
> > glibc/localedata/tst-fmon.out
>
> You should investigate why you see this failure, it's not one of the known
> issues listed at <http://sourceware.org/glibc/wiki/Release/2.16>
Hm, the out file has a bunch of lines like this:
Locale: "de_DE.ISO-8859-1" Format: "%n" Value: "1.23" Received: "1,23 EUR" Expected: " 1,23 EUR" => false
I am not sure why this would be mips specific or where to look for the problem.
I guess it expects more spaces in the output.
>
> > glibc/math/test-float.out
> > glibc/math/test-double.out
> > glibc/math/test-ifloat.out
> > glibc/math/test-idouble.out
>
> Expected for soft-float configurations.
But I thought I was doing a hard float build. It looks like the
failures are with cos, sincos, clog10, clog, ctan*. Maybe not having
those instructions in hardware counts as soft-float?
> > glibc/stdlib/tst-strtod-overflow.out
>
> Not expected, should investigate. (The test should exit cleanly on memory
> allocation failure.)
Hm, the out file is completely empty. But I do see:
Timed out: killed the child process
In the output.
>
> > glibc/stdio-common/bug22.out
>
> Expected for low memory, bug 14231.
>
> > glibc/malloc/tst-trim1.out
>
> Not expected, should investigate.
Also has the Timed out message like tst-strtod-overflow.out.
>
> > glibc/nptl/tst-cancel7.out
> > glibc/nptl/tst-cancelx7.out
>
> Known race condition, bug 14232.
Thanks for the info on known failures.
Steve Ellcey
sellcey@mips.com
^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [PATCH] Optimize MIPS memcpy (mips glibc test results)
2012-09-21 20:41 ` [PATCH] Optimize MIPS memcpy (mips glibc test results) Steve Ellcey
@ 2012-09-21 20:49 ` Joseph S. Myers
2012-09-21 20:56 ` Steve Ellcey
0 siblings, 1 reply; 50+ messages in thread
From: Joseph S. Myers @ 2012-09-21 20:49 UTC (permalink / raw)
To: Steve Ellcey; +Cc: Maxim Kuvyrkov, Andrew T Pinski, libc-ports
On Fri, 21 Sep 2012, Steve Ellcey wrote:
> > > glibc/math/test-float.out
> > > glibc/math/test-double.out
> > > glibc/math/test-ifloat.out
> > > glibc/math/test-idouble.out
> >
> > Expected for soft-float configurations.
>
> But I thought I was doing a hard float build. It looks like the
> failures are with cos, sincos, clog10, clog, ctan*. Maybe not having
> those instructions in hardware counts as soft-float?
Those have had tests added since I updated libm-test-ulps for 2.16, so if
it's just small ulps values for new tests then it's also fine and will go
away when the ulps are updated again.
> > > glibc/stdlib/tst-strtod-overflow.out
> >
> > Not expected, should investigate. (The test should exit cleanly on memory
> > allocation failure.)
>
> Hm, the out file is completely empty. But I do see:
>
> Timed out: killed the child process
>
> In the output.
You may need to increase your TIMEOUTFACTOR (or depending on how long it
takes with a sufficiently long timeout, propose an increase of the TIMEOUT
value in the test itself on libc-alpha).
> > > glibc/malloc/tst-trim1.out
> >
> > Not expected, should investigate.
>
> Also has the Timed out message like tst-strtod-overflow.out.
Again, maybe should set an increased TIMEOUT value, depending on how long
it takes.
--
Joseph S. Myers
joseph@codesourcery.com
^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [PATCH] Optimize MIPS memcpy (mips glibc test results)
2012-09-21 20:49 ` Joseph S. Myers
@ 2012-09-21 20:56 ` Steve Ellcey
0 siblings, 0 replies; 50+ messages in thread
From: Steve Ellcey @ 2012-09-21 20:56 UTC (permalink / raw)
To: Joseph S. Myers; +Cc: Maxim Kuvyrkov, Andrew T Pinski, libc-ports
On Fri, 2012-09-21 at 20:48 +0000, Joseph S. Myers wrote:
> On Fri, 21 Sep 2012, Steve Ellcey wrote:
>
> > > > glibc/math/test-float.out
> > > > glibc/math/test-double.out
> > > > glibc/math/test-ifloat.out
> > > > glibc/math/test-idouble.out
> > >
> > > Expected for soft-float configurations.
> >
> > But I thought I was doing a hard float build. It looks like the
> > failures are with cos, sincos, clog10, clog, ctan*. Maybe not having
> > those instructions in hardware counts as soft-float?
>
> Those have had tests added since I updated libm-test-ulps for 2.16, so if
> it's just small ulps values for new tests then it's also fine and will go
> away when the ulps are updated again.
Yes, it is mostly 1 ulp differences with a couple of 2 ulp diffs.
Steve Ellcey
sellcey@mips.com
^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [PATCH] Optimize MIPS memcpy
2012-09-20 18:38 ` Steve Ellcey
@ 2012-09-28 3:48 ` Maxim Kuvyrkov
2012-10-06 4:43 ` Maxim Kuvyrkov
0 siblings, 1 reply; 50+ messages in thread
From: Maxim Kuvyrkov @ 2012-09-28 3:48 UTC (permalink / raw)
To: Steve Ellcey; +Cc: Andrew T Pinski, Joseph S. Myers, libc-ports
On 21/09/2012, at 6:38 AM, Steve Ellcey wrote:
> On Thu, 2012-09-20 at 21:05 +1200, Maxim Kuvyrkov wrote:
>
>> What testing was done for this patch, does it pass glibc testsuite?
>>
>> I have a benchmark that exercises various string and mem* routines failing with it.
>>
>> Thank you,
>>
>> --
>> Maxim Kuvyrkov
>> CodeSourcery / Mentor Graphics
>
> Is the benchmark anything you can share? I ran the glibc testsuite and
> got some failures but I don't think they are due to the new memcpy. I
> am going back now and running the glibc testsuite with no changes to get
> a baseline so I can verify that. Hopefully I will have an answer later
> today.
>
> I ran some other tests like the gcc testsuite using a glibc with this
> change in it and that didn't have any problems and there is the one I
> sent to the list
> http://sourceware.org/ml/libc-ports/2012-09/msg00007.html that also ran
> with no problems.
As I mentioned in a different email, I can't share the benchmark, but I think I've got a testcase of sorts for you to investigate. It appears your memcpy clobbers a couple of bytes just before DEST in certain cases.
In particular when ABI=N32, DEST=0x10060008, SRC=0x1002c088, N=0x172 it clobbers DEST[-1] and DEST[-2] bytes.
--
Maxim Kuvyrkov
CodeSourcery / Mentor Graphics
^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [PATCH] Optimize MIPS memcpy
2012-09-28 3:48 ` Maxim Kuvyrkov
@ 2012-10-06 4:43 ` Maxim Kuvyrkov
2012-10-08 17:04 ` Steve Ellcey
0 siblings, 1 reply; 50+ messages in thread
From: Maxim Kuvyrkov @ 2012-10-06 4:43 UTC (permalink / raw)
To: Steve Ellcey; +Cc: Andrew T Pinski, Joseph S. Myers, libc-ports
On 28/09/2012, at 3:47 PM, Maxim Kuvyrkov wrote:
> On 21/09/2012, at 6:38 AM, Steve Ellcey wrote:
>
>> On Thu, 2012-09-20 at 21:05 +1200, Maxim Kuvyrkov wrote:
>>
>>> What testing was done for this patch, does it pass glibc testsuite?
>>>
>>> I have a benchmark that exercises various string and mem* routines failing with it.
>>>
>>> Thank you,
>>>
>>> --
>>> Maxim Kuvyrkov
>>> CodeSourcery / Mentor Graphics
>>
>> Is the benchmark anything you can share? I ran the glibc testsuite and
>> got some failures but I don't think they are due to the new memcpy. I
>> am going back now and running the glibc testsuite with no changes to get
>> a baseline so I can verify that. Hopefully I will have an answer later
>> today.
>>
>> I ran some other tests like the gcc testsuite using a glibc with this
>> change in it and that didn't have any problems and there is the one I
>> sent to the list
>> http://sourceware.org/ml/libc-ports/2012-09/msg00007.html that also ran
>> with no problems.
>
> As I mentioned in a different email, I can't share the benchmark, but I think I've got a testcase of sorts for you to investigate. It appears your memcpy clobbers a couple of bytes just before DEST in certain cases.
>
> In particular when ABI=N32, DEST=0x10060008, SRC=0x1002c088, N=0x172 it clobbers DEST[-1] and DEST[-2] bytes.
Steve and I have debugged these failures and they now seem to be resolved. I'll let Steve follow up with analysis and a new patch.
Meanwhile, I've benchmarked Steve's patch against mine. On the benchmark that I use, both implementations provide equal performance for the N64 ABI, but on the N32 ABI Steve's patch is only half as fast. This is probably due to using 4-byte operations instead of 8-byte operations for the N32 ABI:
#if _MIPS_SIM == _ABI64
#define USE_DOUBLE
#endif
It should be easy to improve Steve's patch for N32 ABI. Steve, will you look into that?
I would also appreciate it if you would look into making your version of memcpy memmove-safe, if it is not already.
Thank you,
--
Maxim Kuvyrkov
CodeSourcery / Mentor Graphics
^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [PATCH] Optimize MIPS memcpy
2012-10-06 4:43 ` Maxim Kuvyrkov
@ 2012-10-08 17:04 ` Steve Ellcey
2012-10-08 22:31 ` Maxim Kuvyrkov
0 siblings, 1 reply; 50+ messages in thread
From: Steve Ellcey @ 2012-10-08 17:04 UTC (permalink / raw)
To: Maxim Kuvyrkov; +Cc: Andrew T Pinski, Joseph S. Myers, libc-ports
On Sat, 2012-10-06 at 17:43 +1300, Maxim Kuvyrkov wrote:
> Steve and I have debugged these failures and they now seem to be resolved. I'll let Steve to followup with analysis and a new patch.
>
> Meanwhile, I've benchmarked Steve's patch against mine. On the benchmark that I use both implementations provide equal performance for N64 ABI, but on N32 ABI Steve's patch is only half as fast. This is, probably, due to using 4-byte operations instead of 8-byte operations for N32 ABI:
>
> #if _MIPS_SIM == _ABI64
> #define USE_DOUBLE
> #endif
>
> It should be easy to improve Steve's patch for N32 ABI. Steve, will you look into that?
>
> I would also appreciate if you look into making your version of memcpy memmove-safe, if it is not already.
>
> Thank you,
>
> --
> Maxim Kuvyrkov
> CodeSourcery / Mentor Graphics
Maxim, do you know if your test is doing a memcpy on overlapping memory?
While our analysis showed that the problem was due to the use of the
'prepare to store' prefetch hint, the code I sent earlier should have
worked fine for any code that was not doing an overlapping memcpy.
For anyone who may be interested, the 'prepare for store' prefetch hint
is different than other 'safe' prefetches, which can be executed or
ignored without affecting the results of the code being executed.
Instead of bringing a chunk of memory into the cache, it simply
allocates a line of cache for use and zeros it out. If you write to
every byte of that line of cache, you are OK. But if you use the
'prepare to store' cache hint and do not write to the entire cache line
then the bytes you don't write to get written back to memory as zeros,
overwriting whatever was there before. The code in my memcpy routine
accounts for this, by checking the length of the buffer before doing the
'prepare to store' prefetches and only using them when it knows that it
is going to write to the entire cache line.
The other issue though is if the source and destination of the memcpy
overlap and if you use the prepare to store prefetch on a memory address
that is also part of the source of the memcpy you will get incorrect
results. That means that if we want to have memcpy be 'memmove-safe'
we cannot use the 'prepare to store' hint.
I will fix the code to use double loads and stores with the N32 ABI
and add comments about the 'prepare to store' hint. I hate to give up
on using the 'prepare for store' prefetch hint, since it does result in
the best performance, but given the various issues maybe it is not the
best idea for glibc.
Steve Ellcey
sellcey@mips.com
^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [PATCH] Optimize MIPS memcpy
2012-10-08 17:04 ` Steve Ellcey
@ 2012-10-08 22:31 ` Maxim Kuvyrkov
2012-10-09 20:50 ` Steve Ellcey
2012-10-15 17:49 ` Steve Ellcey
0 siblings, 2 replies; 50+ messages in thread
From: Maxim Kuvyrkov @ 2012-10-08 22:31 UTC (permalink / raw)
To: Steve Ellcey; +Cc: Andrew T Pinski, Joseph S. Myers, libc-ports
On 9/10/2012, at 6:03 AM, Steve Ellcey wrote:
> On Sat, 2012-10-06 at 17:43 +1300, Maxim Kuvyrkov wrote:
>
>> Steve and I have debugged these failures and they now seem to be resolved. I'll let Steve to followup with analysis and a new patch.
>>
>> Meanwhile, I've benchmarked Steve's patch against mine. On the benchmark that I use both implementations provide equal performance for N64 ABI, but on N32 ABI Steve's patch is only half as fast. This is, probably, due to using 4-byte operations instead of 8-byte operations for N32 ABI:
>>
>> #if _MIPS_SIM == _ABI64
>> #define USE_DOUBLE
>> #endif
>>
>> It should be easy to improve Steve's patch for N32 ABI. Steve, will you look into that?
>>
>> I would also appreciate if you look into making your version of memcpy memmove-safe, if it is not already.
>>
>> Thank you,
>>
>> --
>> Maxim Kuvyrkov
>> CodeSourcery / Mentor Graphics
>
> Maxim, do you know if your test is doing a memcpy on overlapping memory?
> While our analysis showed that the problem was due to the use of the
> 'prepare to store' prefetch hint, the code I sent earlier should have
> worked fine for any code that was not doing an overlapping memcpy.
The test does not use overlapping memcpy.
>
> For anyone who may be interested, the 'prepare for store' prefetch hint
> is different then other 'safe' prefetches which can be executed or
> ignored without affecting the results of the code being executed.
>
> Instead of bringing a chunk of memory into the cache, it simply
> allocates a line of cache for use and zeros it out. If you write to
> every byte of that line of cache, you are OK. But if you use the
> 'prepare to store' cache hint and do not write to the entire cache line
> then the bytes you don't write to get written back to memory as zeros,
> overwriting whatever was there before. The code in my memcpy routine
> accounts for this, by checking the length of the buffer before doing the
> 'prepare to store' prefetches and only using them when it knows that it
> is going to write to the entire cache line.
Can there be a bug in logic that decides that a prepare-for-store prefetch is safe?
I've checked documentation for XLP (which is the target I'm using for testing) and it specifies 32-byte prefetch.
>
> The other issue though is if the source and destination of the memcpy
> overlap and if you use the prepare to store prefetch on a memory address
> that is also part of the source of the memcpy you will get incorrect
> results. That means that if we want to have memcpy be 'memmove-safe'
> we cannot use the 'prepare to store' hint.
I don't think this is a concern. Memmove will use memcpy only if the memory locations don't overlap. And for the record's sake, I'm testing without the memcpy-in-memmove patch.
>
> I will fix the code to use double loads and stores with the N32 ABI
> and add comments about the 'prepare to store' hint. I hate to give up
> on using the 'prepare for store' prefetch hint, since it does result in
> the best peformance, but given the various issues maybe it is not the
> best idea for glibc.
I too want to keep prepare-for-store prefetches if possible. For debugging purposes you could amend the prepare-for-store prefetch macros to trigger a loop that would unconditionally clobber the memory locations that prepare-for-store is expected to zero out. Or add some other assertions to help out with debugging.
Thanks,
--
Maxim Kuvyrkov
CodeSourcery / Mentor Graphics
^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [PATCH] Optimize MIPS memcpy
2012-10-08 22:31 ` Maxim Kuvyrkov
@ 2012-10-09 20:50 ` Steve Ellcey
2012-10-15 17:49 ` Steve Ellcey
1 sibling, 0 replies; 50+ messages in thread
From: Steve Ellcey @ 2012-10-09 20:50 UTC (permalink / raw)
To: Maxim Kuvyrkov; +Cc: Andrew T Pinski, Joseph S. Myers, libc-ports
[-- Attachment #1: Type: text/plain, Size: 1207 bytes --]
On Tue, 2012-10-09 at 11:30 +1300, Maxim Kuvyrkov wrote:
> Can there be a bug in logic that decides that a prepare-for-store prefetch is safe?
>
> I've checked documentation for XLP (which is the target I'm using for testing) and it specifies 32-byte prefetch.
It's possible, but if there is such a bug I haven't found it yet. One thought I had
was endianness, when you see the problem are you running in big-endian
or little-endian mode?
>
> I too want to keep prepare-for-store prefetches is possible. For debugging purposes you could amend
> prepare-for-store prefetch macros to trigger a loop that would unconditionally clobber memory locations
> that prepare-for-store is expected to zero-out. Or add some other assertions to help out with debugging.
That is an interesting idea. Here is a new copy of memcpy, it has a
macro 'DEBUG_PREFETCH' which, if you set it, will replace the prefetch
prepare-for-store with a set of writes to write out 32 bytes worth of
zeros. I still can't find any problems using this macro though. I also
fixed the N32 version to use double registers. Could you try this
version (with DEBUG_PREFETCH set) and see if you still get the problem.
Steve Ellcey
sellcey@mips.com
[-- Attachment #2: memcpy.patch --]
[-- Type: text/x-patch, Size: 23390 bytes --]
diff --git a/ports/sysdeps/mips/memcpy.S b/ports/sysdeps/mips/memcpy.S
index 753f67c..49e22e5 100644
--- a/ports/sysdeps/mips/memcpy.S
+++ b/ports/sysdeps/mips/memcpy.S
@@ -1,7 +1,8 @@
-/* Copyright (C) 2002-2012 Free Software Foundation, Inc.
+/* Copyright (C) 2012 Free Software Foundation, Inc.
This file is part of the GNU C Library.
- Contributed by Hartvig Ekner <hartvige@mips.com>, 2002.
-
+
+ Contributed by MIPS Technologies, Inc.
+
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
@@ -16,119 +17,648 @@
License along with the GNU C Library. If not, see
<http://www.gnu.org/licenses/>. */
+#ifdef ANDROID_CHANGES
+#include "machine/asm.h"
+#include "machine/regdef.h"
+#define USE_MEMMOVE_FOR_OVERLAP
+#define PREFETCH_LOAD_HINT PREFETCH_HINT_LOAD_STREAMED
+#define PREFETCH_STORE_HINT PREFETCH_HINT_PREPAREFORSTORE
+#elif _LIBC
#include <sysdep.h>
+#include <regdef.h>
+#include <sys/asm.h>
+#define PREFETCH_LOAD_HINT PREFETCH_HINT_LOAD_STREAMED
+#define PREFETCH_STORE_HINT PREFETCH_HINT_PREPAREFORSTORE
+#elif _NEWLIB
+#include "machine/asm.h"
+#include "machine/regdef.h"
+#define PREFETCH_LOAD_HINT PREFETCH_HINT_LOAD_STREAMED
+#define PREFETCH_STORE_HINT PREFETCH_HINT_PREPAREFORSTORE
+#else
+#include <regdef.h>
+#include <sys/asm.h>
+#endif
+
+#if (_MIPS_ISA == _MIPS_ISA_MIPS4) || (_MIPS_ISA == _MIPS_ISA_MIPS5) || \
+ (_MIPS_ISA == _MIPS_ISA_MIPS32) || (_MIPS_ISA == _MIPS_ISA_MIPS64)
+#ifndef DISABLE_PREFETCH
+#define USE_PREFETCH
+#endif
+#endif
+
+#if (_MIPS_SIM == _ABI64) || (_MIPS_SIM == _ABIN32)
+#ifndef DISABLE_DOUBLE
+#define USE_DOUBLE
+#endif
+#endif
+
+
+
+/* Some asm.h files do not have the L macro definition. */
+#ifndef L
+#if _MIPS_SIM == _ABIO32
+# define L(label) $L ## label
+#else
+# define L(label) .L ## label
+#endif
+#endif
+
+/* Some asm.h files do not have the PTR_ADDIU macro definition. */
+#ifndef PTR_ADDIU
+#ifdef USE_DOUBLE
+#define PTR_ADDIU daddiu
+#else
+#define PTR_ADDIU addiu
+#endif
+#endif
+
+/* Some asm.h files do not have the PTR_SRA macro definition. */
+#ifndef PTR_SRA
+#ifdef USE_DOUBLE
+#define PTR_SRA dsra
+#else
+#define PTR_SRA sra
+#endif
+#endif
-/* void *memcpy(void *s1, const void *s2, size_t n); */
+/*
+ * Using PREFETCH_HINT_LOAD_STREAMED instead of PREFETCH_LOAD on load
+ * prefetches appears to offer a slight performance advantage.
+ *
+ * Using PREFETCH_HINT_PREPAREFORSTORE instead of PREFETCH_STORE
+ * or PREFETCH_STORE_STREAMED offers a large performance advantage
+ * but PREPAREFORSTORE has some special restrictions to consider.
+ *
+ * Prefetch with the 'prepare for store' hint does not copy a memory
+ * location into the cache, it just allocates a cache line and zeros
+ * it out. This means that if you do not write to the entire cache
+ * line before writing it out to memory some data will get zero'ed out
+ * when the cache line is written back to memory and data will be lost.
+ *
+ * Also if you are using this memcpy to copy overlapping buffers it may
+ * not behave correctly when using the 'prepare for store' hint. If you
+ * use the 'prepare for store' prefetch on a memory area that is in the
+ * memcpy source (as well as the memcpy destination), then you will get
+ * some data zero'ed out before you have a chance to read it and data will
+ * be lost.
+ *
+ * If you are going to use this memcpy routine with the 'prepare for store'
+ * prefetch you may want to set USE_MEMMOVE_FOR_OVERLAP in order to avoid
+ * the problem of running memcpy on overlapping buffers.
+ *
+ * There are ifdef'ed sections of this memcpy to make sure that it does not
+ * do prefetches on cache lines that are not going to be completely written.
+ * This code is only needed and only used when PREFETCH_STORE_HINT is set to
+ * PREFETCH_HINT_PREPAREFORSTORE. This code assumes that cache lines are
+ * 32 bytes.
+ */
+#ifdef USE_PREFETCH
+# define PREFETCH_HINT_LOAD 0
+# define PREFETCH_HINT_STORE 1
+# define PREFETCH_HINT_LOAD_STREAMED 4
+# define PREFETCH_HINT_STORE_STREAMED 5
+# define PREFETCH_HINT_LOAD_RETAINED 6
+# define PREFETCH_HINT_STORE_RETAINED 7
+# define PREFETCH_HINT_WRITEBACK_INVAL 25
+# define PREFETCH_HINT_PREPAREFORSTORE 30
+
+/*
+ * If we have not picked out what hints to use at this point use the
+ * standard load and store prefetch hints.
+ */
+#ifndef PREFETCH_STORE_HINT
+#define PREFETCH_STORE_HINT PREFETCH_HINT_STORE
+#endif
+#ifndef PREFETCH_LOAD_HINT
+#define PREFETCH_LOAD_HINT PREFETCH_HINT_LOAD
+#endif
+
+/*
+ * We double everything when USE_DOUBLE is true so we do 2 prefetches to
+ * get 64 bytes in that case. The assumption is that each individual
+ * prefetch brings in 32 bytes.
+ *
+ * You can try defining DEBUG_PREFETCH if you are having problems with
+ * the 'prepare for store' prefetch hint, if set the code will zero out
+ * the 32 bytes that are part of the prefetch instead of doing the actual
+ * prefetch. If this causes problems then you know that you are prefetching
+ * memory that you are not writing to.
+ */
+#ifdef USE_DOUBLE
+# define PREFETCH_CHUNK 64
+# define PREFETCH_FOR_LOAD(chunk, reg) \
+ pref PREFETCH_LOAD_HINT, (chunk)*32(reg); \
+ pref PREFETCH_LOAD_HINT, ((chunk)+1)*32(reg)
+#if defined(DEBUG_PREFETCH) \
+ && (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
+# define PREFETCH_FOR_STORE(chunk, reg) \
+ sd zero, (((chunk)*32)+0)(reg); \
+ sd zero, (((chunk)*32)+8)(reg); \
+ sd zero, (((chunk)*32)+16)(reg); \
+ sd zero, (((chunk)*32)+24)(reg); \
+ sd zero, (((chunk)*32)+32)(reg); \
+ sd zero, (((chunk)*32)+40)(reg); \
+ sd zero, (((chunk)*32)+48)(reg); \
+ sd zero, (((chunk)*32)+56)(reg)
+#else
+# define PREFETCH_FOR_STORE(chunk, reg) \
+ pref PREFETCH_STORE_HINT, (chunk)*32(reg); \
+ pref PREFETCH_STORE_HINT, ((chunk)+1)*32(reg)
+#endif
+#else
+# define PREFETCH_CHUNK 32
+# define PREFETCH_FOR_LOAD(chunk, reg) \
+ pref PREFETCH_LOAD_HINT, (chunk)*32(reg)
+#if defined(DEBUG_PREFETCH) \
+ && (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
+# define PREFETCH_FOR_STORE(chunk, reg) \
+ sw zero, (((chunk)*32)+0)(reg); \
+ sw zero, (((chunk)*32)+4)(reg); \
+ sw zero, (((chunk)*32)+8)(reg); \
+ sw zero, (((chunk)*32)+12)(reg); \
+ sw zero, (((chunk)*32)+16)(reg); \
+ sw zero, (((chunk)*32)+20)(reg); \
+ sw zero, (((chunk)*32)+24)(reg); \
+ sw zero, (((chunk)*32)+28)(reg)
+#else
+# define PREFETCH_FOR_STORE(chunk, reg) \
+ pref PREFETCH_STORE_HINT, (chunk)*32(reg)
+#endif
+#endif
+#define PREFETCH_LIMIT (5 * PREFETCH_CHUNK)
+#else
+# define PREFETCH_FOR_LOAD(offset, reg)
+# define PREFETCH_FOR_STORE(offset, reg)
+#endif
+
+/* Allow the routine to be named something else if desired. */
+#ifndef MEMCPY_NAME
+#define MEMCPY_NAME memcpy
+#endif
+
+/* We use these 32/64 bit registers as temporaries to do the copying. */
+#define REG0 t0
+#define REG1 t1
+#define REG2 t2
+#define REG3 t3
+#if _MIPS_SIM == _ABIO32
+# define REG4 t4
+# define REG5 t5
+# define REG6 t6
+# define REG7 t7
+#else
+# define REG4 ta0
+# define REG5 ta1
+# define REG6 ta2
+# define REG7 ta3
+#endif
+
+/* We load/store 64 bits at a time when USE_DOUBLE is true.
+ * The C_ prefix stands for CHUNK and is used to avoid macro name
+ * conflicts with system header files. */
+
+#ifdef USE_DOUBLE
+# define C_ST sd
+# define C_LD ld
#if __MIPSEB
-# define LWHI lwl /* high part is left in big-endian */
-# define SWHI swl /* high part is left in big-endian */
-# define LWLO lwr /* low part is right in big-endian */
-# define SWLO swr /* low part is right in big-endian */
+# define C_LDHI ldl /* high part is left in big-endian */
+# define C_STHI sdl /* high part is left in big-endian */
+# define C_LDLO ldr /* low part is right in big-endian */
+# define C_STLO sdr /* low part is right in big-endian */
#else
-# define LWHI lwr /* high part is right in little-endian */
-# define SWHI swr /* high part is right in little-endian */
-# define LWLO lwl /* low part is left in little-endian */
-# define SWLO swl /* low part is left in little-endian */
+# define C_LDHI ldr /* high part is right in little-endian */
+# define C_STHI sdr /* high part is right in little-endian */
+# define C_LDLO ldl /* low part is left in little-endian */
+# define C_STLO sdl /* low part is left in little-endian */
#endif
+#else
+# define C_ST sw
+# define C_LD lw
+#if __MIPSEB
+# define C_LDHI lwl /* high part is left in big-endian */
+# define C_STHI swl /* high part is left in big-endian */
+# define C_LDLO lwr /* low part is right in big-endian */
+# define C_STLO swr /* low part is right in big-endian */
+#else
+# define C_LDHI lwr /* high part is right in little-endian */
+# define C_STHI swr /* high part is right in little-endian */
+# define C_LDLO lwl /* low part is left in little-endian */
+# define C_STLO swl /* low part is left in little-endian */
+#endif
+#endif
+
+/* Bookkeeping values for 32 vs. 64 bit mode. */
+#ifdef USE_DOUBLE
+# define NSIZE 8
+# define NSIZEMASK 0x3f
+# define NSIZEDMASK 0x7f
+#else
+# define NSIZE 4
+# define NSIZEMASK 0x1f
+# define NSIZEDMASK 0x3f
+#endif
+#define UNIT(unit) ((unit)*NSIZE)
+#define UNITM1(unit) (((unit)*NSIZE)-1)
-ENTRY (memcpy)
+#ifdef ANDROID_CHANGES
+LEAF(MEMCPY_NAME, 0)
+#else
+LEAF(MEMCPY_NAME)
+#endif
+ .set nomips16
.set noreorder
+/*
+ * Below we handle the case where memcpy is called with overlapping src and dst.
+ * Although memcpy is not required to handle this case, some parts of Android
+ * like Skia rely on such usage. We call memmove to handle such cases.
+ */
+#ifdef USE_MEMMOVE_FOR_OVERLAP
+ PTR_SUBU t0,a0,a1
+ PTR_SRA t2,t0,31
+ xor t1,t0,t2
+ PTR_SUBU t0,t1,t2
+ sltu t2,t0,a2
+ beq t2,zero,L(memcpy)
+ la t9,memmove
+ jr t9
+ nop
+L(memcpy):
+#endif
+/*
+ * If the size is less then 2*NSIZE (8 or 16), go to L(lastb). Regardless of
+ * size, copy dst pointer to v0 for the return value.
+ */
+ slti t2,a2,(2 * NSIZE)
+ bne t2,zero,L(lastb)
+ move v0,a0
+/*
+ * If src and dst have different alignments, go to L(unaligned), if they
+ * have the same alignment (but are not actually aligned) do a partial
+ * load/store to make them aligned. If they are both already aligned
+ * we can start copying at L(aligned).
+ */
+ xor t8,a1,a0
+ andi t8,t8,(NSIZE-1) /* t8 is a0/a1 word-displacement */
+ bne t8,zero,L(unaligned)
+ PTR_SUBU a3, zero, a0
+
+ andi a3,a3,(NSIZE-1) /* copy a3 bytes to align a0/a1 */
+ beq a3,zero,L(aligned) /* if a3=0, it is already aligned */
+ PTR_SUBU a2,a2,a3 /* a2 is the remining bytes count */
+
+ C_LDHI t8,0(a1)
+ PTR_ADDU a1,a1,a3
+ C_STHI t8,0(a0)
+ PTR_ADDU a0,a0,a3
+
+/*
+ * Now dst/src are both aligned to (word or double word) aligned addresses
+ * Set a2 to count how many bytes we have to copy after all the 64/128 byte
+ * chunks are copied and a3 to the dst pointer after all the 64/128 byte
+ * chunks have been copied. We will loop, incrementing a0 and a1 until a0
+ * equals a3.
+ */
+
+L(aligned):
+ andi t8,a2,NSIZEDMASK /* any whole 64-byte/128-byte chunks? */
+ beq a2,t8,L(chkw) /* if a2==t8, no 64-byte/128-byte chunks */
+ PTR_SUBU a3,a2,t8 /* subtract from a2 the reminder */
+ PTR_ADDU a3,a0,a3 /* Now a3 is the final dst after loop */
+
+/* When in the loop we may prefetch with the 'prepare to store' hint,
+ * in this case the a0+x should not be past the "t0-32" address. This
+ * means: for x=128 the last "safe" a0 address is "t0-160". Alternatively,
+ * for x=64 the last "safe" a0 address is "t0-96" In the current version we
+ * will use "prefetch hint,128(a0)", so "t0-160" is the limit.
+ */
+#if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
+ PTR_ADDU t0,a0,a2 /* t0 is the "past the end" address */
+ PTR_SUBU t9,t0,PREFETCH_LIMIT /* t9 is the "last safe pref" address */
+#endif
+ PREFETCH_FOR_LOAD (0, a1)
+ PREFETCH_FOR_LOAD (1, a1)
+ PREFETCH_FOR_LOAD (2, a1)
+ PREFETCH_FOR_STORE (1, a0)
+#if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
+ sltu v1,t9,a0 /* If a0 > t9 don't use next prefetch */
+ bgtz v1,L(loop16w)
+ nop
+#endif
+ PREFETCH_FOR_STORE (2, a0)
+L(loop16w):
+ PREFETCH_FOR_LOAD (3, a1)
+ C_LD t0,UNIT(0)(a1)
+#if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
+ bgtz v1,L(skip_pref30_96)
+#endif
+ C_LD t1,UNIT(1)(a1)
+ PREFETCH_FOR_STORE (3, a0)
+L(skip_pref30_96):
+ C_LD REG2,UNIT(2)(a1)
+ C_LD REG3,UNIT(3)(a1)
+ C_LD REG4,UNIT(4)(a1)
+ C_LD REG5,UNIT(5)(a1)
+ C_LD REG6,UNIT(6)(a1)
+ C_LD REG7,UNIT(7)(a1)
+ PREFETCH_FOR_LOAD (4, a1)
+
+ C_ST t0,UNIT(0)(a0)
+ C_ST t1,UNIT(1)(a0)
+ C_ST REG2,UNIT(2)(a0)
+ C_ST REG3,UNIT(3)(a0)
+ C_ST REG4,UNIT(4)(a0)
+ C_ST REG5,UNIT(5)(a0)
+ C_ST REG6,UNIT(6)(a0)
+ C_ST REG7,UNIT(7)(a0)
+
+ C_LD t0,UNIT(8)(a1)
+#if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
+ bgtz v1,L(skip_pref30_128)
+#endif
+ C_LD t1,UNIT(9)(a1)
+ PREFETCH_FOR_STORE (4, a0)
+L(skip_pref30_128):
+ C_LD REG2,UNIT(10)(a1)
+ C_LD REG3,UNIT(11)(a1)
+ C_LD REG4,UNIT(12)(a1)
+ C_LD REG5,UNIT(13)(a1)
+ C_LD REG6,UNIT(14)(a1)
+ C_LD REG7,UNIT(15)(a1)
+ PREFETCH_FOR_LOAD (5, a1)
+ C_ST t0,UNIT(8)(a0)
+ C_ST t1,UNIT(9)(a0)
+ C_ST REG2,UNIT(10)(a0)
+ C_ST REG3,UNIT(11)(a0)
+ C_ST REG4,UNIT(12)(a0)
+ C_ST REG5,UNIT(13)(a0)
+ C_ST REG6,UNIT(14)(a0)
+ C_ST REG7,UNIT(15)(a0)
+ PTR_ADDIU a0,a0,UNIT(16) /* adding 64/128 to dest */
+#if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
+ sltu v1,t9,a0
+#endif
+ bne a0,a3,L(loop16w)
+ PTR_ADDIU a1,a1,UNIT(16) /* adding 64/128 to src */
+ move a2,t8
+
+/* Here we have src and dest word-aligned but less than 64-bytes or
+ * 128 bytes to go. Check for a 32(64) byte chunk and copy if if there
+ * is one. Otherwise jump down to L(chk1w) to handle the tail end of
+ * the copy.
+ */
- slti t0, a2, 8 # Less than 8?
- bne t0, zero, L(last8)
- move v0, a0 # Setup exit value before too late
-
- xor t0, a1, a0 # Find a0/a1 displacement
- andi t0, 0x3
- bne t0, zero, L(shift) # Go handle the unaligned case
- subu t1, zero, a1
- andi t1, 0x3 # a0/a1 are aligned, but are we
- beq t1, zero, L(chk8w) # starting in the middle of a word?
- subu a2, t1
- LWHI t0, 0(a1) # Yes we are... take care of that
- addu a1, t1
- SWHI t0, 0(a0)
- addu a0, t1
-
-L(chk8w):
- andi t0, a2, 0x1f # 32 or more bytes left?
- beq t0, a2, L(chk1w)
- subu a3, a2, t0 # Yes
- addu a3, a1 # a3 = end address of loop
- move a2, t0 # a2 = what will be left after loop
-L(lop8w):
- lw t0, 0(a1) # Loop taking 8 words at a time
- lw t1, 4(a1)
- lw t2, 8(a1)
- lw t3, 12(a1)
- lw t4, 16(a1)
- lw t5, 20(a1)
- lw t6, 24(a1)
- lw t7, 28(a1)
- addiu a0, 32
- addiu a1, 32
- sw t0, -32(a0)
- sw t1, -28(a0)
- sw t2, -24(a0)
- sw t3, -20(a0)
- sw t4, -16(a0)
- sw t5, -12(a0)
- sw t6, -8(a0)
- bne a1, a3, L(lop8w)
- sw t7, -4(a0)
-
-L(chk1w):
- andi t0, a2, 0x3 # 4 or more bytes left?
- beq t0, a2, L(last8)
- subu a3, a2, t0 # Yes, handle them one word at a time
- addu a3, a1 # a3 again end address
- move a2, t0
-L(lop1w):
- lw t0, 0(a1)
- addiu a0, 4
- addiu a1, 4
- bne a1, a3, L(lop1w)
- sw t0, -4(a0)
-
-L(last8):
- blez a2, L(lst8e) # Handle last 8 bytes, one at a time
- addu a3, a2, a1
-L(lst8l):
- lb t0, 0(a1)
- addiu a0, 1
- addiu a1, 1
- bne a1, a3, L(lst8l)
- sb t0, -1(a0)
-L(lst8e):
- jr ra # Bye, bye
+L(chkw):
+ PREFETCH_FOR_LOAD (0, a1)
+ andi t8,a2,NSIZEMASK /* Is there a 32-byte/64-byte chunk. */
+ /* The t8 is the reminder count past 32-bytes */
+ beq a2,t8,L(chk1w) /* When a2=t8, no 32-byte chunk */
nop
+ C_LD t0,UNIT(0)(a1)
+ C_LD t1,UNIT(1)(a1)
+ C_LD REG2,UNIT(2)(a1)
+ C_LD REG3,UNIT(3)(a1)
+ C_LD REG4,UNIT(4)(a1)
+ C_LD REG5,UNIT(5)(a1)
+ C_LD REG6,UNIT(6)(a1)
+ C_LD REG7,UNIT(7)(a1)
+ PTR_ADDIU a1,a1,UNIT(8)
+ C_ST t0,UNIT(0)(a0)
+ C_ST t1,UNIT(1)(a0)
+ C_ST REG2,UNIT(2)(a0)
+ C_ST REG3,UNIT(3)(a0)
+ C_ST REG4,UNIT(4)(a0)
+ C_ST REG5,UNIT(5)(a0)
+ C_ST REG6,UNIT(6)(a0)
+ C_ST REG7,UNIT(7)(a0)
+ PTR_ADDIU a0,a0,UNIT(8)
-L(shift):
- subu a3, zero, a0 # Src and Dest unaligned
- andi a3, 0x3 # (unoptimized case...)
- beq a3, zero, L(shft1)
- subu a2, a3 # a2 = bytes left
- LWHI t0, 0(a1) # Take care of first odd part
- LWLO t0, 3(a1)
- addu a1, a3
- SWHI t0, 0(a0)
- addu a0, a3
-L(shft1):
- andi t0, a2, 0x3
- subu a3, a2, t0
- addu a3, a1
-L(shfth):
- LWHI t1, 0(a1) # Limp through, word by word
- LWLO t1, 3(a1)
- addiu a0, 4
- addiu a1, 4
- bne a1, a3, L(shfth)
- sw t1, -4(a0)
- b L(last8) # Handle anything which may be left
- move a2, t0
+/*
+ * Here we have less then 32(64) bytes to copy. Set up for a loop to
+ * copy one word (or double word) at a time. Set a2 to count how many
+ * bytes we have to copy after all the word (or double word) chunks are
+ * copied and a3 to the dst pointer after all the (d)word chunks have
+ * been copied. We will loop, incrementing a0 and a1 until a0 equals a3.
+ */
+L(chk1w):
+ andi a2,t8,(NSIZE-1) /* a2 is the reminder past one (d)word chunks */
+ beq a2,t8,L(lastb)
+ PTR_SUBU a3,t8,a2 /* a3 is count of bytes in one (d)word chunks */
+ PTR_ADDU a3,a0,a3 /* a3 is the dst address after loop */
+/* copying in words (4-byte or 8-byte chunks) */
+L(wordCopy_loop):
+ C_LD REG3,UNIT(0)(a1)
+ PTR_ADDIU a1,a1,UNIT(1)
+ PTR_ADDIU a0,a0,UNIT(1)
+ bne a0,a3,L(wordCopy_loop)
+ C_ST REG3,UNIT(-1)(a0)
+
+/* Copy the last 8 (or 16) bytes */
+L(lastb):
+ blez a2,L(leave)
+ PTR_ADDU a3,a0,a2 /* a3 is the last dst address */
+L(lastbloop):
+ lb v1,0(a1)
+ PTR_ADDIU a1,a1,1
+ PTR_ADDIU a0,a0,1
+ bne a0,a3,L(lastbloop)
+ sb v1,-1(a0)
+L(leave):
+ j ra
+ nop
+/*
+ * UNALIGNED case, got here with a3 = "negu a0"
+ * This code is nearly identical to the aligned code above
+ * but only the destination (not the source) gets aligned
+ * so we need to do partial loads of the source followed
+ * by normal stores to the destination (once we have aligned
+ * the destination).
+ */
+
+L(unaligned):
+ andi a3,a3,(NSIZE-1) /* copy a3 bytes to align a0/a1 */
+ beqz a3,L(ua_chk16w) /* if a3=0, it is already aligned */
+ PTR_SUBU a2,a2,a3 /* a2 is the remining bytes count */
+
+ C_LDHI v1,UNIT(0)(a1)
+ C_LDLO v1,UNITM1(1)(a1)
+ PTR_ADDU a1,a1,a3
+ C_STHI v1,UNIT(0)(a0)
+ PTR_ADDU a0,a0,a3
+
+/*
+ * Now the destination (but not the source) is aligned
+ * Set a2 to count how many bytes we have to copy after all the 64/128 byte
+ * chunks are copied and a3 to the dst pointer after all the 64/128 byte
+ * chunks have been copied. We will loop, incrementing a0 and a1 until a0
+ * equals a3.
+ */
+
+L(ua_chk16w):
+ andi t8,a2,NSIZEDMASK /* any whole 64-byte/128-byte chunks? */
+ beq a2,t8,L(ua_chkw) /* if a2==t8, no 64-byte/128-byte chunks */
+ PTR_SUBU a3,a2,t8 /* subtract from a2 the reminder */
+ PTR_ADDU a3,a0,a3 /* Now a3 is the final dst after loop */
+
+#if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
+ PTR_ADDU t0,a0,a2 /* t0 is the "past the end" address */
+ PTR_SUBU t9,t0,PREFETCH_LIMIT /* t9 is the "last safe pref" address */
+#endif
+ PREFETCH_FOR_LOAD (0, a1)
+ PREFETCH_FOR_LOAD (1, a1)
+ PREFETCH_FOR_LOAD (2, a1)
+ PREFETCH_FOR_STORE (1, a0)
+#if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
+ sltu v1,t9,a0
+ bgtz v1,L(ua_loop16w) /* skip prefetch for too short arrays */
+ nop
+#endif
+ PREFETCH_FOR_STORE (2, a0)
+L(ua_loop16w):
+ PREFETCH_FOR_LOAD (3, a1)
+ C_LDHI t0,UNIT(0)(a1)
+ C_LDLO t0,UNITM1(1)(a1)
+ C_LDHI t1,UNIT(1)(a1)
+#if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
+ bgtz v1,L(ua_skip_pref30_96)
+#endif
+ C_LDLO t1,UNITM1(2)(a1)
+ PREFETCH_FOR_STORE (3, a0)
+L(ua_skip_pref30_96):
+ C_LDHI REG2,UNIT(2)(a1)
+ C_LDLO REG2,UNITM1(3)(a1)
+ C_LDHI REG3,UNIT(3)(a1)
+ C_LDLO REG3,UNITM1(4)(a1)
+ C_LDHI REG4,UNIT(4)(a1)
+ C_LDLO REG4,UNITM1(5)(a1)
+ C_LDHI REG5,UNIT(5)(a1)
+ C_LDLO REG5,UNITM1(6)(a1)
+ C_LDHI REG6,UNIT(6)(a1)
+ C_LDLO REG6,UNITM1(7)(a1)
+ C_LDHI REG7,UNIT(7)(a1)
+ C_LDLO REG7,UNITM1(8)(a1)
+ PREFETCH_FOR_LOAD (4, a1)
+ C_ST t0,UNIT(0)(a0)
+ C_ST t1,UNIT(1)(a0)
+ C_ST REG2,UNIT(2)(a0)
+ C_ST REG3,UNIT(3)(a0)
+ C_ST REG4,UNIT(4)(a0)
+ C_ST REG5,UNIT(5)(a0)
+ C_ST REG6,UNIT(6)(a0)
+ C_ST REG7,UNIT(7)(a0)
+ C_LDHI t0,UNIT(8)(a1)
+ C_LDLO t0,UNITM1(9)(a1)
+ C_LDHI t1,UNIT(9)(a1)
+#if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
+ bgtz v1,L(ua_skip_pref30_128)
+#endif
+ C_LDLO t1,UNITM1(10)(a1)
+ PREFETCH_FOR_STORE (4, a0)
+L(ua_skip_pref30_128):
+ C_LDHI REG2,UNIT(10)(a1)
+ C_LDLO REG2,UNITM1(11)(a1)
+ C_LDHI REG3,UNIT(11)(a1)
+ C_LDLO REG3,UNITM1(12)(a1)
+ C_LDHI REG4,UNIT(12)(a1)
+ C_LDLO REG4,UNITM1(13)(a1)
+ C_LDHI REG5,UNIT(13)(a1)
+ C_LDLO REG5,UNITM1(14)(a1)
+ C_LDHI REG6,UNIT(14)(a1)
+ C_LDLO REG6,UNITM1(15)(a1)
+ C_LDHI REG7,UNIT(15)(a1)
+ C_LDLO REG7,UNITM1(16)(a1)
+ PREFETCH_FOR_LOAD (5, a1)
+ C_ST t0,UNIT(8)(a0)
+ C_ST t1,UNIT(9)(a0)
+ C_ST REG2,UNIT(10)(a0)
+ C_ST REG3,UNIT(11)(a0)
+ C_ST REG4,UNIT(12)(a0)
+ C_ST REG5,UNIT(13)(a0)
+ C_ST REG6,UNIT(14)(a0)
+ C_ST REG7,UNIT(15)(a0)
+ PTR_ADDIU a0,a0,UNIT(16) /* adding 64/128 to dest */
+#if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
+ sltu v1,t9,a0
+#endif
+ bne a0,a3,L(ua_loop16w)
+ PTR_ADDIU a1,a1,UNIT(16) /* adding 64/128 to src */
+ move a2,t8
+
+/* Here we have src and dest word-aligned but less than 64-bytes or
+ * 128 bytes to go. Check for a 32(64) byte chunk and copy if if there
+ * is one. Otherwise jump down to L(ua_chk1w) to handle the tail end of
+ * the copy. */
+
+L(ua_chkw):
+ PREFETCH_FOR_LOAD (0, a1)
+ andi t8,a2,NSIZEMASK /* Is there a 32-byte/64-byte chunk. */
+ /* t8 is the reminder count past 32-bytes */
+ beq a2,t8,L(ua_chk1w) /* When a2=t8, no 32-byte chunk */
+ nop
+ C_LDHI t0,UNIT(0)(a1)
+ C_LDLO t0,UNITM1(1)(a1)
+ C_LDHI t1,UNIT(1)(a1)
+ C_LDLO t1,UNITM1(2)(a1)
+ C_LDHI REG2,UNIT(2)(a1)
+ C_LDLO REG2,UNITM1(3)(a1)
+ C_LDHI REG3,UNIT(3)(a1)
+ C_LDLO REG3,UNITM1(4)(a1)
+ C_LDHI REG4,UNIT(4)(a1)
+ C_LDLO REG4,UNITM1(5)(a1)
+ C_LDHI REG5,UNIT(5)(a1)
+ C_LDLO REG5,UNITM1(6)(a1)
+ C_LDHI REG6,UNIT(6)(a1)
+ C_LDLO REG6,UNITM1(7)(a1)
+ C_LDHI REG7,UNIT(7)(a1)
+ C_LDLO REG7,UNITM1(8)(a1)
+ PTR_ADDIU a1,a1,UNIT(8)
+ C_ST t0,UNIT(0)(a0)
+ C_ST t1,UNIT(1)(a0)
+ C_ST REG2,UNIT(2)(a0)
+ C_ST REG3,UNIT(3)(a0)
+ C_ST REG4,UNIT(4)(a0)
+ C_ST REG5,UNIT(5)(a0)
+ C_ST REG6,UNIT(6)(a0)
+ C_ST REG7,UNIT(7)(a0)
+ PTR_ADDIU a0,a0,UNIT(8)
+/*
+ * Here we have less then 32(64) bytes to copy. Set up for a loop to
+ * copy one word (or double word) at a time.
+ */
+L(ua_chk1w):
+ andi a2,t8,(NSIZE-1) /* a2 is the reminder past one (d)word chunks */
+ beq a2,t8,L(ua_smallCopy)
+ PTR_SUBU a3,t8,a2 /* a3 is count of bytes in one (d)word chunks */
+ PTR_ADDU a3,a0,a3 /* a3 is the dst address after loop */
+
+/* copying in words (4-byte or 8-byte chunks) */
+L(ua_wordCopy_loop):
+ C_LDHI v1,UNIT(0)(a1)
+ C_LDLO v1,UNITM1(1)(a1)
+ PTR_ADDIU a1,a1,UNIT(1)
+ PTR_ADDIU a0,a0,UNIT(1)
+ bne a0,a3,L(ua_wordCopy_loop)
+ C_ST v1,UNIT(-1)(a0)
+
+/* Copy the last 8 (or 16) bytes */
+L(ua_smallCopy):
+ beqz a2,L(leave)
+ PTR_ADDU a3,a0,a2 /* a3 is the last dst address */
+L(ua_smallCopy_loop):
+ lb v1,0(a1)
+ PTR_ADDIU a1,a1,1
+ PTR_ADDIU a0,a0,1
+ bne a0,a3,L(ua_smallCopy_loop)
+ sb v1,-1(a0)
+
+ j ra
+ nop
+
+ .set at
.set reorder
-END (memcpy)
-libc_hidden_builtin_def (memcpy)
+END(MEMCPY_NAME)
+#ifdef _LIBC
+libc_hidden_builtin_def (MEMCPY_NAME)
+#endif
^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [PATCH] Optimize MIPS memcpy
2012-10-08 22:31 ` Maxim Kuvyrkov
2012-10-09 20:50 ` Steve Ellcey
@ 2012-10-15 17:49 ` Steve Ellcey
2012-10-15 20:20 ` Andrew Pinski
2012-10-15 22:05 ` Maxim Kuvyrkov
1 sibling, 2 replies; 50+ messages in thread
From: Steve Ellcey @ 2012-10-15 17:49 UTC (permalink / raw)
To: Maxim Kuvyrkov; +Cc: Andrew T Pinski, Joseph S. Myers, libc-ports
[-- Attachment #1: Type: text/plain, Size: 1284 bytes --]
On Tue, 2012-10-09 at 11:30 +1300, Maxim Kuvyrkov wrote:
> I too want to keep prepare-for-store prefetches if possible. For debugging purposes you could amend
> prepare-for-store prefetch macros to trigger a loop that would unconditionally clobber memory locations
> that prepare-for-store is expected to zero-out. Or add some other assertions to help out with debugging.
>
> Thanks,
>
> --
> Maxim Kuvyrkov
> CodeSourcery / Mentor Graphics
Maxim,
Could you try running this test program on your system. I want to see
if it verifies that your machine is doing 32 byte prefetches. The
output I get looks like:
0x004754a0, (0x004754a0 to 0x004754c0, 32 byte prefetch)
0x004754a1, (0x004754a0 to 0x004754c0, 32 byte prefetch)
0x004754a2, (0x004754a0 to 0x004754c0, 32 byte prefetch)
0x004754a3, (0x004754a0 to 0x004754c0, 32 byte prefetch)
0x004754a4, (0x004754a0 to 0x004754c0, 32 byte prefetch)
0x004754a5, (0x004754a0 to 0x004754c0, 32 byte prefetch)
.
.
.
0x0047589b, (0x00475880 to 0x004758a0, 32 byte prefetch)
0x0047589c, (0x00475880 to 0x004758a0, 32 byte prefetch)
0x0047589d, (0x00475880 to 0x004758a0, 32 byte prefetch)
0x0047589e, (0x00475880 to 0x004758a0, 32 byte prefetch)
0x0047589f, (0x00475880 to 0x004758a0, 32 byte prefetch)
Steve Ellcey
sellcey@mips.com
[-- Attachment #2: check_prefetch_size.c --]
[-- Type: text/x-csrc, Size: 1066 bytes --]
#include <stdio.h>
char dummy[409600];  /* written in full to evict buffer[] from the data cache */
char buffer[3072];   /* probe target; only the first 2048 bytes are scanned —
                        the extra 1024 bytes are slack so a large prefetch
                        line starting near &buffer[2047] stays in bounds */
/*
 * check_buffer - empirically measure how many bytes the MIPS
 * "pref 30" (PrepareForStore) prefetch zeroes at address P.
 *
 * In:   p - address inside buffer[] to probe (main passes &buffer[1024..2047])
 * Out:  prints the observed zeroed range and its size, or "(no zeros)".
 *
 * Method: fill the first 2048 bytes of buffer with 1s, evict buffer from
 * the cache by streaming through the much larger dummy[] array, issue the
 * prefetch, then scan for the single run of zero bytes that the
 * prepare-for-store prefetch created by allocating a fresh cache line.
 */
void
check_buffer(char *p)
{
  int i, zero_start, zero_stop;

  /* Initialize buffer to non-zero data */
  for (i = 0; i < 2048; i++)
    buffer[i] = 1;

  /* Evict buffer from the cache by writing a much larger array */
  for (i = 0; i < 409600; i++)
    dummy[i] = 9;

#if 1
  /* MIPS-only: hint 30 is "prepare for store" (zero-fills the line) */
  __asm__ ("pref 30, 0x0(%0)" : : "r" (p));
#endif

  /* Check contents for a single block of zeros.  Test the index before
     dereferencing so we never read past the 2048 initialized bytes.  */
  zero_start = 0;
  while ((zero_start < 2048) && (buffer[zero_start] == 1)) zero_start++;
  zero_stop = zero_start;
  while ((zero_stop < 2048) && (buffer[zero_stop] == 0)) zero_stop++;
  for (i = zero_stop; i < 2048; i++)
    if (buffer[i] == 0) printf("Error, extra set of zeros\n");

  /* Cast pointers before printing: passing a pointer for %x is undefined
     behavior and truncates addresses on 64-bit ABIs (n64).  */
  if (zero_start >= 2048)
    printf("0x%8.8lx, (no zeros)\n", (unsigned long) p);
  else
    printf("0x%8.8lx, (0x%8.8lx to 0x%8.8lx, %d byte prefetch)\n",
	   (unsigned long) p, (unsigned long) &buffer[zero_start],
	   (unsigned long) &buffer[zero_stop], zero_stop - zero_start);
#if 0
  /* Dump buffer contents */
  for (i = 0; i < 2048; i++)
    printf("%1d", buffer[i]);
  printf("\n");
#endif
}
/*
 * Sweep every byte offset across the second KB of buffer[] so the probe
 * hits each possible alignment within a cache line.
 */
int
main(void)
{
  int i;

  for (i = 1024; i < 2048; i++)
    check_buffer(&buffer[i]);
  return 0;
}
^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [PATCH] Optimize MIPS memcpy
2012-10-15 17:49 ` Steve Ellcey
@ 2012-10-15 20:20 ` Andrew Pinski
2012-10-15 20:34 ` Steve Ellcey
2012-10-15 22:05 ` Maxim Kuvyrkov
1 sibling, 1 reply; 50+ messages in thread
From: Andrew Pinski @ 2012-10-15 20:20 UTC (permalink / raw)
To: Steve Ellcey; +Cc: Maxim Kuvyrkov, Joseph S. Myers, libc-ports
On Mon, Oct 15, 2012 at 10:49 AM, Steve Ellcey <sellcey@mips.com> wrote:
> On Tue, 2012-10-09 at 11:30 +1300, Maxim Kuvyrkov wrote:
>
>> I too want to keep prepare-for-store prefetches if possible. For debugging purposes you could amend
>> prepare-for-store prefetch macros to trigger a loop that would unconditionally clobber memory locations
>> that prepare-for-store is expected to zero-out. Or add some other assertions to help out with debugging.
>>
>> Thanks,
>>
>> --
>> Maxim Kuvyrkov
>> CodeSourcery / Mentor Graphics
>
> Maxim,
>
> Could you try running this test program on your system. I want to see
> if it verifies that your machine is doing 32 byte prefetches. The
> output I get looks like:
>
>
> 0x004754a0, (0x004754a0 to 0x004754c0, 32 byte prefetch)
> 0x004754a1, (0x004754a0 to 0x004754c0, 32 byte prefetch)
> 0x004754a2, (0x004754a0 to 0x004754c0, 32 byte prefetch)
> 0x004754a3, (0x004754a0 to 0x004754c0, 32 byte prefetch)
> 0x004754a4, (0x004754a0 to 0x004754c0, 32 byte prefetch)
> 0x004754a5, (0x004754a0 to 0x004754c0, 32 byte prefetch)
> .
> .
> .
> 0x0047589b, (0x00475880 to 0x004758a0, 32 byte prefetch)
> 0x0047589c, (0x00475880 to 0x004758a0, 32 byte prefetch)
> 0x0047589d, (0x00475880 to 0x004758a0, 32 byte prefetch)
> 0x0047589e, (0x00475880 to 0x004758a0, 32 byte prefetch)
> 0x0047589f, (0x00475880 to 0x004758a0, 32 byte prefetch)
On:
system type : EBB6300 (CN6335p2.1-1500-AAP)
processor : 0
cpu model : Cavium Octeon II V0.9
BogoMIPS : 3000.00
wait instruction : yes
microsecond timers : yes
tlb_entries : 128
extra interrupt vector : yes
hardware watchpoint : yes, count: 2, address/irw mask: [0x0ffc, 0x0ffb]
ASEs implemented :
shadow register sets : 1
kscratch registers : 3
core : 0
VCED exceptions : not available
VCEI exceptions : not available
I get:
...
0x200757cb, (no zeros)
0x200757cc, (no zeros)
0x200757cd, (no zeros)
0x200757ce, (no zeros)
0x200757cf, (0x20075780 to 0x20075800, 128 byte prefetch)
.....
0x2007587a, (no zeros)
0x2007587b, (no zeros)
0x2007587c, (no zeros)
0x2007587d, (no zeros)
0x2007587e, (no zeros)
0x2007587f, (no zeros)
0x20075880, (0x20075880 to 0x20075900, 128 byte prefetch)
Thanks,
Andrew Pinski
>
> Steve Ellcey
> sellcey@mips.com
^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [PATCH] Optimize MIPS memcpy
2012-10-15 20:20 ` Andrew Pinski
@ 2012-10-15 20:34 ` Steve Ellcey
2012-10-15 20:42 ` Andrew Pinski
2012-10-15 21:29 ` Maciej W. Rozycki
0 siblings, 2 replies; 50+ messages in thread
From: Steve Ellcey @ 2012-10-15 20:34 UTC (permalink / raw)
To: Andrew Pinski; +Cc: Maxim Kuvyrkov, Joseph S. Myers, libc-ports
On Mon, 2012-10-15 at 13:20 -0700, Andrew Pinski wrote:
> On:
> system type : EBB6300 (CN6335p2.1-1500-AAP)
> processor : 0
> cpu model : Cavium Octeon II V0.9
>
> I get:
> ...
> 0x200757cb, (no zeros)
> 0x200757cc, (no zeros)
> 0x200757cd, (no zeros)
> 0x200757ce, (no zeros)
> 0x200757cf, (0x20075780 to 0x20075800, 128 byte prefetch)
>
> Thanks,
> Andrew Pinski
Andrew,
Is there a macro I can/should use when building glibc/memcpy to know
that it should assume a Cavium Octeon with 128 byte prefetch instead of
the 32 byte prefetch?
Steve Ellcey
sellcey@mips.com
^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [PATCH] Optimize MIPS memcpy
2012-10-15 20:34 ` Steve Ellcey
@ 2012-10-15 20:42 ` Andrew Pinski
2012-10-15 20:50 ` Andrew Pinski
2012-10-15 21:29 ` Maciej W. Rozycki
1 sibling, 1 reply; 50+ messages in thread
From: Andrew Pinski @ 2012-10-15 20:42 UTC (permalink / raw)
To: Steve Ellcey; +Cc: Maxim Kuvyrkov, Joseph S. Myers, libc-ports
On Mon, Oct 15, 2012 at 1:34 PM, Steve Ellcey <sellcey@mips.com> wrote:
> On Mon, 2012-10-15 at 13:20 -0700, Andrew Pinski wrote:
>
>> On:
>> system type : EBB6300 (CN6335p2.1-1500-AAP)
>> processor : 0
>> cpu model : Cavium Octeon II V0.9
>>
>> I get:
>> ...
>> 0x200757cb, (no zeros)
>> 0x200757cc, (no zeros)
>> 0x200757cd, (no zeros)
>> 0x200757ce, (no zeros)
>> 0x200757cf, (0x20075780 to 0x20075800, 128 byte prefetch)
>>
>> Thanks,
>> Andrew Pinski
>
> Andrew,
>
> Is there a macro I can/should use when building glibc/memcpy to know
> that it should assume a Cavium Octeon with 128 byte prefetch instead of
> the 32 byte prefetch?
Building you could use __OCTEON__ but that does not change the fact
you could build glibc for the standard mips32/mips64 and then not get
a working glibc if it defaults to 32bytes prefetch.
Thanks,
Andrew Pinski
>
> Steve Ellcey
> sellcey@mips.com
>
>
^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [PATCH] Optimize MIPS memcpy
2012-10-15 20:42 ` Andrew Pinski
@ 2012-10-15 20:50 ` Andrew Pinski
2012-10-15 21:36 ` Steve Ellcey
0 siblings, 1 reply; 50+ messages in thread
From: Andrew Pinski @ 2012-10-15 20:50 UTC (permalink / raw)
To: Steve Ellcey; +Cc: Maxim Kuvyrkov, Joseph S. Myers, libc-ports
On Mon, Oct 15, 2012 at 1:42 PM, Andrew Pinski <pinskia@gmail.com> wrote:
> On Mon, Oct 15, 2012 at 1:34 PM, Steve Ellcey <sellcey@mips.com> wrote:
>> On Mon, 2012-10-15 at 13:20 -0700, Andrew Pinski wrote:
>>
>>> On:
>>> system type : EBB6300 (CN6335p2.1-1500-AAP)
>>> processor : 0
>>> cpu model : Cavium Octeon II V0.9
>>>
>>> I get:
>>> ...
>>> 0x200757cb, (no zeros)
>>> 0x200757cc, (no zeros)
>>> 0x200757cd, (no zeros)
>>> 0x200757ce, (no zeros)
>>> 0x200757cf, (0x20075780 to 0x20075800, 128 byte prefetch)
>>>
>>> Thanks,
>>> Andrew Pinski
>>
>> Andrew,
>>
>> Is there a macro I can/should use when building glibc/memcpy to know
>> that it should assume a Cavium Octeon with 128 byte prefetch instead of
>> the 32 byte prefetch?
>
>
> Building you could use __OCTEON__ but that does not change the fact
> you could build glibc for the standard mips32/mips64 and then not get
> a working glibc if it defaults to 32bytes prefetch.
Also it would be nice to use ifunc's like they are used on x86_64 (and
I think PPC also) so we can compile one generic version of glibc and
get the optimized version of memcpy. Though ifunc's have their own
issue as they don't currently work on MIPS (they cause internal linker
errors).
Thanks,
Andrew
>
> Thanks,
> Andrew Pinski
>
>>
>> Steve Ellcey
>> sellcey@mips.com
>>
>>
^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [PATCH] Optimize MIPS memcpy
2012-10-15 20:34 ` Steve Ellcey
2012-10-15 20:42 ` Andrew Pinski
@ 2012-10-15 21:29 ` Maciej W. Rozycki
1 sibling, 0 replies; 50+ messages in thread
From: Maciej W. Rozycki @ 2012-10-15 21:29 UTC (permalink / raw)
To: Steve Ellcey; +Cc: Andrew Pinski, Maxim Kuvyrkov, Joseph S. Myers, libc-ports
On Mon, 15 Oct 2012, Steve Ellcey wrote:
> > On:
> > system type : EBB6300 (CN6335p2.1-1500-AAP)
> > processor : 0
> > cpu model : Cavium Octeon II V0.9
> >
> > I get:
> > ...
> > 0x200757cb, (no zeros)
> > 0x200757cc, (no zeros)
> > 0x200757cd, (no zeros)
> > 0x200757ce, (no zeros)
> > 0x200757cf, (0x20075780 to 0x20075800, 128 byte prefetch)
> >
> > Thanks,
> > Andrew Pinski
>
> Andrew,
>
> Is there a macro I can/should use when building glibc/memcpy to know
> that it should assume a Cavium Octeon with 128 byte prefetch instead of
> the 32 byte prefetch?
FWIW I don't think hardcoding the cache line size for individual
processor types is going to scale, not even mentioning it may not serve
its purpose at all given that the cache line size may be boot-mode or even
run-time configurable in a vendor-specific way (some MTI cores for example
use CP0.Config.WC for cache topology reconfiguration, although the
currently available implementations do not seem to include the line sizes
among the reconfigurable parameters).
This looks to me like a case for multiple copies of memcpy binary code
tuned for an individual cache line size each and then selected via the
IFUNC feature -- there should be no run-time penalty for doing that in
dynamic executables/libraries (except from libc itself perhaps) as the
call is going to be made through the GOT anyway. Of course the line size
needs to be determined somehow at the first invocation -- perhaps the
appropriate bits from CP0 Config1/2 registers could be exported by the
kernel.
If storage/memory footprint is of concern, then perhaps for -Os builds
(is that supported for glibc these days anyway?) only a single copy of
memcpy could be built.
BTW, the M14Kc only has a 16-byte cache line size, so it will need
another arrangement.
Thoughts?
Maciej
^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [PATCH] Optimize MIPS memcpy
2012-10-15 20:50 ` Andrew Pinski
@ 2012-10-15 21:36 ` Steve Ellcey
2012-10-15 21:47 ` Maxim Kuvyrkov
2012-10-15 22:10 ` Joseph S. Myers
0 siblings, 2 replies; 50+ messages in thread
From: Steve Ellcey @ 2012-10-15 21:36 UTC (permalink / raw)
To: Andrew Pinski; +Cc: Maxim Kuvyrkov, Joseph S. Myers, libc-ports
On Mon, 2012-10-15 at 13:50 -0700, Andrew Pinski wrote:
Building you could use __OCTEON__ but that does not change the fact
you could build glibc for the standard mips32/mips64 and then not get
a working glibc if it defaults to 32bytes prefetch.
So are you saying that we shouldn't use the 'prepare to store' prefetch
in the glibc memcpy then? We could use one of the other prefetches
without having to worry about bad code on machines with different size
prefetches, but it would not be as fast as using 'prepare to store'.
> Also it would be nice to use ifunc's like they are used on x86_64 (and
> I think PPC also) so we can compile one generic version of glibc and
> get the optimized version of memcpy. Though ifunc's have their own
> issue as they don't currently work on MIPS (they cause internal linker
> errors).
I'll have to look at that, I am not familiar with the ifunc's except in
very general terms.
Steve Ellcey
sellcey@mips.com
^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [PATCH] Optimize MIPS memcpy
2012-10-15 21:36 ` Steve Ellcey
@ 2012-10-15 21:47 ` Maxim Kuvyrkov
2012-10-17 17:30 ` Steve Ellcey
2012-10-15 22:10 ` Joseph S. Myers
1 sibling, 1 reply; 50+ messages in thread
From: Maxim Kuvyrkov @ 2012-10-15 21:47 UTC (permalink / raw)
To: Steve Ellcey; +Cc: Andrew Pinski, Joseph S. Myers, libc-ports
On 16/10/2012, at 10:36 AM, Steve Ellcey wrote:
> On Mon, 2012-10-15 at 13:50 -0700, Andrew Pinski wrote:
>
> Building you could use __OCTEON__ but that does not change the fact
> you could build glibc for the standard mips32/mips64 and then not get
> a working glibc if it defaults to 32bytes prefetch.
>
> So are you saying that we shouldn't use the 'prepare to store' prefetch
> in the glibc memcpy then? We could use one of the other prefetches
> without having to worry about bad code on machines with different size
> prefetches, but it would not be as fast as using 'prepare to store'.
>
>> Also it would be nice to use ifunc's like they are used on x86_64 (and
>> I think PPC also) so we can compile one generic version of glibc and
>> get the optimized version of memcpy. Though ifunc's have their own
>> issue as they don't currently work on MIPS (they cause internal linker
>> errors).
>
> I'll have to look at that, I am not familiar with the ifunc's except in
> very general terms.
I suggest you move support for prepare-to-store prefetches into TODO category blocked on working IFUNC support for MIPS. Without IFUNCs and having all the different-sized cache lines on MIPS you can't make prepare-to-store work reliably.
Once IFUNC support for MIPS is there, we can revisit using prepare-to-store prefetches.
--
Maxim Kuvyrkov
CodeSourcery / Mentor Graphics
^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [PATCH] Optimize MIPS memcpy
2012-10-15 17:49 ` Steve Ellcey
2012-10-15 20:20 ` Andrew Pinski
@ 2012-10-15 22:05 ` Maxim Kuvyrkov
1 sibling, 0 replies; 50+ messages in thread
From: Maxim Kuvyrkov @ 2012-10-15 22:05 UTC (permalink / raw)
To: Steve Ellcey; +Cc: Andrew T Pinski, Joseph S. Myers, libc-ports
On 16/10/2012, at 6:49 AM, Steve Ellcey wrote:
> On Tue, 2012-10-09 at 11:30 +1300, Maxim Kuvyrkov wrote:
>
>> I too want to keep prepare-for-store prefetches if possible. For debugging purposes you could amend
>> prepare-for-store prefetch macros to trigger a loop that would unconditionally clobber memory locations
>> that prepare-for-store is expected to zero-out. Or add some other assertions to help out with debugging.
>>
>> Thanks,
>>
>> --
>> Maxim Kuvyrkov
>> CodeSourcery / Mentor Graphics
>
> Maxim,
>
> Could you try running this test program on your system. I want to see
> if it verifies that your machine is doing 32 byte prefetches. The
> output I get looks like:
>
>
> 0x004754a0, (0x004754a0 to 0x004754c0, 32 byte prefetch)
> 0x004754a1, (0x004754a0 to 0x004754c0, 32 byte prefetch)
> 0x004754a2, (0x004754a0 to 0x004754c0, 32 byte prefetch)
> 0x004754a3, (0x004754a0 to 0x004754c0, 32 byte prefetch)
> 0x004754a4, (0x004754a0 to 0x004754c0, 32 byte prefetch)
> 0x004754a5, (0x004754a0 to 0x004754c0, 32 byte prefetch)
> .
> .
> .
> 0x0047589b, (0x00475880 to 0x004758a0, 32 byte prefetch)
> 0x0047589c, (0x00475880 to 0x004758a0, 32 byte prefetch)
> 0x0047589d, (0x00475880 to 0x004758a0, 32 byte prefetch)
> 0x0047589e, (0x00475880 to 0x004758a0, 32 byte prefetch)
> 0x0047589f, (0x00475880 to 0x004758a0, 32 byte prefetch)
This is a big-endian target and I get 64-byte prefetches (for n32, o32 and n64 ABIs).
Now that I have checked XLP documentation once again, it says that L1 cache line is 32-bytes, but L2 and L3 are 64-bytes. Still, documentation for prefetch instruction insists that expected result for prefetch instruction is to prefetch 32-bytes.
0x200756d0, (no zeros)
0x200756d1, (0x200756c0 to 0x20075700, 64 byte prefetch)
0x200756d2, (0x200756c0 to 0x20075700, 64 byte prefetch)
0x200756d3, (0x200756c0 to 0x20075700, 64 byte prefetch)
0x200756d4, (0x200756c0 to 0x20075700, 64 byte prefetch)
0x200756d5, (0x200756c0 to 0x20075700, 64 byte prefetch)
0x200756d6, (0x200756c0 to 0x20075700, 64 byte prefetch)
--
Maxim Kuvyrkov
CodeSourcery / Mentor Graphics
^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [PATCH] Optimize MIPS memcpy
2012-10-15 21:36 ` Steve Ellcey
2012-10-15 21:47 ` Maxim Kuvyrkov
@ 2012-10-15 22:10 ` Joseph S. Myers
1 sibling, 0 replies; 50+ messages in thread
From: Joseph S. Myers @ 2012-10-15 22:10 UTC (permalink / raw)
To: Steve Ellcey; +Cc: Andrew Pinski, Maxim Kuvyrkov, libc-ports
On Mon, 15 Oct 2012, Steve Ellcey wrote:
> > Also it would be nice to use ifunc's like they are used on x86_64 (and
> > I think PPC also) so we can compile one generic version of glibc and
> > get the optimized version of memcpy. Though ifunc's have their own
> > issue as they don't currently work on MIPS (they cause internal linker
> > errors).
>
> I'll have to look at that, I am not familiar with the ifunc's except in
> very general terms.
IFUNC support would first require defining the associated psABI pieces
(for all three ABIs), before implementing in ld and glibc.
--
Joseph S. Myers
joseph@codesourcery.com
^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [PATCH] Optimize MIPS memcpy
2012-10-15 21:47 ` Maxim Kuvyrkov
@ 2012-10-17 17:30 ` Steve Ellcey
2012-10-29 18:00 ` Steve Ellcey
2012-10-31 19:27 ` Andreas Jaeger
0 siblings, 2 replies; 50+ messages in thread
From: Steve Ellcey @ 2012-10-17 17:30 UTC (permalink / raw)
To: Maxim Kuvyrkov; +Cc: Andrew Pinski, Joseph S. Myers, libc-ports
On Tue, 2012-10-16 at 10:47 +1300, Maxim Kuvyrkov wrote:
> I suggest you move support for prepare-to-store prefetches into TODO category blocked on working
> IFUNC support for MIPS. Without IFUNCs and having all the different-sized cache lines on MIPS you
> can't make prepare-to-store work reliably.
>
> Once IFUNC support for MIPS is there, we can revisit using prepare-to-store prefetches.
>
> --
> Maxim Kuvyrkov
> CodeSourcery / Mentor Graphics
OK, Here is a version of memcpy that uses the STORE_STREAMING prefetch.
While it is optimized for a 32 byte prefetch, it will work correctly
regardless of the size of the prefetch.
Is this version OK to checkin?
Steve Ellcey
sellcey@mips.com
2012-10-17 Steve Ellcey <sellcey@mips.com>
* sysdeps/mips/memcpy.S: Add prefetching and more unrolling, make
it work in 32 or 64 bit modes.
* sysdeps/mips/mips64/memcpy.S: Remove.
diff --git a/ports/sysdeps/mips/memcpy.S b/ports/sysdeps/mips/memcpy.S
index 753f67c..71474e9 100644
--- a/ports/sysdeps/mips/memcpy.S
+++ b/ports/sysdeps/mips/memcpy.S
@@ -1,7 +1,8 @@
-/* Copyright (C) 2002-2012 Free Software Foundation, Inc.
+/* Copyright (C) 2012 Free Software Foundation, Inc.
This file is part of the GNU C Library.
- Contributed by Hartvig Ekner <hartvige@mips.com>, 2002.
-
+
+ Contributed by MIPS Technologies, Inc.
+
The GNU C Library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
@@ -16,119 +17,616 @@
License along with the GNU C Library. If not, see
<http://www.gnu.org/licenses/>. */
+#ifdef ANDROID_CHANGES
+#include "machine/asm.h"
+#include "machine/regdef.h"
+#define USE_MEMMOVE_FOR_OVERLAP
+#define PREFETCH_LOAD_HINT PREFETCH_HINT_LOAD_STREAMED
+#define PREFETCH_STORE_HINT PREFETCH_HINT_PREPAREFORSTORE
+#elif _LIBC
#include <sysdep.h>
+#include <regdef.h>
+#include <sys/asm.h>
+#define PREFETCH_LOAD_HINT PREFETCH_HINT_LOAD_STREAMED
+#define PREFETCH_STORE_HINT PREFETCH_HINT_STORE_STREAMED
+#elif _COMPILING_NEWLIB
+#include "machine/asm.h"
+#include "machine/regdef.h"
+#define PREFETCH_LOAD_HINT PREFETCH_HINT_LOAD_STREAMED
+#define PREFETCH_STORE_HINT PREFETCH_HINT_STORE_STREAMED
+#else
+#include <regdef.h>
+#include <sys/asm.h>
+#endif
+
+#if (_MIPS_ISA == _MIPS_ISA_MIPS4) || (_MIPS_ISA == _MIPS_ISA_MIPS5) || \
+ (_MIPS_ISA == _MIPS_ISA_MIPS32) || (_MIPS_ISA == _MIPS_ISA_MIPS64)
+#ifndef DISABLE_PREFETCH
+#define USE_PREFETCH
+#endif
+#endif
+
+#if (_MIPS_SIM == _ABI64) || (_MIPS_SIM == _ABIN32)
+#ifndef DISABLE_DOUBLE
+#define USE_DOUBLE
+#endif
+#endif
+
+
+
+/* Some asm.h files do not have the L macro definition. */
+#ifndef L
+#if _MIPS_SIM == _ABIO32
+# define L(label) $L ## label
+#else
+# define L(label) .L ## label
+#endif
+#endif
+
+/* Some asm.h files do not have the PTR_ADDIU macro definition. */
+#ifndef PTR_ADDIU
+#ifdef USE_DOUBLE
+#define PTR_ADDIU daddiu
+#else
+#define PTR_ADDIU addiu
+#endif
+#endif
+
+/* Some asm.h files do not have the PTR_SRA macro definition. */
+#ifndef PTR_SRA
+#ifdef USE_DOUBLE
+#define PTR_SRA dsra
+#else
+#define PTR_SRA sra
+#endif
+#endif
+
+/*
+ * Using PREFETCH_HINT_LOAD_STREAMED instead of PREFETCH_LOAD on load
+ * prefetches appears to offer a slight performance advantage.
+ *
+ * Using PREFETCH_HINT_PREPAREFORSTORE instead of PREFETCH_STORE
+ * or PREFETCH_STORE_STREAMED offers a large performance advantage
+ * but PREPAREFORSTORE has some special restrictions to consider.
+ *
+ * Prefetch with the 'prepare for store' hint does not copy a memory
+ * location into the cache, it just allocates a cache line and zeros
+ * it out. This means that if you do not write to the entire cache
+ * line before writing it out to memory some data will get zero'ed out
+ * when the cache line is written back to memory and data will be lost.
+ *
+ * Also if you are using this memcpy to copy overlapping buffers it may
+ * not behave correctly when using the 'prepare for store' hint. If you
+ * use the 'prepare for store' prefetch on a memory area that is in the
+ * memcpy source (as well as the memcpy destination), then you will get
+ * some data zero'ed out before you have a chance to read it and data will
+ * be lost.
+ *
+ * If you are going to use this memcpy routine with the 'prepare for store'
+ * prefetch you may want to set USE_MEMMOVE_FOR_OVERLAP in order to avoid
+ * the problem of running memcpy on overlapping buffers.
+ *
+ * There are ifdef'ed sections of this memcpy to make sure that it does not
+ * do prefetches on cache lines that are not going to be completely written.
+ * This code is only needed and only used when PREFETCH_STORE_HINT is set to
+ * PREFETCH_HINT_PREPAREFORSTORE. This code assumes that cache lines are
+ * 32 bytes and if the cache line is larger it will not work correctly.
+ */
-/* void *memcpy(void *s1, const void *s2, size_t n); */
+#ifdef USE_PREFETCH
+# define PREFETCH_HINT_LOAD 0
+# define PREFETCH_HINT_STORE 1
+# define PREFETCH_HINT_LOAD_STREAMED 4
+# define PREFETCH_HINT_STORE_STREAMED 5
+# define PREFETCH_HINT_LOAD_RETAINED 6
+# define PREFETCH_HINT_STORE_RETAINED 7
+# define PREFETCH_HINT_WRITEBACK_INVAL 25
+# define PREFETCH_HINT_PREPAREFORSTORE 30
+
+/*
+ * If we have not picked out what hints to use at this point use the
+ * standard load and store prefetch hints.
+ */
+#ifndef PREFETCH_STORE_HINT
+# define PREFETCH_STORE_HINT PREFETCH_HINT_STORE
+#endif
+#ifndef PREFETCH_LOAD_HINT
+# define PREFETCH_LOAD_HINT PREFETCH_HINT_LOAD
+#endif
+/*
+ * We double everything when USE_DOUBLE is true so we do 2 prefetches to
+ * get 64 bytes in that case. The assumption is that each individual
+ * prefetch brings in 32 bytes.
+ */
+#ifdef USE_DOUBLE
+# define PREFETCH_CHUNK 64
+# define PREFETCH_FOR_LOAD(chunk, reg) \
+ pref PREFETCH_LOAD_HINT, (chunk)*32(reg); \
+ pref PREFETCH_LOAD_HINT, ((chunk)+1)*32(reg)
+# define PREFETCH_FOR_STORE(chunk, reg) \
+ pref PREFETCH_STORE_HINT, (chunk)*32(reg); \
+ pref PREFETCH_STORE_HINT, ((chunk)+1)*32(reg)
+#else
+# define PREFETCH_CHUNK 32
+# define PREFETCH_FOR_LOAD(chunk, reg) \
+ pref PREFETCH_LOAD_HINT, (chunk)*32(reg)
+# define PREFETCH_FOR_STORE(chunk, reg) \
+ pref PREFETCH_STORE_HINT, (chunk)*32(reg)
+#endif
+# define PREFETCH_LIMIT (5 * PREFETCH_CHUNK)
+#else /* USE_PREFETCH not defined */
+# define PREFETCH_FOR_LOAD(offset, reg)
+# define PREFETCH_FOR_STORE(offset, reg)
+#endif
+
+/* Allow the routine to be named something else if desired. */
+#ifndef MEMCPY_NAME
+#define MEMCPY_NAME memcpy
+#endif
+
+/* We use these 32/64 bit registers as temporaries to do the copying. */
+#define REG0 t0
+#define REG1 t1
+#define REG2 t2
+#define REG3 t3
+#if _MIPS_SIM == _ABIO32
+# define REG4 t4
+# define REG5 t5
+# define REG6 t6
+# define REG7 t7
+#else
+# define REG4 ta0
+# define REG5 ta1
+# define REG6 ta2
+# define REG7 ta3
+#endif
+
+/* We load/store 64 bits at a time when USE_DOUBLE is true.
+ * The C_ prefix stands for CHUNK and is used to avoid macro name
+ * conflicts with system header files. */
+
+#ifdef USE_DOUBLE
+# define C_ST sd
+# define C_LD ld
#if __MIPSEB
-# define LWHI lwl /* high part is left in big-endian */
-# define SWHI swl /* high part is left in big-endian */
-# define LWLO lwr /* low part is right in big-endian */
-# define SWLO swr /* low part is right in big-endian */
+# define C_LDHI ldl /* high part is left in big-endian */
+# define C_STHI sdl /* high part is left in big-endian */
+# define C_LDLO ldr /* low part is right in big-endian */
+# define C_STLO sdr /* low part is right in big-endian */
+#else
+# define C_LDHI ldr /* high part is right in little-endian */
+# define C_STHI sdr /* high part is right in little-endian */
+# define C_LDLO ldl /* low part is left in little-endian */
+# define C_STLO sdl /* low part is left in little-endian */
+#endif
+#else
+# define C_ST sw
+# define C_LD lw
+#if __MIPSEB
+# define C_LDHI lwl /* high part is left in big-endian */
+# define C_STHI swl /* high part is left in big-endian */
+# define C_LDLO lwr /* low part is right in big-endian */
+# define C_STLO swr /* low part is right in big-endian */
+#else
+# define C_LDHI lwr /* high part is right in little-endian */
+# define C_STHI swr /* high part is right in little-endian */
+# define C_LDLO lwl /* low part is left in little-endian */
+# define C_STLO swl /* low part is left in little-endian */
+#endif
+#endif
+
+/* Bookkeeping values for 32 vs. 64 bit mode. */
+#ifdef USE_DOUBLE
+# define NSIZE 8
+# define NSIZEMASK 0x3f
+# define NSIZEDMASK 0x7f
#else
-# define LWHI lwr /* high part is right in little-endian */
-# define SWHI swr /* high part is right in little-endian */
-# define LWLO lwl /* low part is left in little-endian */
-# define SWLO swl /* low part is left in little-endian */
+# define NSIZE 4
+# define NSIZEMASK 0x1f
+# define NSIZEDMASK 0x3f
#endif
+#define UNIT(unit) ((unit)*NSIZE)
+#define UNITM1(unit) (((unit)*NSIZE)-1)
-ENTRY (memcpy)
+#ifdef ANDROID_CHANGES
+LEAF(MEMCPY_NAME, 0)
+#else
+LEAF(MEMCPY_NAME)
+#endif
+ .set nomips16
.set noreorder
+/*
+ * Below we handle the case where memcpy is called with overlapping src and dst.
+ * Although memcpy is not required to handle this case, some parts of Android
+ * like Skia rely on such usage. We call memmove to handle such cases.
+ */
+#ifdef USE_MEMMOVE_FOR_OVERLAP
+ PTR_SUBU t0,a0,a1
+ PTR_SRA t2,t0,31
+ xor t1,t0,t2
+ PTR_SUBU t0,t1,t2
+ sltu t2,t0,a2
+ beq t2,zero,L(memcpy)
+ la t9,memmove
+ jr t9
+ nop
+L(memcpy):
+#endif
+/*
+ * If the size is less than 2*NSIZE (8 or 16), go to L(lastb). Regardless of
+ * size, copy dst pointer to v0 for the return value.
+ */
+ slti t2,a2,(2 * NSIZE)
+ bne t2,zero,L(lastb)
+ move v0,a0
+/*
+ * If src and dst have different alignments, go to L(unaligned), if they
+ * have the same alignment (but are not actually aligned) do a partial
+ * load/store to make them aligned. If they are both already aligned
+ * we can start copying at L(aligned).
+ */
+ xor t8,a1,a0
+ andi t8,t8,(NSIZE-1) /* t8 is a0/a1 word-displacement */
+ bne t8,zero,L(unaligned)
+ PTR_SUBU a3, zero, a0
+
+ andi a3,a3,(NSIZE-1) /* copy a3 bytes to align a0/a1 */
+ beq a3,zero,L(aligned) /* if a3=0, it is already aligned */
+ PTR_SUBU a2,a2,a3 /* a2 is the remaining bytes count */
+
+ C_LDHI t8,0(a1)
+ PTR_ADDU a1,a1,a3
+ C_STHI t8,0(a0)
+ PTR_ADDU a0,a0,a3
+
+/*
+ * Now dst/src are both aligned to (word or double word) aligned addresses
+ * Set a2 to count how many bytes we have to copy after all the 64/128 byte
+ * chunks are copied and a3 to the dst pointer after all the 64/128 byte
+ * chunks have been copied. We will loop, incrementing a0 and a1 until a0
+ * equals a3.
+ */
+
+L(aligned):
+ andi t8,a2,NSIZEDMASK /* any whole 64-byte/128-byte chunks? */
+ beq a2,t8,L(chkw) /* if a2==t8, no 64-byte/128-byte chunks */
+ PTR_SUBU a3,a2,t8 /* subtract from a2 the remainder */
+ PTR_ADDU a3,a0,a3 /* Now a3 is the final dst after loop */
+
+/* When in the loop we may prefetch with the 'prepare to store' hint,
+ * in this case the a0+x should not be past the "t0-32" address. This
+ * means: for x=128 the last "safe" a0 address is "t0-160". Alternatively,
+ * for x=64 the last "safe" a0 address is "t0-96". In the current version we
+ * will use "prefetch hint,128(a0)", so "t0-160" is the limit.
+ */
+#if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
+ PTR_ADDU t0,a0,a2 /* t0 is the "past the end" address */
+ PTR_SUBU t9,t0,PREFETCH_LIMIT /* t9 is the "last safe pref" address */
+#endif
+ PREFETCH_FOR_LOAD (0, a1)
+ PREFETCH_FOR_LOAD (1, a1)
+ PREFETCH_FOR_LOAD (2, a1)
+ PREFETCH_FOR_STORE (1, a0)
+#if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
+ sltu v1,t9,a0 /* If a0 > t9 don't use next prefetch */
+ bgtz v1,L(loop16w)
+ nop
+#endif
+ PREFETCH_FOR_STORE (2, a0)
+L(loop16w):
+ PREFETCH_FOR_LOAD (3, a1)
+ C_LD t0,UNIT(0)(a1)
+#if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
+ bgtz v1,L(skip_pref30_96)
+#endif
+ C_LD t1,UNIT(1)(a1)
+ PREFETCH_FOR_STORE (3, a0)
+L(skip_pref30_96):
+ C_LD REG2,UNIT(2)(a1)
+ C_LD REG3,UNIT(3)(a1)
+ C_LD REG4,UNIT(4)(a1)
+ C_LD REG5,UNIT(5)(a1)
+ C_LD REG6,UNIT(6)(a1)
+ C_LD REG7,UNIT(7)(a1)
+ PREFETCH_FOR_LOAD (4, a1)
+
+ C_ST t0,UNIT(0)(a0)
+ C_ST t1,UNIT(1)(a0)
+ C_ST REG2,UNIT(2)(a0)
+ C_ST REG3,UNIT(3)(a0)
+ C_ST REG4,UNIT(4)(a0)
+ C_ST REG5,UNIT(5)(a0)
+ C_ST REG6,UNIT(6)(a0)
+ C_ST REG7,UNIT(7)(a0)
+
+ C_LD t0,UNIT(8)(a1)
+#if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
+ bgtz v1,L(skip_pref30_128)
+#endif
+ C_LD t1,UNIT(9)(a1)
+ PREFETCH_FOR_STORE (4, a0)
+L(skip_pref30_128):
+ C_LD REG2,UNIT(10)(a1)
+ C_LD REG3,UNIT(11)(a1)
+ C_LD REG4,UNIT(12)(a1)
+ C_LD REG5,UNIT(13)(a1)
+ C_LD REG6,UNIT(14)(a1)
+ C_LD REG7,UNIT(15)(a1)
+ PREFETCH_FOR_LOAD (5, a1)
+ C_ST t0,UNIT(8)(a0)
+ C_ST t1,UNIT(9)(a0)
+ C_ST REG2,UNIT(10)(a0)
+ C_ST REG3,UNIT(11)(a0)
+ C_ST REG4,UNIT(12)(a0)
+ C_ST REG5,UNIT(13)(a0)
+ C_ST REG6,UNIT(14)(a0)
+ C_ST REG7,UNIT(15)(a0)
+ PTR_ADDIU a0,a0,UNIT(16) /* adding 64/128 to dest */
+#if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
+ sltu v1,t9,a0
+#endif
+ bne a0,a3,L(loop16w)
+ PTR_ADDIU a1,a1,UNIT(16) /* adding 64/128 to src */
+ move a2,t8
+
+/* Here we have src and dest word-aligned but less than 64-bytes or
+ * 128 bytes to go. Check for a 32(64) byte chunk and copy if there
+ * is one. Otherwise jump down to L(chk1w) to handle the tail end of
+ * the copy.
+ */
+
+L(chkw):
+ PREFETCH_FOR_LOAD (0, a1)
+ andi t8,a2,NSIZEMASK /* Is there a 32-byte/64-byte chunk. */
+ /* The t8 is the remainder count past 32-bytes */
+ beq a2,t8,L(chk1w) /* When a2=t8, no 32-byte chunk */
+ nop
+ C_LD t0,UNIT(0)(a1)
+ C_LD t1,UNIT(1)(a1)
+ C_LD REG2,UNIT(2)(a1)
+ C_LD REG3,UNIT(3)(a1)
+ C_LD REG4,UNIT(4)(a1)
+ C_LD REG5,UNIT(5)(a1)
+ C_LD REG6,UNIT(6)(a1)
+ C_LD REG7,UNIT(7)(a1)
+ PTR_ADDIU a1,a1,UNIT(8)
+ C_ST t0,UNIT(0)(a0)
+ C_ST t1,UNIT(1)(a0)
+ C_ST REG2,UNIT(2)(a0)
+ C_ST REG3,UNIT(3)(a0)
+ C_ST REG4,UNIT(4)(a0)
+ C_ST REG5,UNIT(5)(a0)
+ C_ST REG6,UNIT(6)(a0)
+ C_ST REG7,UNIT(7)(a0)
+ PTR_ADDIU a0,a0,UNIT(8)
+
+/*
+ * Here we have less than 32(64) bytes to copy. Set up for a loop to
+ * copy one word (or double word) at a time. Set a2 to count how many
+ * bytes we have to copy after all the word (or double word) chunks are
+ * copied and a3 to the dst pointer after all the (d)word chunks have
+ * been copied. We will loop, incrementing a0 and a1 until a0 equals a3.
+ */
+L(chk1w):
+ andi a2,t8,(NSIZE-1) /* a2 is the remainder past one (d)word chunks */
+ beq a2,t8,L(lastb)
+ PTR_SUBU a3,t8,a2 /* a3 is count of bytes in one (d)word chunks */
+ PTR_ADDU a3,a0,a3 /* a3 is the dst address after loop */
+
+/* copying in words (4-byte or 8-byte chunks) */
+L(wordCopy_loop):
+ C_LD REG3,UNIT(0)(a1)
+ PTR_ADDIU a1,a1,UNIT(1)
+ PTR_ADDIU a0,a0,UNIT(1)
+ bne a0,a3,L(wordCopy_loop)
+ C_ST REG3,UNIT(-1)(a0)
- slti t0, a2, 8 # Less than 8?
- bne t0, zero, L(last8)
- move v0, a0 # Setup exit value before too late
-
- xor t0, a1, a0 # Find a0/a1 displacement
- andi t0, 0x3
- bne t0, zero, L(shift) # Go handle the unaligned case
- subu t1, zero, a1
- andi t1, 0x3 # a0/a1 are aligned, but are we
- beq t1, zero, L(chk8w) # starting in the middle of a word?
- subu a2, t1
- LWHI t0, 0(a1) # Yes we are... take care of that
- addu a1, t1
- SWHI t0, 0(a0)
- addu a0, t1
-
-L(chk8w):
- andi t0, a2, 0x1f # 32 or more bytes left?
- beq t0, a2, L(chk1w)
- subu a3, a2, t0 # Yes
- addu a3, a1 # a3 = end address of loop
- move a2, t0 # a2 = what will be left after loop
-L(lop8w):
- lw t0, 0(a1) # Loop taking 8 words at a time
- lw t1, 4(a1)
- lw t2, 8(a1)
- lw t3, 12(a1)
- lw t4, 16(a1)
- lw t5, 20(a1)
- lw t6, 24(a1)
- lw t7, 28(a1)
- addiu a0, 32
- addiu a1, 32
- sw t0, -32(a0)
- sw t1, -28(a0)
- sw t2, -24(a0)
- sw t3, -20(a0)
- sw t4, -16(a0)
- sw t5, -12(a0)
- sw t6, -8(a0)
- bne a1, a3, L(lop8w)
- sw t7, -4(a0)
-
-L(chk1w):
- andi t0, a2, 0x3 # 4 or more bytes left?
- beq t0, a2, L(last8)
- subu a3, a2, t0 # Yes, handle them one word at a time
- addu a3, a1 # a3 again end address
- move a2, t0
-L(lop1w):
- lw t0, 0(a1)
- addiu a0, 4
- addiu a1, 4
- bne a1, a3, L(lop1w)
- sw t0, -4(a0)
-
-L(last8):
- blez a2, L(lst8e) # Handle last 8 bytes, one at a time
- addu a3, a2, a1
-L(lst8l):
- lb t0, 0(a1)
- addiu a0, 1
- addiu a1, 1
- bne a1, a3, L(lst8l)
- sb t0, -1(a0)
-L(lst8e):
- jr ra # Bye, bye
+/* Copy the last 8 (or 16) bytes */
+L(lastb):
+ blez a2,L(leave)
+ PTR_ADDU a3,a0,a2 /* a3 is the last dst address */
+L(lastbloop):
+ lb v1,0(a1)
+ PTR_ADDIU a1,a1,1
+ PTR_ADDIU a0,a0,1
+ bne a0,a3,L(lastbloop)
+ sb v1,-1(a0)
+L(leave):
+ j ra
nop
+/*
+ * UNALIGNED case, got here with a3 = "negu a0"
+ * This code is nearly identical to the aligned code above
+ * but only the destination (not the source) gets aligned
+ * so we need to do partial loads of the source followed
+ * by normal stores to the destination (once we have aligned
+ * the destination).
+ */
+
+L(unaligned):
+ andi a3,a3,(NSIZE-1) /* copy a3 bytes to align a0/a1 */
+ beqz a3,L(ua_chk16w) /* if a3=0, it is already aligned */
+ PTR_SUBU a2,a2,a3 /* a2 is the remaining bytes count */
+
+ C_LDHI v1,UNIT(0)(a1)
+ C_LDLO v1,UNITM1(1)(a1)
+ PTR_ADDU a1,a1,a3
+ C_STHI v1,UNIT(0)(a0)
+ PTR_ADDU a0,a0,a3
+
+/*
+ * Now the destination (but not the source) is aligned
+ * Set a2 to count how many bytes we have to copy after all the 64/128 byte
+ * chunks are copied and a3 to the dst pointer after all the 64/128 byte
+ * chunks have been copied. We will loop, incrementing a0 and a1 until a0
+ * equals a3.
+ */
-L(shift):
- subu a3, zero, a0 # Src and Dest unaligned
- andi a3, 0x3 # (unoptimized case...)
- beq a3, zero, L(shft1)
- subu a2, a3 # a2 = bytes left
- LWHI t0, 0(a1) # Take care of first odd part
- LWLO t0, 3(a1)
- addu a1, a3
- SWHI t0, 0(a0)
- addu a0, a3
-L(shft1):
- andi t0, a2, 0x3
- subu a3, a2, t0
- addu a3, a1
-L(shfth):
- LWHI t1, 0(a1) # Limp through, word by word
- LWLO t1, 3(a1)
- addiu a0, 4
- addiu a1, 4
- bne a1, a3, L(shfth)
- sw t1, -4(a0)
- b L(last8) # Handle anything which may be left
- move a2, t0
+L(ua_chk16w):
+ andi t8,a2,NSIZEDMASK /* any whole 64-byte/128-byte chunks? */
+ beq a2,t8,L(ua_chkw) /* if a2==t8, no 64-byte/128-byte chunks */
+ PTR_SUBU a3,a2,t8 /* subtract from a2 the remainder */
+ PTR_ADDU a3,a0,a3 /* Now a3 is the final dst after loop */
+#if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
+ PTR_ADDU t0,a0,a2 /* t0 is the "past the end" address */
+ PTR_SUBU t9,t0,PREFETCH_LIMIT /* t9 is the "last safe pref" address */
+#endif
+ PREFETCH_FOR_LOAD (0, a1)
+ PREFETCH_FOR_LOAD (1, a1)
+ PREFETCH_FOR_LOAD (2, a1)
+ PREFETCH_FOR_STORE (1, a0)
+#if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
+ sltu v1,t9,a0
+ bgtz v1,L(ua_loop16w) /* skip prefetch for too short arrays */
+ nop
+#endif
+ PREFETCH_FOR_STORE (2, a0)
+L(ua_loop16w):
+ PREFETCH_FOR_LOAD (3, a1)
+ C_LDHI t0,UNIT(0)(a1)
+ C_LDLO t0,UNITM1(1)(a1)
+ C_LDHI t1,UNIT(1)(a1)
+#if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
+ bgtz v1,L(ua_skip_pref30_96)
+#endif
+ C_LDLO t1,UNITM1(2)(a1)
+ PREFETCH_FOR_STORE (3, a0)
+L(ua_skip_pref30_96):
+ C_LDHI REG2,UNIT(2)(a1)
+ C_LDLO REG2,UNITM1(3)(a1)
+ C_LDHI REG3,UNIT(3)(a1)
+ C_LDLO REG3,UNITM1(4)(a1)
+ C_LDHI REG4,UNIT(4)(a1)
+ C_LDLO REG4,UNITM1(5)(a1)
+ C_LDHI REG5,UNIT(5)(a1)
+ C_LDLO REG5,UNITM1(6)(a1)
+ C_LDHI REG6,UNIT(6)(a1)
+ C_LDLO REG6,UNITM1(7)(a1)
+ C_LDHI REG7,UNIT(7)(a1)
+ C_LDLO REG7,UNITM1(8)(a1)
+ PREFETCH_FOR_LOAD (4, a1)
+ C_ST t0,UNIT(0)(a0)
+ C_ST t1,UNIT(1)(a0)
+ C_ST REG2,UNIT(2)(a0)
+ C_ST REG3,UNIT(3)(a0)
+ C_ST REG4,UNIT(4)(a0)
+ C_ST REG5,UNIT(5)(a0)
+ C_ST REG6,UNIT(6)(a0)
+ C_ST REG7,UNIT(7)(a0)
+ C_LDHI t0,UNIT(8)(a1)
+ C_LDLO t0,UNITM1(9)(a1)
+ C_LDHI t1,UNIT(9)(a1)
+#if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
+ bgtz v1,L(ua_skip_pref30_128)
+#endif
+ C_LDLO t1,UNITM1(10)(a1)
+ PREFETCH_FOR_STORE (4, a0)
+L(ua_skip_pref30_128):
+ C_LDHI REG2,UNIT(10)(a1)
+ C_LDLO REG2,UNITM1(11)(a1)
+ C_LDHI REG3,UNIT(11)(a1)
+ C_LDLO REG3,UNITM1(12)(a1)
+ C_LDHI REG4,UNIT(12)(a1)
+ C_LDLO REG4,UNITM1(13)(a1)
+ C_LDHI REG5,UNIT(13)(a1)
+ C_LDLO REG5,UNITM1(14)(a1)
+ C_LDHI REG6,UNIT(14)(a1)
+ C_LDLO REG6,UNITM1(15)(a1)
+ C_LDHI REG7,UNIT(15)(a1)
+ C_LDLO REG7,UNITM1(16)(a1)
+ PREFETCH_FOR_LOAD (5, a1)
+ C_ST t0,UNIT(8)(a0)
+ C_ST t1,UNIT(9)(a0)
+ C_ST REG2,UNIT(10)(a0)
+ C_ST REG3,UNIT(11)(a0)
+ C_ST REG4,UNIT(12)(a0)
+ C_ST REG5,UNIT(13)(a0)
+ C_ST REG6,UNIT(14)(a0)
+ C_ST REG7,UNIT(15)(a0)
+ PTR_ADDIU a0,a0,UNIT(16) /* adding 64/128 to dest */
+#if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
+ sltu v1,t9,a0
+#endif
+ bne a0,a3,L(ua_loop16w)
+ PTR_ADDIU a1,a1,UNIT(16) /* adding 64/128 to src */
+ move a2,t8
+
+/* Here we have src and dest word-aligned but less than 64-bytes or
+ * 128 bytes to go. Check for a 32(64) byte chunk and copy if there
+ * is one. Otherwise jump down to L(ua_chk1w) to handle the tail end of
+ * the copy. */
+
+L(ua_chkw):
+ PREFETCH_FOR_LOAD (0, a1)
+ andi t8,a2,NSIZEMASK /* Is there a 32-byte/64-byte chunk. */
+ /* t8 is the remainder count past 32-bytes */
+ beq a2,t8,L(ua_chk1w) /* When a2=t8, no 32-byte chunk */
+ nop
+ C_LDHI t0,UNIT(0)(a1)
+ C_LDLO t0,UNITM1(1)(a1)
+ C_LDHI t1,UNIT(1)(a1)
+ C_LDLO t1,UNITM1(2)(a1)
+ C_LDHI REG2,UNIT(2)(a1)
+ C_LDLO REG2,UNITM1(3)(a1)
+ C_LDHI REG3,UNIT(3)(a1)
+ C_LDLO REG3,UNITM1(4)(a1)
+ C_LDHI REG4,UNIT(4)(a1)
+ C_LDLO REG4,UNITM1(5)(a1)
+ C_LDHI REG5,UNIT(5)(a1)
+ C_LDLO REG5,UNITM1(6)(a1)
+ C_LDHI REG6,UNIT(6)(a1)
+ C_LDLO REG6,UNITM1(7)(a1)
+ C_LDHI REG7,UNIT(7)(a1)
+ C_LDLO REG7,UNITM1(8)(a1)
+ PTR_ADDIU a1,a1,UNIT(8)
+ C_ST t0,UNIT(0)(a0)
+ C_ST t1,UNIT(1)(a0)
+ C_ST REG2,UNIT(2)(a0)
+ C_ST REG3,UNIT(3)(a0)
+ C_ST REG4,UNIT(4)(a0)
+ C_ST REG5,UNIT(5)(a0)
+ C_ST REG6,UNIT(6)(a0)
+ C_ST REG7,UNIT(7)(a0)
+ PTR_ADDIU a0,a0,UNIT(8)
+/*
+ * Here we have less than 32(64) bytes to copy. Set up for a loop to
+ * copy one word (or double word) at a time.
+ */
+L(ua_chk1w):
+ andi a2,t8,(NSIZE-1) /* a2 is the remainder past one (d)word chunks */
+ beq a2,t8,L(ua_smallCopy)
+ PTR_SUBU a3,t8,a2 /* a3 is count of bytes in one (d)word chunks */
+ PTR_ADDU a3,a0,a3 /* a3 is the dst address after loop */
+
+/* copying in words (4-byte or 8-byte chunks) */
+L(ua_wordCopy_loop):
+ C_LDHI v1,UNIT(0)(a1)
+ C_LDLO v1,UNITM1(1)(a1)
+ PTR_ADDIU a1,a1,UNIT(1)
+ PTR_ADDIU a0,a0,UNIT(1)
+ bne a0,a3,L(ua_wordCopy_loop)
+ C_ST v1,UNIT(-1)(a0)
+
+/* Copy the last 8 (or 16) bytes */
+L(ua_smallCopy):
+ beqz a2,L(leave)
+ PTR_ADDU a3,a0,a2 /* a3 is the last dst address */
+L(ua_smallCopy_loop):
+ lb v1,0(a1)
+ PTR_ADDIU a1,a1,1
+ PTR_ADDIU a0,a0,1
+ bne a0,a3,L(ua_smallCopy_loop)
+ sb v1,-1(a0)
+
+ j ra
+ nop
+
+ .set at
.set reorder
-END (memcpy)
-libc_hidden_builtin_def (memcpy)
+END(MEMCPY_NAME)
+#ifdef _LIBC
+libc_hidden_builtin_def (MEMCPY_NAME)
+#endif
^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [PATCH] Optimize MIPS memcpy
2012-10-17 17:30 ` Steve Ellcey
@ 2012-10-29 18:00 ` Steve Ellcey
2012-10-29 18:03 ` Maxim Kuvyrkov
2012-10-30 7:16 ` Maxim Kuvyrkov
2012-10-31 19:27 ` Andreas Jaeger
1 sibling, 2 replies; 50+ messages in thread
From: Steve Ellcey @ 2012-10-29 18:00 UTC (permalink / raw)
To: Maxim Kuvyrkov; +Cc: Andrew Pinski, Joseph S. Myers, libc-ports
On Wed, 2012-10-17 at 10:29 -0700, Steve Ellcey wrote:
> OK, Here is a version of memcpy that uses the STORE_STREAMING prefetch.
> While it is optimized for a 32 byte prefetch, it will work correctly
> regardless of the size of the prefetch.
>
> Is this version OK to checkin?
>
> Steve Ellcey
> sellcey@mips.com
Maxim, have you had a chance to test this version of memcpy for MIPS?
Steve Ellcey
sellcey@mips.com
> 2012-10-17 Steve Ellcey <sellcey@mips.com>
>
> * sysdeps/mips/memcpy.S: Add prefetching and more unrolling, make
> it work in 32 or 64 bit modes.
> * sysdeps/mips/mips64/memcpy.S: Remove.
>
> diff --git a/ports/sysdeps/mips/memcpy.S b/ports/sysdeps/mips/memcpy.S
> index 753f67c..71474e9 100644
> --- a/ports/sysdeps/mips/memcpy.S
> +++ b/ports/sysdeps/mips/memcpy.S
> @@ -1,7 +1,8 @@
> -/* Copyright (C) 2002-2012 Free Software Foundation, Inc.
> +/* Copyright (C) 2012 Free Software Foundation, Inc.
> This file is part of the GNU C Library.
> - Contributed by Hartvig Ekner <hartvige@mips.com>, 2002.
> -
> +
> + Contributed by MIPS Technologies, Inc.
> +
> The GNU C Library is free software; you can redistribute it and/or
> modify it under the terms of the GNU Lesser General Public
> License as published by the Free Software Foundation; either
> @@ -16,119 +17,616 @@
> License along with the GNU C Library. If not, see
> <http://www.gnu.org/licenses/>. */
>
> +#ifdef ANDROID_CHANGES
> +#include "machine/asm.h"
> +#include "machine/regdef.h"
> +#define USE_MEMMOVE_FOR_OVERLAP
> +#define PREFETCH_LOAD_HINT PREFETCH_HINT_LOAD_STREAMED
> +#define PREFETCH_STORE_HINT PREFETCH_HINT_PREPAREFORSTORE
> +#elif _LIBC
> #include <sysdep.h>
> +#include <regdef.h>
> +#include <sys/asm.h>
> +#define PREFETCH_LOAD_HINT PREFETCH_HINT_LOAD_STREAMED
> +#define PREFETCH_STORE_HINT PREFETCH_HINT_STORE_STREAMED
> +#elif _COMPILING_NEWLIB
> +#include "machine/asm.h"
> +#include "machine/regdef.h"
> +#define PREFETCH_LOAD_HINT PREFETCH_HINT_LOAD_STREAMED
> +#define PREFETCH_STORE_HINT PREFETCH_HINT_STORE_STREAMED
> +#else
> +#include <regdef.h>
> +#include <sys/asm.h>
> +#endif
> +
> +#if (_MIPS_ISA == _MIPS_ISA_MIPS4) || (_MIPS_ISA == _MIPS_ISA_MIPS5) || \
> + (_MIPS_ISA == _MIPS_ISA_MIPS32) || (_MIPS_ISA == _MIPS_ISA_MIPS64)
> +#ifndef DISABLE_PREFETCH
> +#define USE_PREFETCH
> +#endif
> +#endif
> +
> +#if (_MIPS_SIM == _ABI64) || (_MIPS_SIM == _ABIN32)
> +#ifndef DISABLE_DOUBLE
> +#define USE_DOUBLE
> +#endif
> +#endif
> +
> +
> +
> +/* Some asm.h files do not have the L macro definition. */
> +#ifndef L
> +#if _MIPS_SIM == _ABIO32
> +# define L(label) $L ## label
> +#else
> +# define L(label) .L ## label
> +#endif
> +#endif
> +
> +/* Some asm.h files do not have the PTR_ADDIU macro definition. */
> +#ifndef PTR_ADDIU
> +#ifdef USE_DOUBLE
> +#define PTR_ADDIU daddiu
> +#else
> +#define PTR_ADDIU addiu
> +#endif
> +#endif
> +
> +/* Some asm.h files do not have the PTR_SRA macro definition. */
> +#ifndef PTR_SRA
> +#ifdef USE_DOUBLE
> +#define PTR_SRA dsra
> +#else
> +#define PTR_SRA sra
> +#endif
> +#endif
> +
>
> +/*
> + * Using PREFETCH_HINT_LOAD_STREAMED instead of PREFETCH_LOAD on load
> + * prefetches appears to offer a slight preformance advantage.
> + *
> + * Using PREFETCH_HINT_PREPAREFORSTORE instead of PREFETCH_STORE
> + * or PREFETCH_STORE_STREAMED offers a large performance advantage
> + * but PREPAREFORSTORE has some special restrictions to consider.
> + *
> + * Prefetch with the 'prepare for store' hint does not copy a memory
> + * location into the cache, it just allocates a cache line and zeros
> + * it out. This means that if you do not write to the entire cache
> + * line before writing it out to memory some data will get zero'ed out
> + * when the cache line is written back to memory and data will be lost.
> + *
> + * Also if you are using this memcpy to copy overlapping buffers it may
> + * not behave correctly when using the 'prepare for store' hint. If you
> + * use the 'prepare for store' prefetch on a memory area that is in the
> + * memcpy source (as well as the memcpy destination), then you will get
> + * some data zero'ed out before you have a chance to read it and data will
> + * be lost.
> + *
> + * If you are going to use this memcpy routine with the 'prepare for store'
> + * prefetch you may want to set USE_MEMMOVE_FOR_OVERLAP in order to avoid
> + * the problem of running memcpy on overlapping buffers.
> + *
> + * There are ifdef'ed sections of this memcpy to make sure that it does not
> + * do prefetches on cache lines that are not going to be completely written.
> + * This code is only needed and only used when PREFETCH_STORE_HINT is set to
> + * PREFETCH_HINT_PREPAREFORSTORE. This code assumes that cache lines are
> + * 32 bytes and if the cache line is larger it will not work correctly.
> + */
>
> -/* void *memcpy(void *s1, const void *s2, size_t n); */
> +#ifdef USE_PREFETCH
> +# define PREFETCH_HINT_LOAD 0
> +# define PREFETCH_HINT_STORE 1
> +# define PREFETCH_HINT_LOAD_STREAMED 4
> +# define PREFETCH_HINT_STORE_STREAMED 5
> +# define PREFETCH_HINT_LOAD_RETAINED 6
> +# define PREFETCH_HINT_STORE_RETAINED 7
> +# define PREFETCH_HINT_WRITEBACK_INVAL 25
> +# define PREFETCH_HINT_PREPAREFORSTORE 30
> +
> +/*
> + * If we have not picked out what hints to use at this point use the
> + * standard load and store prefetch hints.
> + */
> +#ifndef PREFETCH_STORE_HINT
> +# define PREFETCH_STORE_HINT PREFETCH_HINT_STORE
> +#endif
> +#ifndef PREFETCH_LOAD_HINT
> +# define PREFETCH_LOAD_HINT PREFETCH_HINT_LOAD
> +#endif
>
> +/*
> + * We double everything when USE_DOUBLE is true so we do 2 prefetches to
> + * get 64 bytes in that case. The assumption is that each individual
> + * prefetch brings in 32 bytes.
> + */
> +#ifdef USE_DOUBLE
> +# define PREFETCH_CHUNK 64
> +# define PREFETCH_FOR_LOAD(chunk, reg) \
> + pref PREFETCH_LOAD_HINT, (chunk)*32(reg); \
> + pref PREFETCH_LOAD_HINT, ((chunk)+1)*32(reg)
> +# define PREFETCH_FOR_STORE(chunk, reg) \
> + pref PREFETCH_STORE_HINT, (chunk)*32(reg); \
> + pref PREFETCH_STORE_HINT, ((chunk)+1)*32(reg)
> +#else
> +# define PREFETCH_CHUNK 32
> +# define PREFETCH_FOR_LOAD(chunk, reg) \
> + pref PREFETCH_LOAD_HINT, (chunk)*32(reg)
> +# define PREFETCH_FOR_STORE(chunk, reg) \
> + pref PREFETCH_STORE_HINT, (chunk)*32(reg)
> +#endif
> +# define PREFETCH_LIMIT (5 * PREFETCH_CHUNK)
> +#else /* USE_PREFETCH not defined */
> +# define PREFETCH_FOR_LOAD(offset, reg)
> +# define PREFETCH_FOR_STORE(offset, reg)
> +#endif
> +
> +/* Allow the routine to be named something else if desired. */
> +#ifndef MEMCPY_NAME
> +#define MEMCPY_NAME memcpy
> +#endif
> +
> +/* We use these 32/64 bit registers as temporaries to do the copying. */
> +#define REG0 t0
> +#define REG1 t1
> +#define REG2 t2
> +#define REG3 t3
> +#if _MIPS_SIM == _ABIO32
> +# define REG4 t4
> +# define REG5 t5
> +# define REG6 t6
> +# define REG7 t7
> +#else
> +# define REG4 ta0
> +# define REG5 ta1
> +# define REG6 ta2
> +# define REG7 ta3
> +#endif
> +
> +/* We load/store 64 bits at a time when USE_DOUBLE is true.
> + * The C_ prefix stands for CHUNK and is used to avoid macro name
> + * conflicts with system header files. */
> +
> +#ifdef USE_DOUBLE
> +# define C_ST sd
> +# define C_LD ld
> #if __MIPSEB
> -# define LWHI lwl /* high part is left in big-endian */
> -# define SWHI swl /* high part is left in big-endian */
> -# define LWLO lwr /* low part is right in big-endian */
> -# define SWLO swr /* low part is right in big-endian */
> +# define C_LDHI ldl /* high part is left in big-endian */
> +# define C_STHI sdl /* high part is left in big-endian */
> +# define C_LDLO ldr /* low part is right in big-endian */
> +# define C_STLO sdr /* low part is right in big-endian */
> +#else
> +# define C_LDHI ldr /* high part is right in little-endian */
> +# define C_STHI sdr /* high part is right in little-endian */
> +# define C_LDLO ldl /* low part is left in little-endian */
> +# define C_STLO sdl /* low part is left in little-endian */
> +#endif
> +#else
> +# define C_ST sw
> +# define C_LD lw
> +#if __MIPSEB
> +# define C_LDHI lwl /* high part is left in big-endian */
> +# define C_STHI swl /* high part is left in big-endian */
> +# define C_LDLO lwr /* low part is right in big-endian */
> +# define C_STLO swr /* low part is right in big-endian */
> +#else
> +# define C_LDHI lwr /* high part is right in little-endian */
> +# define C_STHI swr /* high part is right in little-endian */
> +# define C_LDLO lwl /* low part is left in little-endian */
> +# define C_STLO swl /* low part is left in little-endian */
> +#endif
> +#endif
> +
> +/* Bookkeeping values for 32 vs. 64 bit mode. */
> +#ifdef USE_DOUBLE
> +# define NSIZE 8
> +# define NSIZEMASK 0x3f
> +# define NSIZEDMASK 0x7f
> #else
> -# define LWHI lwr /* high part is right in little-endian */
> -# define SWHI swr /* high part is right in little-endian */
> -# define LWLO lwl /* low part is left in little-endian */
> -# define SWLO swl /* low part is left in little-endian */
> +# define NSIZE 4
> +# define NSIZEMASK 0x1f
> +# define NSIZEDMASK 0x3f
> #endif
> +#define UNIT(unit) ((unit)*NSIZE)
> +#define UNITM1(unit) (((unit)*NSIZE)-1)
>
> -ENTRY (memcpy)
> +#ifdef ANDROID_CHANGES
> +LEAF(MEMCPY_NAME, 0)
> +#else
> +LEAF(MEMCPY_NAME)
> +#endif
> + .set nomips16
> .set noreorder
> +/*
> + * Below we handle the case where memcpy is called with overlapping src and dst.
> + * Although memcpy is not required to handle this case, some parts of Android
> + * like Skia rely on such usage. We call memmove to handle such cases.
> + */
> +#ifdef USE_MEMMOVE_FOR_OVERLAP
> + PTR_SUBU t0,a0,a1
> + PTR_SRA t2,t0,31
> + xor t1,t0,t2
> + PTR_SUBU t0,t1,t2
> + sltu t2,t0,a2
> + beq t2,zero,L(memcpy)
> + la t9,memmove
> + jr t9
> + nop
> +L(memcpy):
> +#endif
> +/*
> + * If the size is less than 2*NSIZE (8 or 16), go to L(lastb). Regardless of
> + * size, copy dst pointer to v0 for the return value.
> + */
> + slti t2,a2,(2 * NSIZE)
> + bne t2,zero,L(lastb)
> + move v0,a0
> +/*
> + * If src and dst have different alignments, go to L(unaligned), if they
> + * have the same alignment (but are not actually aligned) do a partial
> + * load/store to make them aligned. If they are both already aligned
> + * we can start copying at L(aligned).
> + */
> + xor t8,a1,a0
> + andi t8,t8,(NSIZE-1) /* t8 is a0/a1 word-displacement */
> + bne t8,zero,L(unaligned)
> + PTR_SUBU a3, zero, a0
> +
> + andi a3,a3,(NSIZE-1) /* copy a3 bytes to align a0/a1 */
> + beq a3,zero,L(aligned) /* if a3=0, it is already aligned */
> + PTR_SUBU a2,a2,a3 /* a2 is the remaining bytes count */
> +
> + C_LDHI t8,0(a1)
> + PTR_ADDU a1,a1,a3
> + C_STHI t8,0(a0)
> + PTR_ADDU a0,a0,a3
> +
> +/*
> + * Now dst/src are both aligned to (word or double word) aligned addresses
> + * Set a2 to count how many bytes we have to copy after all the 64/128 byte
> + * chunks are copied and a3 to the dst pointer after all the 64/128 byte
> + * chunks have been copied. We will loop, incrementing a0 and a1 until a0
> + * equals a3.
> + */
> +
> +L(aligned):
> + andi t8,a2,NSIZEDMASK /* any whole 64-byte/128-byte chunks? */
> + beq a2,t8,L(chkw) /* if a2==t8, no 64-byte/128-byte chunks */
> + PTR_SUBU a3,a2,t8 /* subtract from a2 the remainder */
> + PTR_ADDU a3,a0,a3 /* Now a3 is the final dst after loop */
> +
> +/* When in the loop we may prefetch with the 'prepare to store' hint,
> + * in this case the a0+x should not be past the "t0-32" address. This
> + * means: for x=128 the last "safe" a0 address is "t0-160". Alternatively,
> + * for x=64 the last "safe" a0 address is "t0-96" In the current version we
> + * will use "prefetch hint,128(a0)", so "t0-160" is the limit.
> + */
> +#if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
> + PTR_ADDU t0,a0,a2 /* t0 is the "past the end" address */
> + PTR_SUBU t9,t0,PREFETCH_LIMIT /* t9 is the "last safe pref" address */
> +#endif
> + PREFETCH_FOR_LOAD (0, a1)
> + PREFETCH_FOR_LOAD (1, a1)
> + PREFETCH_FOR_LOAD (2, a1)
> + PREFETCH_FOR_STORE (1, a0)
> +#if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
> + sltu v1,t9,a0 /* If a0 > t9 don't use next prefetch */
> + bgtz v1,L(loop16w)
> + nop
> +#endif
> + PREFETCH_FOR_STORE (2, a0)
> +L(loop16w):
> + PREFETCH_FOR_LOAD (3, a1)
> + C_LD t0,UNIT(0)(a1)
> +#if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
> + bgtz v1,L(skip_pref30_96)
> +#endif
> + C_LD t1,UNIT(1)(a1)
> + PREFETCH_FOR_STORE (3, a0)
> +L(skip_pref30_96):
> + C_LD REG2,UNIT(2)(a1)
> + C_LD REG3,UNIT(3)(a1)
> + C_LD REG4,UNIT(4)(a1)
> + C_LD REG5,UNIT(5)(a1)
> + C_LD REG6,UNIT(6)(a1)
> + C_LD REG7,UNIT(7)(a1)
> + PREFETCH_FOR_LOAD (4, a1)
> +
> + C_ST t0,UNIT(0)(a0)
> + C_ST t1,UNIT(1)(a0)
> + C_ST REG2,UNIT(2)(a0)
> + C_ST REG3,UNIT(3)(a0)
> + C_ST REG4,UNIT(4)(a0)
> + C_ST REG5,UNIT(5)(a0)
> + C_ST REG6,UNIT(6)(a0)
> + C_ST REG7,UNIT(7)(a0)
> +
> + C_LD t0,UNIT(8)(a1)
> +#if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
> + bgtz v1,L(skip_pref30_128)
> +#endif
> + C_LD t1,UNIT(9)(a1)
> + PREFETCH_FOR_STORE (4, a0)
> +L(skip_pref30_128):
> + C_LD REG2,UNIT(10)(a1)
> + C_LD REG3,UNIT(11)(a1)
> + C_LD REG4,UNIT(12)(a1)
> + C_LD REG5,UNIT(13)(a1)
> + C_LD REG6,UNIT(14)(a1)
> + C_LD REG7,UNIT(15)(a1)
> + PREFETCH_FOR_LOAD (5, a1)
> + C_ST t0,UNIT(8)(a0)
> + C_ST t1,UNIT(9)(a0)
> + C_ST REG2,UNIT(10)(a0)
> + C_ST REG3,UNIT(11)(a0)
> + C_ST REG4,UNIT(12)(a0)
> + C_ST REG5,UNIT(13)(a0)
> + C_ST REG6,UNIT(14)(a0)
> + C_ST REG7,UNIT(15)(a0)
> + PTR_ADDIU a0,a0,UNIT(16) /* adding 64/128 to dest */
> +#if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
> + sltu v1,t9,a0
> +#endif
> + bne a0,a3,L(loop16w)
> + PTR_ADDIU a1,a1,UNIT(16) /* adding 64/128 to src */
> + move a2,t8
> +
> +/* Here we have src and dest word-aligned but less than 64-bytes or
> + * 128 bytes to go. Check for a 32(64) byte chunk and copy it if there
> + * is one. Otherwise jump down to L(chk1w) to handle the tail end of
> + * the copy.
> + */
> +
> +L(chkw):
> + PREFETCH_FOR_LOAD (0, a1)
> + andi t8,a2,NSIZEMASK /* Is there a 32-byte/64-byte chunk. */
> + /* The t8 is the remainder count past 32-bytes */
> + beq a2,t8,L(chk1w) /* When a2=t8, no 32-byte chunk */
> + nop
> + C_LD t0,UNIT(0)(a1)
> + C_LD t1,UNIT(1)(a1)
> + C_LD REG2,UNIT(2)(a1)
> + C_LD REG3,UNIT(3)(a1)
> + C_LD REG4,UNIT(4)(a1)
> + C_LD REG5,UNIT(5)(a1)
> + C_LD REG6,UNIT(6)(a1)
> + C_LD REG7,UNIT(7)(a1)
> + PTR_ADDIU a1,a1,UNIT(8)
> + C_ST t0,UNIT(0)(a0)
> + C_ST t1,UNIT(1)(a0)
> + C_ST REG2,UNIT(2)(a0)
> + C_ST REG3,UNIT(3)(a0)
> + C_ST REG4,UNIT(4)(a0)
> + C_ST REG5,UNIT(5)(a0)
> + C_ST REG6,UNIT(6)(a0)
> + C_ST REG7,UNIT(7)(a0)
> + PTR_ADDIU a0,a0,UNIT(8)
> +
> +/*
> + * Here we have less than 32(64) bytes to copy. Set up for a loop to
> + * copy one word (or double word) at a time. Set a2 to count how many
> + * bytes we have to copy after all the word (or double word) chunks are
> + * copied and a3 to the dst pointer after all the (d)word chunks have
> + * been copied. We will loop, incrementing a0 and a1 until a0 equals a3.
> + */
> +L(chk1w):
> + andi a2,t8,(NSIZE-1) /* a2 is the remainder past one (d)word chunks */
> + beq a2,t8,L(lastb)
> + PTR_SUBU a3,t8,a2 /* a3 is count of bytes in one (d)word chunks */
> + PTR_ADDU a3,a0,a3 /* a3 is the dst address after loop */
> +
> +/* copying in words (4-byte or 8-byte chunks) */
> +L(wordCopy_loop):
> + C_LD REG3,UNIT(0)(a1)
> + PTR_ADDIU a1,a1,UNIT(1)
> + PTR_ADDIU a0,a0,UNIT(1)
> + bne a0,a3,L(wordCopy_loop)
> + C_ST REG3,UNIT(-1)(a0)
>
> - slti t0, a2, 8 # Less than 8?
> - bne t0, zero, L(last8)
> - move v0, a0 # Setup exit value before too late
> -
> - xor t0, a1, a0 # Find a0/a1 displacement
> - andi t0, 0x3
> - bne t0, zero, L(shift) # Go handle the unaligned case
> - subu t1, zero, a1
> - andi t1, 0x3 # a0/a1 are aligned, but are we
> - beq t1, zero, L(chk8w) # starting in the middle of a word?
> - subu a2, t1
> - LWHI t0, 0(a1) # Yes we are... take care of that
> - addu a1, t1
> - SWHI t0, 0(a0)
> - addu a0, t1
> -
> -L(chk8w):
> - andi t0, a2, 0x1f # 32 or more bytes left?
> - beq t0, a2, L(chk1w)
> - subu a3, a2, t0 # Yes
> - addu a3, a1 # a3 = end address of loop
> - move a2, t0 # a2 = what will be left after loop
> -L(lop8w):
> - lw t0, 0(a1) # Loop taking 8 words at a time
> - lw t1, 4(a1)
> - lw t2, 8(a1)
> - lw t3, 12(a1)
> - lw t4, 16(a1)
> - lw t5, 20(a1)
> - lw t6, 24(a1)
> - lw t7, 28(a1)
> - addiu a0, 32
> - addiu a1, 32
> - sw t0, -32(a0)
> - sw t1, -28(a0)
> - sw t2, -24(a0)
> - sw t3, -20(a0)
> - sw t4, -16(a0)
> - sw t5, -12(a0)
> - sw t6, -8(a0)
> - bne a1, a3, L(lop8w)
> - sw t7, -4(a0)
> -
> -L(chk1w):
> - andi t0, a2, 0x3 # 4 or more bytes left?
> - beq t0, a2, L(last8)
> - subu a3, a2, t0 # Yes, handle them one word at a time
> - addu a3, a1 # a3 again end address
> - move a2, t0
> -L(lop1w):
> - lw t0, 0(a1)
> - addiu a0, 4
> - addiu a1, 4
> - bne a1, a3, L(lop1w)
> - sw t0, -4(a0)
> -
> -L(last8):
> - blez a2, L(lst8e) # Handle last 8 bytes, one at a time
> - addu a3, a2, a1
> -L(lst8l):
> - lb t0, 0(a1)
> - addiu a0, 1
> - addiu a1, 1
> - bne a1, a3, L(lst8l)
> - sb t0, -1(a0)
> -L(lst8e):
> - jr ra # Bye, bye
> +/* Copy the last 8 (or 16) bytes */
> +L(lastb):
> + blez a2,L(leave)
> + PTR_ADDU a3,a0,a2 /* a3 is the last dst address */
> +L(lastbloop):
> + lb v1,0(a1)
> + PTR_ADDIU a1,a1,1
> + PTR_ADDIU a0,a0,1
> + bne a0,a3,L(lastbloop)
> + sb v1,-1(a0)
> +L(leave):
> + j ra
> nop
> +/*
> + * UNALIGNED case, got here with a3 = "negu a0"
> + * This code is nearly identical to the aligned code above
> + * but only the destination (not the source) gets aligned
> + * so we need to do partial loads of the source followed
> + * by normal stores to the destination (once we have aligned
> + * the destination).
> + */
> +
> +L(unaligned):
> + andi a3,a3,(NSIZE-1) /* copy a3 bytes to align a0/a1 */
> + beqz a3,L(ua_chk16w) /* if a3=0, it is already aligned */
> + PTR_SUBU a2,a2,a3 /* a2 is the remaining bytes count */
> +
> + C_LDHI v1,UNIT(0)(a1)
> + C_LDLO v1,UNITM1(1)(a1)
> + PTR_ADDU a1,a1,a3
> + C_STHI v1,UNIT(0)(a0)
> + PTR_ADDU a0,a0,a3
> +
> +/*
> + * Now the destination (but not the source) is aligned
> + * Set a2 to count how many bytes we have to copy after all the 64/128 byte
> + * chunks are copied and a3 to the dst pointer after all the 64/128 byte
> + * chunks have been copied. We will loop, incrementing a0 and a1 until a0
> + * equals a3.
> + */
>
> -L(shift):
> - subu a3, zero, a0 # Src and Dest unaligned
> - andi a3, 0x3 # (unoptimized case...)
> - beq a3, zero, L(shft1)
> - subu a2, a3 # a2 = bytes left
> - LWHI t0, 0(a1) # Take care of first odd part
> - LWLO t0, 3(a1)
> - addu a1, a3
> - SWHI t0, 0(a0)
> - addu a0, a3
> -L(shft1):
> - andi t0, a2, 0x3
> - subu a3, a2, t0
> - addu a3, a1
> -L(shfth):
> - LWHI t1, 0(a1) # Limp through, word by word
> - LWLO t1, 3(a1)
> - addiu a0, 4
> - addiu a1, 4
> - bne a1, a3, L(shfth)
> - sw t1, -4(a0)
> - b L(last8) # Handle anything which may be left
> - move a2, t0
> +L(ua_chk16w):
> + andi t8,a2,NSIZEDMASK /* any whole 64-byte/128-byte chunks? */
> + beq a2,t8,L(ua_chkw) /* if a2==t8, no 64-byte/128-byte chunks */
> + PTR_SUBU a3,a2,t8 /* subtract from a2 the remainder */
> + PTR_ADDU a3,a0,a3 /* Now a3 is the final dst after loop */
>
> +#if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
> + PTR_ADDU t0,a0,a2 /* t0 is the "past the end" address */
> + PTR_SUBU t9,t0,PREFETCH_LIMIT /* t9 is the "last safe pref" address */
> +#endif
> + PREFETCH_FOR_LOAD (0, a1)
> + PREFETCH_FOR_LOAD (1, a1)
> + PREFETCH_FOR_LOAD (2, a1)
> + PREFETCH_FOR_STORE (1, a0)
> +#if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
> + sltu v1,t9,a0
> + bgtz v1,L(ua_loop16w) /* skip prefetch for too short arrays */
> + nop
> +#endif
> + PREFETCH_FOR_STORE (2, a0)
> +L(ua_loop16w):
> + PREFETCH_FOR_LOAD (3, a1)
> + C_LDHI t0,UNIT(0)(a1)
> + C_LDLO t0,UNITM1(1)(a1)
> + C_LDHI t1,UNIT(1)(a1)
> +#if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
> + bgtz v1,L(ua_skip_pref30_96)
> +#endif
> + C_LDLO t1,UNITM1(2)(a1)
> + PREFETCH_FOR_STORE (3, a0)
> +L(ua_skip_pref30_96):
> + C_LDHI REG2,UNIT(2)(a1)
> + C_LDLO REG2,UNITM1(3)(a1)
> + C_LDHI REG3,UNIT(3)(a1)
> + C_LDLO REG3,UNITM1(4)(a1)
> + C_LDHI REG4,UNIT(4)(a1)
> + C_LDLO REG4,UNITM1(5)(a1)
> + C_LDHI REG5,UNIT(5)(a1)
> + C_LDLO REG5,UNITM1(6)(a1)
> + C_LDHI REG6,UNIT(6)(a1)
> + C_LDLO REG6,UNITM1(7)(a1)
> + C_LDHI REG7,UNIT(7)(a1)
> + C_LDLO REG7,UNITM1(8)(a1)
> + PREFETCH_FOR_LOAD (4, a1)
> + C_ST t0,UNIT(0)(a0)
> + C_ST t1,UNIT(1)(a0)
> + C_ST REG2,UNIT(2)(a0)
> + C_ST REG3,UNIT(3)(a0)
> + C_ST REG4,UNIT(4)(a0)
> + C_ST REG5,UNIT(5)(a0)
> + C_ST REG6,UNIT(6)(a0)
> + C_ST REG7,UNIT(7)(a0)
> + C_LDHI t0,UNIT(8)(a1)
> + C_LDLO t0,UNITM1(9)(a1)
> + C_LDHI t1,UNIT(9)(a1)
> +#if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
> + bgtz v1,L(ua_skip_pref30_128)
> +#endif
> + C_LDLO t1,UNITM1(10)(a1)
> + PREFETCH_FOR_STORE (4, a0)
> +L(ua_skip_pref30_128):
> + C_LDHI REG2,UNIT(10)(a1)
> + C_LDLO REG2,UNITM1(11)(a1)
> + C_LDHI REG3,UNIT(11)(a1)
> + C_LDLO REG3,UNITM1(12)(a1)
> + C_LDHI REG4,UNIT(12)(a1)
> + C_LDLO REG4,UNITM1(13)(a1)
> + C_LDHI REG5,UNIT(13)(a1)
> + C_LDLO REG5,UNITM1(14)(a1)
> + C_LDHI REG6,UNIT(14)(a1)
> + C_LDLO REG6,UNITM1(15)(a1)
> + C_LDHI REG7,UNIT(15)(a1)
> + C_LDLO REG7,UNITM1(16)(a1)
> + PREFETCH_FOR_LOAD (5, a1)
> + C_ST t0,UNIT(8)(a0)
> + C_ST t1,UNIT(9)(a0)
> + C_ST REG2,UNIT(10)(a0)
> + C_ST REG3,UNIT(11)(a0)
> + C_ST REG4,UNIT(12)(a0)
> + C_ST REG5,UNIT(13)(a0)
> + C_ST REG6,UNIT(14)(a0)
> + C_ST REG7,UNIT(15)(a0)
> + PTR_ADDIU a0,a0,UNIT(16) /* adding 64/128 to dest */
> +#if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE)
> + sltu v1,t9,a0
> +#endif
> + bne a0,a3,L(ua_loop16w)
> + PTR_ADDIU a1,a1,UNIT(16) /* adding 64/128 to src */
> + move a2,t8
> +
> +/* Here we have src and dest word-aligned but less than 64-bytes or
> + * 128 bytes to go. Check for a 32(64) byte chunk and copy it if there
> + * is one. Otherwise jump down to L(ua_chk1w) to handle the tail end of
> + * the copy. */
> +
> +L(ua_chkw):
> + PREFETCH_FOR_LOAD (0, a1)
> + andi t8,a2,NSIZEMASK /* Is there a 32-byte/64-byte chunk. */
> + /* t8 is the remainder count past 32-bytes */
> + beq a2,t8,L(ua_chk1w) /* When a2=t8, no 32-byte chunk */
> + nop
> + C_LDHI t0,UNIT(0)(a1)
> + C_LDLO t0,UNITM1(1)(a1)
> + C_LDHI t1,UNIT(1)(a1)
> + C_LDLO t1,UNITM1(2)(a1)
> + C_LDHI REG2,UNIT(2)(a1)
> + C_LDLO REG2,UNITM1(3)(a1)
> + C_LDHI REG3,UNIT(3)(a1)
> + C_LDLO REG3,UNITM1(4)(a1)
> + C_LDHI REG4,UNIT(4)(a1)
> + C_LDLO REG4,UNITM1(5)(a1)
> + C_LDHI REG5,UNIT(5)(a1)
> + C_LDLO REG5,UNITM1(6)(a1)
> + C_LDHI REG6,UNIT(6)(a1)
> + C_LDLO REG6,UNITM1(7)(a1)
> + C_LDHI REG7,UNIT(7)(a1)
> + C_LDLO REG7,UNITM1(8)(a1)
> + PTR_ADDIU a1,a1,UNIT(8)
> + C_ST t0,UNIT(0)(a0)
> + C_ST t1,UNIT(1)(a0)
> + C_ST REG2,UNIT(2)(a0)
> + C_ST REG3,UNIT(3)(a0)
> + C_ST REG4,UNIT(4)(a0)
> + C_ST REG5,UNIT(5)(a0)
> + C_ST REG6,UNIT(6)(a0)
> + C_ST REG7,UNIT(7)(a0)
> + PTR_ADDIU a0,a0,UNIT(8)
> +/*
> + * Here we have less than 32(64) bytes to copy. Set up for a loop to
> + * copy one word (or double word) at a time.
> + */
> +L(ua_chk1w):
> + andi a2,t8,(NSIZE-1) /* a2 is the remainder past one (d)word chunks */
> + beq a2,t8,L(ua_smallCopy)
> + PTR_SUBU a3,t8,a2 /* a3 is count of bytes in one (d)word chunks */
> + PTR_ADDU a3,a0,a3 /* a3 is the dst address after loop */
> +
> +/* copying in words (4-byte or 8-byte chunks) */
> +L(ua_wordCopy_loop):
> + C_LDHI v1,UNIT(0)(a1)
> + C_LDLO v1,UNITM1(1)(a1)
> + PTR_ADDIU a1,a1,UNIT(1)
> + PTR_ADDIU a0,a0,UNIT(1)
> + bne a0,a3,L(ua_wordCopy_loop)
> + C_ST v1,UNIT(-1)(a0)
> +
> +/* Copy the last 8 (or 16) bytes */
> +L(ua_smallCopy):
> + beqz a2,L(leave)
> + PTR_ADDU a3,a0,a2 /* a3 is the last dst address */
> +L(ua_smallCopy_loop):
> + lb v1,0(a1)
> + PTR_ADDIU a1,a1,1
> + PTR_ADDIU a0,a0,1
> + bne a0,a3,L(ua_smallCopy_loop)
> + sb v1,-1(a0)
> +
> + j ra
> + nop
> +
> + .set at
> .set reorder
> -END (memcpy)
> -libc_hidden_builtin_def (memcpy)
> +END(MEMCPY_NAME)
> +#ifdef _LIBC
> +libc_hidden_builtin_def (MEMCPY_NAME)
> +#endif
>
^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [PATCH] Optimize MIPS memcpy
2012-10-29 18:00 ` Steve Ellcey
@ 2012-10-29 18:03 ` Maxim Kuvyrkov
2012-10-30 7:16 ` Maxim Kuvyrkov
1 sibling, 0 replies; 50+ messages in thread
From: Maxim Kuvyrkov @ 2012-10-29 18:03 UTC (permalink / raw)
To: Steve Ellcey; +Cc: Andrew Pinski, Joseph S. Myers, libc-ports
On 30/10/2012, at 7:00 AM, Steve Ellcey wrote:
> On Wed, 2012-10-17 at 10:29 -0700, Steve Ellcey wrote:
>
>> OK, Here is a version of memcpy that uses the STORE_STREAMING prefetch.
>> While it is optimized for a 32 byte prefetch, it will work correctly
>> regardless of the size of the prefetch.
>>
>> Is this version OK to checkin?
>>
>> Steve Ellcey
>> sellcey@mips.com
>
> Maxim, have you had a chance to test this version of memcpy for MIPS?
On my list for today/tomorrow. It's been a hectic couple of weeks.
--
Maxim Kuvyrkov
CodeSourcery / Mentor Graphics
^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [PATCH] Optimize MIPS memcpy
2012-10-29 18:00 ` Steve Ellcey
2012-10-29 18:03 ` Maxim Kuvyrkov
@ 2012-10-30 7:16 ` Maxim Kuvyrkov
2012-10-30 7:19 ` Maxim Kuvyrkov
2012-10-30 17:46 ` Steve Ellcey
1 sibling, 2 replies; 50+ messages in thread
From: Maxim Kuvyrkov @ 2012-10-30 7:16 UTC (permalink / raw)
To: Steve Ellcey; +Cc: Andrew Pinski, Joseph S. Myers, libc-ports
On 30/10/2012, at 7:00 AM, Steve Ellcey wrote:
> On Wed, 2012-10-17 at 10:29 -0700, Steve Ellcey wrote:
>
>> OK, Here is a version of memcpy that uses the STORE_STREAMING prefetch.
>> While it is optimized for a 32 byte prefetch, it will work correctly
>> regardless of the size of the prefetch.
>>
>> Is this version OK to checkin?
>>
>> Steve Ellcey
>> sellcey@mips.com
>
> Maxim, have you had a chance to test this version of memcpy for MIPS?
I have tested your latest version. Good news: there are no correctness issues. Bad news: it underperforms compared to my patch by 2-3 times on both N32 and N64 (didn't test O32) on the benchmark that I used. I've run the benchmark several times and results are consistent. I use oprofile on libc.so to determine how much time is spent in memcpy.
Would you please confirm that your current implementation is faster on YOUR benchmark than my patch in http://sourceware.org/ml/libc-ports/2012-09/msg00000.html ? Please make sure that PREFETCH macro in ports/sysdeps/mips/sys/asm.h gets defined to "pref", not "nop", in your build.
Thanks,
--
Maxim Kuvyrkov
CodeSourcery / Mentor Graphics
^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [PATCH] Optimize MIPS memcpy
2012-10-30 7:16 ` Maxim Kuvyrkov
@ 2012-10-30 7:19 ` Maxim Kuvyrkov
2012-10-30 17:46 ` Steve Ellcey
1 sibling, 0 replies; 50+ messages in thread
From: Maxim Kuvyrkov @ 2012-10-30 7:19 UTC (permalink / raw)
To: Andrew Pinski; +Cc: Steve Ellcey, Joseph S. Myers, libc-ports
On 30/10/2012, at 8:16 PM, Maxim Kuvyrkov wrote:
> On 30/10/2012, at 7:00 AM, Steve Ellcey wrote:
>
>> On Wed, 2012-10-17 at 10:29 -0700, Steve Ellcey wrote:
>>
>>> OK, Here is a version of memcpy that uses the STORE_STREAMING prefetch.
>>> While it is optimized for a 32 byte prefetch, it will work correctly
>>> regardless of the size of the prefetch.
>>>
>>> Is this version OK to checkin?
>>>
>>> Steve Ellcey
>>> sellcey@mips.com
>>
>> Maxim, have you had a chance to test this version of memcpy for MIPS?
>
> I have tested your latest version. Good news: there are no correctness issues. Bad news: it underperforms compared to my patch by 2-3 times on both N32 and N64 (didn't test O32) on the benchmark that I used. I've run the benchmark several times and results are consistent. I use oprofile on libc.so to determine how much time is spent in memcpy.
>
> Would you please confirm that your current implementation is faster on YOUR benchmark than my patch in http://sourceware.org/ml/libc-ports/2012-09/msg00000.html ? Please make sure that PREFETCH macro in ports/sysdeps/mips/sys/asm.h gets defined to "pref", not "nop", in your build.
Andrew,
You should also have a stake in this. Would you please benchmark Steve's patch in http://sourceware.org/ml/libc-ports/2012-10/msg00037.html vs my patch in http://sourceware.org/ml/libc-ports/2012-09/msg00000.html on your favorite benchmark and report the results?
Thank you,
--
Maxim Kuvyrkov
CodeSourcery / Mentor Graphics
^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [PATCH] Optimize MIPS memcpy
2012-10-30 7:16 ` Maxim Kuvyrkov
2012-10-30 7:19 ` Maxim Kuvyrkov
@ 2012-10-30 17:46 ` Steve Ellcey
2012-10-30 21:56 ` Maxim Kuvyrkov
1 sibling, 1 reply; 50+ messages in thread
From: Steve Ellcey @ 2012-10-30 17:46 UTC (permalink / raw)
To: Maxim Kuvyrkov; +Cc: Andrew Pinski, Joseph S. Myers, libc-ports
On Tue, 2012-10-30 at 20:16 +1300, Maxim Kuvyrkov wrote:
> On 30/10/2012, at 7:00 AM, Steve Ellcey wrote:
>
> > On Wed, 2012-10-17 at 10:29 -0700, Steve Ellcey wrote:
> >
> >> OK, Here is a version of memcpy that uses the STORE_STREAMING prefetch.
> >> While it is optimized for a 32 byte prefetch, it will work correctly
> >> regardless of the size of the prefetch.
> >>
> >> Is this version OK to checkin?
> >>
> >> Steve Ellcey
> >> sellcey@mips.com
> >
> > Maxim, have you had a chance to test this version of memcpy for MIPS?
>
> I have tested your latest version. Good news: there are no correctness issues. Bad news: it underperforms compared to my patch by 2-3 times on both N32 and N64 (didn't test O32) on the benchmark that I used. I've run the benchmark several times and results are consistent. I use oprofile on libc.so to determine how much time is spent in memcpy.
>
> Would you please confirm that your current implementation is faster on YOUR benchmark than my patch in http://sourceware.org/ml/libc-ports/2012-09/msg00000.html ? Please make sure that PREFETCH macro in ports/sysdeps/mips/sys/asm.h gets defined to "pref", not "nop", in your build.
>
> Thanks,
>
> --
> Maxim Kuvyrkov
> CodeSourcery / Mentor Graphics
Maxim, With O32 ABI I am seeing my version as slightly faster for large
memcpy's and slightly slower for small memcpy's compared to yours.
With N32 and 64 ABI's I see my version as slightly faster across the
board (a couple of percentage points). I am definitely not seeing
anything like a 2X difference. Are you sure prefetch is defined when
you tested my version? How about using double loads and stores? They
should both get set by default.
Steve Ellcey
sellcey@mips.com
^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [PATCH] Optimize MIPS memcpy
2012-10-30 17:46 ` Steve Ellcey
@ 2012-10-30 21:56 ` Maxim Kuvyrkov
2012-10-30 22:19 ` Steve Ellcey
0 siblings, 1 reply; 50+ messages in thread
From: Maxim Kuvyrkov @ 2012-10-30 21:56 UTC (permalink / raw)
To: Steve Ellcey; +Cc: Andrew Pinski, Joseph S. Myers, libc-ports
On 31/10/2012, at 6:45 AM, Steve Ellcey wrote:
> On Tue, 2012-10-30 at 20:16 +1300, Maxim Kuvyrkov wrote:
>>
...
>> I have tested your latest version. Good news: there are no correctness issues. Bad news: it underperforms compared to my patch by 2-3 times on both N32 and N64 (didn't test O32) on the benchmark that I used. I've run the benchmark several times and results are consistent. I use oprofile on libc.so to determine how much time is spent in memcpy.
>>
>> Would you please confirm that your current implementation is faster on YOUR benchmark than my patch in http://sourceware.org/ml/libc-ports/2012-09/msg00000.html ? Please make sure that PREFETCH macro in ports/sysdeps/mips/sys/asm.h gets defined to "pref", not "nop", in your build.
>>
>> Thanks,
>>
>> --
>> Maxim Kuvyrkov
>> CodeSourcery / Mentor Graphics
>
> Maxim, With O32 ABI I am seeing my version as slightly faster for large
> memcpy's and slightly slower for small memcpy's compared to yours.
>
> With N32 and 64 ABI's I see my version as slightly faster across the
> board (a couple of percentage points). I am definitely not seeing
> anything like a 2X difference. Are you sure prefetch is defined when
> you tested my version? How about using double loads and stores? They
> should both get set by default.
It turns out I was benchmarking my patch against original glibc implementation, not yours (patched files in ports/ instead of libc/ports). With the patch applied correctly, the performance is virtually the same on my benchmark. I've also checked the assembly dump of libc.so and confirmed that prefetch instructions and 8-byte loads/store are used where appropriate.
Given that your patch provides on par or better performance than mine, and it also unifies MIPS memcpy for all ABIs (as well as between glibc and Bionic!) -- I am all for your patch.
I've reviewed you patch -- code is clean and well-documented. Please apply the patch if sufficient testing has been done: big- and little-endian for o32/n32/n64 ABIs. I've tested your patch for all big-endian ABIs, so you just need to cover little-endian (which, I think, you may have done already).
Thanks for bearing with me through all the debugging process!
--
Maxim Kuvyrkov
CodeSourcery / Mentor Graphics
^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [PATCH] Optimize MIPS memcpy
2012-10-30 21:56 ` Maxim Kuvyrkov
@ 2012-10-30 22:19 ` Steve Ellcey
2012-12-19 1:51 ` Maxim Kuvyrkov
0 siblings, 1 reply; 50+ messages in thread
From: Steve Ellcey @ 2012-10-30 22:19 UTC (permalink / raw)
To: Maxim Kuvyrkov; +Cc: Andrew Pinski, Joseph S. Myers, libc-ports
On Wed, 2012-10-31 at 10:56 +1300, Maxim Kuvyrkov wrote:
> It turns out I was benchmarking my patch against original glibc implementation, not yours (patched files in ports/ instead of libc/ports). With the patch applied correctly, the performance is virtually the same on my benchmark. I've also checked the assembly dump of libc.so and confirmed that prefetch instructions and 8-byte loads/store are used where appropriate.
>
> Given that your patch provides on par or better performance than mine, and it also unifies MIPS memcpy for all ABIs (as well as between glibc and Bionic!) -- I am all for your patch.
>
> I've reviewed you patch -- code is clean and well-documented. Please apply the patch if sufficient testing has been done: big- and little-endian for o32/n32/n64 ABIs. I've tested your patch for all big-endian ABIs, so you just need to cover little-endian (which, I think, you may have done already).
>
> Thanks for bearing with me through all the debugging process!
>
> --
> Maxim Kuvyrkov
> CodeSourcery / Mentor Graphics
Sounds good. I will run one more round of testing tonight and then
check it in tomorrow if it all goes OK.
Steve Ellcey
sellcey@mips.com
^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [PATCH] Optimize MIPS memcpy
2012-10-17 17:30 ` Steve Ellcey
2012-10-29 18:00 ` Steve Ellcey
@ 2012-10-31 19:27 ` Andreas Jaeger
2012-10-31 20:04 ` Steve Ellcey
1 sibling, 1 reply; 50+ messages in thread
From: Andreas Jaeger @ 2012-10-31 19:27 UTC (permalink / raw)
To: Steve Ellcey; +Cc: libc-ports
On 10/17/2012 07:29 PM, Steve Ellcey wrote:
> [...]
> diff --git a/ports/sysdeps/mips/memcpy.S b/ports/sysdeps/mips/memcpy.S
> index 753f67c..71474e9 100644
> --- a/ports/sysdeps/mips/memcpy.S
> +++ b/ports/sysdeps/mips/memcpy.S
> @@ -1,7 +1,8 @@
> -/* Copyright (C) 2002-2012 Free Software Foundation, Inc.
> +/* Copyright (C) 2012 Free Software Foundation, Inc.
> This file is part of the GNU C Library.
> - Contributed by Hartvig Ekner <hartvige@mips.com>, 2002.
> -
> +
> + Contributed by MIPS Technologies, Inc.
> +
Steve, we're not adding any new "Contributed" lines, could you remove
that one, please?
Thanks,
Andreas
--
Andreas Jaeger aj@{suse.com,opensuse.org} Twitter/Identica: jaegerandi
SUSE LINUX Products GmbH, Maxfeldstr. 5, 90409 Nürnberg, Germany
GF: Jeff Hawn,Jennifer Guild,Felix Imendörffer,HRB16746 (AG Nürnberg)
GPG fingerprint = 93A3 365E CE47 B889 DF7F FED1 389A 563C C272 A126
^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [PATCH] Optimize MIPS memcpy
2012-10-31 19:27 ` Andreas Jaeger
@ 2012-10-31 20:04 ` Steve Ellcey
0 siblings, 0 replies; 50+ messages in thread
From: Steve Ellcey @ 2012-10-31 20:04 UTC (permalink / raw)
To: Andreas Jaeger; +Cc: libc-ports
On Wed, 2012-10-31 at 20:26 +0100, Andreas Jaeger wrote:
> On 10/17/2012 07:29 PM, Steve Ellcey wrote:
> > +
> > + Contributed by MIPS Technologies, Inc.
> > +
>
> Steve, we're not adding any new "Contributed" lines, could you remove
> that one, please?
>
> Thanks,
> Andreas
Done.
Steve Ellcey
sellcey@mips.com
^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [PATCH] Optimize MIPS memcpy
2012-10-30 22:19 ` Steve Ellcey
@ 2012-12-19 1:51 ` Maxim Kuvyrkov
2012-12-19 16:59 ` Steve Ellcey
0 siblings, 1 reply; 50+ messages in thread
From: Maxim Kuvyrkov @ 2012-12-19 1:51 UTC (permalink / raw)
To: Steve Ellcey; +Cc: Andrew Pinski, Joseph S. Myers, libc-ports
On 31/10/2012, at 11:19 AM, Steve Ellcey wrote:
> On Wed, 2012-10-31 at 10:56 +1300, Maxim Kuvyrkov wrote:
>
...
> Sounds good. I will run one more round of testing tonight and then
> check it in tomorrow if it all goes OK.
Steve,
Would you please add a NEWS entry for improved MIPS memcpy? It's a significant performance improvement, and deserves a note.
Thanks,
--
Maxim Kuvyrkov
CodeSourcery / Mentor Graphics
^ permalink raw reply [flat|nested] 50+ messages in thread
* Re: [PATCH] Optimize MIPS memcpy
2012-12-19 1:51 ` Maxim Kuvyrkov
@ 2012-12-19 16:59 ` Steve Ellcey
0 siblings, 0 replies; 50+ messages in thread
From: Steve Ellcey @ 2012-12-19 16:59 UTC (permalink / raw)
To: Maxim Kuvyrkov; +Cc: Andrew Pinski, Joseph S. Myers, libc-ports
On Wed, 2012-12-19 at 14:51 +1300, Maxim Kuvyrkov wrote:
> Steve,
>
> Would you please add a NEWS entry for improved MIPS memcpy? It's a significant performance improvement, and deserves a note.
>
> Thanks,
>
> --
> Maxim Kuvyrkov
> CodeSourcery / Mentor Graphics
OK, I have added this:
2012-12-19 Steve Ellcey <sellcey@mips.com>
* NEWS: Mention new memcpy for MIPS.
diff --git a/NEWS b/NEWS
index e58fabe..3a15555 100644
--- a/NEWS
+++ b/NEWS
@@ -25,6 +25,8 @@ Version 2.17
14838, 14856, 14863, 14865, 14866, 14868, 14869, 14871, 14872, 14879,
14889, 14893, 14898, 14914.
+* Optimization of memcpy for MIPS.
+
* CVE-2011-4609 svc_run() produces high cpu usage when accept fails with
EMFILE has been fixed (Bugzilla #14889).
^ permalink raw reply [flat|nested] 50+ messages in thread
end of thread, other threads:[~2012-12-19 16:59 UTC | newest]
Thread overview: 50+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2012-09-01 6:16 [PATCH] Optimize MIPS memcpy Maxim Kuvyrkov
2012-09-01 16:37 ` Joseph S. Myers
2012-09-03 9:12 ` Andrew T Pinski
2012-09-03 17:12 ` Maxim Kuvyrkov
2012-09-04 15:09 ` Steve Ellcey
2012-09-04 15:14 ` Carlos O'Donell
2012-09-04 17:03 ` Steve Ellcey
2012-09-04 17:28 ` Carlos O'Donell
2012-09-05 0:43 ` Maxim Kuvyrkov
2012-09-06 16:25 ` Steve Ellcey
2012-09-06 18:43 ` Roland McGrath
2012-09-06 19:37 ` Steve Ellcey
2012-09-07 21:24 ` Maxim Kuvyrkov
2012-09-11 4:35 ` Maxim Kuvyrkov
2012-09-11 15:18 ` Steve Ellcey
2012-09-20 9:05 ` Maxim Kuvyrkov
2012-09-20 18:38 ` Steve Ellcey
2012-09-28 3:48 ` Maxim Kuvyrkov
2012-10-06 4:43 ` Maxim Kuvyrkov
2012-10-08 17:04 ` Steve Ellcey
2012-10-08 22:31 ` Maxim Kuvyrkov
2012-10-09 20:50 ` Steve Ellcey
2012-10-15 17:49 ` Steve Ellcey
2012-10-15 20:20 ` Andrew Pinski
2012-10-15 20:34 ` Steve Ellcey
2012-10-15 20:42 ` Andrew Pinski
2012-10-15 20:50 ` Andrew Pinski
2012-10-15 21:36 ` Steve Ellcey
2012-10-15 21:47 ` Maxim Kuvyrkov
2012-10-17 17:30 ` Steve Ellcey
2012-10-29 18:00 ` Steve Ellcey
2012-10-29 18:03 ` Maxim Kuvyrkov
2012-10-30 7:16 ` Maxim Kuvyrkov
2012-10-30 7:19 ` Maxim Kuvyrkov
2012-10-30 17:46 ` Steve Ellcey
2012-10-30 21:56 ` Maxim Kuvyrkov
2012-10-30 22:19 ` Steve Ellcey
2012-12-19 1:51 ` Maxim Kuvyrkov
2012-12-19 16:59 ` Steve Ellcey
2012-10-31 19:27 ` Andreas Jaeger
2012-10-31 20:04 ` Steve Ellcey
2012-10-15 22:10 ` Joseph S. Myers
2012-10-15 21:29 ` Maciej W. Rozycki
2012-10-15 22:05 ` Maxim Kuvyrkov
2012-09-21 18:47 ` Steve Ellcey
2012-09-21 18:57 ` Joseph S. Myers
2012-09-21 20:41 ` [PATCH] Optimize MIPS memcpy (mips glibc test results) Steve Ellcey
2012-09-21 20:49 ` Joseph S. Myers
2012-09-21 20:56 ` Steve Ellcey
2012-09-21 19:12 ` [PATCH] Optimize MIPS memcpy Maxim Kuvyrkov
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).