From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (qmail 29912 invoked by alias); 27 Nov 2012 20:37:58 -0000 Received: (qmail 29838 invoked by uid 22791); 27 Nov 2012 20:37:57 -0000 X-SWARE-Spam-Status: No, hits=-2.6 required=5.0 tests=AWL,BAYES_00,RP_MATCHES_RCVD,TW_BG,TW_CP X-Spam-Check-By: sourceware.org Received: from dns1.mips.com (HELO dns1.mips.com) (12.201.5.69) by sourceware.org (qpsmtpd/0.43rc1) with ESMTP; Tue, 27 Nov 2012 20:37:48 +0000 Received: from mailgate1.mips.com (mailgate1.mips.com [12.201.5.111]) by dns1.mips.com (8.13.8/8.13.8) with ESMTP id qARKbiRc013496; Tue, 27 Nov 2012 12:37:44 -0800 X-M-MSG: Received: from exchdb01.mips.com (unknown [192.168.36.84]) (using TLSv1 with cipher AES128-SHA (128/128 bits)) (No client certificate requested) by mailgate1.mips.com (Postfix) with ESMTP id 21585364673; Tue, 27 Nov 2012 12:37:40 -0800 (PST) Received: from ubuntu-sellcey.mips.com (192.168.65.53) by exchhub01.mips.com (192.168.36.84) with Microsoft SMTP Server id 14.1.270.1; Tue, 27 Nov 2012 12:37:38 -0800 Received: by ubuntu-sellcey.mips.com (sSMTP sendmail emulation); Tue, 27 Nov 2012 12:37:38 -0800 From: "Steve Ellcey " Date: Tue, 27 Nov 2012 20:37:00 -0000 To: CC: , Subject: [patch, mips] More mips memcpy improvements User-Agent: Heirloom mailx 12.4 7/29/08 MIME-Version: 1.0 Content-Type: text/plain; charset="us-ascii" Content-Transfer-Encoding: 7bit Message-ID: <1f415392-abbf-4875-9543-ef1ff49206bf@EXCHHUB01.MIPS.com> X-EMS-Proccessed: 6LP3oGfGVdcdb8o1aBnt6w== X-EMS-STAMP: I31ywe5wstcVfdN7iiHHOA== Mailing-List: contact libc-ports-help@sourceware.org; run by ezmlm Precedence: bulk List-Id: List-Subscribe: List-Post: List-Help: , Sender: libc-ports-owner@sourceware.org X-SW-Source: 2012-11/txt/msg00113.txt.bz2 I would like to take another crack at using the MIPS prepare-for-store prefetch hint in memcpy. The performance improvement in using it is substantial, but the problem of course is that you cannot prefetch anything that is outside the area being copied. I have addressed that problem by having a variable MAX_PREFETCH_SIZE set to 128. As long as no CPU has a cache line greater then 128 bytes, this code should work fine. It is still optimized for a cache line of 32 bytes, but it will work for machines that have a cache line of 64 or 128 bytes. On a MIPS 74K board, using a test case that is nothing but memcpy's of various sizes, I got a timing of 72.676 seconds for my earlier memcpy and a timing of 41.685 seconds with this new version. To test for correctness, I added two macros, RETURN_FIRST_PREFETCH and RETURN_LAST_PREFETCH, which change memcpy to return the first (last) address being prefetched instead of the destination pointer. I used these to verify that my memcpy does not prefetch any address less then 128 bytes after the beginning of the destination buffer or less then 128 bytes before the end of the destination buffer. I was also able to directly test this memcpy on a Cavium system with 128 byte cache lines by running a test and verifying that it didn't zero out any bytes outside the destination buffer. This new memcpy also showed a performance improvement on the Cavium system over my previous version, with my performance testcase going from 3.848 seconds to 3.597 seconds. Not as impressive a gain as the 74K, but still faster. In addition to the prefetches I made a couple of other small improvements such as reordering the partial loads in the unaligned loop copy case. The idea is to separate the load left/load right instructions that load the same register from each other because if they are one after the other it can cause a 1 cycle delay due to a partial load having to both read and write the destination register. Andrew and Maxim, could you take a look at this new version and see if it works OK for you? Steve Ellcey sellcey@mips.com 2012-11-27 Steve Ellcey * sysdeps/mips/memcpy.S: Change prefetch hint, reorder partial loads, set and use MAX_PREFETCH_SIZE. diff --git a/ports/sysdeps/mips/memcpy.S b/ports/sysdeps/mips/memcpy.S index abb07f9..31c00f1 100644 --- a/ports/sysdeps/mips/memcpy.S +++ b/ports/sysdeps/mips/memcpy.S @@ -26,12 +26,12 @@ #include #include #define PREFETCH_LOAD_HINT PREFETCH_HINT_LOAD_STREAMED -#define PREFETCH_STORE_HINT PREFETCH_HINT_STORE_STREAMED +#define PREFETCH_STORE_HINT PREFETCH_HINT_PREPAREFORSTORE #elif _COMPILING_NEWLIB #include "machine/asm.h" #include "machine/regdef.h" #define PREFETCH_LOAD_HINT PREFETCH_HINT_LOAD_STREAMED -#define PREFETCH_STORE_HINT PREFETCH_HINT_STORE_STREAMED +#define PREFETCH_STORE_HINT PREFETCH_HINT_PREPAREFORSTORE #else #include #include @@ -141,11 +141,11 @@ #ifdef USE_DOUBLE # define PREFETCH_CHUNK 64 # define PREFETCH_FOR_LOAD(chunk, reg) \ - pref PREFETCH_LOAD_HINT, (chunk)*32(reg); \ - pref PREFETCH_LOAD_HINT, ((chunk)+1)*32(reg) + pref PREFETCH_LOAD_HINT, (chunk)*64(reg); \ + pref PREFETCH_LOAD_HINT, ((chunk)*64)+32(reg) # define PREFETCH_FOR_STORE(chunk, reg) \ - pref PREFETCH_STORE_HINT, (chunk)*32(reg); \ - pref PREFETCH_STORE_HINT, ((chunk)+1)*32(reg) + pref PREFETCH_STORE_HINT, (chunk)*64(reg); \ + pref PREFETCH_STORE_HINT, ((chunk)*64)+32(reg) #else # define PREFETCH_CHUNK 32 # define PREFETCH_FOR_LOAD(chunk, reg) \ @@ -153,7 +153,30 @@ # define PREFETCH_FOR_STORE(chunk, reg) \ pref PREFETCH_STORE_HINT, (chunk)*32(reg) #endif -# define PREFETCH_LIMIT (5 * PREFETCH_CHUNK) + +/* MAX_PREFETCH_SIZE is the maximum size of a prefetch, it must not be less + * then PREFETCH_CHUNK, the assumed size of each prefetch. If the real size + * of a prefetch is greater then MAX_PREFETCH_SIZE and the PREPAREFORSTORE + * hint is used, the code will not work corrrectly. If PREPAREFORSTORE is not + * used then MAX_PREFETCH_SIZE does not matter. */ +#define MAX_PREFETCH_SIZE 128 +/* PREFETCH_LIMIT is set based on the fact that we neve use an offset number + * greater then 5 on a STORE prefetch and that a single prefetch can never be + * larger then MAX_PREFETCH_SIZE. We add the extra 32 when USE_DOUBLE is set + * because we actually do two prefetches in that case, one 32 bytes after the + * other. */ +#ifdef USE_DOUBLE +# define PREFETCH_LIMIT (5 * PREFETCH_CHUNK) + 32 + MAX_PREFETCH_SIZE +#else +# define PREFETCH_LIMIT (5 * PREFETCH_CHUNK) + MAX_PREFETCH_SIZE +#endif +#if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE) \ + && ((PREFETCH_CHUNK * 4) < MAX_PREFETCH_SIZE) +/* We cannot handle this because the initial prefetches may fetch bytes that + * are before the buffer being copied. We start copies with an offset + * of 4 so avoid this situation when using PREPAREFORSTORE. */ +#error "PREFETCH_CHUNK is too large and/or MAX_PREFETCH_SIZE is too small." +#endif #else /* USE_PREFETCH not defined */ # define PREFETCH_FOR_LOAD(offset, reg) # define PREFETCH_FOR_STORE(offset, reg) @@ -258,7 +281,11 @@ L(memcpy): */ slti t2,a2,(2 * NSIZE) bne t2,zero,L(lastb) +#if defined(RETURN_FIRST_PREFETCH) || defined(RETURN_LAST_PREFETCH) + move v0,zero +#else move v0,a0 +#endif /* * If src and dst have different alignments, go to L(unaligned), if they * have the same alignment (but are not actually aligned) do a partial @@ -306,22 +333,46 @@ L(aligned): PREFETCH_FOR_LOAD (0, a1) PREFETCH_FOR_LOAD (1, a1) PREFETCH_FOR_LOAD (2, a1) + PREFETCH_FOR_LOAD (3, a1) +#if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT != PREFETCH_HINT_PREPAREFORSTORE) PREFETCH_FOR_STORE (1, a0) -#if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE) - sltu v1,t9,a0 /* If a0 > t9 don't use next prefetch */ - bgtz v1,L(loop16w) + PREFETCH_FOR_STORE (2, a0) + PREFETCH_FOR_STORE (3, a0) +#endif +#if defined(RETURN_FIRST_PREFETCH) && defined(USE_PREFETCH) +#if PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE + sltu v1,t9,a0 + bgtz v1,L(skip_set) nop + PTR_ADDIU v0,a0,(PREFETCH_CHUNK*4) +L(skip_set): +#else + PTR_ADDIU v0,a0,(PREFETCH_CHUNK*1) +#endif +#endif +#if defined(RETURN_LAST_PREFETCH) && defined(USE_PREFETCH) \ + && (PREFETCH_STORE_HINT != PREFETCH_HINT_PREPAREFORSTORE) + PTR_ADDIU v0,a0,(PREFETCH_CHUNK*3) +#ifdef USE_DOUBLE + PTR_ADDIU v0,v0,32 +#endif #endif - PREFETCH_FOR_STORE (2, a0) L(loop16w): - PREFETCH_FOR_LOAD (3, a1) C_LD t0,UNIT(0)(a1) #if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE) - bgtz v1,L(skip_pref30_96) + sltu v1,t9,a0 /* If a0 > t9 don't use next prefetch */ + bgtz v1,L(skip_pref) #endif C_LD t1,UNIT(1)(a1) - PREFETCH_FOR_STORE (3, a0) -L(skip_pref30_96): + PREFETCH_FOR_STORE (4, a0) + PREFETCH_FOR_STORE (5, a0) +#if defined(RETURN_LAST_PREFETCH) && defined(USE_PREFETCH) + PTR_ADDIU v0,a0,(PREFETCH_CHUNK*5) +#ifdef USE_DOUBLE + PTR_ADDIU v0,v0,32 +#endif +#endif +L(skip_pref): C_LD REG2,UNIT(2)(a1) C_LD REG3,UNIT(3)(a1) C_LD REG4,UNIT(4)(a1) @@ -340,12 +391,7 @@ L(skip_pref30_96): C_ST REG7,UNIT(7)(a0) C_LD t0,UNIT(8)(a1) -#if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE) - bgtz v1,L(skip_pref30_128) -#endif C_LD t1,UNIT(9)(a1) - PREFETCH_FOR_STORE (4, a0) -L(skip_pref30_128): C_LD REG2,UNIT(10)(a1) C_LD REG3,UNIT(11)(a1) C_LD REG4,UNIT(12)(a1) @@ -362,9 +408,6 @@ L(skip_pref30_128): C_ST REG6,UNIT(14)(a0) C_ST REG7,UNIT(15)(a0) PTR_ADDIU a0,a0,UNIT(16) /* adding 64/128 to dest */ -#if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE) - sltu v1,t9,a0 -#endif bne a0,a3,L(loop16w) PTR_ADDIU a1,a1,UNIT(16) /* adding 64/128 to src */ move a2,t8 @@ -416,8 +459,8 @@ L(chk1w): /* copying in words (4-byte or 8-byte chunks) */ L(wordCopy_loop): C_LD REG3,UNIT(0)(a1) - PTR_ADDIU a1,a1,UNIT(1) PTR_ADDIU a0,a0,UNIT(1) + PTR_ADDIU a1,a1,UNIT(1) bne a0,a3,L(wordCopy_loop) C_ST REG3,UNIT(-1)(a0) @@ -427,8 +470,8 @@ L(lastb): PTR_ADDU a3,a0,a2 /* a3 is the last dst address */ L(lastbloop): lb v1,0(a1) - PTR_ADDIU a1,a1,1 PTR_ADDIU a0,a0,1 + PTR_ADDIU a1,a1,1 bne a0,a3,L(lastbloop) sb v1,-1(a0) L(leave): @@ -475,35 +518,46 @@ L(ua_chk16w): PREFETCH_FOR_LOAD (0, a1) PREFETCH_FOR_LOAD (1, a1) PREFETCH_FOR_LOAD (2, a1) +#if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT != PREFETCH_HINT_PREPAREFORSTORE) PREFETCH_FOR_STORE (1, a0) -#if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE) - sltu v1,t9,a0 - bgtz v1,L(ua_loop16w) /* skip prefetch for too short arrays */ + PREFETCH_FOR_STORE (2, a0) + PREFETCH_FOR_STORE (3, a0) +#endif +#if defined(RETURN_FIRST_PREFETCH) && defined(USE_PREFETCH) +#if (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE) + sltu v1,t9,a0 + bgtz v1,L(ua_skip_set) nop + PTR_ADDIU v0,a0,(PREFETCH_CHUNK*4) +L(ua_skip_set): +#else + PTR_ADDIU v0,a0,(PREFETCH_CHUNK*1) +#endif #endif - PREFETCH_FOR_STORE (2, a0) L(ua_loop16w): PREFETCH_FOR_LOAD (3, a1) C_LDHI t0,UNIT(0)(a1) - C_LDLO t0,UNITM1(1)(a1) C_LDHI t1,UNIT(1)(a1) + C_LDHI REG2,UNIT(2)(a1) #if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE) - bgtz v1,L(ua_skip_pref30_96) + sltu v1,t9,a0 + bgtz v1,L(ua_skip_pref) #endif + C_LDHI REG3,UNIT(3)(a1) + PREFETCH_FOR_STORE (4, a0) + PREFETCH_FOR_STORE (5, a0) +L(ua_skip_pref): + C_LDHI REG4,UNIT(4)(a1) + C_LDHI REG5,UNIT(5)(a1) + C_LDHI REG6,UNIT(6)(a1) + C_LDHI REG7,UNIT(7)(a1) + C_LDLO t0,UNITM1(1)(a1) C_LDLO t1,UNITM1(2)(a1) - PREFETCH_FOR_STORE (3, a0) -L(ua_skip_pref30_96): - C_LDHI REG2,UNIT(2)(a1) C_LDLO REG2,UNITM1(3)(a1) - C_LDHI REG3,UNIT(3)(a1) C_LDLO REG3,UNITM1(4)(a1) - C_LDHI REG4,UNIT(4)(a1) C_LDLO REG4,UNITM1(5)(a1) - C_LDHI REG5,UNIT(5)(a1) C_LDLO REG5,UNITM1(6)(a1) - C_LDHI REG6,UNIT(6)(a1) C_LDLO REG6,UNITM1(7)(a1) - C_LDHI REG7,UNIT(7)(a1) C_LDLO REG7,UNITM1(8)(a1) PREFETCH_FOR_LOAD (4, a1) C_ST t0,UNIT(0)(a0) @@ -515,25 +569,20 @@ L(ua_skip_pref30_96): C_ST REG6,UNIT(6)(a0) C_ST REG7,UNIT(7)(a0) C_LDHI t0,UNIT(8)(a1) - C_LDLO t0,UNITM1(9)(a1) C_LDHI t1,UNIT(9)(a1) -#if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE) - bgtz v1,L(ua_skip_pref30_128) -#endif - C_LDLO t1,UNITM1(10)(a1) - PREFETCH_FOR_STORE (4, a0) -L(ua_skip_pref30_128): C_LDHI REG2,UNIT(10)(a1) - C_LDLO REG2,UNITM1(11)(a1) C_LDHI REG3,UNIT(11)(a1) - C_LDLO REG3,UNITM1(12)(a1) C_LDHI REG4,UNIT(12)(a1) - C_LDLO REG4,UNITM1(13)(a1) C_LDHI REG5,UNIT(13)(a1) - C_LDLO REG5,UNITM1(14)(a1) C_LDHI REG6,UNIT(14)(a1) - C_LDLO REG6,UNITM1(15)(a1) C_LDHI REG7,UNIT(15)(a1) + C_LDLO t0,UNITM1(9)(a1) + C_LDLO t1,UNITM1(10)(a1) + C_LDLO REG2,UNITM1(11)(a1) + C_LDLO REG3,UNITM1(12)(a1) + C_LDLO REG4,UNITM1(13)(a1) + C_LDLO REG5,UNITM1(14)(a1) + C_LDLO REG6,UNITM1(15)(a1) C_LDLO REG7,UNITM1(16)(a1) PREFETCH_FOR_LOAD (5, a1) C_ST t0,UNIT(8)(a0) @@ -545,9 +594,6 @@ L(ua_skip_pref30_128): C_ST REG6,UNIT(14)(a0) C_ST REG7,UNIT(15)(a0) PTR_ADDIU a0,a0,UNIT(16) /* adding 64/128 to dest */ -#if defined(USE_PREFETCH) && (PREFETCH_STORE_HINT == PREFETCH_HINT_PREPAREFORSTORE) - sltu v1,t9,a0 -#endif bne a0,a3,L(ua_loop16w) PTR_ADDIU a1,a1,UNIT(16) /* adding 64/128 to src */ move a2,t8 @@ -564,20 +610,20 @@ L(ua_chkw): beq a2,t8,L(ua_chk1w) /* When a2=t8, no 32-byte chunk */ nop C_LDHI t0,UNIT(0)(a1) - C_LDLO t0,UNITM1(1)(a1) C_LDHI t1,UNIT(1)(a1) - C_LDLO t1,UNITM1(2)(a1) C_LDHI REG2,UNIT(2)(a1) - C_LDLO REG2,UNITM1(3)(a1) C_LDHI REG3,UNIT(3)(a1) - C_LDLO REG3,UNITM1(4)(a1) C_LDHI REG4,UNIT(4)(a1) - C_LDLO REG4,UNITM1(5)(a1) C_LDHI REG5,UNIT(5)(a1) - C_LDLO REG5,UNITM1(6)(a1) C_LDHI REG6,UNIT(6)(a1) - C_LDLO REG6,UNITM1(7)(a1) C_LDHI REG7,UNIT(7)(a1) + C_LDLO t0,UNITM1(1)(a1) + C_LDLO t1,UNITM1(2)(a1) + C_LDLO REG2,UNITM1(3)(a1) + C_LDLO REG3,UNITM1(4)(a1) + C_LDLO REG4,UNITM1(5)(a1) + C_LDLO REG5,UNITM1(6)(a1) + C_LDLO REG6,UNITM1(7)(a1) C_LDLO REG7,UNITM1(8)(a1) PTR_ADDIU a1,a1,UNIT(8) C_ST t0,UNIT(0)(a0) @@ -603,8 +649,8 @@ L(ua_chk1w): L(ua_wordCopy_loop): C_LDHI v1,UNIT(0)(a1) C_LDLO v1,UNITM1(1)(a1) - PTR_ADDIU a1,a1,UNIT(1) PTR_ADDIU a0,a0,UNIT(1) + PTR_ADDIU a1,a1,UNIT(1) bne a0,a3,L(ua_wordCopy_loop) C_ST v1,UNIT(-1)(a0) @@ -614,8 +660,8 @@ L(ua_smallCopy): PTR_ADDU a3,a0,a2 /* a3 is the last dst address */ L(ua_smallCopy_loop): lb v1,0(a1) - PTR_ADDIU a1,a1,1 PTR_ADDIU a0,a0,1 + PTR_ADDIU a1,a1,1 bne a0,a3,L(ua_smallCopy_loop) sb v1,-1(a0) @@ -625,6 +671,8 @@ L(ua_smallCopy_loop): .set at .set reorder END(MEMCPY_NAME) +#ifndef ANDROID_CHANGES #ifdef _LIBC libc_hidden_builtin_def (MEMCPY_NAME) #endif +#endif