From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: (qmail 2625 invoked by alias); 12 Nov 2015 12:39:34 -0000 Mailing-List: contact newlib-cvs-help@sourceware.org; run by ezmlm Precedence: bulk List-Id: List-Subscribe: List-Archive: List-Post: List-Help: , Sender: newlib-cvs-owner@sourceware.org Received: (qmail 2563 invoked by uid 9078); 12 Nov 2015 12:39:33 -0000 Date: Thu, 12 Nov 2015 12:39:00 -0000 Message-ID: <20151112123933.2561.qmail@sourceware.org> Content-Type: text/plain; charset="us-ascii" MIME-Version: 1.0 Content-Transfer-Encoding: 7bit From: Corinna Vinschen To: newlib-cvs@sourceware.org Subject: [newlib-cygwin] AArch64: Tune memcpy X-Act-Checkin: newlib-cygwin X-Git-Author: Wilco Dijkstra X-Git-Refname: refs/heads/master X-Git-Oldrev: 914620a7e6be4011fadebc7e1f23ed1d0e9ed4d7 X-Git-Newrev: 3c8636acf6a3fbd1ec0bab3db5ebe6ce301de511 X-SW-Source: 2015-q4/txt/msg00036.txt.bz2 https://sourceware.org/git/gitweb.cgi?p=newlib-cygwin.git;h=3c8636acf6a3fbd1ec0bab3db5ebe6ce301de511 commit 3c8636acf6a3fbd1ec0bab3db5ebe6ce301de511 Author: Wilco Dijkstra Date: Fri Nov 6 14:09:20 2015 +0000 AArch64: Tune memcpy * newlib/libc/machine/aarch64/memcpy.S (memcpy): Further tuning for performance. Diff: --- newlib/ChangeLog | 5 ++++ newlib/libc/machine/aarch64/memcpy.S | 56 ++++++++++++++++++++---------------- 2 files changed, 37 insertions(+), 24 deletions(-) diff --git a/newlib/ChangeLog b/newlib/ChangeLog index 9ae8c8d..437058d 100644 --- a/newlib/ChangeLog +++ b/newlib/ChangeLog @@ -1,3 +1,8 @@ +2015-11-12 Wilco Dijkstra + + * newlib/libc/machine/aarch64/memcpy.S (memcpy): Further tuning for + performance. + 2015-11-12 Joseph Myers * libc/machine/arm/strcmp-arm-tiny.S: Use .cfi_sections diff --git a/newlib/libc/machine/aarch64/memcpy.S b/newlib/libc/machine/aarch64/memcpy.S index c109684..463bad0 100644 --- a/newlib/libc/machine/aarch64/memcpy.S +++ b/newlib/libc/machine/aarch64/memcpy.S @@ -73,6 +73,7 @@ #define A_h x7 #define A_hw w7 #define B_l x8 +#define B_lw w8 #define B_h x9 #define C_l x10 #define C_h x11 @@ -104,21 +105,40 @@ */ def_fn memcpy p2align=6 + prfm PLDL1KEEP, [src] add srcend, src, count add dstend, dstin, count + cmp count, 16 + b.ls L(copy16) cmp count, 96 b.hi L(copy_long) - cmp count, 16 - b.hs L(copy_medium) + /* Medium copies: 17..96 bytes. */ + sub tmp1, count, 1 + ldp A_l, A_h, [src] + tbnz tmp1, 6, L(copy96) + ldp D_l, D_h, [srcend, -16] + tbz tmp1, 5, 1f + ldp B_l, B_h, [src, 16] + ldp C_l, C_h, [srcend, -32] + stp B_l, B_h, [dstin, 16] + stp C_l, C_h, [dstend, -32] +1: + stp A_l, A_h, [dstin] + stp D_l, D_h, [dstend, -16] + ret + + .p2align 4 /* Small copies: 0..16 bytes. */ L(copy16): - tbz count, 3, 1f + cmp count, 8 + b.lo 1f ldr A_l, [src] ldr A_h, [srcend, -8] str A_l, [dstin] str A_h, [dstend, -8] ret + .p2align 4 1: tbz count, 2, 1f ldr A_lw, [src] @@ -126,33 +146,21 @@ L(copy16): str A_lw, [dstin] str A_hw, [dstend, -4] ret - .p2align 4 + + /* Copy 0..3 bytes. Use a branchless sequence that copies the same + byte 3 times if count==1, or the 2nd byte twice if count==2. */ 1: cbz count, 2f + lsr tmp1, count, 1 ldrb A_lw, [src] - tbz count, 1, 1f - ldrh A_hw, [srcend, -2] - strh A_hw, [dstend, -2] -1: strb A_lw, [dstin] + ldrb A_hw, [srcend, -1] + ldrb B_lw, [src, tmp1] + strb A_lw, [dstin] + strb B_lw, [dstin, tmp1] + strb A_hw, [dstend, -1] 2: ret .p2align 4 - /* Medium copies: 17..96 bytes. */ -L(copy_medium): - ldp A_l, A_h, [src] - tbnz count, 6, L(copy96) - ldp D_l, D_h, [srcend, -16] - tbz count, 5, 1f - ldp B_l, B_h, [src, 16] - ldp C_l, C_h, [srcend, -32] - stp B_l, B_h, [dstin, 16] - stp C_l, C_h, [dstend, -32] -1: - stp A_l, A_h, [dstin] - stp D_l, D_h, [dstend, -16] - ret - - .p2align 4 /* Copy 64..96 bytes. Copy 64 bytes from the start and 32 bytes from the end. */ L(copy96):