From mboxrd@z Thu Jan 1 00:00:00 1970
Received: (qmail 106790 invoked by alias); 19 Dec 2019 19:45:05 -0000
Mailing-List: contact glibc-cvs-help@sourceware.org; run by ezmlm
Precedence: bulk
Sender: glibc-cvs-owner@sourceware.org
Received: (qmail 106699 invoked by uid 9943); 19 Dec 2019 19:45:04 -0000
Date: Thu, 19 Dec 2019 19:45:00 -0000
Message-ID: <20191219194504.106695.qmail@sourceware.org>
Content-Type: text/plain; charset="us-ascii"
MIME-Version: 1.0
Content-Transfer-Encoding: 7bit
From: Adhemerval Zanella
To: glibc-cvs@sourceware.org
Subject: [glibc] aarch64: Optimized strlen for strlen_asimd
X-Act-Checkin: glibc
X-Git-Author: Xuelei Zhang
X-Git-Refname: refs/heads/master
X-Git-Oldrev: 0db8e7b36665fa90c53161742dedab21d786924c
X-Git-Newrev: c2150769d015dca1805334af7743829f1e4c0b6a
X-SW-Source: 2019-q4/txt/msg00641.txt.bz2

https://sourceware.org/git/gitweb.cgi?p=glibc.git;h=c2150769d015dca1805334af7743829f1e4c0b6a

commit c2150769d015dca1805334af7743829f1e4c0b6a
Author: Xuelei Zhang
Date:   Thu Dec 19 13:41:40 2019 +0000

    aarch64: Optimized strlen for strlen_asimd
    
    Optimize the strlen implementation by using vector operations and
    loop unrolling in the main loop.  Compared to __strlen_generic, it
    reduces the latency of the bench-strlen cases by 7%~18% when the
    length of src is greater than 128 bytes, with gains throughout the
    benchmark.
    
    Checked on aarch64-linux-gnu.
    
    Reviewed-by: Wilco Dijkstra

Diff:
---
 sysdeps/aarch64/multiarch/strlen.c       |  4 ++-
 sysdeps/aarch64/multiarch/strlen_asimd.S | 42 ++++++++++++++++++++------------
 2 files changed, 29 insertions(+), 17 deletions(-)

diff --git a/sysdeps/aarch64/multiarch/strlen.c b/sysdeps/aarch64/multiarch/strlen.c
index 1db01ba..abf6513 100644
--- a/sysdeps/aarch64/multiarch/strlen.c
+++ b/sysdeps/aarch64/multiarch/strlen.c
@@ -34,7 +34,9 @@ extern __typeof (__redirect_strlen) __strlen_generic attribute_hidden;
 extern __typeof (__redirect_strlen) __strlen_asimd attribute_hidden;
 
 libc_ifunc (__strlen,
-	    (USE_ASIMD_STRLEN () ? __strlen_asimd : __strlen_generic));
+	    (USE_ASIMD_STRLEN () || IS_KUNPENG(midr)
+	     ? __strlen_asimd
+	     :__strlen_generic));
 
 # undef strlen
 strong_alias (__strlen, strlen);
diff --git a/sysdeps/aarch64/multiarch/strlen_asimd.S b/sysdeps/aarch64/multiarch/strlen_asimd.S
index 1d1c6ab..1de6cd3 100644
--- a/sysdeps/aarch64/multiarch/strlen_asimd.S
+++ b/sysdeps/aarch64/multiarch/strlen_asimd.S
@@ -48,6 +48,9 @@
 #define dataq2		q3
 #define datav2		v3
 
+#define REP8_01 0x0101010101010101
+#define REP8_7f 0x7f7f7f7f7f7f7f7f
+
 #ifdef TEST_PAGE_CROSS
 # define MIN_PAGE_SIZE 16
 #else
@@ -82,40 +85,47 @@ ENTRY_ALIGN (__strlen_asimd, 6)
 	DELOUSE (0)
 	DELOUSE (1)
 	and	tmp1, srcin, MIN_PAGE_SIZE - 1
+	mov	zeroones, REP8_01
 	cmp	tmp1, MIN_PAGE_SIZE - 16
 	b.gt	L(page_cross)
-	ldr	dataq, [srcin]
+	ldp	data1, data2, [srcin]
 #ifdef __AARCH64EB__
-	rev64	datav.16b, datav.16b
+	rev	data1, data1
+	rev	data2, data2
 #endif
 
-	/* Get the minimum value and keep going if it is not zero.  */
-	uminv	datab2, datav.16b
-	mov	tmp1, datav2.d[0]
-	cbnz	tmp1, L(main_loop_entry)
-
-	cmeq	datav.16b, datav.16b, #0
-	mov	data1, datav.d[0]
-	mov	data2, datav.d[1]
-	cmp	data1, 0
-	csel	data1, data1, data2, ne
+	sub	tmp1, data1, zeroones
+	orr	tmp2, data1, REP8_7f
+	sub	tmp3, data2, zeroones
+	orr	tmp4, data2, REP8_7f
+	bics	has_nul1, tmp1, tmp2
+	bic	has_nul2, tmp3, tmp4
+	ccmp	has_nul2, 0, 0, eq
+	beq	L(main_loop_entry)
+	csel	has_nul1, has_nul1, has_nul2, cc
 	mov	len, 8
-	rev	data1, data1
-	clz	tmp1, data1
-	csel	len, xzr, len, ne
+	rev	has_nul1, has_nul1
+	clz	tmp1, has_nul1
+	csel	len, xzr, len, cc
 	add	len, len, tmp1, lsr 3
 	ret
 
 L(main_loop_entry):
 	bic	src, srcin, 15
+	sub	src, src, 16
 
 L(main_loop):
-	ldr	dataq, [src, 16]!
+	ldr	dataq, [src, 32]!
 
 L(page_cross_entry):
 	/* Get the minimum value and keep going if it is not zero.  */
 	uminv	datab2, datav.16b
 	mov	tmp1, datav2.d[0]
+	cbz	tmp1, L(tail)
+	ldr	dataq, [src, 16]
+	uminv	datab2, datav.16b
+	mov	tmp1, datav2.d[0]
 	cbnz	tmp1, L(main_loop)
+	add	src, src, 16
 
 L(tail):
 #ifdef __AARCH64EB__
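
For readers unfamiliar with the REP8_01/REP8_7f constants added above: they
implement the well-known "find a zero byte in a word" bit trick that the new
scalar prologue applies to the first 16 bytes before entering the unrolled
ASIMD main loop.  Below is a minimal C sketch of the same check, not part of
the patch; the helper names has_nul and first_nul_index are illustrative
only, and it assumes a little-endian 64-bit load.

#include <stdint.h>

#define REP8_01 0x0101010101010101ULL	/* 0x01 in every byte.  */
#define REP8_7f 0x7f7f7f7f7f7f7f7fULL	/* 0x7f in every byte.  */

/* Nonzero iff some byte of X is zero; the lowest set 0x80 bit marks the
   first zero byte.  This mirrors the sub/orr/bics sequence in the patch:
   (x - REP8_01) & ~(x | REP8_7f).  */
static inline uint64_t
has_nul (uint64_t x)
{
  return (x - REP8_01) & ~(x | REP8_7f);
}

/* Byte index of the first zero byte in X, assuming has_nul (x) != 0 and a
   little-endian load.  The patch computes the same index with rev + clz.  */
static inline unsigned int
first_nul_index (uint64_t x)
{
  return __builtin_ctzll (has_nul (x)) >> 3;
}

The ASIMD side of the patch performs the equivalent 16-byte check with
uminv: the unsigned minimum across the vector is zero exactly when the
vector contains a NUL byte, so the unrolled main loop only needs a
cbz/cbnz on the reduced scalar.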