https://sourceware.org/git/gitweb.cgi?p=glibc.git;h=0237b61526e716fa9597f521643908a4fda3b46a commit 0237b61526e716fa9597f521643908a4fda3b46a Author: Xuelei Zhang Date: Thu Dec 19 13:08:11 2019 +0000 aarch64: Optimized implementation of strcpy Optimize the strcpy implementation by using vector loads and operations in the main loop. Compared to aarch64/strcpy.S, it reduces latency of cases in bench-strlen by 5%~18% when the length of src is greater than 64 bytes, with gains throughout the benchmark. Checked on aarch64-linux-gnu. Reviewed-by: Wilco Dijkstra Diff: --- sysdeps/aarch64/strcpy.S | 59 ++++++++++++++++++++++-------------------------- 1 file changed, 27 insertions(+), 32 deletions(-) diff --git a/sysdeps/aarch64/strcpy.S b/sysdeps/aarch64/strcpy.S index edc1625..791644c 100644 --- a/sysdeps/aarch64/strcpy.S +++ b/sysdeps/aarch64/strcpy.S @@ -53,6 +53,12 @@ #define len x16 #define to_align x17 +/* NEON register */ +#define dataq q2 +#define datav v2 +#define datab2 b3 +#define datav2 v3 + #ifdef BUILD_STPCPY #define STRCPY __stpcpy #else @@ -199,7 +205,6 @@ L(fp_lt2): #endif ret - .p2align 6 /* Aligning here ensures that the entry code and main loop all lies within one 64-byte cache line. */ L(bulk_entry): @@ -214,46 +219,36 @@ L(bulk_entry): especially on cores with a high number of issue slots per cycle, as we get much better parallelism out of the operations. 
*/ L(main_loop): - stp data1, data2, [dst], #16 + str dataq, [dst], #16 L(entry_no_page_cross): - ldp data1, data2, [src], #16 - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f - sub tmp3, data2, zeroones - orr tmp4, data2, #REP8_7f - bic has_nul1, tmp1, tmp2 - bics has_nul2, tmp3, tmp4 - ccmp has_nul1, #0, #0, eq /* NZCV = 0000 */ - b.eq L(main_loop) + ldr dataq, [src], #16 + uminv datab2, datav.16b + mov tmp3, datav2.d[0] + cbnz tmp3, L(main_loop) /* Since we know we are copying at least 16 bytes, the fastest way to deal with the tail is to determine the location of the trailing NUL, then (re)copy the 16 bytes leading up to that. */ - cmp has_nul1, #0 #ifdef __AARCH64EB__ - /* For big-endian, carry propagation (if the final byte in the - string is 0x01) means we cannot use has_nul directly. The - easiest way to get the correct byte is to byte-swap the data - and calculate the syndrome a second time. */ + rev64 datav.16b, datav.16b +#endif + /* Calculate the location (loc) of the trailing NUL. */ + cmeq datav.16b, datav.16b, #0 + mov data1, datav.d[0] + mov data2, datav.d[1] + cmp data1, 0 csel data1, data1, data2, ne + mov pos, 8 rev data1, data1 - sub tmp1, data1, zeroones - orr tmp2, data1, #REP8_7f - bic has_nul1, tmp1, tmp2 -#else - csel has_nul1, has_nul1, has_nul2, ne -#endif - rev has_nul1, has_nul1 - clz pos, has_nul1 - add tmp1, pos, #72 - add pos, pos, #8 - csel pos, pos, tmp1, ne - add src, src, pos, lsr #3 - add dst, dst, pos, lsr #3 - ldp data1, data2, [src, #-32] - stp data1, data2, [dst, #-16] + clz tmp1, data1 + csel pos, xzr, pos, ne + add pos, pos, tmp1, lsr 3 + add src, src, pos + add dst, dst, pos + ldr dataq,[src, #-31] + str dataq,[dst, #-15] #ifdef BUILD_STPCPY - sub dstin, dst, #1 + mov dstin, dst #endif ret