* [PATCH] aarch64: Optimized strnlen for Kunpeng processor
@ 2019-10-17 14:53 Xuelei Zhang
0 siblings, 0 replies; only message in thread
From: Xuelei Zhang @ 2019-10-17 14:53 UTC (permalink / raw)
To: libc-alpha, siddhesh, Szabolcs.Nagy, Wilco.Dijkstra, jiangyikun,
yikunkero
Optimize the strnlen implementation by using vector operations and
loop unrolling in the main loop. Compared to aarch64/strnlen.S, it
reduces latency of cases in bench-strnlen by 11%~24% when the length
of src is greater than 64 bytes, with gains throughout the benchmark.
Here is the result:
simple_strnlen __strnlen_kunpeng __strnlen_generic
Length 1, alignment 0: 16.0938 12.3438 11.0938
Length 1, alignment 0: 11.25 12.0312 12.1875
Length 1, alignment 0: 12.9688 12.1875 12.0312
Length 2, alignment 0: 10.9375 12.3438 12.3438
Length 2, alignment 0: 12.5 12.6562 12.8125
Length 2, alignment 0: 14.5312 12.3438 12.1875
Length 3, alignment 0: 12.8125 12.3438 12.1875
Length 3, alignment 0: 13.5938 12.5 12.6562
Length 3, alignment 0: 14.8438 12.3438 12.1875
Length 4, alignment 0: 13.75 12.3438 12.3438
Length 4, alignment 0: 15 12.3438 12.3438
Length 4, alignment 0: 15.7812 12.3438 12.1875
Length 5, alignment 0: 14.2188 12.5 12.0312
Length 5, alignment 0: 15 12.3438 12.3438
Length 5, alignment 0: 16.7188 12.3438 12.5
Length 6, alignment 0: 14.6875 12.1875 12.0312
Length 6, alignment 0: 16.4062 12.3438 12.5
Length 6, alignment 0: 17.3438 12.3438 12.3438
Length 7, alignment 0: 15.4688 12.3438 12.3438
Length 7, alignment 0: 16.5625 12.3438 12.3438
Length 7, alignment 0: 17.5 12.3438 12.3438
Length 1, alignment 1: 10.7812 11.5625 11.0938
Length 1, alignment 1: 10.9375 13.4375 13.5938
Length 1, alignment 1: 12.5 13.4375 13.75
Length 2, alignment 2: 10.9375 13.5938 13.4375
Length 2, alignment 2: 12.6562 13.5938 13.2812
Length 2, alignment 2: 14.375 13.5938 13.9062
Length 3, alignment 3: 12.3438 13.5938 13.4375
Length 3, alignment 3: 24.0625 13.5938 13.5938
Length 3, alignment 3: 15.3125 14.0625 13.2812
Length 4, alignment 4: 23.2812 12.9688 12.6562
Length 4, alignment 4: 15.3125 13.4375 12.9688
Length 4, alignment 4: 16.25 13.125 13.125
Length 5, alignment 5: 14.6875 13.2812 13.125
Length 5, alignment 5: 15.625 13.2812 12.9688
Length 5, alignment 5: 17.3438 13.2812 13.125
Length 6, alignment 6: 14.8438 13.2812 12.8125
Length 6, alignment 6: 16.5625 13.125 12.6562
Length 6, alignment 6: 17.3438 13.125 12.9688
Length 7, alignment 7: 15.4688 13.2812 13.125
Length 7, alignment 7: 17.1875 13.125 13.125
Length 7, alignment 7: 18.5938 13.2812 12.9688
Length 4, alignment 0: 16.0938 12.3438 12.0312
Length 4, alignment 1: 15.3125 13.2812 13.125
Length 8, alignment 0: 19.0625 12.5 12.1875
Length 8, alignment 1: 18.125 13.4375 12.9688
Length 16, alignment 0: 25.3125 13.5938 14.2188
Length 16, alignment 1: 24.5312 14.5312 15.1562
Length 32, alignment 0: 37.3438 14.0625 16.875
Length 32, alignment 1: 36.5625 15.3125 17.5
Length 64, alignment 0: 67.5 17.1875 20.7812
Length 64, alignment 1: 67.6562 17.5 19.8438
Length 128, alignment 0: 117.031 20.4688 23.9062
Length 128, alignment 1: 117.344 22.3438 27.8125
Length 256, alignment 0: 215.312 30 33.9062
Length 256, alignment 1: 215.312 31.25 36.0938
Length 512, alignment 0: 412.031 44.0625 57.8125
Length 512, alignment 1: 412.656 46.5625 58.5938
Length 1024, alignment 0: 806.25 79.8438 102.031
Length 1024, alignment 1: 806.094 79.2188 101.875
Length 1, alignment 0: 12.6562 12.3438 12.3438
Length 2, alignment 0: 14.0625 11.7188 12.3438
Length 3, alignment 0: 14.6875 12.0312 12.0312
Length 4, alignment 0: 15.625 12.1875 11.875
Length 5, alignment 0: 16.25 12.3438 11.875
Length 6, alignment 0: 17.0312 12.3438 12.8125
Length 7, alignment 0: 17.5 12.0312 12.3438
Length 1, alignment 1: 12.5 13.5938 13.9062
Length 2, alignment 2: 13.75 13.4375 13.5938
Length 3, alignment 3: 14.375 13.75 13.4375
Length 4, alignment 4: 15.3125 13.2812 12.8125
Length 5, alignment 5: 16.25 13.125 12.8125
Length 6, alignment 6: 16.7188 13.5938 13.4375
Length 7, alignment 7: 17.6562 13.2812 12.9688
Length 4, alignment 0: 15.3125 12.6562 12.5
Length 4, alignment 1: 15.1562 13.2812 13.2812
Length 8, alignment 0: 18.4375 12.3438 12.6562
Length 8, alignment 1: 18.4375 13.2812 13.125
Length 16, alignment 0: 25 13.4375 14.0625
Length 16, alignment 1: 24.6875 14.0625 15
Length 32, alignment 0: 37.5 13.9062 14.5312
Length 32, alignment 1: 37.0312 14.8438 17.3438
Length 64, alignment 0: 67.8125 17.1875 18.2812
Length 64, alignment 1: 67.8125 17.3438 19.8438
Length 128, alignment 0: 117.031 21.25 23.9062
Length 128, alignment 1: 116.562 21.25 25
Length 256, alignment 0: 215.156 30.3125 34.0625
Length 256, alignment 1: 215.312 31.875 35.1562
Length 512, alignment 0: 411.719 44.2188 59.0625
Length 512, alignment 1: 412.031 46.0938 57.8125
Length 1024, alignment 0: 805.938 77.5 102.344
Length 1024, alignment 1: 805.625 79.5312 102.5
---
sysdeps/aarch64/multiarch/Makefile | 1 +
sysdeps/aarch64/multiarch/ifunc-impl-list.c | 4 +
sysdeps/aarch64/multiarch/strnlen.c | 37 +++++
sysdeps/aarch64/multiarch/strnlen_generic.S | 40 ++++++
sysdeps/aarch64/multiarch/strnlen_kunpeng.S | 215 ++++++++++++++++++++++++++++
sysdeps/aarch64/strnlen.S | 12 +-
6 files changed, 305 insertions(+), 4 deletions(-)
create mode 100644 sysdeps/aarch64/multiarch/strnlen.c
create mode 100644 sysdeps/aarch64/multiarch/strnlen_generic.S
create mode 100644 sysdeps/aarch64/multiarch/strnlen_kunpeng.S
diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
index 4150b89a90..a9d163d20f 100644
--- a/sysdeps/aarch64/multiarch/Makefile
+++ b/sysdeps/aarch64/multiarch/Makefile
@@ -3,5 +3,6 @@ sysdep_routines += memcpy_generic memcpy_thunderx memcpy_thunderx2 \
memcpy_falkor memmove_falkor \
memset_generic memset_falkor memset_emag \
memchr_generic memchr_nosimd \
+ strnlen_generic strnlen_kunpeng \
strlen_generic strlen_asimd
endif
diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
index be13b916e5..1e253799a5 100644
--- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
@@ -62,5 +62,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_asimd)
IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_generic))
+ IFUNC_IMPL (i, name, strnlen,
+ IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_kunpeng)
+ IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_generic))
+
return i;
}
diff --git a/sysdeps/aarch64/multiarch/strnlen.c b/sysdeps/aarch64/multiarch/strnlen.c
new file mode 100644
index 0000000000..3c832de847
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/strnlen.c
@@ -0,0 +1,37 @@
+/* Multiple versions of strnlen. AARCH64 version.
+ Copyright (C) 2019-2020 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+/* Redefine strnlen and __strnlen so that the compiler won't complain about
+ the type mismatch with the IFUNC selector in weak_alias, below. */
+# define strnlen __redirect_strnlen
+# define __strnlen __redirect___strnlen
+# include <string.h>
+# include <init-arch.h>
+
+extern __typeof (__strnlen) __strnlen_generic attribute_hidden;
+extern __typeof (__strnlen) __strnlen_kunpeng attribute_hidden;
+# undef strnlen
+# undef __strnlen /* Drop the redirects: the real names are defined below. */
+
+libc_ifunc_redirected (__redirect___strnlen, __strnlen,
+ (IS_KUNPENG(midr) ? __strnlen_kunpeng : __strnlen_generic));
+
+weak_alias (__strnlen, strnlen);
+#endif /* IS_IN (libc) */
diff --git a/sysdeps/aarch64/multiarch/strnlen_generic.S b/sysdeps/aarch64/multiarch/strnlen_generic.S
new file mode 100644
index 0000000000..4b562bc3dd
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/strnlen_generic.S
@@ -0,0 +1,45 @@
+/* A Generic Optimized strnlen implementation for AARCH64.
+   Copyright (C) 2018-2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* The actual strnlen code is in ../strnlen.S.  If we are building libc this
+   file defines __strnlen_generic.  Otherwise the include of ../strnlen.S will
+   define the normal __strnlen entry points.  */
+
+#include <sysdep.h>
+
+#if IS_IN (libc)
+
+# define STRNLEN __strnlen_generic
+
+/* Do not hide the generic version of strnlen, we use it internally.  */
+# undef libc_hidden_def
+# define libc_hidden_def(name)
+
+/* ../strnlen.S ends with weak_alias (STRNLEN, strnlen).  In libc the
+   strnlen symbol is the IFUNC defined in strnlen.c, so do not emit a
+   second weak definition bound to the generic version from here.  */
+# undef weak_alias
+# define weak_alias(a, b)
+
+/* Bind the __GI_ aliases to the generic implementation.  */
+# ifdef SHARED
+	.globl __GI_strnlen; __GI_strnlen = STRNLEN
+	.globl __GI___strnlen; __GI___strnlen = STRNLEN
+# endif
+#endif
+
+#include "../strnlen.S"
diff --git a/sysdeps/aarch64/multiarch/strnlen_kunpeng.S b/sysdeps/aarch64/multiarch/strnlen_kunpeng.S
new file mode 100644
index 0000000000..a2be5fd1ec
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/strnlen_kunpeng.S
@@ -0,0 +1,233 @@
+/* Optimized strnlen for Huawei Kunpeng processor.
+
+   Copyright (C) 2013-2019 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, Advanced SIMD.
+ */
+
+/* size_t __strnlen_kunpeng (const char *srcin, size_t limit)
+
+   Return the offset of the first NUL byte at or after SRCIN, or LIMIT
+   if no NUL occurs within the first LIMIT bytes.  The first 16-byte
+   block is tested with two 64-bit scalar words (bytes that precede
+   SRCIN are masked to 0xff first); every later block is tested 16
+   bytes at a time with UMINV.  */
+
+/* Arguments and results.  */
+#define srcin		x0
+#define len		x0
+#define limit		x1
+
+/* Locals and temporaries.  */
+#define src		x2
+#define data1		x3
+#define data2		x4
+#define data2a		x5
+#define has_nul1	x6
+#define has_nul2	x7
+#define tmp1		x8
+#define tmp2		x9
+#define tmp3		x10
+#define tmp4		x11
+#define zeroones	x12
+#define pos		x13
+#define limit_wd	x14
+
+/* NEON register */
+#define dataq		q2
+#define datav		v2
+#define datab2		b3
+#define dataq2		q3
+#define datav2		v3
+
+#define REP8_01 0x0101010101010101
+#define REP8_7f 0x7f7f7f7f7f7f7f7f
+#define REP8_80 0x8080808080808080
+
+ENTRY_ALIGN_AND_PAD (__strnlen_kunpeng, 6, 9)
+	DELOUSE (0)
+	DELOUSE (1)
+	DELOUSE (2)
+	cbz	limit, L(hit_limit)
+	mov	zeroones, #REP8_01
+	bic	src, srcin, #15
+	ands	tmp1, srcin, #15
+	b.ne	L(misaligned)
+	/* Calculate the number of full and partial words -1.  */
+	sub	limit_wd, limit, #1	/* Limit != 0, so no underflow.  */
+	lsr	limit_wd, limit_wd, #4	/* Convert to Qwords.  */
+
+	/* NUL detection works on the principle that (X - 1) & (~X) & 0x80
+	   (=> (X - 1) & ~(X | 0x7f)) is non-zero iff a byte is zero, and
+	   can be done in parallel across the entire word.  */
+	/* Only the first 16 bytes are tested this way, as two Dwords; the
+	   remaining blocks are handled by the vector loop at L(loop).  */
+
+	/* Start of critical section -- keep to one 64Byte cache line.  */
+	ldp	data1, data2, [src], #16
+L(realigned):
+	sub	tmp1, data1, zeroones
+	orr	tmp2, data1, #REP8_7f
+	sub	tmp3, data2, zeroones
+	orr	tmp4, data2, #REP8_7f
+	bic	has_nul1, tmp1, tmp2
+	bic	has_nul2, tmp3, tmp4
+	subs	limit_wd, limit_wd, #1
+	orr	tmp1, has_nul1, has_nul2
+	ccmp	tmp1, #0, #0, pl	/* NZCV = 0000  */
+	b.eq	L(loop)
+	/* End of critical section -- keep to one 64Byte cache line.  */
+
+	orr	tmp1, has_nul1, has_nul2
+	cbz	tmp1, L(hit_limit)	/* No null in final Qword.  */
+
+	/* We know there's a null in the final Qword.  The easiest thing
+	   to do now is work out the length of the string and return
+	   MIN (len, limit).  */
+
+	sub	len, src, srcin
+	cbz	has_nul1, L(nul_in_data2)
+#ifdef __AARCH64EB__
+	mov	data2, data1
+#endif
+	sub	len, len, #8
+	mov	has_nul2, has_nul1
+L(nul_in_data2):
+#ifdef __AARCH64EB__
+	/* For big-endian, carry propagation (if the final byte in the
+	   string is 0x01) means we cannot use has_nul directly.  The
+	   easiest way to get the correct byte is to byte-swap the data
+	   and calculate the syndrome a second time.  */
+	rev	data2, data2
+	sub	tmp1, data2, zeroones
+	orr	tmp2, data2, #REP8_7f
+	bic	has_nul2, tmp1, tmp2
+#endif
+	sub	len, len, #8
+	rev	has_nul2, has_nul2
+	clz	pos, has_nul2
+	add	len, len, pos, lsr #3		/* Bits to bytes.  */
+	cmp	len, limit
+	csel	len, len, limit, ls		/* Return the lower value.  */
+	RET
+
+	/* Vector loop, unrolled twice.  UMINV puts the smallest byte of the
+	   block in tmp1, so tmp1 == 0 iff the block holds a NUL.  The CCMP
+	   forces Z when limit_wd has gone negative, so either exit branch is
+	   taken on "NUL found" or "limit exhausted".  */
+L(loop):
+	ldr	dataq, [src], #16
+	uminv	datab2, datav.16b
+	mov	tmp1, datav2.d[0]
+	subs	limit_wd, limit_wd, #1
+	ccmp	tmp1, #0, #4, pl	/* NZCV = 0100  */
+	b.eq	L(loop_end)
+	ldr	dataq, [src], #16
+	uminv	datab2, datav.16b
+	mov	tmp1, datav2.d[0]
+	subs	limit_wd, limit_wd, #1
+	ccmp	tmp1, #0, #4, pl	/* NZCV = 0100  */
+	b.ne	L(loop)
+L(loop_end):
+	/* End of critical section -- keep to one 64Byte cache line.  */
+
+	cbnz	tmp1, L(hit_limit)	/* No null in final Qword.  */
+
+	/* We know there's a null in the final Qword.  The easiest thing
+	   to do now is work out the length of the string and return
+	   MIN (len, limit).  */
+
+#ifdef __AARCH64EB__
+	rev64	datav.16b, datav.16b
+#endif
+	/* Set the NUL bytes to 0xff and the rest to 0x00, move the data into
+	   a pair of scalars and then compute the length from the earliest
+	   NUL byte.  */
+
+	cmeq	datav.16b, datav.16b, #0
+#ifdef __AARCH64EB__
+	/* LDR Q loads the block as a single 128-bit big-endian value, so
+	   after the REV64 above the eight lowest-addressed bytes are in
+	   d[1].  That half must be examined first.  */
+	mov	data1, datav.d[1]
+	mov	data2, datav.d[0]
+#else
+	mov	data1, datav.d[0]
+	mov	data2, datav.d[1]
+#endif
+	cmp	data1, 0
+	csel	data1, data1, data2, ne
+	sub	len, src, srcin
+	sub	len, len, #16
+	rev	data1, data1
+	add	tmp2, len, 8
+	clz	tmp1, data1
+	csel	len, len, tmp2, ne
+	add	len, len, tmp1, lsr 3
+	cmp	len, limit
+	csel	len, len, limit, ls		/* Return the lower value.  */
+	RET
+
+L(misaligned):
+	/* Deal with a partial first word.
+	   We're doing two things in parallel here;
+	   1) Calculate the number of words (but avoiding overflow if
+	      limit is near ULONG_MAX) - to do this we need to work out
+	      limit + tmp1 - 1 as a 65-bit value before shifting it;
+	   2) Load and mask the initial data words - we force the bytes
+	      before the ones we are interested in to 0xff - this ensures
+	      early bytes will not hit any zero detection.  */
+	sub	limit_wd, limit, #1
+	neg	tmp4, tmp1
+	cmp	tmp1, #8
+
+	and	tmp3, limit_wd, #15
+	lsr	limit_wd, limit_wd, #4
+	mov	tmp2, #~0
+
+	ldp	data1, data2, [src], #16
+	lsl	tmp4, tmp4, #3		/* Bytes beyond alignment -> bits.  */
+	add	tmp3, tmp3, tmp1
+
+#ifdef __AARCH64EB__
+	/* Big-endian.  Early bytes are at MSB.  */
+	lsl	tmp2, tmp2, tmp4	/* Shift (tmp4 & 63).  */
+#else
+	/* Little-endian.  Early bytes are at LSB.  */
+	lsr	tmp2, tmp2, tmp4	/* Shift (tmp4 & 63).  */
+#endif
+	add	limit_wd, limit_wd, tmp3, lsr #4
+
+	orr	data1, data1, tmp2
+	orr	data2a, data2, tmp2
+
+	csinv	data1, data1, xzr, le
+	csel	data2, data2, data2a, le
+	b	L(realigned)
+
+L(hit_limit):
+	mov	len, limit
+	RET
+END (__strnlen_kunpeng)
+weak_alias (__strnlen_kunpeng, strnlen_kunpeng)
+libc_hidden_builtin_def (strnlen_kunpeng)
diff --git a/sysdeps/aarch64/strnlen.S b/sysdeps/aarch64/strnlen.S
index 70283c8074..9a4dfbda15 100644
--- a/sysdeps/aarch64/strnlen.S
+++ b/sysdeps/aarch64/strnlen.S
@@ -25,6 +25,10 @@
* ARMv8-a, AArch64
*/
+#ifndef STRNLEN
+# define STRNLEN __strnlen
+#endif
+
/* Arguments and results. */
#define srcin x0
#define len x0
@@ -49,7 +53,7 @@
#define REP8_7f 0x7f7f7f7f7f7f7f7f
#define REP8_80 0x8080808080808080
-ENTRY_ALIGN_AND_PAD (__strnlen, 6, 9)
+ENTRY_ALIGN_AND_PAD (STRNLEN, 6, 9)
DELOUSE (0)
DELOUSE (1)
DELOUSE (2)
@@ -159,7 +163,7 @@ L(misaligned):
L(hit_limit):
mov len, limit
RET
-END (__strnlen)
-libc_hidden_def (__strnlen)
-weak_alias (__strnlen, strnlen)
+END (STRNLEN)
+libc_hidden_def (STRNLEN)
+weak_alias (STRNLEN, strnlen)
libc_hidden_def (strnlen)
--
2.14.1.windows.1
^ permalink raw reply [flat|nested] only message in thread
only message in thread, other threads:[~2019-10-17 14:53 UTC | newest]
Thread overview: (only message) (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-10-17 14:53 [PATCH] aarch64: Optimized strnlen for Kunpeng processor Xuelei Zhang
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).