* [PATCH 1/2] powerpc: Optimized strncpy for POWER9
@ 2020-08-20 18:29 Raphael Moreira Zinsly
2020-08-20 18:29 ` [PATCH 2/2] powerpc: Optimized stpncpy " Raphael Moreira Zinsly
` (4 more replies)
0 siblings, 5 replies; 9+ messages in thread
From: Raphael Moreira Zinsly @ 2020-08-20 18:29 UTC (permalink / raw)
To: libc-alpha; +Cc: Raphael Moreira Zinsly
Similar to the strcpy P9 optimization, this version uses VSX to improve
performance.
---
sysdeps/powerpc/powerpc64/le/power9/strncpy.S | 276 ++++++++++++++++++
sysdeps/powerpc/powerpc64/multiarch/Makefile | 2 +-
.../powerpc64/multiarch/ifunc-impl-list.c | 5 +
.../powerpc64/multiarch/strncpy-power9.S | 26 ++
sysdeps/powerpc/powerpc64/multiarch/strncpy.c | 7 +
5 files changed, 315 insertions(+), 1 deletion(-)
create mode 100644 sysdeps/powerpc/powerpc64/le/power9/strncpy.S
create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strncpy-power9.S
diff --git a/sysdeps/powerpc/powerpc64/le/power9/strncpy.S b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
new file mode 100644
index 0000000000..cde68384d4
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
@@ -0,0 +1,276 @@
+/* Optimized strncpy implementation for PowerPC64/POWER9.
+ Copyright (C) 2020 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+# ifndef STRNCPY
+# define FUNC_NAME strncpy
+# else
+# define FUNC_NAME STRNCPY
+# endif
+
+/* Implements the function
+
+ char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5])
+
+ The implementation can load bytes past a null terminator, but only
+ up to the next 16B boundary, so it never crosses a page. */
+
+.machine power9
+ENTRY_TOCLESS (FUNC_NAME, 4)
+ CALL_MCOUNT 2
+
+ cmpwi r5, 0
+ beqlr
+ /* NULL string optimisation */
+ lbz r0,0(r4)
+ stb r0,0(r3)
+ addi r11,r3,1
+ addi r5,r5,-1
+ vspltisb v18,0 /* Zeroes in v18 */
+ cmpwi r0,0
+ beq L(zero_padding_loop)
+
+ cmpwi r5,0
+ beqlr
+
+L(cont):
+ addi r4,r4,1
+ neg r7,r4
+ rldicl r9,r7,0,60 /* How many bytes to get source 16B aligned? */
+
+ /* Get source 16B aligned */
+ lvx v0,0,r4
+ lvsr v1,0,r4
+ vperm v0,v18,v0,v1
+
+ vcmpequb v6,v0,v18 /* 0xff if byte is NULL, 0x00 otherwise */
+ vctzlsbb r7,v6 /* Number of trailing zeroes */
+ addi r8,r7,1 /* Add null terminator */
+
+ /* r8 = bytes including null
+ r9 = bytes to get source 16B aligned
+ if r8 > r9
+ no null, copy r9 bytes
+ else
+ there is a null, copy r8 bytes and return. */
+ cmpd r8,r9
+ bgt L(no_null)
+
+ cmpd r8,r5 /* r8 <= n? */
+ ble L(null)
+ sldi r10,r5,56 /* stxvl wants size in top 8 bits */
+ stxvl 32+v0,r11,r10 /* Partial store */
+
+ blr
+
+L(null):
+ sldi r10,r8,56 /* stxvl wants size in top 8 bits */
+ stxvl 32+v0,r11,r10 /* Partial store */
+
+ add r11,r11,r8
+ sub r5,r5,r8
+ b L(zero_padding_loop)
+
+L(no_null):
+ cmpd r9,r5 /* Check if length was reached. */
+ bge L(n_tail1)
+ sldi r10,r9,56 /* stxvl wants size in top 8 bits */
+ stxvl 32+v0,r11,r10 /* Partial store */
+
+ add r4,r4,r9
+ add r11,r11,r9
+ sub r5,r5,r9
+
+L(loop):
+ cmpldi cr6,r5,64 /* Check if length was reached. */
+ ble cr6,L(final_loop)
+
+ lxv 32+v0,0(r4)
+ vcmpequb. v6,v0,v18 /* Any zero bytes? */
+ bne cr6,L(prep_tail1)
+
+ lxv 32+v1,16(r4)
+ vcmpequb. v6,v1,v18 /* Any zero bytes? */
+ bne cr6,L(prep_tail2)
+
+ lxv 32+v2,32(r4)
+ vcmpequb. v6,v2,v18 /* Any zero bytes? */
+ bne cr6,L(prep_tail3)
+
+ lxv 32+v3,48(r4)
+ vcmpequb. v6,v3,v18 /* Any zero bytes? */
+ bne cr6,L(prep_tail4)
+
+ stxv 32+v0,0(r11)
+ stxv 32+v1,16(r11)
+ stxv 32+v2,32(r11)
+ stxv 32+v3,48(r11)
+
+ addi r4,r4,64
+ addi r11,r11,64
+ addi r5,r5,-64
+
+ b L(loop)
+
+L(final_loop):
+ cmpldi cr5,r5,16
+ lxv 32+v0,0(r4)
+ vcmpequb. v6,v0,v18 /* Any zero bytes? */
+ ble cr5,L(prep_n_tail1)
+ bne cr6,L(count_tail1)
+ addi r5,r5,-16
+
+ cmpldi cr5,r5,16
+ lxv 32+v1,16(r4)
+ vcmpequb. v6,v1,v18 /* Any zero bytes? */
+ ble cr5,L(prep_n_tail2)
+ bne cr6,L(count_tail2)
+ addi r5,r5,-16
+
+ cmpldi cr5,r5,16
+ lxv 32+v2,32(r4)
+ vcmpequb. v6,v2,v18 /* Any zero bytes? */
+ ble cr5,L(prep_n_tail3)
+ bne cr6,L(count_tail3)
+ addi r5,r5,-16
+
+ lxv 32+v3,48(r4)
+ vcmpequb. v6,v3,v18 /* Any zero bytes? */
+ beq cr6,L(n_tail4)
+
+ vctzlsbb r8,v6 /* Number of trailing zeroes */
+ cmpd r8,r5 /* r8 < n? */
+ blt L(tail4)
+L(n_tail4):
+ stxv 32+v0,0(r11)
+ stxv 32+v1,16(r11)
+ stxv 32+v2,32(r11)
+ sldi r10,r5,56 /* stxvl wants size in top 8 bits */
+ addi r11,r11,48 /* Offset */
+ stxvl 32+v3,r11,r10 /* Partial store */
+ blr
+
+L(prep_n_tail1):
+ beq cr6,L(n_tail1) /* Any zero bytes? */
+ vctzlsbb r8,v6 /* Number of trailing zeroes */
+ cmpd r8,r5 /* r8 < n? */
+ blt L(tail1)
+L(n_tail1):
+ sldi r10,r5,56 /* stxvl wants size in top 8 bits */
+ stxvl 32+v0,r11,r10 /* Partial store */
+ blr
+
+L(prep_n_tail2):
+ beq cr6,L(n_tail2) /* Any zero bytes? */
+ vctzlsbb r8,v6 /* Number of trailing zeroes */
+ cmpd r8,r5 /* r8 < n? */
+ blt L(tail2)
+L(n_tail2):
+ stxv 32+v0,0(r11)
+ sldi r10,r5,56 /* stxvl wants size in top 8 bits */
+ addi r11,r11,16 /* offset */
+ stxvl 32+v1,r11,r10 /* Partial store */
+ blr
+
+L(prep_n_tail3):
+ beq cr6,L(n_tail3) /* Any zero bytes? */
+ vctzlsbb r8,v6 /* Number of trailing zeroes */
+ cmpd r8,r5 /* r8 < n? */
+ blt L(tail3)
+L(n_tail3):
+ stxv 32+v0,0(r11)
+ stxv 32+v1,16(r11)
+ sldi r10,r5,56 /* stxvl wants size in top 8 bits */
+ addi r11,r11,32 /* Offset */
+ stxvl 32+v2,r11,r10 /* Partial store */
+ blr
+
+L(prep_tail1):
+L(count_tail1):
+ vctzlsbb r8,v6 /* Number of trailing zeroes */
+L(tail1):
+ addi r9,r8,1 /* Add null terminator */
+ sldi r10,r9,56 /* stxvl wants size in top 8 bits */
+ stxvl 32+v0,r11,r10 /* Partial store */
+ add r11,r11,r9
+ sub r5,r5,r9
+ b L(zero_padding_loop)
+
+L(prep_tail2):
+ addi r5,r5,-16
+L(count_tail2):
+ vctzlsbb r8,v6 /* Number of trailing zeroes */
+L(tail2):
+ addi r9,r8,1 /* Add null terminator */
+ stxv 32+v0,0(r11)
+ sldi r10,r9,56 /* stxvl wants size in top 8 bits */
+ addi r11,r11,16 /* offset */
+ stxvl 32+v1,r11,r10 /* Partial store */
+ add r11,r11,r9
+ sub r5,r5,r9
+ b L(zero_padding_loop)
+
+L(prep_tail3):
+ addi r5,r5,-32
+L(count_tail3):
+ vctzlsbb r8,v6 /* Number of trailing zeroes */
+L(tail3):
+ addi r9,r8,1 /* Add null terminator */
+ stxv 32+v0,0(r11)
+ stxv 32+v1,16(r11)
+ sldi r10,r9,56 /* stxvl wants size in top 8 bits */
+ addi r11,r11,32 /* offset */
+ stxvl 32+v2,r11,r10 /* Partial store */
+ add r11,r11,r9
+ sub r5,r5,r9
+ b L(zero_padding_loop)
+
+L(prep_tail4):
+ addi r5,r5,-48
+ vctzlsbb r8,v6 /* Number of trailing zeroes */
+L(tail4):
+ addi r9,r8,1 /* Add null terminator */
+ stxv 32+v0,0(r11)
+ stxv 32+v1,16(r11)
+ stxv 32+v2,32(r11)
+ sldi r10,r9,56 /* stxvl wants size in top 8 bits */
+ addi r11,r11,48 /* offset */
+ stxvl 32+v3,r11,r10 /* Partial store */
+ add r11,r11,r9
+ sub r5,r5,r9
+
+/* This code pads the remainder of dest with NULL bytes. */
+L(zero_padding_loop):
+ cmpldi cr6,r5,16 /* Check if length was reached. */
+ ble cr6,L(zero_padding_end)
+
+ stxv v18,0(r11)
+ addi r11,r11,16
+ addi r5,r5,-16
+
+ b L(zero_padding_loop)
+
+L(zero_padding_end):
+ sldi r10,r5,56 /* stxvl wants size in top 8 bits */
+ stxvl v18,r11,r10 /* Partial store */
+ blr
+
+L(n_tail):
+
+END (FUNC_NAME)
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index 19acb6c64a..cd2b47b403 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -33,7 +33,7 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \
ifneq (,$(filter %le,$(config-machine)))
sysdep_routines += strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \
- rawmemchr-power9 strlen-power9
+ rawmemchr-power9 strlen-power9 strncpy-power9
endif
CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index ea10b00417..aa63e1c23f 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -301,6 +301,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/powerpc/powerpc64/multiarch/strncpy.c. */
IFUNC_IMPL (i, name, strncpy,
+#ifdef __LITTLE_ENDIAN__
+ IFUNC_IMPL_ADD (array, i, strncpy,
+ hwcap2 & PPC_FEATURE2_ARCH_3_00,
+ __strncpy_power9)
+#endif
IFUNC_IMPL_ADD (array, i, strncpy,
hwcap2 & PPC_FEATURE2_ARCH_2_07,
__strncpy_power8)
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncpy-power9.S b/sysdeps/powerpc/powerpc64/multiarch/strncpy-power9.S
new file mode 100644
index 0000000000..b9b6092f7b
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/strncpy-power9.S
@@ -0,0 +1,26 @@
+/* Optimized strncpy implementation for POWER9/PPC64.
+ Copyright (C) 2016-2020 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#if defined __LITTLE_ENDIAN__ && IS_IN (libc)
+#define STRNCPY __strncpy_power9
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+#include <sysdeps/powerpc/powerpc64/le/power9/strncpy.S>
+#endif
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncpy.c b/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
index 7bacf28aca..822ceb2003 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
@@ -28,11 +28,18 @@
extern __typeof (strncpy) __strncpy_ppc attribute_hidden;
extern __typeof (strncpy) __strncpy_power7 attribute_hidden;
extern __typeof (strncpy) __strncpy_power8 attribute_hidden;
+# ifdef __LITTLE_ENDIAN__
+extern __typeof (strncpy) __strncpy_power9 attribute_hidden;
+# endif
# undef strncpy
/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
ifunc symbol properly. */
libc_ifunc_redirected (__redirect_strncpy, strncpy,
+# ifdef __LITTLE_ENDIAN__
+ (hwcap2 & PPC_FEATURE2_ARCH_3_00)
+ ? __strncpy_power9 :
+# endif
(hwcap2 & PPC_FEATURE2_ARCH_2_07)
? __strncpy_power8
: (hwcap & PPC_FEATURE_HAS_VSX)
--
2.26.2
^ permalink raw reply [flat|nested] 9+ messages in thread
* [PATCH 2/2] powerpc: Optimized stpncpy for POWER9
2020-08-20 18:29 [PATCH 1/2] powerpc: Optimized strncpy for POWER9 Raphael Moreira Zinsly
@ 2020-08-20 18:29 ` Raphael Moreira Zinsly
2020-08-20 18:31 ` Raphael M Zinsly
2020-08-28 17:04 ` Paul E Murphy
2020-08-20 18:31 ` [PATCH 1/2] powerpc: Optimized strncpy " Raphael M Zinsly
` (3 subsequent siblings)
4 siblings, 2 replies; 9+ messages in thread
From: Raphael Moreira Zinsly @ 2020-08-20 18:29 UTC (permalink / raw)
To: libc-alpha; +Cc: Raphael Moreira Zinsly
Adds stpncpy support to the POWER9 strncpy implementation.
---
sysdeps/powerpc/powerpc64/le/power9/stpncpy.S | 24 ++++++
sysdeps/powerpc/powerpc64/le/power9/strncpy.S | 74 +++++++++++++++++++
sysdeps/powerpc/powerpc64/multiarch/Makefile | 2 +-
.../powerpc64/multiarch/ifunc-impl-list.c | 5 ++
.../powerpc64/multiarch/stpncpy-power9.S | 24 ++++++
sysdeps/powerpc/powerpc64/multiarch/stpncpy.c | 7 ++
6 files changed, 135 insertions(+), 1 deletion(-)
create mode 100644 sysdeps/powerpc/powerpc64/le/power9/stpncpy.S
create mode 100644 sysdeps/powerpc/powerpc64/multiarch/stpncpy-power9.S
diff --git a/sysdeps/powerpc/powerpc64/le/power9/stpncpy.S b/sysdeps/powerpc/powerpc64/le/power9/stpncpy.S
new file mode 100644
index 0000000000..a96840bb6f
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/le/power9/stpncpy.S
@@ -0,0 +1,24 @@
+/* Optimized stpncpy implementation for PowerPC64/POWER9.
+ Copyright (C) 2015-2020 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#define USE_AS_STPNCPY
+#include <sysdeps/powerpc/powerpc64/le/power9/strncpy.S>
+
+weak_alias (__stpncpy, stpncpy)
+libc_hidden_def (__stpncpy)
+libc_hidden_builtin_def (stpncpy)
diff --git a/sysdeps/powerpc/powerpc64/le/power9/strncpy.S b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
index cde68384d4..64b06a9040 100644
--- a/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
+++ b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
@@ -18,16 +18,30 @@
#include <sysdep.h>
+#ifdef USE_AS_STPNCPY
+# ifndef STPNCPY
+# define FUNC_NAME __stpncpy
+# else
+# define FUNC_NAME STPNCPY
+# endif
+#else
# ifndef STRNCPY
# define FUNC_NAME strncpy
# else
# define FUNC_NAME STRNCPY
# endif
+#endif /* !USE_AS_STPNCPY */
/* Implements the function
char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5])
+ or
+
+ char * [r3] stpncpy (char *dest [r3], const char *src [r4], size_t n [r5])
+
+ if USE_AS_STPNCPY is defined.
+
The implementation can load bytes past a null terminator, but only
up to the next 16B boundary, so it never crosses a page. */
@@ -47,6 +61,13 @@ ENTRY_TOCLESS (FUNC_NAME, 4)
beq L(zero_padding_loop)
cmpwi r5,0
+#ifdef USE_AS_STPNCPY
+ bgt L(cont)
+ /* stpncpy returns the dest address plus the size not counting the
+ final '\0'. */
+ addi r3,r3,1
+ blr
+#endif
beqlr
L(cont):
@@ -77,12 +98,22 @@ L(cont):
sldi r10,r5,56 /* stxvl wants size in top 8 bits */
stxvl 32+v0,r11,r10 /* Partial store */
+#ifdef USE_AS_STPNCPY
+ /* stpncpy returns the dest address plus the size not counting the
+ final '\0'. */
+ add r3,r11,r5
+#endif
blr
L(null):
sldi r10,r8,56 /* stxvl wants size in top 8 bits */
stxvl 32+v0,r11,r10 /* Partial store */
+#ifdef USE_AS_STPNCPY
+ /* stpncpy returns the dest address plus the size not counting the
+ final '\0'. */
+ add r3,r11,r7
+#endif
add r11,r11,r8
sub r5,r5,r8
b L(zero_padding_loop)
@@ -164,6 +195,11 @@ L(n_tail4):
sldi r10,r5,56 /* stxvl wants size in top 8 bits */
addi r11,r11,48 /* Offset */
stxvl 32+v3,r11,r10 /* Partial store */
+#ifdef USE_AS_STPNCPY
+ /* stpncpy returns the dest address plus the size not counting the
+ final '\0'. */
+ add r3,r11,r5
+#endif
blr
L(prep_n_tail1):
@@ -174,6 +210,11 @@ L(prep_n_tail1):
L(n_tail1):
sldi r10,r5,56 /* stxvl wants size in top 8 bits */
stxvl 32+v0,r11,r10 /* Partial store */
+#ifdef USE_AS_STPNCPY
+ /* stpncpy returns the dest address plus the size not counting the
+ final '\0'. */
+ add r3,r11,r5
+#endif
blr
L(prep_n_tail2):
@@ -186,6 +227,11 @@ L(n_tail2):
sldi r10,r5,56 /* stxvl wants size in top 8 bits */
addi r11,r11,16 /* offset */
stxvl 32+v1,r11,r10 /* Partial store */
+#ifdef USE_AS_STPNCPY
+ /* stpncpy returns the dest address plus the size not counting the
+ final '\0'. */
+ add r3,r11,r5
+#endif
blr
L(prep_n_tail3):
@@ -199,6 +245,11 @@ L(n_tail3):
sldi r10,r5,56 /* stxvl wants size in top 8 bits */
addi r11,r11,32 /* Offset */
stxvl 32+v2,r11,r10 /* Partial store */
+#ifdef USE_AS_STPNCPY
+ /* stpncpy returns the dest address plus the size not counting the
+ final '\0'. */
+ add r3,r11,r5
+#endif
blr
L(prep_tail1):
@@ -208,6 +259,11 @@ L(tail1):
addi r9,r8,1 /* Add null terminator */
sldi r10,r9,56 /* stxvl wants size in top 8 bits */
stxvl 32+v0,r11,r10 /* Partial store */
+#ifdef USE_AS_STPNCPY
+ /* stpncpy returns the dest address plus the size not counting the
+ final '\0'. */
+ add r3,r11,r8
+#endif
add r11,r11,r9
sub r5,r5,r9
b L(zero_padding_loop)
@@ -222,6 +278,11 @@ L(tail2):
sldi r10,r9,56 /* stxvl wants size in top 8 bits */
addi r11,r11,16 /* offset */
stxvl 32+v1,r11,r10 /* Partial store */
+#ifdef USE_AS_STPNCPY
+ /* stpncpy returns the dest address plus the size not counting the
+ final '\0'. */
+ add r3,r11,r8
+#endif
add r11,r11,r9
sub r5,r5,r9
b L(zero_padding_loop)
@@ -237,6 +298,11 @@ L(tail3):
sldi r10,r9,56 /* stxvl wants size in top 8 bits */
addi r11,r11,32 /* offset */
stxvl 32+v2,r11,r10 /* Partial store */
+#ifdef USE_AS_STPNCPY
+ /* stpncpy returns the dest address plus the size not counting the
+ final '\0'. */
+ add r3,r11,r8
+#endif
add r11,r11,r9
sub r5,r5,r9
b L(zero_padding_loop)
@@ -252,6 +318,11 @@ L(tail4):
sldi r10,r9,56 /* stxvl wants size in top 8 bits */
addi r11,r11,48 /* offset */
stxvl 32+v3,r11,r10 /* Partial store */
+#ifdef USE_AS_STPNCPY
+ /* stpncpy returns the dest address plus the size not counting the
+ final '\0'. */
+ add r3,r11,r8
+#endif
add r11,r11,r9
sub r5,r5,r9
@@ -274,3 +345,6 @@ L(zero_padding_end):
L(n_tail):
END (FUNC_NAME)
+#ifndef USE_AS_STPNCPY
+libc_hidden_builtin_def (strncpy)
+#endif
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index cd2b47b403..f46bf50732 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -33,7 +33,7 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \
ifneq (,$(filter %le,$(config-machine)))
sysdep_routines += strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \
- rawmemchr-power9 strlen-power9 strncpy-power9
+ rawmemchr-power9 strlen-power9 strncpy-power9 stpncpy-power9
endif
CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index aa63e1c23f..56790bcfe3 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -317,6 +317,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/powerpc/powerpc64/multiarch/stpncpy.c. */
IFUNC_IMPL (i, name, stpncpy,
+#ifdef __LITTLE_ENDIAN__
+ IFUNC_IMPL_ADD (array, i, stpncpy,
+ hwcap2 & PPC_FEATURE2_ARCH_3_00,
+ __stpncpy_power9)
+#endif
IFUNC_IMPL_ADD (array, i, stpncpy,
hwcap2 & PPC_FEATURE2_ARCH_2_07,
__stpncpy_power8)
diff --git a/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power9.S b/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power9.S
new file mode 100644
index 0000000000..ecbbb5c8e9
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power9.S
@@ -0,0 +1,24 @@
+/* Optimized stpncpy implementation for POWER9/PPC64.
+ Copyright (C) 2015-2020 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#define STPNCPY __stpncpy_power9
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+#include <sysdeps/powerpc/powerpc64/le/power9/stpncpy.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c b/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
index 17df886431..21702716a3 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
@@ -26,10 +26,17 @@
extern __typeof (__stpncpy) __stpncpy_ppc attribute_hidden;
extern __typeof (__stpncpy) __stpncpy_power7 attribute_hidden;
extern __typeof (__stpncpy) __stpncpy_power8 attribute_hidden;
+# ifdef __LITTLE_ENDIAN__
+extern __typeof (__stpncpy) __stpncpy_power9 attribute_hidden;
+# endif
# undef stpncpy
# undef __stpncpy
libc_ifunc_redirected (__redirect___stpncpy, __stpncpy,
+# ifdef __LITTLE_ENDIAN__
+ (hwcap2 & PPC_FEATURE2_ARCH_3_00)
+ ? __stpncpy_power9 :
+# endif
(hwcap2 & PPC_FEATURE2_ARCH_2_07)
? __stpncpy_power8
: (hwcap & PPC_FEATURE_HAS_VSX)
--
2.26.2
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH 1/2] powerpc: Optimized strncpy for POWER9
2020-08-20 18:29 [PATCH 1/2] powerpc: Optimized strncpy for POWER9 Raphael Moreira Zinsly
2020-08-20 18:29 ` [PATCH 2/2] powerpc: Optimized stpncpy " Raphael Moreira Zinsly
@ 2020-08-20 18:31 ` Raphael M Zinsly
2020-08-28 14:25 ` Paul E Murphy
` (2 subsequent siblings)
4 siblings, 0 replies; 9+ messages in thread
From: Raphael M Zinsly @ 2020-08-20 18:31 UTC (permalink / raw)
To: libc-alpha
Here is the make bench output:
generic_strncpy __strncpy_power9
__strncpy_power8 __strncpy_power7 __strncpy_ppc
Length 16, n 16, alignment 1/ 1: 7.11694 2.77348 2.80296 6.5724 10.4471
Length 16, n 16, alignment 1/ 1: 7.1557 2.75968 2.805 6.5748 10.5064
Length 16, n 16, alignment 1/ 2: 7.17956 2.79127 2.79964 6.57323
10.3281
Length 16, n 16, alignment 2/ 1: 7.15841 2.77364 3.10582 6.2332 10.331
Length 2, n 4, alignment 7/ 2: 8.90911 2.4623 5.38449 5.64873 9.36348
Length 4, n 2, alignment 2/ 7: 6.65395 1.84558 2.58298 3.10566
7.46376
Length 2, n 4, alignment 7/ 2: 8.70625 2.41166 5.38131 5.73421
9.64285
Length 4, n 2, alignment 2/ 7: 6.65458 1.84354 2.58382 3.64721
6.96163
Length 16, n 16, alignment 2/ 2: 7.01778 2.77373 3.10668 6.58047
10.6006
Length 16, n 16, alignment 2/ 2: 7.53778 2.75789 3.10591 6.2277 10.2613
Length 16, n 16, alignment 2/ 4: 7.13828 2.79132 3.10567 6.56847 10.619
Length 16, n 16, alignment 4/ 2: 7.38659 2.77668 3.70851 6.54537
9.17368
Length 4, n 8, alignment 6/ 4: 8.71748 2.45183 5.76669 4.65782 10.014
Length 8, n 4, alignment 4/ 6: 6.5504 1.83463 2.96574 2.66227 8.49964
Length 4, n 8, alignment 6/ 4: 8.96461 2.4499 5.78384 5.32287 9.79641
Length 8, n 4, alignment 4/ 6: 6.48083 1.83265 2.9783 3.38632 8.51888
Length 16, n 16, alignment 3/ 3: 7.7538 2.77353 3.29008 6.55912 9.94143
Length 16, n 16, alignment 3/ 3: 7.75279 2.76148 3.30616 6.5445 9.98866
Length 16, n 16, alignment 3/ 6: 7.21486 2.79444 3.33712 6.24747 10.113
Length 16, n 16, alignment 6/ 3: 6.99138 2.77778 4.50777 6.22522
8.53482
Length 8, n 16, alignment 5/ 6: 8.26994 2.77966 4.60681 6.10938
10.5975
Length 16, n 8, alignment 6/ 5: 6.28062 2.07193 2.57761 4.95636
6.48035
Length 8, n 16, alignment 5/ 6: 8.17113 2.43559 4.27753 5.95453
11.1796
Length 16, n 8, alignment 6/ 5: 6.21214 2.07239 2.57714 4.96762
6.76041
Length 16, n 16, alignment 4/ 4: 7.31373 2.77573 3.78349 6.19349
8.91432
Length 16, n 16, alignment 4/ 4: 7.32226 2.75658 3.70319 6.60792
9.17307
Length 16, n 16, alignment 4/ 0: 7.58812 2.76841 3.71554 6.54282
8.90051
Length 16, n 16, alignment 0/ 4: 6.92871 2.06944 2.66876 6.63947
8.08171
Length 16, n 32, alignment 4/ 0: 10.2972 3.3192 6.53695 7.77295 12.332
Length 32, n 16, alignment 0/ 4: 6.98056 2.06954 2.66445 6.54976
7.65286
Length 16, n 32, alignment 4/ 0: 10.5356 3.31343 6.53813 7.72029
12.2915
Length 32, n 16, alignment 0/ 4: 7.36068 2.06945 2.66424 6.21052
8.07614
Length 16, n 16, alignment 5/ 5: 7.2122 2.77732 4.17451 6.55383 10.4887
Length 16, n 16, alignment 5/ 5: 7.34438 2.77512 4.17191 6.56873
10.5664
Length 16, n 16, alignment 5/ 2: 7.15746 2.76198 4.14481 6.56235
10.7391
Length 16, n 16, alignment 2/ 5: 7.19372 2.79273 3.10693 6.56984
10.2697
Length 32, n 64, alignment 3/ 2: 15.3918 4.22964 7.0146 12.5809 13.8661
Length 64, n 32, alignment 2/ 3: 10.5331 3.02942 3.54253 9.19106
12.9356
Length 32, n 64, alignment 3/ 2: 15.369 4.17282 7.36163 12.5759 16.8501
Length 64, n 32, alignment 2/ 3: 10.5585 3.01971 3.52885 9.03369
15.6663
Length 16, n 16, alignment 6/ 6: 7.0405 2.77527 4.53842 6.54733 7.99437
Length 16, n 16, alignment 6/ 6: 7.02801 2.76059 4.52873 6.53536
8.45713
Length 16, n 16, alignment 6/ 4: 7.42011 2.77669 4.52223 6.57756 7.9899
Length 16, n 16, alignment 4/ 6: 7.37787 2.77507 3.77821 6.57058
9.17396
Length 64, n 128, alignment 2/ 4: 17.188 5.33493 8.00394 12.6196 19.1784
Length 128, n 64, alignment 4/ 2: 12.7962 3.91004 5.42994 11.294 12.5273
Length 64, n 128, alignment 2/ 4: 17.2298 5.2748 8.15392 12.6039 24.3802
Length 128, n 64, alignment 4/ 2: 12.7866 3.87534 5.3334 11.8516 21.6528
Length 16, n 16, alignment 7/ 7: 7.75015 2.76775 5.59024 6.57976
8.42318
Length 16, n 16, alignment 7/ 7: 7.81681 2.75691 5.56801 6.55397
10.0378
Length 16, n 16, alignment 7/ 6: 7.75225 2.77446 5.56813 6.57349
8.49645
Length 16, n 16, alignment 6/ 7: 7.23237 2.79186 4.51528 6.55304
8.63443
Length 128, n 256, alignment 1/ 6: 19.8414 8.37691 10.3445 18.4838
22.8314
Length 256, n 128, alignment 6/ 1: 14.7972 5.38498 8.83611 13.8521
16.6154
Length 128, n 256, alignment 1/ 6: 19.8497 8.37754 10.3469 18.2655
43.3568
Length 256, n 128, alignment 6/ 1: 14.7542 5.31075 8.75314 13.7759
37.6351
Length 8, n 16, alignment 0/ 0: 8.19872 2.45818 4.27602 4.6578 7.98513
Length 32, n 16, alignment 0/ 0: 6.92066 2.07115 2.66465 2.66381
7.75655
Length 8, n 16, alignment 7/ 2: 8.18253 2.42685 4.70317 6.01808
9.35743
Length 32, n 16, alignment 7/ 2: 7.79714 2.60074 5.58717 6.64181
6.98583
Length 16, n 32, alignment 0/ 0: 10.4715 3.40184 6.28388 4.86146
11.0819
Length 64, n 32, alignment 0/ 0: 10.4403 2.54135 3.07109 3.38791
9.35196
Length 16, n 32, alignment 6/ 4: 10.7077 3.34867 7.01321 10.4278
11.2951
Length 64, n 32, alignment 6/ 4: 10.9215 3.03041 5.04324 8.30023
11.2648
Length 32, n 64, alignment 0/ 0: 12.0062 4.09428 5.32372 5.48319
14.1455
Length 128, n 64, alignment 0/ 0: 10.1803 3.47282 3.83134 4.21557
10.6674
Length 32, n 64, alignment 5/ 6: 15.4165 4.16297 7.78876 10.8762
15.4308
Length 128, n 64, alignment 5/ 6: 12.7332 3.91667 5.8014 10.5869 14.0961
Length 64, n 128, alignment 0/ 0: 13.238 5.24242 6.90661 8.05566 15.9848
Length 256, n 128, alignment 0/ 0: 8.759 4.9483 6.98675 6.11489 12.6755
Length 64, n 128, alignment 4/ 0: 13.6593 5.27931 8.60925 12.5916
17.5016
Length 256, n 128, alignment 4/ 0: 13.4801 5.37114 7.47485 14.0585
17.4517
Length 128, n 256, alignment 0/ 0: 15.3147 8.02462 8.92006 9.67769
20.3757
Length 512, n 256, alignment 0/ 0: 11.5638 7.22535 9.80468 9.93597
21.3421
Length 128, n 256, alignment 3/ 2: 19.8948 8.15967 10.9435 20.6146
22.4146
Length 512, n 256, alignment 3/ 2: 18.681 7.77864 10.9269 25.9269 28.0105
Length 256, n 512, alignment 0/ 0: 19.4894 13.7363 14.8394 15.4064
31.6341
Length 1024, n 512, alignment 0/ 0: 18.0108 11.8737 18.1779 18.5072
41.5425
Length 256, n 512, alignment 2/ 4: 25.5662 14.5189 16.1872 29.5395
33.7587
Length 1024, n 512, alignment 2/ 4: 28.0079 13.2347 19.067 48.1998 52.3078
Length 512, n 1024, alignment 0/ 0: 23.1385 25.4237 21.2303 23.632 47.4502
Length 2048, n 1024, alignment 0/ 0: 31.201 21.308 40.6351 39.04 75.0329
Length 512, n 1024, alignment 1/ 6: 35.6234 27.0042 24.4711 51.3364
60.6277
Length 2048, n 1024, alignment 1/ 6: 47.442 24.0381 41.6616 85.4832 91.8897
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH 2/2] powerpc: Optimized stpncpy for POWER9
2020-08-20 18:29 ` [PATCH 2/2] powerpc: Optimzed stpncpy " Raphael Moreira Zinsly
@ 2020-08-20 18:31 ` Raphael M Zinsly
2020-08-28 17:04 ` Paul E Murphy
1 sibling, 0 replies; 9+ messages in thread
From: Raphael M Zinsly @ 2020-08-20 18:31 UTC (permalink / raw)
To: libc-alpha
Here is the make bench output:
generic_stpncpy __stpncpy_power9
__stpncpy_power8 __stpncpy_power7 __stpncpy_ppc
Length 16, n 16, alignment 1/ 1: 7.31792 2.79249 2.98207 6.20964
11.2262
Length 16, n 16, alignment 1/ 1: 7.26441 2.79883 2.97986 6.09795
11.1118
Length 16, n 16, alignment 1/ 2: 7.22475 2.82518 2.98169 6.18967
10.9933
Length 16, n 16, alignment 2/ 1: 7.28211 2.78851 3.1079 6.06067 10.4232
Length 2, n 4, alignment 7/ 2: 9.30193 2.4733 4.30086 4.74387 9.25328
Length 4, n 2, alignment 2/ 7: 6.7756 1.91031 2.93946 3.24475 7.76389
Length 2, n 4, alignment 7/ 2: 8.81319 2.4726 4.57341 4.74421 9.44667
Length 4, n 2, alignment 2/ 7: 6.77806 1.9118 2.93637 3.1857 7.00171
Length 16, n 16, alignment 2/ 2: 7.35335 2.80104 3.10653 5.85492
10.5689
Length 16, n 16, alignment 2/ 2: 7.14308 2.78571 3.10889 6.10044
10.4816
Length 16, n 16, alignment 2/ 4: 7.21628 2.81563 3.10724 6.14674
10.6005
Length 16, n 16, alignment 4/ 2: 7.47713 2.80531 3.80081 5.86977
9.43599
Length 4, n 8, alignment 6/ 4: 8.63537 2.4676 5.53825 4.1877 9.88309
Length 8, n 4, alignment 4/ 6: 6.63429 1.91051 3.10751 2.76472 8.4156
Length 4, n 8, alignment 6/ 4: 8.59304 2.43152 5.30288 4.16475
9.77498
Length 8, n 4, alignment 4/ 6: 6.63843 1.91047 3.19713 2.69566
8.67023
Length 16, n 16, alignment 3/ 3: 7.45277 2.80045 3.42433 6.06204
9.92282
Length 16, n 16, alignment 3/ 3: 8.04191 2.78645 3.43317 5.99773
10.0662
Length 16, n 16, alignment 3/ 6: 7.5816 2.81606 3.44168 6.0801 9.94673
Length 16, n 16, alignment 6/ 3: 7.10582 2.80176 5.03947 6.06942
8.40249
Length 8, n 16, alignment 5/ 6: 8.19747 2.42028 4.30043 5.0752 11.3093
Length 16, n 8, alignment 6/ 5: 6.37287 2.07239 2.56322 4.36972
6.52164
Length 8, n 16, alignment 5/ 6: 8.25022 2.45124 4.05051 5.02258
10.8683
Length 16, n 8, alignment 6/ 5: 6.31868 2.07215 2.83061 4.44584
7.14464
Length 16, n 16, alignment 4/ 4: 7.54408 2.80105 3.82846 5.71392
9.91359
Length 16, n 16, alignment 4/ 4: 7.66265 2.79063 3.86233 6.06489
9.31705
Length 16, n 16, alignment 4/ 0: 7.84286 2.79896 3.83148 6.08954
9.55253
Length 16, n 16, alignment 0/ 4: 7.36697 2.07019 2.66533 6.13894
7.75685
Length 16, n 32, alignment 4/ 0: 10.3819 3.33088 6.32994 7.24949
12.3827
Length 32, n 16, alignment 0/ 4: 7.15586 2.07172 2.66097 6.11743
7.56448
Length 16, n 32, alignment 4/ 0: 10.3262 3.35225 6.34556 7.3211 12.2527
Length 32, n 16, alignment 0/ 4: 7.13287 2.07265 2.6613 6.17878 7.61901
Length 16, n 16, alignment 5/ 5: 7.22471 2.80128 4.65776 6.15455
9.93333
Length 16, n 16, alignment 5/ 5: 7.22458 2.78586 4.65874 6.06763
9.87968
Length 16, n 16, alignment 5/ 2: 7.22718 2.79127 4.65999 6.025 10.3775
Length 16, n 16, alignment 2/ 5: 7.73485 2.8025 3.10754 6.08303 10.3871
Length 32, n 64, alignment 3/ 2: 13.7685 4.1256 7.04965 11.5105 15.3903
Length 64, n 32, alignment 2/ 3: 10.526 3.05149 3.59497 8.45078 13.7462
Length 32, n 64, alignment 3/ 2: 13.7681 4.11611 7.08236 11.5129
16.6004
Length 64, n 32, alignment 2/ 3: 10.962 3.05712 3.60447 8.43981 15.4906
Length 16, n 16, alignment 6/ 6: 7.30916 2.80056 5.03985 6.16331
8.43692
Length 16, n 16, alignment 6/ 6: 7.31688 2.7914 5.02931 6.12345 8.42848
Length 16, n 16, alignment 6/ 4: 7.7402 2.7993 5.04435 6.02685 8.28199
Length 16, n 16, alignment 4/ 6: 7.79103 2.82496 3.82464 6.0778 9.31532
Length 64, n 128, alignment 2/ 4: 15.4969 5.3714 8.09812 12.6067 18.7831
Length 128, n 64, alignment 4/ 2: 12.9023 3.93138 5.46487 10.7071
13.3253
Length 64, n 128, alignment 2/ 4: 15.4998 5.42611 7.88843 12.6007
24.0491
Length 128, n 64, alignment 4/ 2: 12.8971 3.94646 5.49689 11.1747
21.5779
Length 16, n 16, alignment 7/ 7: 7.68992 2.78151 6.14775 6.19397
8.38412
Length 16, n 16, alignment 7/ 7: 7.90811 2.7803 6.11502 6.17383 8.78371
Length 16, n 16, alignment 7/ 6: 7.45456 2.80173 5.93657 6.15191
8.38489
Length 16, n 16, alignment 6/ 7: 7.44846 2.80238 5.03654 6.1154 8.41589
Length 128, n 256, alignment 1/ 6: 17.9114 8.39532 10.3246 17.9457
21.9452
Length 256, n 128, alignment 6/ 1: 14.8346 5.41104 8.89047 13.5379
17.1437
Length 128, n 256, alignment 1/ 6: 17.9118 8.39985 10.3271 17.9503
42.0831
Length 256, n 128, alignment 6/ 1: 14.8306 5.40714 9.04492 13.5227 37.819
Length 8, n 16, alignment 0/ 0: 8.19945 2.46752 4.04264 4.62897
8.22975
Length 32, n 16, alignment 0/ 0: 7.23617 2.07229 2.66504 2.66683
7.93411
Length 8, n 16, alignment 7/ 2: 8.26373 2.41779 4.18003 5.31418 9.0473
Length 32, n 16, alignment 7/ 2: 7.46119 2.63992 6.16424 6.14534
7.28237
Length 16, n 32, alignment 0/ 0: 10.1282 3.42401 5.00287 5.02318
11.4985
Length 64, n 32, alignment 0/ 0: 9.29452 2.57779 2.79807 3.1362 10.9532
Length 16, n 32, alignment 6/ 4: 10.2194 3.30297 7.48371 10.4067
11.2264
Length 64, n 32, alignment 6/ 4: 10.6887 3.04976 5.13062 8.10511
11.1225
Length 32, n 64, alignment 0/ 0: 12.1806 4.09924 5.12341 6.14159
14.0965
Length 128, n 64, alignment 0/ 0: 10.1569 3.52625 3.88528 4.65782
11.3018
Length 32, n 64, alignment 5/ 6: 13.7795 4.13456 8.53476 10.2846
15.1556
Length 128, n 64, alignment 5/ 6: 12.8171 3.92765 5.82505 10.3559
15.0831
Length 64, n 128, alignment 0/ 0: 13.6328 5.33523 6.43324 7.92213
16.4658
Length 256, n 128, alignment 0/ 0: 8.92495 4.97169 7.13044 6.30158
12.9039
Length 64, n 128, alignment 4/ 0: 13.8393 5.36588 7.52682 11.5294
17.5523
Length 256, n 128, alignment 4/ 0: 13.5309 5.36019 7.56527 13.3503
17.8202
Length 128, n 256, alignment 0/ 0: 15.2956 8.14449 8.79678 9.69352
21.2463
Length 512, n 256, alignment 0/ 0: 11.5667 7.22974 10.1355 10.2592
21.5805
Length 128, n 256, alignment 3/ 2: 18.0152 8.21506 10.9175 20.4131
22.3927
Length 512, n 256, alignment 3/ 2: 18.7328 7.81909 11.251 25.0633 29.2378
Length 256, n 512, alignment 0/ 0: 17.5135 13.9768 15.6849 16.1219
30.9344
Length 1024, n 512, alignment 0/ 0: 17.988 11.8498 18.4388 18.7385 41.5762
Length 256, n 512, alignment 2/ 4: 23.3724 14.8026 15.9182 28.6762
33.9031
Length 1024, n 512, alignment 2/ 4: 27.9562 13.2785 19.5893 46.9671
52.4943
Length 512, n 1024, alignment 0/ 0: 23.3637 25.283 21.2536 23.4228 55.6501
Length 2048, n 1024, alignment 0/ 0: 31.303 21.2731 40.7001 38.8365 75.1105
Length 512, n 1024, alignment 1/ 6: 33.0535 26.873 24.8167 51.5917 56.236
Length 2048, n 1024, alignment 1/ 6: 47.5444 24.0206 42.5163 86.0245
92.5819
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH 1/2] powerpc: Optimized strncpy for POWER9
2020-08-20 18:29 [PATCH 1/2] powerpc: Optimized strncpy for POWER9 Raphael Moreira Zinsly
2020-08-20 18:29 ` [PATCH 2/2] powerpc: Optimized stpncpy " Raphael Moreira Zinsly
2020-08-20 18:31 ` [PATCH 1/2] powerpc: Optimized strncpy " Raphael M Zinsly
@ 2020-08-28 14:25 ` Paul E Murphy
2020-08-28 19:12 ` Paul A. Clarke
2020-09-02 13:20 ` Tulio Magno Quites Machado Filho
4 siblings, 0 replies; 9+ messages in thread
From: Paul E Murphy @ 2020-08-28 14:25 UTC (permalink / raw)
To: Raphael Moreira Zinsly, libc-alpha
On 8/20/20 1:29 PM, Raphael Moreira Zinsly via Libc-alpha wrote:
> Similar to the strcpy P9 optimization, this version uses VSX to improve
> performance.
> ---
> sysdeps/powerpc/powerpc64/le/power9/strncpy.S | 276 ++++++++++++++++++
> sysdeps/powerpc/powerpc64/multiarch/Makefile | 2 +-
> .../powerpc64/multiarch/ifunc-impl-list.c | 5 +
> .../powerpc64/multiarch/strncpy-power9.S | 26 ++
> sysdeps/powerpc/powerpc64/multiarch/strncpy.c | 7 +
> 5 files changed, 315 insertions(+), 1 deletion(-)
> create mode 100644 sysdeps/powerpc/powerpc64/le/power9/strncpy.S
> create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strncpy-power9.S
>
> diff --git a/sysdeps/powerpc/powerpc64/le/power9/strncpy.S b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
> new file mode 100644
> index 0000000000..cde68384d4
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
> @@ -0,0 +1,276 @@
> +/* Optimized strncpy implementation for PowerPC64/POWER9.
> + Copyright (C) 2020 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#include <sysdep.h>
> +
> +# ifndef STRNCPY
> +# define FUNC_NAME strncpy
> +# else
> +# define FUNC_NAME STRNCPY
> +# endif
> +
> +/* Implements the function
> +
> + char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5])
> +
> + The implementation can load bytes past a null terminator, but only
> + up to the next 16B boundary, so it never crosses a page. */
> +
> +.machine power9
> +ENTRY_TOCLESS (FUNC_NAME, 4)
> + CALL_MCOUNT 2
> +
> + cmpwi r5, 0
> + beqlr
Trivial nit, a newline after branches helps readability for me.
> + /* NULL string optimisation */
> + lbz r0,0(r4)
> + stb r0,0(r3)
> + addi r11,r3,1
> + addi r5,r5,-1
> + vspltisb v18,0 /* Zeroes in v18 */
> + cmpwi r0,0
> + beq L(zero_padding_loop)
> +
> + cmpwi r5,0
> + beqlr
OK.
> +
> +L(cont):
I think this label can be removed or replaced with a comment.
> + addi r4,r4,1
> + neg r7,r4
> + rldicl r9,r7,0,60 /* How many bytes to get source 16B aligned? */
> +
> + /* Get source 16B aligned */
> + lvx v0,0,r4
> + lvsr v1,0,r4
> + vperm v0,v18,v0,v1
> +
> + vcmpequb v6,v0,v18 /* 0xff if byte is NULL, 0x00 otherwise */
> + vctzlsbb r7,v6 /* Number of trailing zeroes */
> + addi r8,r7,1 /* Add null terminator */
Minor nit, can you align the comment with previous comments?
> +
> + /* r8 = bytes including null
> + r9 = bytes to get source 16B aligned
> + if r8 > r9
> + no null, copy r9 bytes
> + else
> + there is a null, copy r8 bytes and return. */
> + cmpd r8,r9
> + bgt L(no_null)
> +
> + cmpd r8,r5 /* r8 <= n? */
Minor, you could use another CR and run this in parallel with the
previous check.
> + ble L(null)
> + sldi r10,r5,56 /* stxvl wants size in top 8 bits */
> + stxvl 32+v0,r11,r10 /* Partial store */
> +
> + blr
OK.
> +
> +L(null):
> + sldi r10,r8,56 /* stxvl wants size in top 8 bits */
> + stxvl 32+v0,r11,r10 /* Partial store */
> +
> + add r11,r11,r8
> + sub r5,r5,r8
> + b L(zero_padding_loop)
OK.
> +
> +L(no_null):
> + cmpd r9,r5 /* Check if length was reached. */
> + bge L(n_tail1)
An extra newline would help here.
> + sldi r10,r9,56 /* stxvl wants size in top 8 bits */
> + stxvl 32+v0,r11,r10 /* Partial store */
> +
> + add r4,r4,r9
> + add r11,r11,r9
> + sub r5,r5,r9
OK.
> +
> +L(loop):
> + cmpldi cr6,r5,64 /* Check if length was reached. */
> + ble cr6,L(final_loop)
> +
> + lxv 32+v0,0(r4)
> + vcmpequb. v6,v0,v18 /* Any zero bytes? */
> + bne cr6,L(prep_tail1)
> +
> + lxv 32+v1,16(r4)
> + vcmpequb. v6,v1,v18 /* Any zero bytes? */
> + bne cr6,L(prep_tail2)
> +
> + lxv 32+v2,32(r4)
> + vcmpequb. v6,v2,v18 /* Any zero bytes? */
> + bne cr6,L(prep_tail3)
> +
> + lxv 32+v3,48(r4)
> + vcmpequb. v6,v3,v18 /* Any zero bytes? */
> + bne cr6,L(prep_tail4)
> +
> + stxv 32+v0,0(r11)
> + stxv 32+v1,16(r11)
> + stxv 32+v2,32(r11)
> + stxv 32+v3,48(r11)
> +
> + addi r4,r4,64
> + addi r11,r11,64
> + addi r5,r5,-64
> +
> + b L(loop)
OK.
> +
> +L(final_loop):
> + cmpldi cr5,r5,16
> + lxv 32+v0,0(r4)
> + vcmpequb. v6,v0,v18 /* Any zero bytes? */
> + ble cr5,L(prep_n_tail1)
> + bne cr6,L(count_tail1)
> + addi r5,r5,-16
> +
> + cmpldi cr5,r5,16
> + lxv 32+v1,16(r4)
> + vcmpequb. v6,v1,v18 /* Any zero bytes? */
> + ble cr5,L(prep_n_tail2)
> + bne cr6,L(count_tail2)
> + addi r5,r5,-16
> +
> + cmpldi cr5,r5,16
> + lxv 32+v2,32(r4)
> + vcmpequb. v6,v2,v18 /* Any zero bytes? */
> + ble cr5,L(prep_n_tail3)
> + bne cr6,L(count_tail3)
> + addi r5,r5,-16
OK.
> +
> + lxv 32+v3,48(r4)
> + vcmpequb. v6,v3,v18 /* Any zero bytes? */
> + beq cr6,L(n_tail4)
> +
> + vctzlsbb r8,v6 /* Number of trailing zeroes */
> + cmpd r8,r5 /* r8 < n? */
> + blt L(tail4)
OK. Newline here (and for the other similar cases below too please).
> +L(n_tail4):
> + stxv 32+v0,0(r11)
> + stxv 32+v1,16(r11)
> + stxv 32+v2,32(r11)
> + sldi r10,r5,56 /* stxvl wants size in top 8 bits */
> + addi r11,r11,48 /* Offset */
> + stxvl 32+v3,r11,r10 /* Partial store */
> + blr
OK.
> +
> +L(prep_n_tail1):
> + beq cr6,L(n_tail1) /* Any zero bytes? */
> + vctzlsbb r8,v6 /* Number of trailing zeroes */
> + cmpd r8,r5 /* r8 < n? */
> + blt L(tail1)
> +L(n_tail1):
> + sldi r10,r5,56 /* stxvl wants size in top 8 bits */
> + stxvl 32+v0,r11,r10 /* Partial store */
> + blr
OK.
> +
> +L(prep_n_tail2):
> + beq cr6,L(n_tail2) /* Any zero bytes? */
> + vctzlsbb r8,v6 /* Number of trailing zeroes */
> + cmpd r8,r5 /* r8 < n? */
> + blt L(tail2)
> +L(n_tail2):
> + stxv 32+v0,0(r11)
> + sldi r10,r5,56 /* stxvl wants size in top 8 bits */
> + addi r11,r11,16 /* offset */
> + stxvl 32+v1,r11,r10 /* Partial store */
> + blr
OK.
> +
> +L(prep_n_tail3):
> + beq cr6,L(n_tail3) /* Any zero bytes? */
> + vctzlsbb r8,v6 /* Number of trailing zeroes */
> + cmpd r8,r5 /* r8 < n? */
> + blt L(tail3)
> +L(n_tail3):
> + stxv 32+v0,0(r11)
> + stxv 32+v1,16(r11)
> + sldi r10,r5,56 /* stxvl wants size in top 8 bits */
> + addi r11,r11,32 /* Offset */
> + stxvl 32+v2,r11,r10 /* Partial store */
> + blr
OK.
> +
> +L(prep_tail1):
> +L(count_tail1):
> + vctzlsbb r8,v6 /* Number of trailing zeroes */
> +L(tail1):
> + addi r9,r8,1 /* Add null terminator */
Please align this comment (and the 3 other similar cases).
> + sldi r10,r9,56 /* stxvl wants size in top 8 bits */
> + stxvl 32+v0,r11,r10 /* Partial store */
> + add r11,r11,r9
> + sub r5,r5,r9
> + b L(zero_padding_loop)
OK.
> +
> +L(prep_tail2):
> + addi r5,r5,-16
> +L(count_tail2):
> + vctzlsbb r8,v6 /* Number of trailing zeroes */
> +L(tail2):
> + addi r9,r8,1 /* Add null terminator */
> + stxv 32+v0,0(r11)
> + sldi r10,r9,56 /* stxvl wants size in top 8 bits */
> + addi r11,r11,16 /* offset */
> + stxvl 32+v1,r11,r10 /* Partial store */
> + add r11,r11,r9
> + sub r5,r5,r9
> + b L(zero_padding_loop)
> +
> +L(prep_tail3):
> + addi r5,r5,-32
> +L(count_tail3):
> + vctzlsbb r8,v6 /* Number of trailing zeroes */
> +L(tail3):
> + addi r9,r8,1 /* Add null terminator */
> + stxv 32+v0,0(r11)
> + stxv 32+v1,16(r11)
> + sldi r10,r9,56 /* stxvl wants size in top 8 bits */
> + addi r11,r11,32 /* offset */
> + stxvl 32+v2,r11,r10 /* Partial store */
> + add r11,r11,r9
> + sub r5,r5,r9
> + b L(zero_padding_loop)
> +
> +L(prep_tail4):
> + addi r5,r5,-48
> + vctzlsbb r8,v6 /* Number of trailing zeroes */
> +L(tail4):
> + addi r9,r8,1 /* Add null terminator */
> + stxv 32+v0,0(r11)
> + stxv 32+v1,16(r11)
> + stxv 32+v2,32(r11)
> + sldi r10,r9,56 /* stxvl wants size in top 8 bits */
> + addi r11,r11,48 /* offset */
> + stxvl 32+v3,r11,r10 /* Partial store */
> + add r11,r11,r9
> + sub r5,r5,r9
OK.
> +
> +/* This code pads the remainder of dest with NULL bytes. */
> +L(zero_padding_loop):
> + cmpldi cr6,r5,16 /* Check if length was reached. */
> + ble cr6,L(zero_padding_end)
> +
> + stxv v18,0(r11)
> + addi r11,r11,16
> + addi r5,r5,-16
> +
> + b L(zero_padding_loop)
> +
OK.
> +L(zero_padding_end):
> + sldi r10,r5,56 /* stxvl wants size in top 8 bits */
> + stxvl v18,r11,r10 /* Partial store */
> + blr
OK.
> +
> +L(n_tail):
Is this label used?
> +
> +END (FUNC_NAME)
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
> index 19acb6c64a..cd2b47b403 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
> +++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
OK.
> index ea10b00417..aa63e1c23f 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
OK.
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncpy-power9.S b/sysdeps/powerpc/powerpc64/multiarch/strncpy-power9.S
> new file mode 100644
> index 0000000000..b9b6092f7b
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/multiarch/strncpy-power9.S
OK.
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncpy.c b/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
> index 7bacf28aca..822ceb2003 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
> +++ b/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
> @@ -28,11 +28,18 @@
> extern __typeof (strncpy) __strncpy_ppc attribute_hidden;
> extern __typeof (strncpy) __strncpy_power7 attribute_hidden;
> extern __typeof (strncpy) __strncpy_power8 attribute_hidden;
> +# ifdef __LITTLE_ENDIAN__
> +extern __typeof (strncpy) __strncpy_power9 attribute_hidden;
> +# endif
> # undef strncpy
>
> /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
> ifunc symbol properly. */
> libc_ifunc_redirected (__redirect_strncpy, strncpy,
> +# ifdef __LITTLE_ENDIAN__
> + (hwcap2 & PPC_FEATURE2_ARCH_3_00)
> + ? __strncpy_power9 :
Trivial nit, I think the above two lines need two extra spaces.
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH 2/2] powerpc: Optimzed stpncpy for POWER9
2020-08-20 18:29 ` [PATCH 2/2] powerpc: Optimzed stpncpy " Raphael Moreira Zinsly
2020-08-20 18:31 ` Raphael M Zinsly
@ 2020-08-28 17:04 ` Paul E Murphy
1 sibling, 0 replies; 9+ messages in thread
From: Paul E Murphy @ 2020-08-28 17:04 UTC (permalink / raw)
To: Raphael Moreira Zinsly, libc-alpha
Thank you for your contributions, I have a few minor
comments/suggestions below.
On 8/20/20 1:29 PM, Raphael Moreira Zinsly via Libc-alpha wrote:
> Adds stpncpy support into the POWER9 strncpy.
s/Adds/Add/ s/into the/to/.
Likewise, s/Optimzed/Add optimized/ in the title.
> ---
> sysdeps/powerpc/powerpc64/le/power9/stpncpy.S | 24 ++++++
> sysdeps/powerpc/powerpc64/le/power9/strncpy.S | 74 +++++++++++++++++++
> sysdeps/powerpc/powerpc64/multiarch/Makefile | 2 +-
> .../powerpc64/multiarch/ifunc-impl-list.c | 5 ++
> .../powerpc64/multiarch/stpncpy-power9.S | 24 ++++++
> sysdeps/powerpc/powerpc64/multiarch/stpncpy.c | 7 ++
> 6 files changed, 135 insertions(+), 1 deletion(-)
> create mode 100644 sysdeps/powerpc/powerpc64/le/power9/stpncpy.S
> create mode 100644 sysdeps/powerpc/powerpc64/multiarch/stpncpy-power9.S
>
> diff --git a/sysdeps/powerpc/powerpc64/le/power9/stpncpy.S b/sysdeps/powerpc/powerpc64/le/power9/stpncpy.S
> new file mode 100644
> index 0000000000..a96840bb6f
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/le/power9/stpncpy.S
> @@ -0,0 +1,24 @@
> +/* Optimized stpncpy implementation for PowerPC64/POWER9.
> + Copyright (C) 2015-2020 Free Software Foundation, Inc.
Should this date be exclusively 2020?
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#define USE_AS_STPNCPY
> +#include <sysdeps/powerpc/powerpc64/le/power9/strncpy.S>
> +
> +weak_alias (__stpncpy, stpncpy)
> +libc_hidden_def (__stpncpy)
> +libc_hidden_builtin_def (stpncpy)
OK.
> diff --git a/sysdeps/powerpc/powerpc64/le/power9/strncpy.S b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
> index cde68384d4..64b06a9040 100644
> --- a/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
> +++ b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
> @@ -18,16 +18,30 @@
>
> #include <sysdep.h>
>
> +#ifdef USE_AS_STPNCPY
> +# ifndef STPNCPY
> +# define FUNC_NAME __stpncpy
> +# else
> +# define FUNC_NAME STPNCPY
> +# endif
> +#else
> # ifndef STRNCPY
> # define FUNC_NAME strncpy
> # else
> # define FUNC_NAME STRNCPY
> # endif
> +#endif /* !USE_AS_STPNCPY */
>
> /* Implements the function
>
> char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5])
>
> + or
> +
> + char * [r3] stpncpy (char *dest [r3], const char *src [r4], size_t n [r5])
> +
> + if USE_AS_STPNCPY is defined.
> +
> The implementation can load bytes past a null terminator, but only
> up to the next 16B boundary, so it never crosses a page. */
>
> @@ -47,6 +61,13 @@ ENTRY_TOCLESS (FUNC_NAME, 4)
> beq L(zero_padding_loop)
>
> cmpwi r5,0
> +#ifdef USE_AS_STPNCPY
> + bgt L(cont)
> + /* stpncpy returns the dest address plus the size not counting the
> + final '\0'. */
"Compute pointer to last byte copied into dest." Likwise for the other
copied instances.
> + addi r3,r3,1
> + blr
> +#endif
OK.
> beqlr
This is unreachable in stpncpy, can this be conditionally included in
the !stpncpy configuration?
>
> L(cont):
> @@ -77,12 +98,22 @@ L(cont):
> sldi r10,r5,56 /* stxvl wants size in top 8 bits */
> stxvl 32+v0,r11,r10 /* Partial store */
>
> +#ifdef USE_AS_STPNCPY
> + /* stpncpy returns the dest address plus the size not counting the
> + final '\0'. */
> + add r3,r11,r5
> +#endif
> blr
>
> L(null):
> sldi r10,r8,56 /* stxvl wants size in top 8 bits */
> stxvl 32+v0,r11,r10 /* Partial store */
>
> +#ifdef USE_AS_STPNCPY
> + /* stpncpy returns the dest address plus the size not counting the
> + final '\0'. */
> + add r3,r11,r7
> +#endif
> add r11,r11,r8
> sub r5,r5,r8
> b L(zero_padding_loop)
> @@ -164,6 +195,11 @@ L(n_tail4):
> sldi r10,r5,56 /* stxvl wants size in top 8 bits */
> addi r11,r11,48 /* Offset */
> stxvl 32+v3,r11,r10 /* Partial store */
> +#ifdef USE_AS_STPNCPY
> + /* stpncpy returns the dest address plus the size not counting the
> + final '\0'. */
> + add r3,r11,r5
> +#endif
> blr
>
> L(prep_n_tail1):
> @@ -174,6 +210,11 @@ L(prep_n_tail1):
> L(n_tail1):
> sldi r10,r5,56 /* stxvl wants size in top 8 bits */
> stxvl 32+v0,r11,r10 /* Partial store */
> +#ifdef USE_AS_STPNCPY
> + /* stpncpy returns the dest address plus the size not counting the
> + final '\0'. */
> + add r3,r11,r5
> +#endif
> blr
>
> L(prep_n_tail2):
> @@ -186,6 +227,11 @@ L(n_tail2):
> sldi r10,r5,56 /* stxvl wants size in top 8 bits */
> addi r11,r11,16 /* offset */
> stxvl 32+v1,r11,r10 /* Partial store */
> +#ifdef USE_AS_STPNCPY
> + /* stpncpy returns the dest address plus the size not counting the
> + final '\0'. */
> + add r3,r11,r5
> +#endif
> blr
>
> L(prep_n_tail3):
> @@ -199,6 +245,11 @@ L(n_tail3):
> sldi r10,r5,56 /* stxvl wants size in top 8 bits */
> addi r11,r11,32 /* Offset */
> stxvl 32+v2,r11,r10 /* Partial store */
> +#ifdef USE_AS_STPNCPY
> + /* stpncpy returns the dest address plus the size not counting the
> + final '\0'. */
> + add r3,r11,r5
> +#endif
> blr
>
> L(prep_tail1):
> @@ -208,6 +259,11 @@ L(tail1):
> addi r9,r8,1 /* Add null terminator */
> sldi r10,r9,56 /* stxvl wants size in top 8 bits */
> stxvl 32+v0,r11,r10 /* Partial store */
> +#ifdef USE_AS_STPNCPY
> + /* stpncpy returns the dest address plus the size not counting the
> + final '\0'. */
> + add r3,r11,r8
> +#endif
> add r11,r11,r9
> sub r5,r5,r9
> b L(zero_padding_loop)
> @@ -222,6 +278,11 @@ L(tail2):
> sldi r10,r9,56 /* stxvl wants size in top 8 bits */
> addi r11,r11,16 /* offset */
> stxvl 32+v1,r11,r10 /* Partial store */
> +#ifdef USE_AS_STPNCPY
> + /* stpncpy returns the dest address plus the size not counting the
> + final '\0'. */
> + add r3,r11,r8
> +#endif
> add r11,r11,r9
> sub r5,r5,r9
> b L(zero_padding_loop)
> @@ -237,6 +298,11 @@ L(tail3):
> sldi r10,r9,56 /* stxvl wants size in top 8 bits */
> addi r11,r11,32 /* offset */
> stxvl 32+v2,r11,r10 /* Partial store */
> +#ifdef USE_AS_STPNCPY
> + /* stpncpy returns the dest address plus the size not counting the
> + final '\0'. */
> + add r3,r11,r8
> +#endif
> add r11,r11,r9
> sub r5,r5,r9
> b L(zero_padding_loop)
> @@ -252,6 +318,11 @@ L(tail4):
> sldi r10,r9,56 /* stxvl wants size in top 8 bits */
> addi r11,r11,48 /* offset */
> stxvl 32+v3,r11,r10 /* Partial store */
> +#ifdef USE_AS_STPNCPY
> + /* stpncpy returns the dest address plus the size not counting the
> + final '\0'. */
> + add r3,r11,r8
> +#endif
> add r11,r11,r9
> sub r5,r5,r9
>
> @@ -274,3 +345,6 @@ L(zero_padding_end):
> L(n_tail):
>
> END (FUNC_NAME)
> +#ifndef USE_AS_STPNCPY
> +libc_hidden_builtin_def (strncpy)
> +#endif
OK.
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
> index cd2b47b403..f46bf50732 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
> +++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
OK.
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
> index aa63e1c23f..56790bcfe3 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
OK.
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power9.S b/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power9.S
> new file mode 100644
> index 0000000000..ecbbb5c8e9
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power9.S
> @@ -0,0 +1,24 @@
> +/* Optimized stpncpy implementation for POWER9/PPC64.
> + Copyright (C) 2015-2020 Free Software Foundation, Inc.
Minor nit, I suspect that date should only include 2020.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#define STPNCPY __stpncpy_power9
> +
> +#undef libc_hidden_builtin_def
> +#define libc_hidden_builtin_def(name)
> +
OK.
> +#include <sysdeps/powerpc/powerpc64/le/power9/stpncpy.S>
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c b/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
> index 17df886431..21702716a3 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
> +++ b/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
> @@ -26,10 +26,17 @@
> extern __typeof (__stpncpy) __stpncpy_ppc attribute_hidden;
> extern __typeof (__stpncpy) __stpncpy_power7 attribute_hidden;
> extern __typeof (__stpncpy) __stpncpy_power8 attribute_hidden;
> +# ifdef __LITTLE_ENDIAN__
> +extern __typeof (__stpncpy) __stpncpy_power9 attribute_hidden;
> +# endif
> # undef stpncpy
> # undef __stpncpy
>
> libc_ifunc_redirected (__redirect___stpncpy, __stpncpy,
> +# ifdef __LITTLE_ENDIAN__
> + (hwcap2 & PPC_FEATURE2_ARCH_3_00)
> + ? __stpncpy_power9 :
> +# endif
> (hwcap2 & PPC_FEATURE2_ARCH_2_07)
> ? __stpncpy_power8
> : (hwcap & PPC_FEATURE_HAS_VSX)
>
I think the spacing is off by two here.
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH 1/2] powerpc: Optimized strncpy for POWER9
2020-08-20 18:29 [PATCH 1/2] powerpc: Optimized strncpy for POWER9 Raphael Moreira Zinsly
` (2 preceding siblings ...)
2020-08-28 14:25 ` Paul E Murphy
@ 2020-08-28 19:12 ` Paul A. Clarke
2020-09-02 13:20 ` Tulio Magno Quites Machado Filho
4 siblings, 0 replies; 9+ messages in thread
From: Paul A. Clarke @ 2020-08-28 19:12 UTC (permalink / raw)
To: Raphael Moreira Zinsly; +Cc: libc-alpha
On Thu, Aug 20, 2020 at 03:29:16PM -0300, Raphael Moreira Zinsly via Libc-alpha wrote:
> Similar to the strcpy P9 optimization, this version uses VSX to improve
> performance.
> ---
> diff --git a/sysdeps/powerpc/powerpc64/le/power9/strncpy.S b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
> new file mode 100644
> index 0000000000..cde68384d4
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
> @@ -0,0 +1,276 @@
> +/* Optimized strncpy implementation for PowerPC64/POWER9.
sysdeps/powerpc/powerpc64/multiarch/strncpy-power9.S below, has
"POWER9/PPC64". Can we make these consistent? Can we just say
"POWER9"? Do we need to indicate little-endian only?
> + Copyright (C) 2020 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#include <sysdep.h>
> +
> +# ifndef STRNCPY
> +# define FUNC_NAME strncpy
> +# else
> +# define FUNC_NAME STRNCPY
> +# endif
> +
> +/* Implements the function
> +
> + char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5])
> +
> + The implementation can load bytes past a null terminator, but only
> + up to the next 16B boundary, so it never crosses a page. */
nit, subjective: "up to the next 16-byte aligned address"
> +
> +.machine power9
> +ENTRY_TOCLESS (FUNC_NAME, 4)
> + CALL_MCOUNT 2
> +
> + cmpwi r5, 0
This should be "cmpdi".
> + beqlr
> + /* NULL string optimisation */
This comment would make more sense above the "cmpdi", above.
> + lbz r0,0(r4)
> + stb r0,0(r3)
> + addi r11,r3,1
> + addi r5,r5,-1
> + vspltisb v18,0 /* Zeroes in v18 */
> + cmpwi r0,0
This should be "cmpdi".
> + beq L(zero_padding_loop)
> +
Given the above "NULL string" comment, you could
put an "empty string optimization" comment here.
> + cmpwi r5,0
This should be "cmpdi".
> + beqlr
The "addi r11,r3,1" and "vspltisb v18,0" above aren't needed until
a bit later, which penalizes the empty string case. I think you
can move the empty string test up. Some experiments seemed to move
the lbz and dependent stb apart. Something like this:
/* NULL string optimisation */
cmpdi r5,0
beqlr
lbz r0,0(r4)
/* empty/1-byte string optimisation */
cmpdi r5,1
stb r0,0(r3)
beqlr
cmpdi r0,0
addi r11,r3,1
addi r5,r5,-1
vspltisb v18,0 /* Zeroes in v18 */
beq L(zero_padding_loop)
(But, I didn't see significant performance difference in
some light experimentation. It might be worth another look.)
> +
> +L(cont):
This label isn't used.
> + addi r4,r4,1
> + neg r7,r4
> + rldicl r9,r7,0,60 /* How many bytes to get source 16B aligned? */
> +
> + /* Get source 16B aligned */
> + lvx v0,0,r4
> + lvsr v1,0,r4
> + vperm v0,v18,v0,v1
> +
> + vcmpequb v6,v0,v18 /* 0xff if byte is NULL, 0x00 otherwise */
> + vctzlsbb r7,v6 /* Number of trailing zeroes */
> + addi r8,r7,1 /* Add null terminator */
> +
> + /* r8 = bytes including null
> + r9 = bytes to get source 16B aligned
> + if r8 > r9
> + no null, copy r9 bytes
> + else
> + there is a null, copy r8 bytes and return. */
> + cmpd r8,r9
This should probably be "cmpld".
> + bgt L(no_null)
> +
> + cmpd r8,r5 /* r8 <= n? */
This should probably be "cmpld".
> + ble L(null)
> + sldi r10,r5,56 /* stxvl wants size in top 8 bits */
> + stxvl 32+v0,r11,r10 /* Partial store */
Do we still need this "32+v0" syntax? Is that due to a minimum supported
level of binutils which isn't VSX-aware?
> +
> + blr
> +
> +L(null):
> + sldi r10,r8,56 /* stxvl wants size in top 8 bits */
> + stxvl 32+v0,r11,r10 /* Partial store */
> +
> + add r11,r11,r8
> + sub r5,r5,r8
> + b L(zero_padding_loop)
> +
> +L(no_null):
> + cmpd r9,r5 /* Check if length was reached. */
This should probably be "cmpld".
> + bge L(n_tail1)
> + sldi r10,r9,56 /* stxvl wants size in top 8 bits */
> + stxvl 32+v0,r11,r10 /* Partial store */
> +
> + add r4,r4,r9
> + add r11,r11,r9
> + sub r5,r5,r9
> +
> +L(loop):
> + cmpldi cr6,r5,64 /* Check if length was reached. */
> + ble cr6,L(final_loop)
> +
> + lxv 32+v0,0(r4)
> + vcmpequb. v6,v0,v18 /* Any zero bytes? */
> + bne cr6,L(prep_tail1)
> +
> + lxv 32+v1,16(r4)
> + vcmpequb. v6,v1,v18 /* Any zero bytes? */
> + bne cr6,L(prep_tail2)
> +
> + lxv 32+v2,32(r4)
> + vcmpequb. v6,v2,v18 /* Any zero bytes? */
> + bne cr6,L(prep_tail3)
> +
> + lxv 32+v3,48(r4)
> + vcmpequb. v6,v3,v18 /* Any zero bytes? */
> + bne cr6,L(prep_tail4)
> +
> + stxv 32+v0,0(r11)
> + stxv 32+v1,16(r11)
> + stxv 32+v2,32(r11)
> + stxv 32+v3,48(r11)
> +
> + addi r4,r4,64
> + addi r11,r11,64
> + addi r5,r5,-64
> +
> + b L(loop)
> +
> +L(final_loop):
> + cmpldi cr5,r5,16
> + lxv 32+v0,0(r4)
> + vcmpequb. v6,v0,v18 /* Any zero bytes? */
> + ble cr5,L(prep_n_tail1)
> + bne cr6,L(count_tail1)
> + addi r5,r5,-16
> +
> + cmpldi cr5,r5,16
> + lxv 32+v1,16(r4)
> + vcmpequb. v6,v1,v18 /* Any zero bytes? */
> + ble cr5,L(prep_n_tail2)
> + bne cr6,L(count_tail2)
> + addi r5,r5,-16
> +
> + cmpldi cr5,r5,16
> + lxv 32+v2,32(r4)
> + vcmpequb. v6,v2,v18 /* Any zero bytes? */
> + ble cr5,L(prep_n_tail3)
> + bne cr6,L(count_tail3)
> + addi r5,r5,-16
> +
> + lxv 32+v3,48(r4)
> + vcmpequb. v6,v3,v18 /* Any zero bytes? */
> + beq cr6,L(n_tail4)
> +
> + vctzlsbb r8,v6 /* Number of trailing zeroes */
> + cmpd r8,r5 /* r8 < n? */
This should probably be "cmpld".
> + blt L(tail4)
> +L(n_tail4):
> + stxv 32+v0,0(r11)
> + stxv 32+v1,16(r11)
> + stxv 32+v2,32(r11)
> + sldi r10,r5,56 /* stxvl wants size in top 8 bits */
> + addi r11,r11,48 /* Offset */
> + stxvl 32+v3,r11,r10 /* Partial store */
> + blr
> +
> +L(prep_n_tail1):
> + beq cr6,L(n_tail1) /* Any zero bytes? */
> + vctzlsbb r8,v6 /* Number of trailing zeroes */
> + cmpd r8,r5 /* r8 < n? */
This should probably be "cmpld".
> + blt L(tail1)
> +L(n_tail1):
> + sldi r10,r5,56 /* stxvl wants size in top 8 bits */
> + stxvl 32+v0,r11,r10 /* Partial store */
> + blr
> +
> +L(prep_n_tail2):
> + beq cr6,L(n_tail2) /* Any zero bytes? */
> + vctzlsbb r8,v6 /* Number of trailing zeroes */
> + cmpd r8,r5 /* r8 < n? */
This should probably be "cmpld".
> + blt L(tail2)
> +L(n_tail2):
> + stxv 32+v0,0(r11)
> + sldi r10,r5,56 /* stxvl wants size in top 8 bits */
> + addi r11,r11,16 /* offset */
> + stxvl 32+v1,r11,r10 /* Partial store */
> + blr
> +
> +L(prep_n_tail3):
> + beq cr6,L(n_tail3) /* Any zero bytes? */
> + vctzlsbb r8,v6 /* Number of trailing zeroes */
> + cmpd r8,r5 /* r8 < n? */
This should probably be "cmpld".
> + blt L(tail3)
> +L(n_tail3):
> + stxv 32+v0,0(r11)
> + stxv 32+v1,16(r11)
> + sldi r10,r5,56 /* stxvl wants size in top 8 bits */
> + addi r11,r11,32 /* Offset */
> + stxvl 32+v2,r11,r10 /* Partial store */
> + blr
> +
> +L(prep_tail1):
> +L(count_tail1):
> + vctzlsbb r8,v6 /* Number of trailing zeroes */
> +L(tail1):
> + addi r9,r8,1 /* Add null terminator */
> + sldi r10,r9,56 /* stxvl wants size in top 8 bits */
> + stxvl 32+v0,r11,r10 /* Partial store */
> + add r11,r11,r9
> + sub r5,r5,r9
> + b L(zero_padding_loop)
> +
> +L(prep_tail2):
> + addi r5,r5,-16
> +L(count_tail2):
> + vctzlsbb r8,v6 /* Number of trailing zeroes */
> +L(tail2):
> + addi r9,r8,1 /* Add null terminator */
> + stxv 32+v0,0(r11)
> + sldi r10,r9,56 /* stxvl wants size in top 8 bits */
> + addi r11,r11,16 /* offset */
> + stxvl 32+v1,r11,r10 /* Partial store */
> + add r11,r11,r9
> + sub r5,r5,r9
> + b L(zero_padding_loop)
> +
> +L(prep_tail3):
> + addi r5,r5,-32
> +L(count_tail3):
> + vctzlsbb r8,v6 /* Number of trailing zeroes */
> +L(tail3):
> + addi r9,r8,1 /* Add null terminator */
> + stxv 32+v0,0(r11)
> + stxv 32+v1,16(r11)
> + sldi r10,r9,56 /* stxvl wants size in top 8 bits */
> + addi r11,r11,32 /* offset */
> + stxvl 32+v2,r11,r10 /* Partial store */
> + add r11,r11,r9
> + sub r5,r5,r9
> + b L(zero_padding_loop)
> +
> +L(prep_tail4):
> + addi r5,r5,-48
> + vctzlsbb r8,v6 /* Number of trailing zeroes */
> +L(tail4):
> + addi r9,r8,1 /* Add null terminator */
> + stxv 32+v0,0(r11)
> + stxv 32+v1,16(r11)
> + stxv 32+v2,32(r11)
> + sldi r10,r9,56 /* stxvl wants size in top 8 bits */
> + addi r11,r11,48 /* offset */
> + stxvl 32+v3,r11,r10 /* Partial store */
> + add r11,r11,r9
> + sub r5,r5,r9
> +
> +/* This code pads the remainder of dest with NULL bytes. */
> +L(zero_padding_loop):
> + cmpldi cr6,r5,16 /* Check if length was reached. */
> + ble cr6,L(zero_padding_end)
> +
> + stxv v18,0(r11)
> + addi r11,r11,16
> + addi r5,r5,-16
> +
> + b L(zero_padding_loop)
> +
> +L(zero_padding_end):
> + sldi r10,r5,56 /* stxvl wants size in top 8 bits */
> + stxvl v18,r11,r10 /* Partial store */
> + blr
> +
> +L(n_tail):
> +
> +END (FUNC_NAME)
PC
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH 1/2] powerpc: Optimized strncpy for POWER9
2020-08-20 18:29 [PATCH 1/2] powerpc: Optimized strncpy for POWER9 Raphael Moreira Zinsly
` (3 preceding siblings ...)
2020-08-28 19:12 ` Paul A. Clarke
@ 2020-09-02 13:20 ` Tulio Magno Quites Machado Filho
2020-09-02 14:00 ` Paul E Murphy
4 siblings, 1 reply; 9+ messages in thread
From: Tulio Magno Quites Machado Filho @ 2020-09-02 13:20 UTC (permalink / raw)
To: libc-alpha, Raphael Moreira Zinsly
Raphael Moreira Zinsly via Libc-alpha <libc-alpha@sourceware.org> writes:
> diff --git a/sysdeps/powerpc/powerpc64/le/power9/strncpy.S b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
> new file mode 100644
> index 0000000000..cde68384d4
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
> @@ -0,0 +1,276 @@
> ...
> +/* Implements the function
> +
> + char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5])
> +
> + The implementation can load bytes past a null terminator, but only
> + up to the next 16B boundary, so it never crosses a page. */
> +
> +.machine power9
I don't think Binutils 2.26 supports .machine power9. Likewise for all P9
instructions. However, current glibc is expected to work with Binutils 2.26
(ppc64le), i.e. builds with Binutils 2.26 should not fail.
So, we either need to change this code (e.g. similar to strcmp) or we need
to bump the Binutils requirements.
The last time Binutils requirements was bumped was in 2017, so I think it's safe
to do this now.
Let me prepare a patch proposing this.
--
Tulio Magno
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH 1/2] powerpc: Optimized strncpy for POWER9
2020-09-02 13:20 ` Tulio Magno Quites Machado Filho
@ 2020-09-02 14:00 ` Paul E Murphy
0 siblings, 0 replies; 9+ messages in thread
From: Paul E Murphy @ 2020-09-02 14:00 UTC (permalink / raw)
To: Tulio Magno Quites Machado Filho, libc-alpha, Raphael Moreira Zinsly
On 9/2/20 8:20 AM, Tulio Magno Quites Machado Filho wrote:
> Raphael Moreira Zinsly via Libc-alpha <libc-alpha@sourceware.org> writes:
>
>> diff --git a/sysdeps/powerpc/powerpc64/le/power9/strncpy.S b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
>> new file mode 100644
>> index 0000000000..cde68384d4
>> --- /dev/null
>> +++ b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
>> @@ -0,0 +1,276 @@
>> ...
>> +/* Implements the function
>> +
>> + char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5])
>> +
>> + The implementation can load bytes past a null terminator, but only
>> + up to the next 16B boundary, so it never crosses a page. */
>> +
>> +.machine power9
>
> I don't think Binutils 2.26 supports .machine power9. Likewise for all P9
> instructions. However, current glibc is expected to work with Binutils 2.26
> (ppc64le), i.e. builds with Binutils 2.26 should not fail.
>
> So, we either need to change this code (e.g. similar to strcmp) or we need
> to bump the Binutils requirements.
> The last time the Binutils requirement was bumped was in 2017, so I think it's safe
> to do this now.
>
> Let me prepare a patch proposing this.
There are at least 5 uses of .machine power9 throughout glibc today. I
agree with bumping at least the ppc64le requirements to match.
^ permalink raw reply [flat|nested] 9+ messages in thread
end of thread, other threads:[~2020-09-02 14:00 UTC | newest]
Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-08-20 18:29 [PATCH 1/2] powerpc: Optimized strncpy for POWER9 Raphael Moreira Zinsly
2020-08-20 18:29 ` [PATCH 2/2] powerpc: Optimized stpncpy " Raphael Moreira Zinsly
2020-08-20 18:31 ` Raphael M Zinsly
2020-08-28 17:04 ` Paul E Murphy
2020-08-20 18:31 ` [PATCH 1/2] powerpc: Optimized strncpy " Raphael M Zinsly
2020-08-28 14:25 ` Paul E Murphy
2020-08-28 19:12 ` Paul A. Clarke
2020-09-02 13:20 ` Tulio Magno Quites Machado Filho
2020-09-02 14:00 ` Paul E Murphy
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).