* [PATCH v2 1/2] powerpc: Add optimized strncpy for POWER9
@ 2020-09-04 16:56 Raphael Moreira Zinsly
2020-09-04 16:56 ` [PATCH v2 2/2] powerpc: Add optimized stpncpy " Raphael Moreira Zinsly
` (2 more replies)
0 siblings, 3 replies; 9+ messages in thread
From: Raphael Moreira Zinsly @ 2020-09-04 16:56 UTC (permalink / raw)
To: libc-alpha; +Cc: murphyp, pc, tuliom, Raphael Moreira Zinsly
Changes since v1:
- Fixed comment indentation and added some spaces to improve
readability.
- Use "POWER9 LE" instead of "PowerPC64/POWER9".
- Fixed copyright dates.
- Replaced cmpwi with cmpdi.
---8<---
Similar to the strcpy P9 optimization, this version uses VSX to improve
performance.
---
sysdeps/powerpc/powerpc64/le/power9/strncpy.S | 281 ++++++++++++++++++
sysdeps/powerpc/powerpc64/multiarch/Makefile | 2 +-
.../powerpc64/multiarch/ifunc-impl-list.c | 5 +
.../powerpc64/multiarch/strncpy-power9.S | 26 ++
sysdeps/powerpc/powerpc64/multiarch/strncpy.c | 7 +
5 files changed, 320 insertions(+), 1 deletion(-)
create mode 100644 sysdeps/powerpc/powerpc64/le/power9/strncpy.S
create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strncpy-power9.S
diff --git a/sysdeps/powerpc/powerpc64/le/power9/strncpy.S b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
new file mode 100644
index 0000000000..34fcdee913
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
@@ -0,0 +1,281 @@
+/* Optimized strncpy implementation for POWER9 LE.
+ Copyright (C) 2020 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+# ifndef STRNCPY
+# define FUNC_NAME strncpy
+# else
+# define FUNC_NAME STRNCPY
+# endif
+
+/* Implements the function
+
+ char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5])
+
+ The implementation can load bytes past a null terminator, but only
+ up to the next 16-byte aligned address, so it never crosses a page. */
+
+.machine power9
+ENTRY_TOCLESS (FUNC_NAME, 4)
+ CALL_MCOUNT 2
+
+ /* NULL string optimizations */
+ cmpdi r5, 0
+ beqlr
+
+ lbz r0,0(r4)
+ stb r0,0(r3)
+ addi r11,r3,1
+ addi r5,r5,-1
+ vspltisb v18,0 /* Zeroes in v18 */
+ cmpdi r0,0
+ beq L(zero_padding_loop)
+
+ /* Empty/1-byte string optimization */
+ cmpdi r5,0
+ beqlr
+
+ addi r4,r4,1
+ neg r7,r4
+ rldicl r9,r7,0,60 /* How many bytes to get source 16B aligned? */
+
+ /* Get source 16B aligned */
+ lvx v0,0,r4
+ lvsr v1,0,r4
+ vperm v0,v18,v0,v1
+
+ vcmpequb v6,v0,v18 /* 0xff if byte is NULL, 0x00 otherwise */
+ vctzlsbb r7,v6 /* Number of trailing zeroes */
+ addi r8,r7,1 /* Add null terminator */
+
+ /* r8 = bytes including null
+ r9 = bytes to get source 16B aligned
+ if r8 > r9
+ no null, copy r9 bytes
+ else
+ there is a null, copy r8 bytes and return. */
+ cmpld r8,r9
+ bgt L(no_null)
+
+ cmpld cr6,r8,r5 /* r8 <= n? */
+ ble cr6,L(null)
+
+ sldi r10,r5,56 /* stxvl wants size in top 8 bits */
+ stxvl 32+v0,r11,r10 /* Partial store */
+
+ blr
+
+L(null):
+ sldi r10,r8,56 /* stxvl wants size in top 8 bits */
+ stxvl 32+v0,r11,r10 /* Partial store */
+
+ add r11,r11,r8
+ sub r5,r5,r8
+ b L(zero_padding_loop)
+
+L(no_null):
+ cmpld r9,r5 /* Check if length was reached. */
+ bge L(n_tail1)
+
+ sldi r10,r9,56 /* stxvl wants size in top 8 bits */
+ stxvl 32+v0,r11,r10 /* Partial store */
+
+ add r4,r4,r9
+ add r11,r11,r9
+ sub r5,r5,r9
+
+L(loop):
+ cmpldi cr6,r5,64 /* Check if length was reached. */
+ ble cr6,L(final_loop)
+
+ lxv 32+v0,0(r4)
+ vcmpequb. v6,v0,v18 /* Any zero bytes? */
+ bne cr6,L(prep_tail1)
+
+ lxv 32+v1,16(r4)
+ vcmpequb. v6,v1,v18 /* Any zero bytes? */
+ bne cr6,L(prep_tail2)
+
+ lxv 32+v2,32(r4)
+ vcmpequb. v6,v2,v18 /* Any zero bytes? */
+ bne cr6,L(prep_tail3)
+
+ lxv 32+v3,48(r4)
+ vcmpequb. v6,v3,v18 /* Any zero bytes? */
+ bne cr6,L(prep_tail4)
+
+ stxv 32+v0,0(r11)
+ stxv 32+v1,16(r11)
+ stxv 32+v2,32(r11)
+ stxv 32+v3,48(r11)
+
+ addi r4,r4,64
+ addi r11,r11,64
+ addi r5,r5,-64
+
+ b L(loop)
+
+L(final_loop):
+ cmpldi cr5,r5,16
+ lxv 32+v0,0(r4)
+ vcmpequb. v6,v0,v18 /* Any zero bytes? */
+ ble cr5,L(prep_n_tail1)
+ bne cr6,L(count_tail1)
+ addi r5,r5,-16
+
+ cmpldi cr5,r5,16
+ lxv 32+v1,16(r4)
+ vcmpequb. v6,v1,v18 /* Any zero bytes? */
+ ble cr5,L(prep_n_tail2)
+ bne cr6,L(count_tail2)
+ addi r5,r5,-16
+
+ cmpldi cr5,r5,16
+ lxv 32+v2,32(r4)
+ vcmpequb. v6,v2,v18 /* Any zero bytes? */
+ ble cr5,L(prep_n_tail3)
+ bne cr6,L(count_tail3)
+ addi r5,r5,-16
+
+ lxv 32+v3,48(r4)
+ vcmpequb. v6,v3,v18 /* Any zero bytes? */
+ beq cr6,L(n_tail4)
+
+ vctzlsbb r8,v6 /* Number of trailing zeroes */
+ cmpld r8,r5 /* r8 < n? */
+ blt L(tail4)
+
+L(n_tail4):
+ stxv 32+v0,0(r11)
+ stxv 32+v1,16(r11)
+ stxv 32+v2,32(r11)
+ sldi r10,r5,56 /* stxvl wants size in top 8 bits */
+ addi r11,r11,48 /* Offset */
+ stxvl 32+v3,r11,r10 /* Partial store */
+ blr
+
+L(prep_n_tail1):
+ beq cr6,L(n_tail1) /* Any zero bytes? */
+ vctzlsbb r8,v6 /* Number of trailing zeroes */
+ cmpld r8,r5 /* r8 < n? */
+ blt L(tail1)
+
+L(n_tail1):
+ sldi r10,r5,56 /* stxvl wants size in top 8 bits */
+ stxvl 32+v0,r11,r10 /* Partial store */
+ blr
+
+L(prep_n_tail2):
+ beq cr6,L(n_tail2) /* Any zero bytes? */
+ vctzlsbb r8,v6 /* Number of trailing zeroes */
+ cmpld r8,r5 /* r8 < n? */
+ blt L(tail2)
+
+L(n_tail2):
+ stxv 32+v0,0(r11)
+ sldi r10,r5,56 /* stxvl wants size in top 8 bits */
+ addi r11,r11,16 /* offset */
+ stxvl 32+v1,r11,r10 /* Partial store */
+ blr
+
+L(prep_n_tail3):
+ beq cr6,L(n_tail3) /* Any zero bytes? */
+ vctzlsbb r8,v6 /* Number of trailing zeroes */
+ cmpld r8,r5 /* r8 < n? */
+ blt L(tail3)
+
+L(n_tail3):
+ stxv 32+v0,0(r11)
+ stxv 32+v1,16(r11)
+ sldi r10,r5,56 /* stxvl wants size in top 8 bits */
+ addi r11,r11,32 /* Offset */
+ stxvl 32+v2,r11,r10 /* Partial store */
+ blr
+
+L(prep_tail1):
+L(count_tail1):
+ vctzlsbb r8,v6 /* Number of trailing zeroes */
+L(tail1):
+ addi r9,r8,1 /* Add null terminator */
+ sldi r10,r9,56 /* stxvl wants size in top 8 bits */
+ stxvl 32+v0,r11,r10 /* Partial store */
+ add r11,r11,r9
+ sub r5,r5,r9
+ b L(zero_padding_loop)
+
+L(prep_tail2):
+ addi r5,r5,-16
+L(count_tail2):
+ vctzlsbb r8,v6 /* Number of trailing zeroes */
+L(tail2):
+ addi r9,r8,1 /* Add null terminator */
+ stxv 32+v0,0(r11)
+ sldi r10,r9,56 /* stxvl wants size in top 8 bits */
+ addi r11,r11,16 /* offset */
+ stxvl 32+v1,r11,r10 /* Partial store */
+ add r11,r11,r9
+ sub r5,r5,r9
+ b L(zero_padding_loop)
+
+L(prep_tail3):
+ addi r5,r5,-32
+L(count_tail3):
+ vctzlsbb r8,v6 /* Number of trailing zeroes */
+L(tail3):
+ addi r9,r8,1 /* Add null terminator */
+ stxv 32+v0,0(r11)
+ stxv 32+v1,16(r11)
+ sldi r10,r9,56 /* stxvl wants size in top 8 bits */
+ addi r11,r11,32 /* offset */
+ stxvl 32+v2,r11,r10 /* Partial store */
+ add r11,r11,r9
+ sub r5,r5,r9
+ b L(zero_padding_loop)
+
+L(prep_tail4):
+ addi r5,r5,-48
+ vctzlsbb r8,v6 /* Number of trailing zeroes */
+L(tail4):
+ addi r9,r8,1 /* Add null terminator */
+ stxv 32+v0,0(r11)
+ stxv 32+v1,16(r11)
+ stxv 32+v2,32(r11)
+ sldi r10,r9,56 /* stxvl wants size in top 8 bits */
+ addi r11,r11,48 /* offset */
+ stxvl 32+v3,r11,r10 /* Partial store */
+ add r11,r11,r9
+ sub r5,r5,r9
+
+/* This code pads the remainder of dest with NULL bytes. */
+L(zero_padding_loop):
+ cmpldi cr6,r5,16 /* Check if length was reached. */
+ ble cr6,L(zero_padding_end)
+
+ stxv v18,0(r11)
+ addi r11,r11,16
+ addi r5,r5,-16
+
+ b L(zero_padding_loop)
+
+L(zero_padding_end):
+ sldi r10,r5,56 /* stxvl wants size in top 8 bits */
+ stxvl v18,r11,r10 /* Partial store */
+ blr
+
+END (FUNC_NAME)
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index 19acb6c64a..cd2b47b403 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -33,7 +33,7 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \
ifneq (,$(filter %le,$(config-machine)))
sysdep_routines += strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \
- rawmemchr-power9 strlen-power9
+ rawmemchr-power9 strlen-power9 strncpy-power9
endif
CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index ea10b00417..aa63e1c23f 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -301,6 +301,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/powerpc/powerpc64/multiarch/strncpy.c. */
IFUNC_IMPL (i, name, strncpy,
+#ifdef __LITTLE_ENDIAN__
+ IFUNC_IMPL_ADD (array, i, strncpy,
+ hwcap2 & PPC_FEATURE2_ARCH_3_00,
+ __strncpy_power9)
+#endif
IFUNC_IMPL_ADD (array, i, strncpy,
hwcap2 & PPC_FEATURE2_ARCH_2_07,
__strncpy_power8)
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncpy-power9.S b/sysdeps/powerpc/powerpc64/multiarch/strncpy-power9.S
new file mode 100644
index 0000000000..ab7c570d54
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/strncpy-power9.S
@@ -0,0 +1,26 @@
+/* Optimized strncpy implementation for POWER9 LE.
+ Copyright (C) 2020 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#if defined __LITTLE_ENDIAN__ && IS_IN (libc)
+#define STRNCPY __strncpy_power9
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+#include <sysdeps/powerpc/powerpc64/le/power9/strncpy.S>
+#endif
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncpy.c b/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
index 7bacf28aca..8ef0a99cb5 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
@@ -28,11 +28,18 @@
extern __typeof (strncpy) __strncpy_ppc attribute_hidden;
extern __typeof (strncpy) __strncpy_power7 attribute_hidden;
extern __typeof (strncpy) __strncpy_power8 attribute_hidden;
+# ifdef __LITTLE_ENDIAN__
+extern __typeof (strncpy) __strncpy_power9 attribute_hidden;
+# endif
# undef strncpy
/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
ifunc symbol properly. */
libc_ifunc_redirected (__redirect_strncpy, strncpy,
+# ifdef __LITTLE_ENDIAN__
+ (hwcap2 & PPC_FEATURE2_ARCH_3_00)
+ ? __strncpy_power9 :
+# endif
(hwcap2 & PPC_FEATURE2_ARCH_2_07)
? __strncpy_power8
: (hwcap & PPC_FEATURE_HAS_VSX)
--
2.26.2
^ permalink raw reply [flat|nested] 9+ messages in thread
* [PATCH v2 2/2] powerpc: Add optimized stpncpy for POWER9
2020-09-04 16:56 [PATCH v2 1/2] powerpc: Add optimized strncpy for POWER9 Raphael Moreira Zinsly
@ 2020-09-04 16:56 ` Raphael Moreira Zinsly
2020-09-04 16:59 ` Raphael M Zinsly
2020-09-16 12:35 ` Matheus Castanho
2020-09-04 16:59 ` [PATCH v2 1/2] powerpc: Add optimized strncpy " Raphael M Zinsly
2020-09-16 12:24 ` Matheus Castanho
2 siblings, 2 replies; 9+ messages in thread
From: Raphael Moreira Zinsly @ 2020-09-04 16:56 UTC (permalink / raw)
To: libc-alpha; +Cc: murphyp, pc, tuliom, Raphael Moreira Zinsly
Add stpncpy support into the POWER9 strncpy.
---
sysdeps/powerpc/powerpc64/le/power9/stpncpy.S | 24 +++++++
sysdeps/powerpc/powerpc64/le/power9/strncpy.S | 65 +++++++++++++++++++
sysdeps/powerpc/powerpc64/multiarch/Makefile | 2 +-
.../powerpc64/multiarch/ifunc-impl-list.c | 5 ++
.../powerpc64/multiarch/stpncpy-power9.S | 24 +++++++
sysdeps/powerpc/powerpc64/multiarch/stpncpy.c | 7 ++
6 files changed, 126 insertions(+), 1 deletion(-)
create mode 100644 sysdeps/powerpc/powerpc64/le/power9/stpncpy.S
create mode 100644 sysdeps/powerpc/powerpc64/multiarch/stpncpy-power9.S
diff --git a/sysdeps/powerpc/powerpc64/le/power9/stpncpy.S b/sysdeps/powerpc/powerpc64/le/power9/stpncpy.S
new file mode 100644
index 0000000000..81d9673d8b
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/le/power9/stpncpy.S
@@ -0,0 +1,24 @@
+/* Optimized stpncpy implementation for POWER9 LE.
+ Copyright (C) 2020 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#define USE_AS_STPNCPY
+#include <sysdeps/powerpc/powerpc64/le/power9/strncpy.S>
+
+weak_alias (__stpncpy, stpncpy)
+libc_hidden_def (__stpncpy)
+libc_hidden_builtin_def (stpncpy)
diff --git a/sysdeps/powerpc/powerpc64/le/power9/strncpy.S b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
index 34fcdee913..f7265b11ec 100644
--- a/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
+++ b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
@@ -18,16 +18,30 @@
#include <sysdep.h>
+#ifdef USE_AS_STPNCPY
+# ifndef STPNCPY
+# define FUNC_NAME __stpncpy
+# else
+# define FUNC_NAME STPNCPY
+# endif
+#else
# ifndef STRNCPY
# define FUNC_NAME strncpy
# else
# define FUNC_NAME STRNCPY
# endif
+#endif /* !USE_AS_STPNCPY */
/* Implements the function
char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5])
+ or
+
+ char * [r3] stpncpy (char *dest [r3], const char *src [r4], size_t n [r5])
+
+ if USE_AS_STPNCPY is defined.
+
The implementation can load bytes past a null terminator, but only
up to the next 16-byte aligned address, so it never crosses a page. */
@@ -49,7 +63,15 @@ ENTRY_TOCLESS (FUNC_NAME, 4)
/* Empty/1-byte string optimization */
cmpdi r5,0
+#ifdef USE_AS_STPNCPY
+ bgt L(cont)
+ /* Compute pointer to last byte copied into dest. */
+ addi r3,r3,1
+ blr
+L(cont):
+#else
beqlr
+#endif
addi r4,r4,1
neg r7,r4
@@ -79,12 +101,20 @@ ENTRY_TOCLESS (FUNC_NAME, 4)
sldi r10,r5,56 /* stxvl wants size in top 8 bits */
stxvl 32+v0,r11,r10 /* Partial store */
+#ifdef USE_AS_STPNCPY
+ /* Compute pointer to last byte copied into dest. */
+ add r3,r11,r5
+#endif
blr
L(null):
sldi r10,r8,56 /* stxvl wants size in top 8 bits */
stxvl 32+v0,r11,r10 /* Partial store */
+#ifdef USE_AS_STPNCPY
+ /* Compute pointer to last byte copied into dest. */
+ add r3,r11,r7
+#endif
add r11,r11,r8
sub r5,r5,r8
b L(zero_padding_loop)
@@ -168,6 +198,10 @@ L(n_tail4):
sldi r10,r5,56 /* stxvl wants size in top 8 bits */
addi r11,r11,48 /* Offset */
stxvl 32+v3,r11,r10 /* Partial store */
+#ifdef USE_AS_STPNCPY
+ /* Compute pointer to last byte copied into dest. */
+ add r3,r11,r5
+#endif
blr
L(prep_n_tail1):
@@ -179,6 +213,10 @@ L(prep_n_tail1):
L(n_tail1):
sldi r10,r5,56 /* stxvl wants size in top 8 bits */
stxvl 32+v0,r11,r10 /* Partial store */
+#ifdef USE_AS_STPNCPY
+ /* Compute pointer to last byte copied into dest. */
+ add r3,r11,r5
+#endif
blr
L(prep_n_tail2):
@@ -192,6 +230,10 @@ L(n_tail2):
sldi r10,r5,56 /* stxvl wants size in top 8 bits */
addi r11,r11,16 /* offset */
stxvl 32+v1,r11,r10 /* Partial store */
+#ifdef USE_AS_STPNCPY
+ /* Compute pointer to last byte copied into dest. */
+ add r3,r11,r5
+#endif
blr
L(prep_n_tail3):
@@ -206,6 +248,10 @@ L(n_tail3):
sldi r10,r5,56 /* stxvl wants size in top 8 bits */
addi r11,r11,32 /* Offset */
stxvl 32+v2,r11,r10 /* Partial store */
+#ifdef USE_AS_STPNCPY
+ /* Compute pointer to last byte copied into dest. */
+ add r3,r11,r5
+#endif
blr
L(prep_tail1):
@@ -215,6 +261,10 @@ L(tail1):
addi r9,r8,1 /* Add null terminator */
sldi r10,r9,56 /* stxvl wants size in top 8 bits */
stxvl 32+v0,r11,r10 /* Partial store */
+#ifdef USE_AS_STPNCPY
+ /* Compute pointer to last byte copied into dest. */
+ add r3,r11,r8
+#endif
add r11,r11,r9
sub r5,r5,r9
b L(zero_padding_loop)
@@ -229,6 +279,10 @@ L(tail2):
sldi r10,r9,56 /* stxvl wants size in top 8 bits */
addi r11,r11,16 /* offset */
stxvl 32+v1,r11,r10 /* Partial store */
+#ifdef USE_AS_STPNCPY
+ /* Compute pointer to last byte copied into dest. */
+ add r3,r11,r8
+#endif
add r11,r11,r9
sub r5,r5,r9
b L(zero_padding_loop)
@@ -244,6 +298,10 @@ L(tail3):
sldi r10,r9,56 /* stxvl wants size in top 8 bits */
addi r11,r11,32 /* offset */
stxvl 32+v2,r11,r10 /* Partial store */
+#ifdef USE_AS_STPNCPY
+ /* Compute pointer to last byte copied into dest. */
+ add r3,r11,r8
+#endif
add r11,r11,r9
sub r5,r5,r9
b L(zero_padding_loop)
@@ -259,6 +317,10 @@ L(tail4):
sldi r10,r9,56 /* stxvl wants size in top 8 bits */
addi r11,r11,48 /* offset */
stxvl 32+v3,r11,r10 /* Partial store */
+#ifdef USE_AS_STPNCPY
+ /* Compute pointer to last byte copied into dest. */
+ add r3,r11,r8
+#endif
add r11,r11,r9
sub r5,r5,r9
@@ -279,3 +341,6 @@ L(zero_padding_end):
blr
END (FUNC_NAME)
+#ifndef USE_AS_STPNCPY
+libc_hidden_builtin_def (strncpy)
+#endif
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index cd2b47b403..f46bf50732 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -33,7 +33,7 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \
ifneq (,$(filter %le,$(config-machine)))
sysdep_routines += strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \
- rawmemchr-power9 strlen-power9 strncpy-power9
+ rawmemchr-power9 strlen-power9 strncpy-power9 stpncpy-power9
endif
CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index aa63e1c23f..56790bcfe3 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -317,6 +317,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/powerpc/powerpc64/multiarch/stpncpy.c. */
IFUNC_IMPL (i, name, stpncpy,
+#ifdef __LITTLE_ENDIAN__
+ IFUNC_IMPL_ADD (array, i, stpncpy,
+ hwcap2 & PPC_FEATURE2_ARCH_3_00,
+ __stpncpy_power9)
+#endif
IFUNC_IMPL_ADD (array, i, stpncpy,
hwcap2 & PPC_FEATURE2_ARCH_2_07,
__stpncpy_power8)
diff --git a/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power9.S b/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power9.S
new file mode 100644
index 0000000000..ccbab55c31
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power9.S
@@ -0,0 +1,24 @@
+/* Optimized stpncpy implementation for POWER9 LE.
+ Copyright (C) 2020 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#define STPNCPY __stpncpy_power9
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+#include <sysdeps/powerpc/powerpc64/le/power9/stpncpy.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c b/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
index 17df886431..ac17b26650 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
@@ -26,10 +26,17 @@
extern __typeof (__stpncpy) __stpncpy_ppc attribute_hidden;
extern __typeof (__stpncpy) __stpncpy_power7 attribute_hidden;
extern __typeof (__stpncpy) __stpncpy_power8 attribute_hidden;
+# ifdef __LITTLE_ENDIAN__
+extern __typeof (__stpncpy) __stpncpy_power9 attribute_hidden;
+# endif
# undef stpncpy
# undef __stpncpy
libc_ifunc_redirected (__redirect___stpncpy, __stpncpy,
+# ifdef __LITTLE_ENDIAN__
+ (hwcap2 & PPC_FEATURE2_ARCH_3_00)
+ ? __stpncpy_power9 :
+# endif
(hwcap2 & PPC_FEATURE2_ARCH_2_07)
? __stpncpy_power8
: (hwcap & PPC_FEATURE_HAS_VSX)
--
2.26.2
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH v2 1/2] powerpc: Add optimized strncpy for POWER9
2020-09-04 16:56 [PATCH v2 1/2] powerpc: Add optimized strncpy for POWER9 Raphael Moreira Zinsly
2020-09-04 16:56 ` [PATCH v2 2/2] powerpc: Add optimized stpncpy " Raphael Moreira Zinsly
@ 2020-09-04 16:59 ` Raphael M Zinsly
2020-09-16 12:24 ` Matheus Castanho
2 siblings, 0 replies; 9+ messages in thread
From: Raphael M Zinsly @ 2020-09-04 16:59 UTC (permalink / raw)
To: libc-alpha
Benchtest output:
generic_strncpy __strncpy_power9
__strncpy_power8 __strncpy_power7 __strncpy_ppc
Length 16, n 16, alignment 1/ 1: 6.44861 2.51617 2.54878 5.94753
9.41467
Length 16, n 16, alignment 1/ 1: 6.4448 2.51688 2.56978 5.86275 9.52956
Length 16, n 16, alignment 1/ 2: 6.51392 2.53026 2.55617 5.96487
9.51182
Length 16, n 16, alignment 2/ 1: 6.5421 2.5026 2.82458 5.95353 9.36524
Length 2, n 4, alignment 7/ 2: 8.02857 2.19272 4.35397 4.97347
8.60923
Length 4, n 2, alignment 2/ 7: 6.04262 1.66226 2.31865 3.27123
6.23803
Length 2, n 4, alignment 7/ 2: 8.15691 2.21924 4.48871 4.97328 8.3591
Length 4, n 2, alignment 2/ 7: 6.0428 1.66435 2.31671 3.2874 6.23902
Length 16, n 16, alignment 2/ 2: 6.75511 2.51667 2.82529 5.65252
9.32002
Length 16, n 16, alignment 2/ 2: 6.53469 2.51982 2.82678 5.93257
9.25613
Length 16, n 16, alignment 2/ 4: 6.3502 2.53333 2.82267 5.66948 9.35942
Length 16, n 16, alignment 4/ 2: 6.71533 2.51217 3.47278 5.95821 8.3249
Length 4, n 8, alignment 6/ 4: 7.85332 2.21708 5.68665 4.83111
9.07271
Length 8, n 4, alignment 4/ 6: 5.93863 1.67938 2.67249 3.07391
7.90751
Length 4, n 8, alignment 6/ 4: 8.24352 2.16644 5.22268 5.04674
9.10352
Length 8, n 4, alignment 4/ 6: 5.88514 1.67966 2.67286 3.29382
7.66757
Length 16, n 16, alignment 3/ 3: 6.55525 2.52511 3.06709 5.95625
9.23173
Length 16, n 16, alignment 3/ 3: 6.66344 2.50855 3.11771 5.96121
8.99767
Length 16, n 16, alignment 3/ 6: 6.82163 2.53355 3.0638 5.96451 9.09031
Length 16, n 16, alignment 6/ 3: 6.35636 2.51634 4.17868 5.95112
7.82576
Length 8, n 16, alignment 5/ 6: 7.46873 2.23953 4.33782 5.76124
10.2851
Length 16, n 8, alignment 6/ 5: 5.63643 1.88233 2.32899 4.72233
5.79268
Length 8, n 16, alignment 5/ 6: 7.47291 2.65201 3.9103 5.40334 10.3902
Length 16, n 8, alignment 6/ 5: 5.73738 1.8787 2.32749 4.69061 6.03053
Length 16, n 16, alignment 4/ 4: 6.63998 2.5166 3.5133 5.83764 8.17814
Length 16, n 16, alignment 4/ 4: 6.6866 2.51915 3.5831 5.96121 8.32436
Length 16, n 16, alignment 4/ 0: 6.58543 2.51529 3.38441 5.96909
8.03797
Length 16, n 16, alignment 0/ 4: 6.6541 1.87852 2.45328 5.96068 7.32961
Length 16, n 32, alignment 4/ 0: 9.37236 3.00744 5.92214 7.25884
11.1515
Length 32, n 16, alignment 0/ 4: 6.2795 1.87939 2.45688 5.96206 7.03327
Length 16, n 32, alignment 4/ 0: 9.24513 3.00344 5.97977 6.94778
11.0213
Length 32, n 16, alignment 0/ 4: 6.45422 1.87851 2.45698 5.96172
7.32939
Length 16, n 16, alignment 5/ 5: 6.53949 2.51619 3.88095 5.96091
9.05987
Length 16, n 16, alignment 5/ 5: 6.47371 2.51703 3.91695 5.96417
9.24674
Length 16, n 16, alignment 5/ 2: 6.5493 2.5163 3.78779 5.95898 9.44104
Length 16, n 16, alignment 2/ 5: 6.70967 2.52226 2.82034 5.96365
9.37646
Length 32, n 64, alignment 3/ 2: 14.0298 3.74521 6.80923 11.2825
12.8659
Length 64, n 32, alignment 2/ 3: 9.53123 2.75624 3.21242 8.51653
12.6887
Length 32, n 64, alignment 3/ 2: 14.179 3.83256 6.56898 11.3584 15.2479
Length 64, n 32, alignment 2/ 3: 9.53184 2.75305 3.21245 8.37087
14.1081
Length 16, n 16, alignment 6/ 6: 6.42159 2.51726 4.38574 5.9562 7.12266
Length 16, n 16, alignment 6/ 6: 6.67028 2.51692 4.2448 5.9544 7.81439
Length 16, n 16, alignment 6/ 4: 6.42402 2.51636 4.23817 5.96162
7.23351
Length 16, n 16, alignment 4/ 6: 6.60107 2.53036 3.54038 5.95837
8.32176
Length 64, n 128, alignment 2/ 4: 15.5573 4.80414 7.45917 11.5659
16.9298
Length 128, n 64, alignment 4/ 2: 11.6195 3.53279 4.80585 10.1583
11.6096
Length 64, n 128, alignment 2/ 4: 15.5233 4.7997 7.34679 11.6628 22.0123
Length 128, n 64, alignment 4/ 2: 11.6078 3.5492 4.77929 10.027 19.504
Length 16, n 16, alignment 7/ 7: 6.54515 2.5141 5.04928 5.95083 7.57587
Length 16, n 16, alignment 7/ 7: 7.00425 2.51299 5.06765 5.92888
8.25286
Length 16, n 16, alignment 7/ 6: 6.62954 2.51922 5.07189 6.02372
7.72968
Length 16, n 16, alignment 6/ 7: 6.34475 2.51841 4.36954 5.95968
7.78498
Length 128, n 256, alignment 1/ 6: 17.9386 7.60767 9.40348 16.5301
20.6134
Length 256, n 128, alignment 6/ 1: 13.373 4.84375 7.34616 12.3919 15.1296
Length 128, n 256, alignment 1/ 6: 17.9186 7.6077 9.37853 16.686 39.2821
Length 256, n 128, alignment 6/ 1: 13.3632 4.91799 8.06183 12.4174
34.1655
Length 8, n 16, alignment 0/ 0: 7.36981 2.22579 4.22739 4.9063 7.24636
Length 32, n 16, alignment 0/ 0: 6.43465 1.87932 2.45308 2.41526 7.1679
Length 8, n 16, alignment 7/ 2: 7.48861 2.21639 3.75708 5.35882
8.45777
Length 32, n 16, alignment 7/ 2: 7.03412 2.3535 5.04692 5.95484 7.25068
Length 16, n 32, alignment 0/ 0: 9.10177 3.06646 4.81682 4.41358
9.89656
Length 64, n 32, alignment 0/ 0: 8.57287 2.53847 2.94869 2.70506 8.2629
Length 16, n 32, alignment 6/ 4: 9.20906 3.04216 6.37553 9.46301
10.2489
Length 64, n 32, alignment 6/ 4: 9.73117 2.75023 4.49311 7.7856 9.59261
Length 32, n 64, alignment 0/ 0: 10.9253 3.80104 4.83111 4.97682
12.1086
Length 128, n 64, alignment 0/ 0: 9.26987 3.15895 3.49112 4.31372
10.1329
Length 32, n 64, alignment 5/ 6: 14.1856 3.78089 7.1768 9.63551 13.9944
Length 128, n 64, alignment 5/ 6: 11.5298 3.5249 5.07847 9.96481 12.8245
Length 64, n 128, alignment 0/ 0: 12.0142 4.73085 5.98759 7.1613 15.0462
Length 256, n 128, alignment 0/ 0: 7.96029 4.50244 6.44433 5.38248
11.6022
Length 64, n 128, alignment 4/ 0: 12.4223 4.80085 7.79294 11.0101
15.5277
Length 256, n 128, alignment 4/ 0: 12.2371 4.79242 6.83902 13.2758
16.0479
Length 128, n 256, alignment 0/ 0: 13.9165 7.28703 8.13319 8.79111
16.9101
Length 512, n 256, alignment 0/ 0: 10.5083 6.49881 9.05173 9.03139
19.6212
Length 128, n 256, alignment 3/ 2: 18.025 7.45493 9.86636 18.7234 20.5106
Length 512, n 256, alignment 3/ 2: 16.9588 7.07807 9.97969 23.4911
25.4407
Length 256, n 512, alignment 0/ 0: 17.6801 12.5811 15.3595 13.9989
28.5549
Length 1024, n 512, alignment 0/ 0: 16.379 10.7794 16.4748 16.7344 37.8286
Length 256, n 512, alignment 2/ 4: 23.2012 13.2761 14.3776 26.3752
31.6336
Length 1024, n 512, alignment 2/ 4: 25.4264 12.1716 17.2608 42.2122 47.425
Length 512, n 1024, alignment 0/ 0: 21.0239 23.0736 19.8285 21.0169
48.0091
Length 2048, n 1024, alignment 0/ 0: 28.424 19.323 36.917 35.4247 68.1661
Length 512, n 1024, alignment 1/ 6: 32.3159 24.2617 21.4919 46.5936 55.163
Length 2048, n 1024, alignment 1/ 6: 43.0359 21.6207 37.7643 77.5705
83.2998
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH v2 2/2] powerpc: Add optimized stpncpy for POWER9
2020-09-04 16:56 ` [PATCH v2 2/2] powerpc: Add optimized stpncpy " Raphael Moreira Zinsly
@ 2020-09-04 16:59 ` Raphael M Zinsly
2020-09-16 12:32 ` Matheus Castanho
2020-09-16 12:35 ` Matheus Castanho
1 sibling, 1 reply; 9+ messages in thread
From: Raphael M Zinsly @ 2020-09-04 16:59 UTC (permalink / raw)
To: libc-alpha
Benchtest output:
generic_stpncpy __stpncpy_power9
__stpncpy_power8 __stpncpy_power7 __stpncpy_ppc
Length 16, n 16, alignment 1/ 1: 6.55566 2.5481 2.74063 5.28665 9.96288
Length 16, n 16, alignment 1/ 1: 6.70016 2.54137 2.7108 4.77502 9.91703
Length 16, n 16, alignment 1/ 2: 6.55975 2.56295 2.70641 5.49298
9.59591
Length 16, n 16, alignment 2/ 1: 6.90759 2.52713 2.854 5.48949 9.37664
Length 2, n 4, alignment 7/ 2: 7.90969 2.22698 3.90151 4.6461 8.4503
Length 4, n 2, alignment 2/ 7: 6.14855 1.73403 2.67338 3.05675
6.86316
Length 2, n 4, alignment 7/ 2: 8.40868 2.22338 4.50838 4.51078
9.28489
Length 4, n 2, alignment 2/ 7: 6.14849 1.73402 2.67225 2.85349
6.34342
Length 16, n 16, alignment 2/ 2: 6.963 2.54442 2.87779 5.63547 9.85162
Length 16, n 16, alignment 2/ 2: 6.59452 2.54121 2.84662 5.57178
9.51406
Length 16, n 16, alignment 2/ 4: 6.79115 2.55835 2.84836 5.50427
9.67999
Length 16, n 16, alignment 4/ 2: 6.78419 2.54132 3.54229 5.52563
8.50938
Length 4, n 8, alignment 6/ 4: 8.45703 2.17266 4.80507 3.8714 9.04725
Length 8, n 4, alignment 4/ 6: 6.01753 1.73761 2.8185 2.41527 8.00051
Length 4, n 8, alignment 6/ 4: 7.82081 2.22612 4.80057 3.76103
8.99812
Length 8, n 4, alignment 4/ 6: 6.01752 1.73474 2.82089 2.41524
7.82703
Length 16, n 16, alignment 3/ 3: 6.78194 2.54143 3.21392 5.46447
8.90749
Length 16, n 16, alignment 3/ 3: 6.76324 2.54088 3.22883 5.39689
9.14749
Length 16, n 16, alignment 3/ 6: 7.05278 2.55795 3.22243 5.53422
9.11315
Length 16, n 16, alignment 6/ 3: 6.72881 2.54183 4.58459 5.51658
7.85006
Length 8, n 16, alignment 5/ 6: 7.67184 2.23969 4.13269 4.90728
10.2248
Length 16, n 8, alignment 6/ 5: 5.73672 1.88048 2.6693 4.35579 6.11674
Length 8, n 16, alignment 5/ 6: 7.51707 2.2284 3.67276 4.90637 10.2411
Length 16, n 8, alignment 6/ 5: 5.73665 1.88119 2.57514 3.96351
6.16253
Length 16, n 16, alignment 4/ 4: 7.03577 2.5415 3.66445 4.94157 8.98371
Length 16, n 16, alignment 4/ 4: 6.93549 2.53033 3.65577 5.53815
8.48335
Length 16, n 16, alignment 4/ 0: 6.95106 2.53483 3.48744 5.43759
8.45425
Length 16, n 16, alignment 0/ 4: 6.44601 1.87936 2.41984 5.49488
6.92169
Length 16, n 32, alignment 4/ 0: 9.2036 3.04122 5.78685 6.66434 10.9065
Length 32, n 16, alignment 0/ 4: 6.65504 1.87934 2.41817 6.08706
6.98513
Length 16, n 32, alignment 4/ 0: 9.17461 3.04153 5.77758 6.66444
10.8015
Length 32, n 16, alignment 0/ 4: 6.44123 1.87936 2.41847 5.55207
6.86039
Length 16, n 16, alignment 5/ 5: 6.56005 2.53132 4.22362 5.43527
9.25109
Length 16, n 16, alignment 5/ 5: 6.55552 2.53088 4.22655 5.59271
9.61369
Length 16, n 16, alignment 5/ 2: 6.55553 2.54559 4.31135 5.47438
8.83103
Length 16, n 16, alignment 2/ 5: 6.88992 2.56255 2.84059 5.23185
9.51441
Length 32, n 64, alignment 3/ 2: 12.5054 3.75138 6.42457 10.4719
15.0663
Length 64, n 32, alignment 2/ 3: 9.87185 2.78283 3.17042 7.66624 11.503
Length 32, n 64, alignment 3/ 2: 12.4999 3.74537 6.38161 10.4578
15.1104
Length 64, n 32, alignment 2/ 3: 9.86495 2.77889 3.19171 7.63272
13.9799
Length 16, n 16, alignment 6/ 6: 6.41353 2.5453 4.50915 5.30382 8.45391
Length 16, n 16, alignment 6/ 6: 6.49495 2.54119 4.54493 5.55909 8.1629
Length 16, n 16, alignment 6/ 4: 6.41743 2.54487 4.57202 4.98659
7.53033
Length 16, n 16, alignment 4/ 6: 6.91724 2.54649 3.67868 5.36838
8.45677
Length 64, n 128, alignment 2/ 4: 14.0687 4.93151 8.11667 11.4411
16.9533
Length 128, n 64, alignment 4/ 2: 11.7134 3.58948 4.90121 10.3018
11.6692
Length 64, n 128, alignment 2/ 4: 14.0677 4.93413 7.28129 11.439 22.2186
Length 128, n 64, alignment 4/ 2: 11.7149 3.59312 4.85286 10.3403
19.4651
Length 16, n 16, alignment 7/ 7: 6.76501 2.52563 5.55792 5.44155
8.39997
Length 16, n 16, alignment 7/ 7: 7.16923 2.5265 5.55148 5.60184 7.98311
Length 16, n 16, alignment 7/ 6: 6.76252 2.52629 5.48067 5.51161
7.61026
Length 16, n 16, alignment 6/ 7: 6.65772 2.5521 4.55758 5.48893 7.7301
Length 128, n 256, alignment 1/ 6: 16.2494 7.62034 9.3616 16.2888 19.7029
Length 256, n 128, alignment 6/ 1: 13.4311 4.94455 8.10802 12.2681
15.6941
Length 128, n 256, alignment 1/ 6: 16.2608 7.6209 9.35509 16.2856 38.0277
Length 256, n 128, alignment 6/ 1: 13.4327 4.89474 8.35934 12.2646
34.3268
Length 8, n 16, alignment 0/ 0: 7.20671 2.23256 3.75778 5.63555
7.36414
Length 32, n 16, alignment 0/ 0: 6.4449 1.88 2.41577 2.89598 6.42537
Length 8, n 16, alignment 7/ 2: 7.45976 2.21832 3.91671 4.6524 8.45825
Length 32, n 16, alignment 7/ 2: 6.78267 2.34296 5.59161 5.58598
6.88842
Length 16, n 32, alignment 0/ 0: 9.47971 3.10847 4.74758 4.75377
10.2238
Length 64, n 32, alignment 0/ 0: 8.45634 2.34747 2.59248 2.82356
9.42305
Length 16, n 32, alignment 6/ 4: 9.37784 3.05067 6.92384 9.47727
10.1826
Length 64, n 32, alignment 6/ 4: 9.89233 2.77968 4.63672 7.09838
10.2804
Length 32, n 64, alignment 0/ 0: 11.0813 3.71086 4.43777 5.3549 12.2048
Length 128, n 64, alignment 0/ 0: 9.25192 3.20123 3.53388 4.50794
10.1934
Length 32, n 64, alignment 5/ 6: 12.5099 3.75871 7.29613 9.64902
13.5821
Length 128, n 64, alignment 5/ 6: 11.6115 3.60165 5.71818 9.07288
12.7929
Length 64, n 128, alignment 0/ 0: 12.3671 4.80754 5.46926 6.84492
14.9238
Length 256, n 128, alignment 0/ 0: 8.08427 4.52607 6.47996 5.92086 11.701
Length 64, n 128, alignment 4/ 0: 12.5692 4.89717 7.11058 10.472 15.875
Length 256, n 128, alignment 4/ 0: 12.2945 4.94163 7.11645 12.3831
16.6219
Length 128, n 256, alignment 0/ 0: 13.8948 7.28911 7.78784 9.30215
17.0358
Length 512, n 256, alignment 0/ 0: 10.5266 6.56481 9.14202 9.31096
20.0531
Length 128, n 256, alignment 3/ 2: 16.3534 7.46332 9.90009 18.5282
19.5969
Length 512, n 256, alignment 3/ 2: 17.0519 7.09947 10.1635 23.5411
25.0043
Length 256, n 512, alignment 0/ 0: 15.8935 12.6195 14.0756 14.7553
28.5299
Length 1024, n 512, alignment 0/ 0: 16.3758 10.8028 16.5447 16.8966
37.8653
Length 256, n 512, alignment 2/ 4: 21.16 13.2779 14.3088 26.4475 30.1647
Length 1024, n 512, alignment 2/ 4: 25.3364 12.0899 17.5443 42.7216
47.5803
Length 512, n 1024, alignment 0/ 0: 20.5111 22.9782 19.6648 21.3857
42.4801
Length 2048, n 1024, alignment 0/ 0: 28.4023 19.1577 36.9065 35.4799
68.3555
Length 512, n 1024, alignment 1/ 6: 29.9694 24.3087 22.0513 46.7436
51.5908
Length 2048, n 1024, alignment 1/ 6: 42.9897 21.5402 38.739 78.3266 84.3956
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH v2 1/2] powerpc: Add optimized strncpy for POWER9
2020-09-04 16:56 [PATCH v2 1/2] powerpc: Add optimized strncpy for POWER9 Raphael Moreira Zinsly
2020-09-04 16:56 ` [PATCH v2 2/2] powerpc: Add optimized stpncpy " Raphael Moreira Zinsly
2020-09-04 16:59 ` [PATCH v2 1/2] powerpc: Add optimized strncpy " Raphael M Zinsly
@ 2020-09-16 12:24 ` Matheus Castanho
2 siblings, 0 replies; 9+ messages in thread
From: Matheus Castanho @ 2020-09-16 12:24 UTC (permalink / raw)
To: Raphael Moreira Zinsly, libc-alpha
On 9/4/20 1:56 PM, Raphael Moreira Zinsly via Libc-alpha wrote:
> Changes since v1:
> - Fixed comments indentation and added some spaces to improve
> readability.
> - Use "POWER 9 LE" instead of "PowerPC64/POWER9".
> - Fixed copyright dates.
> - Replaced cmpwi for cmpdi.
>
> ---8<---
>
> Similar to the strcpy P9 optimization, this version uses VSX to improve
> performance.
> ---
> sysdeps/powerpc/powerpc64/le/power9/strncpy.S | 281 ++++++++++++++++++
> sysdeps/powerpc/powerpc64/multiarch/Makefile | 2 +-
> .../powerpc64/multiarch/ifunc-impl-list.c | 5 +
> .../powerpc64/multiarch/strncpy-power9.S | 26 ++
> sysdeps/powerpc/powerpc64/multiarch/strncpy.c | 7 +
> 5 files changed, 320 insertions(+), 1 deletion(-)
> create mode 100644 sysdeps/powerpc/powerpc64/le/power9/strncpy.S
> create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strncpy-power9.S
>
> diff --git a/sysdeps/powerpc/powerpc64/le/power9/strncpy.S b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
> new file mode 100644
> index 0000000000..34fcdee913
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
> @@ -0,0 +1,281 @@
> +/* Optimized strncpy implementation for POWER9 LE.
> + Copyright (C) 2020 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#include <sysdep.h>
> +
> +# ifndef STRNCPY
> +# define FUNC_NAME strncpy
> +# else
> +# define FUNC_NAME STRNCPY
> +# endif
> +
> +/* Implements the function
> +
> + char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5])
> +
> + The implementation can load bytes past a null terminator, but only
> + up to the next 16-byte aligned address, so it never crosses a page. */
> +
> +.machine power9
> +ENTRY_TOCLESS (FUNC_NAME, 4)
> + CALL_MCOUNT 2
> +
> + /* NULL string optimizations */
> + cmpdi r5, 0
> + beqlr
> +
> + lbz r0,0(r4)
> + stb r0,0(r3)
> + addi r11,r3,1
> + addi r5,r5,-1
> + vspltisb v18,0 /* Zeroes in v18 */
> + cmpdi r0,0
> + beq L(zero_padding_loop)
> +
> + /* Empty/1-byte string optimization */
> + cmpdi r5,0
> + beqlr
> +
> + addi r4,r4,1
> + neg r7,r4
> + rldicl r9,r7,0,60 /* How many bytes to get source 16B aligned? */
> +
> + /* Get source 16B aligned */
> + lvx v0,0,r4
> + lvsr v1,0,r4
> + vperm v0,v18,v0,v1
> +
> + vcmpequb v6,v0,v18 /* 0xff if byte is NULL, 0x00 otherwise */
> + vctzlsbb r7,v6 /* Number of trailing zeroes */
> + addi r8,r7,1 /* Add null terminator */
> +
> + /* r8 = bytes including null
> + r9 = bytes to get source 16B aligned
> + if r8 > r9
> + no null, copy r9 bytes
> + else
> + there is a null, copy r8 bytes and return. */
> + cmpld r8,r9
> + bgt L(no_null)
> +
> + cmpld cr6,r8,r5 /* r8 <= n? */
> + ble cr6,L(null)
> +
> + sldi r10,r5,56 /* stxvl wants size in top 8 bits */
> + stxvl 32+v0,r11,r10 /* Partial store */
At first I was confused by this 32+vX syntax. Maybe we could consider
adding defines for VSX registers to sysdeps/powerpc/sysdep.h in the
future? This way we could refer to v0+32 as vs32, for example. But I
don't think this needs to be part of this patchset.
> +
> + blr
> +
> +L(null):
> + sldi r10,r8,56 /* stxvl wants size in top 8 bits */
> + stxvl 32+v0,r11,r10 /* Partial store */
> +
> + add r11,r11,r8
> + sub r5,r5,r8
> + b L(zero_padding_loop)
> +
> +L(no_null):
> + cmpld r9,r5 /* Check if length was reached. */
> + bge L(n_tail1)
> +
> + sldi r10,r9,56 /* stxvl wants size in top 8 bits */
> + stxvl 32+v0,r11,r10 /* Partial store */
> +
> + add r4,r4,r9
> + add r11,r11,r9
> + sub r5,r5,r9
> +
> +L(loop):
> + cmpldi cr6,r5,64 /* Check if length was reached. */
> + ble cr6,L(final_loop)
> +
> + lxv 32+v0,0(r4)
> + vcmpequb. v6,v0,v18 /* Any zero bytes? */
> + bne cr6,L(prep_tail1)
> +
> + lxv 32+v1,16(r4)
> + vcmpequb. v6,v1,v18 /* Any zero bytes? */
> + bne cr6,L(prep_tail2)
> +
> + lxv 32+v2,32(r4)
> + vcmpequb. v6,v2,v18 /* Any zero bytes? */
> + bne cr6,L(prep_tail3)
> +
> + lxv 32+v3,48(r4)
> + vcmpequb. v6,v3,v18 /* Any zero bytes? */
> + bne cr6,L(prep_tail4)
> +
> + stxv 32+v0,0(r11)
> + stxv 32+v1,16(r11)
> + stxv 32+v2,32(r11)
> + stxv 32+v3,48(r11)
> +
> + addi r4,r4,64
> + addi r11,r11,64
> + addi r5,r5,-64
> +
> + b L(loop)
> +
> +L(final_loop):
> + cmpldi cr5,r5,16
> + lxv 32+v0,0(r4)
> + vcmpequb. v6,v0,v18 /* Any zero bytes? */
> + ble cr5,L(prep_n_tail1)
> + bne cr6,L(count_tail1)
> + addi r5,r5,-16
> +
> + cmpldi cr5,r5,16
> + lxv 32+v1,16(r4)
> + vcmpequb. v6,v1,v18 /* Any zero bytes? */
> + ble cr5,L(prep_n_tail2)
> + bne cr6,L(count_tail2)
> + addi r5,r5,-16
> +
> + cmpldi cr5,r5,16
> + lxv 32+v2,32(r4)
> + vcmpequb. v6,v2,v18 /* Any zero bytes? */
> + ble cr5,L(prep_n_tail3)
> + bne cr6,L(count_tail3)
> + addi r5,r5,-16
> +
> + lxv 32+v3,48(r4)
> + vcmpequb. v6,v3,v18 /* Any zero bytes? */
> + beq cr6,L(n_tail4)
> +
> + vctzlsbb r8,v6 /* Number of trailing zeroes */
> + cmpld r8,r5 /* r8 < n? */
> + blt L(tail4)
> +
> +L(n_tail4):
> + stxv 32+v0,0(r11)
> + stxv 32+v1,16(r11)
> + stxv 32+v2,32(r11)
> + sldi r10,r5,56 /* stxvl wants size in top 8 bits */
> + addi r11,r11,48 /* Offset */
> + stxvl 32+v3,r11,r10 /* Partial store */
> + blr
> +
> +L(prep_n_tail1):
> + beq cr6,L(n_tail1) /* Any zero bytes? */
> + vctzlsbb r8,v6 /* Number of trailing zeroes */
> + cmpld r8,r5 /* r8 < n? */
> + blt L(tail1)
> +
> +L(n_tail1):
> + sldi r10,r5,56 /* stxvl wants size in top 8 bits */
> + stxvl 32+v0,r11,r10 /* Partial store */
> + blr
> +
> +L(prep_n_tail2):
> + beq cr6,L(n_tail2) /* Any zero bytes? */
> + vctzlsbb r8,v6 /* Number of trailing zeroes */
> + cmpld r8,r5 /* r8 < n? */
> + blt L(tail2)
> +
> +L(n_tail2):
> + stxv 32+v0,0(r11)
> + sldi r10,r5,56 /* stxvl wants size in top 8 bits */
> + addi r11,r11,16 /* offset */
> + stxvl 32+v1,r11,r10 /* Partial store */
> + blr
> +
> +L(prep_n_tail3):
> + beq cr6,L(n_tail3) /* Any zero bytes? */
> + vctzlsbb r8,v6 /* Number of trailing zeroes */
> + cmpld r8,r5 /* r8 < n? */
> + blt L(tail3)
> +
> +L(n_tail3):
> + stxv 32+v0,0(r11)
> + stxv 32+v1,16(r11)
> + sldi r10,r5,56 /* stxvl wants size in top 8 bits */
> + addi r11,r11,32 /* Offset */
> + stxvl 32+v2,r11,r10 /* Partial store */
> + blr
> +
> +L(prep_tail1):
> +L(count_tail1):
> + vctzlsbb r8,v6 /* Number of trailing zeroes */
> +L(tail1):
> + addi r9,r8,1 /* Add null terminator */
> + sldi r10,r9,56 /* stxvl wants size in top 8 bits */
> + stxvl 32+v0,r11,r10 /* Partial store */
> + add r11,r11,r9
> + sub r5,r5,r9
> + b L(zero_padding_loop)
> +
> +L(prep_tail2):
> + addi r5,r5,-16
> +L(count_tail2):
> + vctzlsbb r8,v6 /* Number of trailing zeroes */
> +L(tail2):
> + addi r9,r8,1 /* Add null terminator */
> + stxv 32+v0,0(r11)
> + sldi r10,r9,56 /* stxvl wants size in top 8 bits */
> + addi r11,r11,16 /* offset */
> + stxvl 32+v1,r11,r10 /* Partial store */
> + add r11,r11,r9
> + sub r5,r5,r9
> + b L(zero_padding_loop)
> +
> +L(prep_tail3):
> + addi r5,r5,-32
> +L(count_tail3):
> + vctzlsbb r8,v6 /* Number of trailing zeroes */
> +L(tail3):
> + addi r9,r8,1 /* Add null terminator */
> + stxv 32+v0,0(r11)
> + stxv 32+v1,16(r11)
> + sldi r10,r9,56 /* stxvl wants size in top 8 bits */
> + addi r11,r11,32 /* offset */
> + stxvl 32+v2,r11,r10 /* Partial store */
> + add r11,r11,r9
> + sub r5,r5,r9
> + b L(zero_padding_loop)
> +
> +L(prep_tail4):
> + addi r5,r5,-48
> + vctzlsbb r8,v6 /* Number of trailing zeroes */
> +L(tail4):
> + addi r9,r8,1 /* Add null terminator */
> + stxv 32+v0,0(r11)
> + stxv 32+v1,16(r11)
> + stxv 32+v2,32(r11)
> + sldi r10,r9,56 /* stxvl wants size in top 8 bits */
> + addi r11,r11,48 /* offset */
> + stxvl 32+v3,r11,r10 /* Partial store */
> + add r11,r11,r9
> + sub r5,r5,r9
> +
> +/* This code pads the remainder of dest with NULL bytes. */
> +L(zero_padding_loop):
> + cmpldi cr6,r5,16 /* Check if length was reached. */
> + ble cr6,L(zero_padding_end)
> +
> + stxv v18,0(r11)
> + addi r11,r11,16
> + addi r5,r5,-16
> +
> + b L(zero_padding_loop)
> +
> +L(zero_padding_end):
> + sldi r10,r5,56 /* stxvl wants size in top 8 bits */
> + stxvl v18,r11,r10 /* Partial store */
> + blr
> +
The logic looks good. I tried to find a way to reuse some code, as there
are many similar blocks (e.g. tail* blocks). But their slight
differences make it hard to reuse anything.
> +END (FUNC_NAME)
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
> index 19acb6c64a..cd2b47b403 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
> +++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
> @@ -33,7 +33,7 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \
>
> ifneq (,$(filter %le,$(config-machine)))
> sysdep_routines += strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \
> - rawmemchr-power9 strlen-power9
> + rawmemchr-power9 strlen-power9 strncpy-power9
> endif
> CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
> CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
> index ea10b00417..aa63e1c23f 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
> @@ -301,6 +301,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>
> /* Support sysdeps/powerpc/powerpc64/multiarch/strncpy.c. */
> IFUNC_IMPL (i, name, strncpy,
> +#ifdef __LITTLE_ENDIAN__
> + IFUNC_IMPL_ADD (array, i, strncpy,
> + hwcap2 & PPC_FEATURE2_ARCH_3_00,
> + __strncpy_power9)
> +#endif
> IFUNC_IMPL_ADD (array, i, strncpy,
> hwcap2 & PPC_FEATURE2_ARCH_2_07,
> __strncpy_power8)
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncpy-power9.S b/sysdeps/powerpc/powerpc64/multiarch/strncpy-power9.S
> new file mode 100644
> index 0000000000..ab7c570d54
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/multiarch/strncpy-power9.S
> @@ -0,0 +1,26 @@
> +/* Optimized strncpy implementation for POWER9 LE.
> + Copyright (C) 2020 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#if defined __LITTLE_ENDIAN__ && IS_IN (libc)
> +#define STRNCPY __strncpy_power9
> +
> +#undef libc_hidden_builtin_def
> +#define libc_hidden_builtin_def(name)
> +
> +#include <sysdeps/powerpc/powerpc64/le/power9/strncpy.S>
> +#endif
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncpy.c b/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
> index 7bacf28aca..8ef0a99cb5 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
> +++ b/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
> @@ -28,11 +28,18 @@
> extern __typeof (strncpy) __strncpy_ppc attribute_hidden;
> extern __typeof (strncpy) __strncpy_power7 attribute_hidden;
> extern __typeof (strncpy) __strncpy_power8 attribute_hidden;
> +# ifdef __LITTLE_ENDIAN__
> +extern __typeof (strncpy) __strncpy_power9 attribute_hidden;
> +# endif
> # undef strncpy
>
> /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
> ifunc symbol properly. */
> libc_ifunc_redirected (__redirect_strncpy, strncpy,
> +# ifdef __LITTLE_ENDIAN__
> + (hwcap2 & PPC_FEATURE2_ARCH_3_00)
> + ? __strncpy_power9 :
> +# endif
> (hwcap2 & PPC_FEATURE2_ARCH_2_07)
> ? __strncpy_power8
> : (hwcap & PPC_FEATURE_HAS_VSX)
>
--
The only thing missing now seems to be the .machine power9 issue that
was pointed out in v1.
Otherwise, LGTM.
Reviewed-by: Matheus Castanho <msc@linux.ibm.com>
--
Matheus Castanho
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH v2 2/2] powerpc: Add optimized stpncpy for POWER9
2020-09-04 16:59 ` Raphael M Zinsly
@ 2020-09-16 12:32 ` Matheus Castanho
2020-09-16 12:56 ` Raphael M Zinsly
0 siblings, 1 reply; 9+ messages in thread
From: Matheus Castanho @ 2020-09-16 12:32 UTC (permalink / raw)
To: Raphael M Zinsly, libc-alpha
On 9/4/20 1:59 PM, Raphael M Zinsly via Libc-alpha wrote:
> Benchtest output:
> generic_stpncpy __stpncpy_power9 __stpncpy_power8 __stpncpy_power7 __stpncpy_ppc
<snip>
> Length 512, n 1024, alignment 0/ 0: 20.5111 22.9782 19.6648 21.3857 42.4801
<snip>
> Length 512, n 1024, alignment 1/ 6: 29.9694 24.3087 22.0513 46.7436 51.5908
These two seem to be the only cases in which the power9 version loses to
the power8 one. Have you investigated what happens in these two specific
cases?
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH v2 2/2] powerpc: Add optimized stpncpy for POWER9
2020-09-04 16:56 ` [PATCH v2 2/2] powerpc: Add optimized stpncpy " Raphael Moreira Zinsly
2020-09-04 16:59 ` Raphael M Zinsly
@ 2020-09-16 12:35 ` Matheus Castanho
1 sibling, 0 replies; 9+ messages in thread
From: Matheus Castanho @ 2020-09-16 12:35 UTC (permalink / raw)
To: Raphael Moreira Zinsly, libc-alpha
On 9/4/20 1:56 PM, Raphael Moreira Zinsly via Libc-alpha wrote:
> Add stpncpy support into the POWER9 strncpy.
> ---
> sysdeps/powerpc/powerpc64/le/power9/stpncpy.S | 24 +++++++
> sysdeps/powerpc/powerpc64/le/power9/strncpy.S | 65 +++++++++++++++++++
> sysdeps/powerpc/powerpc64/multiarch/Makefile | 2 +-
> .../powerpc64/multiarch/ifunc-impl-list.c | 5 ++
> .../powerpc64/multiarch/stpncpy-power9.S | 24 +++++++
> sysdeps/powerpc/powerpc64/multiarch/stpncpy.c | 7 ++
> 6 files changed, 126 insertions(+), 1 deletion(-)
> create mode 100644 sysdeps/powerpc/powerpc64/le/power9/stpncpy.S
> create mode 100644 sysdeps/powerpc/powerpc64/multiarch/stpncpy-power9.S
>
> diff --git a/sysdeps/powerpc/powerpc64/le/power9/stpncpy.S b/sysdeps/powerpc/powerpc64/le/power9/stpncpy.S
> new file mode 100644
> index 0000000000..81d9673d8b
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/le/power9/stpncpy.S
> @@ -0,0 +1,24 @@
> +/* Optimized stpncpy implementation for POWER9 LE.
> + Copyright (C) 2020 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#define USE_AS_STPNCPY
> +#include <sysdeps/powerpc/powerpc64/le/power9/strncpy.S>
> +
> +weak_alias (__stpncpy, stpncpy)
> +libc_hidden_def (__stpncpy)
> +libc_hidden_builtin_def (stpncpy)
> diff --git a/sysdeps/powerpc/powerpc64/le/power9/strncpy.S b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
> index 34fcdee913..f7265b11ec 100644
> --- a/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
> +++ b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
> @@ -18,16 +18,30 @@
>
> #include <sysdep.h>
>
> +#ifdef USE_AS_STPNCPY
> +# ifndef STPNCPY
> +# define FUNC_NAME __stpncpy
> +# else
> +# define FUNC_NAME STPNCPY
> +# endif
> +#else
> # ifndef STRNCPY
> # define FUNC_NAME strncpy
> # else
> # define FUNC_NAME STRNCPY
> # endif
> +#endif /* !USE_AS_STPNCPY */
>
> /* Implements the function
>
> char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5])
>
> + or
> +
> + char * [r3] stpncpy (char *dest [r3], const char *src [r4], size_t n [r5])
> +
> + if USE_AS_STPNCPY is defined.
> +
> The implementation can load bytes past a null terminator, but only
> up to the next 16-byte aligned address, so it never crosses a page. */
>
> @@ -49,7 +63,15 @@ ENTRY_TOCLESS (FUNC_NAME, 4)
>
> /* Empty/1-byte string optimization */
> cmpdi r5,0
> +#ifdef USE_AS_STPNCPY
> + bgt L(cont)
> + /* Compute pointer to last byte copied into dest. */
> + addi r3,r3,1
> + blr
> +L(cont):
> +#else
> beqlr
> +#endif
>
> addi r4,r4,1
> neg r7,r4
> @@ -79,12 +101,20 @@ ENTRY_TOCLESS (FUNC_NAME, 4)
> sldi r10,r5,56 /* stxvl wants size in top 8 bits */
> stxvl 32+v0,r11,r10 /* Partial store */
>
> +#ifdef USE_AS_STPNCPY
> + /* Compute pointer to last byte copied into dest. */
> + add r3,r11,r5
> +#endif
> blr
>
> L(null):
> sldi r10,r8,56 /* stxvl wants size in top 8 bits */
> stxvl 32+v0,r11,r10 /* Partial store */
>
> +#ifdef USE_AS_STPNCPY
> + /* Compute pointer to last byte copied into dest. */
> + add r3,r11,r7
> +#endif
> add r11,r11,r8
> sub r5,r5,r8
> b L(zero_padding_loop)
> @@ -168,6 +198,10 @@ L(n_tail4):
> sldi r10,r5,56 /* stxvl wants size in top 8 bits */
> addi r11,r11,48 /* Offset */
> stxvl 32+v3,r11,r10 /* Partial store */
> +#ifdef USE_AS_STPNCPY
> + /* Compute pointer to last byte copied into dest. */
> + add r3,r11,r5
> +#endif
> blr
>
> L(prep_n_tail1):
> @@ -179,6 +213,10 @@ L(prep_n_tail1):
> L(n_tail1):
> sldi r10,r5,56 /* stxvl wants size in top 8 bits */
> stxvl 32+v0,r11,r10 /* Partial store */
> +#ifdef USE_AS_STPNCPY
> + /* Compute pointer to last byte copied into dest. */
> + add r3,r11,r5
> +#endif
> blr
>
> L(prep_n_tail2):
> @@ -192,6 +230,10 @@ L(n_tail2):
> sldi r10,r5,56 /* stxvl wants size in top 8 bits */
> addi r11,r11,16 /* offset */
> stxvl 32+v1,r11,r10 /* Partial store */
> +#ifdef USE_AS_STPNCPY
> + /* Compute pointer to last byte copied into dest. */
> + add r3,r11,r5
> +#endif
> blr
>
> L(prep_n_tail3):
> @@ -206,6 +248,10 @@ L(n_tail3):
> sldi r10,r5,56 /* stxvl wants size in top 8 bits */
> addi r11,r11,32 /* Offset */
> stxvl 32+v2,r11,r10 /* Partial store */
> +#ifdef USE_AS_STPNCPY
> + /* Compute pointer to last byte copied into dest. */
> + add r3,r11,r5
> +#endif
> blr
>
> L(prep_tail1):
> @@ -215,6 +261,10 @@ L(tail1):
> addi r9,r8,1 /* Add null terminator */
> sldi r10,r9,56 /* stxvl wants size in top 8 bits */
> stxvl 32+v0,r11,r10 /* Partial store */
> +#ifdef USE_AS_STPNCPY
> + /* Compute pointer to last byte copied into dest. */
> + add r3,r11,r8
> +#endif
> add r11,r11,r9
> sub r5,r5,r9
> b L(zero_padding_loop)
> @@ -229,6 +279,10 @@ L(tail2):
> sldi r10,r9,56 /* stxvl wants size in top 8 bits */
> addi r11,r11,16 /* offset */
> stxvl 32+v1,r11,r10 /* Partial store */
> +#ifdef USE_AS_STPNCPY
> + /* Compute pointer to last byte copied into dest. */
> + add r3,r11,r8
> +#endif
> add r11,r11,r9
> sub r5,r5,r9
> b L(zero_padding_loop)
> @@ -244,6 +298,10 @@ L(tail3):
> sldi r10,r9,56 /* stxvl wants size in top 8 bits */
> addi r11,r11,32 /* offset */
> stxvl 32+v2,r11,r10 /* Partial store */
> +#ifdef USE_AS_STPNCPY
> + /* Compute pointer to last byte copied into dest. */
> + add r3,r11,r8
> +#endif
> add r11,r11,r9
> sub r5,r5,r9
> b L(zero_padding_loop)
> @@ -259,6 +317,10 @@ L(tail4):
> sldi r10,r9,56 /* stxvl wants size in top 8 bits */
> addi r11,r11,48 /* offset */
> stxvl 32+v3,r11,r10 /* Partial store */
> +#ifdef USE_AS_STPNCPY
> + /* Compute pointer to last byte copied into dest. */
> + add r3,r11,r8
> +#endif
> add r11,r11,r9
> sub r5,r5,r9
>
> @@ -279,3 +341,6 @@ L(zero_padding_end):
> blr
>
> END (FUNC_NAME)
> +#ifndef USE_AS_STPNCPY
> +libc_hidden_builtin_def (strncpy)
> +#endif
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
> index cd2b47b403..f46bf50732 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
> +++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
> @@ -33,7 +33,7 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \
>
> ifneq (,$(filter %le,$(config-machine)))
> sysdep_routines += strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \
> - rawmemchr-power9 strlen-power9 strncpy-power9
> + rawmemchr-power9 strlen-power9 strncpy-power9 stpncpy-power9
> endif
> CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
> CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
> index aa63e1c23f..56790bcfe3 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
> @@ -317,6 +317,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>
> /* Support sysdeps/powerpc/powerpc64/multiarch/stpncpy.c. */
> IFUNC_IMPL (i, name, stpncpy,
> +#ifdef __LITTLE_ENDIAN__
> + IFUNC_IMPL_ADD (array, i, stpncpy,
> + hwcap2 & PPC_FEATURE2_ARCH_3_00,
> + __stpncpy_power9)
> +#endif
> IFUNC_IMPL_ADD (array, i, stpncpy,
> hwcap2 & PPC_FEATURE2_ARCH_2_07,
> __stpncpy_power8)
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power9.S b/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power9.S
> new file mode 100644
> index 0000000000..ccbab55c31
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power9.S
> @@ -0,0 +1,24 @@
> +/* Optimized stpncpy implementation for POWER9 LE.
> + Copyright (C) 2020 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#define STPNCPY __stpncpy_power9
> +
> +#undef libc_hidden_builtin_def
> +#define libc_hidden_builtin_def(name)
> +
> +#include <sysdeps/powerpc/powerpc64/le/power9/stpncpy.S>
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c b/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
> index 17df886431..ac17b26650 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
> +++ b/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
> @@ -26,10 +26,17 @@
> extern __typeof (__stpncpy) __stpncpy_ppc attribute_hidden;
> extern __typeof (__stpncpy) __stpncpy_power7 attribute_hidden;
> extern __typeof (__stpncpy) __stpncpy_power8 attribute_hidden;
> +# ifdef __LITTLE_ENDIAN__
> +extern __typeof (__stpncpy) __stpncpy_power9 attribute_hidden;
> +# endif
> # undef stpncpy
> # undef __stpncpy
>
> libc_ifunc_redirected (__redirect___stpncpy, __stpncpy,
> +# ifdef __LITTLE_ENDIAN__
> + (hwcap2 & PPC_FEATURE2_ARCH_3_00)
> + ? __stpncpy_power9 :
> +# endif
> (hwcap2 & PPC_FEATURE2_ARCH_2_07)
> ? __stpncpy_power8
> : (hwcap & PPC_FEATURE_HAS_VSX)
>
LGTM.
Reviewed-by: Matheus Castanho <msc@linux.ibm.com>
--
Matheus Castanho
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH v2 2/2] powerpc: Add optimized stpncpy for POWER9
2020-09-16 12:32 ` Matheus Castanho
@ 2020-09-16 12:56 ` Raphael M Zinsly
2020-09-18 15:53 ` Paul A. Clarke
0 siblings, 1 reply; 9+ messages in thread
From: Raphael M Zinsly @ 2020-09-16 12:56 UTC (permalink / raw)
To: Matheus Castanho, libc-alpha
Hi Matheus,
On 16/09/2020 09:32, Matheus Castanho wrote:
> On 9/4/20 1:59 PM, Raphael M Zinsly via Libc-alpha wrote:
>> Benchtest output:
>> generic_stpncpy __stpncpy_power9 __stpncpy_power8 __stpncpy_power7 __stpncpy_ppc
> <snip>
>> Length 512, n 1024, alignment 0/ 0: 20.5111 22.9782 19.6648 21.3857 42.4801
> <snip>
>> Length 512, n 1024, alignment 1/ 6: 29.9694 24.3087 22.0513 46.7436 51.5908
>
> These two seem to be the only cases in which the power9 version loses to
> the power8 one. Have you investigated what happens in these two specific
> cases?
>
Yes, the power8 optimization calls memset to do the zero padding at the
end if n > length. In this case, where n is way higher, memset is faster
than the loop used in my implementation.
Thanks for the review!
Regards,
--
Raphael Moreira Zinsly
IBM
Linux on Power Toolchain
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH v2 2/2] powerpc: Add optimized stpncpy for POWER9
2020-09-16 12:56 ` Raphael M Zinsly
@ 2020-09-18 15:53 ` Paul A. Clarke
0 siblings, 0 replies; 9+ messages in thread
From: Paul A. Clarke @ 2020-09-18 15:53 UTC (permalink / raw)
To: Raphael M Zinsly; +Cc: Matheus Castanho, libc-alpha
On Wed, Sep 16, 2020 at 09:56:59AM -0300, Raphael M Zinsly via Libc-alpha wrote:
> On 16/09/2020 09:32, Matheus Castanho wrote:
> > On 9/4/20 1:59 PM, Raphael M Zinsly via Libc-alpha wrote:
> > > Benchtest output:
> > > generic_stpncpy __stpncpy_power9 __stpncpy_power8 __stpncpy_power7 __stpncpy_ppc
> > <snip>
> > > Length 512, n 1024, alignment 0/ 0: 20.5111 22.9782 19.6648 21.3857 42.4801
> > <snip>
> > > Length 512, n 1024, alignment 1/ 6: 29.9694 24.3087 22.0513 46.7436 51.5908
> >
> > These two seem to be the only cases in which the power9 version loses to
> > the power8 one. Have you investigated what happens in these two specific
> > cases?
> >
> Yes the power8 optimization calls memset to do the zero padding at the end
> if n > length. In this case where n is way higher, memset is faster than the
> loop used in my implementation.
Is there some sort of threshold that would help these cases by transitioning
to memset (or replicating the relevant part of that code here)?
PC
^ permalink raw reply [flat|nested] 9+ messages in thread
end of thread, other threads:[~2020-09-18 15:53 UTC | newest]
Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-09-04 16:56 [PATCH v2 1/2] powerpc: Add optimized strncpy for POWER9 Raphael Moreira Zinsly
2020-09-04 16:56 ` [PATCH v2 2/2] powerpc: Add optimized stpncpy " Raphael Moreira Zinsly
2020-09-04 16:59 ` Raphael M Zinsly
2020-09-16 12:32 ` Matheus Castanho
2020-09-16 12:56 ` Raphael M Zinsly
2020-09-18 15:53 ` Paul A. Clarke
2020-09-16 12:35 ` Matheus Castanho
2020-09-04 16:59 ` [PATCH v2 1/2] powerpc: Add optimized strncpy " Raphael M Zinsly
2020-09-16 12:24 ` Matheus Castanho
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).