public inbox for libc-alpha@sourceware.org
* [PATCH v2 1/2] powerpc: Add optimized strncpy for POWER9
@ 2020-09-04 16:56 Raphael Moreira Zinsly
  2020-09-04 16:56 ` [PATCH v2 2/2] powerpc: Add optimized stpncpy " Raphael Moreira Zinsly
                   ` (2 more replies)
  0 siblings, 3 replies; 9+ messages in thread
From: Raphael Moreira Zinsly @ 2020-09-04 16:56 UTC (permalink / raw)
  To: libc-alpha; +Cc: murphyp, pc, tuliom, Raphael Moreira Zinsly

Changes since v1:
	- Fixed comment indentation and added some spaces to improve
	  readability.
	- Use "POWER9 LE" instead of "PowerPC64/POWER9".
	- Fixed copyright dates.
	- Replaced cmpwi with cmpdi.

---8<---

Similar to the strcpy P9 optimization, this version uses VSX to improve
performance.
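
For reference, a plain C sketch of the semantics the assembly has to
preserve (this just restates the ISO C strncpy behaviour; it is not part
of the patch):

#include <stddef.h>

/* Reference-only sketch: copy at most n bytes from src, stop at the
   first null byte, and pad the remainder of dest with null bytes.
   The patch does the same work with 16-byte VSX loads/stores; the
   padding part corresponds to L(zero_padding_loop) below.  */
char *
strncpy_reference (char *dest, const char *src, size_t n)
{
  size_t i = 0;
  for (; i < n && src[i] != '\0'; i++)
    dest[i] = src[i];
  for (; i < n; i++)
    dest[i] = '\0';
  return dest;
}
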
---
 sysdeps/powerpc/powerpc64/le/power9/strncpy.S | 281 ++++++++++++++++++
 sysdeps/powerpc/powerpc64/multiarch/Makefile  |   2 +-
 .../powerpc64/multiarch/ifunc-impl-list.c     |   5 +
 .../powerpc64/multiarch/strncpy-power9.S      |  26 ++
 sysdeps/powerpc/powerpc64/multiarch/strncpy.c |   7 +
 5 files changed, 320 insertions(+), 1 deletion(-)
 create mode 100644 sysdeps/powerpc/powerpc64/le/power9/strncpy.S
 create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strncpy-power9.S

diff --git a/sysdeps/powerpc/powerpc64/le/power9/strncpy.S b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
new file mode 100644
index 0000000000..34fcdee913
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
@@ -0,0 +1,281 @@
+/* Optimized strncpy implementation for POWER9 LE.
+   Copyright (C) 2020 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+# ifndef STRNCPY
+#  define FUNC_NAME strncpy
+# else
+#  define FUNC_NAME STRNCPY
+# endif
+
+/* Implements the function
+
+   char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5])
+
+   The implementation can load bytes past a null terminator, but only
+   up to the next 16-byte aligned address, so it never crosses a page.  */
+
+.machine power9
+ENTRY_TOCLESS (FUNC_NAME, 4)
+	CALL_MCOUNT 2
+
+	/* NULL string optimizations  */
+	cmpdi   r5, 0
+	beqlr
+
+	lbz	r0,0(r4)
+	stb	r0,0(r3)
+	addi	r11,r3,1
+	addi	r5,r5,-1
+	vspltisb v18,0		/* Zeroes in v18  */
+	cmpdi	r0,0
+	beq	L(zero_padding_loop)
+
+	/* Empty/1-byte string optimization  */
+	cmpdi	r5,0
+	beqlr
+
+	addi	r4,r4,1
+	neg	r7,r4
+	rldicl	r9,r7,0,60	/* How many bytes to get source 16B aligned?  */
+
+	/* Get source 16B aligned  */
+	lvx	v0,0,r4
+	lvsr	v1,0,r4
+	vperm	v0,v18,v0,v1
+
+	vcmpequb v6,v0,v18	/* 0xff if byte is NULL, 0x00 otherwise  */
+	vctzlsbb r7,v6		/* Number of trailing zeroes  */
+	addi	r8,r7,1		/* Add null terminator  */
+
+	/* r8 = bytes including null
+	   r9 = bytes to get source 16B aligned
+	   if r8 > r9
+	      no null, copy r9 bytes
+	   else
+	      there is a null, copy r8 bytes and return.  */
+	cmpld	r8,r9
+	bgt	L(no_null)
+
+	cmpld	cr6,r8,r5	/* r8 <= n?  */
+	ble	cr6,L(null)
+
+	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
+	stxvl	32+v0,r11,r10	/* Partial store  */
+
+	blr
+
+L(null):
+	sldi	r10,r8,56	/* stxvl wants size in top 8 bits  */
+	stxvl	32+v0,r11,r10	/* Partial store  */
+
+	add	r11,r11,r8
+	sub	r5,r5,r8
+	b L(zero_padding_loop)
+
+L(no_null):
+	cmpld	r9,r5		/* Check if length was reached.  */
+	bge	L(n_tail1)
+
+	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
+	stxvl	32+v0,r11,r10	/* Partial store  */
+
+	add	r4,r4,r9
+	add	r11,r11,r9
+	sub	r5,r5,r9
+
+L(loop):
+	cmpldi	cr6,r5,64	/* Check if length was reached.  */
+	ble	cr6,L(final_loop)
+
+	lxv	32+v0,0(r4)
+	vcmpequb. v6,v0,v18	/* Any zero bytes?  */
+	bne	cr6,L(prep_tail1)
+
+	lxv	32+v1,16(r4)
+	vcmpequb. v6,v1,v18	/* Any zero bytes?  */
+	bne	cr6,L(prep_tail2)
+
+	lxv	32+v2,32(r4)
+	vcmpequb. v6,v2,v18	/* Any zero bytes?  */
+	bne	cr6,L(prep_tail3)
+
+	lxv	32+v3,48(r4)
+	vcmpequb. v6,v3,v18	/* Any zero bytes?  */
+	bne	cr6,L(prep_tail4)
+
+	stxv	32+v0,0(r11)
+	stxv	32+v1,16(r11)
+	stxv	32+v2,32(r11)
+	stxv	32+v3,48(r11)
+
+	addi	r4,r4,64
+	addi	r11,r11,64
+	addi	r5,r5,-64
+
+	b	L(loop)
+
+L(final_loop):
+	cmpldi	cr5,r5,16
+	lxv	32+v0,0(r4)
+	vcmpequb. v6,v0,v18	/* Any zero bytes?  */
+	ble	cr5,L(prep_n_tail1)
+	bne	cr6,L(count_tail1)
+	addi	r5,r5,-16
+
+	cmpldi	cr5,r5,16
+	lxv	32+v1,16(r4)
+	vcmpequb. v6,v1,v18	/* Any zero bytes?  */
+	ble	cr5,L(prep_n_tail2)
+	bne	cr6,L(count_tail2)
+	addi	r5,r5,-16
+
+	cmpldi	cr5,r5,16
+	lxv	32+v2,32(r4)
+	vcmpequb. v6,v2,v18	/* Any zero bytes?  */
+	ble	cr5,L(prep_n_tail3)
+	bne	cr6,L(count_tail3)
+	addi	r5,r5,-16
+
+	lxv	32+v3,48(r4)
+	vcmpequb. v6,v3,v18	/* Any zero bytes?  */
+	beq	cr6,L(n_tail4)
+
+	vctzlsbb r8,v6		/* Number of trailing zeroes  */
+	cmpld	r8,r5		/* r8 < n?  */
+	blt	L(tail4)
+
+L(n_tail4):
+	stxv	32+v0,0(r11)
+	stxv	32+v1,16(r11)
+	stxv	32+v2,32(r11)
+	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
+	addi	r11,r11,48	/* Offset */
+	stxvl	32+v3,r11,r10	/* Partial store  */
+	blr
+
+L(prep_n_tail1):
+	beq	cr6,L(n_tail1)	/* Any zero bytes?  */
+	vctzlsbb r8,v6		/* Number of trailing zeroes  */
+	cmpld	r8,r5		/* r8 < n?  */
+	blt	L(tail1)
+
+L(n_tail1):
+	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
+	stxvl	32+v0,r11,r10	/* Partial store  */
+	blr
+
+L(prep_n_tail2):
+	beq	cr6,L(n_tail2)	/* Any zero bytes?  */
+	vctzlsbb r8,v6		/* Number of trailing zeroes  */
+	cmpld	r8,r5		/* r8 < n?  */
+	blt	L(tail2)
+
+L(n_tail2):
+	stxv	32+v0,0(r11)
+	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
+	addi	r11,r11,16	/* offset */
+	stxvl	32+v1,r11,r10	/* Partial store  */
+	blr
+
+L(prep_n_tail3):
+	beq	cr6,L(n_tail3)	/* Any zero bytes?  */
+	vctzlsbb r8,v6		/* Number of trailing zeroes  */
+	cmpld	r8,r5		/* r8 < n?  */
+	blt	L(tail3)
+
+L(n_tail3):
+	stxv	32+v0,0(r11)
+	stxv	32+v1,16(r11)
+	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
+	addi	r11,r11,32	/* Offset */
+	stxvl	32+v2,r11,r10	/* Partial store  */
+	blr
+
+L(prep_tail1):
+L(count_tail1):
+	vctzlsbb r8,v6		/* Number of trailing zeroes  */
+L(tail1):
+	addi	r9,r8,1		/* Add null terminator  */
+	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
+	stxvl	32+v0,r11,r10	/* Partial store  */
+	add	r11,r11,r9
+	sub	r5,r5,r9
+	b L(zero_padding_loop)
+
+L(prep_tail2):
+	addi	r5,r5,-16
+L(count_tail2):
+	vctzlsbb r8,v6		/* Number of trailing zeroes  */
+L(tail2):
+	addi	r9,r8,1		/* Add null terminator  */
+	stxv	32+v0,0(r11)
+	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
+	addi	r11,r11,16	/* offset */
+	stxvl	32+v1,r11,r10	/* Partial store  */
+	add	r11,r11,r9
+	sub	r5,r5,r9
+	b L(zero_padding_loop)
+
+L(prep_tail3):
+	addi	r5,r5,-32
+L(count_tail3):
+	vctzlsbb r8,v6		/* Number of trailing zeroes  */
+L(tail3):
+	addi	r9,r8,1		/* Add null terminator  */
+	stxv	32+v0,0(r11)
+	stxv	32+v1,16(r11)
+	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
+	addi	r11,r11,32	/* offset */
+	stxvl	32+v2,r11,r10	/* Partial store  */
+	add	r11,r11,r9
+	sub	r5,r5,r9
+	b L(zero_padding_loop)
+
+L(prep_tail4):
+	addi	r5,r5,-48
+	vctzlsbb r8,v6		/* Number of trailing zeroes  */
+L(tail4):
+	addi	r9,r8,1		/* Add null terminator  */
+	stxv	32+v0,0(r11)
+	stxv	32+v1,16(r11)
+	stxv	32+v2,32(r11)
+	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
+	addi	r11,r11,48	/* offset */
+	stxvl	32+v3,r11,r10	/* Partial store  */
+	add	r11,r11,r9
+	sub	r5,r5,r9
+
+/* This code pads the remainder of dest with NULL bytes.  */
+L(zero_padding_loop):
+	cmpldi	cr6,r5,16	/* Check if length was reached.  */
+	ble	cr6,L(zero_padding_end)
+
+	stxv	v18,0(r11)
+	addi	r11,r11,16
+	addi	r5,r5,-16
+
+	b	L(zero_padding_loop)
+
+L(zero_padding_end):
+	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
+	stxvl	v18,r11,r10	/* Partial store  */
+	blr
+
+END (FUNC_NAME)
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index 19acb6c64a..cd2b47b403 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -33,7 +33,7 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \
 
 ifneq (,$(filter %le,$(config-machine)))
 sysdep_routines += strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \
-		   rawmemchr-power9 strlen-power9
+		   rawmemchr-power9 strlen-power9 strncpy-power9
 endif
 CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
 CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index ea10b00417..aa63e1c23f 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -301,6 +301,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/powerpc/powerpc64/multiarch/strncpy.c.  */
   IFUNC_IMPL (i, name, strncpy,
+#ifdef __LITTLE_ENDIAN__
+	      IFUNC_IMPL_ADD (array, i, strncpy,
+			      hwcap2 & PPC_FEATURE2_ARCH_3_00,
+			      __strncpy_power9)
+#endif
 	      IFUNC_IMPL_ADD (array, i, strncpy,
 			      hwcap2 & PPC_FEATURE2_ARCH_2_07,
 			      __strncpy_power8)
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncpy-power9.S b/sysdeps/powerpc/powerpc64/multiarch/strncpy-power9.S
new file mode 100644
index 0000000000..ab7c570d54
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/strncpy-power9.S
@@ -0,0 +1,26 @@
+/* Optimized strncpy implementation for POWER9 LE.
+   Copyright (C) 2020 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#if defined __LITTLE_ENDIAN__ && IS_IN (libc)
+#define STRNCPY __strncpy_power9
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+#include <sysdeps/powerpc/powerpc64/le/power9/strncpy.S>
+#endif
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncpy.c b/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
index 7bacf28aca..8ef0a99cb5 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
@@ -28,11 +28,18 @@
 extern __typeof (strncpy) __strncpy_ppc attribute_hidden;
 extern __typeof (strncpy) __strncpy_power7 attribute_hidden;
 extern __typeof (strncpy) __strncpy_power8 attribute_hidden;
+# ifdef __LITTLE_ENDIAN__
+extern __typeof (strncpy) __strncpy_power9 attribute_hidden;
+# endif
 # undef strncpy
 
 /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
  ifunc symbol properly. */
 libc_ifunc_redirected (__redirect_strncpy, strncpy,
+# ifdef __LITTLE_ENDIAN__
+		     (hwcap2 & PPC_FEATURE2_ARCH_3_00)
+		     ? __strncpy_power9 :
+# endif
 		       (hwcap2 & PPC_FEATURE2_ARCH_2_07)
 		       ? __strncpy_power8
 		       : (hwcap & PPC_FEATURE_HAS_VSX)
-- 
2.26.2



* [PATCH v2 2/2] powerpc: Add optimized stpncpy for POWER9
  2020-09-04 16:56 [PATCH v2 1/2] powerpc: Add optimized strncpy for POWER9 Raphael Moreira Zinsly
@ 2020-09-04 16:56 ` Raphael Moreira Zinsly
  2020-09-04 16:59   ` Raphael M Zinsly
  2020-09-16 12:35   ` Matheus Castanho
  2020-09-04 16:59 ` [PATCH v2 1/2] powerpc: Add optimized strncpy " Raphael M Zinsly
  2020-09-16 12:24 ` Matheus Castanho
  2 siblings, 2 replies; 9+ messages in thread
From: Raphael Moreira Zinsly @ 2020-09-04 16:56 UTC (permalink / raw)
  To: libc-alpha; +Cc: murphyp, pc, tuliom, Raphael Moreira Zinsly

Add stpncpy support into the POWER9 strncpy.
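
For reference, the only semantic difference from strncpy is the return
value; a plain C sketch (not part of the patch) of what the
USE_AS_STPNCPY hunks below have to compute into r3:

#include <stddef.h>

/* Reference-only sketch: like strncpy, but return a pointer to the
   terminating null byte written into dest, or dest + n if the first
   n bytes of src are not null-terminated.  */
char *
stpncpy_reference (char *dest, const char *src, size_t n)
{
  size_t i = 0;
  for (; i < n && src[i] != '\0'; i++)
    dest[i] = src[i];
  char *ret = dest + i;	/* Terminating null about to be written, or dest + n.  */
  for (; i < n; i++)
    dest[i] = '\0';
  return ret;
}
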
---
 sysdeps/powerpc/powerpc64/le/power9/stpncpy.S | 24 +++++++
 sysdeps/powerpc/powerpc64/le/power9/strncpy.S | 65 +++++++++++++++++++
 sysdeps/powerpc/powerpc64/multiarch/Makefile  |  2 +-
 .../powerpc64/multiarch/ifunc-impl-list.c     |  5 ++
 .../powerpc64/multiarch/stpncpy-power9.S      | 24 +++++++
 sysdeps/powerpc/powerpc64/multiarch/stpncpy.c |  7 ++
 6 files changed, 126 insertions(+), 1 deletion(-)
 create mode 100644 sysdeps/powerpc/powerpc64/le/power9/stpncpy.S
 create mode 100644 sysdeps/powerpc/powerpc64/multiarch/stpncpy-power9.S

diff --git a/sysdeps/powerpc/powerpc64/le/power9/stpncpy.S b/sysdeps/powerpc/powerpc64/le/power9/stpncpy.S
new file mode 100644
index 0000000000..81d9673d8b
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/le/power9/stpncpy.S
@@ -0,0 +1,24 @@
+/* Optimized stpncpy implementation for POWER9 LE.
+   Copyright (C) 2020 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define USE_AS_STPNCPY
+#include <sysdeps/powerpc/powerpc64/le/power9/strncpy.S>
+
+weak_alias (__stpncpy, stpncpy)
+libc_hidden_def (__stpncpy)
+libc_hidden_builtin_def (stpncpy)
diff --git a/sysdeps/powerpc/powerpc64/le/power9/strncpy.S b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
index 34fcdee913..f7265b11ec 100644
--- a/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
+++ b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
@@ -18,16 +18,30 @@
 
 #include <sysdep.h>
 
+#ifdef USE_AS_STPNCPY
+# ifndef STPNCPY
+#   define FUNC_NAME __stpncpy
+# else
+#   define FUNC_NAME STPNCPY
+# endif
+#else
 # ifndef STRNCPY
 #  define FUNC_NAME strncpy
 # else
 #  define FUNC_NAME STRNCPY
 # endif
+#endif  /* !USE_AS_STPNCPY  */
 
 /* Implements the function
 
    char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5])
 
+   or
+
+   char * [r3] stpncpy (char *dest [r3], const char *src [r4], size_t n [r5])
+
+   if USE_AS_STPNCPY is defined.
+
    The implementation can load bytes past a null terminator, but only
    up to the next 16-byte aligned address, so it never crosses a page.  */
 
@@ -49,7 +63,15 @@ ENTRY_TOCLESS (FUNC_NAME, 4)
 
 	/* Empty/1-byte string optimization  */
 	cmpdi	r5,0
+#ifdef USE_AS_STPNCPY
+	bgt	L(cont)
+	/* Compute pointer to last byte copied into dest.  */
+	addi	r3,r3,1
+	blr
+L(cont):
+#else
 	beqlr
+#endif
 
 	addi	r4,r4,1
 	neg	r7,r4
@@ -79,12 +101,20 @@ ENTRY_TOCLESS (FUNC_NAME, 4)
 	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
 	stxvl	32+v0,r11,r10	/* Partial store  */
 
+#ifdef USE_AS_STPNCPY
+	/* Compute pointer to last byte copied into dest.  */
+	add	r3,r11,r5
+#endif
 	blr
 
 L(null):
 	sldi	r10,r8,56	/* stxvl wants size in top 8 bits  */
 	stxvl	32+v0,r11,r10	/* Partial store  */
 
+#ifdef USE_AS_STPNCPY
+	/* Compute pointer to last byte copied into dest.  */
+	add	r3,r11,r7
+#endif
 	add	r11,r11,r8
 	sub	r5,r5,r8
 	b L(zero_padding_loop)
@@ -168,6 +198,10 @@ L(n_tail4):
 	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
 	addi	r11,r11,48	/* Offset */
 	stxvl	32+v3,r11,r10	/* Partial store  */
+#ifdef USE_AS_STPNCPY
+	/* Compute pointer to last byte copied into dest.  */
+	add	r3,r11,r5
+#endif
 	blr
 
 L(prep_n_tail1):
@@ -179,6 +213,10 @@ L(prep_n_tail1):
 L(n_tail1):
 	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
 	stxvl	32+v0,r11,r10	/* Partial store  */
+#ifdef USE_AS_STPNCPY
+	/* Compute pointer to last byte copied into dest.  */
+	add	r3,r11,r5
+#endif
 	blr
 
 L(prep_n_tail2):
@@ -192,6 +230,10 @@ L(n_tail2):
 	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
 	addi	r11,r11,16	/* offset */
 	stxvl	32+v1,r11,r10	/* Partial store  */
+#ifdef USE_AS_STPNCPY
+	/* Compute pointer to last byte copied into dest.  */
+	add	r3,r11,r5
+#endif
 	blr
 
 L(prep_n_tail3):
@@ -206,6 +248,10 @@ L(n_tail3):
 	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
 	addi	r11,r11,32	/* Offset */
 	stxvl	32+v2,r11,r10	/* Partial store  */
+#ifdef USE_AS_STPNCPY
+	/* Compute pointer to last byte copied into dest.  */
+	add	r3,r11,r5
+#endif
 	blr
 
 L(prep_tail1):
@@ -215,6 +261,10 @@ L(tail1):
 	addi	r9,r8,1		/* Add null terminator  */
 	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
 	stxvl	32+v0,r11,r10	/* Partial store  */
+#ifdef USE_AS_STPNCPY
+	/* Compute pointer to last byte copied into dest.  */
+	add	r3,r11,r8
+#endif
 	add	r11,r11,r9
 	sub	r5,r5,r9
 	b L(zero_padding_loop)
@@ -229,6 +279,10 @@ L(tail2):
 	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
 	addi	r11,r11,16	/* offset */
 	stxvl	32+v1,r11,r10	/* Partial store  */
+#ifdef USE_AS_STPNCPY
+	/* Compute pointer to last byte copied into dest.  */
+	add	r3,r11,r8
+#endif
 	add	r11,r11,r9
 	sub	r5,r5,r9
 	b L(zero_padding_loop)
@@ -244,6 +298,10 @@ L(tail3):
 	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
 	addi	r11,r11,32	/* offset */
 	stxvl	32+v2,r11,r10	/* Partial store  */
+#ifdef USE_AS_STPNCPY
+	/* Compute pointer to last byte copied into dest.  */
+	add	r3,r11,r8
+#endif
 	add	r11,r11,r9
 	sub	r5,r5,r9
 	b L(zero_padding_loop)
@@ -259,6 +317,10 @@ L(tail4):
 	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
 	addi	r11,r11,48	/* offset */
 	stxvl	32+v3,r11,r10	/* Partial store  */
+#ifdef USE_AS_STPNCPY
+	/* Compute pointer to last byte copied into dest.  */
+	add	r3,r11,r8
+#endif
 	add	r11,r11,r9
 	sub	r5,r5,r9
 
@@ -279,3 +341,6 @@ L(zero_padding_end):
 	blr
 
 END (FUNC_NAME)
+#ifndef USE_AS_STPNCPY
+libc_hidden_builtin_def (strncpy)
+#endif
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index cd2b47b403..f46bf50732 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -33,7 +33,7 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \
 
 ifneq (,$(filter %le,$(config-machine)))
 sysdep_routines += strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \
-		   rawmemchr-power9 strlen-power9 strncpy-power9
+		   rawmemchr-power9 strlen-power9 strncpy-power9 stpncpy-power9
 endif
 CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
 CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index aa63e1c23f..56790bcfe3 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -317,6 +317,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/powerpc/powerpc64/multiarch/stpncpy.c.  */
   IFUNC_IMPL (i, name, stpncpy,
+#ifdef __LITTLE_ENDIAN__
+	      IFUNC_IMPL_ADD (array, i, stpncpy,
+			      hwcap2 & PPC_FEATURE2_ARCH_3_00,
+			      __stpncpy_power9)
+#endif
 	      IFUNC_IMPL_ADD (array, i, stpncpy,
 			      hwcap2 & PPC_FEATURE2_ARCH_2_07,
 			      __stpncpy_power8)
diff --git a/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power9.S b/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power9.S
new file mode 100644
index 0000000000..ccbab55c31
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power9.S
@@ -0,0 +1,24 @@
+/* Optimized stpncpy implementation for POWER9 LE.
+   Copyright (C) 2020 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define STPNCPY __stpncpy_power9
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+#include <sysdeps/powerpc/powerpc64/le/power9/stpncpy.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c b/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
index 17df886431..ac17b26650 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
@@ -26,10 +26,17 @@
 extern __typeof (__stpncpy) __stpncpy_ppc attribute_hidden;
 extern __typeof (__stpncpy) __stpncpy_power7 attribute_hidden;
 extern __typeof (__stpncpy) __stpncpy_power8 attribute_hidden;
+# ifdef __LITTLE_ENDIAN__
+extern __typeof (__stpncpy) __stpncpy_power9 attribute_hidden;
+# endif
 # undef stpncpy
 # undef __stpncpy
 
 libc_ifunc_redirected (__redirect___stpncpy, __stpncpy,
+# ifdef __LITTLE_ENDIAN__
+		     (hwcap2 & PPC_FEATURE2_ARCH_3_00)
+		     ? __stpncpy_power9 :
+# endif
 		       (hwcap2 & PPC_FEATURE2_ARCH_2_07)
 		       ? __stpncpy_power8
 		       : (hwcap & PPC_FEATURE_HAS_VSX)
-- 
2.26.2



* Re: [PATCH v2 1/2] powerpc: Add optimized strncpy for POWER9
  2020-09-04 16:56 [PATCH v2 1/2] powerpc: Add optimized strncpy for POWER9 Raphael Moreira Zinsly
  2020-09-04 16:56 ` [PATCH v2 2/2] powerpc: Add optimized stpncpy " Raphael Moreira Zinsly
@ 2020-09-04 16:59 ` Raphael M Zinsly
  2020-09-16 12:24 ` Matheus Castanho
  2 siblings, 0 replies; 9+ messages in thread
From: Raphael M Zinsly @ 2020-09-04 16:59 UTC (permalink / raw)
  To: libc-alpha

Benchtest output:
                                      generic_strncpy  __strncpy_power9  __strncpy_power8  __strncpy_power7  __strncpy_ppc
Length   16, n   16, alignment  1/ 1:  6.44861  2.51617  2.54878  5.94753  9.41467
Length   16, n   16, alignment  1/ 1:  6.4448  2.51688  2.56978  5.86275  9.52956
Length   16, n   16, alignment  1/ 2:  6.51392  2.53026  2.55617  5.96487  9.51182
Length   16, n   16, alignment  2/ 1:  6.5421  2.5026  2.82458  5.95353  9.36524
Length    2, n    4, alignment  7/ 2:  8.02857  2.19272  4.35397  4.97347  8.60923
Length    4, n    2, alignment  2/ 7:  6.04262  1.66226  2.31865  3.27123  6.23803
Length    2, n    4, alignment  7/ 2:  8.15691  2.21924  4.48871  4.97328  8.3591
Length    4, n    2, alignment  2/ 7:  6.0428  1.66435  2.31671  3.2874  6.23902
Length   16, n   16, alignment  2/ 2:  6.75511  2.51667  2.82529  5.65252  9.32002
Length   16, n   16, alignment  2/ 2:  6.53469  2.51982  2.82678  5.93257  9.25613
Length   16, n   16, alignment  2/ 4:  6.3502  2.53333  2.82267  5.66948  9.35942
Length   16, n   16, alignment  4/ 2:  6.71533  2.51217  3.47278  5.95821  8.3249
Length    4, n    8, alignment  6/ 4:  7.85332  2.21708  5.68665  4.83111  9.07271
Length    8, n    4, alignment  4/ 6:  5.93863  1.67938  2.67249  3.07391  7.90751
Length    4, n    8, alignment  6/ 4:  8.24352  2.16644  5.22268  5.04674  9.10352
Length    8, n    4, alignment  4/ 6:  5.88514  1.67966  2.67286  3.29382  7.66757
Length   16, n   16, alignment  3/ 3:  6.55525  2.52511  3.06709  5.95625  9.23173
Length   16, n   16, alignment  3/ 3:  6.66344  2.50855  3.11771  5.96121  8.99767
Length   16, n   16, alignment  3/ 6:  6.82163  2.53355  3.0638  5.96451  9.09031
Length   16, n   16, alignment  6/ 3:  6.35636  2.51634  4.17868  5.95112  7.82576
Length    8, n   16, alignment  5/ 6:  7.46873  2.23953  4.33782  5.76124  10.2851
Length   16, n    8, alignment  6/ 5:  5.63643  1.88233  2.32899  4.72233  5.79268
Length    8, n   16, alignment  5/ 6:  7.47291  2.65201  3.9103  5.40334  10.3902
Length   16, n    8, alignment  6/ 5:  5.73738  1.8787  2.32749  4.69061  6.03053
Length   16, n   16, alignment  4/ 4:  6.63998  2.5166  3.5133  5.83764  8.17814
Length   16, n   16, alignment  4/ 4:  6.6866  2.51915  3.5831  5.96121  8.32436
Length   16, n   16, alignment  4/ 0:  6.58543  2.51529  3.38441  5.96909  8.03797
Length   16, n   16, alignment  0/ 4:  6.6541  1.87852  2.45328  5.96068  7.32961
Length   16, n   32, alignment  4/ 0:  9.37236  3.00744  5.92214  7.25884  11.1515
Length   32, n   16, alignment  0/ 4:  6.2795  1.87939  2.45688  5.96206  7.03327
Length   16, n   32, alignment  4/ 0:  9.24513  3.00344  5.97977  6.94778  11.0213
Length   32, n   16, alignment  0/ 4:  6.45422  1.87851  2.45698  5.96172  7.32939
Length   16, n   16, alignment  5/ 5:  6.53949  2.51619  3.88095  5.96091  9.05987
Length   16, n   16, alignment  5/ 5:  6.47371  2.51703  3.91695  5.96417  9.24674
Length   16, n   16, alignment  5/ 2:  6.5493  2.5163  3.78779  5.95898  9.44104
Length   16, n   16, alignment  2/ 5:  6.70967  2.52226  2.82034  5.96365  9.37646
Length   32, n   64, alignment  3/ 2:  14.0298  3.74521  6.80923  11.2825  12.8659
Length   64, n   32, alignment  2/ 3:  9.53123  2.75624  3.21242  8.51653  12.6887
Length   32, n   64, alignment  3/ 2:  14.179  3.83256  6.56898  11.3584  15.2479
Length   64, n   32, alignment  2/ 3:  9.53184  2.75305  3.21245  8.37087  14.1081
Length   16, n   16, alignment  6/ 6:  6.42159  2.51726  4.38574  5.9562  7.12266
Length   16, n   16, alignment  6/ 6:  6.67028  2.51692  4.2448  5.9544  7.81439
Length   16, n   16, alignment  6/ 4:  6.42402  2.51636  4.23817  5.96162  7.23351
Length   16, n   16, alignment  4/ 6:  6.60107  2.53036  3.54038  5.95837  8.32176
Length   64, n  128, alignment  2/ 4:  15.5573  4.80414  7.45917  11.5659  16.9298
Length  128, n   64, alignment  4/ 2:  11.6195  3.53279  4.80585  10.1583  11.6096
Length   64, n  128, alignment  2/ 4:  15.5233  4.7997  7.34679  11.6628  22.0123
Length  128, n   64, alignment  4/ 2:  11.6078  3.5492  4.77929  10.027  19.504
Length   16, n   16, alignment  7/ 7:  6.54515  2.5141  5.04928  5.95083  7.57587
Length   16, n   16, alignment  7/ 7:  7.00425  2.51299  5.06765  5.92888  8.25286
Length   16, n   16, alignment  7/ 6:  6.62954  2.51922  5.07189  6.02372  7.72968
Length   16, n   16, alignment  6/ 7:  6.34475  2.51841  4.36954  5.95968  7.78498
Length  128, n  256, alignment  1/ 6:  17.9386  7.60767  9.40348  16.5301  20.6134
Length  256, n  128, alignment  6/ 1:  13.373  4.84375  7.34616  12.3919  15.1296
Length  128, n  256, alignment  1/ 6:  17.9186  7.6077  9.37853  16.686  39.2821
Length  256, n  128, alignment  6/ 1:  13.3632  4.91799  8.06183  12.4174  34.1655
Length    8, n   16, alignment  0/ 0:  7.36981  2.22579  4.22739  4.9063  7.24636
Length   32, n   16, alignment  0/ 0:  6.43465  1.87932  2.45308  2.41526  7.1679
Length    8, n   16, alignment  7/ 2:  7.48861  2.21639  3.75708  5.35882  8.45777
Length   32, n   16, alignment  7/ 2:  7.03412  2.3535  5.04692  5.95484  7.25068
Length   16, n   32, alignment  0/ 0:  9.10177  3.06646  4.81682  4.41358  9.89656
Length   64, n   32, alignment  0/ 0:  8.57287  2.53847  2.94869  2.70506  8.2629
Length   16, n   32, alignment  6/ 4:  9.20906  3.04216  6.37553  9.46301  10.2489
Length   64, n   32, alignment  6/ 4:  9.73117  2.75023  4.49311  7.7856  9.59261
Length   32, n   64, alignment  0/ 0:  10.9253  3.80104  4.83111  4.97682  12.1086
Length  128, n   64, alignment  0/ 0:  9.26987  3.15895  3.49112  4.31372  10.1329
Length   32, n   64, alignment  5/ 6:  14.1856  3.78089  7.1768  9.63551  13.9944
Length  128, n   64, alignment  5/ 6:  11.5298  3.5249  5.07847  9.96481  12.8245
Length   64, n  128, alignment  0/ 0:  12.0142  4.73085  5.98759  7.1613  15.0462
Length  256, n  128, alignment  0/ 0:  7.96029  4.50244  6.44433  5.38248  11.6022
Length   64, n  128, alignment  4/ 0:  12.4223  4.80085  7.79294  11.0101  15.5277
Length  256, n  128, alignment  4/ 0:  12.2371  4.79242  6.83902  13.2758  16.0479
Length  128, n  256, alignment  0/ 0:  13.9165  7.28703  8.13319  8.79111  16.9101
Length  512, n  256, alignment  0/ 0:  10.5083  6.49881  9.05173  9.03139  19.6212
Length  128, n  256, alignment  3/ 2:  18.025  7.45493  9.86636  18.7234  20.5106
Length  512, n  256, alignment  3/ 2:  16.9588  7.07807  9.97969  23.4911  25.4407
Length  256, n  512, alignment  0/ 0:  17.6801  12.5811  15.3595  13.9989  28.5549
Length 1024, n  512, alignment  0/ 0:  16.379  10.7794  16.4748  16.7344  37.8286
Length  256, n  512, alignment  2/ 4:  23.2012  13.2761  14.3776  26.3752  31.6336
Length 1024, n  512, alignment  2/ 4:  25.4264  12.1716  17.2608  42.2122  47.425
Length  512, n 1024, alignment  0/ 0:  21.0239  23.0736  19.8285  21.0169  48.0091
Length 2048, n 1024, alignment  0/ 0:  28.424  19.323  36.917  35.4247  68.1661
Length  512, n 1024, alignment  1/ 6:  32.3159  24.2617  21.4919  46.5936  55.163
Length 2048, n 1024, alignment  1/ 6:  43.0359  21.6207  37.7643  77.5705  83.2998


* Re: [PATCH v2 2/2] powerpc: Add optimized stpncpy for POWER9
  2020-09-04 16:56 ` [PATCH v2 2/2] powerpc: Add optimized stpncpy " Raphael Moreira Zinsly
@ 2020-09-04 16:59   ` Raphael M Zinsly
  2020-09-16 12:32     ` Matheus Castanho
  2020-09-16 12:35   ` Matheus Castanho
  1 sibling, 1 reply; 9+ messages in thread
From: Raphael M Zinsly @ 2020-09-04 16:59 UTC (permalink / raw)
  To: libc-alpha

Benchtest output:
                                      generic_stpncpy  __stpncpy_power9  __stpncpy_power8  __stpncpy_power7  __stpncpy_ppc
Length   16, n   16, alignment  1/ 1:  6.55566  2.5481  2.74063  5.28665  9.96288
Length   16, n   16, alignment  1/ 1:  6.70016  2.54137  2.7108  4.77502  9.91703
Length   16, n   16, alignment  1/ 2:  6.55975  2.56295  2.70641  5.49298  9.59591
Length   16, n   16, alignment  2/ 1:  6.90759  2.52713  2.854  5.48949  9.37664
Length    2, n    4, alignment  7/ 2:  7.90969  2.22698  3.90151  4.6461  8.4503
Length    4, n    2, alignment  2/ 7:  6.14855  1.73403  2.67338  3.05675  6.86316
Length    2, n    4, alignment  7/ 2:  8.40868  2.22338  4.50838  4.51078  9.28489
Length    4, n    2, alignment  2/ 7:  6.14849  1.73402  2.67225  2.85349  6.34342
Length   16, n   16, alignment  2/ 2:  6.963  2.54442  2.87779  5.63547  9.85162
Length   16, n   16, alignment  2/ 2:  6.59452  2.54121  2.84662  5.57178  9.51406
Length   16, n   16, alignment  2/ 4:  6.79115  2.55835  2.84836  5.50427  9.67999
Length   16, n   16, alignment  4/ 2:  6.78419  2.54132  3.54229  5.52563  8.50938
Length    4, n    8, alignment  6/ 4:  8.45703  2.17266  4.80507  3.8714  9.04725
Length    8, n    4, alignment  4/ 6:  6.01753  1.73761  2.8185  2.41527  8.00051
Length    4, n    8, alignment  6/ 4:  7.82081  2.22612  4.80057  3.76103  8.99812
Length    8, n    4, alignment  4/ 6:  6.01752  1.73474  2.82089  2.41524  7.82703
Length   16, n   16, alignment  3/ 3:  6.78194  2.54143  3.21392  5.46447  8.90749
Length   16, n   16, alignment  3/ 3:  6.76324  2.54088  3.22883  5.39689  9.14749
Length   16, n   16, alignment  3/ 6:  7.05278  2.55795  3.22243  5.53422  9.11315
Length   16, n   16, alignment  6/ 3:  6.72881  2.54183  4.58459  5.51658  7.85006
Length    8, n   16, alignment  5/ 6:  7.67184  2.23969  4.13269  4.90728  10.2248
Length   16, n    8, alignment  6/ 5:  5.73672  1.88048  2.6693  4.35579  6.11674
Length    8, n   16, alignment  5/ 6:  7.51707  2.2284  3.67276  4.90637  10.2411
Length   16, n    8, alignment  6/ 5:  5.73665  1.88119  2.57514  3.96351  6.16253
Length   16, n   16, alignment  4/ 4:  7.03577  2.5415  3.66445  4.94157  8.98371
Length   16, n   16, alignment  4/ 4:  6.93549  2.53033  3.65577  5.53815  8.48335
Length   16, n   16, alignment  4/ 0:  6.95106  2.53483  3.48744  5.43759  8.45425
Length   16, n   16, alignment  0/ 4:  6.44601  1.87936  2.41984  5.49488  6.92169
Length   16, n   32, alignment  4/ 0:  9.2036  3.04122  5.78685  6.66434  10.9065
Length   32, n   16, alignment  0/ 4:  6.65504  1.87934  2.41817  6.08706  6.98513
Length   16, n   32, alignment  4/ 0:  9.17461  3.04153  5.77758  6.66444  10.8015
Length   32, n   16, alignment  0/ 4:  6.44123  1.87936  2.41847  5.55207  6.86039
Length   16, n   16, alignment  5/ 5:  6.56005  2.53132  4.22362  5.43527  9.25109
Length   16, n   16, alignment  5/ 5:  6.55552  2.53088  4.22655  5.59271  9.61369
Length   16, n   16, alignment  5/ 2:  6.55553  2.54559  4.31135  5.47438  8.83103
Length   16, n   16, alignment  2/ 5:  6.88992  2.56255  2.84059  5.23185  9.51441
Length   32, n   64, alignment  3/ 2:  12.5054  3.75138  6.42457  10.4719  15.0663
Length   64, n   32, alignment  2/ 3:  9.87185  2.78283  3.17042  7.66624  11.503
Length   32, n   64, alignment  3/ 2:  12.4999  3.74537  6.38161  10.4578  15.1104
Length   64, n   32, alignment  2/ 3:  9.86495  2.77889  3.19171  7.63272  13.9799
Length   16, n   16, alignment  6/ 6:  6.41353  2.5453  4.50915  5.30382  8.45391
Length   16, n   16, alignment  6/ 6:  6.49495  2.54119  4.54493  5.55909  8.1629
Length   16, n   16, alignment  6/ 4:  6.41743  2.54487  4.57202  4.98659  7.53033
Length   16, n   16, alignment  4/ 6:  6.91724  2.54649  3.67868  5.36838  8.45677
Length   64, n  128, alignment  2/ 4:  14.0687  4.93151  8.11667  11.4411  16.9533
Length  128, n   64, alignment  4/ 2:  11.7134  3.58948  4.90121  10.3018  11.6692
Length   64, n  128, alignment  2/ 4:  14.0677  4.93413  7.28129  11.439  22.2186
Length  128, n   64, alignment  4/ 2:  11.7149  3.59312  4.85286  10.3403  19.4651
Length   16, n   16, alignment  7/ 7:  6.76501  2.52563  5.55792  5.44155  8.39997
Length   16, n   16, alignment  7/ 7:  7.16923  2.5265  5.55148  5.60184  7.98311
Length   16, n   16, alignment  7/ 6:  6.76252  2.52629  5.48067  5.51161  7.61026
Length   16, n   16, alignment  6/ 7:  6.65772  2.5521  4.55758  5.48893  7.7301
Length  128, n  256, alignment  1/ 6:  16.2494  7.62034  9.3616  16.2888  19.7029
Length  256, n  128, alignment  6/ 1:  13.4311  4.94455  8.10802  12.2681  15.6941
Length  128, n  256, alignment  1/ 6:  16.2608  7.6209  9.35509  16.2856  38.0277
Length  256, n  128, alignment  6/ 1:  13.4327  4.89474  8.35934  12.2646  34.3268
Length    8, n   16, alignment  0/ 0:  7.20671  2.23256  3.75778  5.63555  7.36414
Length   32, n   16, alignment  0/ 0:  6.4449  1.88  2.41577  2.89598  6.42537
Length    8, n   16, alignment  7/ 2:  7.45976  2.21832  3.91671  4.6524  8.45825
Length   32, n   16, alignment  7/ 2:  6.78267  2.34296  5.59161  5.58598  6.88842
Length   16, n   32, alignment  0/ 0:  9.47971  3.10847  4.74758  4.75377  10.2238
Length   64, n   32, alignment  0/ 0:  8.45634  2.34747  2.59248  2.82356  9.42305
Length   16, n   32, alignment  6/ 4:  9.37784  3.05067  6.92384  9.47727  10.1826
Length   64, n   32, alignment  6/ 4:  9.89233  2.77968  4.63672  7.09838  10.2804
Length   32, n   64, alignment  0/ 0:  11.0813  3.71086  4.43777  5.3549  12.2048
Length  128, n   64, alignment  0/ 0:  9.25192  3.20123  3.53388  4.50794  10.1934
Length   32, n   64, alignment  5/ 6:  12.5099  3.75871  7.29613  9.64902  13.5821
Length  128, n   64, alignment  5/ 6:  11.6115  3.60165  5.71818  9.07288  12.7929
Length   64, n  128, alignment  0/ 0:  12.3671  4.80754  5.46926  6.84492  14.9238
Length  256, n  128, alignment  0/ 0:  8.08427  4.52607  6.47996  5.92086  11.701
Length   64, n  128, alignment  4/ 0:  12.5692  4.89717  7.11058  10.472  15.875
Length  256, n  128, alignment  4/ 0:  12.2945  4.94163  7.11645  12.3831  16.6219
Length  128, n  256, alignment  0/ 0:  13.8948  7.28911  7.78784  9.30215  17.0358
Length  512, n  256, alignment  0/ 0:  10.5266  6.56481  9.14202  9.31096  20.0531
Length  128, n  256, alignment  3/ 2:  16.3534  7.46332  9.90009  18.5282  19.5969
Length  512, n  256, alignment  3/ 2:  17.0519  7.09947  10.1635  23.5411  25.0043
Length  256, n  512, alignment  0/ 0:  15.8935  12.6195  14.0756  14.7553  28.5299
Length 1024, n  512, alignment  0/ 0:  16.3758  10.8028  16.5447  16.8966  37.8653
Length  256, n  512, alignment  2/ 4:  21.16  13.2779  14.3088  26.4475  30.1647
Length 1024, n  512, alignment  2/ 4:  25.3364  12.0899  17.5443  42.7216  47.5803
Length  512, n 1024, alignment  0/ 0:  20.5111  22.9782  19.6648  21.3857  42.4801
Length 2048, n 1024, alignment  0/ 0:  28.4023  19.1577  36.9065  35.4799  68.3555
Length  512, n 1024, alignment  1/ 6:  29.9694  24.3087  22.0513  46.7436  51.5908
Length 2048, n 1024, alignment  1/ 6:  42.9897  21.5402  38.739  78.3266  84.3956


* Re: [PATCH v2 1/2] powerpc: Add optimized strncpy for POWER9
  2020-09-04 16:56 [PATCH v2 1/2] powerpc: Add optimized strncpy for POWER9 Raphael Moreira Zinsly
  2020-09-04 16:56 ` [PATCH v2 2/2] powerpc: Add optimized stpncpy " Raphael Moreira Zinsly
  2020-09-04 16:59 ` [PATCH v2 1/2] powerpc: Add optimized strncpy " Raphael M Zinsly
@ 2020-09-16 12:24 ` Matheus Castanho
  2 siblings, 0 replies; 9+ messages in thread
From: Matheus Castanho @ 2020-09-16 12:24 UTC (permalink / raw)
  To: Raphael Moreira Zinsly, libc-alpha



On 9/4/20 1:56 PM, Raphael Moreira Zinsly via Libc-alpha wrote:
> Changes since v1:
> 	- Fixed comment indentation and added some spaces to improve
> 	  readability.
> 	- Use "POWER9 LE" instead of "PowerPC64/POWER9".
> 	- Fixed copyright dates.
> 	- Replaced cmpwi with cmpdi.
> 
> ---8<---
> 
> Similar to the strcpy P9 optimization, this version uses VSX to improve
> performance.
> ---
>  sysdeps/powerpc/powerpc64/le/power9/strncpy.S | 281 ++++++++++++++++++
>  sysdeps/powerpc/powerpc64/multiarch/Makefile  |   2 +-
>  .../powerpc64/multiarch/ifunc-impl-list.c     |   5 +
>  .../powerpc64/multiarch/strncpy-power9.S      |  26 ++
>  sysdeps/powerpc/powerpc64/multiarch/strncpy.c |   7 +
>  5 files changed, 320 insertions(+), 1 deletion(-)
>  create mode 100644 sysdeps/powerpc/powerpc64/le/power9/strncpy.S
>  create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strncpy-power9.S
> 
> diff --git a/sysdeps/powerpc/powerpc64/le/power9/strncpy.S b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
> new file mode 100644
> index 0000000000..34fcdee913
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
> @@ -0,0 +1,281 @@
> +/* Optimized strncpy implementation for POWER9 LE.
> +   Copyright (C) 2020 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <sysdep.h>
> +
> +# ifndef STRNCPY
> +#  define FUNC_NAME strncpy
> +# else
> +#  define FUNC_NAME STRNCPY
> +# endif
> +
> +/* Implements the function
> +
> +   char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5])
> +
> +   The implementation can load bytes past a null terminator, but only
> +   up to the next 16-byte aligned address, so it never crosses a page.  */
> +
> +.machine power9
> +ENTRY_TOCLESS (FUNC_NAME, 4)
> +	CALL_MCOUNT 2
> +
> +	/* NULL string optimizations  */
> +	cmpdi   r5, 0
> +	beqlr
> +
> +	lbz	r0,0(r4)
> +	stb	r0,0(r3)
> +	addi	r11,r3,1
> +	addi	r5,r5,-1
> +	vspltisb v18,0		/* Zeroes in v18  */
> +	cmpdi	r0,0
> +	beq	L(zero_padding_loop)
> +
> +	/* Empty/1-byte string optimization  */
> +	cmpdi	r5,0
> +	beqlr
> +
> +	addi	r4,r4,1
> +	neg	r7,r4
> +	rldicl	r9,r7,0,60	/* How many bytes to get source 16B aligned?  */
> +
> +	/* Get source 16B aligned  */
> +	lvx	v0,0,r4
> +	lvsr	v1,0,r4
> +	vperm	v0,v18,v0,v1
> +
> +	vcmpequb v6,v0,v18	/* 0xff if byte is NULL, 0x00 otherwise  */
> +	vctzlsbb r7,v6		/* Number of trailing zeroes  */
> +	addi	r8,r7,1		/* Add null terminator  */
> +
> +	/* r8 = bytes including null
> +	   r9 = bytes to get source 16B aligned
> +	   if r8 > r9
> +	      no null, copy r9 bytes
> +	   else
> +	      there is a null, copy r8 bytes and return.  */
> +	cmpld	r8,r9
> +	bgt	L(no_null)
> +
> +	cmpld	cr6,r8,r5	/* r8 <= n?  */
> +	ble	cr6,L(null)
> +
> +	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
> +	stxvl	32+v0,r11,r10	/* Partial store  */

At first I was confused by this 32+vX syntax. Maybe we could consider
adding defines for VSX registers to sysdeps/powerpc/sysdep.h in the
future? This way we could refer to v0+32 as vs32, for example. But I
don't think this needs to be part of this patchset.
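
Something along these lines, for example (purely illustrative; these
defines are not in the tree as of this thread):

/* Hypothetical additions to sysdeps/powerpc/sysdep.h: name the VSX
   aliases of the vector registers, so "stxvl 32+v0,r11,r10" could be
   written as "stxvl vs32,r11,r10".  */
#define vs32 32+v0
#define vs33 32+v1
#define vs34 32+v2
#define vs35 32+v3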

> +
> +	blr
> +
> +L(null):
> +	sldi	r10,r8,56	/* stxvl wants size in top 8 bits  */
> +	stxvl	32+v0,r11,r10	/* Partial store  */
> +
> +	add	r11,r11,r8
> +	sub	r5,r5,r8
> +	b L(zero_padding_loop)
> +
> +L(no_null):
> +	cmpld	r9,r5		/* Check if length was reached.  */
> +	bge	L(n_tail1)
> +
> +	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
> +	stxvl	32+v0,r11,r10	/* Partial store  */
> +
> +	add	r4,r4,r9
> +	add	r11,r11,r9
> +	sub	r5,r5,r9
> +
> +L(loop):
> +	cmpldi	cr6,r5,64	/* Check if length was reached.  */
> +	ble	cr6,L(final_loop)
> +
> +	lxv	32+v0,0(r4)
> +	vcmpequb. v6,v0,v18	/* Any zero bytes?  */
> +	bne	cr6,L(prep_tail1)
> +
> +	lxv	32+v1,16(r4)
> +	vcmpequb. v6,v1,v18	/* Any zero bytes?  */
> +	bne	cr6,L(prep_tail2)
> +
> +	lxv	32+v2,32(r4)
> +	vcmpequb. v6,v2,v18	/* Any zero bytes?  */
> +	bne	cr6,L(prep_tail3)
> +
> +	lxv	32+v3,48(r4)
> +	vcmpequb. v6,v3,v18	/* Any zero bytes?  */
> +	bne	cr6,L(prep_tail4)
> +
> +	stxv	32+v0,0(r11)
> +	stxv	32+v1,16(r11)
> +	stxv	32+v2,32(r11)
> +	stxv	32+v3,48(r11)
> +
> +	addi	r4,r4,64
> +	addi	r11,r11,64
> +	addi	r5,r5,-64
> +
> +	b	L(loop)
> +
> +L(final_loop):
> +	cmpldi	cr5,r5,16
> +	lxv	32+v0,0(r4)
> +	vcmpequb. v6,v0,v18	/* Any zero bytes?  */
> +	ble	cr5,L(prep_n_tail1)
> +	bne	cr6,L(count_tail1)
> +	addi	r5,r5,-16
> +
> +	cmpldi	cr5,r5,16
> +	lxv	32+v1,16(r4)
> +	vcmpequb. v6,v1,v18	/* Any zero bytes?  */
> +	ble	cr5,L(prep_n_tail2)
> +	bne	cr6,L(count_tail2)
> +	addi	r5,r5,-16
> +
> +	cmpldi	cr5,r5,16
> +	lxv	32+v2,32(r4)
> +	vcmpequb. v6,v2,v18	/* Any zero bytes?  */
> +	ble	cr5,L(prep_n_tail3)
> +	bne	cr6,L(count_tail3)
> +	addi	r5,r5,-16
> +
> +	lxv	32+v3,48(r4)
> +	vcmpequb. v6,v3,v18	/* Any zero bytes?  */
> +	beq	cr6,L(n_tail4)
> +
> +	vctzlsbb r8,v6		/* Number of trailing zeroes  */
> +	cmpld	r8,r5		/* r8 < n?  */
> +	blt	L(tail4)
> +
> +L(n_tail4):
> +	stxv	32+v0,0(r11)
> +	stxv	32+v1,16(r11)
> +	stxv	32+v2,32(r11)
> +	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
> +	addi	r11,r11,48	/* Offset */
> +	stxvl	32+v3,r11,r10	/* Partial store  */
> +	blr
> +
> +L(prep_n_tail1):
> +	beq	cr6,L(n_tail1)	/* Any zero bytes?  */
> +	vctzlsbb r8,v6		/* Number of trailing zeroes  */
> +	cmpld	r8,r5		/* r8 < n?  */
> +	blt	L(tail1)
> +
> +L(n_tail1):
> +	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
> +	stxvl	32+v0,r11,r10	/* Partial store  */
> +	blr
> +
> +L(prep_n_tail2):
> +	beq	cr6,L(n_tail2)	/* Any zero bytes?  */
> +	vctzlsbb r8,v6		/* Number of trailing zeroes  */
> +	cmpld	r8,r5		/* r8 < n?  */
> +	blt	L(tail2)
> +
> +L(n_tail2):
> +	stxv	32+v0,0(r11)
> +	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
> +	addi	r11,r11,16	/* offset */
> +	stxvl	32+v1,r11,r10	/* Partial store  */
> +	blr
> +
> +L(prep_n_tail3):
> +	beq	cr6,L(n_tail3)	/* Any zero bytes?  */
> +	vctzlsbb r8,v6		/* Number of trailing zeroes  */
> +	cmpld	r8,r5		/* r8 < n?  */
> +	blt	L(tail3)
> +
> +L(n_tail3):
> +	stxv	32+v0,0(r11)
> +	stxv	32+v1,16(r11)
> +	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
> +	addi	r11,r11,32	/* Offset */
> +	stxvl	32+v2,r11,r10	/* Partial store  */
> +	blr
> +
> +L(prep_tail1):
> +L(count_tail1):
> +	vctzlsbb r8,v6		/* Number of trailing zeroes  */
> +L(tail1):
> +	addi	r9,r8,1		/* Add null terminator  */
> +	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
> +	stxvl	32+v0,r11,r10	/* Partial store  */
> +	add	r11,r11,r9
> +	sub	r5,r5,r9
> +	b L(zero_padding_loop)
> +
> +L(prep_tail2):
> +	addi	r5,r5,-16
> +L(count_tail2):
> +	vctzlsbb r8,v6		/* Number of trailing zeroes  */
> +L(tail2):
> +	addi	r9,r8,1		/* Add null terminator  */
> +	stxv	32+v0,0(r11)
> +	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
> +	addi	r11,r11,16	/* offset */
> +	stxvl	32+v1,r11,r10	/* Partial store  */
> +	add	r11,r11,r9
> +	sub	r5,r5,r9
> +	b L(zero_padding_loop)
> +
> +L(prep_tail3):
> +	addi	r5,r5,-32
> +L(count_tail3):
> +	vctzlsbb r8,v6		/* Number of trailing zeroes  */
> +L(tail3):
> +	addi	r9,r8,1		/* Add null terminator  */
> +	stxv	32+v0,0(r11)
> +	stxv	32+v1,16(r11)
> +	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
> +	addi	r11,r11,32	/* offset */
> +	stxvl	32+v2,r11,r10	/* Partial store  */
> +	add	r11,r11,r9
> +	sub	r5,r5,r9
> +	b L(zero_padding_loop)
> +
> +L(prep_tail4):
> +	addi	r5,r5,-48
> +	vctzlsbb r8,v6		/* Number of trailing zeroes  */
> +L(tail4):
> +	addi	r9,r8,1		/* Add null terminator  */
> +	stxv	32+v0,0(r11)
> +	stxv	32+v1,16(r11)
> +	stxv	32+v2,32(r11)
> +	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
> +	addi	r11,r11,48	/* offset */
> +	stxvl	32+v3,r11,r10	/* Partial store  */
> +	add	r11,r11,r9
> +	sub	r5,r5,r9
> +
> +/* This code pads the remainder of dest with NULL bytes.  */
> +L(zero_padding_loop):
> +	cmpldi	cr6,r5,16	/* Check if length was reached.  */
> +	ble	cr6,L(zero_padding_end)
> +
> +	stxv	v18,0(r11)
> +	addi	r11,r11,16
> +	addi	r5,r5,-16
> +
> +	b	L(zero_padding_loop)
> +
> +L(zero_padding_end):
> +	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
> +	stxvl	v18,r11,r10	/* Partial store  */
> +	blr
> +

The logic looks good. I tried to find a way to reuse some code, as there
are many similar blocks (e.g. tail* blocks). But their slight
differences make it hard to reuse anything.

> +END (FUNC_NAME)
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
> index 19acb6c64a..cd2b47b403 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
> +++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
> @@ -33,7 +33,7 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \
>  
>  ifneq (,$(filter %le,$(config-machine)))
>  sysdep_routines += strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \
> -		   rawmemchr-power9 strlen-power9
> +		   rawmemchr-power9 strlen-power9 strncpy-power9
>  endif
>  CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
>  CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
> index ea10b00417..aa63e1c23f 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
> @@ -301,6 +301,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>  
>    /* Support sysdeps/powerpc/powerpc64/multiarch/strncpy.c.  */
>    IFUNC_IMPL (i, name, strncpy,
> +#ifdef __LITTLE_ENDIAN__
> +	      IFUNC_IMPL_ADD (array, i, strncpy,
> +			      hwcap2 & PPC_FEATURE2_ARCH_3_00,
> +			      __strncpy_power9)
> +#endif
>  	      IFUNC_IMPL_ADD (array, i, strncpy,
>  			      hwcap2 & PPC_FEATURE2_ARCH_2_07,
>  			      __strncpy_power8)
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncpy-power9.S b/sysdeps/powerpc/powerpc64/multiarch/strncpy-power9.S
> new file mode 100644
> index 0000000000..ab7c570d54
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/multiarch/strncpy-power9.S
> @@ -0,0 +1,26 @@
> +/* Optimized strncpy implementation for POWER9 LE.
> +   Copyright (C) 2020 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#if defined __LITTLE_ENDIAN__ && IS_IN (libc)
> +#define STRNCPY __strncpy_power9
> +
> +#undef libc_hidden_builtin_def
> +#define libc_hidden_builtin_def(name)
> +
> +#include <sysdeps/powerpc/powerpc64/le/power9/strncpy.S>
> +#endif
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncpy.c b/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
> index 7bacf28aca..8ef0a99cb5 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
> +++ b/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
> @@ -28,11 +28,18 @@
>  extern __typeof (strncpy) __strncpy_ppc attribute_hidden;
>  extern __typeof (strncpy) __strncpy_power7 attribute_hidden;
>  extern __typeof (strncpy) __strncpy_power8 attribute_hidden;
> +# ifdef __LITTLE_ENDIAN__
> +extern __typeof (strncpy) __strncpy_power9 attribute_hidden;
> +# endif
>  # undef strncpy
>  
>  /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
>   ifunc symbol properly. */
>  libc_ifunc_redirected (__redirect_strncpy, strncpy,
> +# ifdef __LITTLE_ENDIAN__
> +		     (hwcap2 & PPC_FEATURE2_ARCH_3_00)
> +		     ? __strncpy_power9 :
> +# endif
>  		       (hwcap2 & PPC_FEATURE2_ARCH_2_07)
>  		       ? __strncpy_power8
>  		       : (hwcap & PPC_FEATURE_HAS_VSX)
> 

--

The only thing missing now seems to be the .machine power9 issue that
was pointed out in v1.
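
In case it helps, gas also accepts scoping the directive with push/pop,
e.g. (just a sketch of the assembler mechanism; I am not sure this is
the exact form the v1 review asked for):

	.machine push
	.machine power9
	/* POWER9-only instructions here.  */
	.machine pop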

Otherwise, LGTM.

Reviewed-by: Matheus Castanho <msc@linux.ibm.com>

--
Matheus Castanho


* Re: [PATCH v2 2/2] powerpc: Add optimized stpncpy for POWER9
  2020-09-04 16:59   ` Raphael M Zinsly
@ 2020-09-16 12:32     ` Matheus Castanho
  2020-09-16 12:56       ` Raphael M Zinsly
  0 siblings, 1 reply; 9+ messages in thread
From: Matheus Castanho @ 2020-09-16 12:32 UTC (permalink / raw)
  To: Raphael M Zinsly, libc-alpha

On 9/4/20 1:59 PM, Raphael M Zinsly via Libc-alpha wrote:
> Benchtest output:
>                                 generic_stpncpy    __stpncpy_power9  __stpncpy_power8    __stpncpy_power7    __stpncpy_ppc
<snip>
> Length  512, n 1024, alignment  0/ 0:    20.5111    22.9782   19.6648    21.3857 42.4801
<snip>
> Length  512, n 1024, alignment  1/ 6:    29.9694    24.3087   22.0513    46.7436 51.5908

These two seem to be the only cases in which the power9 version loses to
the power8 one. Have you investigated what happens in these two specific
cases?


* Re: [PATCH v2 2/2] powerpc: Add optimized stpncpy for POWER9
  2020-09-04 16:56 ` [PATCH v2 2/2] powerpc: Add optimized stpncpy " Raphael Moreira Zinsly
  2020-09-04 16:59   ` Raphael M Zinsly
@ 2020-09-16 12:35   ` Matheus Castanho
  1 sibling, 0 replies; 9+ messages in thread
From: Matheus Castanho @ 2020-09-16 12:35 UTC (permalink / raw)
  To: Raphael Moreira Zinsly, libc-alpha



On 9/4/20 1:56 PM, Raphael Moreira Zinsly via Libc-alpha wrote:
> Add stpncpy support into the POWER9 strncpy.
> ---
>  sysdeps/powerpc/powerpc64/le/power9/stpncpy.S | 24 +++++++
>  sysdeps/powerpc/powerpc64/le/power9/strncpy.S | 65 +++++++++++++++++++
>  sysdeps/powerpc/powerpc64/multiarch/Makefile  |  2 +-
>  .../powerpc64/multiarch/ifunc-impl-list.c     |  5 ++
>  .../powerpc64/multiarch/stpncpy-power9.S      | 24 +++++++
>  sysdeps/powerpc/powerpc64/multiarch/stpncpy.c |  7 ++
>  6 files changed, 126 insertions(+), 1 deletion(-)
>  create mode 100644 sysdeps/powerpc/powerpc64/le/power9/stpncpy.S
>  create mode 100644 sysdeps/powerpc/powerpc64/multiarch/stpncpy-power9.S
> 
> diff --git a/sysdeps/powerpc/powerpc64/le/power9/stpncpy.S b/sysdeps/powerpc/powerpc64/le/power9/stpncpy.S
> new file mode 100644
> index 0000000000..81d9673d8b
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/le/power9/stpncpy.S
> @@ -0,0 +1,24 @@
> +/* Optimized stpncpy implementation for POWER9 LE.
> +   Copyright (C) 2020 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#define USE_AS_STPNCPY
> +#include <sysdeps/powerpc/powerpc64/le/power9/strncpy.S>
> +
> +weak_alias (__stpncpy, stpncpy)
> +libc_hidden_def (__stpncpy)
> +libc_hidden_builtin_def (stpncpy)
> diff --git a/sysdeps/powerpc/powerpc64/le/power9/strncpy.S b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
> index 34fcdee913..f7265b11ec 100644
> --- a/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
> +++ b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
> @@ -18,16 +18,30 @@
>  
>  #include <sysdep.h>
>  
> +#ifdef USE_AS_STPNCPY
> +# ifndef STPNCPY
> +#   define FUNC_NAME __stpncpy
> +# else
> +#   define FUNC_NAME STPNCPY
> +# endif
> +#else
>  # ifndef STRNCPY
>  #  define FUNC_NAME strncpy
>  # else
>  #  define FUNC_NAME STRNCPY
>  # endif
> +#endif  /* !USE_AS_STPNCPY  */
>  
>  /* Implements the function
>  
>     char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5])
>  
> +   or
> +
> +   char * [r3] stpncpy (char *dest [r3], const char *src [r4], size_t n [r5])
> +
> +   if USE_AS_STPNCPY is defined.
> +
>     The implementation can load bytes past a null terminator, but only
>     up to the next 16-byte aligned address, so it never crosses a page.  */
>  
> @@ -49,7 +63,15 @@ ENTRY_TOCLESS (FUNC_NAME, 4)
>  
>  	/* Empty/1-byte string optimization  */
>  	cmpdi	r5,0
> +#ifdef USE_AS_STPNCPY
> +	bgt	L(cont)
> +	/* Compute pointer to last byte copied into dest.  */
> +	addi	r3,r3,1
> +	blr
> +L(cont):
> +#else
>  	beqlr
> +#endif
>  
>  	addi	r4,r4,1
>  	neg	r7,r4
> @@ -79,12 +101,20 @@ ENTRY_TOCLESS (FUNC_NAME, 4)
>  	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
>  	stxvl	32+v0,r11,r10	/* Partial store  */
>  
> +#ifdef USE_AS_STPNCPY
> +	/* Compute pointer to last byte copied into dest.  */
> +	add	r3,r11,r5
> +#endif
>  	blr
>  
>  L(null):
>  	sldi	r10,r8,56	/* stxvl wants size in top 8 bits  */
>  	stxvl	32+v0,r11,r10	/* Partial store  */
>  
> +#ifdef USE_AS_STPNCPY
> +	/* Compute pointer to last byte copied into dest.  */
> +	add	r3,r11,r7
> +#endif
>  	add	r11,r11,r8
>  	sub	r5,r5,r8
>  	b L(zero_padding_loop)
> @@ -168,6 +198,10 @@ L(n_tail4):
>  	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
>  	addi	r11,r11,48	/* Offset */
>  	stxvl	32+v3,r11,r10	/* Partial store  */
> +#ifdef USE_AS_STPNCPY
> +	/* Compute pointer to last byte copied into dest.  */
> +	add	r3,r11,r5
> +#endif
>  	blr
>  
>  L(prep_n_tail1):
> @@ -179,6 +213,10 @@ L(prep_n_tail1):
>  L(n_tail1):
>  	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
>  	stxvl	32+v0,r11,r10	/* Partial store  */
> +#ifdef USE_AS_STPNCPY
> +	/* Compute pointer to last byte copied into dest.  */
> +	add	r3,r11,r5
> +#endif
>  	blr
>  
>  L(prep_n_tail2):
> @@ -192,6 +230,10 @@ L(n_tail2):
>  	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
>  	addi	r11,r11,16	/* offset */
>  	stxvl	32+v1,r11,r10	/* Partial store  */
> +#ifdef USE_AS_STPNCPY
> +	/* Compute pointer to last byte copied into dest.  */
> +	add	r3,r11,r5
> +#endif
>  	blr
>  
>  L(prep_n_tail3):
> @@ -206,6 +248,10 @@ L(n_tail3):
>  	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
>  	addi	r11,r11,32	/* Offset */
>  	stxvl	32+v2,r11,r10	/* Partial store  */
> +#ifdef USE_AS_STPNCPY
> +	/* Compute pointer to last byte copied into dest.  */
> +	add	r3,r11,r5
> +#endif
>  	blr
>  
>  L(prep_tail1):
> @@ -215,6 +261,10 @@ L(tail1):
>  	addi	r9,r8,1		/* Add null terminator  */
>  	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
>  	stxvl	32+v0,r11,r10	/* Partial store  */
> +#ifdef USE_AS_STPNCPY
> +	/* Compute pointer to last byte copied into dest.  */
> +	add	r3,r11,r8
> +#endif
>  	add	r11,r11,r9
>  	sub	r5,r5,r9
>  	b L(zero_padding_loop)
> @@ -229,6 +279,10 @@ L(tail2):
>  	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
>  	addi	r11,r11,16	/* offset */
>  	stxvl	32+v1,r11,r10	/* Partial store  */
> +#ifdef USE_AS_STPNCPY
> +	/* Compute pointer to last byte copied into dest.  */
> +	add	r3,r11,r8
> +#endif
>  	add	r11,r11,r9
>  	sub	r5,r5,r9
>  	b L(zero_padding_loop)
> @@ -244,6 +298,10 @@ L(tail3):
>  	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
>  	addi	r11,r11,32	/* offset */
>  	stxvl	32+v2,r11,r10	/* Partial store  */
> +#ifdef USE_AS_STPNCPY
> +	/* Compute pointer to last byte copied into dest.  */
> +	add	r3,r11,r8
> +#endif
>  	add	r11,r11,r9
>  	sub	r5,r5,r9
>  	b L(zero_padding_loop)
> @@ -259,6 +317,10 @@ L(tail4):
>  	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
>  	addi	r11,r11,48	/* offset */
>  	stxvl	32+v3,r11,r10	/* Partial store  */
> +#ifdef USE_AS_STPNCPY
> +	/* Compute pointer to last byte copied into dest.  */
> +	add	r3,r11,r8
> +#endif
>  	add	r11,r11,r9
>  	sub	r5,r5,r9
>  
> @@ -279,3 +341,6 @@ L(zero_padding_end):
>  	blr
>  
>  END (FUNC_NAME)
> +#ifndef USE_AS_STPNCPY
> +libc_hidden_builtin_def (strncpy)
> +#endif
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
> index cd2b47b403..f46bf50732 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
> +++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
> @@ -33,7 +33,7 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \
>  
>  ifneq (,$(filter %le,$(config-machine)))
>  sysdep_routines += strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \
> -		   rawmemchr-power9 strlen-power9 strncpy-power9
> +		   rawmemchr-power9 strlen-power9 strncpy-power9 stpncpy-power9
>  endif
>  CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
>  CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
> index aa63e1c23f..56790bcfe3 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
> @@ -317,6 +317,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>  
>    /* Support sysdeps/powerpc/powerpc64/multiarch/stpncpy.c.  */
>    IFUNC_IMPL (i, name, stpncpy,
> +#ifdef __LITTLE_ENDIAN__
> +	      IFUNC_IMPL_ADD (array, i, stpncpy,
> +			      hwcap2 & PPC_FEATURE2_ARCH_3_00,
> +			      __stpncpy_power9)
> +#endif
>  	      IFUNC_IMPL_ADD (array, i, stpncpy,
>  			      hwcap2 & PPC_FEATURE2_ARCH_2_07,
>  			      __stpncpy_power8)
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power9.S b/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power9.S
> new file mode 100644
> index 0000000000..ccbab55c31
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power9.S
> @@ -0,0 +1,24 @@
> +/* Optimized stpncpy implementation for POWER9 LE.
> +   Copyright (C) 2020 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#define STPNCPY __stpncpy_power9
> +
> +#undef libc_hidden_builtin_def
> +#define libc_hidden_builtin_def(name)
> +
> +#include <sysdeps/powerpc/powerpc64/le/power9/stpncpy.S>
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c b/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
> index 17df886431..ac17b26650 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
> +++ b/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
> @@ -26,10 +26,17 @@
>  extern __typeof (__stpncpy) __stpncpy_ppc attribute_hidden;
>  extern __typeof (__stpncpy) __stpncpy_power7 attribute_hidden;
>  extern __typeof (__stpncpy) __stpncpy_power8 attribute_hidden;
> +# ifdef __LITTLE_ENDIAN__
> +extern __typeof (__stpncpy) __stpncpy_power9 attribute_hidden;
> +# endif
>  # undef stpncpy
>  # undef __stpncpy
>  
>  libc_ifunc_redirected (__redirect___stpncpy, __stpncpy,
> +# ifdef __LITTLE_ENDIAN__
> +		     (hwcap2 & PPC_FEATURE2_ARCH_3_00)
> +		     ? __stpncpy_power9 :
> +# endif
>  		       (hwcap2 & PPC_FEATURE2_ARCH_2_07)
>  		       ? __stpncpy_power8
>  		       : (hwcap & PPC_FEATURE_HAS_VSX)
> 

LGTM.

Reviewed-by: Matheus Castanho <msc@linux.ibm.com>

--
Matheus Castanho

* Re: [PATCH v2 2/2] powerpc: Add optimized stpncpy for POWER9
  2020-09-16 12:32     ` Matheus Castanho
@ 2020-09-16 12:56       ` Raphael M Zinsly
  2020-09-18 15:53         ` Paul A. Clarke
  0 siblings, 1 reply; 9+ messages in thread
From: Raphael M Zinsly @ 2020-09-16 12:56 UTC (permalink / raw)
  To: Matheus Castanho, libc-alpha

Hi Matheus,

On 16/09/2020 09:32, Matheus Castanho wrote:
> On 9/4/20 1:59 PM, Raphael M Zinsly via Libc-alpha wrote:
>> Benchtest output:
>>                                  generic_stpncpy    __stpncpy_power9  __stpncpy_power8    __stpncpy_power7    __stpncpy_ppc
> <snip>
>> Length  512, n 1024, alignment  0/ 0:    20.5111    22.9782   19.6648    21.3857 42.4801
> <snip>
>> Length  512, n 1024, alignment  1/ 6:    29.9694    24.3087   22.0513    46.7436 51.5908
> 
> These two seem to be the only cases in which the power9 version loses to
> the power8 one. Have you investigated what happens in these two specific
> cases?
> 
Yes, the power8 optimization calls memset to do the zero padding at the
end if n > length. In these cases, where n is much larger than the
string length, memset is faster than the loop used in my
implementation.
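
To make that concrete, here is a rough C model of the padding step
only (just an illustration, not the actual assembly; the helper name
is made up):

#include <string.h>

/* After the string (len bytes, len < n) has been copied, the
   remaining n - len bytes of dest must be zeroed.  The power8
   version effectively does the memset below; my power9 version
   zeroes that tail with a store loop instead, which loses when
   n - len is very large.  */
static char *
pad_tail (char *dest, size_t len, size_t n)
{
  memset (dest + len, '\0', n - len);
  return dest + len;	/* What stpncpy returns when len < n.  */
}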


Thanks for the review!

Regards,
-- 
Raphael Moreira Zinsly
IBM
Linux on Power Toolchain

* Re: [PATCH v2 2/2] powerpc: Add optimized stpncpy for POWER9
  2020-09-16 12:56       ` Raphael M Zinsly
@ 2020-09-18 15:53         ` Paul A. Clarke
  0 siblings, 0 replies; 9+ messages in thread
From: Paul A. Clarke @ 2020-09-18 15:53 UTC (permalink / raw)
  To: Raphael M Zinsly; +Cc: Matheus Castanho, libc-alpha

On Wed, Sep 16, 2020 at 09:56:59AM -0300, Raphael M Zinsly via Libc-alpha wrote:
> On 16/09/2020 09:32, Matheus Castanho wrote:
> > On 9/4/20 1:59 PM, Raphael M Zinsly via Libc-alpha wrote:
> > > Benchtest output:
> > >                                  generic_stpncpy    __stpncpy_power9  __stpncpy_power8    __stpncpy_power7    __stpncpy_ppc
> > <snip>
> > > Length  512, n 1024, alignment  0/ 0:    20.5111    22.9782   19.6648    21.3857 42.4801
> > <snip>
> > > Length  512, n 1024, alignment  1/ 6:    29.9694    24.3087   22.0513    46.7436 51.5908
> > 
> > These two seem to be the only cases in which the power9 version loses to
> > the power8 one. Have you investigated what happens in these two specific
> > cases?
> > 
> Yes the power8 optimization calls memset to do the zero padding at the end
> if n > length. In this case where n is way higher, memset is faster than the
> loop used in my implementation.

Is there some sort of threshold that would help these cases by transitioning
to memset (or by replicating the relevant part of that code here)?
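
Something along these lines, purely as an illustration (the cutoff
value and the naming are made up and would need benchmarking):

#include <string.h>

#define PAD_MEMSET_CUTOFF 256	/* hypothetical value, needs tuning  */

/* Above the cutoff, hand the padding to memset; below it, keep the
   existing in-line store loop.  */
static void
pad_tail_hybrid (char *dest, size_t len, size_t n)
{
  size_t pad = n - len;

  if (pad > PAD_MEMSET_CUTOFF)
    memset (dest + len, '\0', pad);
  else
    while (pad-- > 0)
      dest[len + pad] = '\0';
}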

PC

end of thread, other threads:[~2020-09-18 15:53 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-09-04 16:56 [PATCH v2 1/2] powerpc: Add optimized strncpy for POWER9 Raphael Moreira Zinsly
2020-09-04 16:56 ` [PATCH v2 2/2] powerpc: Add optimized stpncpy " Raphael Moreira Zinsly
2020-09-04 16:59   ` Raphael M Zinsly
2020-09-16 12:32     ` Matheus Castanho
2020-09-16 12:56       ` Raphael M Zinsly
2020-09-18 15:53         ` Paul A. Clarke
2020-09-16 12:35   ` Matheus Castanho
2020-09-04 16:59 ` [PATCH v2 1/2] powerpc: Add optimized strncpy " Raphael M Zinsly
2020-09-16 12:24 ` Matheus Castanho
