public inbox for libc-alpha@sourceware.org
* [PATCH 1/2] powerpc: Optimized strncpy for POWER9
@ 2020-08-20 18:29 Raphael Moreira Zinsly
  2020-08-20 18:29 ` [PATCH 2/2] powerpc: Optimzed stpncpy " Raphael Moreira Zinsly
                   ` (4 more replies)
  0 siblings, 5 replies; 9+ messages in thread
From: Raphael Moreira Zinsly @ 2020-08-20 18:29 UTC (permalink / raw)
  To: libc-alpha; +Cc: Raphael Moreira Zinsly

Similar to the strcpy P9 optimization, this version uses VSX to improve
performance.
---
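Reviewer note (not part of the commit message): the assembly below is meant
to preserve the usual ISO C semantics of strncpy.  A minimal C sketch of
that behaviour, for reference only:

#include <stddef.h>

char *
strncpy_ref (char *dest, const char *src, size_t n)
{
  size_t i = 0;

  /* Copy at most n bytes, stopping at the first NUL in src.  */
  for (; i < n && src[i] != '\0'; i++)
    dest[i] = src[i];

  /* Pad the remainder of dest with NUL bytes (the zero_padding loop
     in the assembly).  */
  for (; i < n; i++)
    dest[i] = '\0';

  return dest;
}
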
 sysdeps/powerpc/powerpc64/le/power9/strncpy.S | 276 ++++++++++++++++++
 sysdeps/powerpc/powerpc64/multiarch/Makefile  |   2 +-
 .../powerpc64/multiarch/ifunc-impl-list.c     |   5 +
 .../powerpc64/multiarch/strncpy-power9.S      |  26 ++
 sysdeps/powerpc/powerpc64/multiarch/strncpy.c |   7 +
 5 files changed, 315 insertions(+), 1 deletion(-)
 create mode 100644 sysdeps/powerpc/powerpc64/le/power9/strncpy.S
 create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strncpy-power9.S

diff --git a/sysdeps/powerpc/powerpc64/le/power9/strncpy.S b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
new file mode 100644
index 0000000000..cde68384d4
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
@@ -0,0 +1,276 @@
+/* Optimized strncpy implementation for PowerPC64/POWER9.
+   Copyright (C) 2020 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+# ifndef STRNCPY
+#  define FUNC_NAME strncpy
+# else
+#  define FUNC_NAME STRNCPY
+# endif
+
+/* Implements the function
+
+   char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5])
+
+   The implementation can load bytes past a null terminator, but only
+   up to the next 16B boundary, so it never crosses a page.  */
+
+.machine power9
+ENTRY_TOCLESS (FUNC_NAME, 4)
+	CALL_MCOUNT 2
+
+	cmpwi   r5, 0
+	beqlr
+	/* NULL string optimisation  */
+	lbz	r0,0(r4)
+	stb	r0,0(r3)
+	addi	r11,r3,1
+	addi	r5,r5,-1
+	vspltisb v18,0		/* Zeroes in v18  */
+	cmpwi	r0,0
+	beq	L(zero_padding_loop)
+
+	cmpwi	r5,0
+	beqlr
+
+L(cont):
+	addi	r4,r4,1
+	neg	r7,r4
+	rldicl	r9,r7,0,60	/* How many bytes to get source 16B aligned?  */
+
+	/* Get source 16B aligned  */
+	lvx	v0,0,r4
+	lvsr	v1,0,r4
+	vperm	v0,v18,v0,v1
+
+	vcmpequb v6,v0,v18	/* 0xff if byte is NULL, 0x00 otherwise  */
+	vctzlsbb r7,v6		/* Number of trailing zeroes  */
+	addi	r8,r7,1	/* Add null terminator  */
+
+	/* r8 = bytes including null
+	   r9 = bytes to get source 16B aligned
+	   if r8 > r9
+	      no null, copy r9 bytes
+	   else
+	      there is a null, copy r8 bytes and return.  */
+	cmpd	r8,r9
+	bgt	L(no_null)
+
+	cmpd	r8,r5		/* r8 <= n?  */
+	ble	L(null)
+	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
+	stxvl	32+v0,r11,r10	/* Partial store  */
+
+	blr
+
+L(null):
+	sldi	r10,r8,56	/* stxvl wants size in top 8 bits  */
+	stxvl	32+v0,r11,r10	/* Partial store  */
+
+	add	r11,r11,r8
+	sub	r5,r5,r8
+	b L(zero_padding_loop)
+
+L(no_null):
+	cmpd	r9,r5		/* Check if length was reached.  */
+	bge	L(n_tail1)
+	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
+	stxvl	32+v0,r11,r10	/* Partial store  */
+
+	add	r4,r4,r9
+	add	r11,r11,r9
+	sub	r5,r5,r9
+
+L(loop):
+	cmpldi	cr6,r5,64	/* Check if length was reached.  */
+	ble	cr6,L(final_loop)
+
+	lxv	32+v0,0(r4)
+	vcmpequb. v6,v0,v18	/* Any zero bytes?  */
+	bne	cr6,L(prep_tail1)
+
+	lxv	32+v1,16(r4)
+	vcmpequb. v6,v1,v18	/* Any zero bytes?  */
+	bne	cr6,L(prep_tail2)
+
+	lxv	32+v2,32(r4)
+	vcmpequb. v6,v2,v18	/* Any zero bytes?  */
+	bne	cr6,L(prep_tail3)
+
+	lxv	32+v3,48(r4)
+	vcmpequb. v6,v3,v18	/* Any zero bytes?  */
+	bne	cr6,L(prep_tail4)
+
+	stxv	32+v0,0(r11)
+	stxv	32+v1,16(r11)
+	stxv	32+v2,32(r11)
+	stxv	32+v3,48(r11)
+
+	addi	r4,r4,64
+	addi	r11,r11,64
+	addi	r5,r5,-64
+
+	b	L(loop)
+
+L(final_loop):
+	cmpldi	cr5,r5,16
+	lxv	32+v0,0(r4)
+	vcmpequb. v6,v0,v18	/* Any zero bytes?  */
+	ble	cr5,L(prep_n_tail1)
+	bne	cr6,L(count_tail1)
+	addi	r5,r5,-16
+
+	cmpldi	cr5,r5,16
+	lxv	32+v1,16(r4)
+	vcmpequb. v6,v1,v18	/* Any zero bytes?  */
+	ble	cr5,L(prep_n_tail2)
+	bne	cr6,L(count_tail2)
+	addi	r5,r5,-16
+
+	cmpldi	cr5,r5,16
+	lxv	32+v2,32(r4)
+	vcmpequb. v6,v2,v18	/* Any zero bytes?  */
+	ble	cr5,L(prep_n_tail3)
+	bne	cr6,L(count_tail3)
+	addi	r5,r5,-16
+
+	lxv	32+v3,48(r4)
+	vcmpequb. v6,v3,v18	/* Any zero bytes?  */
+	beq	cr6,L(n_tail4)
+
+	vctzlsbb r8,v6		/* Number of trailing zeroes  */
+	cmpd	r8,r5		/* r8 < n?  */
+	blt	L(tail4)
+L(n_tail4):
+	stxv	32+v0,0(r11)
+	stxv	32+v1,16(r11)
+	stxv	32+v2,32(r11)
+	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
+	addi	r11,r11,48	/* Offset */
+	stxvl	32+v3,r11,r10	/* Partial store  */
+	blr
+
+L(prep_n_tail1):
+	beq	cr6,L(n_tail1)	/* Any zero bytes?  */
+	vctzlsbb r8,v6		/* Number of trailing zeroes  */
+	cmpd	r8,r5		/* r8 < n?  */
+	blt	L(tail1)
+L(n_tail1):
+	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
+	stxvl	32+v0,r11,r10	/* Partial store  */
+	blr
+
+L(prep_n_tail2):
+	beq	cr6,L(n_tail2)	/* Any zero bytes?  */
+	vctzlsbb r8,v6		/* Number of trailing zeroes  */
+	cmpd	r8,r5		/* r8 < n?  */
+	blt	L(tail2)
+L(n_tail2):
+	stxv	32+v0,0(r11)
+	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
+	addi	r11,r11,16	/* offset */
+	stxvl	32+v1,r11,r10	/* Partial store  */
+	blr
+
+L(prep_n_tail3):
+	beq	cr6,L(n_tail3)	/* Any zero bytes?  */
+	vctzlsbb r8,v6		/* Number of trailing zeroes  */
+	cmpd	r8,r5		/* r8 < n?  */
+	blt	L(tail3)
+L(n_tail3):
+	stxv	32+v0,0(r11)
+	stxv	32+v1,16(r11)
+	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
+	addi	r11,r11,32	/* Offset */
+	stxvl	32+v2,r11,r10	/* Partial store  */
+	blr
+
+L(prep_tail1):
+L(count_tail1):
+	vctzlsbb r8,v6		/* Number of trailing zeroes  */
+L(tail1):
+	addi	r9,r8,1	/* Add null terminator  */
+	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
+	stxvl	32+v0,r11,r10	/* Partial store  */
+	add	r11,r11,r9
+	sub	r5,r5,r9
+	b L(zero_padding_loop)
+
+L(prep_tail2):
+	addi	r5,r5,-16
+L(count_tail2):
+	vctzlsbb r8,v6		/* Number of trailing zeroes  */
+L(tail2):
+	addi	r9,r8,1	/* Add null terminator  */
+	stxv	32+v0,0(r11)
+	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
+	addi	r11,r11,16	/* offset */
+	stxvl	32+v1,r11,r10	/* Partial store  */
+	add	r11,r11,r9
+	sub	r5,r5,r9
+	b L(zero_padding_loop)
+
+L(prep_tail3):
+	addi	r5,r5,-32
+L(count_tail3):
+	vctzlsbb r8,v6		/* Number of trailing zeroes  */
+L(tail3):
+	addi	r9,r8,1	/* Add null terminator  */
+	stxv	32+v0,0(r11)
+	stxv	32+v1,16(r11)
+	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
+	addi	r11,r11,32	/* offset */
+	stxvl	32+v2,r11,r10	/* Partial store  */
+	add	r11,r11,r9
+	sub	r5,r5,r9
+	b L(zero_padding_loop)
+
+L(prep_tail4):
+	addi	r5,r5,-48
+	vctzlsbb r8,v6		/* Number of trailing zeroes  */
+L(tail4):
+	addi	r9,r8,1	/* Add null terminator  */
+	stxv	32+v0,0(r11)
+	stxv	32+v1,16(r11)
+	stxv	32+v2,32(r11)
+	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
+	addi	r11,r11,48	/* offset */
+	stxvl	32+v3,r11,r10	/* Partial store  */
+	add	r11,r11,r9
+	sub	r5,r5,r9
+
+/* This code pads the remainder of dest with NULL bytes.  */
+L(zero_padding_loop):
+	cmpldi	cr6,r5,16	/* Check if length was reached.  */
+	ble	cr6,L(zero_padding_end)
+
+	stxv	v18,0(r11)
+	addi	r11,r11,16
+	addi	r5,r5,-16
+
+	b	L(zero_padding_loop)
+
+L(zero_padding_end):
+	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
+	stxvl	v18,r11,r10	/* Partial store  */
+	blr
+
+L(n_tail):
+
+END (FUNC_NAME)
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index 19acb6c64a..cd2b47b403 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -33,7 +33,7 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \
 
 ifneq (,$(filter %le,$(config-machine)))
 sysdep_routines += strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \
-		   rawmemchr-power9 strlen-power9
+		   rawmemchr-power9 strlen-power9 strncpy-power9
 endif
 CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
 CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index ea10b00417..aa63e1c23f 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -301,6 +301,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/powerpc/powerpc64/multiarch/strncpy.c.  */
   IFUNC_IMPL (i, name, strncpy,
+#ifdef __LITTLE_ENDIAN__
+	      IFUNC_IMPL_ADD (array, i, strncpy,
+			      hwcap2 & PPC_FEATURE2_ARCH_3_00,
+			      __strncpy_power9)
+#endif
 	      IFUNC_IMPL_ADD (array, i, strncpy,
 			      hwcap2 & PPC_FEATURE2_ARCH_2_07,
 			      __strncpy_power8)
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncpy-power9.S b/sysdeps/powerpc/powerpc64/multiarch/strncpy-power9.S
new file mode 100644
index 0000000000..b9b6092f7b
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/strncpy-power9.S
@@ -0,0 +1,26 @@
+/* Optimized strncpy implementation for POWER9/PPC64.
+   Copyright (C) 2016-2020 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#if defined __LITTLE_ENDIAN__ && IS_IN (libc)
+#define STRNCPY __strncpy_power9
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+#include <sysdeps/powerpc/powerpc64/le/power9/strncpy.S>
+#endif
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncpy.c b/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
index 7bacf28aca..822ceb2003 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
@@ -28,11 +28,18 @@
 extern __typeof (strncpy) __strncpy_ppc attribute_hidden;
 extern __typeof (strncpy) __strncpy_power7 attribute_hidden;
 extern __typeof (strncpy) __strncpy_power8 attribute_hidden;
+# ifdef __LITTLE_ENDIAN__
+extern __typeof (strncpy) __strncpy_power9 attribute_hidden;
+# endif
 # undef strncpy
 
 /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
  ifunc symbol properly. */
 libc_ifunc_redirected (__redirect_strncpy, strncpy,
+# ifdef __LITTLE_ENDIAN__
+		   (hwcap2 & PPC_FEATURE2_ARCH_3_00)
+		   ? __strncpy_power9 :
+# endif
 		       (hwcap2 & PPC_FEATURE2_ARCH_2_07)
 		       ? __strncpy_power8
 		       : (hwcap & PPC_FEATURE_HAS_VSX)
-- 
2.26.2



* [PATCH 2/2] powerpc: Optimzed stpncpy for POWER9
  2020-08-20 18:29 [PATCH 1/2] powerpc: Optimized strncpy for POWER9 Raphael Moreira Zinsly
@ 2020-08-20 18:29 ` Raphael Moreira Zinsly
  2020-08-20 18:31   ` Raphael M Zinsly
  2020-08-28 17:04   ` Paul E Murphy
  2020-08-20 18:31 ` [PATCH 1/2] powerpc: Optimized strncpy " Raphael M Zinsly
                   ` (3 subsequent siblings)
  4 siblings, 2 replies; 9+ messages in thread
From: Raphael Moreira Zinsly @ 2020-08-20 18:29 UTC (permalink / raw)
  To: libc-alpha; +Cc: Raphael Moreira Zinsly

Adds stpncpy support into the POWER9 strncpy.
---
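Reviewer note (not part of the commit message): relative to strncpy, the
only semantic change is the return value.  A rough C sketch of the intended
behaviour, assuming the POSIX definition of stpncpy:

#include <stddef.h>
#include <string.h>

/* Returns a pointer to the terminating NUL written into dest, or to
   dest + n if the first n bytes of src contain no NUL.  */
char *
stpncpy_ref (char *dest, const char *src, size_t n)
{
  size_t len = strnlen (src, n);
  memcpy (dest, src, len);
  memset (dest + len, '\0', n - len);
  return dest + len;
}
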
 sysdeps/powerpc/powerpc64/le/power9/stpncpy.S | 24 ++++++
 sysdeps/powerpc/powerpc64/le/power9/strncpy.S | 74 +++++++++++++++++++
 sysdeps/powerpc/powerpc64/multiarch/Makefile  |  2 +-
 .../powerpc64/multiarch/ifunc-impl-list.c     |  5 ++
 .../powerpc64/multiarch/stpncpy-power9.S      | 24 ++++++
 sysdeps/powerpc/powerpc64/multiarch/stpncpy.c |  7 ++
 6 files changed, 135 insertions(+), 1 deletion(-)
 create mode 100644 sysdeps/powerpc/powerpc64/le/power9/stpncpy.S
 create mode 100644 sysdeps/powerpc/powerpc64/multiarch/stpncpy-power9.S

diff --git a/sysdeps/powerpc/powerpc64/le/power9/stpncpy.S b/sysdeps/powerpc/powerpc64/le/power9/stpncpy.S
new file mode 100644
index 0000000000..a96840bb6f
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/le/power9/stpncpy.S
@@ -0,0 +1,24 @@
+/* Optimized stpncpy implementation for PowerPC64/POWER9.
+   Copyright (C) 2015-2020 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define USE_AS_STPNCPY
+#include <sysdeps/powerpc/powerpc64/le/power9/strncpy.S>
+
+weak_alias (__stpncpy, stpncpy)
+libc_hidden_def (__stpncpy)
+libc_hidden_builtin_def (stpncpy)
diff --git a/sysdeps/powerpc/powerpc64/le/power9/strncpy.S b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
index cde68384d4..64b06a9040 100644
--- a/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
+++ b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
@@ -18,16 +18,30 @@
 
 #include <sysdep.h>
 
+#ifdef USE_AS_STPNCPY
+# ifndef STPNCPY
+#   define FUNC_NAME __stpncpy
+# else
+#   define FUNC_NAME STPNCPY
+# endif
+#else
 # ifndef STRNCPY
 #  define FUNC_NAME strncpy
 # else
 #  define FUNC_NAME STRNCPY
 # endif
+#endif  /* !USE_AS_STPNCPY  */
 
 /* Implements the function
 
    char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5])
 
+   or
+
+   char * [r3] stpncpy (char *dest [r3], const char *src [r4], size_t n [r5])
+
+   if USE_AS_STPNCPY is defined.
+
    The implementation can load bytes past a null terminator, but only
    up to the next 16B boundary, so it never crosses a page.  */
 
@@ -47,6 +61,13 @@ ENTRY_TOCLESS (FUNC_NAME, 4)
 	beq	L(zero_padding_loop)
 
 	cmpwi	r5,0
+#ifdef USE_AS_STPNCPY
+	bgt	L(cont)
+	/* stpncpy returns the dest address plus the size not counting the
+	   final '\0'.  */
+	addi	r3,r3,1
+	blr
+#endif
 	beqlr
 
 L(cont):
@@ -77,12 +98,22 @@ L(cont):
 	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
 	stxvl	32+v0,r11,r10	/* Partial store  */
 
+#ifdef USE_AS_STPNCPY
+	/* stpncpy returns the dest address plus the size not counting the
+	   final '\0'.  */
+	add	r3,r11,r5
+#endif
 	blr
 
 L(null):
 	sldi	r10,r8,56	/* stxvl wants size in top 8 bits  */
 	stxvl	32+v0,r11,r10	/* Partial store  */
 
+#ifdef USE_AS_STPNCPY
+	/* stpncpy returns the dest address plus the size not counting the
+	   final '\0'.  */
+	add	r3,r11,r7
+#endif
 	add	r11,r11,r8
 	sub	r5,r5,r8
 	b L(zero_padding_loop)
@@ -164,6 +195,11 @@ L(n_tail4):
 	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
 	addi	r11,r11,48	/* Offset */
 	stxvl	32+v3,r11,r10	/* Partial store  */
+#ifdef USE_AS_STPNCPY
+	/* stpncpy returns the dest address plus the size not counting the
+	   final '\0'.  */
+	add	r3,r11,r5
+#endif
 	blr
 
 L(prep_n_tail1):
@@ -174,6 +210,11 @@ L(prep_n_tail1):
 L(n_tail1):
 	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
 	stxvl	32+v0,r11,r10	/* Partial store  */
+#ifdef USE_AS_STPNCPY
+	/* stpncpy returns the dest address plus the size not counting the
+	   final '\0'.  */
+	add	r3,r11,r5
+#endif
 	blr
 
 L(prep_n_tail2):
@@ -186,6 +227,11 @@ L(n_tail2):
 	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
 	addi	r11,r11,16	/* offset */
 	stxvl	32+v1,r11,r10	/* Partial store  */
+#ifdef USE_AS_STPNCPY
+	/* stpncpy returns the dest address plus the size not counting the
+	   final '\0'.  */
+	add	r3,r11,r5
+#endif
 	blr
 
 L(prep_n_tail3):
@@ -199,6 +245,11 @@ L(n_tail3):
 	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
 	addi	r11,r11,32	/* Offset */
 	stxvl	32+v2,r11,r10	/* Partial store  */
+#ifdef USE_AS_STPNCPY
+	/* stpncpy returns the dest address plus the size not counting the
+	   final '\0'.  */
+	add	r3,r11,r5
+#endif
 	blr
 
 L(prep_tail1):
@@ -208,6 +259,11 @@ L(tail1):
 	addi	r9,r8,1	/* Add null terminator  */
 	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
 	stxvl	32+v0,r11,r10	/* Partial store  */
+#ifdef USE_AS_STPNCPY
+	/* stpncpy returns the dest address plus the size not counting the
+	   final '\0'.  */
+	add	r3,r11,r8
+#endif
 	add	r11,r11,r9
 	sub	r5,r5,r9
 	b L(zero_padding_loop)
@@ -222,6 +278,11 @@ L(tail2):
 	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
 	addi	r11,r11,16	/* offset */
 	stxvl	32+v1,r11,r10	/* Partial store  */
+#ifdef USE_AS_STPNCPY
+	/* stpncpy returns the dest address plus the size not counting the
+	   final '\0'.  */
+	add	r3,r11,r8
+#endif
 	add	r11,r11,r9
 	sub	r5,r5,r9
 	b L(zero_padding_loop)
@@ -237,6 +298,11 @@ L(tail3):
 	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
 	addi	r11,r11,32	/* offset */
 	stxvl	32+v2,r11,r10	/* Partial store  */
+#ifdef USE_AS_STPNCPY
+	/* stpncpy returns the dest address plus the size not counting the
+	   final '\0'.  */
+	add	r3,r11,r8
+#endif
 	add	r11,r11,r9
 	sub	r5,r5,r9
 	b L(zero_padding_loop)
@@ -252,6 +318,11 @@ L(tail4):
 	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
 	addi	r11,r11,48	/* offset */
 	stxvl	32+v3,r11,r10	/* Partial store  */
+#ifdef USE_AS_STPNCPY
+	/* stpncpy returns the dest address plus the size not counting the
+	   final '\0'.  */
+	add	r3,r11,r8
+#endif
 	add	r11,r11,r9
 	sub	r5,r5,r9
 
@@ -274,3 +345,6 @@ L(zero_padding_end):
 L(n_tail):
 
 END (FUNC_NAME)
+#ifndef USE_AS_STPNCPY
+libc_hidden_builtin_def (strncpy)
+#endif
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index cd2b47b403..f46bf50732 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -33,7 +33,7 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \
 
 ifneq (,$(filter %le,$(config-machine)))
 sysdep_routines += strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \
-		   rawmemchr-power9 strlen-power9 strncpy-power9
+		   rawmemchr-power9 strlen-power9 strncpy-power9 stpncpy-power9
 endif
 CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
 CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index aa63e1c23f..56790bcfe3 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -317,6 +317,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/powerpc/powerpc64/multiarch/stpncpy.c.  */
   IFUNC_IMPL (i, name, stpncpy,
+#ifdef __LITTLE_ENDIAN__
+	      IFUNC_IMPL_ADD (array, i, stpncpy,
+			      hwcap2 & PPC_FEATURE2_ARCH_3_00,
+			      __stpncpy_power9)
+#endif
 	      IFUNC_IMPL_ADD (array, i, stpncpy,
 			      hwcap2 & PPC_FEATURE2_ARCH_2_07,
 			      __stpncpy_power8)
diff --git a/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power9.S b/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power9.S
new file mode 100644
index 0000000000..ecbbb5c8e9
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power9.S
@@ -0,0 +1,24 @@
+/* Optimized stpncpy implementation for POWER9/PPC64.
+   Copyright (C) 2015-2020 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#define STPNCPY __stpncpy_power9
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+#include <sysdeps/powerpc/powerpc64/le/power9/stpncpy.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c b/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
index 17df886431..21702716a3 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
@@ -26,10 +26,17 @@
 extern __typeof (__stpncpy) __stpncpy_ppc attribute_hidden;
 extern __typeof (__stpncpy) __stpncpy_power7 attribute_hidden;
 extern __typeof (__stpncpy) __stpncpy_power8 attribute_hidden;
+# ifdef __LITTLE_ENDIAN__
+extern __typeof (__stpncpy) __stpncpy_power9 attribute_hidden;
+# endif
 # undef stpncpy
 # undef __stpncpy
 
 libc_ifunc_redirected (__redirect___stpncpy, __stpncpy,
+# ifdef __LITTLE_ENDIAN__
+		   (hwcap2 & PPC_FEATURE2_ARCH_3_00)
+		   ? __stpncpy_power9 :
+# endif
 		       (hwcap2 & PPC_FEATURE2_ARCH_2_07)
 		       ? __stpncpy_power8
 		       : (hwcap & PPC_FEATURE_HAS_VSX)
-- 
2.26.2



* Re: [PATCH 1/2] powerpc: Optimized strncpy for POWER9
  2020-08-20 18:29 [PATCH 1/2] powerpc: Optimized strncpy for POWER9 Raphael Moreira Zinsly
  2020-08-20 18:29 ` [PATCH 2/2] powerpc: Optimzed stpncpy " Raphael Moreira Zinsly
@ 2020-08-20 18:31 ` Raphael M Zinsly
  2020-08-28 14:25 ` Paul E Murphy
                   ` (2 subsequent siblings)
  4 siblings, 0 replies; 9+ messages in thread
From: Raphael M Zinsly @ 2020-08-20 18:31 UTC (permalink / raw)
  To: libc-alpha

Here is the make bench output:

                             	generic_strncpy	__strncpy_power9	__strncpy_power8	__strncpy_power7	__strncpy_ppc
Length   16, n   16, alignment  1/ 1:	7.11694	2.77348	2.80296	6.5724	10.4471
Length   16, n   16, alignment  1/ 1:	7.1557	2.75968	2.805	6.5748	10.5064
Length   16, n   16, alignment  1/ 2:	7.17956	2.79127	2.79964	6.57323 
10.3281
Length   16, n   16, alignment  2/ 1:	7.15841	2.77364	3.10582	6.2332	10.331
Length    2, n    4, alignment  7/ 2:	8.90911	2.4623	5.38449	5.64873	9.36348
Length    4, n    2, alignment  2/ 7:	6.65395	1.84558	2.58298	3.10566 
7.46376
Length    2, n    4, alignment  7/ 2:	8.70625	2.41166	5.38131	5.73421 
9.64285
Length    4, n    2, alignment  2/ 7:	6.65458	1.84354	2.58382	3.64721 
6.96163
Length   16, n   16, alignment  2/ 2:	7.01778	2.77373	3.10668	6.58047 
10.6006
Length   16, n   16, alignment  2/ 2:	7.53778	2.75789	3.10591	6.2277	10.2613
Length   16, n   16, alignment  2/ 4:	7.13828	2.79132	3.10567	6.56847	10.619
Length   16, n   16, alignment  4/ 2:	7.38659	2.77668	3.70851	6.54537 
9.17368
Length    4, n    8, alignment  6/ 4:	8.71748	2.45183	5.76669	4.65782	10.014
Length    8, n    4, alignment  4/ 6:	6.5504	1.83463	2.96574	2.66227	8.49964
Length    4, n    8, alignment  6/ 4:	8.96461	2.4499	5.78384	5.32287	9.79641
Length    8, n    4, alignment  4/ 6:	6.48083	1.83265	2.9783	3.38632	8.51888
Length   16, n   16, alignment  3/ 3:	7.7538	2.77353	3.29008	6.55912	9.94143
Length   16, n   16, alignment  3/ 3:	7.75279	2.76148	3.30616	6.5445	9.98866
Length   16, n   16, alignment  3/ 6:	7.21486	2.79444	3.33712	6.24747	10.113
Length   16, n   16, alignment  6/ 3:	6.99138	2.77778	4.50777	6.22522 
8.53482
Length    8, n   16, alignment  5/ 6:	8.26994	2.77966	4.60681	6.10938 
10.5975
Length   16, n    8, alignment  6/ 5:	6.28062	2.07193	2.57761	4.95636 
6.48035
Length    8, n   16, alignment  5/ 6:	8.17113	2.43559	4.27753	5.95453 
11.1796
Length   16, n    8, alignment  6/ 5:	6.21214	2.07239	2.57714	4.96762 
6.76041
Length   16, n   16, alignment  4/ 4:	7.31373	2.77573	3.78349	6.19349 
8.91432
Length   16, n   16, alignment  4/ 4:	7.32226	2.75658	3.70319	6.60792 
9.17307
Length   16, n   16, alignment  4/ 0:	7.58812	2.76841	3.71554	6.54282 
8.90051
Length   16, n   16, alignment  0/ 4:	6.92871	2.06944	2.66876	6.63947 
8.08171
Length   16, n   32, alignment  4/ 0:	10.2972	3.3192	6.53695	7.77295	12.332
Length   32, n   16, alignment  0/ 4:	6.98056	2.06954	2.66445	6.54976 
7.65286
Length   16, n   32, alignment  4/ 0:	10.5356	3.31343	6.53813	7.72029 
12.2915
Length   32, n   16, alignment  0/ 4:	7.36068	2.06945	2.66424	6.21052 
8.07614
Length   16, n   16, alignment  5/ 5:	7.2122	2.77732	4.17451	6.55383	10.4887
Length   16, n   16, alignment  5/ 5:	7.34438	2.77512	4.17191	6.56873 
10.5664
Length   16, n   16, alignment  5/ 2:	7.15746	2.76198	4.14481	6.56235 
10.7391
Length   16, n   16, alignment  2/ 5:	7.19372	2.79273	3.10693	6.56984 
10.2697
Length   32, n   64, alignment  3/ 2:	15.3918	4.22964	7.0146	12.5809	13.8661
Length   64, n   32, alignment  2/ 3:	10.5331	3.02942	3.54253	9.19106 
12.9356
Length   32, n   64, alignment  3/ 2:	15.369	4.17282	7.36163	12.5759	16.8501
Length   64, n   32, alignment  2/ 3:	10.5585	3.01971	3.52885	9.03369 
15.6663
Length   16, n   16, alignment  6/ 6:	7.0405	2.77527	4.53842	6.54733	7.99437
Length   16, n   16, alignment  6/ 6:	7.02801	2.76059	4.52873	6.53536 
8.45713
Length   16, n   16, alignment  6/ 4:	7.42011	2.77669	4.52223	6.57756	7.9899
Length   16, n   16, alignment  4/ 6:	7.37787	2.77507	3.77821	6.57058 
9.17396
Length   64, n  128, alignment  2/ 4:	17.188	5.33493	8.00394	12.6196	19.1784
Length  128, n   64, alignment  4/ 2:	12.7962	3.91004	5.42994	11.294	12.5273
Length   64, n  128, alignment  2/ 4:	17.2298	5.2748	8.15392	12.6039	24.3802
Length  128, n   64, alignment  4/ 2:	12.7866	3.87534	5.3334	11.8516	21.6528
Length   16, n   16, alignment  7/ 7:	7.75015	2.76775	5.59024	6.57976 
8.42318
Length   16, n   16, alignment  7/ 7:	7.81681	2.75691	5.56801	6.55397 
10.0378
Length   16, n   16, alignment  7/ 6:	7.75225	2.77446	5.56813	6.57349 
8.49645
Length   16, n   16, alignment  6/ 7:	7.23237	2.79186	4.51528	6.55304 
8.63443
Length  128, n  256, alignment  1/ 6:	19.8414	8.37691	10.3445	18.4838 
22.8314
Length  256, n  128, alignment  6/ 1:	14.7972	5.38498	8.83611	13.8521 
16.6154
Length  128, n  256, alignment  1/ 6:	19.8497	8.37754	10.3469	18.2655 
43.3568
Length  256, n  128, alignment  6/ 1:	14.7542	5.31075	8.75314	13.7759 
37.6351
Length    8, n   16, alignment  0/ 0:	8.19872	2.45818	4.27602	4.6578	7.98513
Length   32, n   16, alignment  0/ 0:	6.92066	2.07115	2.66465	2.66381 
7.75655
Length    8, n   16, alignment  7/ 2:	8.18253	2.42685	4.70317	6.01808 
9.35743
Length   32, n   16, alignment  7/ 2:	7.79714	2.60074	5.58717	6.64181 
6.98583
Length   16, n   32, alignment  0/ 0:	10.4715	3.40184	6.28388	4.86146 
11.0819
Length   64, n   32, alignment  0/ 0:	10.4403	2.54135	3.07109	3.38791 
9.35196
Length   16, n   32, alignment  6/ 4:	10.7077	3.34867	7.01321	10.4278 
11.2951
Length   64, n   32, alignment  6/ 4:	10.9215	3.03041	5.04324	8.30023 
11.2648
Length   32, n   64, alignment  0/ 0:	12.0062	4.09428	5.32372	5.48319 
14.1455
Length  128, n   64, alignment  0/ 0:	10.1803	3.47282	3.83134	4.21557 
10.6674
Length   32, n   64, alignment  5/ 6:	15.4165	4.16297	7.78876	10.8762 
15.4308
Length  128, n   64, alignment  5/ 6:	12.7332	3.91667	5.8014	10.5869	14.0961
Length   64, n  128, alignment  0/ 0:	13.238	5.24242	6.90661	8.05566	15.9848
Length  256, n  128, alignment  0/ 0:	8.759	4.9483	6.98675	6.11489	12.6755
Length   64, n  128, alignment  4/ 0:	13.6593	5.27931	8.60925	12.5916 
17.5016
Length  256, n  128, alignment  4/ 0:	13.4801	5.37114	7.47485	14.0585 
17.4517
Length  128, n  256, alignment  0/ 0:	15.3147	8.02462	8.92006	9.67769 
20.3757
Length  512, n  256, alignment  0/ 0:	11.5638	7.22535	9.80468	9.93597 
21.3421
Length  128, n  256, alignment  3/ 2:	19.8948	8.15967	10.9435	20.6146 
22.4146
Length  512, n  256, alignment  3/ 2:	18.681	7.77864	10.9269	25.9269	28.0105
Length  256, n  512, alignment  0/ 0:	19.4894	13.7363	14.8394	15.4064 
31.6341
Length 1024, n  512, alignment  0/ 0:	18.0108	11.8737	18.1779	18.5072 
41.5425
Length  256, n  512, alignment  2/ 4:	25.5662	14.5189	16.1872	29.5395 
33.7587
Length 1024, n  512, alignment  2/ 4:	28.0079	13.2347	19.067	48.1998	52.3078
Length  512, n 1024, alignment  0/ 0:	23.1385	25.4237	21.2303	23.632	47.4502
Length 2048, n 1024, alignment  0/ 0:	31.201	21.308	40.6351	39.04	75.0329
Length  512, n 1024, alignment  1/ 6:	35.6234	27.0042	24.4711	51.3364 
60.6277
Length 2048, n 1024, alignment  1/ 6:	47.442	24.0381	41.6616	85.4832	91.8897


* Re: [PATCH 2/2] powerpc: Optimzed stpncpy for POWER9
  2020-08-20 18:29 ` [PATCH 2/2] powerpc: Optimzed stpncpy " Raphael Moreira Zinsly
@ 2020-08-20 18:31   ` Raphael M Zinsly
  2020-08-28 17:04   ` Paul E Murphy
  1 sibling, 0 replies; 9+ messages in thread
From: Raphael M Zinsly @ 2020-08-20 18:31 UTC (permalink / raw)
  To: libc-alpha

Here is the make bench output:

                             	generic_stpncpy	__stpncpy_power9	__stpncpy_power8	__stpncpy_power7	__stpncpy_ppc
Length   16, n   16, alignment  1/ 1:	7.31792	2.79249	2.98207	6.20964 
11.2262
Length   16, n   16, alignment  1/ 1:	7.26441	2.79883	2.97986	6.09795 
11.1118
Length   16, n   16, alignment  1/ 2:	7.22475	2.82518	2.98169	6.18967 
10.9933
Length   16, n   16, alignment  2/ 1:	7.28211	2.78851	3.1079	6.06067	10.4232
Length    2, n    4, alignment  7/ 2:	9.30193	2.4733	4.30086	4.74387	9.25328
Length    4, n    2, alignment  2/ 7:	6.7756	1.91031	2.93946	3.24475	7.76389
Length    2, n    4, alignment  7/ 2:	8.81319	2.4726	4.57341	4.74421	9.44667
Length    4, n    2, alignment  2/ 7:	6.77806	1.9118	2.93637	3.1857	7.00171
Length   16, n   16, alignment  2/ 2:	7.35335	2.80104	3.10653	5.85492 
10.5689
Length   16, n   16, alignment  2/ 2:	7.14308	2.78571	3.10889	6.10044 
10.4816
Length   16, n   16, alignment  2/ 4:	7.21628	2.81563	3.10724	6.14674 
10.6005
Length   16, n   16, alignment  4/ 2:	7.47713	2.80531	3.80081	5.86977 
9.43599
Length    4, n    8, alignment  6/ 4:	8.63537	2.4676	5.53825	4.1877	9.88309
Length    8, n    4, alignment  4/ 6:	6.63429	1.91051	3.10751	2.76472	8.4156
Length    4, n    8, alignment  6/ 4:	8.59304	2.43152	5.30288	4.16475 
9.77498
Length    8, n    4, alignment  4/ 6:	6.63843	1.91047	3.19713	2.69566 
8.67023
Length   16, n   16, alignment  3/ 3:	7.45277	2.80045	3.42433	6.06204 
9.92282
Length   16, n   16, alignment  3/ 3:	8.04191	2.78645	3.43317	5.99773 
10.0662
Length   16, n   16, alignment  3/ 6:	7.5816	2.81606	3.44168	6.0801	9.94673
Length   16, n   16, alignment  6/ 3:	7.10582	2.80176	5.03947	6.06942 
8.40249
Length    8, n   16, alignment  5/ 6:	8.19747	2.42028	4.30043	5.0752	11.3093
Length   16, n    8, alignment  6/ 5:	6.37287	2.07239	2.56322	4.36972 
6.52164
Length    8, n   16, alignment  5/ 6:	8.25022	2.45124	4.05051	5.02258 
10.8683
Length   16, n    8, alignment  6/ 5:	6.31868	2.07215	2.83061	4.44584 
7.14464
Length   16, n   16, alignment  4/ 4:	7.54408	2.80105	3.82846	5.71392 
9.91359
Length   16, n   16, alignment  4/ 4:	7.66265	2.79063	3.86233	6.06489 
9.31705
Length   16, n   16, alignment  4/ 0:	7.84286	2.79896	3.83148	6.08954 
9.55253
Length   16, n   16, alignment  0/ 4:	7.36697	2.07019	2.66533	6.13894 
7.75685
Length   16, n   32, alignment  4/ 0:	10.3819	3.33088	6.32994	7.24949 
12.3827
Length   32, n   16, alignment  0/ 4:	7.15586	2.07172	2.66097	6.11743 
7.56448
Length   16, n   32, alignment  4/ 0:	10.3262	3.35225	6.34556	7.3211	12.2527
Length   32, n   16, alignment  0/ 4:	7.13287	2.07265	2.6613	6.17878	7.61901
Length   16, n   16, alignment  5/ 5:	7.22471	2.80128	4.65776	6.15455 
9.93333
Length   16, n   16, alignment  5/ 5:	7.22458	2.78586	4.65874	6.06763 
9.87968
Length   16, n   16, alignment  5/ 2:	7.22718	2.79127	4.65999	6.025	10.3775
Length   16, n   16, alignment  2/ 5:	7.73485	2.8025	3.10754	6.08303	10.3871
Length   32, n   64, alignment  3/ 2:	13.7685	4.1256	7.04965	11.5105	15.3903
Length   64, n   32, alignment  2/ 3:	10.526	3.05149	3.59497	8.45078	13.7462
Length   32, n   64, alignment  3/ 2:	13.7681	4.11611	7.08236	11.5129 
16.6004
Length   64, n   32, alignment  2/ 3:	10.962	3.05712	3.60447	8.43981	15.4906
Length   16, n   16, alignment  6/ 6:	7.30916	2.80056	5.03985	6.16331 
8.43692
Length   16, n   16, alignment  6/ 6:	7.31688	2.7914	5.02931	6.12345	8.42848
Length   16, n   16, alignment  6/ 4:	7.7402	2.7993	5.04435	6.02685	8.28199
Length   16, n   16, alignment  4/ 6:	7.79103	2.82496	3.82464	6.0778	9.31532
Length   64, n  128, alignment  2/ 4:	15.4969	5.3714	8.09812	12.6067	18.7831
Length  128, n   64, alignment  4/ 2:	12.9023	3.93138	5.46487	10.7071 
13.3253
Length   64, n  128, alignment  2/ 4:	15.4998	5.42611	7.88843	12.6007 
24.0491
Length  128, n   64, alignment  4/ 2:	12.8971	3.94646	5.49689	11.1747 
21.5779
Length   16, n   16, alignment  7/ 7:	7.68992	2.78151	6.14775	6.19397 
8.38412
Length   16, n   16, alignment  7/ 7:	7.90811	2.7803	6.11502	6.17383	8.78371
Length   16, n   16, alignment  7/ 6:	7.45456	2.80173	5.93657	6.15191 
8.38489
Length   16, n   16, alignment  6/ 7:	7.44846	2.80238	5.03654	6.1154	8.41589
Length  128, n  256, alignment  1/ 6:	17.9114	8.39532	10.3246	17.9457 
21.9452
Length  256, n  128, alignment  6/ 1:	14.8346	5.41104	8.89047	13.5379 
17.1437
Length  128, n  256, alignment  1/ 6:	17.9118	8.39985	10.3271	17.9503 
42.0831
Length  256, n  128, alignment  6/ 1:	14.8306	5.40714	9.04492	13.5227	37.819
Length    8, n   16, alignment  0/ 0:	8.19945	2.46752	4.04264	4.62897 
8.22975
Length   32, n   16, alignment  0/ 0:	7.23617	2.07229	2.66504	2.66683 
7.93411
Length    8, n   16, alignment  7/ 2:	8.26373	2.41779	4.18003	5.31418	9.0473
Length   32, n   16, alignment  7/ 2:	7.46119	2.63992	6.16424	6.14534 
7.28237
Length   16, n   32, alignment  0/ 0:	10.1282	3.42401	5.00287	5.02318 
11.4985
Length   64, n   32, alignment  0/ 0:	9.29452	2.57779	2.79807	3.1362	10.9532
Length   16, n   32, alignment  6/ 4:	10.2194	3.30297	7.48371	10.4067 
11.2264
Length   64, n   32, alignment  6/ 4:	10.6887	3.04976	5.13062	8.10511 
11.1225
Length   32, n   64, alignment  0/ 0:	12.1806	4.09924	5.12341	6.14159 
14.0965
Length  128, n   64, alignment  0/ 0:	10.1569	3.52625	3.88528	4.65782 
11.3018
Length   32, n   64, alignment  5/ 6:	13.7795	4.13456	8.53476	10.2846 
15.1556
Length  128, n   64, alignment  5/ 6:	12.8171	3.92765	5.82505	10.3559 
15.0831
Length   64, n  128, alignment  0/ 0:	13.6328	5.33523	6.43324	7.92213 
16.4658
Length  256, n  128, alignment  0/ 0:	8.92495	4.97169	7.13044	6.30158 
12.9039
Length   64, n  128, alignment  4/ 0:	13.8393	5.36588	7.52682	11.5294 
17.5523
Length  256, n  128, alignment  4/ 0:	13.5309	5.36019	7.56527	13.3503 
17.8202
Length  128, n  256, alignment  0/ 0:	15.2956	8.14449	8.79678	9.69352 
21.2463
Length  512, n  256, alignment  0/ 0:	11.5667	7.22974	10.1355	10.2592 
21.5805
Length  128, n  256, alignment  3/ 2:	18.0152	8.21506	10.9175	20.4131 
22.3927
Length  512, n  256, alignment  3/ 2:	18.7328	7.81909	11.251	25.0633	29.2378
Length  256, n  512, alignment  0/ 0:	17.5135	13.9768	15.6849	16.1219 
30.9344
Length 1024, n  512, alignment  0/ 0:	17.988	11.8498	18.4388	18.7385	41.5762
Length  256, n  512, alignment  2/ 4:	23.3724	14.8026	15.9182	28.6762 
33.9031
Length 1024, n  512, alignment  2/ 4:	27.9562	13.2785	19.5893	46.9671 
52.4943
Length  512, n 1024, alignment  0/ 0:	23.3637	25.283	21.2536	23.4228	55.6501
Length 2048, n 1024, alignment  0/ 0:	31.303	21.2731	40.7001	38.8365	75.1105
Length  512, n 1024, alignment  1/ 6:	33.0535	26.873	24.8167	51.5917	56.236
Length 2048, n 1024, alignment  1/ 6:	47.5444	24.0206	42.5163	86.0245 
92.5819


* Re: [PATCH 1/2] powerpc: Optimized strncpy for POWER9
  2020-08-20 18:29 [PATCH 1/2] powerpc: Optimized strncpy for POWER9 Raphael Moreira Zinsly
  2020-08-20 18:29 ` [PATCH 2/2] powerpc: Optimzed stpncpy " Raphael Moreira Zinsly
  2020-08-20 18:31 ` [PATCH 1/2] powerpc: Optimized strncpy " Raphael M Zinsly
@ 2020-08-28 14:25 ` Paul E Murphy
  2020-08-28 19:12 ` Paul A. Clarke
  2020-09-02 13:20 ` Tulio Magno Quites Machado Filho
  4 siblings, 0 replies; 9+ messages in thread
From: Paul E Murphy @ 2020-08-28 14:25 UTC (permalink / raw)
  To: Raphael Moreira Zinsly, libc-alpha



On 8/20/20 1:29 PM, Raphael Moreira Zinsly via Libc-alpha wrote:
> Similar to the strcpy P9 optimization, this version uses VSX to improve
> performance.
> ---
>   sysdeps/powerpc/powerpc64/le/power9/strncpy.S | 276 ++++++++++++++++++
>   sysdeps/powerpc/powerpc64/multiarch/Makefile  |   2 +-
>   .../powerpc64/multiarch/ifunc-impl-list.c     |   5 +
>   .../powerpc64/multiarch/strncpy-power9.S      |  26 ++
>   sysdeps/powerpc/powerpc64/multiarch/strncpy.c |   7 +
>   5 files changed, 315 insertions(+), 1 deletion(-)
>   create mode 100644 sysdeps/powerpc/powerpc64/le/power9/strncpy.S
>   create mode 100644 sysdeps/powerpc/powerpc64/multiarch/strncpy-power9.S
> 
> diff --git a/sysdeps/powerpc/powerpc64/le/power9/strncpy.S b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
> new file mode 100644
> index 0000000000..cde68384d4
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
> @@ -0,0 +1,276 @@
> +/* Optimized strncpy implementation for PowerPC64/POWER9.
> +   Copyright (C) 2020 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <sysdep.h>
> +
> +# ifndef STRNCPY
> +#  define FUNC_NAME strncpy
> +# else
> +#  define FUNC_NAME STRNCPY
> +# endif
> +
> +/* Implements the function
> +
> +   char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5])
> +
> +   The implementation can load bytes past a null terminator, but only
> +   up to the next 16B boundary, so it never crosses a page.  */
> +
> +.machine power9
> +ENTRY_TOCLESS (FUNC_NAME, 4)
> +	CALL_MCOUNT 2
> +
> +	cmpwi   r5, 0
> +	beqlr
Trivial nit, a newline after branches helps readability for me.

> +	/* NULL string optimisation  */
> +	lbz	r0,0(r4)
> +	stb	r0,0(r3)
> +	addi	r11,r3,1
> +	addi	r5,r5,-1
> +	vspltisb v18,0		/* Zeroes in v18  */
> +	cmpwi	r0,0
> +	beq	L(zero_padding_loop)
> +
> +	cmpwi	r5,0
> +	beqlr
OK.

> +
> +L(cont):
I think this label can be removed or replaced with a comment.

> +	addi	r4,r4,1
> +	neg	r7,r4
> +	rldicl	r9,r7,0,60	/* How many bytes to get source 16B aligned?  */
> +
> +	/* Get source 16B aligned  */
> +	lvx	v0,0,r4
> +	lvsr	v1,0,r4
> +	vperm	v0,v18,v0,v1
> +
> +	vcmpequb v6,v0,v18	/* 0xff if byte is NULL, 0x00 otherwise  */
> +	vctzlsbb r7,v6		/* Number of trailing zeroes  */
> +	addi	r8,r7,1	/* Add null terminator  */
Minor nit, can you align the comment with previous comments?

> +
> +	/* r8 = bytes including null
> +	   r9 = bytes to get source 16B aligned
> +	   if r8 > r9
> +	      no null, copy r9 bytes
> +	   else
> +	      there is a null, copy r8 bytes and return.  */
> +	cmpd	r8,r9
> +	bgt	L(no_null)
> +
> +	cmpd	r8,r5		/* r8 <= n?  */
Minor, you could use another CR and run this in parallel with the 
previous check.
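Untested, but perhaps something along these lines, so both compares can
issue before the first branch:

	cmpd	cr0,r8,r9	/* null position vs. bytes to alignment  */
	cmpd	cr1,r8,r5	/* null position vs. n  */
	bgt	cr0,L(no_null)
	ble	cr1,L(null)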

> +	ble	L(null)
> +	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
> +	stxvl	32+v0,r11,r10	/* Partial store  */
> +
> +	blr
OK.

> +
> +L(null):
> +	sldi	r10,r8,56	/* stxvl wants size in top 8 bits  */
> +	stxvl	32+v0,r11,r10	/* Partial store  */
> +
> +	add	r11,r11,r8
> +	sub	r5,r5,r8
> +	b L(zero_padding_loop)
OK.

> +
> +L(no_null):
> +	cmpd	r9,r5		/* Check if length was reached.  */
> +	bge	L(n_tail1)
An extra newline would help here.

> +	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
> +	stxvl	32+v0,r11,r10	/* Partial store  */
> +
> +	add	r4,r4,r9
> +	add	r11,r11,r9
> +	sub	r5,r5,r9
OK.

> +
> +L(loop):
> +	cmpldi	cr6,r5,64	/* Check if length was reached.  */
> +	ble	cr6,L(final_loop)
> +
> +	lxv	32+v0,0(r4)
> +	vcmpequb. v6,v0,v18	/* Any zero bytes?  */
> +	bne	cr6,L(prep_tail1)
> +
> +	lxv	32+v1,16(r4)
> +	vcmpequb. v6,v1,v18	/* Any zero bytes?  */
> +	bne	cr6,L(prep_tail2)
> +
> +	lxv	32+v2,32(r4)
> +	vcmpequb. v6,v2,v18	/* Any zero bytes?  */
> +	bne	cr6,L(prep_tail3)
> +
> +	lxv	32+v3,48(r4)
> +	vcmpequb. v6,v3,v18	/* Any zero bytes?  */
> +	bne	cr6,L(prep_tail4)
> +
> +	stxv	32+v0,0(r11)
> +	stxv	32+v1,16(r11)
> +	stxv	32+v2,32(r11)
> +	stxv	32+v3,48(r11)
> +
> +	addi	r4,r4,64
> +	addi	r11,r11,64
> +	addi	r5,r5,-64
> +
> +	b	L(loop)
OK.

> +
> +L(final_loop):
> +	cmpldi	cr5,r5,16
> +	lxv	32+v0,0(r4)
> +	vcmpequb. v6,v0,v18	/* Any zero bytes?  */
> +	ble	cr5,L(prep_n_tail1)
> +	bne	cr6,L(count_tail1)
> +	addi	r5,r5,-16
> +
> +	cmpldi	cr5,r5,16
> +	lxv	32+v1,16(r4)
> +	vcmpequb. v6,v1,v18	/* Any zero bytes?  */
> +	ble	cr5,L(prep_n_tail2)
> +	bne	cr6,L(count_tail2)
> +	addi	r5,r5,-16
> +
> +	cmpldi	cr5,r5,16
> +	lxv	32+v2,32(r4)
> +	vcmpequb. v6,v2,v18	/* Any zero bytes?  */
> +	ble	cr5,L(prep_n_tail3)
> +	bne	cr6,L(count_tail3)
> +	addi	r5,r5,-16
OK.

> +
> +	lxv	32+v3,48(r4)
> +	vcmpequb. v6,v3,v18	/* Any zero bytes?  */
> +	beq	cr6,L(n_tail4)
> +
> +	vctzlsbb r8,v6		/* Number of trailing zeroes  */
> +	cmpd	r8,r5		/* r8 < n?  */
> +	blt	L(tail4)
OK. Newline here (and for the other similar cases below too please).

> +L(n_tail4):
> +	stxv	32+v0,0(r11)
> +	stxv	32+v1,16(r11)
> +	stxv	32+v2,32(r11)
> +	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
> +	addi	r11,r11,48	/* Offset */
> +	stxvl	32+v3,r11,r10	/* Partial store  */
> +	blr
OK.

> +
> +L(prep_n_tail1):
> +	beq	cr6,L(n_tail1)	/* Any zero bytes?  */
> +	vctzlsbb r8,v6		/* Number of trailing zeroes  */
> +	cmpd	r8,r5		/* r8 < n?  */
> +	blt	L(tail1)
> +L(n_tail1):
> +	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
> +	stxvl	32+v0,r11,r10	/* Partial store  */
> +	blr
OK.

> +
> +L(prep_n_tail2):
> +	beq	cr6,L(n_tail2)	/* Any zero bytes?  */
> +	vctzlsbb r8,v6		/* Number of trailing zeroes  */
> +	cmpd	r8,r5		/* r8 < n?  */
> +	blt	L(tail2)
> +L(n_tail2):
> +	stxv	32+v0,0(r11)
> +	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
> +	addi	r11,r11,16	/* offset */
> +	stxvl	32+v1,r11,r10	/* Partial store  */
> +	blr
OK.

> +
> +L(prep_n_tail3):
> +	beq	cr6,L(n_tail3)	/* Any zero bytes?  */
> +	vctzlsbb r8,v6		/* Number of trailing zeroes  */
> +	cmpd	r8,r5		/* r8 < n?  */
> +	blt	L(tail3)
> +L(n_tail3):
> +	stxv	32+v0,0(r11)
> +	stxv	32+v1,16(r11)
> +	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
> +	addi	r11,r11,32	/* Offset */
> +	stxvl	32+v2,r11,r10	/* Partial store  */
> +	blr
OK.

> +
> +L(prep_tail1):
> +L(count_tail1):
> +	vctzlsbb r8,v6		/* Number of trailing zeroes  */
> +L(tail1):
> +	addi	r9,r8,1	/* Add null terminator  */
Please align this comment (and the 3 other similar cases).
> +	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
> +	stxvl	32+v0,r11,r10	/* Partial store  */
> +	add	r11,r11,r9
> +	sub	r5,r5,r9
> +	b L(zero_padding_loop)
OK.

> +
> +L(prep_tail2):
> +	addi	r5,r5,-16
> +L(count_tail2):
> +	vctzlsbb r8,v6		/* Number of trailing zeroes  */
> +L(tail2):
> +	addi	r9,r8,1	/* Add null terminator  */
> +	stxv	32+v0,0(r11)
> +	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
> +	addi	r11,r11,16	/* offset */
> +	stxvl	32+v1,r11,r10	/* Partial store  */
> +	add	r11,r11,r9
> +	sub	r5,r5,r9
> +	b L(zero_padding_loop)
> +
> +L(prep_tail3):
> +	addi	r5,r5,-32
> +L(count_tail3):
> +	vctzlsbb r8,v6		/* Number of trailing zeroes  */
> +L(tail3):
> +	addi	r9,r8,1	/* Add null terminator  */
> +	stxv	32+v0,0(r11)
> +	stxv	32+v1,16(r11)
> +	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
> +	addi	r11,r11,32	/* offset */
> +	stxvl	32+v2,r11,r10	/* Partial store  */
> +	add	r11,r11,r9
> +	sub	r5,r5,r9
> +	b L(zero_padding_loop)
> +
> +L(prep_tail4):
> +	addi	r5,r5,-48
> +	vctzlsbb r8,v6		/* Number of trailing zeroes  */
> +L(tail4):
> +	addi	r9,r8,1	/* Add null terminator  */
> +	stxv	32+v0,0(r11)
> +	stxv	32+v1,16(r11)
> +	stxv	32+v2,32(r11)
> +	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
> +	addi	r11,r11,48	/* offset */
> +	stxvl	32+v3,r11,r10	/* Partial store  */
> +	add	r11,r11,r9
> +	sub	r5,r5,r9
OK.

> +
> +/* This code pads the remainder of dest with NULL bytes.  */
> +L(zero_padding_loop):
> +	cmpldi	cr6,r5,16	/* Check if length was reached.  */
> +	ble	cr6,L(zero_padding_end)
> +
> +	stxv	v18,0(r11)
> +	addi	r11,r11,16
> +	addi	r5,r5,-16
> +
> +	b	L(zero_padding_loop)
> +
OK.



> +L(zero_padding_end):
> +	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
> +	stxvl	v18,r11,r10	/* Partial store  */
> +	blr

OK.

> +
> +L(n_tail):
Is this label used?

> +
> +END (FUNC_NAME)
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
> index 19acb6c64a..cd2b47b403 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
> +++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile

OK.

> index ea10b00417..aa63e1c23f 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c

OK.

> diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncpy-power9.S b/sysdeps/powerpc/powerpc64/multiarch/strncpy-power9.S
> new file mode 100644
> index 0000000000..b9b6092f7b
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/multiarch/strncpy-power9.S

OK.

> diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncpy.c b/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
> index 7bacf28aca..822ceb2003 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
> +++ b/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
> @@ -28,11 +28,18 @@
>   extern __typeof (strncpy) __strncpy_ppc attribute_hidden;
>   extern __typeof (strncpy) __strncpy_power7 attribute_hidden;
>   extern __typeof (strncpy) __strncpy_power8 attribute_hidden;
> +# ifdef __LITTLE_ENDIAN__
> +extern __typeof (strncpy) __strncpy_power9 attribute_hidden;
> +# endif
>   # undef strncpy
> 
>   /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
>    ifunc symbol properly. */
>   libc_ifunc_redirected (__redirect_strncpy, strncpy,
> +# ifdef __LITTLE_ENDIAN__
> +		   (hwcap2 & PPC_FEATURE2_ARCH_3_00)
> +		   ? __strncpy_power9 :

Trivial nit, I think the above two lines need two extra spaces.


* Re: [PATCH 2/2] powerpc: Optimzed stpncpy for POWER9
  2020-08-20 18:29 ` [PATCH 2/2] powerpc: Optimzed stpncpy " Raphael Moreira Zinsly
  2020-08-20 18:31   ` Raphael M Zinsly
@ 2020-08-28 17:04   ` Paul E Murphy
  1 sibling, 0 replies; 9+ messages in thread
From: Paul E Murphy @ 2020-08-28 17:04 UTC (permalink / raw)
  To: Raphael Moreira Zinsly, libc-alpha


Thank you for your contributions; I have a few minor 
comments/suggestions below.

On 8/20/20 1:29 PM, Raphael Moreira Zinsly via Libc-alpha wrote:
> Adds stpncpy support into the POWER9 strncpy.
s/Adds/Add/ s/into the/to/.

Likewise, s/Optimzed/Add optimized/ in the title.

> ---
>   sysdeps/powerpc/powerpc64/le/power9/stpncpy.S | 24 ++++++
>   sysdeps/powerpc/powerpc64/le/power9/strncpy.S | 74 +++++++++++++++++++
>   sysdeps/powerpc/powerpc64/multiarch/Makefile  |  2 +-
>   .../powerpc64/multiarch/ifunc-impl-list.c     |  5 ++
>   .../powerpc64/multiarch/stpncpy-power9.S      | 24 ++++++
>   sysdeps/powerpc/powerpc64/multiarch/stpncpy.c |  7 ++
>   6 files changed, 135 insertions(+), 1 deletion(-)
>   create mode 100644 sysdeps/powerpc/powerpc64/le/power9/stpncpy.S
>   create mode 100644 sysdeps/powerpc/powerpc64/multiarch/stpncpy-power9.S
> 
> diff --git a/sysdeps/powerpc/powerpc64/le/power9/stpncpy.S b/sysdeps/powerpc/powerpc64/le/power9/stpncpy.S
> new file mode 100644
> index 0000000000..a96840bb6f
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/le/power9/stpncpy.S
> @@ -0,0 +1,24 @@
> +/* Optimized stpncpy implementation for PowerPC64/POWER9.
> +   Copyright (C) 2015-2020 Free Software Foundation, Inc.
Should this date be exclusively 2020?

> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#define USE_AS_STPNCPY
> +#include <sysdeps/powerpc/powerpc64/le/power9/strncpy.S>
> +
> +weak_alias (__stpncpy, stpncpy)
> +libc_hidden_def (__stpncpy)
> +libc_hidden_builtin_def (stpncpy)

OK.

> diff --git a/sysdeps/powerpc/powerpc64/le/power9/strncpy.S b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
> index cde68384d4..64b06a9040 100644
> --- a/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
> +++ b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
> @@ -18,16 +18,30 @@
> 
>   #include <sysdep.h>
> 
> +#ifdef USE_AS_STPNCPY
> +# ifndef STPNCPY
> +#   define FUNC_NAME __stpncpy
> +# else
> +#   define FUNC_NAME STPNCPY
> +# endif
> +#else
>   # ifndef STRNCPY
>   #  define FUNC_NAME strncpy
>   # else
>   #  define FUNC_NAME STRNCPY
>   # endif
> +#endif  /* !USE_AS_STPNCPY  */
> 
>   /* Implements the function
> 
>      char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5])
> 
> +   or
> +
> +   char * [r3] stpncpy (char *dest [r3], const char *src [r4], size_t n [r5])
> +
> +   if USE_AS_STPNCPY is defined.
> +
>      The implementation can load bytes past a null terminator, but only
>      up to the next 16B boundary, so it never crosses a page.  */
> 
> @@ -47,6 +61,13 @@ ENTRY_TOCLESS (FUNC_NAME, 4)
>   	beq	L(zero_padding_loop)
> 
>   	cmpwi	r5,0
> +#ifdef USE_AS_STPNCPY
> +	bgt	L(cont)
> +	/* stpncpy returns the dest address plus the size not counting the
> +	   final '\0'.  */
"Compute pointer to last byte copied into dest."  Likwise for the other 
copied instances.

> +	addi	r3,r3,1
> +	blr
> +#endif
OK.

>   	beqlr
This is unreachable in stpncpy, can this be conditionally included in 
the !stpncpy configuration?
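Untested, but maybe something like:

	cmpwi	r5,0
#ifdef USE_AS_STPNCPY
	bgt	L(cont)
	addi	r3,r3,1		/* Return dest + 1 for stpncpy.  */
	blr
#else
	beqlr
#endif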

> 
>   L(cont):
> @@ -77,12 +98,22 @@ L(cont):
>   	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
>   	stxvl	32+v0,r11,r10	/* Partial store  */
> 
> +#ifdef USE_AS_STPNCPY
> +	/* stpncpy returns the dest address plus the size not counting the
> +	   final '\0'.  */
> +	add	r3,r11,r5
> +#endif
>   	blr
> 
>   L(null):
>   	sldi	r10,r8,56	/* stxvl wants size in top 8 bits  */
>   	stxvl	32+v0,r11,r10	/* Partial store  */
> 
> +#ifdef USE_AS_STPNCPY
> +	/* stpncpy returns the dest address plus the size not counting the
> +	   final '\0'.  */
> +	add	r3,r11,r7
> +#endif
>   	add	r11,r11,r8
>   	sub	r5,r5,r8
>   	b L(zero_padding_loop)
> @@ -164,6 +195,11 @@ L(n_tail4):
>   	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
>   	addi	r11,r11,48	/* Offset */
>   	stxvl	32+v3,r11,r10	/* Partial store  */
> +#ifdef USE_AS_STPNCPY
> +	/* stpncpy returns the dest address plus the size not counting the
> +	   final '\0'.  */
> +	add	r3,r11,r5
> +#endif
>   	blr
> 
>   L(prep_n_tail1):
> @@ -174,6 +210,11 @@ L(prep_n_tail1):
>   L(n_tail1):
>   	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
>   	stxvl	32+v0,r11,r10	/* Partial store  */
> +#ifdef USE_AS_STPNCPY
> +	/* stpncpy returns the dest address plus the size not counting the
> +	   final '\0'.  */
> +	add	r3,r11,r5
> +#endif
>   	blr
> 
>   L(prep_n_tail2):
> @@ -186,6 +227,11 @@ L(n_tail2):
>   	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
>   	addi	r11,r11,16	/* offset */
>   	stxvl	32+v1,r11,r10	/* Partial store  */
> +#ifdef USE_AS_STPNCPY
> +	/* stpncpy returns the dest address plus the size not counting the
> +	   final '\0'.  */
> +	add	r3,r11,r5
> +#endif
>   	blr
> 
>   L(prep_n_tail3):
> @@ -199,6 +245,11 @@ L(n_tail3):
>   	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
>   	addi	r11,r11,32	/* Offset */
>   	stxvl	32+v2,r11,r10	/* Partial store  */
> +#ifdef USE_AS_STPNCPY
> +	/* stpncpy returns the dest address plus the size not counting the
> +	   final '\0'.  */
> +	add	r3,r11,r5
> +#endif
>   	blr
> 
>   L(prep_tail1):
> @@ -208,6 +259,11 @@ L(tail1):
>   	addi	r9,r8,1	/* Add null terminator  */
>   	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
>   	stxvl	32+v0,r11,r10	/* Partial store  */
> +#ifdef USE_AS_STPNCPY
> +	/* stpncpy returns the dest address plus the size not counting the
> +	   final '\0'.  */
> +	add	r3,r11,r8
> +#endif
>   	add	r11,r11,r9
>   	sub	r5,r5,r9
>   	b L(zero_padding_loop)
> @@ -222,6 +278,11 @@ L(tail2):
>   	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
>   	addi	r11,r11,16	/* offset */
>   	stxvl	32+v1,r11,r10	/* Partial store  */
> +#ifdef USE_AS_STPNCPY
> +	/* stpncpy returns the dest address plus the size not counting the
> +	   final '\0'.  */
> +	add	r3,r11,r8
> +#endif
>   	add	r11,r11,r9
>   	sub	r5,r5,r9
>   	b L(zero_padding_loop)
> @@ -237,6 +298,11 @@ L(tail3):
>   	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
>   	addi	r11,r11,32	/* offset */
>   	stxvl	32+v2,r11,r10	/* Partial store  */
> +#ifdef USE_AS_STPNCPY
> +	/* stpncpy returns the dest address plus the size not counting the
> +	   final '\0'.  */
> +	add	r3,r11,r8
> +#endif
>   	add	r11,r11,r9
>   	sub	r5,r5,r9
>   	b L(zero_padding_loop)
> @@ -252,6 +318,11 @@ L(tail4):
>   	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
>   	addi	r11,r11,48	/* offset */
>   	stxvl	32+v3,r11,r10	/* Partial store  */
> +#ifdef USE_AS_STPNCPY
> +	/* stpncpy returns the dest address plus the size not counting the
> +	   final '\0'.  */
> +	add	r3,r11,r8
> +#endif
>   	add	r11,r11,r9
>   	sub	r5,r5,r9
> 
> @@ -274,3 +345,6 @@ L(zero_padding_end):
>   L(n_tail):
> 
>   END (FUNC_NAME)
> +#ifndef USE_AS_STPNCPY
> +libc_hidden_builtin_def (strncpy)
> +#endif
OK.

> diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
> index cd2b47b403..f46bf50732 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
> +++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile

OK.

> diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
> index aa63e1c23f..56790bcfe3 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c

OK.

> diff --git a/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power9.S b/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power9.S
> new file mode 100644
> index 0000000000..ecbbb5c8e9
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/multiarch/stpncpy-power9.S
> @@ -0,0 +1,24 @@
> +/* Optimized stpncpy implementation for POWER9/PPC64.
> +   Copyright (C) 2015-2020 Free Software Foundation, Inc.
Minor nit, I suspect that date should only include 2020.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#define STPNCPY __stpncpy_power9
> +
> +#undef libc_hidden_builtin_def
> +#define libc_hidden_builtin_def(name)
> +
OK.

> +#include <sysdeps/powerpc/powerpc64/le/power9/stpncpy.S>
> diff --git a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c b/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
> index 17df886431..21702716a3 100644
> --- a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
> +++ b/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
> @@ -26,10 +26,17 @@
>   extern __typeof (__stpncpy) __stpncpy_ppc attribute_hidden;
>   extern __typeof (__stpncpy) __stpncpy_power7 attribute_hidden;
>   extern __typeof (__stpncpy) __stpncpy_power8 attribute_hidden;
> +# ifdef __LITTLE_ENDIAN__
> +extern __typeof (__stpncpy) __stpncpy_power9 attribute_hidden;
> +# endif
>   # undef stpncpy
>   # undef __stpncpy
> 
>   libc_ifunc_redirected (__redirect___stpncpy, __stpncpy,
> +# ifdef __LITTLE_ENDIAN__
> +		   (hwcap2 & PPC_FEATURE2_ARCH_3_00)
> +		   ? __stpncpy_power9 :
> +# endif
>   		       (hwcap2 & PPC_FEATURE2_ARCH_2_07)
>   		       ? __stpncpy_power8
>   		       : (hwcap & PPC_FEATURE_HAS_VSX)
> 
I think the spacing is off by two here.
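
That is, presumably the added lines should use the same indentation as the
existing ternary arms below them; whitespace approximate here, since tabs may
have been mangled in transit:

	libc_ifunc_redirected (__redirect___stpncpy, __stpncpy,
	# ifdef __LITTLE_ENDIAN__
			       (hwcap2 & PPC_FEATURE2_ARCH_3_00)
			       ? __stpncpy_power9 :
	# endif
			       (hwcap2 & PPC_FEATURE2_ARCH_2_07)
			       ? __stpncpy_power8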

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 1/2] powerpc: Optimized strncpy for POWER9
  2020-08-20 18:29 [PATCH 1/2] powerpc: Optimized strncpy for POWER9 Raphael Moreira Zinsly
                   ` (2 preceding siblings ...)
  2020-08-28 14:25 ` Paul E Murphy
@ 2020-08-28 19:12 ` Paul A. Clarke
  2020-09-02 13:20 ` Tulio Magno Quites Machado Filho
  4 siblings, 0 replies; 9+ messages in thread
From: Paul A. Clarke @ 2020-08-28 19:12 UTC (permalink / raw)
  To: Raphael Moreira Zinsly; +Cc: libc-alpha

On Thu, Aug 20, 2020 at 03:29:16PM -0300, Raphael Moreira Zinsly via Libc-alpha wrote:
> Similar to the strcpy P9 optimization, this version uses VSX to improve
> performance.
> ---

> diff --git a/sysdeps/powerpc/powerpc64/le/power9/strncpy.S b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
> new file mode 100644
> index 0000000000..cde68384d4
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
> @@ -0,0 +1,276 @@
> +/* Optimized strncpy implementation for PowerPC64/POWER9.

sysdeps/powerpc/powerpc64/multiarch/strncpy-power9.S below has
"POWER9/PPC64".  Can we make these consistent?  Can we just say
"POWER9"?  Do we need to indicate little-endian only?

> +   Copyright (C) 2020 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +#include <sysdep.h>
> +
> +# ifndef STRNCPY
> +#  define FUNC_NAME strncpy
> +# else
> +#  define FUNC_NAME STRNCPY
> +# endif
> +
> +/* Implements the function
> +
> +   char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5])
> +
> +   The implementation can load bytes past a null terminator, but only
> +   up to the next 16B boundary, so it never crosses a page.  */

nit, subjective: "up to the next 16-byte aligned address"
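
Unrelated to the wording: the page-safety claim in that comment holds because
a page boundary is always also a 16-byte boundary, so a read that stops at the
next 16B boundary cannot spill into the next page.  A throwaway C check of
that property, assuming a power-of-two page size >= 16:

	#include <assert.h>
	#include <stdint.h>

	int
	main (void)
	{
	  const uintptr_t page = 4096;	/* any power of two >= 16 */
	  for (uintptr_t p = 0; p < 2 * page; p++)
	    /* (p | 15) is the last byte before the next 16B boundary;
	       it must lie in the same page as p.  */
	    assert ((p & ~(page - 1)) == ((p | 15) & ~(page - 1)));
	  return 0;
	}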

> +
> +.machine power9
> +ENTRY_TOCLESS (FUNC_NAME, 4)
> +	CALL_MCOUNT 2
> +
> +	cmpwi   r5, 0

This should be "cmpdi".

> +	beqlr
> +	/* NULL string optimisation  */

This comment would make more sense above the preceding "cmpdi".

> +	lbz	r0,0(r4)
> +	stb	r0,0(r3)
> +	addi	r11,r3,1
> +	addi	r5,r5,-1
> +	vspltisb v18,0		/* Zeroes in v18  */
> +	cmpwi	r0,0

This should be "cmpdi".

> +	beq	L(zero_padding_loop)
> +

Given the above "NULL string" comment, you could
put an "empty string optimization" comment here.

> +	cmpwi	r5,0

This should be "cmpdi".

> +	beqlr

The "addi r11,r3,1" and "vspltisb v18,0" above aren't needed until
a bit later, which penalizes the empty string case.  I think you
can move the empty string test up.  Some experiments seemed to move
the lbz and dependent stb apart.  Something like this:
	/* NULL string optimisation  */
	cmpdi	r5,0
	beqlr

	lbz	r0,0(r4)
	/* empty/1-byte string optimisation  */
	cmpdi	r5,1
	stb	r0,0(r3)
	beqlr

	cmpdi	r0,0
	addi	r11,r3,1
	addi	r5,r5,-1
	vspltisb v18,0		/* Zeroes in v18  */
	beq	L(zero_padding_loop)

(But I didn't see a significant performance difference in
some light experimentation.  It might be worth another look.)

> +
> +L(cont):

This label isn't used.

> +	addi	r4,r4,1
> +	neg	r7,r4
> +	rldicl	r9,r7,0,60	/* How many bytes to get source 16B aligned?  */
> +
> +	/* Get source 16B aligned  */
> +	lvx	v0,0,r4
> +	lvsr	v1,0,r4
> +	vperm	v0,v18,v0,v1
> +
> +	vcmpequb v6,v0,v18	/* 0xff if byte is NULL, 0x00 otherwise  */
> +	vctzlsbb r7,v6		/* Number of trailing zeroes  */
> +	addi	r8,r7,1	/* Add null terminator  */
> +
> +	/* r8 = bytes including null
> +	   r9 = bytes to get source 16B aligned
> +	   if r8 > r9
> +	      no null, copy r9 bytes
> +	   else
> +	      there is a null, copy r8 bytes and return.  */
> +	cmpd	r8,r9

This should probably be "cmpld".

> +	bgt	L(no_null)
> +
> +	cmpd	r8,r5		/* r8 <= n?  */

This should probably be "cmpld".

> +	ble	L(null)
> +	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
> +	stxvl	32+v0,r11,r10	/* Partial store  */

Do we still need this "32+v0" syntax? Is that due to a minimum supported
level of binutils which isn't VSX-aware?

> +
> +	blr
> +
> +L(null):
> +	sldi	r10,r8,56	/* stxvl wants size in top 8 bits  */
> +	stxvl	32+v0,r11,r10	/* Partial store  */
> +
> +	add	r11,r11,r8
> +	sub	r5,r5,r8
> +	b L(zero_padding_loop)
> +
> +L(no_null):
> +	cmpd	r9,r5		/* Check if length was reached.  */

This should probably be "cmpld".

> +	bge	L(n_tail1)
> +	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
> +	stxvl	32+v0,r11,r10	/* Partial store  */
> +
> +	add	r4,r4,r9
> +	add	r11,r11,r9
> +	sub	r5,r5,r9
> +
> +L(loop):
> +	cmpldi	cr6,r5,64	/* Check if length was reached.  */
> +	ble	cr6,L(final_loop)
> +
> +	lxv	32+v0,0(r4)
> +	vcmpequb. v6,v0,v18	/* Any zero bytes?  */
> +	bne	cr6,L(prep_tail1)
> +
> +	lxv	32+v1,16(r4)
> +	vcmpequb. v6,v1,v18	/* Any zero bytes?  */
> +	bne	cr6,L(prep_tail2)
> +
> +	lxv	32+v2,32(r4)
> +	vcmpequb. v6,v2,v18	/* Any zero bytes?  */
> +	bne	cr6,L(prep_tail3)
> +
> +	lxv	32+v3,48(r4)
> +	vcmpequb. v6,v3,v18	/* Any zero bytes?  */
> +	bne	cr6,L(prep_tail4)
> +
> +	stxv	32+v0,0(r11)
> +	stxv	32+v1,16(r11)
> +	stxv	32+v2,32(r11)
> +	stxv	32+v3,48(r11)
> +
> +	addi	r4,r4,64
> +	addi	r11,r11,64
> +	addi	r5,r5,-64
> +
> +	b	L(loop)
> +
> +L(final_loop):
> +	cmpldi	cr5,r5,16
> +	lxv	32+v0,0(r4)
> +	vcmpequb. v6,v0,v18	/* Any zero bytes?  */
> +	ble	cr5,L(prep_n_tail1)
> +	bne	cr6,L(count_tail1)
> +	addi	r5,r5,-16
> +
> +	cmpldi	cr5,r5,16
> +	lxv	32+v1,16(r4)
> +	vcmpequb. v6,v1,v18	/* Any zero bytes?  */
> +	ble	cr5,L(prep_n_tail2)
> +	bne	cr6,L(count_tail2)
> +	addi	r5,r5,-16
> +
> +	cmpldi	cr5,r5,16
> +	lxv	32+v2,32(r4)
> +	vcmpequb. v6,v2,v18	/* Any zero bytes?  */
> +	ble	cr5,L(prep_n_tail3)
> +	bne	cr6,L(count_tail3)
> +	addi	r5,r5,-16
> +
> +	lxv	32+v3,48(r4)
> +	vcmpequb. v6,v3,v18	/* Any zero bytes?  */
> +	beq	cr6,L(n_tail4)
> +
> +	vctzlsbb r8,v6		/* Number of trailing zeroes  */
> +	cmpd	r8,r5		/* r8 < n?  */

This should probably be "cmpld".

> +	blt	L(tail4)
> +L(n_tail4):
> +	stxv	32+v0,0(r11)
> +	stxv	32+v1,16(r11)
> +	stxv	32+v2,32(r11)
> +	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
> +	addi	r11,r11,48	/* Offset */
> +	stxvl	32+v3,r11,r10	/* Partial store  */
> +	blr
> +
> +L(prep_n_tail1):
> +	beq	cr6,L(n_tail1)	/* Any zero bytes?  */
> +	vctzlsbb r8,v6		/* Number of trailing zeroes  */
> +	cmpd	r8,r5		/* r8 < n?  */

This should probably be "cmpld".

> +	blt	L(tail1)
> +L(n_tail1):
> +	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
> +	stxvl	32+v0,r11,r10	/* Partial store  */
> +	blr
> +
> +L(prep_n_tail2):
> +	beq	cr6,L(n_tail2)	/* Any zero bytes?  */
> +	vctzlsbb r8,v6		/* Number of trailing zeroes  */
> +	cmpd	r8,r5		/* r8 < n?  */

This should probably be "cmpld".

> +	blt	L(tail2)
> +L(n_tail2):
> +	stxv	32+v0,0(r11)
> +	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
> +	addi	r11,r11,16	/* offset */
> +	stxvl	32+v1,r11,r10	/* Partial store  */
> +	blr
> +
> +L(prep_n_tail3):
> +	beq	cr6,L(n_tail3)	/* Any zero bytes?  */
> +	vctzlsbb r8,v6		/* Number of trailing zeroes  */
> +	cmpd	r8,r5		/* r8 < n?  */

This should probably be "cmpld".

> +	blt	L(tail3)
> +L(n_tail3):
> +	stxv	32+v0,0(r11)
> +	stxv	32+v1,16(r11)
> +	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
> +	addi	r11,r11,32	/* Offset */
> +	stxvl	32+v2,r11,r10	/* Partial store  */
> +	blr
> +
> +L(prep_tail1):
> +L(count_tail1):
> +	vctzlsbb r8,v6		/* Number of trailing zeroes  */
> +L(tail1):
> +	addi	r9,r8,1	/* Add null terminator  */
> +	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
> +	stxvl	32+v0,r11,r10	/* Partial store  */
> +	add	r11,r11,r9
> +	sub	r5,r5,r9
> +	b L(zero_padding_loop)
> +
> +L(prep_tail2):
> +	addi	r5,r5,-16
> +L(count_tail2):
> +	vctzlsbb r8,v6		/* Number of trailing zeroes  */
> +L(tail2):
> +	addi	r9,r8,1	/* Add null terminator  */
> +	stxv	32+v0,0(r11)
> +	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
> +	addi	r11,r11,16	/* offset */
> +	stxvl	32+v1,r11,r10	/* Partial store  */
> +	add	r11,r11,r9
> +	sub	r5,r5,r9
> +	b L(zero_padding_loop)
> +
> +L(prep_tail3):
> +	addi	r5,r5,-32
> +L(count_tail3):
> +	vctzlsbb r8,v6		/* Number of trailing zeroes  */
> +L(tail3):
> +	addi	r9,r8,1	/* Add null terminator  */
> +	stxv	32+v0,0(r11)
> +	stxv	32+v1,16(r11)
> +	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
> +	addi	r11,r11,32	/* offset */
> +	stxvl	32+v2,r11,r10	/* Partial store  */
> +	add	r11,r11,r9
> +	sub	r5,r5,r9
> +	b L(zero_padding_loop)
> +
> +L(prep_tail4):
> +	addi	r5,r5,-48
> +	vctzlsbb r8,v6		/* Number of trailing zeroes  */
> +L(tail4):
> +	addi	r9,r8,1	/* Add null terminator  */
> +	stxv	32+v0,0(r11)
> +	stxv	32+v1,16(r11)
> +	stxv	32+v2,32(r11)
> +	sldi	r10,r9,56	/* stxvl wants size in top 8 bits  */
> +	addi	r11,r11,48	/* offset */
> +	stxvl	32+v3,r11,r10	/* Partial store  */
> +	add	r11,r11,r9
> +	sub	r5,r5,r9
> +
> +/* This code pads the remainder of dest with NULL bytes.  */
> +L(zero_padding_loop):
> +	cmpldi	cr6,r5,16	/* Check if length was reached.  */
> +	ble	cr6,L(zero_padding_end)
> +
> +	stxv	v18,0(r11)
> +	addi	r11,r11,16
> +	addi	r5,r5,-16
> +
> +	b	L(zero_padding_loop)
> +
> +L(zero_padding_end):
> +	sldi	r10,r5,56	/* stxvl wants size in top 8 bits  */
> +	stxvl	v18,r11,r10	/* Partial store  */
> +	blr
> +
> +L(n_tail):
> +
> +END (FUNC_NAME)

PC

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 1/2] powerpc: Optimized strncpy for POWER9
  2020-08-20 18:29 [PATCH 1/2] powerpc: Optimized strncpy for POWER9 Raphael Moreira Zinsly
                   ` (3 preceding siblings ...)
  2020-08-28 19:12 ` Paul A. Clarke
@ 2020-09-02 13:20 ` Tulio Magno Quites Machado Filho
  2020-09-02 14:00   ` Paul E Murphy
  4 siblings, 1 reply; 9+ messages in thread
From: Tulio Magno Quites Machado Filho @ 2020-09-02 13:20 UTC (permalink / raw)
  To: libc-alpha, Raphael Moreira Zinsly

Raphael Moreira Zinsly via Libc-alpha <libc-alpha@sourceware.org> writes:

> diff --git a/sysdeps/powerpc/powerpc64/le/power9/strncpy.S b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
> new file mode 100644
> index 0000000000..cde68384d4
> --- /dev/null
> +++ b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
> @@ -0,0 +1,276 @@
> ...
> +/* Implements the function
> +
> +   char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5])
> +
> +   The implementation can load bytes past a null terminator, but only
> +   up to the next 16B boundary, so it never crosses a page.  */
> +
> +.machine power9

I don't think Binutils 2.26 supports .machine power9.  Likewise for all P9
instructions.  However, current glibc is expected to work with Binutils 2.26
(ppc64le), i.e. builds with Binutils 2.26 should not fail.

So, we either need to change this code (e.g. similar to strcmp) or we need
to bump the Binutils requirements.
The last time the Binutils requirement was bumped was in 2017, so I think
it's safe to do this now.

Let me prepare a patch proposing this.

-- 
Tulio Magno

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH 1/2] powerpc: Optimized strncpy for POWER9
  2020-09-02 13:20 ` Tulio Magno Quites Machado Filho
@ 2020-09-02 14:00   ` Paul E Murphy
  0 siblings, 0 replies; 9+ messages in thread
From: Paul E Murphy @ 2020-09-02 14:00 UTC (permalink / raw)
  To: Tulio Magno Quites Machado Filho, libc-alpha, Raphael Moreira Zinsly



On 9/2/20 8:20 AM, Tulio Magno Quites Machado Filho wrote:
> Raphael Moreira Zinsly via Libc-alpha <libc-alpha@sourceware.org> writes:
> 
>> diff --git a/sysdeps/powerpc/powerpc64/le/power9/strncpy.S b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
>> new file mode 100644
>> index 0000000000..cde68384d4
>> --- /dev/null
>> +++ b/sysdeps/powerpc/powerpc64/le/power9/strncpy.S
>> @@ -0,0 +1,276 @@
>> ...
>> +/* Implements the function
>> +
>> +   char * [r3] strncpy (char *dest [r3], const char *src [r4], size_t n [r5])
>> +
>> +   The implementation can load bytes past a null terminator, but only
>> +   up to the next 16B boundary, so it never crosses a page.  */
>> +
>> +.machine power9
> 
> I don't think Binutils 2.26 supports .machine power9.  Likewise for all P9
> instructions.  However, current glibc is expected to work with Binutils 2.26
> (ppc64le), i.e. builds with Binutils 2.26 should not fail.
> 
> So, we either need to change this code (e.g. similar to strcmp) or we need
> to bump the Binutils requirements.
> The last time the Binutils requirement was bumped was in 2017, so I think
> it's safe to do this now.
> 
> Let me prepare a patch proposing this.

There are at least 5 uses of .machine power9 throughout glibc today.  I 
agree with bumping at least the ppc64le requirements to match.

^ permalink raw reply	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2020-09-02 14:00 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-08-20 18:29 [PATCH 1/2] powerpc: Optimized strncpy for POWER9 Raphael Moreira Zinsly
2020-08-20 18:29 ` [PATCH 2/2] powerpc: Optimzed stpncpy " Raphael Moreira Zinsly
2020-08-20 18:31   ` Raphael M Zinsly
2020-08-28 17:04   ` Paul E Murphy
2020-08-20 18:31 ` [PATCH 1/2] powerpc: Optimized strncpy " Raphael M Zinsly
2020-08-28 14:25 ` Paul E Murphy
2020-08-28 19:12 ` Paul A. Clarke
2020-09-02 13:20 ` Tulio Magno Quites Machado Filho
2020-09-02 14:00   ` Paul E Murphy
