* [PATCH] x86_64: Implement AVX2 version of strlcpy/wcslcpy function @ 2023-06-30 20:48 Sunil K Pandey 2023-06-30 21:04 ` Noah Goldstein 2023-07-02 17:03 ` Noah Goldstein 0 siblings, 2 replies; 24+ messages in thread From: Sunil K Pandey @ 2023-06-30 20:48 UTC (permalink / raw) To: libc-alpha; +Cc: hjl.tools This patch optimizes strlcpy/wcslcpy string functions for AVX2. --- sysdeps/x86_64/multiarch/Makefile | 4 + sysdeps/x86_64/multiarch/ifunc-impl-list.c | 18 + sysdeps/x86_64/multiarch/ifunc-strlcpy.h | 34 ++ sysdeps/x86_64/multiarch/strlcpy-avx2.S | 446 +++++++++++++++++++++ sysdeps/x86_64/multiarch/strlcpy-generic.c | 25 ++ sysdeps/x86_64/multiarch/strlcpy.c | 36 ++ sysdeps/x86_64/multiarch/wcslcpy-avx2.S | 4 + sysdeps/x86_64/multiarch/wcslcpy-generic.c | 25 ++ sysdeps/x86_64/multiarch/wcslcpy.c | 35 ++ 9 files changed, 627 insertions(+) create mode 100644 sysdeps/x86_64/multiarch/ifunc-strlcpy.h create mode 100644 sysdeps/x86_64/multiarch/strlcpy-avx2.S create mode 100644 sysdeps/x86_64/multiarch/strlcpy-generic.c create mode 100644 sysdeps/x86_64/multiarch/strlcpy.c create mode 100644 sysdeps/x86_64/multiarch/wcslcpy-avx2.S create mode 100644 sysdeps/x86_64/multiarch/wcslcpy-generic.c create mode 100644 sysdeps/x86_64/multiarch/wcslcpy.c diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index e1e894c963..7e3fc081df 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -82,6 +82,8 @@ sysdep_routines += \ strcpy-sse2 \ strcpy-sse2-unaligned \ strcspn-sse4 \ + strlcpy-avx2 \ + strlcpy-generic \ strlen-avx2 \ strlen-avx2-rtm \ strlen-evex \ @@ -153,6 +155,8 @@ sysdep_routines += \ wcscpy-evex \ wcscpy-generic \ wcscpy-ssse3 \ + wcslcpy-avx2 \ + wcslcpy-generic \ wcslen-avx2 \ wcslen-avx2-rtm \ wcslen-evex \ diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index 5427ff1907..9928dee187 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ 
b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -751,6 +751,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, 1, __strncat_sse2_unaligned)) + /* Support sysdeps/x86_64/multiarch/strlcpy.c. */ + IFUNC_IMPL (i, name, strlcpy, + X86_IFUNC_IMPL_ADD_V3 (array, i, strlcpy, + CPU_FEATURE_USABLE (AVX2), + __strlcpy_avx2) + X86_IFUNC_IMPL_ADD_V1 (array, i, strlcpy, + 1, + __strlcpy_generic)) + /* Support sysdeps/x86_64/multiarch/strncpy.c. */ IFUNC_IMPL (i, name, strncpy, X86_IFUNC_IMPL_ADD_V4 (array, i, strncpy, @@ -917,6 +926,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, 1, __wcscpy_generic)) + /* Support sysdeps/x86_64/multiarch/wcslcpy.c. */ + IFUNC_IMPL (i, name, wcslcpy, + X86_IFUNC_IMPL_ADD_V3 (array, i, wcslcpy, + CPU_FEATURE_USABLE (AVX2), + __wcslcpy_avx2) + X86_IFUNC_IMPL_ADD_V1 (array, i, wcslcpy, + 1, + __wcslcpy_generic)) + /* Support sysdeps/x86_64/multiarch/wcsncpy.c. */ IFUNC_IMPL (i, name, wcsncpy, X86_IFUNC_IMPL_ADD_V4 (array, i, wcsncpy, diff --git a/sysdeps/x86_64/multiarch/ifunc-strlcpy.h b/sysdeps/x86_64/multiarch/ifunc-strlcpy.h new file mode 100644 index 0000000000..982a30d15b --- /dev/null +++ b/sysdeps/x86_64/multiarch/ifunc-strlcpy.h @@ -0,0 +1,34 @@ +/* Common definition for ifunc selections. + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2023 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. 
+ + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#include <init-arch.h> + +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; +extern __typeof (REDIRECT_NAME) OPTIMIZE (generic) attribute_hidden; + +static inline void * +IFUNC_SELECTOR (void) +{ + const struct cpu_features *cpu_features = __get_cpu_features (); + + if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX2)) + return OPTIMIZE (avx2); + + return OPTIMIZE (generic); +} diff --git a/sysdeps/x86_64/multiarch/strlcpy-avx2.S b/sysdeps/x86_64/multiarch/strlcpy-avx2.S new file mode 100644 index 0000000000..cf54b1e990 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strlcpy-avx2.S @@ -0,0 +1,446 @@ +/* Strlcpy/wcslcpy optimized with AVX2. + Copyright (C) 2023 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. 
*/ + +#include <isa-level.h> + +#if ISA_SHOULD_BUILD (3) + +# include <sysdep.h> + +# ifndef VEC_SIZE +# include "x86-avx-vecs.h" +# endif + +# ifndef STRLCPY +# define STRLCPY __strlcpy_avx2 +# endif + + +# ifdef USE_AS_WCSLCPY +# define CHAR_SIZE 4 +# define MOVU movl +# define VPCMPEQ vpcmpeqd +# define VPMINU vpminud +# else +# define CHAR_SIZE 1 +# define MOVU movb +# define VPCMPEQ vpcmpeqb +# define VPMINU vpminub +# endif + +# define PMOVMSK vpmovmskb +# define PAGE_SIZE 4096 +# define VEC_SIZE 32 +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) + + .section SECTION(.text),"ax",@progbits +/* Aligning entry point to 64 byte, provides better performance for + one vector length string. */ + +ENTRY_P2ALIGN (STRLCPY, 6) +# ifdef __ILP32__ + /* Clear the upper 32 bits. */ + movl %edx, %edx +# endif + + /* Zero out vector register for end of string comparison. */ + vpxor %VMM(0), %VMM(0), %VMM(0) + /* Save source pointer for return calculation. */ + mov %rsi, %r8 + mov %esi, %eax + sall $20, %eax + cmpl $((PAGE_SIZE - (VEC_SIZE)) << 20), %eax + ja L(page_cross) + +L(page_cross_continue): + /* Load first vector. */ + VMOVU (%rsi), %VMM(1) + VPCMPEQ %VMM(0), %VMM(1), %VMM(2) + PMOVMSK %VMM(2), %eax + test %eax, %eax + jnz L(ret_vec_x1) + + test %rdx, %rdx + jz L(continue_second_vector) + + /* Check whether we can copy full vector. */ + cmp $CHAR_PER_VEC, %rdx + jbe L(page_cross_small_vec_copy) + /* Copy first vector. */ + VMOVU %VMM(1), (%rdi) + sub $CHAR_PER_VEC, %rdx + +L(continue_second_vector): + /* Align RSI pointer and adjust RDI based on offset. */ + mov %rsi, %rax + and $-VEC_SIZE, %rsi + sub %rsi, %rax + sub %rax, %rdi + + /* Check if string already copied N char, and RDX is 0. */ + test %rdx, %rdx + jz L(skip_copy_alignment_fix) + + /* Adjust RDX for copy alignment fix. */ +# ifdef USE_AS_WCSLCPY + shr $2, %rax +# endif + add %rax, %rdx + +L(skip_copy_alignment_fix): + /* Load second vector. 
*/ + VMOVA (VEC_SIZE * 1)(%rsi), %VMM(1) + VPCMPEQ %VMM(0), %VMM(1), %VMM(2) + vptest %VMM(2), %VMM(2) + jnz L(ret_vec_x2) + + /* Skip copy if RDX is 0. */ + test %rdx, %rdx + jz L(continue_third_vector) + + /* Jump below/equal (instead of below) used here, because last + copy character must be NULL. */ + cmp $CHAR_PER_VEC, %rdx + jbe L(partial_copy_second_vector) + + sub $CHAR_PER_VEC, %rdx + /* Copy second vector. */ + VMOVU %VMM(1), (VEC_SIZE * 1)(%rdi) + +L(continue_third_vector): + /* Load third vector. */ + VMOVA (VEC_SIZE * 2)(%rsi), %VMM(1) + VPCMPEQ %VMM(0), %VMM(1), %VMM(2) + vptest %VMM(2), %VMM(2) + jnz L(ret_vec_x3) + + /* Skip copy if RDX is 0. */ + test %rdx, %rdx + jz L(continue_fourth_vector) + + cmp $CHAR_PER_VEC, %rdx + jbe L(partial_copy_third_vector) + + sub $CHAR_PER_VEC, %rdx + /* Copy third vector. */ + VMOVU %VMM(1), (VEC_SIZE * 2)(%rdi) + +L(continue_fourth_vector): + /* Load fourth vector. */ + VMOVA (VEC_SIZE * 3)(%rsi), %VMM(1) + VPCMPEQ %VMM(0), %VMM(1), %VMM(2) + vptest %VMM(2), %VMM(2) + jnz L(ret_vec_x4) + + /* Skip copy if RDX is 0. */ + test %rdx, %rdx + jz L(loop_4x_align) + + cmp $CHAR_PER_VEC, %rdx + jbe L(partial_copy_fourth_vector) + + sub $CHAR_PER_VEC, %rdx + /* Copy fourth vector. */ + VMOVU %VMM(1), (VEC_SIZE * 3)(%rdi) + + +L(loop_4x_align): + /* Jump to loop if RSI is already 4 vector align. */ + test $(VEC_SIZE * 4 - 1), %esi + jz L(loop_4x_read) + + mov %rsi, %rcx + + /* Align RSI to 4x vector. */ + and $(VEC_SIZE * -4), %rsi + sub %rsi, %rcx + + /* Adjust RDI for RSI alignment fix. */ + sub %rcx, %rdi + + /* Jump to loop if RDX is 0. */ + test %rdx, %rdx + jz L(loop_4x_read) + +# ifdef USE_AS_WCSLCPY + shr $2, %rcx +# endif + + /* Adjust RDX for RSI alignment fix. */ + add %rcx, %rdx + jmp L(loop_4x_read) + + .p2align 4,,6 +L(loop_4x_vec): + /* Skip copy if RDX is 0. 
*/ + test %rdx, %rdx + jz L(loop_partial_copy_return) + cmp $(CHAR_PER_VEC * 4), %rdx + jbe L(loop_partial_copy) + VMOVU %VMM(1), (VEC_SIZE * 4)(%rdi) + VMOVU %VMM(2), (VEC_SIZE * 5)(%rdi) + VMOVU %VMM(3), (VEC_SIZE * 6)(%rdi) + VMOVU %VMM(4), (VEC_SIZE * 7)(%rdi) + sub $(CHAR_PER_VEC * 4), %rdx + +L(loop_partial_copy_return): + sub $(VEC_SIZE * -4), %rsi + sub $(VEC_SIZE * -4), %rdi + +L(loop_4x_read): + VMOVA (VEC_SIZE * 4)(%rsi), %VMM(1) + VMOVA (VEC_SIZE * 5)(%rsi), %VMM(2) + VMOVA (VEC_SIZE * 6)(%rsi), %VMM(3) + VMOVA (VEC_SIZE * 7)(%rsi), %VMM(4) + VPMINU %VMM(1), %VMM(2), %VMM(5) + VPMINU %VMM(3), %VMM(4), %VMM(6) + VPMINU %VMM(5), %VMM(6), %VMM(7) + VPCMPEQ %VMM(0), %VMM(7), %VMM(7) + vptest %VMM(7), %VMM(7) + + jz L(loop_4x_vec) + + /* Check if string ends in first vector or second vector. */ + lea (VEC_SIZE * 4)(%rsi), %rax + sub %r8, %rax +# ifdef USE_AS_WCSLCPY + shr $2, %rax +# endif + xor %r10, %r10 + VPCMPEQ %VMM(0), %VMM(5), %VMM(6) + vptest %VMM(6), %VMM(6) + jnz L(endloop) + sub $(CHAR_PER_VEC * -2), %rax + mov $(CHAR_PER_VEC * 2), %r10 + VMOVA %VMM(3), %VMM(1) + VMOVA %VMM(4), %VMM(2) + +L(endloop): + VPCMPEQ %VMM(0), %VMM(1), %VMM(1) + VPCMPEQ %VMM(0), %VMM(2), %VMM(2) + PMOVMSK %VMM(1), %rcx + PMOVMSK %VMM(2), %r9 + shlq $32, %r9 + orq %r9, %rcx + bsf %rcx, %rcx + /* Shift RCX by 2, VPMOVMSK has only byte version. */ +# ifdef USE_AS_WCSLCPY + shr $2, %rcx +# endif + /* At this point RAX has length to return. */ + add %rcx, %rax + test %rdx, %rdx + jz L(ret) + + /* Add 1 to account for NULL character in RDX comparison. */ + lea 1(%r10, %rcx), %rcx + cmp %rdx, %rcx + cmovb %rcx, %rdx + +L(loop_partial_copy): + cmp $(CHAR_PER_VEC * 2), %rdx + jbe L(loop_partial_first_half) + /* Reload first 2 vector. */ + VMOVA (VEC_SIZE * 4)(%rsi), %VMM(1) + VMOVA (VEC_SIZE * 5)(%rsi), %VMM(2) + VMOVU %VMM(1), (VEC_SIZE * 4)(%rdi) + VMOVU %VMM(2), (VEC_SIZE * 5)(%rdi) + +L(loop_partial_first_half): + /* Go back 2 vector from last and use overlapping copy. 
+ (VEC_SIZE * 4 - VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE) + (VEC_SIZE * 4 - VEC_SIZE * 1)(%rsi, %rdx, CHAR_SIZE) + */ + VMOVU (VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %VMM(3) + VMOVU (VEC_SIZE * 3)(%rsi, %rdx, CHAR_SIZE), %VMM(4) + VMOVU %VMM(3), (VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE) + VMOVU %VMM(4), (VEC_SIZE * 3)(%rdi, %rdx, CHAR_SIZE) + MOVU $0, (VEC_SIZE * 4 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE) + xor %rdx, %rdx + vptest %VMM(7), %VMM(7) + jz L(loop_partial_copy_return) + ret + + .p2align 4 +L(page_cross): + mov %rsi, %rcx + mov %rsi, %r11 + and $-VEC_SIZE, %r11 + and $(VEC_SIZE - 1), %rcx + VMOVA (%r11), %VMM(1) + VPCMPEQ %VMM(0), %VMM(1), %VMM(2) + PMOVMSK %VMM(2), %eax + shr %cl, %eax + jz L(page_cross_continue) + +L(ret_vec_x1): + bsf %eax, %eax +# ifdef USE_AS_WCSLCPY + shr $2, %eax +# endif + /* Increment by 1 to account for NULL char. */ + lea 1(%eax), %ecx + cmp %rdx, %rcx + cmovb %rcx, %rdx + test %rdx, %rdx + jz L(ret) + +L(page_cross_small_vec_copy): + cmp $(16 / CHAR_SIZE), %rdx + jbe L(copy_8_byte_scalar) + VMOVU (%rsi), %VMM_128(1) + VMOVU -16(%rsi, %rdx, CHAR_SIZE), %VMM_128(3) + VMOVU %VMM_128(1), (%rdi) + VMOVU %VMM_128(3), -16(%rdi, %rdx, CHAR_SIZE) + MOVU $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) + xor %rdx, %rdx + vptest %VMM(2), %VMM(2) + jz L(continue_second_vector) + ret + +L(copy_8_byte_scalar): + cmp $(8 / CHAR_SIZE), %rdx + jbe L(copy_4_byte_scalar) + movq (%rsi), %r10 + movq -8(%rsi, %rdx, CHAR_SIZE), %r11 + movq %r10, (%rdi) + movq %r11, -8(%rdi, %rdx, CHAR_SIZE) + MOVU $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) + xor %edx, %edx + vptest %VMM(2), %VMM(2) + jz L(continue_second_vector) + ret + +L(copy_4_byte_scalar): +# ifndef USE_AS_WCSLCPY + cmp $4, %rdx + jbe L(copy_2_byte_scalar) +# endif + movl (%rsi), %r10d + movl -4(%rsi, %rdx, CHAR_SIZE), %r11d + movl %r10d, (%rdi) + movl %r11d, -4(%rdi, %rdx, CHAR_SIZE) + MOVU $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) + xor %edx, %edx + vptest %VMM(2), %VMM(2) + jz L(continue_second_vector) + 
ret + +# ifndef USE_AS_WCSLCPY +L(copy_2_byte_scalar): + cmp $2, %rdx + jbe L(copy_1_byte_scalar) + movw (%rsi), %r10w + movw -(CHAR_SIZE * 3)(%rsi, %rdx, CHAR_SIZE), %r11w + movw %r10w, (%rdi) + movw %r11w, -(CHAR_SIZE * 3)(%rdi, %rdx, CHAR_SIZE) + MOVU $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) + xor %edx, %edx + vptest %VMM(2), %VMM(2) + jz L(continue_second_vector) + ret + +L(copy_1_byte_scalar): + MOVU (%rsi), %r10b + MOVU %r10b, (%rdi) + MOVU $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) + xor %edx, %edx + vptest %VMM(2), %VMM(2) + jz L(continue_second_vector) + ret +# endif + +L(ret_vec_x2): + PMOVMSK %VMM(2), %rax + bsf %rax, %rcx + /* Calculate return value. */ + lea VEC_SIZE(%rsi, %rcx), %rax + sub %r8, %rax +# ifdef USE_AS_WCSLCPY + shr $2, %rax + shr $2, %rcx +# endif + inc %rcx + test %rdx, %rdx + jz L(ret) + cmp %rdx, %rcx + cmovb %rcx, %rdx + +L(partial_copy_second_vector): + VMOVU (%rsi, %rdx, CHAR_SIZE), %VMM(1) + VMOVU %VMM(1), (%rdi, %rdx, CHAR_SIZE) + MOVU $0, (VEC_SIZE - CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) + xor %edx, %edx + vptest %VMM(2), %VMM(2) + jz L(continue_third_vector) + +L(ret): + ret + +L(ret_vec_x3): + PMOVMSK %VMM(2), %rax + bsf %rax, %rcx + /* Calculate return value. */ + lea (VEC_SIZE * 2)(%rsi, %rcx), %rax + sub %r8, %rax +# ifdef USE_AS_WCSLCPY + shr $2, %rax + shr $2, %rcx +# endif + inc %rcx + test %rdx, %rdx + jz L(ret) + cmp %rdx, %rcx + cmovb %rcx, %rdx + +L(partial_copy_third_vector): + VMOVU (VEC_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM(1) + VMOVU %VMM(1), (VEC_SIZE)(%rdi, %rdx, CHAR_SIZE) + MOVU $0, ((VEC_SIZE * 2) - CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) + xor %edx, %edx + vptest %VMM(2), %VMM(2) + jz L(continue_fourth_vector) + ret + +L(ret_vec_x4): + PMOVMSK %VMM(2), %rax + bsf %rax, %rcx + /* Calculate return value. 
*/ + lea (VEC_SIZE * 3)(%rsi, %rcx), %rax + sub %r8, %rax +# ifdef USE_AS_WCSLCPY + shr $2, %rax + shr $2, %rcx +# endif + inc %rcx + test %rdx, %rdx + jz L(ret) + cmp %rdx, %rcx + cmovb %rcx, %rdx + +L(partial_copy_fourth_vector): + VMOVU (VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %VMM(1) + VMOVU %VMM(1), (VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE) + MOVU $0, ((VEC_SIZE * 3) - CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) + xor %edx, %edx + vptest %VMM(2), %VMM(2) + jz L(continue_fourth_vector) + ret + +END (STRLCPY) +#endif diff --git a/sysdeps/x86_64/multiarch/strlcpy-generic.c b/sysdeps/x86_64/multiarch/strlcpy-generic.c new file mode 100644 index 0000000000..eee3b7b086 --- /dev/null +++ b/sysdeps/x86_64/multiarch/strlcpy-generic.c @@ -0,0 +1,25 @@ +/* strlcpy generic. + Copyright (C) 2023 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + + +#include <isa-level.h> +#if ISA_SHOULD_BUILD (1) +# define __strlcpy __strlcpy_generic +# include <string/strlcpy.c> + +#endif diff --git a/sysdeps/x86_64/multiarch/strlcpy.c b/sysdeps/x86_64/multiarch/strlcpy.c new file mode 100644 index 0000000000..ded41fbcfb --- /dev/null +++ b/sysdeps/x86_64/multiarch/strlcpy.c @@ -0,0 +1,36 @@ +/* Multiple versions of strlcpy. + All versions must be listed in ifunc-impl-list.c. 
+ Copyright (C) 2023 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +/* Define multiple versions only for the definition in libc. */ +#if IS_IN (libc) +# define __strlcpy __redirect_strlcpy +# include <string.h> +# undef __strlcpy + +# define SYMBOL_NAME strlcpy +# include "ifunc-strlcpy.h" + +libc_ifunc_redirected (__redirect_strlcpy, __strlcpy, IFUNC_SELECTOR ()); +weak_alias (__strlcpy, strlcpy) + +# ifdef SHARED +__hidden_ver1 (__strlcpy, __GI___strlcpy, __redirect_strlcpy) + __attribute__ ((visibility ("hidden"))) __attribute_copy__ (strlcpy); +# endif +#endif diff --git a/sysdeps/x86_64/multiarch/wcslcpy-avx2.S b/sysdeps/x86_64/multiarch/wcslcpy-avx2.S new file mode 100644 index 0000000000..dafc20ded0 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wcslcpy-avx2.S @@ -0,0 +1,4 @@ +#define STRLCPY __wcslcpy_avx2 +#define USE_AS_WCSLCPY 1 + +#include "strlcpy-avx2.S" diff --git a/sysdeps/x86_64/multiarch/wcslcpy-generic.c b/sysdeps/x86_64/multiarch/wcslcpy-generic.c new file mode 100644 index 0000000000..ffd3c0e846 --- /dev/null +++ b/sysdeps/x86_64/multiarch/wcslcpy-generic.c @@ -0,0 +1,25 @@ +/* wcslcpy generic. + Copyright (C) 2023 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + + +#include <isa-level.h> +#if ISA_SHOULD_BUILD (1) +# define __wcslcpy __wcslcpy_generic +# include <wcsmbs/wcslcpy.c> + +#endif diff --git a/sysdeps/x86_64/multiarch/wcslcpy.c b/sysdeps/x86_64/multiarch/wcslcpy.c new file mode 100644 index 0000000000..371ef9626c --- /dev/null +++ b/sysdeps/x86_64/multiarch/wcslcpy.c @@ -0,0 +1,35 @@ +/* Multiple versions of wcslcpy. + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2023 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +/* Define multiple versions only for the definition in libc. 
*/ +#if IS_IN (libc) +# define __wcslcpy __redirect_wcslcpy +# include <wchar.h> +# undef __wcslcpy + +# define SYMBOL_NAME wcslcpy +# include "ifunc-strlcpy.h" + +libc_ifunc_redirected (__redirect_wcslcpy, __wcslcpy, IFUNC_SELECTOR ()); +weak_alias (__wcslcpy, wcslcpy) +# ifdef SHARED +__hidden_ver1 (__wcslcpy, __GI___wcslcpy, __redirect_wcslcpy) + __attribute__((visibility ("hidden"))) __attribute_copy__ (wcslcpy); +# endif +#endif -- 2.38.1 ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH] x86_64: Implement AVX2 version of strlcpy/wcslcpy function 2023-06-30 20:48 [PATCH] x86_64: Implement AVX2 version of strlcpy/wcslcpy function Sunil K Pandey @ 2023-06-30 21:04 ` Noah Goldstein 2023-06-30 21:27 ` Paul Eggert 2023-07-01 9:41 ` Florian Weimer 2023-07-02 17:03 ` Noah Goldstein 1 sibling, 2 replies; 24+ messages in thread From: Noah Goldstein @ 2023-06-30 21:04 UTC (permalink / raw) To: Sunil K Pandey; +Cc: libc-alpha, hjl.tools On Fri, Jun 30, 2023 at 3:48 PM Sunil K Pandey via Libc-alpha <libc-alpha@sourceware.org> wrote: > > This patch optimizes strlcpy/wsclcpy string functions for AVX2. > --- > sysdeps/x86_64/multiarch/Makefile | 4 + > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 18 + > sysdeps/x86_64/multiarch/ifunc-strlcpy.h | 34 ++ > sysdeps/x86_64/multiarch/strlcpy-avx2.S | 446 +++++++++++++++++++++ > sysdeps/x86_64/multiarch/strlcpy-generic.c | 25 ++ > sysdeps/x86_64/multiarch/strlcpy.c | 36 ++ > sysdeps/x86_64/multiarch/wcslcpy-avx2.S | 4 + > sysdeps/x86_64/multiarch/wcslcpy-generic.c | 25 ++ > sysdeps/x86_64/multiarch/wcslcpy.c | 35 ++ > 9 files changed, 627 insertions(+) > create mode 100644 sysdeps/x86_64/multiarch/ifunc-strlcpy.h > create mode 100644 sysdeps/x86_64/multiarch/strlcpy-avx2.S > create mode 100644 sysdeps/x86_64/multiarch/strlcpy-generic.c > create mode 100644 sysdeps/x86_64/multiarch/strlcpy.c > create mode 100644 sysdeps/x86_64/multiarch/wcslcpy-avx2.S > create mode 100644 sysdeps/x86_64/multiarch/wcslcpy-generic.c > create mode 100644 sysdeps/x86_64/multiarch/wcslcpy.c > > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile > index e1e894c963..7e3fc081df 100644 > --- a/sysdeps/x86_64/multiarch/Makefile > +++ b/sysdeps/x86_64/multiarch/Makefile > @@ -82,6 +82,8 @@ sysdep_routines += \ > strcpy-sse2 \ > strcpy-sse2-unaligned \ > strcspn-sse4 \ > + strlcpy-avx2 \ > + strlcpy-generic \ > strlen-avx2 \ > strlen-avx2-rtm \ > strlen-evex \ > @@ -153,6 +155,8 @@ sysdep_routines += \ > 
wcscpy-evex \ > wcscpy-generic \ > wcscpy-ssse3 \ > + wcslcpy-avx2 \ > + wcslcpy-generic \ > wcslen-avx2 \ > wcslen-avx2-rtm \ > wcslen-evex \ > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > index 5427ff1907..9928dee187 100644 > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > @@ -751,6 +751,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > 1, > __strncat_sse2_unaligned)) > > + /* Support sysdeps/x86_64/multiarch/strlcpy.c. */ > + IFUNC_IMPL (i, name, strlcpy, > + X86_IFUNC_IMPL_ADD_V3 (array, i, strlcpy, > + CPU_FEATURE_USABLE (AVX2), > + __strlcpy_avx2) > + X86_IFUNC_IMPL_ADD_V1 (array, i, strlcpy, > + 1, > + __strlcpy_generic)) > + > /* Support sysdeps/x86_64/multiarch/strncpy.c. */ > IFUNC_IMPL (i, name, strncpy, > X86_IFUNC_IMPL_ADD_V4 (array, i, strncpy, > @@ -917,6 +926,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > 1, > __wcscpy_generic)) > > + /* Support sysdeps/x86_64/multiarch/wcslcpy.c. */ > + IFUNC_IMPL (i, name, wcslcpy, > + X86_IFUNC_IMPL_ADD_V3 (array, i, wcslcpy, > + CPU_FEATURE_USABLE (AVX2), > + __wcslcpy_avx2) > + X86_IFUNC_IMPL_ADD_V1 (array, i, wcslcpy, > + 1, > + __wcslcpy_generic)) > + > /* Support sysdeps/x86_64/multiarch/wcsncpy.c. */ > IFUNC_IMPL (i, name, wcsncpy, > X86_IFUNC_IMPL_ADD_V4 (array, i, wcsncpy, > diff --git a/sysdeps/x86_64/multiarch/ifunc-strlcpy.h b/sysdeps/x86_64/multiarch/ifunc-strlcpy.h > new file mode 100644 > index 0000000000..982a30d15b > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/ifunc-strlcpy.h > @@ -0,0 +1,34 @@ > +/* Common definition for ifunc selections. > + All versions must be listed in ifunc-impl-list.c. > + Copyright (C) 2023 Free Software Foundation, Inc. > + This file is part of the GNU C Library. 
> + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +#include <init-arch.h> > + > +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; > +extern __typeof (REDIRECT_NAME) OPTIMIZE (generic) attribute_hidden; > + > +static inline void * > +IFUNC_SELECTOR (void) > +{ > + const struct cpu_features *cpu_features = __get_cpu_features (); > + > + if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX2)) > + return OPTIMIZE (avx2); > + > + return OPTIMIZE (generic); > +} > diff --git a/sysdeps/x86_64/multiarch/strlcpy-avx2.S b/sysdeps/x86_64/multiarch/strlcpy-avx2.S > new file mode 100644 > index 0000000000..cf54b1e990 > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/strlcpy-avx2.S > @@ -0,0 +1,446 @@ > +/* Strlcpy/wcslcpy optimized with AVX2. > + Copyright (C) 2023 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +#include <isa-level.h> > + > +#if ISA_SHOULD_BUILD (3) > + > +# include <sysdep.h> > + > +# ifndef VEC_SIZE > +# include "x86-avx-vecs.h" > +# endif > + > +# ifndef STRLCPY > +# define STRLCPY __strlcpy_avx2 > +# endif > + > + > +# ifdef USE_AS_WCSLCPY > +# define CHAR_SIZE 4 > +# define MOVU movl > +# define VPCMPEQ vpcmpeqd > +# define VPMINU vpminud > +# else > +# define CHAR_SIZE 1 > +# define MOVU movb > +# define VPCMPEQ vpcmpeqb > +# define VPMINU vpminub > +# endif > + > +# define PMOVMSK vpmovmskb > +# define PAGE_SIZE 4096 > +# define VEC_SIZE 32 > +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) > + > + .section SECTION(.text),"ax",@progbits > +/* Aligning entry point to 64 byte, provides better performance for > + one vector length string. */ > + > +ENTRY_P2ALIGN (STRLCPY, 6) > +# ifdef __ILP32__ > + /* Clear the upper 32 bits. */ > + movl %edx, %edx > +# endif > + > + /* Zero out vector register for end of string comparison. */ > + vpxor %VMM(0), %VMM(0), %VMM(0) > + /* Save source pointer for return calculation. */ > + mov %rsi, %r8 > + mov %esi, %eax > + sall $20, %eax > + cmpl $((PAGE_SIZE - (VEC_SIZE)) << 20), %eax > + ja L(page_cross) > + > +L(page_cross_continue): > + /* Load first vector. */ > + VMOVU (%rsi), %VMM(1) > + VPCMPEQ %VMM(0), %VMM(1), %VMM(2) > + PMOVMSK %VMM(2), %eax > + test %eax, %eax > + jnz L(ret_vec_x1) > + > + test %rdx, %rdx > + jz L(continue_second_vector) > + > + /* Check whether we can copy full vector. */ > + cmp $CHAR_PER_VEC, %rdx > + jbe L(page_cross_small_vec_copy) > + /* Copy first vector. */ > + VMOVU %VMM(1), (%rdi) > + sub $CHAR_PER_VEC, %rdx > + > +L(continue_second_vector): > + /* Align RSI pointer and adjust RDI based on offset. 
*/ > + mov %rsi, %rax > + and $-VEC_SIZE, %rsi > + sub %rsi, %rax > + sub %rax, %rdi > + > + /* Check if string already copied N char, and RDX is 0. */ > + test %rdx, %rdx > + jz L(skip_copy_alignment_fix) > + > + /* Adjust RDX for copy alignment fix. */ > +# ifdef USE_AS_WCSLCPY > + shr $2, %rax > +# endif > + add %rax, %rdx > + > +L(skip_copy_alignment_fix): > + /* Load second vector. */ > + VMOVA (VEC_SIZE * 1)(%rsi), %VMM(1) > + VPCMPEQ %VMM(0), %VMM(1), %VMM(2) > + vptest %VMM(2), %VMM(2) > + jnz L(ret_vec_x2) > + > + /* Skip copy if RDX is 0. */ > + test %rdx, %rdx > + jz L(continue_third_vector) > + > + /* Jump below/equal(instead of below) used here, because last > + copy chracter must be NULL. */ > + cmp $CHAR_PER_VEC, %rdx > + jbe L(partial_copy_second_vector) > + > + sub $CHAR_PER_VEC, %rdx > + /* Copy second vector. */ > + VMOVU %VMM(1), (VEC_SIZE * 1)(%rdi) > + > +L(continue_third_vector): > + /* Load third vector. */ > + VMOVA (VEC_SIZE * 2)(%rsi), %VMM(1) > + VPCMPEQ %VMM(0), %VMM(1), %VMM(2) > + vptest %VMM(2), %VMM(2) > + jnz L(ret_vec_x3) > + > + /* Skip copy if RDX is 0. */ > + test %rdx, %rdx > + jz L(continue_fourth_vector) > + > + cmp $CHAR_PER_VEC, %rdx > + jbe L(partial_copy_third_vector) > + > + sub $CHAR_PER_VEC, %rdx > + /* Copy third vector. */ > + VMOVU %VMM(1), (VEC_SIZE * 2)(%rdi) > + > +L(continue_fourth_vector): > + /* Load fourth vector. */ > + VMOVA (VEC_SIZE * 3)(%rsi), %VMM(1) > + VPCMPEQ %VMM(0), %VMM(1), %VMM(2) > + vptest %VMM(2), %VMM(2) > + jnz L(ret_vec_x4) > + > + /* Skip copy if RDX is 0. */ > + test %rdx, %rdx > + jz L(loop_4x_align) > + > + cmp $CHAR_PER_VEC, %rdx > + jbe L(partial_copy_fourth_vector) > + > + sub $CHAR_PER_VEC, %rdx > + /* Copy fourth vector. */ > + VMOVU %VMM(1), (VEC_SIZE * 3)(%rdi) > + > + > +L(loop_4x_align): > + /* Jump to loop if RSI is already 4 vector align. */ > + test $(VEC_SIZE * 4 - 1), %esi > + jz L(loop_4x_read) > + > + mov %rsi, %rcx > + > + /* Align RSI to 4x vector. 
*/ > + and $(VEC_SIZE * -4), %rsi > + sub %rsi, %rcx > + > + /* Adjust RDI for RSI alignment fix. */ > + sub %rcx, %rdi > + > + /* Jump to loop if RDX is 0. */ > + test %rdx, %rdx > + jz L(loop_4x_read) > + > +# ifdef USE_AS_WCSLCPY > + shr $2, %rcx > +# endif > + > + /* Adjust RDX for RSI alignment fix. */ > + add %rcx, %rdx > + jmp L(loop_4x_read) > + > + .p2align 4,,6 > +L(loop_4x_vec): > + /* Skip copy if RDX is 0. */ > + test %rdx, %rdx > + jz L(loop_partial_copy_return) > + cmp $(CHAR_PER_VEC * 4), %rdx > + jbe L(loop_partial_copy) > + VMOVU %VMM(1), (VEC_SIZE * 4)(%rdi) > + VMOVU %VMM(2), (VEC_SIZE * 5)(%rdi) > + VMOVU %VMM(3), (VEC_SIZE * 6)(%rdi) > + VMOVU %VMM(4), (VEC_SIZE * 7)(%rdi) > + sub $(CHAR_PER_VEC * 4), %rdx > + > +L(loop_partial_copy_return): > + sub $(VEC_SIZE * -4), %rsi > + sub $(VEC_SIZE * -4), %rdi > + > +L(loop_4x_read): > + VMOVA (VEC_SIZE * 4)(%rsi), %VMM(1) > + VMOVA (VEC_SIZE * 5)(%rsi), %VMM(2) > + VMOVA (VEC_SIZE * 6)(%rsi), %VMM(3) > + VMOVA (VEC_SIZE * 7)(%rsi), %VMM(4) > + VPMINU %VMM(1), %VMM(2), %VMM(5) > + VPMINU %VMM(3), %VMM(4), %VMM(6) > + VPMINU %VMM(5), %VMM(6), %VMM(7) > + VPCMPEQ %VMM(0), %VMM(7), %VMM(7) > + vptest %VMM(7), %VMM(7) > + > + jz L(loop_4x_vec) > + > + /* Check if string ends in first vector or second vector. */ > + lea (VEC_SIZE * 4)(%rsi), %rax > + sub %r8, %rax > +# ifdef USE_AS_WCSLCPY > + shr $2, %rax > +# endif > + xor %r10, %r10 > + VPCMPEQ %VMM(0), %VMM(5), %VMM(6) > + vptest %VMM(6), %VMM(6) > + jnz L(endloop) > + sub $(CHAR_PER_VEC * -2), %rax > + mov $(CHAR_PER_VEC * 2), %r10 > + VMOVA %VMM(3), %VMM(1) > + VMOVA %VMM(4), %VMM(2) > + > +L(endloop): > + VPCMPEQ %VMM(0), %VMM(1), %VMM(1) > + VPCMPEQ %VMM(0), %VMM(2), %VMM(2) > + PMOVMSK %VMM(1), %rcx > + PMOVMSK %VMM(2), %r9 > + shlq $32, %r9 > + orq %r9, %rcx > + bsf %rcx, %rcx > + /* Shift RCX by 2, VPMOVMSK has only byte version. */ > +# ifdef USE_AS_WCSLCPY > + shr $2, %rcx > +# endif > + /* At this point RAX has length to return. 
*/ > + add %rcx, %rax > + test %rdx, %rdx > + jz L(ret) > + > + /* Add 1 to account for NULL character in RDX comparison. */ > + lea 1(%r10, %rcx), %rcx > + cmp %rdx, %rcx > + cmovb %rcx, %rdx > + > +L(loop_partial_copy): > + cmp $(CHAR_PER_VEC * 2), %rdx > + jbe L(loop_partial_first_half) > + /* Reload first 2 vector. */ > + VMOVA (VEC_SIZE * 4)(%rsi), %VMM(1) > + VMOVA (VEC_SIZE * 5)(%rsi), %VMM(2) > + VMOVU %VMM(1), (VEC_SIZE * 4)(%rdi) > + VMOVU %VMM(2), (VEC_SIZE * 5)(%rdi) > + > +L(loop_partial_first_half): > + /* Go back 2 vector from last and use overlapping copy. > + (VEC_SIZE * 4 - VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE) > + (VEC_SIZE * 4 - VEC_SIZE * 1)(%rsi, %rdx, CHAR_SIZE) > + */ > + VMOVU (VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %VMM(3) > + VMOVU (VEC_SIZE * 3)(%rsi, %rdx, CHAR_SIZE), %VMM(4) > + VMOVU %VMM(3), (VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE) > + VMOVU %VMM(4), (VEC_SIZE * 3)(%rdi, %rdx, CHAR_SIZE) > + MOVU $0, (VEC_SIZE * 4 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE) > + xor %rdx, %rdx > + vptest %VMM(7), %VMM(7) > + jz L(loop_partial_copy_return) > + ret > + > + .p2align 4 > +L(page_cross): > + mov %rsi, %rcx > + mov %rsi, %r11 > + and $-VEC_SIZE, %r11 > + and $(VEC_SIZE - 1), %rcx > + VMOVA (%r11), %VMM(1) > + VPCMPEQ %VMM(0), %VMM(1), %VMM(2) > + PMOVMSK %VMM(2), %eax > + shr %cl, %eax > + jz L(page_cross_continue) > + > +L(ret_vec_x1): > + bsf %eax, %eax > +# ifdef USE_AS_WCSLCPY > + shr $2, %eax > +# endif > + /* Increment by 1 to account for NULL char. 
*/ > + lea 1(%eax), %ecx > + cmp %rdx, %rcx > + cmovb %rcx, %rdx > + test %rdx, %rdx > + jz L(ret) > + > +L(page_cross_small_vec_copy): > + cmp $(16 / CHAR_SIZE), %rdx > + jbe L(copy_8_byte_scalar) > + VMOVU (%rsi), %VMM_128(1) > + VMOVU -16(%rsi, %rdx, CHAR_SIZE), %VMM_128(3) > + VMOVU %VMM_128(1), (%rdi) > + VMOVU %VMM_128(3), -16(%rdi, %rdx, CHAR_SIZE) > + MOVU $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) > + xor %rdx, %rdx > + vptest %VMM(2), %VMM(2) > + jz L(continue_second_vector) > + ret > + > +L(copy_8_byte_scalar): > + cmp $(8 / CHAR_SIZE), %rdx > + jbe L(copy_4_byte_scalar) > + movq (%rsi), %r10 > + movq -8(%rsi, %rdx, CHAR_SIZE), %r11 > + movq %r10, (%rdi) > + movq %r11, -8(%rdi, %rdx, CHAR_SIZE) > + MOVU $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) > + xor %edx, %edx > + vptest %VMM(2), %VMM(2) > + jz L(continue_second_vector) > + ret > + > +L(copy_4_byte_scalar): > +# ifndef USE_AS_WCSLCPY > + cmp $4, %rdx > + jbe L(copy_2_byte_scalar) > +# endif > + movl (%rsi), %r10d > + movl -4(%rsi, %rdx, CHAR_SIZE), %r11d > + movl %r10d, (%rdi) > + movl %r11d, -4(%rdi, %rdx, CHAR_SIZE) > + MOVU $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) > + xor %edx, %edx > + vptest %VMM(2), %VMM(2) > + jz L(continue_second_vector) > + ret > + > +# ifndef USE_AS_WCSLCPY > +L(copy_2_byte_scalar): > + cmp $2, %rdx > + jbe L(copy_1_byte_scalar) > + movw (%rsi), %r10w > + movw -(CHAR_SIZE * 3)(%rsi, %rdx, CHAR_SIZE), %r11w > + movw %r10w, (%rdi) > + movw %r11w, -(CHAR_SIZE * 3)(%rdi, %rdx, CHAR_SIZE) > + MOVU $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) > + xor %edx, %edx > + vptest %VMM(2), %VMM(2) > + jz L(continue_second_vector) > + ret > + > +L(copy_1_byte_scalar): > + MOVU (%rsi), %r10b > + MOVU %r10b, (%rdi) > + MOVU $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) > + xor %edx, %edx > + vptest %VMM(2), %VMM(2) > + jz L(continue_second_vector) > + ret > +# endif > + > +L(ret_vec_x2): > + PMOVMSK %VMM(2), %rax > + bsf %rax, %rcx > + /* Calculate return value. 
*/ > + lea VEC_SIZE(%rsi, %rcx), %rax > + sub %r8, %rax > +# ifdef USE_AS_WCSLCPY > + shr $2, %rax > + shr $2, %rcx > +# endif > + inc %rcx > + test %rdx, %rdx > + jz L(ret) > + cmp %rdx, %rcx > + cmovb %rcx, %rdx > + > +L(partial_copy_second_vector): > + VMOVU (%rsi, %rdx, CHAR_SIZE), %VMM(1) > + VMOVU %VMM(1), (%rdi, %rdx, CHAR_SIZE) > + MOVU $0, (VEC_SIZE - CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) > + xor %edx, %edx > + vptest %VMM(2), %VMM(2) > + jz L(continue_third_vector) > + > +L(ret): > + ret > + > +L(ret_vec_x3): > + PMOVMSK %VMM(2), %rax > + bsf %rax, %rcx > + /* Calculate return value. */ > + lea (VEC_SIZE * 2)(%rsi, %rcx), %rax > + sub %r8, %rax > +# ifdef USE_AS_WCSLCPY > + shr $2, %rax > + shr $2, %rcx > +# endif > + inc %rcx > + test %rdx, %rdx > + jz L(ret) > + cmp %rdx, %rcx > + cmovb %rcx, %rdx > + > +L(partial_copy_third_vector): > + VMOVU (VEC_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM(1) > + VMOVU %VMM(1), (VEC_SIZE)(%rdi, %rdx, CHAR_SIZE) > + MOVU $0, ((VEC_SIZE * 2) - CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) > + xor %edx, %edx > + vptest %VMM(2), %VMM(2) > + jz L(continue_fourth_vector) > + ret > + > +L(ret_vec_x4): > + PMOVMSK %VMM(2), %rax > + bsf %rax, %rcx > + /* Calculate return value. 
*/ > + lea (VEC_SIZE * 3)(%rsi, %rcx), %rax > + sub %r8, %rax > +# ifdef USE_AS_WCSLCPY > + shr $2, %rax > + shr $2, %rcx > +# endif > + inc %rcx > + test %rdx, %rdx > + jz L(ret) > + cmp %rdx, %rcx > + cmovb %rcx, %rdx > + > +L(partial_copy_fourth_vector): > + VMOVU (VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %VMM(1) > + VMOVU %VMM(1), (VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE) > + MOVU $0, ((VEC_SIZE * 3) - CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) > + xor %edx, %edx > + vptest %VMM(2), %VMM(2) > + jz L(continue_fourth_vector) > + ret > + > +END (STRLCPY) > +#endif > diff --git a/sysdeps/x86_64/multiarch/strlcpy-generic.c b/sysdeps/x86_64/multiarch/strlcpy-generic.c > new file mode 100644 > index 0000000000..eee3b7b086 > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/strlcpy-generic.c > @@ -0,0 +1,25 @@ > +/* strlcpy generic. > + Copyright (C) 2023 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. 
*/ > + > + > +#include <isa-level.h> > +#if ISA_SHOULD_BUILD (1) > +# define __strlcpy __strlcpy_generic > +# include <string/strlcpy.c> > + > +#endif > diff --git a/sysdeps/x86_64/multiarch/strlcpy.c b/sysdeps/x86_64/multiarch/strlcpy.c > new file mode 100644 > index 0000000000..ded41fbcfb > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/strlcpy.c > @@ -0,0 +1,36 @@ > +/* Multiple versions of strlcpy. > + All versions must be listed in ifunc-impl-list.c. > + Copyright (C) 2023 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +/* Define multiple versions only for the definition in libc. 
*/ > +#if IS_IN (libc) > +# define __strlcpy __redirect_strlcpy > +# include <string.h> > +# undef __strlcpy > + > +# define SYMBOL_NAME strlcpy > +# include "ifunc-strlcpy.h" > + > +libc_ifunc_redirected (__redirect_strlcpy, __strlcpy, IFUNC_SELECTOR ()); > +weak_alias (__strlcpy, strlcpy) > + > +# ifdef SHARED > +__hidden_ver1 (__strlcpy, __GI___strlcpy, __redirect_strlcpy) > + __attribute__ ((visibility ("hidden"))) __attribute_copy__ (strlcpy); > +# endif > +#endif > diff --git a/sysdeps/x86_64/multiarch/wcslcpy-avx2.S b/sysdeps/x86_64/multiarch/wcslcpy-avx2.S > new file mode 100644 > index 0000000000..dafc20ded0 > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/wcslcpy-avx2.S > @@ -0,0 +1,4 @@ > +#define STRLCPY __wcslcpy_avx2 > +#define USE_AS_WCSLCPY 1 > + > +#include "strlcpy-avx2.S" > diff --git a/sysdeps/x86_64/multiarch/wcslcpy-generic.c b/sysdeps/x86_64/multiarch/wcslcpy-generic.c > new file mode 100644 > index 0000000000..ffd3c0e846 > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/wcslcpy-generic.c > @@ -0,0 +1,25 @@ > +/* wcslcpy generic. > + Copyright (C) 2023 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. 
*/ > + > + > +#include <isa-level.h> > +#if ISA_SHOULD_BUILD (1) > +# define __wcslcpy __wcslcpy_generic > +# include <wcsmbs/wcslcpy.c> > + > +#endif > diff --git a/sysdeps/x86_64/multiarch/wcslcpy.c b/sysdeps/x86_64/multiarch/wcslcpy.c > new file mode 100644 > index 0000000000..371ef9626c > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/wcslcpy.c > @@ -0,0 +1,35 @@ > +/* Multiple versions of wcslcpy. > + All versions must be listed in ifunc-impl-list.c. > + Copyright (C) 2023 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +/* Define multiple versions only for the definition in libc. */ > +#if IS_IN (libc) > +# define __wcslcpy __redirect_wcslcpy > +# include <wchar.h> > +# undef __wcslcpy > + > +# define SYMBOL_NAME wcslcpy > +# include "ifunc-strlcpy.h" > + > +libc_ifunc_redirected (__redirect_wcslcpy, __wcslcpy, IFUNC_SELECTOR ()); > +weak_alias (__wcslcpy, wcslcpy) > +# ifdef SHARED > +__hidden_ver1 (__wcslcpy, __GI___wcslcpy, __redirect_wcslcpy) > + __attribute__((visibility ("hidden"))) __attribute_copy__ (wcslcpy); > +# endif > +#endif > -- > 2.38.1 > Think we should at the very least wait for the generic strlcpy codes to land first. ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH] x86_64: Implement AVX2 version of strlcpy/wcslcpy function 2023-06-30 21:04 ` Noah Goldstein @ 2023-06-30 21:27 ` Paul Eggert 2023-06-30 22:21 ` Sunil Pandey 2023-07-03 12:55 ` Adhemerval Zanella Netto 2023-07-01 9:41 ` Florian Weimer 1 sibling, 2 replies; 24+ messages in thread From: Paul Eggert @ 2023-06-30 21:27 UTC (permalink / raw) To: Noah Goldstein, Sunil K Pandey; +Cc: libc-alpha, hjl.tools On 2023-06-30 14:04, Noah Goldstein via Libc-alpha wrote: > Think we should at the very least wait for the generic strlcpy codes > to land first. Let's not optimize these functions at all, unless there's good and measured reason to do so. In practice I expected they're called with small sizes for which optimization is a net minus as it consumes valuable maintenance time with no real benefit. ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH] x86_64: Implement AVX2 version of strlcpy/wcslcpy function 2023-06-30 21:27 ` Paul Eggert @ 2023-06-30 22:21 ` Sunil Pandey 2023-06-30 23:22 ` Noah Goldstein 2023-07-03 16:30 ` Paul Eggert 1 sibling, 2 replies; 24+ messages in thread From: Sunil Pandey @ 2023-06-30 22:21 UTC (permalink / raw) To: Paul Eggert; +Cc: Noah Goldstein, libc-alpha, hjl.tools [-- Attachment #1.1: Type: text/plain, Size: 702 bytes --] On Fri, Jun 30, 2023 at 2:27 PM Paul Eggert <eggert@cs.ucla.edu> wrote: > On 2023-06-30 14:04, Noah Goldstein via Libc-alpha wrote: > > Think we should at the very least wait for the generic strlcpy codes > > to land first. > > Let's not optimize these functions at all, unless there's good and > measured reason to do so. In practice I expected they're called with > small sizes for which optimization is a net minus as it consumes > valuable maintenance time with no real benefit. > Hi Paul, Attached is strlcpy/wcslcpy microbenchmark data based on Noah strlcpy/wcslcpy microbenchmark patch. 
https://sourceware.org/pipermail/libc-alpha/2023-April/147557.html Thanks, Sunil [-- Attachment #2: bench-wcslcpy.txt --] [-- Type: text/plain, Size: 18368 bytes --] Function: wcslcpy Variant: __wcslcpy_avx2 __wcslcpy_generic ======================================================================================================================== len=16, align1=1, align2=1, n=16: 14.99 ( 24.63%) 19.89 len=16, align1=1, align2=1, n=16: 14.58 ( 19.61%) 18.13 len=16, align1=1, align2=2, n=16: 16.99 ( 4.02%) 17.70 len=16, align1=2, align2=1, n=16: 15.14 ( 17.08%) 18.25 len=2, align1=7, align2=2, n=4: 8.40 ( 44.41%) 15.11 len=4, align1=2, align2=7, n=2: 10.91 ( 42.41%) 18.95 len=2, align1=7, align2=2, n=4: 8.92 ( 34.99%) 13.72 len=4, align1=2, align2=7, n=2: 10.92 ( 42.05%) 18.84 len=16, align1=2, align2=2, n=16: 15.70 ( 11.97%) 17.84 len=16, align1=2, align2=2, n=16: 14.83 ( 16.82%) 17.83 len=16, align1=2, align2=4, n=16: 17.30 ( -0.46%) 17.22 len=16, align1=4, align2=2, n=16: 15.44 ( 15.20%) 18.21 len=4, align1=6, align2=4, n=8: 12.87 ( 14.74%) 15.09 len=8, align1=4, align2=6, n=4: 13.72 ( 25.95%) 18.53 len=4, align1=6, align2=4, n=8: 12.85 ( 9.03%) 14.13 len=8, align1=4, align2=6, n=4: 12.67 ( 31.60%) 18.52 len=16, align1=3, align2=3, n=16: 14.57 ( 15.76%) 17.30 len=16, align1=3, align2=3, n=16: 14.82 ( 14.03%) 17.23 len=16, align1=3, align2=6, n=16: 17.02 ( 3.24%) 17.59 len=16, align1=6, align2=3, n=16: 15.04 ( 19.50%) 18.68 len=8, align1=5, align2=6, n=16: 14.96 ( 8.00%) 16.26 len=16, align1=6, align2=5, n=8: 13.70 ( 25.56%) 18.41 len=8, align1=5, align2=6, n=16: 14.54 ( 7.87%) 15.78 len=16, align1=6, align2=5, n=8: 12.35 ( 24.15%) 16.28 len=16, align1=4, align2=4, n=16: 13.93 ( 14.41%) 16.28 len=16, align1=4, align2=4, n=16: 13.63 ( 16.32%) 16.29 len=16, align1=4, align2=0, n=16: 12.97 ( 21.40%) 16.51 len=16, align1=0, align2=4, n=16: 14.09 ( 15.59%) 16.70 len=16, align1=4, align2=0, n=32: 13.75 ( 31.95%) 20.20 len=32, align1=0, align2=4, n=16: 19.20 ( -0.01%) 19.20 
len=16, align1=4, align2=0, n=32: 14.45 ( 31.46%) 21.08 len=32, align1=0, align2=4, n=16: 19.07 ( -1.55%) 18.78 len=16, align1=5, align2=5, n=16: 14.89 ( 15.97%) 17.72 len=16, align1=5, align2=5, n=16: 13.78 ( 15.12%) 16.23 len=16, align1=5, align2=2, n=16: 14.89 ( 13.72%) 17.26 len=16, align1=2, align2=5, n=16: 17.32 ( -0.72%) 17.20 len=32, align1=3, align2=2, n=64: 23.78 ( 20.84%) 30.05 len=64, align1=2, align2=3, n=32: 24.54 ( 0.82%) 24.74 len=32, align1=3, align2=2, n=64: 22.48 ( 17.99%) 27.41 len=64, align1=2, align2=3, n=32: 22.63 ( 8.72%) 24.79 len=16, align1=6, align2=6, n=16: 14.76 ( 14.10%) 17.19 len=16, align1=6, align2=6, n=16: 14.57 ( 16.81%) 17.52 len=16, align1=6, align2=4, n=16: 14.88 ( 13.70%) 17.25 len=16, align1=4, align2=6, n=16: 16.29 ( -0.14%) 16.27 len=64, align1=2, align2=4, n=128: 28.40 ( 9.37%) 31.34 len=128, align1=4, align2=2, n=64: 28.48 ( 10.08%) 31.67 len=64, align1=2, align2=4, n=128: 29.65 ( 11.33%) 33.44 len=128, align1=4, align2=2, n=64: 30.18 ( 6.40%) 32.25 len=16, align1=7, align2=7, n=16: 14.86 ( 8.40%) 16.22 len=16, align1=7, align2=7, n=16: 13.78 ( 16.30%) 16.47 len=16, align1=7, align2=6, n=16: 14.23 ( 12.27%) 16.22 len=16, align1=6, align2=7, n=16: 16.30 ( -0.53%) 16.22 len=128, align1=1, align2=6, n=256: 35.07 ( 25.88%) 47.32 len=256, align1=6, align2=1, n=128: 45.32 ( 11.90%) 51.44 len=128, align1=1, align2=6, n=256: 35.14 ( 24.65%) 46.64 len=256, align1=6, align2=1, n=128: 43.26 ( 15.54%) 51.22 len=8, align1=0, align2=0, n=16: 13.17 ( 29.35%) 18.65 len=32, align1=0, align2=0, n=16: 18.81 ( -3.57%) 18.17 len=8, align1=7, align2=2, n=16: 13.92 ( -7.07%) 13.00 len=32, align1=7, align2=2, n=16: 17.52 ( 14.77%) 20.55 len=16, align1=0, align2=0, n=32: 13.85 ( 33.77%) 20.91 len=64, align1=0, align2=0, n=32: 23.32 ( 7.24%) 25.14 len=16, align1=6, align2=4, n=32: 14.87 ( 17.40%) 18.00 len=64, align1=6, align2=4, n=32: 23.32 ( 14.99%) 27.43 len=32, align1=0, align2=0, n=64: 21.05 ( 16.72%) 25.28 len=128, align1=0, align2=0, n=64: 
28.81 ( 11.25%) 32.46 len=32, align1=5, align2=6, n=64: 24.68 ( 10.16%) 27.47 len=128, align1=5, align2=6, n=64: 28.66 ( 7.24%) 30.89 len=64, align1=0, align2=0, n=128: 24.98 ( 21.37%) 31.77 len=256, align1=0, align2=0, n=128: 43.90 ( 18.92%) 54.14 len=64, align1=4, align2=0, n=128: 26.13 ( 24.65%) 34.68 len=256, align1=4, align2=0, n=128: 44.27 ( 15.06%) 52.12 len=128, align1=0, align2=0, n=256: 34.29 ( 33.53%) 51.58 len=512, align1=0, align2=0, n=256: 68.94 ( 8.14%) 75.05 len=128, align1=3, align2=2, n=256: 36.06 ( 15.45%) 42.65 len=512, align1=3, align2=2, n=256: 65.15 ( 12.33%) 74.32 len=256, align1=0, align2=0, n=512: 46.37 ( 30.42%) 66.64 len=1024, align1=0, align2=0, n=512: 114.89 ( 8.32%) 125.31 len=256, align1=2, align2=4, n=512: 56.05 ( 16.50%) 67.12 len=1024, align1=2, align2=4, n=512: 179.87 (-52.13%) 118.24 len=512, align1=0, align2=0, n=1024: 68.16 ( 29.70%) 96.96 len=512, align1=1, align2=6, n=1024: 119.39 (-26.04%) 94.72 len=128, align1=1, align2=0, n=64: 27.46 ( 17.94%) 33.46 len=128, align1=0, align2=0, n=64: 29.69 ( -2.62%) 28.93 len=128, align1=0, align2=0, n=64: 27.25 ( 6.15%) 29.03 len=128, align1=0, align2=0, n=64: 27.24 ( 6.61%) 29.17 len=64, align1=1, align2=0, n=128: 25.50 ( 21.40%) 32.44 len=64, align1=0, align2=0, n=128: 23.50 ( 27.08%) 32.22 len=64, align1=0, align2=0, n=128: 24.88 ( 16.98%) 29.97 len=64, align1=0, align2=0, n=128: 24.59 ( 22.98%) 31.92 len=128, align1=1, align2=0, n=96: 27.46 ( 29.72%) 39.07 len=128, align1=0, align2=0, n=96: 28.55 ( 20.33%) 35.83 len=128, align1=0, align2=0, n=96: 27.25 ( 24.21%) 35.95 len=128, align1=0, align2=0, n=96: 28.53 ( 19.86%) 35.59 len=96, align1=1, align2=0, n=128: 30.65 ( 18.65%) 37.68 len=96, align1=0, align2=0, n=128: 28.06 ( 19.41%) 34.82 len=96, align1=0, align2=0, n=128: 27.92 ( 20.27%) 35.02 len=96, align1=0, align2=0, n=128: 28.06 ( 19.43%) 34.83 len=128, align1=1, align2=0, n=128: 31.31 ( 28.02%) 43.51 len=128, align1=0, align2=0, n=128: 28.52 ( 29.34%) 40.37 len=128, align1=0, 
align2=0, n=128: 27.25 ( 32.18%) 40.17 len=128, align1=0, align2=0, n=128: 27.46 ( 31.33%) 39.99 len=128, align1=1, align2=0, n=128: 31.32 ( 28.00%) 43.50 len=128, align1=0, align2=0, n=128: 27.46 ( 31.03%) 39.82 len=128, align1=0, align2=0, n=128: 27.25 ( 32.30%) 40.25 len=128, align1=0, align2=0, n=128: 27.25 ( 31.97%) 40.05 len=128, align1=1, align2=0, n=160: 34.00 ( 20.12%) 42.56 len=128, align1=0, align2=0, n=160: 32.19 ( 30.63%) 46.40 len=128, align1=0, align2=0, n=160: 32.17 ( 28.12%) 44.76 len=128, align1=0, align2=0, n=160: 32.39 ( 27.63%) 44.76 len=160, align1=1, align2=0, n=128: 29.84 ( 35.97%) 46.61 len=160, align1=0, align2=0, n=128: 31.79 ( 25.56%) 42.71 len=160, align1=0, align2=0, n=128: 32.00 ( 24.86%) 42.59 len=160, align1=0, align2=0, n=128: 31.79 ( 25.85%) 42.86 len=128, align1=1, align2=0, n=192: 33.81 ( 21.08%) 42.84 len=128, align1=0, align2=0, n=192: 32.38 ( 29.98%) 46.24 len=128, align1=0, align2=0, n=192: 32.38 ( 27.38%) 44.58 len=128, align1=0, align2=0, n=192: 32.18 ( 28.29%) 44.87 len=192, align1=1, align2=0, n=128: 34.71 ( 27.54%) 47.90 len=192, align1=0, align2=0, n=128: 35.25 ( 22.44%) 45.44 len=192, align1=0, align2=0, n=128: 35.30 ( 21.97%) 45.24 len=192, align1=0, align2=0, n=128: 35.03 ( 22.17%) 45.01 len=256, align1=1, align2=0, n=192: 39.58 ( 30.82%) 57.21 len=256, align1=0, align2=0, n=192: 42.27 ( 24.21%) 55.77 len=256, align1=0, align2=0, n=192: 41.10 ( 26.00%) 55.54 len=256, align1=0, align2=0, n=192: 43.11 ( 21.51%) 54.92 len=192, align1=1, align2=0, n=256: 38.15 ( 29.78%) 54.33 len=192, align1=0, align2=0, n=256: 37.43 ( 32.27%) 55.26 len=192, align1=0, align2=0, n=256: 37.43 ( 32.46%) 55.42 len=192, align1=0, align2=0, n=256: 37.43 ( 32.46%) 55.42 len=256, align1=1, align2=0, n=224: 40.87 ( 31.48%) 59.65 len=256, align1=0, align2=0, n=224: 41.66 ( 26.95%) 57.02 len=256, align1=0, align2=0, n=224: 41.08 ( 28.22%) 57.24 len=256, align1=0, align2=0, n=224: 41.17 ( 27.86%) 57.07 len=224, align1=1, align2=0, n=256: 38.96 ( 
32.41%) 57.65 len=224, align1=0, align2=0, n=256: 42.27 ( 28.61%) 59.21 len=224, align1=0, align2=0, n=256: 40.15 ( 32.33%) 59.34 len=224, align1=0, align2=0, n=256: 40.10 ( 32.78%) 59.65 len=256, align1=1, align2=0, n=256: 41.22 ( 33.31%) 61.80 len=256, align1=0, align2=0, n=256: 41.52 ( 29.99%) 59.30 len=256, align1=0, align2=0, n=256: 41.17 ( 29.82%) 58.66 len=256, align1=0, align2=0, n=256: 41.18 ( 30.68%) 59.40 len=256, align1=1, align2=0, n=256: 47.52 ( 29.49%) 67.39 len=256, align1=0, align2=0, n=256: 44.83 ( 30.61%) 64.60 len=256, align1=0, align2=0, n=256: 45.50 ( 29.57%) 64.60 len=256, align1=0, align2=0, n=256: 44.83 ( 29.93%) 63.97 len=256, align1=1, align2=0, n=288: 44.21 ( 33.34%) 66.32 len=256, align1=0, align2=0, n=288: 41.58 ( 33.60%) 62.62 len=256, align1=0, align2=0, n=288: 44.57 ( 30.02%) 63.69 len=256, align1=0, align2=0, n=288: 42.80 ( 35.55%) 66.41 len=288, align1=1, align2=0, n=256: 46.39 ( 29.55%) 65.85 len=288, align1=0, align2=0, n=256: 45.95 ( 28.95%) 64.68 len=288, align1=0, align2=0, n=256: 46.26 ( 29.92%) 66.02 len=288, align1=0, align2=0, n=256: 48.47 ( 20.26%) 60.79 len=256, align1=1, align2=0, n=320: 41.81 ( 31.09%) 60.67 len=256, align1=0, align2=0, n=320: 41.87 ( 34.40%) 63.82 len=256, align1=0, align2=0, n=320: 41.52 ( 34.47%) 63.35 len=256, align1=0, align2=0, n=320: 44.29 ( 33.29%) 66.39 len=320, align1=1, align2=0, n=256: 48.70 ( 29.59%) 69.16 len=320, align1=0, align2=0, n=256: 46.47 ( 24.55%) 61.60 len=320, align1=0, align2=0, n=256: 45.68 ( 27.30%) 62.83 len=320, align1=0, align2=0, n=256: 47.34 ( 23.15%) 61.60 len=512, align1=1, align2=0, n=448: 72.59 ( 23.10%) 94.39 len=512, align1=0, align2=0, n=448: 68.84 ( 38.34%) 111.65 len=512, align1=0, align2=0, n=448: 69.80 ( 36.56%) 110.03 len=512, align1=0, align2=0, n=448: 67.31 ( 40.49%) 113.10 len=448, align1=1, align2=0, n=512: 65.75 ( 28.23%) 91.61 len=448, align1=0, align2=0, n=512: 61.41 ( 30.51%) 88.36 len=448, align1=0, align2=0, n=512: 65.19 ( 29.15%) 92.02 len=448, 
align1=0, align2=0, n=512: 61.07 ( 31.08%) 88.61 len=512, align1=1, align2=0, n=480: 75.89 ( 16.65%) 91.05 len=512, align1=0, align2=0, n=480: 66.17 ( 26.56%) 90.10 len=512, align1=0, align2=0, n=480: 65.74 ( 26.92%) 89.96 len=512, align1=0, align2=0, n=480: 66.30 ( 26.50%) 90.21 len=480, align1=1, align2=0, n=512: 65.24 ( 28.33%) 91.03 len=480, align1=0, align2=0, n=512: 64.50 ( 30.43%) 92.70 len=480, align1=0, align2=0, n=512: 64.49 ( 29.90%) 91.99 len=480, align1=0, align2=0, n=512: 64.50 ( 30.11%) 92.29 len=512, align1=1, align2=0, n=512: 68.43 ( 28.04%) 95.09 len=512, align1=0, align2=0, n=512: 67.02 ( 27.18%) 92.05 len=512, align1=0, align2=0, n=512: 67.02 ( 27.01%) 91.82 len=512, align1=0, align2=0, n=512: 67.02 ( 27.05%) 91.87 len=512, align1=1, align2=0, n=512: 67.68 ( 28.93%) 95.23 len=512, align1=0, align2=0, n=512: 67.03 ( 27.48%) 92.42 len=512, align1=0, align2=0, n=512: 67.02 ( 27.15%) 92.00 len=512, align1=0, align2=0, n=512: 67.02 ( 27.33%) 92.23 len=512, align1=1, align2=0, n=544: 70.63 ( 26.35%) 95.89 len=512, align1=0, align2=0, n=544: 67.72 ( 29.97%) 96.70 len=512, align1=0, align2=0, n=544: 67.71 ( 30.17%) 96.95 len=512, align1=0, align2=0, n=544: 67.71 ( 29.99%) 96.72 len=544, align1=1, align2=0, n=512: 83.22 ( 13.39%) 96.08 len=544, align1=0, align2=0, n=512: 68.97 ( 27.78%) 95.50 len=544, align1=0, align2=0, n=512: 71.83 ( 24.53%) 95.18 len=544, align1=0, align2=0, n=512: 68.99 ( 27.28%) 94.87 len=512, align1=1, align2=0, n=576: 72.60 ( 28.17%) 101.08 len=512, align1=0, align2=0, n=576: 72.27 ( 25.52%) 97.03 len=512, align1=0, align2=0, n=576: 67.75 ( 30.53%) 97.52 len=512, align1=0, align2=0, n=576: 72.53 ( 29.10%) 102.30 len=576, align1=1, align2=0, n=512: 82.05 ( 16.23%) 97.94 len=576, align1=0, align2=0, n=512: 71.35 ( 26.64%) 97.26 len=576, align1=0, align2=0, n=512: 74.36 ( 23.52%) 97.23 len=576, align1=0, align2=0, n=512: 71.58 ( 26.50%) 97.38 len=1024, align1=1, align2=0, n=960: 147.26 ( 11.02%) 165.50 len=1024, align1=0, align2=0, 
n=960: 134.00 ( 13.30%) 154.55 len=1024, align1=0, align2=0, n=960: 134.31 ( 13.26%) 154.84 len=1024, align1=0, align2=0, n=960: 134.53 ( 12.97%) 154.58 len=960, align1=1, align2=0, n=1024: 129.09 ( 20.84%) 163.08 len=960, align1=0, align2=0, n=1024: 113.32 ( 26.35%) 153.86 len=960, align1=0, align2=0, n=1024: 113.08 ( 26.77%) 154.42 len=960, align1=0, align2=0, n=1024: 113.10 ( 26.50%) 153.88 len=1024, align1=1, align2=0, n=992: 138.81 ( 18.75%) 170.85 len=1024, align1=0, align2=0, n=992: 134.08 ( 14.74%) 157.25 len=1024, align1=0, align2=0, n=992: 133.96 ( 14.83%) 157.28 len=1024, align1=0, align2=0, n=992: 133.76 ( 15.03%) 157.42 len=992, align1=1, align2=0, n=1024: 136.17 ( 18.21%) 166.50 len=992, align1=0, align2=0, n=1024: 116.81 ( 29.71%) 166.18 len=992, align1=0, align2=0, n=1024: 116.46 ( 26.72%) 158.92 len=992, align1=0, align2=0, n=1024: 116.63 ( 26.64%) 158.99 len=1024, align1=1, align2=0, n=1024: 150.63 ( 14.32%) 175.81 len=1024, align1=0, align2=0, n=1024: 119.07 ( 26.07%) 161.07 len=1024, align1=0, align2=0, n=1024: 119.10 ( 26.06%) 161.08 len=1024, align1=0, align2=0, n=1024: 118.91 ( 26.16%) 161.04 len=1024, align1=1, align2=0, n=1024: 158.94 ( 13.17%) 183.06 len=1024, align1=0, align2=0, n=1024: 120.68 ( 27.45%) 166.35 len=1024, align1=0, align2=0, n=1024: 119.16 ( 26.03%) 161.09 len=1024, align1=0, align2=0, n=1024: 119.16 ( 26.02%) 161.07 len=1024, align1=1, align2=0, n=1056: 162.90 ( 15.29%) 192.30 len=1024, align1=0, align2=0, n=1056: 140.90 ( 26.76%) 192.38 len=1024, align1=0, align2=0, n=1056: 140.05 ( 30.28%) 200.89 len=1024, align1=0, align2=0, n=1056: 146.22 ( 25.04%) 195.08 len=1056, align1=1, align2=0, n=1024: 166.62 ( 8.97%) 183.03 len=1056, align1=0, align2=0, n=1024: 121.48 ( 25.46%) 162.98 len=1056, align1=0, align2=0, n=1024: 123.93 ( 24.01%) 163.09 len=1056, align1=0, align2=0, n=1024: 127.86 ( 25.98%) 172.73 len=1024, align1=1, align2=0, n=1088: 167.49 ( 12.93%) 192.36 len=1024, align1=0, align2=0, n=1088: 147.48 ( 23.34%) 192.38 
len=1024, align1=0, align2=0, n=1088: 140.01 ( 27.22%) 192.39 len=1024, align1=0, align2=0, n=1088: 140.09 ( 27.23%) 192.51 len=1088, align1=1, align2=0, n=1024: 159.00 ( 13.46%) 183.73 len=1088, align1=0, align2=0, n=1024: 143.31 ( 14.25%) 167.13 len=1088, align1=0, align2=0, n=1024: 140.46 ( 14.32%) 163.93 len=1088, align1=0, align2=0, n=1024: 139.85 ( 14.69%) 163.92 [-- Attachment #3: bench-strlcpy.txt --] [-- Type: text/plain, Size: 18518 bytes --] Function: strlcpy Variant: __strlcpy_avx2 __strlcpy_generic ======================================================================================================================== len=16, align1=1, align2=1, n=16: 11.11 ( 32.32%) 16.41 len=16, align1=1, align2=1, n=16: 10.73 ( 32.83%) 15.98 len=16, align1=1, align2=2, n=16: 10.53 ( 33.23%) 15.77 len=16, align1=2, align2=1, n=16: 10.89 ( 32.50%) 16.13 len=2, align1=7, align2=2, n=4: 8.06 ( 35.05%) 12.41 len=4, align1=2, align2=7, n=2: 8.66 ( 37.31%) 13.82 len=2, align1=7, align2=2, n=4: 7.78 ( 33.85%) 11.77 len=4, align1=2, align2=7, n=2: 8.70 ( 37.88%) 14.01 len=16, align1=2, align2=2, n=16: 10.43 ( 31.86%) 15.31 len=16, align1=2, align2=2, n=16: 10.87 ( 30.40%) 15.62 len=16, align1=2, align2=4, n=16: 10.47 ( 30.24%) 15.01 len=16, align1=4, align2=2, n=16: 10.56 ( 31.99%) 15.53 len=4, align1=6, align2=4, n=8: 11.33 ( 18.99%) 13.99 len=8, align1=4, align2=6, n=4: 10.44 ( 27.20%) 14.34 len=4, align1=6, align2=4, n=8: 11.43 ( 13.14%) 13.15 len=8, align1=4, align2=6, n=4: 10.83 ( 28.59%) 15.16 len=16, align1=3, align2=3, n=16: 10.39 ( 33.18%) 15.54 len=16, align1=3, align2=3, n=16: 10.13 ( 38.74%) 16.53 len=16, align1=3, align2=6, n=16: 10.29 ( 37.51%) 16.46 len=16, align1=6, align2=3, n=16: 10.56 ( 31.97%) 15.53 len=8, align1=5, align2=6, n=16: 10.48 ( 22.21%) 13.47 len=16, align1=6, align2=5, n=8: 10.95 ( 27.84%) 15.17 len=8, align1=5, align2=6, n=16: 10.55 ( 23.09%) 13.71 len=16, align1=6, align2=5, n=8: 10.98 ( 27.79%) 15.20 len=16, align1=4, align2=4, n=16: 10.39 ( 
32.51%) 15.40 len=16, align1=4, align2=4, n=16: 10.38 ( 33.76%) 15.68 len=16, align1=4, align2=0, n=16: 10.57 ( 28.87%) 14.86 len=16, align1=0, align2=4, n=16: 10.28 ( 34.27%) 15.64 len=16, align1=4, align2=0, n=32: 10.59 ( 23.24%) 13.79 len=32, align1=0, align2=4, n=16: 11.66 ( 30.50%) 16.77 len=16, align1=4, align2=0, n=32: 10.67 ( 23.98%) 14.04 len=32, align1=0, align2=4, n=16: 11.06 ( 33.61%) 16.66 len=16, align1=5, align2=5, n=16: 10.43 ( 33.52%) 15.68 len=16, align1=5, align2=5, n=16: 10.49 ( 33.47%) 15.77 len=16, align1=5, align2=2, n=16: 10.54 ( 29.46%) 14.94 len=16, align1=2, align2=5, n=16: 10.20 ( 31.63%) 14.92 len=32, align1=3, align2=2, n=64: 13.88 ( 0.59%) 13.97 len=64, align1=2, align2=3, n=32: 11.72 ( 22.36%) 15.09 len=32, align1=3, align2=2, n=64: 13.49 ( 2.26%) 13.81 len=64, align1=2, align2=3, n=32: 11.54 ( 26.22%) 15.64 len=16, align1=6, align2=6, n=16: 10.39 ( 27.70%) 14.37 len=16, align1=6, align2=6, n=16: 9.94 ( 32.04%) 14.63 len=16, align1=6, align2=4, n=16: 9.91 ( 33.92%) 14.99 len=16, align1=4, align2=6, n=16: 10.19 ( 32.66%) 15.14 len=64, align1=2, align2=4, n=128: 14.66 ( 4.10%) 15.29 len=128, align1=4, align2=2, n=64: 18.22 (-17.01%) 15.57 len=64, align1=2, align2=4, n=128: 14.64 ( 3.89%) 15.24 len=128, align1=4, align2=2, n=64: 18.22 (-14.83%) 15.86 len=16, align1=7, align2=7, n=16: 9.86 ( 30.07%) 14.11 len=16, align1=7, align2=7, n=16: 9.86 ( 30.09%) 14.11 len=16, align1=7, align2=6, n=16: 9.93 ( 32.92%) 14.81 len=16, align1=6, align2=7, n=16: 9.83 ( 30.41%) 14.13 len=128, align1=1, align2=6, n=256: 22.24 ( 9.63%) 24.61 len=256, align1=6, align2=1, n=128: 20.91 ( 12.22%) 23.82 len=128, align1=1, align2=6, n=256: 22.21 ( 9.86%) 24.64 len=256, align1=6, align2=1, n=128: 20.81 ( 12.85%) 23.88 len=8, align1=0, align2=0, n=16: 10.33 ( 20.37%) 12.97 len=32, align1=0, align2=0, n=16: 10.75 ( 32.13%) 15.84 len=8, align1=7, align2=2, n=16: 10.38 ( 20.33%) 13.02 len=32, align1=7, align2=2, n=16: 11.03 ( 30.36%) 15.84 len=16, align1=0, align2=0, 
n=32: 9.98 ( 26.96%) 13.67 len=64, align1=0, align2=0, n=32: 10.94 ( 26.69%) 14.92 len=16, align1=6, align2=4, n=32: 10.07 ( 22.77%) 13.04 len=64, align1=6, align2=4, n=32: 11.68 ( 22.22%) 15.01 len=32, align1=0, align2=0, n=64: 11.15 ( 11.26%) 12.57 len=128, align1=0, align2=0, n=64: 17.59 ( -6.54%) 16.51 len=32, align1=5, align2=6, n=64: 12.56 ( 12.27%) 14.32 len=128, align1=5, align2=6, n=64: 19.12 (-20.33%) 15.89 len=64, align1=0, align2=0, n=128: 12.70 ( 17.81%) 15.45 len=256, align1=0, align2=0, n=128: 22.12 ( 7.72%) 23.97 len=64, align1=4, align2=0, n=128: 12.84 ( 18.75%) 15.81 len=256, align1=4, align2=0, n=128: 21.48 ( 12.33%) 24.50 len=128, align1=0, align2=0, n=256: 19.17 ( 3.24%) 19.81 len=512, align1=0, align2=0, n=256: 26.55 ( 3.43%) 27.49 len=128, align1=3, align2=2, n=256: 20.07 ( 17.46%) 24.32 len=512, align1=3, align2=2, n=256: 26.65 ( 17.61%) 32.35 len=256, align1=0, align2=0, n=512: 22.48 ( 14.46%) 26.28 len=1024, align1=0, align2=0, n=512: 39.85 ( 12.47%) 45.53 len=256, align1=2, align2=4, n=512: 27.00 ( 8.13%) 29.39 len=1024, align1=2, align2=4, n=512: 43.97 ( 15.73%) 52.18 len=512, align1=0, align2=0, n=1024: 32.09 ( 29.08%) 45.25 len=2048, align1=0, align2=0, n=1024: 65.11 ( 7.02%) 70.02 len=512, align1=1, align2=6, n=1024: 35.13 ( 26.54%) 47.83 len=2048, align1=1, align2=6, n=1024: 80.38 (-15.59%) 69.53 len=128, align1=1, align2=0, n=64: 18.89 (-12.93%) 16.72 len=128, align1=0, align2=0, n=64: 16.93 ( -9.06%) 15.52 len=128, align1=0, align2=0, n=64: 16.92 ( -8.70%) 15.57 len=128, align1=0, align2=0, n=64: 17.58 (-12.44%) 15.63 len=64, align1=1, align2=0, n=128: 12.84 ( 18.40%) 15.74 len=64, align1=0, align2=0, n=128: 12.64 ( 19.60%) 15.72 len=64, align1=0, align2=0, n=128: 12.78 ( 17.35%) 15.47 len=64, align1=0, align2=0, n=128: 12.65 ( 18.44%) 15.51 len=128, align1=1, align2=0, n=96: 20.15 ( -9.88%) 18.34 len=128, align1=0, align2=0, n=96: 18.21 ( -3.68%) 17.57 len=128, align1=0, align2=0, n=96: 18.46 ( -5.09%) 17.57 len=128, align1=0, 
align2=0, n=96: 18.86 ( 1.57%) 19.16 len=96, align1=1, align2=0, n=128: 13.99 ( 15.86%) 16.62 len=96, align1=0, align2=0, n=128: 14.60 ( 11.99%) 16.59 len=96, align1=0, align2=0, n=128: 14.38 ( 20.13%) 18.00 len=96, align1=0, align2=0, n=128: 14.34 ( 11.75%) 16.25 len=128, align1=1, align2=0, n=128: 19.53 ( -0.01%) 19.53 len=128, align1=0, align2=0, n=128: 20.17 ( -3.30%) 19.53 len=128, align1=0, align2=0, n=128: 20.18 (-14.72%) 17.59 len=128, align1=0, align2=0, n=128: 20.82 ( -0.68%) 20.68 len=128, align1=1, align2=0, n=128: 20.01 ( -5.92%) 18.89 len=128, align1=0, align2=0, n=128: 21.37 ( -8.22%) 19.74 len=128, align1=0, align2=0, n=128: 20.17 (-14.75%) 17.57 len=128, align1=0, align2=0, n=128: 20.80 (-18.42%) 17.57 len=128, align1=1, align2=0, n=160: 19.65 ( 15.99%) 23.39 len=128, align1=0, align2=0, n=160: 19.14 ( 3.36%) 19.80 len=128, align1=0, align2=0, n=160: 19.18 ( 3.40%) 19.85 len=128, align1=0, align2=0, n=160: 19.15 ( 3.36%) 19.81 len=160, align1=1, align2=0, n=128: 18.88 ( 12.02%) 21.46 len=160, align1=0, align2=0, n=128: 20.16 ( 9.62%) 22.31 len=160, align1=0, align2=0, n=128: 20.80 ( 0.05%) 20.81 len=160, align1=0, align2=0, n=128: 20.16 ( 8.81%) 22.11 len=128, align1=1, align2=0, n=192: 19.65 ( 16.12%) 23.42 len=128, align1=0, align2=0, n=192: 19.14 ( 3.37%) 19.80 len=128, align1=0, align2=0, n=192: 19.18 ( 3.16%) 19.80 len=128, align1=0, align2=0, n=192: 19.19 ( 3.06%) 19.80 len=192, align1=1, align2=0, n=128: 18.86 ( 19.40%) 23.40 len=192, align1=0, align2=0, n=128: 20.81 ( 6.46%) 22.24 len=192, align1=0, align2=0, n=128: 20.81 ( 8.70%) 22.79 len=192, align1=0, align2=0, n=128: 21.46 ( 4.55%) 22.48 len=256, align1=1, align2=0, n=192: 20.83 ( 13.49%) 24.08 len=256, align1=0, align2=0, n=192: 21.35 ( 15.83%) 25.37 len=256, align1=0, align2=0, n=192: 20.83 ( 15.85%) 24.75 len=256, align1=0, align2=0, n=192: 21.87 ( 13.82%) 25.37 len=192, align1=1, align2=0, n=256: 22.27 ( 5.03%) 23.45 len=192, align1=0, align2=0, n=256: 19.58 ( 14.91%) 23.02 
len=192, align1=0, align2=0, n=256: 19.58 ( 14.91%) 23.01 len=192, align1=0, align2=0, n=256: 19.57 ( 16.70%) 23.50 len=256, align1=1, align2=0, n=224: 20.84 ( 19.02%) 25.74 len=256, align1=0, align2=0, n=224: 20.91 ( 15.73%) 24.81 len=256, align1=0, align2=0, n=224: 21.47 ( 10.79%) 24.07 len=256, align1=0, align2=0, n=224: 21.47 ( 10.79%) 24.06 len=224, align1=1, align2=0, n=256: 20.43 ( 16.38%) 24.43 len=224, align1=0, align2=0, n=256: 19.23 ( 16.62%) 23.06 len=224, align1=0, align2=0, n=256: 19.21 ( 16.84%) 23.10 len=224, align1=0, align2=0, n=256: 19.24 ( 16.77%) 23.12 len=256, align1=1, align2=0, n=256: 24.05 ( 5.44%) 25.44 len=256, align1=0, align2=0, n=256: 21.63 ( 14.98%) 25.45 len=256, align1=0, align2=0, n=256: 20.81 ( 13.64%) 24.10 len=256, align1=0, align2=0, n=256: 20.81 ( 13.67%) 24.10 len=256, align1=1, align2=0, n=256: 24.10 ( -0.20%) 24.05 len=256, align1=0, align2=0, n=256: 21.46 ( 16.56%) 25.71 len=256, align1=0, align2=0, n=256: 21.46 ( 10.79%) 24.05 len=256, align1=0, align2=0, n=256: 20.81 ( 14.64%) 24.38 len=256, align1=1, align2=0, n=288: 24.21 ( 15.45%) 28.63 len=256, align1=0, align2=0, n=288: 23.11 ( 12.68%) 26.46 len=256, align1=0, align2=0, n=288: 22.55 ( 14.25%) 26.29 len=256, align1=0, align2=0, n=288: 22.49 ( 14.49%) 26.30 len=288, align1=1, align2=0, n=256: 24.06 ( 5.36%) 25.42 len=288, align1=0, align2=0, n=256: 22.82 ( 7.35%) 24.63 len=288, align1=0, align2=0, n=256: 22.80 ( 10.98%) 25.62 len=288, align1=0, align2=0, n=256: 21.46 ( 17.56%) 26.03 len=256, align1=1, align2=0, n=320: 24.17 ( 15.82%) 28.71 len=256, align1=0, align2=0, n=320: 22.44 ( 14.79%) 26.34 len=256, align1=0, align2=0, n=320: 22.56 ( 14.14%) 26.27 len=256, align1=0, align2=0, n=320: 22.50 ( 14.35%) 26.27 len=320, align1=1, align2=0, n=256: 24.10 ( 8.33%) 26.29 len=320, align1=0, align2=0, n=256: 22.11 ( 16.28%) 26.41 len=320, align1=0, align2=0, n=256: 21.57 ( 16.27%) 25.76 len=320, align1=0, align2=0, n=256: 21.46 ( 15.42%) 25.37 len=512, align1=1, align2=0, 
n=448: 27.62 ( 31.43%) 40.28 len=512, align1=0, align2=0, n=448: 27.63 ( 32.11%) 40.70 len=512, align1=0, align2=0, n=448: 26.53 ( 35.05%) 40.85 len=512, align1=0, align2=0, n=448: 26.51 ( 34.99%) 40.78 len=448, align1=1, align2=0, n=512: 31.01 ( 28.08%) 43.11 len=448, align1=0, align2=0, n=512: 29.35 ( 36.94%) 46.54 len=448, align1=0, align2=0, n=512: 29.38 ( 37.01%) 46.63 len=448, align1=0, align2=0, n=512: 29.38 ( 37.01%) 46.64 len=512, align1=1, align2=0, n=480: 28.24 ( 35.42%) 43.73 len=512, align1=0, align2=0, n=480: 28.76 ( 28.65%) 40.31 len=512, align1=0, align2=0, n=480: 28.47 ( 30.82%) 41.16 len=512, align1=0, align2=0, n=480: 26.70 ( 31.68%) 39.08 len=480, align1=1, align2=0, n=512: 30.73 ( 26.75%) 41.95 len=480, align1=0, align2=0, n=512: 28.79 ( 34.92%) 44.23 len=480, align1=0, align2=0, n=512: 28.76 ( 35.89%) 44.87 len=480, align1=0, align2=0, n=512: 29.39 ( 35.67%) 45.68 len=512, align1=1, align2=0, n=512: 30.58 ( 25.28%) 40.92 len=512, align1=0, align2=0, n=512: 26.67 ( 31.41%) 38.87 len=512, align1=0, align2=0, n=512: 26.67 ( 34.15%) 40.50 len=512, align1=0, align2=0, n=512: 27.17 ( 30.43%) 39.06 len=512, align1=1, align2=0, n=512: 30.63 ( 25.12%) 40.91 len=512, align1=0, align2=0, n=512: 26.74 ( 31.56%) 39.06 len=512, align1=0, align2=0, n=512: 26.72 ( 31.55%) 39.04 len=512, align1=0, align2=0, n=512: 26.74 ( 31.11%) 38.81 len=512, align1=1, align2=0, n=544: 33.43 ( 21.70%) 42.69 len=512, align1=0, align2=0, n=544: 31.96 ( 27.77%) 44.25 len=512, align1=0, align2=0, n=544: 31.36 ( 27.40%) 43.20 len=512, align1=0, align2=0, n=544: 31.41 ( 27.14%) 43.11 len=544, align1=1, align2=0, n=512: 30.55 ( 25.76%) 41.15 len=544, align1=0, align2=0, n=512: 27.26 ( 31.01%) 39.51 len=544, align1=0, align2=0, n=512: 27.30 ( 30.74%) 39.41 len=544, align1=0, align2=0, n=512: 26.65 ( 32.38%) 39.40 len=512, align1=1, align2=0, n=576: 33.39 ( 21.56%) 42.58 len=512, align1=0, align2=0, n=576: 31.41 ( 28.37%) 43.85 len=512, align1=0, align2=0, n=576: 31.41 ( 27.57%) 
43.37 len=512, align1=0, align2=0, n=576: 31.42 ( 27.41%) 43.28 len=576, align1=1, align2=0, n=512: 30.61 ( 27.75%) 42.36 len=576, align1=0, align2=0, n=512: 27.66 ( 31.54%) 40.40 len=576, align1=0, align2=0, n=512: 28.04 ( 30.84%) 40.55 len=576, align1=0, align2=0, n=512: 27.94 ( 31.15%) 40.58 len=1024, align1=1, align2=0, n=960: 39.78 ( 28.72%) 55.80 len=1024, align1=0, align2=0, n=960: 40.87 ( 26.15%) 55.34 len=1024, align1=0, align2=0, n=960: 40.06 ( 26.81%) 54.73 len=1024, align1=0, align2=0, n=960: 40.25 ( 26.40%) 54.69 len=960, align1=1, align2=0, n=1024: 38.74 ( 31.46%) 56.52 len=960, align1=0, align2=0, n=1024: 38.37 ( 36.30%) 60.24 len=960, align1=0, align2=0, n=1024: 38.37 ( 36.36%) 60.30 len=960, align1=0, align2=0, n=1024: 39.88 ( 35.25%) 61.60 len=1024, align1=1, align2=0, n=992: 39.71 ( 28.13%) 55.26 len=1024, align1=0, align2=0, n=992: 39.85 ( 29.39%) 56.44 len=1024, align1=0, align2=0, n=992: 40.34 ( 25.81%) 54.37 len=1024, align1=0, align2=0, n=992: 40.31 ( 25.91%) 54.40 len=992, align1=1, align2=0, n=1024: 37.72 ( 32.49%) 55.88 len=992, align1=0, align2=0, n=1024: 38.37 ( 36.02%) 59.97 len=992, align1=0, align2=0, n=1024: 38.42 ( 35.53%) 59.60 len=992, align1=0, align2=0, n=1024: 38.40 ( 35.67%) 59.69 len=1024, align1=1, align2=0, n=1024: 40.88 ( 26.02%) 55.26 len=1024, align1=0, align2=0, n=1024: 40.36 ( 25.56%) 54.22 len=1024, align1=0, align2=0, n=1024: 40.31 ( 25.60%) 54.19 len=1024, align1=0, align2=0, n=1024: 40.35 ( 29.70%) 57.40 len=1024, align1=1, align2=0, n=1024: 41.03 ( 25.71%) 55.22 len=1024, align1=0, align2=0, n=1024: 40.37 ( 25.42%) 54.13 len=1024, align1=0, align2=0, n=1024: 40.31 ( 25.64%) 54.21 len=1024, align1=0, align2=0, n=1024: 40.32 ( 25.60%) 54.19 len=1024, align1=1, align2=0, n=1056: 41.06 ( 25.94%) 55.45 len=1024, align1=0, align2=0, n=1056: 41.06 ( 29.54%) 58.27 len=1024, align1=0, align2=0, n=1056: 41.05 ( 28.94%) 57.77 len=1024, align1=0, align2=0, n=1056: 41.02 ( 28.82%) 57.62 len=1056, align1=1, align2=0, n=1024: 
41.00 ( 26.23%) 55.59 len=1056, align1=0, align2=0, n=1024: 39.67 ( 27.07%) 54.39 len=1056, align1=0, align2=0, n=1024: 40.34 ( 29.19%) 56.97 len=1056, align1=0, align2=0, n=1024: 40.37 ( 27.52%) 55.71 len=1024, align1=1, align2=0, n=1088: 41.02 ( 26.33%) 55.68 len=1024, align1=0, align2=0, n=1088: 41.06 ( 30.82%) 59.35 len=1024, align1=0, align2=0, n=1088: 41.05 ( 29.58%) 58.29 len=1024, align1=0, align2=0, n=1088: 41.14 ( 28.69%) 57.69 len=1088, align1=1, align2=0, n=1024: 41.31 ( 27.50%) 56.98 len=1088, align1=0, align2=0, n=1024: 40.32 ( 29.25%) 56.99 len=1088, align1=0, align2=0, n=1024: 40.74 ( 27.82%) 56.44 len=1088, align1=0, align2=0, n=1024: 40.70 ( 26.62%) 55.47 ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH] x86_64: Implement AVX2 version of strlcpy/wcslcpy function 2023-06-30 22:21 ` Sunil Pandey @ 2023-06-30 23:22 ` Noah Goldstein 2023-06-30 23:27 ` Noah Goldstein 2023-07-03 16:30 ` Paul Eggert 1 sibling, 1 reply; 24+ messages in thread From: Noah Goldstein @ 2023-06-30 23:22 UTC (permalink / raw) To: Sunil Pandey; +Cc: Paul Eggert, libc-alpha, hjl.tools On Fri, Jun 30, 2023 at 5:21 PM Sunil Pandey <skpgkp2@gmail.com> wrote: > > > > On Fri, Jun 30, 2023 at 2:27 PM Paul Eggert <eggert@cs.ucla.edu> wrote: >> >> On 2023-06-30 14:04, Noah Goldstein via Libc-alpha wrote: >> > Think we should at the very least wait for the generic strlcpy codes >> > to land first. >> >> Let's not optimize these functions at all, unless there's good and >> measured reason to do so. In practice I expected they're called with >> small sizes for which optimization is a net minus as it consumes >> valuable maintenance time with no real benefit. > > > Hi Paul, > > Attached is strcpy/wcslcpy microbenchmark data based on Noah strlcpy/wcslcpy microbenchmark patch. > I don't think the concern is that we can beat the generic impl (which hasn't even landed yet AFAICT), it whether doing so makes sense given the usage/goal of the functions. > https://sourceware.org/pipermail/libc-alpha/2023-April/147557.html > > Thanks, > Sunil ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH] x86_64: Implement AVX2 version of strlcpy/wcslcpy function 2023-06-30 23:22 ` Noah Goldstein @ 2023-06-30 23:27 ` Noah Goldstein 0 siblings, 0 replies; 24+ messages in thread From: Noah Goldstein @ 2023-06-30 23:27 UTC (permalink / raw) To: Sunil Pandey; +Cc: Paul Eggert, libc-alpha, hjl.tools On Fri, Jun 30, 2023 at 6:22 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > On Fri, Jun 30, 2023 at 5:21 PM Sunil Pandey <skpgkp2@gmail.com> wrote: > > > > > > > > On Fri, Jun 30, 2023 at 2:27 PM Paul Eggert <eggert@cs.ucla.edu> wrote: > >> > >> On 2023-06-30 14:04, Noah Goldstein via Libc-alpha wrote: > >> > Think we should at the very least wait for the generic strlcpy codes > >> > to land first. > >> > >> Let's not optimize these functions at all, unless there's good and > >> measured reason to do so. In practice I expected they're called with > >> small sizes for which optimization is a net minus as it consumes > >> valuable maintenance time with no real benefit. > > > > > > Hi Paul, > > > > Attached is strcpy/wcslcpy microbenchmark data based on Noah strlcpy/wcslcpy microbenchmark patch. > > > I don't think the concern is that we can beat the generic impl (which hasn't > even landed yet AFAICT), it whether doing so makes sense given the > usage/goal of the functions. > That being said, I'm generally in favor of adding optimized versions since we happen to be a position where at least several developers find it worth their time to maintain, but not before the generic versions have landed. > > https://sourceware.org/pipermail/libc-alpha/2023-April/147557.html > > > > Thanks, > > Sunil ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH] x86_64: Implement AVX2 version of strlcpy/wcslcpy function 2023-06-30 22:21 ` Sunil Pandey 2023-06-30 23:22 ` Noah Goldstein @ 2023-07-03 16:30 ` Paul Eggert 2023-07-03 18:40 ` Noah Goldstein 1 sibling, 1 reply; 24+ messages in thread From: Paul Eggert @ 2023-07-03 16:30 UTC (permalink / raw) To: Sunil Pandey; +Cc: Noah Goldstein, libc-alpha, hjl.tools On 2023-06-30 15:21, Sunil Pandey wrote: > Attached is strcpy/wcslcpy microbenchmark data based on Noah > strlcpy/wcslcpy microbenchmark patch. Although it's helpful to know that the proposed patch improves microbenchmark scores, that's not enough to justify it. Let's see benchmarks of real programs. If they don't show significant wins, let's not bother. Programs that use strlcpy, by and large, don't use it in performance-sensitive areas, and their developers and users are far more worried about security than about performance. Making the implementation harder to audit will likely be a net negative for these applications. This doesn't sound like a win at all. Plus, who uses wcslcpy? Why bother to tune it if nobody uses it? ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH] x86_64: Implement AVX2 version of strlcpy/wcslcpy function 2023-07-03 16:30 ` Paul Eggert @ 2023-07-03 18:40 ` Noah Goldstein 2023-07-03 18:54 ` Adhemerval Zanella Netto 2023-07-03 21:14 ` Paul Eggert 0 siblings, 2 replies; 24+ messages in thread From: Noah Goldstein @ 2023-07-03 18:40 UTC (permalink / raw) To: Paul Eggert; +Cc: Sunil Pandey, libc-alpha, hjl.tools On Mon, Jul 3, 2023 at 11:30 AM Paul Eggert <eggert@cs.ucla.edu> wrote: > > On 2023-06-30 15:21, Sunil Pandey wrote: > > Attached is strcpy/wcslcpy microbenchmark data based on Noah > > strlcpy/wcslcpy microbenchmark patch. > > Although it's helpful to know that the proposed patch improves > microbenchmark scores, that's not enough to justify it. Let's see > benchmarks of real programs. If they don't show significant wins, let's > not bother. > > Programs that use strlcpy, by and large, don't use it in > performance-sensitive areas, and their developers and users are far more > worried about security than about performance. Making the implementation > harder to audit will likely be a net negative for these applications. > This doesn't sound a like a win at all. > > Plus, who uses wcslcpy? Why bother to tune it if nobody uses it? Think we should look into dropping optimized strcpy/wcscpy family in general? For the most part don't see them in perf sensitive areas anyways (generally people that care about perf maintain the length and use mem* functions). ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH] x86_64: Implement AVX2 version of strlcpy/wcslcpy function 2023-07-03 18:40 ` Noah Goldstein @ 2023-07-03 18:54 ` Adhemerval Zanella Netto 0 siblings, 0 replies; 24+ messages in thread From: Adhemerval Zanella Netto @ 2023-07-03 18:54 UTC (permalink / raw) To: libc-alpha On 03/07/23 15:40, Noah Goldstein via Libc-alpha wrote: > On Mon, Jul 3, 2023 at 11:30 AM Paul Eggert <eggert@cs.ucla.edu> wrote: >> >> On 2023-06-30 15:21, Sunil Pandey wrote: >>> Attached is strcpy/wcslcpy microbenchmark data based on Noah >>> strlcpy/wcslcpy microbenchmark patch. >> >> Although it's helpful to know that the proposed patch improves >> microbenchmark scores, that's not enough to justify it. Let's see >> benchmarks of real programs. If they don't show significant wins, let's >> not bother. >> >> Programs that use strlcpy, by and large, don't use it in >> performance-sensitive areas, and their developers and users are far more >> worried about security than about performance. Making the implementation >> harder to audit will likely be a net negative for these applications. >> This doesn't sound a like a win at all. >> >> Plus, who uses wcslcpy? Why bother to tune it if nobody uses it? > > Think we should look into dropping optimized strcpy/wcscpy family > in general? For the most part don't see them in perf sensitive areas > anyways (generally people that care about perf maintain the length > and use mem* functions). I will go for it, these interfaces are provided mainly to comply with standards and for x86 it adds only more maintenance. ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH] x86_64: Implement AVX2 version of strlcpy/wcslcpy function 2023-07-03 18:40 ` Noah Goldstein 2023-07-03 18:54 ` Adhemerval Zanella Netto @ 2023-07-03 21:14 ` Paul Eggert 2023-07-03 22:04 ` Gabriel Ravier 1 sibling, 1 reply; 24+ messages in thread From: Paul Eggert @ 2023-07-03 21:14 UTC (permalink / raw) To: Noah Goldstein; +Cc: Sunil Pandey, libc-alpha, hjl.tools On 2023-07-03 11:40, Noah Goldstein wrote: > Think we should look into dropping optimized strcpy/wcscpy family > in general? For wcscpy yes. Who uses wcscpy? Optimizing it is a worthless time sink. strcpy optimization might be worth keeping, as it's used so much more. Measurements of real programs would help decide. In the meantime inertia suggests that when in doubt, leave it alone. For strlcpy it's an easy call: don't optimize unless realistic benchmarks show it's a win. ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH] x86_64: Implement AVX2 version of strlcpy/wcslcpy function 2023-07-03 21:14 ` Paul Eggert @ 2023-07-03 22:04 ` Gabriel Ravier 2023-07-03 23:12 ` Paul Eggert 0 siblings, 1 reply; 24+ messages in thread From: Gabriel Ravier @ 2023-07-03 22:04 UTC (permalink / raw) To: Paul Eggert, Noah Goldstein; +Cc: Sunil Pandey, libc-alpha, hjl.tools On 7/3/23 23:14, Paul Eggert wrote: > On 2023-07-03 11:40, Noah Goldstein wrote: >> Think we should look into dropping optimized strcpy/wcscpy family >> in general? > > For wcscpy yes. Who uses wcscpy? Optimizing it is a worthless time sink. > > strcpy optimization might be worth keeping, as it's used so much more. > Measurements of real programs would help decide. In the meantime > inertia suggests that when in doubt, leave it alone. > > For strlcpy it's an easy call: don't optimize unless realistic > benchmarks show it's a win. I guess it depends on just how much people use BSD software on Linux, because if you're looking at the BSDs the amount of usage of strlcpy is just absurdly massive - OpenBSD's tree has 4997 occurrences of it, when memcpy is present 13470 times. That still means memcpy is used 3 times as often, but the idea that strlcpy is so popular as to be used to a remotely comparable degree is itself kind of astonishing. ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH] x86_64: Implement AVX2 version of strlcpy/wcslcpy function 2023-07-03 22:04 ` Gabriel Ravier @ 2023-07-03 23:12 ` Paul Eggert 2023-07-04 7:45 ` Andreas Schwab 0 siblings, 1 reply; 24+ messages in thread From: Paul Eggert @ 2023-07-03 23:12 UTC (permalink / raw) To: Gabriel Ravier, Noah Goldstein; +Cc: Sunil Pandey, libc-alpha, hjl.tools On 2023-07-03 15:04, Gabriel Ravier wrote: > OpenBSD's tree has 4997 occurrences of it Many years ago the OpenBSD team went through its source code and replaced uses of strcpy with strlcpy, without much thought involved and even introducing problems in the process. I expect that not much of this code is used elsewhere and it's not that relevant to glibc. Of the little OpenBSDish code that is relevant (notably OpenSSH) I expect the performance difference to be so small as to not be worth optimizing glibc. Real-worldish benchmarks could help check this. ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH] x86_64: Implement AVX2 version of strlcpy/wcslcpy function 2023-07-03 23:12 ` Paul Eggert @ 2023-07-04 7:45 ` Andreas Schwab 0 siblings, 0 replies; 24+ messages in thread From: Andreas Schwab @ 2023-07-04 7:45 UTC (permalink / raw) To: Paul Eggert Cc: Gabriel Ravier, Noah Goldstein, Sunil Pandey, libc-alpha, hjl.tools On Jul 03 2023, Paul Eggert wrote: > On 2023-07-03 15:04, Gabriel Ravier wrote: >> OpenBSD's tree has 4997 occurrences of it > > Many years ago the OpenBSD team went through its source code and replaced > uses of strcpy with strlcpy, without much thought involved and even > introducing problems in the process. In the Linux kernel sources all uses of strlcpy are being erased, because the developers have realized how crappy that interface is. -- Andreas Schwab, SUSE Labs, schwab@suse.de GPG Key fingerprint = 0196 BAD8 1CE9 1970 F4BE 1748 E4D4 88E3 0EEA B9D7 "And now for something completely different." ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH] x86_64: Implement AVX2 version of strlcpy/wcslcpy function 2023-06-30 21:27 ` Paul Eggert 2023-06-30 22:21 ` Sunil Pandey @ 2023-07-03 12:55 ` Adhemerval Zanella Netto 1 sibling, 0 replies; 24+ messages in thread From: Adhemerval Zanella Netto @ 2023-07-03 12:55 UTC (permalink / raw) To: Paul Eggert, Noah Goldstein, Sunil K Pandey; +Cc: libc-alpha, hjl.tools On 30/06/23 18:27, Paul Eggert wrote: > On 2023-06-30 14:04, Noah Goldstein via Libc-alpha wrote: >> Think we should at the very least wait for the generic strlcpy codes >> to land first. > > Let's not optimize these functions at all, unless there's good and measured reason to do so. In practice I expected they're called with small sizes for which optimization is a net minus as it consumes valuable maintenance time with no real benefit. I tend to agree, although these are now added in next POSIX my understanding is they are still not encouraged to be used due to multiple shortcomings in previous discussion. ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH] x86_64: Implement AVX2 version of strlcpy/wcslcpy function 2023-06-30 21:04 ` Noah Goldstein 2023-06-30 21:27 ` Paul Eggert @ 2023-07-01 9:41 ` Florian Weimer 2023-07-02 1:22 ` Noah Goldstein 1 sibling, 1 reply; 24+ messages in thread From: Florian Weimer @ 2023-07-01 9:41 UTC (permalink / raw) To: Noah Goldstein via Libc-alpha; +Cc: Sunil K Pandey, Noah Goldstein, hjl.tools * Noah Goldstein via Libc-alpha: > Think we should at the very least wait for the generic strlcpy codes > to land first. Do you mean a version of string/strlcpy.c that is based on a modified string/stplcpy.c, rather than the one we have now that calls just strlen and memcpy? Thanks, Florian ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH] x86_64: Implement AVX2 version of strlcpy/wcslcpy function 2023-07-01 9:41 ` Florian Weimer @ 2023-07-02 1:22 ` Noah Goldstein 2023-07-02 6:51 ` Florian Weimer 0 siblings, 1 reply; 24+ messages in thread From: Noah Goldstein @ 2023-07-02 1:22 UTC (permalink / raw) To: Florian Weimer; +Cc: Noah Goldstein via Libc-alpha, Sunil K Pandey, hjl.tools On Sat, Jul 1, 2023 at 4:41 AM Florian Weimer <fweimer@redhat.com> wrote: > > * Noah Goldstein via Libc-alpha: > > > Think we should at the very least wait for the generic strlcpy codes > > to land first. > > Do you mean a version of string/strlcpy.c that is based on a modified > string/stplcpy.c, rather than the one we have now that calls just strlen > and memcpy? Hmm? I mean your strlcpy/strlcat patch to land. > > Thanks, > Florian > ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH] x86_64: Implement AVX2 version of strlcpy/wcslcpy function 2023-07-02 1:22 ` Noah Goldstein @ 2023-07-02 6:51 ` Florian Weimer 2023-07-02 16:55 ` Noah Goldstein 0 siblings, 1 reply; 24+ messages in thread From: Florian Weimer @ 2023-07-02 6:51 UTC (permalink / raw) To: Noah Goldstein; +Cc: Noah Goldstein via Libc-alpha, Sunil K Pandey, hjl.tools * Noah Goldstein: > On Sat, Jul 1, 2023 at 4:41 AM Florian Weimer <fweimer@redhat.com> wrote: >> >> * Noah Goldstein via Libc-alpha: >> >> > Think we should at the very least wait for the generic strlcpy codes >> > to land first. >> >> Do you mean a version of string/strlcpy.c that is based on a modified >> string/stplcpy.c, rather than the one we have now that calls just strlen >> and memcpy? > > Hmm? I mean your strlcpy/strlcat patch to land. That has already happened? Thanks, Florian ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH] x86_64: Implement AVX2 version of strlcpy/wcslcpy function 2023-07-02 6:51 ` Florian Weimer @ 2023-07-02 16:55 ` Noah Goldstein 2023-07-02 17:02 ` Florian Weimer 0 siblings, 1 reply; 24+ messages in thread From: Noah Goldstein @ 2023-07-02 16:55 UTC (permalink / raw) To: Florian Weimer; +Cc: Noah Goldstein via Libc-alpha, Sunil K Pandey, hjl.tools On Sun, Jul 2, 2023 at 1:51 AM Florian Weimer <fweimer@redhat.com> wrote: > > * Noah Goldstein: > > > On Sat, Jul 1, 2023 at 4:41 AM Florian Weimer <fweimer@redhat.com> wrote: > >> > >> * Noah Goldstein via Libc-alpha: > >> > >> > Think we should at the very least wait for the generic strlcpy codes > >> > to land first. > >> > >> Do you mean a version of string/strlcpy.c that is based on a modified > >> string/stplcpy.c, rather than the one we have now that calls just strlen > >> and memcpy? > > > > Hmm? I mean your strlcpy/strlcat patch to land. > > That has already happened? :/ yup had been a minute since I pulled. Are we getting stplcpy? > > Thanks, > Florian > ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH] x86_64: Implement AVX2 version of strlcpy/wcslcpy function 2023-07-02 16:55 ` Noah Goldstein @ 2023-07-02 17:02 ` Florian Weimer 0 siblings, 0 replies; 24+ messages in thread From: Florian Weimer @ 2023-07-02 17:02 UTC (permalink / raw) To: Noah Goldstein; +Cc: Noah Goldstein via Libc-alpha, Sunil K Pandey, hjl.tools * Noah Goldstein: >> >> Do you mean a version of string/strlcpy.c that is based on a modified >> >> string/stplcpy.c, rather than the one we have now that calls just strlen >> >> and memcpy? >> > >> > Hmm? I mean your strlcpy/strlcat patch to land. >> >> That has already happened? > :/ yup had been a minute since I pulled. > > Are we getting stplcpy? No. I mentioned string/stplcpy.c because it's what the generic strcpy is based upon. Sorry for the confusion. Thanks, Florian ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH] x86_64: Implement AVX2 version of strlcpy/wcslcpy function 2023-06-30 20:48 [PATCH] x86_64: Implement AVX2 version of strlcpy/wcslcpy function Sunil K Pandey 2023-06-30 21:04 ` Noah Goldstein @ 2023-07-02 17:03 ` Noah Goldstein 2023-07-02 18:37 ` Sunil Pandey 1 sibling, 1 reply; 24+ messages in thread From: Noah Goldstein @ 2023-07-02 17:03 UTC (permalink / raw) To: Sunil K Pandey; +Cc: libc-alpha, hjl.tools On Fri, Jun 30, 2023 at 3:48 PM Sunil K Pandey via Libc-alpha <libc-alpha@sourceware.org> wrote: > > This patch optimizes strlcpy/wsclcpy string functions for AVX2. > --- > sysdeps/x86_64/multiarch/Makefile | 4 + > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 18 + > sysdeps/x86_64/multiarch/ifunc-strlcpy.h | 34 ++ > sysdeps/x86_64/multiarch/strlcpy-avx2.S | 446 +++++++++++++++++++++ > sysdeps/x86_64/multiarch/strlcpy-generic.c | 25 ++ > sysdeps/x86_64/multiarch/strlcpy.c | 36 ++ > sysdeps/x86_64/multiarch/wcslcpy-avx2.S | 4 + > sysdeps/x86_64/multiarch/wcslcpy-generic.c | 25 ++ > sysdeps/x86_64/multiarch/wcslcpy.c | 35 ++ > 9 files changed, 627 insertions(+) > create mode 100644 sysdeps/x86_64/multiarch/ifunc-strlcpy.h > create mode 100644 sysdeps/x86_64/multiarch/strlcpy-avx2.S > create mode 100644 sysdeps/x86_64/multiarch/strlcpy-generic.c > create mode 100644 sysdeps/x86_64/multiarch/strlcpy.c > create mode 100644 sysdeps/x86_64/multiarch/wcslcpy-avx2.S > create mode 100644 sysdeps/x86_64/multiarch/wcslcpy-generic.c > create mode 100644 sysdeps/x86_64/multiarch/wcslcpy.c > > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile > index e1e894c963..7e3fc081df 100644 > --- a/sysdeps/x86_64/multiarch/Makefile > +++ b/sysdeps/x86_64/multiarch/Makefile > @@ -82,6 +82,8 @@ sysdep_routines += \ > strcpy-sse2 \ > strcpy-sse2-unaligned \ > strcspn-sse4 \ > + strlcpy-avx2 \ > + strlcpy-generic \ > strlen-avx2 \ > strlen-avx2-rtm \ > strlen-evex \ > @@ -153,6 +155,8 @@ sysdep_routines += \ > wcscpy-evex \ > wcscpy-generic \ > 
wcscpy-ssse3 \ > + wcslcpy-avx2 \ > + wcslcpy-generic \ > wcslen-avx2 \ > wcslen-avx2-rtm \ > wcslen-evex \ > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > index 5427ff1907..9928dee187 100644 > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > @@ -751,6 +751,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > 1, > __strncat_sse2_unaligned)) > > + /* Support sysdeps/x86_64/multiarch/strlcpy.c. */ > + IFUNC_IMPL (i, name, strlcpy, > + X86_IFUNC_IMPL_ADD_V3 (array, i, strlcpy, > + CPU_FEATURE_USABLE (AVX2), > + __strlcpy_avx2) > + X86_IFUNC_IMPL_ADD_V1 (array, i, strlcpy, > + 1, > + __strlcpy_generic)) > + > /* Support sysdeps/x86_64/multiarch/strncpy.c. */ > IFUNC_IMPL (i, name, strncpy, > X86_IFUNC_IMPL_ADD_V4 (array, i, strncpy, > @@ -917,6 +926,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > 1, > __wcscpy_generic)) > > + /* Support sysdeps/x86_64/multiarch/wcslcpy.c. */ > + IFUNC_IMPL (i, name, wcslcpy, > + X86_IFUNC_IMPL_ADD_V3 (array, i, wcslcpy, > + CPU_FEATURE_USABLE (AVX2), > + __wcslcpy_avx2) > + X86_IFUNC_IMPL_ADD_V1 (array, i, wcslcpy, > + 1, > + __wcslcpy_generic)) > + > /* Support sysdeps/x86_64/multiarch/wcsncpy.c. */ > IFUNC_IMPL (i, name, wcsncpy, > X86_IFUNC_IMPL_ADD_V4 (array, i, wcsncpy, > diff --git a/sysdeps/x86_64/multiarch/ifunc-strlcpy.h b/sysdeps/x86_64/multiarch/ifunc-strlcpy.h > new file mode 100644 > index 0000000000..982a30d15b > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/ifunc-strlcpy.h > @@ -0,0 +1,34 @@ > +/* Common definition for ifunc selections. > + All versions must be listed in ifunc-impl-list.c. > + Copyright (C) 2023 Free Software Foundation, Inc. > + This file is part of the GNU C Library. 
> + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +#include <init-arch.h> > + > +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; > +extern __typeof (REDIRECT_NAME) OPTIMIZE (generic) attribute_hidden; > + > +static inline void * > +IFUNC_SELECTOR (void) > +{ > + const struct cpu_features *cpu_features = __get_cpu_features (); > + > + if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX2)) > + return OPTIMIZE (avx2); > + > + return OPTIMIZE (generic); > +} > diff --git a/sysdeps/x86_64/multiarch/strlcpy-avx2.S b/sysdeps/x86_64/multiarch/strlcpy-avx2.S > new file mode 100644 > index 0000000000..cf54b1e990 > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/strlcpy-avx2.S > @@ -0,0 +1,446 @@ > +/* Strlcpy/wcslcpy optimized with AVX2. > + Copyright (C) 2023 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +#include <isa-level.h> > + > +#if ISA_SHOULD_BUILD (3) > + > +# include <sysdep.h> > + > +# ifndef VEC_SIZE > +# include "x86-avx-vecs.h" > +# endif > + > +# ifndef STRLCPY > +# define STRLCPY __strlcpy_avx2 > +# endif > + > + > +# ifdef USE_AS_WCSLCPY > +# define CHAR_SIZE 4 > +# define MOVU movl > +# define VPCMPEQ vpcmpeqd > +# define VPMINU vpminud > +# else > +# define CHAR_SIZE 1 > +# define MOVU movb > +# define VPCMPEQ vpcmpeqb > +# define VPMINU vpminub > +# endif > + > +# define PMOVMSK vpmovmskb > +# define PAGE_SIZE 4096 > +# define VEC_SIZE 32 > +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) > + > + .section SECTION(.text),"ax",@progbits > +/* Aligning entry point to 64 byte, provides better performance for > + one vector length string. */ > + > +ENTRY_P2ALIGN (STRLCPY, 6) > +# ifdef __ILP32__ > + /* Clear the upper 32 bits. */ > + movl %edx, %edx > +# endif > + > + /* Zero out vector register for end of string comparison. */ > + vpxor %VMM(0), %VMM(0), %VMM(0) > + /* Save source pointer for return calculation. */ > + mov %rsi, %r8 > + mov %esi, %eax > + sall $20, %eax > + cmpl $((PAGE_SIZE - (VEC_SIZE)) << 20), %eax > + ja L(page_cross) > + > +L(page_cross_continue): > + /* Load first vector. */ > + VMOVU (%rsi), %VMM(1) > + VPCMPEQ %VMM(0), %VMM(1), %VMM(2) > + PMOVMSK %VMM(2), %eax > + test %eax, %eax > + jnz L(ret_vec_x1) > + > + test %rdx, %rdx > + jz L(continue_second_vector) > + > + /* Check whether we can copy full vector. */ > + cmp $CHAR_PER_VEC, %rdx > + jbe L(page_cross_small_vec_copy) > + /* Copy first vector. */ > + VMOVU %VMM(1), (%rdi) > + sub $CHAR_PER_VEC, %rdx > + > +L(continue_second_vector): > + /* Align RSI pointer and adjust RDI based on offset. 
*/ > + mov %rsi, %rax > + and $-VEC_SIZE, %rsi > + sub %rsi, %rax > + sub %rax, %rdi > + > + /* Check if string already copied N char, and RDX is 0. */ > + test %rdx, %rdx > + jz L(skip_copy_alignment_fix) > + > + /* Adjust RDX for copy alignment fix. */ > +# ifdef USE_AS_WCSLCPY > + shr $2, %rax > +# endif > + add %rax, %rdx > + > +L(skip_copy_alignment_fix): > + /* Load second vector. */ > + VMOVA (VEC_SIZE * 1)(%rsi), %VMM(1) > + VPCMPEQ %VMM(0), %VMM(1), %VMM(2) > + vptest %VMM(2), %VMM(2) > + jnz L(ret_vec_x2) > + > + /* Skip copy if RDX is 0. */ > + test %rdx, %rdx > + jz L(continue_third_vector) > + > + /* Jump below/equal(instead of below) used here, because last > + copy character must be NULL. */ > + cmp $CHAR_PER_VEC, %rdx > + jbe L(partial_copy_second_vector) > + > + sub $CHAR_PER_VEC, %rdx > + /* Copy second vector. */ > + VMOVU %VMM(1), (VEC_SIZE * 1)(%rdi) > + > +L(continue_third_vector): > + /* Load third vector. */ > + VMOVA (VEC_SIZE * 2)(%rsi), %VMM(1) > + VPCMPEQ %VMM(0), %VMM(1), %VMM(2) > + vptest %VMM(2), %VMM(2) > + jnz L(ret_vec_x3) > + > + /* Skip copy if RDX is 0. */ > + test %rdx, %rdx > + jz L(continue_fourth_vector) > + > + cmp $CHAR_PER_VEC, %rdx > + jbe L(partial_copy_third_vector) > + > + sub $CHAR_PER_VEC, %rdx > + /* Copy third vector. */ > + VMOVU %VMM(1), (VEC_SIZE * 2)(%rdi) > + > +L(continue_fourth_vector): > + /* Load fourth vector. */ > + VMOVA (VEC_SIZE * 3)(%rsi), %VMM(1) > + VPCMPEQ %VMM(0), %VMM(1), %VMM(2) > + vptest %VMM(2), %VMM(2) > + jnz L(ret_vec_x4) > + > + /* Skip copy if RDX is 0. */ > + test %rdx, %rdx > + jz L(loop_4x_align) > + > + cmp $CHAR_PER_VEC, %rdx > + jbe L(partial_copy_fourth_vector) > + > + sub $CHAR_PER_VEC, %rdx > + /* Copy fourth vector. */ > + VMOVU %VMM(1), (VEC_SIZE * 3)(%rdi) > + > + > +L(loop_4x_align): > + /* Jump to loop if RSI is already 4 vector align. */ > + test $(VEC_SIZE * 4 - 1), %esi > + jz L(loop_4x_read) > + > + mov %rsi, %rcx > + > + /* Align RSI to 4x vector. 
*/ > + and $(VEC_SIZE * -4), %rsi > + sub %rsi, %rcx > + > + /* Adjust RDI for RSI alignment fix. */ > + sub %rcx, %rdi > + > + /* Jump to loop if RDX is 0. */ > + test %rdx, %rdx > + jz L(loop_4x_read) > + > +# ifdef USE_AS_WCSLCPY > + shr $2, %rcx > +# endif > + > + /* Adjust RDX for RSI alignment fix. */ > + add %rcx, %rdx > + jmp L(loop_4x_read) > + > + .p2align 4,,6 > +L(loop_4x_vec): > + /* Skip copy if RDX is 0. */ > + test %rdx, %rdx > + jz L(loop_partial_copy_return) > + cmp $(CHAR_PER_VEC * 4), %rdx > + jbe L(loop_partial_copy) > + VMOVU %VMM(1), (VEC_SIZE * 4)(%rdi) > + VMOVU %VMM(2), (VEC_SIZE * 5)(%rdi) > + VMOVU %VMM(3), (VEC_SIZE * 6)(%rdi) > + VMOVU %VMM(4), (VEC_SIZE * 7)(%rdi) > + sub $(CHAR_PER_VEC * 4), %rdx > + > +L(loop_partial_copy_return): > + sub $(VEC_SIZE * -4), %rsi > + sub $(VEC_SIZE * -4), %rdi > + > +L(loop_4x_read): > + VMOVA (VEC_SIZE * 4)(%rsi), %VMM(1) > + VMOVA (VEC_SIZE * 5)(%rsi), %VMM(2) > + VMOVA (VEC_SIZE * 6)(%rsi), %VMM(3) > + VMOVA (VEC_SIZE * 7)(%rsi), %VMM(4) > + VPMINU %VMM(1), %VMM(2), %VMM(5) > + VPMINU %VMM(3), %VMM(4), %VMM(6) > + VPMINU %VMM(5), %VMM(6), %VMM(7) > + VPCMPEQ %VMM(0), %VMM(7), %VMM(7) > + vptest %VMM(7), %VMM(7) > + > + jz L(loop_4x_vec) > + > + /* Check if string ends in first vector or second vector. */ > + lea (VEC_SIZE * 4)(%rsi), %rax > + sub %r8, %rax > +# ifdef USE_AS_WCSLCPY > + shr $2, %rax > +# endif > + xor %r10, %r10 > + VPCMPEQ %VMM(0), %VMM(5), %VMM(6) > + vptest %VMM(6), %VMM(6) > + jnz L(endloop) > + sub $(CHAR_PER_VEC * -2), %rax > + mov $(CHAR_PER_VEC * 2), %r10 > + VMOVA %VMM(3), %VMM(1) > + VMOVA %VMM(4), %VMM(2) > + > +L(endloop): > + VPCMPEQ %VMM(0), %VMM(1), %VMM(1) > + VPCMPEQ %VMM(0), %VMM(2), %VMM(2) > + PMOVMSK %VMM(1), %rcx > + PMOVMSK %VMM(2), %r9 > + shlq $32, %r9 > + orq %r9, %rcx > + bsf %rcx, %rcx > + /* Shift RCX by 2, VPMOVMSK has only byte version. */ > +# ifdef USE_AS_WCSLCPY > + shr $2, %rcx > +# endif > + /* At this point RAX has length to return. 
*/ > + add %rcx, %rax > + test %rdx, %rdx > + jz L(ret) > + > + /* Add 1 to account for NULL character in RDX comparison. */ > + lea 1(%r10, %rcx), %rcx > + cmp %rdx, %rcx > + cmovb %rcx, %rdx > + > +L(loop_partial_copy): > + cmp $(CHAR_PER_VEC * 2), %rdx > + jbe L(loop_partial_first_half) > + /* Reload first 2 vector. */ > + VMOVA (VEC_SIZE * 4)(%rsi), %VMM(1) > + VMOVA (VEC_SIZE * 5)(%rsi), %VMM(2) > + VMOVU %VMM(1), (VEC_SIZE * 4)(%rdi) > + VMOVU %VMM(2), (VEC_SIZE * 5)(%rdi) > + > +L(loop_partial_first_half): > + /* Go back 2 vector from last and use overlapping copy. > + (VEC_SIZE * 4 - VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE) > + (VEC_SIZE * 4 - VEC_SIZE * 1)(%rsi, %rdx, CHAR_SIZE) > + */ > + VMOVU (VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %VMM(3) > + VMOVU (VEC_SIZE * 3)(%rsi, %rdx, CHAR_SIZE), %VMM(4) > + VMOVU %VMM(3), (VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE) > + VMOVU %VMM(4), (VEC_SIZE * 3)(%rdi, %rdx, CHAR_SIZE) > + MOVU $0, (VEC_SIZE * 4 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE) > + xor %rdx, %rdx > + vptest %VMM(7), %VMM(7) > + jz L(loop_partial_copy_return) > + ret > + > + .p2align 4 > +L(page_cross): > + mov %rsi, %rcx > + mov %rsi, %r11 > + and $-VEC_SIZE, %r11 > + and $(VEC_SIZE - 1), %rcx > + VMOVA (%r11), %VMM(1) > + VPCMPEQ %VMM(0), %VMM(1), %VMM(2) > + PMOVMSK %VMM(2), %eax > + shr %cl, %eax > + jz L(page_cross_continue) > + > +L(ret_vec_x1): > + bsf %eax, %eax > +# ifdef USE_AS_WCSLCPY > + shr $2, %eax > +# endif > + /* Increment by 1 to account for NULL char. 
*/ > + lea 1(%eax), %ecx > + cmp %rdx, %rcx > + cmovb %rcx, %rdx > + test %rdx, %rdx > + jz L(ret) > + > +L(page_cross_small_vec_copy): > + cmp $(16 / CHAR_SIZE), %rdx > + jbe L(copy_8_byte_scalar) > + VMOVU (%rsi), %VMM_128(1) > + VMOVU -16(%rsi, %rdx, CHAR_SIZE), %VMM_128(3) > + VMOVU %VMM_128(1), (%rdi) > + VMOVU %VMM_128(3), -16(%rdi, %rdx, CHAR_SIZE) > + MOVU $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) > + xor %rdx, %rdx > + vptest %VMM(2), %VMM(2) > + jz L(continue_second_vector) > + ret > + > +L(copy_8_byte_scalar): > + cmp $(8 / CHAR_SIZE), %rdx > + jbe L(copy_4_byte_scalar) > + movq (%rsi), %r10 > + movq -8(%rsi, %rdx, CHAR_SIZE), %r11 > + movq %r10, (%rdi) > + movq %r11, -8(%rdi, %rdx, CHAR_SIZE) > + MOVU $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) > + xor %edx, %edx > + vptest %VMM(2), %VMM(2) > + jz L(continue_second_vector) > + ret > + > +L(copy_4_byte_scalar): > +# ifndef USE_AS_WCSLCPY > + cmp $4, %rdx > + jbe L(copy_2_byte_scalar) > +# endif > + movl (%rsi), %r10d > + movl -4(%rsi, %rdx, CHAR_SIZE), %r11d > + movl %r10d, (%rdi) > + movl %r11d, -4(%rdi, %rdx, CHAR_SIZE) > + MOVU $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) > + xor %edx, %edx > + vptest %VMM(2), %VMM(2) > + jz L(continue_second_vector) > + ret > + > +# ifndef USE_AS_WCSLCPY > +L(copy_2_byte_scalar): > + cmp $2, %rdx > + jbe L(copy_1_byte_scalar) > + movw (%rsi), %r10w > + movw -(CHAR_SIZE * 3)(%rsi, %rdx, CHAR_SIZE), %r11w > + movw %r10w, (%rdi) > + movw %r11w, -(CHAR_SIZE * 3)(%rdi, %rdx, CHAR_SIZE) > + MOVU $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) > + xor %edx, %edx > + vptest %VMM(2), %VMM(2) > + jz L(continue_second_vector) > + ret > + > +L(copy_1_byte_scalar): > + MOVU (%rsi), %r10b > + MOVU %r10b, (%rdi) > + MOVU $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) > + xor %edx, %edx > + vptest %VMM(2), %VMM(2) > + jz L(continue_second_vector) > + ret > +# endif > + > +L(ret_vec_x2): > + PMOVMSK %VMM(2), %rax > + bsf %rax, %rcx > + /* Calculate return value. 
*/ > + lea VEC_SIZE(%rsi, %rcx), %rax > + sub %r8, %rax > +# ifdef USE_AS_WCSLCPY > + shr $2, %rax > + shr $2, %rcx > +# endif > + inc %rcx > + test %rdx, %rdx > + jz L(ret) > + cmp %rdx, %rcx > + cmovb %rcx, %rdx > + > +L(partial_copy_second_vector): > + VMOVU (%rsi, %rdx, CHAR_SIZE), %VMM(1) > + VMOVU %VMM(1), (%rdi, %rdx, CHAR_SIZE) > + MOVU $0, (VEC_SIZE - CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) > + xor %edx, %edx > + vptest %VMM(2), %VMM(2) > + jz L(continue_third_vector) > + > +L(ret): > + ret > + > +L(ret_vec_x3): > + PMOVMSK %VMM(2), %rax > + bsf %rax, %rcx > + /* Calculate return value. */ > + lea (VEC_SIZE * 2)(%rsi, %rcx), %rax > + sub %r8, %rax > +# ifdef USE_AS_WCSLCPY > + shr $2, %rax > + shr $2, %rcx > +# endif > + inc %rcx > + test %rdx, %rdx > + jz L(ret) > + cmp %rdx, %rcx > + cmovb %rcx, %rdx > + > +L(partial_copy_third_vector): > + VMOVU (VEC_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM(1) > + VMOVU %VMM(1), (VEC_SIZE)(%rdi, %rdx, CHAR_SIZE) > + MOVU $0, ((VEC_SIZE * 2) - CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) > + xor %edx, %edx > + vptest %VMM(2), %VMM(2) > + jz L(continue_fourth_vector) > + ret > + > +L(ret_vec_x4): > + PMOVMSK %VMM(2), %rax > + bsf %rax, %rcx > + /* Calculate return value. */ > + lea (VEC_SIZE * 3)(%rsi, %rcx), %rax > + sub %r8, %rax > +# ifdef USE_AS_WCSLCPY > + shr $2, %rax > + shr $2, %rcx > +# endif > + inc %rcx > + test %rdx, %rdx > + jz L(ret) > + cmp %rdx, %rcx > + cmovb %rcx, %rdx > + > +L(partial_copy_fourth_vector): > + VMOVU (VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %VMM(1) > + VMOVU %VMM(1), (VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE) > + MOVU $0, ((VEC_SIZE * 3) - CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) > + xor %edx, %edx > + vptest %VMM(2), %VMM(2) > + jz L(continue_fourth_vector) > + ret > + > +END (STRLCPY) Is strlcpy/strlcat integratable with existing strncat impl? Had figured they would fit in the same file. 
> +#endif > diff --git a/sysdeps/x86_64/multiarch/strlcpy-generic.c b/sysdeps/x86_64/multiarch/strlcpy-generic.c > new file mode 100644 > index 0000000000..eee3b7b086 > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/strlcpy-generic.c > @@ -0,0 +1,25 @@ > +/* strlcpy generic. > + Copyright (C) 2023 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > + > +#include <isa-level.h> > +#if ISA_SHOULD_BUILD (1) > +# define __strlcpy __strlcpy_generic > +# include <string/strlcpy.c> > + > +#endif > diff --git a/sysdeps/x86_64/multiarch/strlcpy.c b/sysdeps/x86_64/multiarch/strlcpy.c > new file mode 100644 > index 0000000000..ded41fbcfb > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/strlcpy.c > @@ -0,0 +1,36 @@ > +/* Multiple versions of strlcpy. > + All versions must be listed in ifunc-impl-list.c. > + Copyright (C) 2023 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. 
> + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +/* Define multiple versions only for the definition in libc. */ > +#if IS_IN (libc) > +# define __strlcpy __redirect_strlcpy > +# include <string.h> > +# undef __strlcpy > + > +# define SYMBOL_NAME strlcpy > +# include "ifunc-strlcpy.h" > + > +libc_ifunc_redirected (__redirect_strlcpy, __strlcpy, IFUNC_SELECTOR ()); > +weak_alias (__strlcpy, strlcpy) > + > +# ifdef SHARED > +__hidden_ver1 (__strlcpy, __GI___strlcpy, __redirect_strlcpy) > + __attribute__ ((visibility ("hidden"))) __attribute_copy__ (strlcpy); > +# endif > +#endif > diff --git a/sysdeps/x86_64/multiarch/wcslcpy-avx2.S b/sysdeps/x86_64/multiarch/wcslcpy-avx2.S > new file mode 100644 > index 0000000000..dafc20ded0 > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/wcslcpy-avx2.S > @@ -0,0 +1,4 @@ > +#define STRLCPY __wcslcpy_avx2 > +#define USE_AS_WCSLCPY 1 > + > +#include "strlcpy-avx2.S" > diff --git a/sysdeps/x86_64/multiarch/wcslcpy-generic.c b/sysdeps/x86_64/multiarch/wcslcpy-generic.c > new file mode 100644 > index 0000000000..ffd3c0e846 > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/wcslcpy-generic.c > @@ -0,0 +1,25 @@ > +/* wcslcpy generic. > + Copyright (C) 2023 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. 
> + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > + > +#include <isa-level.h> > +#if ISA_SHOULD_BUILD (1) > +# define __wcslcpy __wcslcpy_generic > +# include <wcsmbs/wcslcpy.c> > + > +#endif > diff --git a/sysdeps/x86_64/multiarch/wcslcpy.c b/sysdeps/x86_64/multiarch/wcslcpy.c > new file mode 100644 > index 0000000000..371ef9626c > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/wcslcpy.c > @@ -0,0 +1,35 @@ > +/* Multiple versions of wcslcpy. > + All versions must be listed in ifunc-impl-list.c. > + Copyright (C) 2023 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +/* Define multiple versions only for the definition in libc. 
*/ > +#if IS_IN (libc) > +# define __wcslcpy __redirect_wcslcpy > +# include <wchar.h> > +# undef __wcslcpy > + > +# define SYMBOL_NAME wcslcpy > +# include "ifunc-strlcpy.h" > + > +libc_ifunc_redirected (__redirect_wcslcpy, __wcslcpy, IFUNC_SELECTOR ()); > +weak_alias (__wcslcpy, wcslcpy) > +# ifdef SHARED > +__hidden_ver1 (__wcslcpy, __GI___wcslcpy, __redirect_wcslcpy) > + __attribute__((visibility ("hidden"))) __attribute_copy__ (wcslcpy); > +# endif > +#endif > -- > 2.38.1 > ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH] x86_64: Implement AVX2 version of strlcpy/wcslcpy function 2023-07-02 17:03 ` Noah Goldstein @ 2023-07-02 18:37 ` Sunil Pandey 2023-07-02 18:54 ` Noah Goldstein 0 siblings, 1 reply; 24+ messages in thread From: Sunil Pandey @ 2023-07-02 18:37 UTC (permalink / raw) To: Noah Goldstein; +Cc: libc-alpha, hjl.tools [-- Attachment #1: Type: text/plain, Size: 28697 bytes --] On Sun, Jul 2, 2023 at 10:03 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > On Fri, Jun 30, 2023 at 3:48 PM Sunil K Pandey via Libc-alpha > <libc-alpha@sourceware.org> wrote: > > > > This patch optimizes strlcpy/wsclcpy string functions for AVX2. > > --- > > sysdeps/x86_64/multiarch/Makefile | 4 + > > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 18 + > > sysdeps/x86_64/multiarch/ifunc-strlcpy.h | 34 ++ > > sysdeps/x86_64/multiarch/strlcpy-avx2.S | 446 +++++++++++++++++++++ > > sysdeps/x86_64/multiarch/strlcpy-generic.c | 25 ++ > > sysdeps/x86_64/multiarch/strlcpy.c | 36 ++ > > sysdeps/x86_64/multiarch/wcslcpy-avx2.S | 4 + > > sysdeps/x86_64/multiarch/wcslcpy-generic.c | 25 ++ > > sysdeps/x86_64/multiarch/wcslcpy.c | 35 ++ > > 9 files changed, 627 insertions(+) > > create mode 100644 sysdeps/x86_64/multiarch/ifunc-strlcpy.h > > create mode 100644 sysdeps/x86_64/multiarch/strlcpy-avx2.S > > create mode 100644 sysdeps/x86_64/multiarch/strlcpy-generic.c > > create mode 100644 sysdeps/x86_64/multiarch/strlcpy.c > > create mode 100644 sysdeps/x86_64/multiarch/wcslcpy-avx2.S > > create mode 100644 sysdeps/x86_64/multiarch/wcslcpy-generic.c > > create mode 100644 sysdeps/x86_64/multiarch/wcslcpy.c > > > > diff --git a/sysdeps/x86_64/multiarch/Makefile > b/sysdeps/x86_64/multiarch/Makefile > > index e1e894c963..7e3fc081df 100644 > > --- a/sysdeps/x86_64/multiarch/Makefile > > +++ b/sysdeps/x86_64/multiarch/Makefile > > @@ -82,6 +82,8 @@ sysdep_routines += \ > > strcpy-sse2 \ > > strcpy-sse2-unaligned \ > > strcspn-sse4 \ > > + strlcpy-avx2 \ > > + strlcpy-generic \ > > strlen-avx2 \ > > 
strlen-avx2-rtm \ > > strlen-evex \ > > @@ -153,6 +155,8 @@ sysdep_routines += \ > > wcscpy-evex \ > > wcscpy-generic \ > > wcscpy-ssse3 \ > > + wcslcpy-avx2 \ > > + wcslcpy-generic \ > > wcslen-avx2 \ > > wcslen-avx2-rtm \ > > wcslen-evex \ > > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c > b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > index 5427ff1907..9928dee187 100644 > > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > @@ -751,6 +751,15 @@ __libc_ifunc_impl_list (const char *name, struct > libc_ifunc_impl *array, > > 1, > > __strncat_sse2_unaligned)) > > > > + /* Support sysdeps/x86_64/multiarch/strlcpy.c. */ > > + IFUNC_IMPL (i, name, strlcpy, > > + X86_IFUNC_IMPL_ADD_V3 (array, i, strlcpy, > > + CPU_FEATURE_USABLE (AVX2), > > + __strlcpy_avx2) > > + X86_IFUNC_IMPL_ADD_V1 (array, i, strlcpy, > > + 1, > > + __strlcpy_generic)) > > + > > /* Support sysdeps/x86_64/multiarch/strncpy.c. */ > > IFUNC_IMPL (i, name, strncpy, > > X86_IFUNC_IMPL_ADD_V4 (array, i, strncpy, > > @@ -917,6 +926,15 @@ __libc_ifunc_impl_list (const char *name, struct > libc_ifunc_impl *array, > > 1, > > __wcscpy_generic)) > > > > + /* Support sysdeps/x86_64/multiarch/wcslcpy.c. */ > > + IFUNC_IMPL (i, name, wcslcpy, > > + X86_IFUNC_IMPL_ADD_V3 (array, i, wcslcpy, > > + CPU_FEATURE_USABLE (AVX2), > > + __wcslcpy_avx2) > > + X86_IFUNC_IMPL_ADD_V1 (array, i, wcslcpy, > > + 1, > > + __wcslcpy_generic)) > > + > > /* Support sysdeps/x86_64/multiarch/wcsncpy.c. */ > > IFUNC_IMPL (i, name, wcsncpy, > > X86_IFUNC_IMPL_ADD_V4 (array, i, wcsncpy, > > diff --git a/sysdeps/x86_64/multiarch/ifunc-strlcpy.h > b/sysdeps/x86_64/multiarch/ifunc-strlcpy.h > > new file mode 100644 > > index 0000000000..982a30d15b > > --- /dev/null > > +++ b/sysdeps/x86_64/multiarch/ifunc-strlcpy.h > > @@ -0,0 +1,34 @@ > > +/* Common definition for ifunc selections. > > + All versions must be listed in ifunc-impl-list.c. 
> > + Copyright (C) 2023 Free Software Foundation, Inc. > > + This file is part of the GNU C Library. > > + > > + The GNU C Library is free software; you can redistribute it and/or > > + modify it under the terms of the GNU Lesser General Public > > + License as published by the Free Software Foundation; either > > + version 2.1 of the License, or (at your option) any later version. > > + > > + The GNU C Library is distributed in the hope that it will be useful, > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > + Lesser General Public License for more details. > > + > > + You should have received a copy of the GNU Lesser General Public > > + License along with the GNU C Library; if not, see > > + <https://www.gnu.org/licenses/>. */ > > + > > +#include <init-arch.h> > > + > > +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; > > +extern __typeof (REDIRECT_NAME) OPTIMIZE (generic) attribute_hidden; > > + > > +static inline void * > > +IFUNC_SELECTOR (void) > > +{ > > + const struct cpu_features *cpu_features = __get_cpu_features (); > > + > > + if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX2)) > > + return OPTIMIZE (avx2); > > + > > + return OPTIMIZE (generic); > > +} > > diff --git a/sysdeps/x86_64/multiarch/strlcpy-avx2.S > b/sysdeps/x86_64/multiarch/strlcpy-avx2.S > > new file mode 100644 > > index 0000000000..cf54b1e990 > > --- /dev/null > > +++ b/sysdeps/x86_64/multiarch/strlcpy-avx2.S > > @@ -0,0 +1,446 @@ > > +/* Strlcpy/wcslcpy optimized with AVX2. > > + Copyright (C) 2023 Free Software Foundation, Inc. > > + This file is part of the GNU C Library. > > + > > + The GNU C Library is free software; you can redistribute it and/or > > + modify it under the terms of the GNU Lesser General Public > > + License as published by the Free Software Foundation; either > > + version 2.1 of the License, or (at your option) any later version. 
> > + > > + The GNU C Library is distributed in the hope that it will be useful, > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > + Lesser General Public License for more details. > > + > > + You should have received a copy of the GNU Lesser General Public > > + License along with the GNU C Library; if not, see > > + <https://www.gnu.org/licenses/>. */ > > + > > +#include <isa-level.h> > > + > > +#if ISA_SHOULD_BUILD (3) > > + > > +# include <sysdep.h> > > + > > +# ifndef VEC_SIZE > > +# include "x86-avx-vecs.h" > > +# endif > > + > > +# ifndef STRLCPY > > +# define STRLCPY __strlcpy_avx2 > > +# endif > > + > > + > > +# ifdef USE_AS_WCSLCPY > > +# define CHAR_SIZE 4 > > +# define MOVU movl > > +# define VPCMPEQ vpcmpeqd > > +# define VPMINU vpminud > > +# else > > +# define CHAR_SIZE 1 > > +# define MOVU movb > > +# define VPCMPEQ vpcmpeqb > > +# define VPMINU vpminub > > +# endif > > + > > +# define PMOVMSK vpmovmskb > > +# define PAGE_SIZE 4096 > > +# define VEC_SIZE 32 > > +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) > > + > > + .section SECTION(.text),"ax",@progbits > > +/* Aligning entry point to 64 byte, provides better performance for > > + one vector length string. */ > > + > > +ENTRY_P2ALIGN (STRLCPY, 6) > > +# ifdef __ILP32__ > > + /* Clear the upper 32 bits. */ > > + movl %edx, %edx > > +# endif > > + > > + /* Zero out vector register for end of string comparison. */ > > + vpxor %VMM(0), %VMM(0), %VMM(0) > > + /* Save source pointer for return calculation. */ > > + mov %rsi, %r8 > > + mov %esi, %eax > > + sall $20, %eax > > + cmpl $((PAGE_SIZE - (VEC_SIZE)) << 20), %eax > > + ja L(page_cross) > > + > > +L(page_cross_continue): > > + /* Load first vector. 
*/ > > + VMOVU (%rsi), %VMM(1) > > + VPCMPEQ %VMM(0), %VMM(1), %VMM(2) > > + PMOVMSK %VMM(2), %eax > > + test %eax, %eax > > + jnz L(ret_vec_x1) > > + > > + test %rdx, %rdx > > + jz L(continue_second_vector) > > + > > + /* Check whether we can copy full vector. */ > > + cmp $CHAR_PER_VEC, %rdx > > + jbe L(page_cross_small_vec_copy) > > + /* Copy first vector. */ > > + VMOVU %VMM(1), (%rdi) > > + sub $CHAR_PER_VEC, %rdx > > + > > +L(continue_second_vector): > > + /* Align RSI pointer and adjust RDI based on offset. */ > > + mov %rsi, %rax > > + and $-VEC_SIZE, %rsi > > + sub %rsi, %rax > > + sub %rax, %rdi > > + > > + /* Check if string already copied N char, and RDX is 0. */ > > + test %rdx, %rdx > > + jz L(skip_copy_alignment_fix) > > + > > + /* Adjust RDX for copy alignment fix. */ > > +# ifdef USE_AS_WCSLCPY > > + shr $2, %rax > > +# endif > > + add %rax, %rdx > > + > > +L(skip_copy_alignment_fix): > > + /* Load second vector. */ > > + VMOVA (VEC_SIZE * 1)(%rsi), %VMM(1) > > + VPCMPEQ %VMM(0), %VMM(1), %VMM(2) > > + vptest %VMM(2), %VMM(2) > > + jnz L(ret_vec_x2) > > + > > + /* Skip copy if RDX is 0. */ > > + test %rdx, %rdx > > + jz L(continue_third_vector) > > + > > + /* Jump below/equal(instead of below) used here, because last > > + copy chracter must be NULL. */ > > + cmp $CHAR_PER_VEC, %rdx > > + jbe L(partial_copy_second_vector) > > + > > + sub $CHAR_PER_VEC, %rdx > > + /* Copy second vector. */ > > + VMOVU %VMM(1), (VEC_SIZE * 1)(%rdi) > > + > > +L(continue_third_vector): > > + /* Load third vector. */ > > + VMOVA (VEC_SIZE * 2)(%rsi), %VMM(1) > > + VPCMPEQ %VMM(0), %VMM(1), %VMM(2) > > + vptest %VMM(2), %VMM(2) > > + jnz L(ret_vec_x3) > > + > > + /* Skip copy if RDX is 0. */ > > + test %rdx, %rdx > > + jz L(continue_fourth_vector) > > + > > + cmp $CHAR_PER_VEC, %rdx > > + jbe L(partial_copy_third_vector) > > + > > + sub $CHAR_PER_VEC, %rdx > > + /* Copy third vector. 
*/ > > + VMOVU %VMM(1), (VEC_SIZE * 2)(%rdi) > > + > > +L(continue_fourth_vector): > > + /* Load fourth vector. */ > > + VMOVA (VEC_SIZE * 3)(%rsi), %VMM(1) > > + VPCMPEQ %VMM(0), %VMM(1), %VMM(2) > > + vptest %VMM(2), %VMM(2) > > + jnz L(ret_vec_x4) > > + > > + /* Skip copy if RDX is 0. */ > > + test %rdx, %rdx > > + jz L(loop_4x_align) > > + > > + cmp $CHAR_PER_VEC, %rdx > > + jbe L(partial_copy_fourth_vector) > > + > > + sub $CHAR_PER_VEC, %rdx > > + /* Copy fourth vector. */ > > + VMOVU %VMM(1), (VEC_SIZE * 3)(%rdi) > > + > > + > > +L(loop_4x_align): > > + /* Jump to loop if RSI is already 4 vector align. */ > > + test $(VEC_SIZE * 4 - 1), %esi > > + jz L(loop_4x_read) > > + > > + mov %rsi, %rcx > > + > > + /* Align RSI to 4x vector. */ > > + and $(VEC_SIZE * -4), %rsi > > + sub %rsi, %rcx > > + > > + /* Adjust RDI for RSI alignment fix. */ > > + sub %rcx, %rdi > > + > > + /* Jump to loop if RDX is 0. */ > > + test %rdx, %rdx > > + jz L(loop_4x_read) > > + > > +# ifdef USE_AS_WCSLCPY > > + shr $2, %rcx > > +# endif > > + > > + /* Adjust RDX for RSI alignment fix. */ > > + add %rcx, %rdx > > + jmp L(loop_4x_read) > > + > > + .p2align 4,,6 > > +L(loop_4x_vec): > > + /* Skip copy if RDX is 0. 
*/ > > + test %rdx, %rdx > > + jz L(loop_partial_copy_return) > > + cmp $(CHAR_PER_VEC * 4), %rdx > > + jbe L(loop_partial_copy) > > + VMOVU %VMM(1), (VEC_SIZE * 4)(%rdi) > > + VMOVU %VMM(2), (VEC_SIZE * 5)(%rdi) > > + VMOVU %VMM(3), (VEC_SIZE * 6)(%rdi) > > + VMOVU %VMM(4), (VEC_SIZE * 7)(%rdi) > > + sub $(CHAR_PER_VEC * 4), %rdx > > + > > +L(loop_partial_copy_return): > > + sub $(VEC_SIZE * -4), %rsi > > + sub $(VEC_SIZE * -4), %rdi > > + > > +L(loop_4x_read): > > + VMOVA (VEC_SIZE * 4)(%rsi), %VMM(1) > > + VMOVA (VEC_SIZE * 5)(%rsi), %VMM(2) > > + VMOVA (VEC_SIZE * 6)(%rsi), %VMM(3) > > + VMOVA (VEC_SIZE * 7)(%rsi), %VMM(4) > > + VPMINU %VMM(1), %VMM(2), %VMM(5) > > + VPMINU %VMM(3), %VMM(4), %VMM(6) > > + VPMINU %VMM(5), %VMM(6), %VMM(7) > > + VPCMPEQ %VMM(0), %VMM(7), %VMM(7) > > + vptest %VMM(7), %VMM(7) > > + > > + jz L(loop_4x_vec) > > + > > + /* Check if string ends in first vector or second vector. */ > > + lea (VEC_SIZE * 4)(%rsi), %rax > > + sub %r8, %rax > > +# ifdef USE_AS_WCSLCPY > > + shr $2, %rax > > +# endif > > + xor %r10, %r10 > > + VPCMPEQ %VMM(0), %VMM(5), %VMM(6) > > + vptest %VMM(6), %VMM(6) > > + jnz L(endloop) > > + sub $(CHAR_PER_VEC * -2), %rax > > + mov $(CHAR_PER_VEC * 2), %r10 > > + VMOVA %VMM(3), %VMM(1) > > + VMOVA %VMM(4), %VMM(2) > > + > > +L(endloop): > > + VPCMPEQ %VMM(0), %VMM(1), %VMM(1) > > + VPCMPEQ %VMM(0), %VMM(2), %VMM(2) > > + PMOVMSK %VMM(1), %rcx > > + PMOVMSK %VMM(2), %r9 > > + shlq $32, %r9 > > + orq %r9, %rcx > > + bsf %rcx, %rcx > > + /* Shift RCX by 2, VPMOVMSK has only byte version. */ > > +# ifdef USE_AS_WCSLCPY > > + shr $2, %rcx > > +# endif > > + /* At this point RAX has length to return. */ > > + add %rcx, %rax > > + test %rdx, %rdx > > + jz L(ret) > > + > > + /* Add 1 to account for NULL character in RDX comparison. 
*/ > > + lea 1(%r10, %rcx), %rcx > > + cmp %rdx, %rcx > > + cmovb %rcx, %rdx > > + > > +L(loop_partial_copy): > > + cmp $(CHAR_PER_VEC * 2), %rdx > > + jbe L(loop_partial_first_half) > > + /* Reload first 2 vector. */ > > + VMOVA (VEC_SIZE * 4)(%rsi), %VMM(1) > > + VMOVA (VEC_SIZE * 5)(%rsi), %VMM(2) > > + VMOVU %VMM(1), (VEC_SIZE * 4)(%rdi) > > + VMOVU %VMM(2), (VEC_SIZE * 5)(%rdi) > > + > > +L(loop_partial_first_half): > > + /* Go back 2 vector from last and use overlapping copy. > > + (VEC_SIZE * 4 - VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE) > > + (VEC_SIZE * 4 - VEC_SIZE * 1)(%rsi, %rdx, CHAR_SIZE) > > + */ > > + VMOVU (VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %VMM(3) > > + VMOVU (VEC_SIZE * 3)(%rsi, %rdx, CHAR_SIZE), %VMM(4) > > + VMOVU %VMM(3), (VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE) > > + VMOVU %VMM(4), (VEC_SIZE * 3)(%rdi, %rdx, CHAR_SIZE) > > + MOVU $0, (VEC_SIZE * 4 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE) > > + xor %rdx, %rdx > > + vptest %VMM(7), %VMM(7) > > + jz L(loop_partial_copy_return) > > + ret > > + > > + .p2align 4 > > +L(page_cross): > > + mov %rsi, %rcx > > + mov %rsi, %r11 > > + and $-VEC_SIZE, %r11 > > + and $(VEC_SIZE - 1), %rcx > > + VMOVA (%r11), %VMM(1) > > + VPCMPEQ %VMM(0), %VMM(1), %VMM(2) > > + PMOVMSK %VMM(2), %eax > > + shr %cl, %eax > > + jz L(page_cross_continue) > > + > > +L(ret_vec_x1): > > + bsf %eax, %eax > > +# ifdef USE_AS_WCSLCPY > > + shr $2, %eax > > +# endif > > + /* Increment by 1 to account for NULL char. 
*/ > > + lea 1(%eax), %ecx > > + cmp %rdx, %rcx > > + cmovb %rcx, %rdx > > + test %rdx, %rdx > > + jz L(ret) > > + > > +L(page_cross_small_vec_copy): > > + cmp $(16 / CHAR_SIZE), %rdx > > + jbe L(copy_8_byte_scalar) > > + VMOVU (%rsi), %VMM_128(1) > > + VMOVU -16(%rsi, %rdx, CHAR_SIZE), %VMM_128(3) > > + VMOVU %VMM_128(1), (%rdi) > > + VMOVU %VMM_128(3), -16(%rdi, %rdx, CHAR_SIZE) > > + MOVU $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) > > + xor %rdx, %rdx > > + vptest %VMM(2), %VMM(2) > > + jz L(continue_second_vector) > > + ret > > + > > +L(copy_8_byte_scalar): > > + cmp $(8 / CHAR_SIZE), %rdx > > + jbe L(copy_4_byte_scalar) > > + movq (%rsi), %r10 > > + movq -8(%rsi, %rdx, CHAR_SIZE), %r11 > > + movq %r10, (%rdi) > > + movq %r11, -8(%rdi, %rdx, CHAR_SIZE) > > + MOVU $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) > > + xor %edx, %edx > > + vptest %VMM(2), %VMM(2) > > + jz L(continue_second_vector) > > + ret > > + > > +L(copy_4_byte_scalar): > > +# ifndef USE_AS_WCSLCPY > > + cmp $4, %rdx > > + jbe L(copy_2_byte_scalar) > > +# endif > > + movl (%rsi), %r10d > > + movl -4(%rsi, %rdx, CHAR_SIZE), %r11d > > + movl %r10d, (%rdi) > > + movl %r11d, -4(%rdi, %rdx, CHAR_SIZE) > > + MOVU $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) > > + xor %edx, %edx > > + vptest %VMM(2), %VMM(2) > > + jz L(continue_second_vector) > > + ret > > + > > +# ifndef USE_AS_WCSLCPY > > +L(copy_2_byte_scalar): > > + cmp $2, %rdx > > + jbe L(copy_1_byte_scalar) > > + movw (%rsi), %r10w > > + movw -(CHAR_SIZE * 3)(%rsi, %rdx, CHAR_SIZE), %r11w > > + movw %r10w, (%rdi) > > + movw %r11w, -(CHAR_SIZE * 3)(%rdi, %rdx, CHAR_SIZE) > > + MOVU $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) > > + xor %edx, %edx > > + vptest %VMM(2), %VMM(2) > > + jz L(continue_second_vector) > > + ret > > + > > +L(copy_1_byte_scalar): > > + MOVU (%rsi), %r10b > > + MOVU %r10b, (%rdi) > > + MOVU $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) > > + xor %edx, %edx > > + vptest %VMM(2), %VMM(2) > > + jz L(continue_second_vector) > > + 
ret > > +# endif > > + > > +L(ret_vec_x2): > > + PMOVMSK %VMM(2), %rax > > + bsf %rax, %rcx > > + /* Calculate return value. */ > > + lea VEC_SIZE(%rsi, %rcx), %rax > > + sub %r8, %rax > > +# ifdef USE_AS_WCSLCPY > > + shr $2, %rax > > + shr $2, %rcx > > +# endif > > + inc %rcx > > + test %rdx, %rdx > > + jz L(ret) > > + cmp %rdx, %rcx > > + cmovb %rcx, %rdx > > + > > +L(partial_copy_second_vector): > > + VMOVU (%rsi, %rdx, CHAR_SIZE), %VMM(1) > > + VMOVU %VMM(1), (%rdi, %rdx, CHAR_SIZE) > > + MOVU $0, (VEC_SIZE - CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) > > + xor %edx, %edx > > + vptest %VMM(2), %VMM(2) > > + jz L(continue_third_vector) > > + > > +L(ret): > > + ret > > + > > +L(ret_vec_x3): > > + PMOVMSK %VMM(2), %rax > > + bsf %rax, %rcx > > + /* Calculate return value. */ > > + lea (VEC_SIZE * 2)(%rsi, %rcx), %rax > > + sub %r8, %rax > > +# ifdef USE_AS_WCSLCPY > > + shr $2, %rax > > + shr $2, %rcx > > +# endif > > + inc %rcx > > + test %rdx, %rdx > > + jz L(ret) > > + cmp %rdx, %rcx > > + cmovb %rcx, %rdx > > + > > +L(partial_copy_third_vector): > > + VMOVU (VEC_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM(1) > > + VMOVU %VMM(1), (VEC_SIZE)(%rdi, %rdx, CHAR_SIZE) > > + MOVU $0, ((VEC_SIZE * 2) - CHAR_SIZE * 1)(%rdi, %rdx, > CHAR_SIZE) > > + xor %edx, %edx > > + vptest %VMM(2), %VMM(2) > > + jz L(continue_fourth_vector) > > + ret > > + > > +L(ret_vec_x4): > > + PMOVMSK %VMM(2), %rax > > + bsf %rax, %rcx > > + /* Calculate return value. 
*/ > > + lea (VEC_SIZE * 3)(%rsi, %rcx), %rax > > + sub %r8, %rax > > +# ifdef USE_AS_WCSLCPY > > + shr $2, %rax > > + shr $2, %rcx > > +# endif > > + inc %rcx > > + test %rdx, %rdx > > + jz L(ret) > > + cmp %rdx, %rcx > > + cmovb %rcx, %rdx > > + > > +L(partial_copy_fourth_vector): > > + VMOVU (VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %VMM(1) > > + VMOVU %VMM(1), (VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE) > > + MOVU $0, ((VEC_SIZE * 3) - CHAR_SIZE * 1)(%rdi, %rdx, > CHAR_SIZE) > > + xor %edx, %edx > > + vptest %VMM(2), %VMM(2) > > + jz L(continue_fourth_vector) > > + ret > > + > > +END (STRLCPY) > > Is strlcpy/strlcat integratable with existing strncat impl? Had > figured they would > fit in the same file. > Hi Noah, It may not be a good idea to put strlcpy/strlcat in the existing strncpy/strncat impl file, as strlcpy/strlcat functions are associated with GLIBC_2.38 ABI. --Sunil > > +#endif > > diff --git a/sysdeps/x86_64/multiarch/strlcpy-generic.c > b/sysdeps/x86_64/multiarch/strlcpy-generic.c > > new file mode 100644 > > index 0000000000..eee3b7b086 > > --- /dev/null > > +++ b/sysdeps/x86_64/multiarch/strlcpy-generic.c > > @@ -0,0 +1,25 @@ > > +/* strlcpy generic. > > + Copyright (C) 2023 Free Software Foundation, Inc. > > + This file is part of the GNU C Library. > > + > > + The GNU C Library is free software; you can redistribute it and/or > > + modify it under the terms of the GNU Lesser General Public > > + License as published by the Free Software Foundation; either > > + version 2.1 of the License, or (at your option) any later version. > > + > > + The GNU C Library is distributed in the hope that it will be useful, > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > + Lesser General Public License for more details. 
> > + > > + You should have received a copy of the GNU Lesser General Public > > + License along with the GNU C Library; if not, see > > + <https://www.gnu.org/licenses/>. */ > > + > > + > > +#include <isa-level.h> > > +#if ISA_SHOULD_BUILD (1) > > +# define __strlcpy __strlcpy_generic > > +# include <string/strlcpy.c> > > + > > +#endif > > diff --git a/sysdeps/x86_64/multiarch/strlcpy.c > b/sysdeps/x86_64/multiarch/strlcpy.c > > new file mode 100644 > > index 0000000000..ded41fbcfb > > --- /dev/null > > +++ b/sysdeps/x86_64/multiarch/strlcpy.c > > @@ -0,0 +1,36 @@ > > +/* Multiple versions of strlcpy. > > + All versions must be listed in ifunc-impl-list.c. > > + Copyright (C) 2023 Free Software Foundation, Inc. > > + This file is part of the GNU C Library. > > + > > + The GNU C Library is free software; you can redistribute it and/or > > + modify it under the terms of the GNU Lesser General Public > > + License as published by the Free Software Foundation; either > > + version 2.1 of the License, or (at your option) any later version. > > + > > + The GNU C Library is distributed in the hope that it will be useful, > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > + Lesser General Public License for more details. > > + > > + You should have received a copy of the GNU Lesser General Public > > + License along with the GNU C Library; if not, see > > + <https://www.gnu.org/licenses/>. */ > > + > > +/* Define multiple versions only for the definition in libc. 
*/ > > +#if IS_IN (libc) > > +# define __strlcpy __redirect_strlcpy > > +# include <string.h> > > +# undef __strlcpy > > + > > +# define SYMBOL_NAME strlcpy > > +# include "ifunc-strlcpy.h" > > + > > +libc_ifunc_redirected (__redirect_strlcpy, __strlcpy, IFUNC_SELECTOR > ()); > > +weak_alias (__strlcpy, strlcpy) > > + > > +# ifdef SHARED > > +__hidden_ver1 (__strlcpy, __GI___strlcpy, __redirect_strlcpy) > > + __attribute__ ((visibility ("hidden"))) __attribute_copy__ (strlcpy); > > +# endif > > +#endif > > diff --git a/sysdeps/x86_64/multiarch/wcslcpy-avx2.S > b/sysdeps/x86_64/multiarch/wcslcpy-avx2.S > > new file mode 100644 > > index 0000000000..dafc20ded0 > > --- /dev/null > > +++ b/sysdeps/x86_64/multiarch/wcslcpy-avx2.S > > @@ -0,0 +1,4 @@ > > +#define STRLCPY __wcslcpy_avx2 > > +#define USE_AS_WCSLCPY 1 > > + > > +#include "strlcpy-avx2.S" > > diff --git a/sysdeps/x86_64/multiarch/wcslcpy-generic.c > b/sysdeps/x86_64/multiarch/wcslcpy-generic.c > > new file mode 100644 > > index 0000000000..ffd3c0e846 > > --- /dev/null > > +++ b/sysdeps/x86_64/multiarch/wcslcpy-generic.c > > @@ -0,0 +1,25 @@ > > +/* wcslcpy generic. > > + Copyright (C) 2023 Free Software Foundation, Inc. > > + This file is part of the GNU C Library. > > + > > + The GNU C Library is free software; you can redistribute it and/or > > + modify it under the terms of the GNU Lesser General Public > > + License as published by the Free Software Foundation; either > > + version 2.1 of the License, or (at your option) any later version. > > + > > + The GNU C Library is distributed in the hope that it will be useful, > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > + Lesser General Public License for more details. > > + > > + You should have received a copy of the GNU Lesser General Public > > + License along with the GNU C Library; if not, see > > + <https://www.gnu.org/licenses/>. 
*/ > > + > > + > > +#include <isa-level.h> > > +#if ISA_SHOULD_BUILD (1) > > +# define __wcslcpy __wcslcpy_generic > > +# include <wcsmbs/wcslcpy.c> > > + > > +#endif > > diff --git a/sysdeps/x86_64/multiarch/wcslcpy.c > b/sysdeps/x86_64/multiarch/wcslcpy.c > > new file mode 100644 > > index 0000000000..371ef9626c > > --- /dev/null > > +++ b/sysdeps/x86_64/multiarch/wcslcpy.c > > @@ -0,0 +1,35 @@ > > +/* Multiple versions of wcslcpy. > > + All versions must be listed in ifunc-impl-list.c. > > + Copyright (C) 2023 Free Software Foundation, Inc. > > + This file is part of the GNU C Library. > > + > > + The GNU C Library is free software; you can redistribute it and/or > > + modify it under the terms of the GNU Lesser General Public > > + License as published by the Free Software Foundation; either > > + version 2.1 of the License, or (at your option) any later version. > > + > > + The GNU C Library is distributed in the hope that it will be useful, > > + but WITHOUT ANY WARRANTY; without even the implied warranty of > > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > + Lesser General Public License for more details. > > + > > + You should have received a copy of the GNU Lesser General Public > > + License along with the GNU C Library; if not, see > > + <https://www.gnu.org/licenses/>. */ > > + > > +/* Define multiple versions only for the definition in libc. */ > > +#if IS_IN (libc) > > +# define __wcslcpy __redirect_wcslcpy > > +# include <wchar.h> > > +# undef __wcslcpy > > + > > +# define SYMBOL_NAME wcslcpy > > +# include "ifunc-strlcpy.h" > > + > > +libc_ifunc_redirected (__redirect_wcslcpy, __wcslcpy, IFUNC_SELECTOR > ()); > > +weak_alias (__wcslcpy, wcslcpy) > > +# ifdef SHARED > > +__hidden_ver1 (__wcslcpy, __GI___wcslcpy, __redirect_wcslcpy) > > + __attribute__((visibility ("hidden"))) __attribute_copy__ (wcslcpy); > > +# endif > > +#endif > > -- > > 2.38.1 > > > ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH] x86_64: Implement AVX2 version of strlcpy/wcslcpy function 2023-07-02 18:37 ` Sunil Pandey @ 2023-07-02 18:54 ` Noah Goldstein 2023-07-03 1:03 ` Sunil Pandey 0 siblings, 1 reply; 24+ messages in thread From: Noah Goldstein @ 2023-07-02 18:54 UTC (permalink / raw) To: Sunil Pandey; +Cc: libc-alpha, hjl.tools On Sun, Jul 2, 2023 at 1:38 PM Sunil Pandey <skpgkp2@gmail.com> wrote: > > > > On Sun, Jul 2, 2023 at 10:03 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: >> >> On Fri, Jun 30, 2023 at 3:48 PM Sunil K Pandey via Libc-alpha >> <libc-alpha@sourceware.org> wrote: >> > >> > This patch optimizes strlcpy/wsclcpy string functions for AVX2. >> > --- >> > sysdeps/x86_64/multiarch/Makefile | 4 + >> > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 18 + >> > sysdeps/x86_64/multiarch/ifunc-strlcpy.h | 34 ++ >> > sysdeps/x86_64/multiarch/strlcpy-avx2.S | 446 +++++++++++++++++++++ >> > sysdeps/x86_64/multiarch/strlcpy-generic.c | 25 ++ >> > sysdeps/x86_64/multiarch/strlcpy.c | 36 ++ >> > sysdeps/x86_64/multiarch/wcslcpy-avx2.S | 4 + >> > sysdeps/x86_64/multiarch/wcslcpy-generic.c | 25 ++ >> > sysdeps/x86_64/multiarch/wcslcpy.c | 35 ++ >> > 9 files changed, 627 insertions(+) >> > create mode 100644 sysdeps/x86_64/multiarch/ifunc-strlcpy.h >> > create mode 100644 sysdeps/x86_64/multiarch/strlcpy-avx2.S >> > create mode 100644 sysdeps/x86_64/multiarch/strlcpy-generic.c >> > create mode 100644 sysdeps/x86_64/multiarch/strlcpy.c >> > create mode 100644 sysdeps/x86_64/multiarch/wcslcpy-avx2.S >> > create mode 100644 sysdeps/x86_64/multiarch/wcslcpy-generic.c >> > create mode 100644 sysdeps/x86_64/multiarch/wcslcpy.c >> > >> > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile >> > index e1e894c963..7e3fc081df 100644 >> > --- a/sysdeps/x86_64/multiarch/Makefile >> > +++ b/sysdeps/x86_64/multiarch/Makefile >> > @@ -82,6 +82,8 @@ sysdep_routines += \ >> > strcpy-sse2 \ >> > strcpy-sse2-unaligned \ >> > strcspn-sse4 \ >> > + strlcpy-avx2 \ 
>> > + strlcpy-generic \ >> > strlen-avx2 \ >> > strlen-avx2-rtm \ >> > strlen-evex \ >> > @@ -153,6 +155,8 @@ sysdep_routines += \ >> > wcscpy-evex \ >> > wcscpy-generic \ >> > wcscpy-ssse3 \ >> > + wcslcpy-avx2 \ >> > + wcslcpy-generic \ >> > wcslen-avx2 \ >> > wcslen-avx2-rtm \ >> > wcslen-evex \ >> > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c >> > index 5427ff1907..9928dee187 100644 >> > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c >> > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c >> > @@ -751,6 +751,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, >> > 1, >> > __strncat_sse2_unaligned)) >> > >> > + /* Support sysdeps/x86_64/multiarch/strlcpy.c. */ >> > + IFUNC_IMPL (i, name, strlcpy, >> > + X86_IFUNC_IMPL_ADD_V3 (array, i, strlcpy, >> > + CPU_FEATURE_USABLE (AVX2), >> > + __strlcpy_avx2) >> > + X86_IFUNC_IMPL_ADD_V1 (array, i, strlcpy, >> > + 1, >> > + __strlcpy_generic)) >> > + >> > /* Support sysdeps/x86_64/multiarch/strncpy.c. */ >> > IFUNC_IMPL (i, name, strncpy, >> > X86_IFUNC_IMPL_ADD_V4 (array, i, strncpy, >> > @@ -917,6 +926,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, >> > 1, >> > __wcscpy_generic)) >> > >> > + /* Support sysdeps/x86_64/multiarch/wcslcpy.c. */ >> > + IFUNC_IMPL (i, name, wcslcpy, >> > + X86_IFUNC_IMPL_ADD_V3 (array, i, wcslcpy, >> > + CPU_FEATURE_USABLE (AVX2), >> > + __wcslcpy_avx2) >> > + X86_IFUNC_IMPL_ADD_V1 (array, i, wcslcpy, >> > + 1, >> > + __wcslcpy_generic)) >> > + >> > /* Support sysdeps/x86_64/multiarch/wcsncpy.c. */ >> > IFUNC_IMPL (i, name, wcsncpy, >> > X86_IFUNC_IMPL_ADD_V4 (array, i, wcsncpy, >> > diff --git a/sysdeps/x86_64/multiarch/ifunc-strlcpy.h b/sysdeps/x86_64/multiarch/ifunc-strlcpy.h >> > new file mode 100644 >> > index 0000000000..982a30d15b >> > --- /dev/null >> > +++ b/sysdeps/x86_64/multiarch/ifunc-strlcpy.h >> > @@ -0,0 +1,34 @@ >> > +/* Common definition for ifunc selections. 
>> > + All versions must be listed in ifunc-impl-list.c. >> > + Copyright (C) 2023 Free Software Foundation, Inc. >> > + This file is part of the GNU C Library. >> > + >> > + The GNU C Library is free software; you can redistribute it and/or >> > + modify it under the terms of the GNU Lesser General Public >> > + License as published by the Free Software Foundation; either >> > + version 2.1 of the License, or (at your option) any later version. >> > + >> > + The GNU C Library is distributed in the hope that it will be useful, >> > + but WITHOUT ANY WARRANTY; without even the implied warranty of >> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >> > + Lesser General Public License for more details. >> > + >> > + You should have received a copy of the GNU Lesser General Public >> > + License along with the GNU C Library; if not, see >> > + <https://www.gnu.org/licenses/>. */ >> > + >> > +#include <init-arch.h> >> > + >> > +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; >> > +extern __typeof (REDIRECT_NAME) OPTIMIZE (generic) attribute_hidden; >> > + >> > +static inline void * >> > +IFUNC_SELECTOR (void) >> > +{ >> > + const struct cpu_features *cpu_features = __get_cpu_features (); >> > + >> > + if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX2)) >> > + return OPTIMIZE (avx2); >> > + >> > + return OPTIMIZE (generic); >> > +} >> > diff --git a/sysdeps/x86_64/multiarch/strlcpy-avx2.S b/sysdeps/x86_64/multiarch/strlcpy-avx2.S >> > new file mode 100644 >> > index 0000000000..cf54b1e990 >> > --- /dev/null >> > +++ b/sysdeps/x86_64/multiarch/strlcpy-avx2.S >> > @@ -0,0 +1,446 @@ >> > +/* Strlcpy/wcslcpy optimized with AVX2. >> > + Copyright (C) 2023 Free Software Foundation, Inc. >> > + This file is part of the GNU C Library. 
>> > + >> > + The GNU C Library is free software; you can redistribute it and/or >> > + modify it under the terms of the GNU Lesser General Public >> > + License as published by the Free Software Foundation; either >> > + version 2.1 of the License, or (at your option) any later version. >> > + >> > + The GNU C Library is distributed in the hope that it will be useful, >> > + but WITHOUT ANY WARRANTY; without even the implied warranty of >> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >> > + Lesser General Public License for more details. >> > + >> > + You should have received a copy of the GNU Lesser General Public >> > + License along with the GNU C Library; if not, see >> > + <https://www.gnu.org/licenses/>. */ >> > + >> > +#include <isa-level.h> >> > + >> > +#if ISA_SHOULD_BUILD (3) >> > + >> > +# include <sysdep.h> >> > + >> > +# ifndef VEC_SIZE >> > +# include "x86-avx-vecs.h" >> > +# endif >> > + >> > +# ifndef STRLCPY >> > +# define STRLCPY __strlcpy_avx2 >> > +# endif >> > + >> > + >> > +# ifdef USE_AS_WCSLCPY >> > +# define CHAR_SIZE 4 >> > +# define MOVU movl >> > +# define VPCMPEQ vpcmpeqd >> > +# define VPMINU vpminud >> > +# else >> > +# define CHAR_SIZE 1 >> > +# define MOVU movb >> > +# define VPCMPEQ vpcmpeqb >> > +# define VPMINU vpminub >> > +# endif >> > + >> > +# define PMOVMSK vpmovmskb >> > +# define PAGE_SIZE 4096 >> > +# define VEC_SIZE 32 >> > +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) >> > + >> > + .section SECTION(.text),"ax",@progbits >> > +/* Aligning entry point to 64 byte, provides better performance for >> > + one vector length string. */ >> > + >> > +ENTRY_P2ALIGN (STRLCPY, 6) >> > +# ifdef __ILP32__ >> > + /* Clear the upper 32 bits. */ >> > + movl %edx, %edx >> > +# endif >> > + >> > + /* Zero out vector register for end of string comparison. */ >> > + vpxor %VMM(0), %VMM(0), %VMM(0) >> > + /* Save source pointer for return calculation. 
*/ >> > + mov %rsi, %r8 >> > + mov %esi, %eax >> > + sall $20, %eax >> > + cmpl $((PAGE_SIZE - (VEC_SIZE)) << 20), %eax >> > + ja L(page_cross) >> > + >> > +L(page_cross_continue): >> > + /* Load first vector. */ >> > + VMOVU (%rsi), %VMM(1) >> > + VPCMPEQ %VMM(0), %VMM(1), %VMM(2) >> > + PMOVMSK %VMM(2), %eax >> > + test %eax, %eax >> > + jnz L(ret_vec_x1) >> > + >> > + test %rdx, %rdx >> > + jz L(continue_second_vector) >> > + >> > + /* Check whether we can copy full vector. */ >> > + cmp $CHAR_PER_VEC, %rdx >> > + jbe L(page_cross_small_vec_copy) >> > + /* Copy first vector. */ >> > + VMOVU %VMM(1), (%rdi) >> > + sub $CHAR_PER_VEC, %rdx >> > + >> > +L(continue_second_vector): >> > + /* Align RSI pointer and adjust RDI based on offset. */ >> > + mov %rsi, %rax >> > + and $-VEC_SIZE, %rsi >> > + sub %rsi, %rax >> > + sub %rax, %rdi >> > + >> > + /* Check if string already copied N char, and RDX is 0. */ >> > + test %rdx, %rdx >> > + jz L(skip_copy_alignment_fix) >> > + >> > + /* Adjust RDX for copy alignment fix. */ >> > +# ifdef USE_AS_WCSLCPY >> > + shr $2, %rax >> > +# endif >> > + add %rax, %rdx >> > + >> > +L(skip_copy_alignment_fix): >> > + /* Load second vector. */ >> > + VMOVA (VEC_SIZE * 1)(%rsi), %VMM(1) >> > + VPCMPEQ %VMM(0), %VMM(1), %VMM(2) >> > + vptest %VMM(2), %VMM(2) >> > + jnz L(ret_vec_x2) >> > + >> > + /* Skip copy if RDX is 0. */ >> > + test %rdx, %rdx >> > + jz L(continue_third_vector) >> > + >> > + /* Jump below/equal(instead of below) used here, because last >> > + copy character must be NULL. */ >> > + cmp $CHAR_PER_VEC, %rdx >> > + jbe L(partial_copy_second_vector) >> > + >> > + sub $CHAR_PER_VEC, %rdx >> > + /* Copy second vector. */ >> > + VMOVU %VMM(1), (VEC_SIZE * 1)(%rdi) >> > + >> > +L(continue_third_vector): >> > + /* Load third vector. */ >> > + VMOVA (VEC_SIZE * 2)(%rsi), %VMM(1) >> > + VPCMPEQ %VMM(0), %VMM(1), %VMM(2) >> > + vptest %VMM(2), %VMM(2) >> > + jnz L(ret_vec_x3) >> > + >> > + /* Skip copy if RDX is 0. 
*/ >> > + test %rdx, %rdx >> > + jz L(continue_fourth_vector) >> > + >> > + cmp $CHAR_PER_VEC, %rdx >> > + jbe L(partial_copy_third_vector) >> > + >> > + sub $CHAR_PER_VEC, %rdx >> > + /* Copy third vector. */ >> > + VMOVU %VMM(1), (VEC_SIZE * 2)(%rdi) >> > + >> > +L(continue_fourth_vector): >> > + /* Load fourth vector. */ >> > + VMOVA (VEC_SIZE * 3)(%rsi), %VMM(1) >> > + VPCMPEQ %VMM(0), %VMM(1), %VMM(2) >> > + vptest %VMM(2), %VMM(2) >> > + jnz L(ret_vec_x4) >> > + >> > + /* Skip copy if RDX is 0. */ >> > + test %rdx, %rdx >> > + jz L(loop_4x_align) >> > + >> > + cmp $CHAR_PER_VEC, %rdx >> > + jbe L(partial_copy_fourth_vector) >> > + >> > + sub $CHAR_PER_VEC, %rdx >> > + /* Copy fourth vector. */ >> > + VMOVU %VMM(1), (VEC_SIZE * 3)(%rdi) >> > + >> > + >> > +L(loop_4x_align): >> > + /* Jump to loop if RSI is already 4 vector align. */ >> > + test $(VEC_SIZE * 4 - 1), %esi >> > + jz L(loop_4x_read) >> > + >> > + mov %rsi, %rcx >> > + >> > + /* Align RSI to 4x vector. */ >> > + and $(VEC_SIZE * -4), %rsi >> > + sub %rsi, %rcx >> > + >> > + /* Adjust RDI for RSI alignment fix. */ >> > + sub %rcx, %rdi >> > + >> > + /* Jump to loop if RDX is 0. */ >> > + test %rdx, %rdx >> > + jz L(loop_4x_read) >> > + >> > +# ifdef USE_AS_WCSLCPY >> > + shr $2, %rcx >> > +# endif >> > + >> > + /* Adjust RDX for RSI alignment fix. */ >> > + add %rcx, %rdx >> > + jmp L(loop_4x_read) >> > + >> > + .p2align 4,,6 >> > +L(loop_4x_vec): >> > + /* Skip copy if RDX is 0. 
*/ >> > + test %rdx, %rdx >> > + jz L(loop_partial_copy_return) >> > + cmp $(CHAR_PER_VEC * 4), %rdx >> > + jbe L(loop_partial_copy) >> > + VMOVU %VMM(1), (VEC_SIZE * 4)(%rdi) >> > + VMOVU %VMM(2), (VEC_SIZE * 5)(%rdi) >> > + VMOVU %VMM(3), (VEC_SIZE * 6)(%rdi) >> > + VMOVU %VMM(4), (VEC_SIZE * 7)(%rdi) >> > + sub $(CHAR_PER_VEC * 4), %rdx >> > + >> > +L(loop_partial_copy_return): >> > + sub $(VEC_SIZE * -4), %rsi >> > + sub $(VEC_SIZE * -4), %rdi >> > + >> > +L(loop_4x_read): >> > + VMOVA (VEC_SIZE * 4)(%rsi), %VMM(1) >> > + VMOVA (VEC_SIZE * 5)(%rsi), %VMM(2) >> > + VMOVA (VEC_SIZE * 6)(%rsi), %VMM(3) >> > + VMOVA (VEC_SIZE * 7)(%rsi), %VMM(4) >> > + VPMINU %VMM(1), %VMM(2), %VMM(5) >> > + VPMINU %VMM(3), %VMM(4), %VMM(6) >> > + VPMINU %VMM(5), %VMM(6), %VMM(7) >> > + VPCMPEQ %VMM(0), %VMM(7), %VMM(7) >> > + vptest %VMM(7), %VMM(7) >> > + >> > + jz L(loop_4x_vec) >> > + >> > + /* Check if string ends in first vector or second vector. */ >> > + lea (VEC_SIZE * 4)(%rsi), %rax >> > + sub %r8, %rax >> > +# ifdef USE_AS_WCSLCPY >> > + shr $2, %rax >> > +# endif >> > + xor %r10, %r10 >> > + VPCMPEQ %VMM(0), %VMM(5), %VMM(6) >> > + vptest %VMM(6), %VMM(6) >> > + jnz L(endloop) >> > + sub $(CHAR_PER_VEC * -2), %rax >> > + mov $(CHAR_PER_VEC * 2), %r10 >> > + VMOVA %VMM(3), %VMM(1) >> > + VMOVA %VMM(4), %VMM(2) >> > + >> > +L(endloop): >> > + VPCMPEQ %VMM(0), %VMM(1), %VMM(1) >> > + VPCMPEQ %VMM(0), %VMM(2), %VMM(2) >> > + PMOVMSK %VMM(1), %rcx >> > + PMOVMSK %VMM(2), %r9 >> > + shlq $32, %r9 >> > + orq %r9, %rcx >> > + bsf %rcx, %rcx >> > + /* Shift RCX by 2, VPMOVMSK has only byte version. */ >> > +# ifdef USE_AS_WCSLCPY >> > + shr $2, %rcx >> > +# endif >> > + /* At this point RAX has length to return. */ >> > + add %rcx, %rax >> > + test %rdx, %rdx >> > + jz L(ret) >> > + >> > + /* Add 1 to account for NULL character in RDX comparison. 
*/ >> > + lea 1(%r10, %rcx), %rcx >> > + cmp %rdx, %rcx >> > + cmovb %rcx, %rdx >> > + >> > +L(loop_partial_copy): >> > + cmp $(CHAR_PER_VEC * 2), %rdx >> > + jbe L(loop_partial_first_half) >> > + /* Reload first 2 vector. */ >> > + VMOVA (VEC_SIZE * 4)(%rsi), %VMM(1) >> > + VMOVA (VEC_SIZE * 5)(%rsi), %VMM(2) >> > + VMOVU %VMM(1), (VEC_SIZE * 4)(%rdi) >> > + VMOVU %VMM(2), (VEC_SIZE * 5)(%rdi) >> > + >> > +L(loop_partial_first_half): >> > + /* Go back 2 vector from last and use overlapping copy. >> > + (VEC_SIZE * 4 - VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE) >> > + (VEC_SIZE * 4 - VEC_SIZE * 1)(%rsi, %rdx, CHAR_SIZE) >> > + */ >> > + VMOVU (VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %VMM(3) >> > + VMOVU (VEC_SIZE * 3)(%rsi, %rdx, CHAR_SIZE), %VMM(4) >> > + VMOVU %VMM(3), (VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE) >> > + VMOVU %VMM(4), (VEC_SIZE * 3)(%rdi, %rdx, CHAR_SIZE) >> > + MOVU $0, (VEC_SIZE * 4 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE) >> > + xor %rdx, %rdx >> > + vptest %VMM(7), %VMM(7) >> > + jz L(loop_partial_copy_return) >> > + ret >> > + >> > + .p2align 4 >> > +L(page_cross): >> > + mov %rsi, %rcx >> > + mov %rsi, %r11 >> > + and $-VEC_SIZE, %r11 >> > + and $(VEC_SIZE - 1), %rcx >> > + VMOVA (%r11), %VMM(1) >> > + VPCMPEQ %VMM(0), %VMM(1), %VMM(2) >> > + PMOVMSK %VMM(2), %eax >> > + shr %cl, %eax >> > + jz L(page_cross_continue) >> > + >> > +L(ret_vec_x1): >> > + bsf %eax, %eax >> > +# ifdef USE_AS_WCSLCPY >> > + shr $2, %eax >> > +# endif >> > + /* Increment by 1 to account for NULL char. 
*/ >> > + lea 1(%eax), %ecx >> > + cmp %rdx, %rcx >> > + cmovb %rcx, %rdx >> > + test %rdx, %rdx >> > + jz L(ret) >> > + >> > +L(page_cross_small_vec_copy): >> > + cmp $(16 / CHAR_SIZE), %rdx >> > + jbe L(copy_8_byte_scalar) >> > + VMOVU (%rsi), %VMM_128(1) >> > + VMOVU -16(%rsi, %rdx, CHAR_SIZE), %VMM_128(3) >> > + VMOVU %VMM_128(1), (%rdi) >> > + VMOVU %VMM_128(3), -16(%rdi, %rdx, CHAR_SIZE) >> > + MOVU $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) >> > + xor %rdx, %rdx >> > + vptest %VMM(2), %VMM(2) >> > + jz L(continue_second_vector) >> > + ret >> > + >> > +L(copy_8_byte_scalar): >> > + cmp $(8 / CHAR_SIZE), %rdx >> > + jbe L(copy_4_byte_scalar) >> > + movq (%rsi), %r10 >> > + movq -8(%rsi, %rdx, CHAR_SIZE), %r11 >> > + movq %r10, (%rdi) >> > + movq %r11, -8(%rdi, %rdx, CHAR_SIZE) >> > + MOVU $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) >> > + xor %edx, %edx >> > + vptest %VMM(2), %VMM(2) >> > + jz L(continue_second_vector) >> > + ret >> > + >> > +L(copy_4_byte_scalar): >> > +# ifndef USE_AS_WCSLCPY >> > + cmp $4, %rdx >> > + jbe L(copy_2_byte_scalar) >> > +# endif >> > + movl (%rsi), %r10d >> > + movl -4(%rsi, %rdx, CHAR_SIZE), %r11d >> > + movl %r10d, (%rdi) >> > + movl %r11d, -4(%rdi, %rdx, CHAR_SIZE) >> > + MOVU $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) >> > + xor %edx, %edx >> > + vptest %VMM(2), %VMM(2) >> > + jz L(continue_second_vector) >> > + ret >> > + >> > +# ifndef USE_AS_WCSLCPY >> > +L(copy_2_byte_scalar): >> > + cmp $2, %rdx >> > + jbe L(copy_1_byte_scalar) >> > + movw (%rsi), %r10w >> > + movw -(CHAR_SIZE * 3)(%rsi, %rdx, CHAR_SIZE), %r11w >> > + movw %r10w, (%rdi) >> > + movw %r11w, -(CHAR_SIZE * 3)(%rdi, %rdx, CHAR_SIZE) >> > + MOVU $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) >> > + xor %edx, %edx >> > + vptest %VMM(2), %VMM(2) >> > + jz L(continue_second_vector) >> > + ret >> > + >> > +L(copy_1_byte_scalar): >> > + MOVU (%rsi), %r10b >> > + MOVU %r10b, (%rdi) >> > + MOVU $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) >> > + xor %edx, %edx >> > + 
vptest %VMM(2), %VMM(2) >> > + jz L(continue_second_vector) >> > + ret >> > +# endif >> > + >> > +L(ret_vec_x2): >> > + PMOVMSK %VMM(2), %rax >> > + bsf %rax, %rcx >> > + /* Calculate return value. */ >> > + lea VEC_SIZE(%rsi, %rcx), %rax >> > + sub %r8, %rax >> > +# ifdef USE_AS_WCSLCPY >> > + shr $2, %rax >> > + shr $2, %rcx >> > +# endif >> > + inc %rcx >> > + test %rdx, %rdx >> > + jz L(ret) >> > + cmp %rdx, %rcx >> > + cmovb %rcx, %rdx >> > + >> > +L(partial_copy_second_vector): >> > + VMOVU (%rsi, %rdx, CHAR_SIZE), %VMM(1) >> > + VMOVU %VMM(1), (%rdi, %rdx, CHAR_SIZE) >> > + MOVU $0, (VEC_SIZE - CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) >> > + xor %edx, %edx >> > + vptest %VMM(2), %VMM(2) >> > + jz L(continue_third_vector) >> > + >> > +L(ret): >> > + ret >> > + >> > +L(ret_vec_x3): >> > + PMOVMSK %VMM(2), %rax >> > + bsf %rax, %rcx >> > + /* Calculate return value. */ >> > + lea (VEC_SIZE * 2)(%rsi, %rcx), %rax >> > + sub %r8, %rax >> > +# ifdef USE_AS_WCSLCPY >> > + shr $2, %rax >> > + shr $2, %rcx >> > +# endif >> > + inc %rcx >> > + test %rdx, %rdx >> > + jz L(ret) >> > + cmp %rdx, %rcx >> > + cmovb %rcx, %rdx >> > + >> > +L(partial_copy_third_vector): >> > + VMOVU (VEC_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM(1) >> > + VMOVU %VMM(1), (VEC_SIZE)(%rdi, %rdx, CHAR_SIZE) >> > + MOVU $0, ((VEC_SIZE * 2) - CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) >> > + xor %edx, %edx >> > + vptest %VMM(2), %VMM(2) >> > + jz L(continue_fourth_vector) >> > + ret >> > + >> > +L(ret_vec_x4): >> > + PMOVMSK %VMM(2), %rax >> > + bsf %rax, %rcx >> > + /* Calculate return value. 
*/ >> > + lea (VEC_SIZE * 3)(%rsi, %rcx), %rax >> > + sub %r8, %rax >> > +# ifdef USE_AS_WCSLCPY >> > + shr $2, %rax >> > + shr $2, %rcx >> > +# endif >> > + inc %rcx >> > + test %rdx, %rdx >> > + jz L(ret) >> > + cmp %rdx, %rcx >> > + cmovb %rcx, %rdx >> > + >> > +L(partial_copy_fourth_vector): >> > + VMOVU (VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %VMM(1) >> > + VMOVU %VMM(1), (VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE) >> > + MOVU $0, ((VEC_SIZE * 3) - CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) >> > + xor %edx, %edx >> > + vptest %VMM(2), %VMM(2) >> > + jz L(continue_fourth_vector) >> > + ret >> > + >> > +END (STRLCPY) >> >> Is strlcpy/strlcat integratable with existing strncat impl? Had >> figured they would >> fit in the same file. > > > Hi Noah, > > It may not be a good idea to put strlcpy/strlcat in the existing strncpy/strncat impl file, > as strlcpy/strlcat functions are associated with GLIBC_2.38 ABI. > Well, we can put the impl there and include it from another to manage any special link cases. > --Sunil > >> >> > +#endif >> > diff --git a/sysdeps/x86_64/multiarch/strlcpy-generic.c b/sysdeps/x86_64/multiarch/strlcpy-generic.c >> > new file mode 100644 >> > index 0000000000..eee3b7b086 >> > --- /dev/null >> > +++ b/sysdeps/x86_64/multiarch/strlcpy-generic.c >> > @@ -0,0 +1,25 @@ >> > +/* strlcpy generic. >> > + Copyright (C) 2023 Free Software Foundation, Inc. >> > + This file is part of the GNU C Library. >> > + >> > + The GNU C Library is free software; you can redistribute it and/or >> > + modify it under the terms of the GNU Lesser General Public >> > + License as published by the Free Software Foundation; either >> > + version 2.1 of the License, or (at your option) any later version. >> > + >> > + The GNU C Library is distributed in the hope that it will be useful, >> > + but WITHOUT ANY WARRANTY; without even the implied warranty of >> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >> > + Lesser General Public License for more details. 
>> > + >> > + You should have received a copy of the GNU Lesser General Public >> > + License along with the GNU C Library; if not, see >> > + <https://www.gnu.org/licenses/>. */ >> > + >> > + >> > +#include <isa-level.h> >> > +#if ISA_SHOULD_BUILD (1) >> > +# define __strlcpy __strlcpy_generic >> > +# include <string/strlcpy.c> >> > + >> > +#endif >> > diff --git a/sysdeps/x86_64/multiarch/strlcpy.c b/sysdeps/x86_64/multiarch/strlcpy.c >> > new file mode 100644 >> > index 0000000000..ded41fbcfb >> > --- /dev/null >> > +++ b/sysdeps/x86_64/multiarch/strlcpy.c >> > @@ -0,0 +1,36 @@ >> > +/* Multiple versions of strlcpy. >> > + All versions must be listed in ifunc-impl-list.c. >> > + Copyright (C) 2023 Free Software Foundation, Inc. >> > + This file is part of the GNU C Library. >> > + >> > + The GNU C Library is free software; you can redistribute it and/or >> > + modify it under the terms of the GNU Lesser General Public >> > + License as published by the Free Software Foundation; either >> > + version 2.1 of the License, or (at your option) any later version. >> > + >> > + The GNU C Library is distributed in the hope that it will be useful, >> > + but WITHOUT ANY WARRANTY; without even the implied warranty of >> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >> > + Lesser General Public License for more details. >> > + >> > + You should have received a copy of the GNU Lesser General Public >> > + License along with the GNU C Library; if not, see >> > + <https://www.gnu.org/licenses/>. */ >> > + >> > +/* Define multiple versions only for the definition in libc. 
*/ >> > +#if IS_IN (libc) >> > +# define __strlcpy __redirect_strlcpy >> > +# include <string.h> >> > +# undef __strlcpy >> > + >> > +# define SYMBOL_NAME strlcpy >> > +# include "ifunc-strlcpy.h" >> > + >> > +libc_ifunc_redirected (__redirect_strlcpy, __strlcpy, IFUNC_SELECTOR ()); >> > +weak_alias (__strlcpy, strlcpy) >> > + >> > +# ifdef SHARED >> > +__hidden_ver1 (__strlcpy, __GI___strlcpy, __redirect_strlcpy) >> > + __attribute__ ((visibility ("hidden"))) __attribute_copy__ (strlcpy); >> > +# endif >> > +#endif >> > diff --git a/sysdeps/x86_64/multiarch/wcslcpy-avx2.S b/sysdeps/x86_64/multiarch/wcslcpy-avx2.S >> > new file mode 100644 >> > index 0000000000..dafc20ded0 >> > --- /dev/null >> > +++ b/sysdeps/x86_64/multiarch/wcslcpy-avx2.S >> > @@ -0,0 +1,4 @@ >> > +#define STRLCPY __wcslcpy_avx2 >> > +#define USE_AS_WCSLCPY 1 >> > + >> > +#include "strlcpy-avx2.S" >> > diff --git a/sysdeps/x86_64/multiarch/wcslcpy-generic.c b/sysdeps/x86_64/multiarch/wcslcpy-generic.c >> > new file mode 100644 >> > index 0000000000..ffd3c0e846 >> > --- /dev/null >> > +++ b/sysdeps/x86_64/multiarch/wcslcpy-generic.c >> > @@ -0,0 +1,25 @@ >> > +/* wcslcpy generic. >> > + Copyright (C) 2023 Free Software Foundation, Inc. >> > + This file is part of the GNU C Library. >> > + >> > + The GNU C Library is free software; you can redistribute it and/or >> > + modify it under the terms of the GNU Lesser General Public >> > + License as published by the Free Software Foundation; either >> > + version 2.1 of the License, or (at your option) any later version. >> > + >> > + The GNU C Library is distributed in the hope that it will be useful, >> > + but WITHOUT ANY WARRANTY; without even the implied warranty of >> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >> > + Lesser General Public License for more details. 
>> > + >> > + You should have received a copy of the GNU Lesser General Public >> > + License along with the GNU C Library; if not, see >> > + <https://www.gnu.org/licenses/>. */ >> > + >> > + >> > +#include <isa-level.h> >> > +#if ISA_SHOULD_BUILD (1) >> > +# define __wcslcpy __wcslcpy_generic >> > +# include <wcsmbs/wcslcpy.c> >> > + >> > +#endif >> > diff --git a/sysdeps/x86_64/multiarch/wcslcpy.c b/sysdeps/x86_64/multiarch/wcslcpy.c >> > new file mode 100644 >> > index 0000000000..371ef9626c >> > --- /dev/null >> > +++ b/sysdeps/x86_64/multiarch/wcslcpy.c >> > @@ -0,0 +1,35 @@ >> > +/* Multiple versions of wcslcpy. >> > + All versions must be listed in ifunc-impl-list.c. >> > + Copyright (C) 2023 Free Software Foundation, Inc. >> > + This file is part of the GNU C Library. >> > + >> > + The GNU C Library is free software; you can redistribute it and/or >> > + modify it under the terms of the GNU Lesser General Public >> > + License as published by the Free Software Foundation; either >> > + version 2.1 of the License, or (at your option) any later version. >> > + >> > + The GNU C Library is distributed in the hope that it will be useful, >> > + but WITHOUT ANY WARRANTY; without even the implied warranty of >> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >> > + Lesser General Public License for more details. >> > + >> > + You should have received a copy of the GNU Lesser General Public >> > + License along with the GNU C Library; if not, see >> > + <https://www.gnu.org/licenses/>. */ >> > + >> > +/* Define multiple versions only for the definition in libc. 
*/ >> > +#if IS_IN (libc) >> > +# define __wcslcpy __redirect_wcslcpy >> > +# include <wchar.h> >> > +# undef __wcslcpy >> > + >> > +# define SYMBOL_NAME wcslcpy >> > +# include "ifunc-strlcpy.h" >> > + >> > +libc_ifunc_redirected (__redirect_wcslcpy, __wcslcpy, IFUNC_SELECTOR ()); >> > +weak_alias (__wcslcpy, wcslcpy) >> > +# ifdef SHARED >> > +__hidden_ver1 (__wcslcpy, __GI___wcslcpy, __redirect_wcslcpy) >> > + __attribute__((visibility ("hidden"))) __attribute_copy__ (wcslcpy); >> > +# endif >> > +#endif >> > -- >> > 2.38.1 >> > ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH] x86_64: Implement AVX2 version of strlcpy/wcslcpy function 2023-07-02 18:54 ` Noah Goldstein @ 2023-07-03 1:03 ` Sunil Pandey 2023-07-03 1:47 ` Noah Goldstein 0 siblings, 1 reply; 24+ messages in thread From: Sunil Pandey @ 2023-07-03 1:03 UTC (permalink / raw) To: Noah Goldstein; +Cc: libc-alpha, hjl.tools [-- Attachment #1: Type: text/plain, Size: 31468 bytes --] On Sun, Jul 2, 2023 at 11:54 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > On Sun, Jul 2, 2023 at 1:38 PM Sunil Pandey <skpgkp2@gmail.com> wrote: > > > > > > > > On Sun, Jul 2, 2023 at 10:03 AM Noah Goldstein <goldstein.w.n@gmail.com> > wrote: > >> > >> On Fri, Jun 30, 2023 at 3:48 PM Sunil K Pandey via Libc-alpha > >> <libc-alpha@sourceware.org> wrote: > >> > > >> > This patch optimizes strlcpy/wsclcpy string functions for AVX2. > >> > --- > >> > sysdeps/x86_64/multiarch/Makefile | 4 + > >> > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 18 + > >> > sysdeps/x86_64/multiarch/ifunc-strlcpy.h | 34 ++ > >> > sysdeps/x86_64/multiarch/strlcpy-avx2.S | 446 > +++++++++++++++++++++ > >> > sysdeps/x86_64/multiarch/strlcpy-generic.c | 25 ++ > >> > sysdeps/x86_64/multiarch/strlcpy.c | 36 ++ > >> > sysdeps/x86_64/multiarch/wcslcpy-avx2.S | 4 + > >> > sysdeps/x86_64/multiarch/wcslcpy-generic.c | 25 ++ > >> > sysdeps/x86_64/multiarch/wcslcpy.c | 35 ++ > >> > 9 files changed, 627 insertions(+) > >> > create mode 100644 sysdeps/x86_64/multiarch/ifunc-strlcpy.h > >> > create mode 100644 sysdeps/x86_64/multiarch/strlcpy-avx2.S > >> > create mode 100644 sysdeps/x86_64/multiarch/strlcpy-generic.c > >> > create mode 100644 sysdeps/x86_64/multiarch/strlcpy.c > >> > create mode 100644 sysdeps/x86_64/multiarch/wcslcpy-avx2.S > >> > create mode 100644 sysdeps/x86_64/multiarch/wcslcpy-generic.c > >> > create mode 100644 sysdeps/x86_64/multiarch/wcslcpy.c > >> > > >> > diff --git a/sysdeps/x86_64/multiarch/Makefile > b/sysdeps/x86_64/multiarch/Makefile > >> > index e1e894c963..7e3fc081df 100644 > >> > --- 
a/sysdeps/x86_64/multiarch/Makefile > >> > +++ b/sysdeps/x86_64/multiarch/Makefile > >> > @@ -82,6 +82,8 @@ sysdep_routines += \ > >> > strcpy-sse2 \ > >> > strcpy-sse2-unaligned \ > >> > strcspn-sse4 \ > >> > + strlcpy-avx2 \ > >> > + strlcpy-generic \ > >> > strlen-avx2 \ > >> > strlen-avx2-rtm \ > >> > strlen-evex \ > >> > @@ -153,6 +155,8 @@ sysdep_routines += \ > >> > wcscpy-evex \ > >> > wcscpy-generic \ > >> > wcscpy-ssse3 \ > >> > + wcslcpy-avx2 \ > >> > + wcslcpy-generic \ > >> > wcslen-avx2 \ > >> > wcslen-avx2-rtm \ > >> > wcslen-evex \ > >> > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c > b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > >> > index 5427ff1907..9928dee187 100644 > >> > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c > >> > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > >> > @@ -751,6 +751,15 @@ __libc_ifunc_impl_list (const char *name, struct > libc_ifunc_impl *array, > >> > 1, > >> > __strncat_sse2_unaligned)) > >> > > >> > + /* Support sysdeps/x86_64/multiarch/strlcpy.c. */ > >> > + IFUNC_IMPL (i, name, strlcpy, > >> > + X86_IFUNC_IMPL_ADD_V3 (array, i, strlcpy, > >> > + CPU_FEATURE_USABLE (AVX2), > >> > + __strlcpy_avx2) > >> > + X86_IFUNC_IMPL_ADD_V1 (array, i, strlcpy, > >> > + 1, > >> > + __strlcpy_generic)) > >> > + > >> > /* Support sysdeps/x86_64/multiarch/strncpy.c. */ > >> > IFUNC_IMPL (i, name, strncpy, > >> > X86_IFUNC_IMPL_ADD_V4 (array, i, strncpy, > >> > @@ -917,6 +926,15 @@ __libc_ifunc_impl_list (const char *name, struct > libc_ifunc_impl *array, > >> > 1, > >> > __wcscpy_generic)) > >> > > >> > + /* Support sysdeps/x86_64/multiarch/wcslcpy.c. */ > >> > + IFUNC_IMPL (i, name, wcslcpy, > >> > + X86_IFUNC_IMPL_ADD_V3 (array, i, wcslcpy, > >> > + CPU_FEATURE_USABLE (AVX2), > >> > + __wcslcpy_avx2) > >> > + X86_IFUNC_IMPL_ADD_V1 (array, i, wcslcpy, > >> > + 1, > >> > + __wcslcpy_generic)) > >> > + > >> > /* Support sysdeps/x86_64/multiarch/wcsncpy.c. 
*/ > >> > IFUNC_IMPL (i, name, wcsncpy, > >> > X86_IFUNC_IMPL_ADD_V4 (array, i, wcsncpy, > >> > diff --git a/sysdeps/x86_64/multiarch/ifunc-strlcpy.h > b/sysdeps/x86_64/multiarch/ifunc-strlcpy.h > >> > new file mode 100644 > >> > index 0000000000..982a30d15b > >> > --- /dev/null > >> > +++ b/sysdeps/x86_64/multiarch/ifunc-strlcpy.h > >> > @@ -0,0 +1,34 @@ > >> > +/* Common definition for ifunc selections. > >> > + All versions must be listed in ifunc-impl-list.c. > >> > + Copyright (C) 2023 Free Software Foundation, Inc. > >> > + This file is part of the GNU C Library. > >> > + > >> > + The GNU C Library is free software; you can redistribute it and/or > >> > + modify it under the terms of the GNU Lesser General Public > >> > + License as published by the Free Software Foundation; either > >> > + version 2.1 of the License, or (at your option) any later version. > >> > + > >> > + The GNU C Library is distributed in the hope that it will be > useful, > >> > + but WITHOUT ANY WARRANTY; without even the implied warranty of > >> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > >> > + Lesser General Public License for more details. > >> > + > >> > + You should have received a copy of the GNU Lesser General Public > >> > + License along with the GNU C Library; if not, see > >> > + <https://www.gnu.org/licenses/>. 
*/ > >> > + > >> > +#include <init-arch.h> > >> > + > >> > +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; > >> > +extern __typeof (REDIRECT_NAME) OPTIMIZE (generic) attribute_hidden; > >> > + > >> > +static inline void * > >> > +IFUNC_SELECTOR (void) > >> > +{ > >> > + const struct cpu_features *cpu_features = __get_cpu_features (); > >> > + > >> > + if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX2)) > >> > + return OPTIMIZE (avx2); > >> > + > >> > + return OPTIMIZE (generic); > >> > +} > >> > diff --git a/sysdeps/x86_64/multiarch/strlcpy-avx2.S > b/sysdeps/x86_64/multiarch/strlcpy-avx2.S > >> > new file mode 100644 > >> > index 0000000000..cf54b1e990 > >> > --- /dev/null > >> > +++ b/sysdeps/x86_64/multiarch/strlcpy-avx2.S > >> > @@ -0,0 +1,446 @@ > >> > +/* Strlcpy/wcslcpy optimized with AVX2. > >> > + Copyright (C) 2023 Free Software Foundation, Inc. > >> > + This file is part of the GNU C Library. > >> > + > >> > + The GNU C Library is free software; you can redistribute it and/or > >> > + modify it under the terms of the GNU Lesser General Public > >> > + License as published by the Free Software Foundation; either > >> > + version 2.1 of the License, or (at your option) any later version. > >> > + > >> > + The GNU C Library is distributed in the hope that it will be > useful, > >> > + but WITHOUT ANY WARRANTY; without even the implied warranty of > >> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > >> > + Lesser General Public License for more details. > >> > + > >> > + You should have received a copy of the GNU Lesser General Public > >> > + License along with the GNU C Library; if not, see > >> > + <https://www.gnu.org/licenses/>. 
*/ > >> > + > >> > +#include <isa-level.h> > >> > + > >> > +#if ISA_SHOULD_BUILD (3) > >> > + > >> > +# include <sysdep.h> > >> > + > >> > +# ifndef VEC_SIZE > >> > +# include "x86-avx-vecs.h" > >> > +# endif > >> > + > >> > +# ifndef STRLCPY > >> > +# define STRLCPY __strlcpy_avx2 > >> > +# endif > >> > + > >> > + > >> > +# ifdef USE_AS_WCSLCPY > >> > +# define CHAR_SIZE 4 > >> > +# define MOVU movl > >> > +# define VPCMPEQ vpcmpeqd > >> > +# define VPMINU vpminud > >> > +# else > >> > +# define CHAR_SIZE 1 > >> > +# define MOVU movb > >> > +# define VPCMPEQ vpcmpeqb > >> > +# define VPMINU vpminub > >> > +# endif > >> > + > >> > +# define PMOVMSK vpmovmskb > >> > +# define PAGE_SIZE 4096 > >> > +# define VEC_SIZE 32 > >> > +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) > >> > + > >> > + .section SECTION(.text),"ax",@progbits > >> > +/* Aligning entry point to 64 byte, provides better performance for > >> > + one vector length string. */ > >> > + > >> > +ENTRY_P2ALIGN (STRLCPY, 6) > >> > +# ifdef __ILP32__ > >> > + /* Clear the upper 32 bits. */ > >> > + movl %edx, %edx > >> > +# endif > >> > + > >> > + /* Zero out vector register for end of string comparison. */ > >> > + vpxor %VMM(0), %VMM(0), %VMM(0) > >> > + /* Save source pointer for return calculation. */ > >> > + mov %rsi, %r8 > >> > + mov %esi, %eax > >> > + sall $20, %eax > >> > + cmpl $((PAGE_SIZE - (VEC_SIZE)) << 20), %eax > >> > + ja L(page_cross) > >> > + > >> > +L(page_cross_continue): > >> > + /* Load first vector. */ > >> > + VMOVU (%rsi), %VMM(1) > >> > + VPCMPEQ %VMM(0), %VMM(1), %VMM(2) > >> > + PMOVMSK %VMM(2), %eax > >> > + test %eax, %eax > >> > + jnz L(ret_vec_x1) > >> > + > >> > + test %rdx, %rdx > >> > + jz L(continue_second_vector) > >> > + > >> > + /* Check whether we can copy full vector. */ > >> > + cmp $CHAR_PER_VEC, %rdx > >> > + jbe L(page_cross_small_vec_copy) > >> > + /* Copy first vector. 
*/ > >> > + VMOVU %VMM(1), (%rdi) > >> > + sub $CHAR_PER_VEC, %rdx > >> > + > >> > +L(continue_second_vector): > >> > + /* Align RSI pointer and adjust RDI based on offset. */ > >> > + mov %rsi, %rax > >> > + and $-VEC_SIZE, %rsi > >> > + sub %rsi, %rax > >> > + sub %rax, %rdi > >> > + > >> > + /* Check if string already copied N char, and RDX is 0. */ > >> > + test %rdx, %rdx > >> > + jz L(skip_copy_alignment_fix) > >> > + > >> > + /* Adjust RDX for copy alignment fix. */ > >> > +# ifdef USE_AS_WCSLCPY > >> > + shr $2, %rax > >> > +# endif > >> > + add %rax, %rdx > >> > + > >> > +L(skip_copy_alignment_fix): > >> > + /* Load second vector. */ > >> > + VMOVA (VEC_SIZE * 1)(%rsi), %VMM(1) > >> > + VPCMPEQ %VMM(0), %VMM(1), %VMM(2) > >> > + vptest %VMM(2), %VMM(2) > >> > + jnz L(ret_vec_x2) > >> > + > >> > + /* Skip copy if RDX is 0. */ > >> > + test %rdx, %rdx > >> > + jz L(continue_third_vector) > >> > + > >> > + /* Jump below/equal(instead of below) used here, because last > >> > + copy chracter must be NULL. */ > >> > + cmp $CHAR_PER_VEC, %rdx > >> > + jbe L(partial_copy_second_vector) > >> > + > >> > + sub $CHAR_PER_VEC, %rdx > >> > + /* Copy second vector. */ > >> > + VMOVU %VMM(1), (VEC_SIZE * 1)(%rdi) > >> > + > >> > +L(continue_third_vector): > >> > + /* Load third vector. */ > >> > + VMOVA (VEC_SIZE * 2)(%rsi), %VMM(1) > >> > + VPCMPEQ %VMM(0), %VMM(1), %VMM(2) > >> > + vptest %VMM(2), %VMM(2) > >> > + jnz L(ret_vec_x3) > >> > + > >> > + /* Skip copy if RDX is 0. */ > >> > + test %rdx, %rdx > >> > + jz L(continue_fourth_vector) > >> > + > >> > + cmp $CHAR_PER_VEC, %rdx > >> > + jbe L(partial_copy_third_vector) > >> > + > >> > + sub $CHAR_PER_VEC, %rdx > >> > + /* Copy third vector. */ > >> > + VMOVU %VMM(1), (VEC_SIZE * 2)(%rdi) > >> > + > >> > +L(continue_fourth_vector): > >> > + /* Load fourth vector. 
*/ > >> > + VMOVA (VEC_SIZE * 3)(%rsi), %VMM(1) > >> > + VPCMPEQ %VMM(0), %VMM(1), %VMM(2) > >> > + vptest %VMM(2), %VMM(2) > >> > + jnz L(ret_vec_x4) > >> > + > >> > + /* Skip copy if RDX is 0. */ > >> > + test %rdx, %rdx > >> > + jz L(loop_4x_align) > >> > + > >> > + cmp $CHAR_PER_VEC, %rdx > >> > + jbe L(partial_copy_fourth_vector) > >> > + > >> > + sub $CHAR_PER_VEC, %rdx > >> > + /* Copy fourth vector. */ > >> > + VMOVU %VMM(1), (VEC_SIZE * 3)(%rdi) > >> > + > >> > + > >> > +L(loop_4x_align): > >> > + /* Jump to loop if RSI is already 4 vector align. */ > >> > + test $(VEC_SIZE * 4 - 1), %esi > >> > + jz L(loop_4x_read) > >> > + > >> > + mov %rsi, %rcx > >> > + > >> > + /* Align RSI to 4x vector. */ > >> > + and $(VEC_SIZE * -4), %rsi > >> > + sub %rsi, %rcx > >> > + > >> > + /* Adjust RDI for RSI alignment fix. */ > >> > + sub %rcx, %rdi > >> > + > >> > + /* Jump to loop if RDX is 0. */ > >> > + test %rdx, %rdx > >> > + jz L(loop_4x_read) > >> > + > >> > +# ifdef USE_AS_WCSLCPY > >> > + shr $2, %rcx > >> > +# endif > >> > + > >> > + /* Adjust RDX for RSI alignment fix. */ > >> > + add %rcx, %rdx > >> > + jmp L(loop_4x_read) > >> > + > >> > + .p2align 4,,6 > >> > +L(loop_4x_vec): > >> > + /* Skip copy if RDX is 0. 
*/ > >> > + test %rdx, %rdx > >> > + jz L(loop_partial_copy_return) > >> > + cmp $(CHAR_PER_VEC * 4), %rdx > >> > + jbe L(loop_partial_copy) > >> > + VMOVU %VMM(1), (VEC_SIZE * 4)(%rdi) > >> > + VMOVU %VMM(2), (VEC_SIZE * 5)(%rdi) > >> > + VMOVU %VMM(3), (VEC_SIZE * 6)(%rdi) > >> > + VMOVU %VMM(4), (VEC_SIZE * 7)(%rdi) > >> > + sub $(CHAR_PER_VEC * 4), %rdx > >> > + > >> > +L(loop_partial_copy_return): > >> > + sub $(VEC_SIZE * -4), %rsi > >> > + sub $(VEC_SIZE * -4), %rdi > >> > + > >> > +L(loop_4x_read): > >> > + VMOVA (VEC_SIZE * 4)(%rsi), %VMM(1) > >> > + VMOVA (VEC_SIZE * 5)(%rsi), %VMM(2) > >> > + VMOVA (VEC_SIZE * 6)(%rsi), %VMM(3) > >> > + VMOVA (VEC_SIZE * 7)(%rsi), %VMM(4) > >> > + VPMINU %VMM(1), %VMM(2), %VMM(5) > >> > + VPMINU %VMM(3), %VMM(4), %VMM(6) > >> > + VPMINU %VMM(5), %VMM(6), %VMM(7) > >> > + VPCMPEQ %VMM(0), %VMM(7), %VMM(7) > >> > + vptest %VMM(7), %VMM(7) > >> > + > >> > + jz L(loop_4x_vec) > >> > + > >> > + /* Check if string ends in first vector or second vector. */ > >> > + lea (VEC_SIZE * 4)(%rsi), %rax > >> > + sub %r8, %rax > >> > +# ifdef USE_AS_WCSLCPY > >> > + shr $2, %rax > >> > +# endif > >> > + xor %r10, %r10 > >> > + VPCMPEQ %VMM(0), %VMM(5), %VMM(6) > >> > + vptest %VMM(6), %VMM(6) > >> > + jnz L(endloop) > >> > + sub $(CHAR_PER_VEC * -2), %rax > >> > + mov $(CHAR_PER_VEC * 2), %r10 > >> > + VMOVA %VMM(3), %VMM(1) > >> > + VMOVA %VMM(4), %VMM(2) > >> > + > >> > +L(endloop): > >> > + VPCMPEQ %VMM(0), %VMM(1), %VMM(1) > >> > + VPCMPEQ %VMM(0), %VMM(2), %VMM(2) > >> > + PMOVMSK %VMM(1), %rcx > >> > + PMOVMSK %VMM(2), %r9 > >> > + shlq $32, %r9 > >> > + orq %r9, %rcx > >> > + bsf %rcx, %rcx > >> > + /* Shift RCX by 2, VPMOVMSK has only byte version. */ > >> > +# ifdef USE_AS_WCSLCPY > >> > + shr $2, %rcx > >> > +# endif > >> > + /* At this point RAX has length to return. */ > >> > + add %rcx, %rax > >> > + test %rdx, %rdx > >> > + jz L(ret) > >> > + > >> > + /* Add 1 to account for NULL character in RDX comparison. 
*/ > >> > + lea 1(%r10, %rcx), %rcx > >> > + cmp %rdx, %rcx > >> > + cmovb %rcx, %rdx > >> > + > >> > +L(loop_partial_copy): > >> > + cmp $(CHAR_PER_VEC * 2), %rdx > >> > + jbe L(loop_partial_first_half) > >> > + /* Reload first 2 vector. */ > >> > + VMOVA (VEC_SIZE * 4)(%rsi), %VMM(1) > >> > + VMOVA (VEC_SIZE * 5)(%rsi), %VMM(2) > >> > + VMOVU %VMM(1), (VEC_SIZE * 4)(%rdi) > >> > + VMOVU %VMM(2), (VEC_SIZE * 5)(%rdi) > >> > + > >> > +L(loop_partial_first_half): > >> > + /* Go back 2 vector from last and use overlapping copy. > >> > + (VEC_SIZE * 4 - VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE) > >> > + (VEC_SIZE * 4 - VEC_SIZE * 1)(%rsi, %rdx, CHAR_SIZE) > >> > + */ > >> > + VMOVU (VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %VMM(3) > >> > + VMOVU (VEC_SIZE * 3)(%rsi, %rdx, CHAR_SIZE), %VMM(4) > >> > + VMOVU %VMM(3), (VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE) > >> > + VMOVU %VMM(4), (VEC_SIZE * 3)(%rdi, %rdx, CHAR_SIZE) > >> > + MOVU $0, (VEC_SIZE * 4 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE) > >> > + xor %rdx, %rdx > >> > + vptest %VMM(7), %VMM(7) > >> > + jz L(loop_partial_copy_return) > >> > + ret > >> > + > >> > + .p2align 4 > >> > +L(page_cross): > >> > + mov %rsi, %rcx > >> > + mov %rsi, %r11 > >> > + and $-VEC_SIZE, %r11 > >> > + and $(VEC_SIZE - 1), %rcx > >> > + VMOVA (%r11), %VMM(1) > >> > + VPCMPEQ %VMM(0), %VMM(1), %VMM(2) > >> > + PMOVMSK %VMM(2), %eax > >> > + shr %cl, %eax > >> > + jz L(page_cross_continue) > >> > + > >> > +L(ret_vec_x1): > >> > + bsf %eax, %eax > >> > +# ifdef USE_AS_WCSLCPY > >> > + shr $2, %eax > >> > +# endif > >> > + /* Increment by 1 to account for NULL char. 
*/ > >> > + lea 1(%eax), %ecx > >> > + cmp %rdx, %rcx > >> > + cmovb %rcx, %rdx > >> > + test %rdx, %rdx > >> > + jz L(ret) > >> > + > >> > +L(page_cross_small_vec_copy): > >> > + cmp $(16 / CHAR_SIZE), %rdx > >> > + jbe L(copy_8_byte_scalar) > >> > + VMOVU (%rsi), %VMM_128(1) > >> > + VMOVU -16(%rsi, %rdx, CHAR_SIZE), %VMM_128(3) > >> > + VMOVU %VMM_128(1), (%rdi) > >> > + VMOVU %VMM_128(3), -16(%rdi, %rdx, CHAR_SIZE) > >> > + MOVU $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) > >> > + xor %rdx, %rdx > >> > + vptest %VMM(2), %VMM(2) > >> > + jz L(continue_second_vector) > >> > + ret > >> > + > >> > +L(copy_8_byte_scalar): > >> > + cmp $(8 / CHAR_SIZE), %rdx > >> > + jbe L(copy_4_byte_scalar) > >> > + movq (%rsi), %r10 > >> > + movq -8(%rsi, %rdx, CHAR_SIZE), %r11 > >> > + movq %r10, (%rdi) > >> > + movq %r11, -8(%rdi, %rdx, CHAR_SIZE) > >> > + MOVU $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) > >> > + xor %edx, %edx > >> > + vptest %VMM(2), %VMM(2) > >> > + jz L(continue_second_vector) > >> > + ret > >> > + > >> > +L(copy_4_byte_scalar): > >> > +# ifndef USE_AS_WCSLCPY > >> > + cmp $4, %rdx > >> > + jbe L(copy_2_byte_scalar) > >> > +# endif > >> > + movl (%rsi), %r10d > >> > + movl -4(%rsi, %rdx, CHAR_SIZE), %r11d > >> > + movl %r10d, (%rdi) > >> > + movl %r11d, -4(%rdi, %rdx, CHAR_SIZE) > >> > + MOVU $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) > >> > + xor %edx, %edx > >> > + vptest %VMM(2), %VMM(2) > >> > + jz L(continue_second_vector) > >> > + ret > >> > + > >> > +# ifndef USE_AS_WCSLCPY > >> > +L(copy_2_byte_scalar): > >> > + cmp $2, %rdx > >> > + jbe L(copy_1_byte_scalar) > >> > + movw (%rsi), %r10w > >> > + movw -(CHAR_SIZE * 3)(%rsi, %rdx, CHAR_SIZE), %r11w > >> > + movw %r10w, (%rdi) > >> > + movw %r11w, -(CHAR_SIZE * 3)(%rdi, %rdx, CHAR_SIZE) > >> > + MOVU $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) > >> > + xor %edx, %edx > >> > + vptest %VMM(2), %VMM(2) > >> > + jz L(continue_second_vector) > >> > + ret > >> > + > >> > +L(copy_1_byte_scalar): > >> > + MOVU 
(%rsi), %r10b > >> > + MOVU %r10b, (%rdi) > >> > + MOVU $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) > >> > + xor %edx, %edx > >> > + vptest %VMM(2), %VMM(2) > >> > + jz L(continue_second_vector) > >> > + ret > >> > +# endif > >> > + > >> > +L(ret_vec_x2): > >> > + PMOVMSK %VMM(2), %rax > >> > + bsf %rax, %rcx > >> > + /* Calculate return value. */ > >> > + lea VEC_SIZE(%rsi, %rcx), %rax > >> > + sub %r8, %rax > >> > +# ifdef USE_AS_WCSLCPY > >> > + shr $2, %rax > >> > + shr $2, %rcx > >> > +# endif > >> > + inc %rcx > >> > + test %rdx, %rdx > >> > + jz L(ret) > >> > + cmp %rdx, %rcx > >> > + cmovb %rcx, %rdx > >> > + > >> > +L(partial_copy_second_vector): > >> > + VMOVU (%rsi, %rdx, CHAR_SIZE), %VMM(1) > >> > + VMOVU %VMM(1), (%rdi, %rdx, CHAR_SIZE) > >> > + MOVU $0, (VEC_SIZE - CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) > >> > + xor %edx, %edx > >> > + vptest %VMM(2), %VMM(2) > >> > + jz L(continue_third_vector) > >> > + > >> > +L(ret): > >> > + ret > >> > + > >> > +L(ret_vec_x3): > >> > + PMOVMSK %VMM(2), %rax > >> > + bsf %rax, %rcx > >> > + /* Calculate return value. */ > >> > + lea (VEC_SIZE * 2)(%rsi, %rcx), %rax > >> > + sub %r8, %rax > >> > +# ifdef USE_AS_WCSLCPY > >> > + shr $2, %rax > >> > + shr $2, %rcx > >> > +# endif > >> > + inc %rcx > >> > + test %rdx, %rdx > >> > + jz L(ret) > >> > + cmp %rdx, %rcx > >> > + cmovb %rcx, %rdx > >> > + > >> > +L(partial_copy_third_vector): > >> > + VMOVU (VEC_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM(1) > >> > + VMOVU %VMM(1), (VEC_SIZE)(%rdi, %rdx, CHAR_SIZE) > >> > + MOVU $0, ((VEC_SIZE * 2) - CHAR_SIZE * 1)(%rdi, %rdx, > CHAR_SIZE) > >> > + xor %edx, %edx > >> > + vptest %VMM(2), %VMM(2) > >> > + jz L(continue_fourth_vector) > >> > + ret > >> > + > >> > +L(ret_vec_x4): > >> > + PMOVMSK %VMM(2), %rax > >> > + bsf %rax, %rcx > >> > + /* Calculate return value. 
*/ > >> > + lea (VEC_SIZE * 3)(%rsi, %rcx), %rax > >> > + sub %r8, %rax > >> > +# ifdef USE_AS_WCSLCPY > >> > + shr $2, %rax > >> > + shr $2, %rcx > >> > +# endif > >> > + inc %rcx > >> > + test %rdx, %rdx > >> > + jz L(ret) > >> > + cmp %rdx, %rcx > >> > + cmovb %rcx, %rdx > >> > + > >> > +L(partial_copy_fourth_vector): > >> > + VMOVU (VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %VMM(1) > >> > + VMOVU %VMM(1), (VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE) > >> > + MOVU $0, ((VEC_SIZE * 3) - CHAR_SIZE * 1)(%rdi, %rdx, > CHAR_SIZE) > >> > + xor %edx, %edx > >> > + vptest %VMM(2), %VMM(2) > >> > + jz L(continue_fourth_vector) > >> > + ret > >> > + > >> > +END (STRLCPY) > >> > >> Is strlcpy/strlcat integratable with existing strncat impl? Had > >> figured they would > >> fit in the same file. > > > > > > Hi Noah, > > > > It may not be a good idea to put strlcpy/strlcat in the existing > strncpy/strnat impl file, > > as strlcpy/strlcat functions are associated with GLIBC_2.38 ABI. > > > Well, we can put the impl there and include it from another to manage > any special > link cases. > Due to ABI, none of strlcpy/strlcat changes can go in the glibc version earlier than 2.38, to avoid any future strncpy backporting complications, it is better to keep them in separate files for now. > > --Sunil > > > >> > >> > +#endif > >> > diff --git a/sysdeps/x86_64/multiarch/strlcpy-generic.c > b/sysdeps/x86_64/multiarch/strlcpy-generic.c > >> > new file mode 100644 > >> > index 0000000000..eee3b7b086 > >> > --- /dev/null > >> > +++ b/sysdeps/x86_64/multiarch/strlcpy-generic.c > >> > @@ -0,0 +1,25 @@ > >> > +/* strlcpy generic. > >> > + Copyright (C) 2023 Free Software Foundation, Inc. > >> > + This file is part of the GNU C Library. 
> >> > + > >> > + The GNU C Library is free software; you can redistribute it and/or > >> > + modify it under the terms of the GNU Lesser General Public > >> > + License as published by the Free Software Foundation; either > >> > + version 2.1 of the License, or (at your option) any later version. > >> > + > >> > + The GNU C Library is distributed in the hope that it will be > useful, > >> > + but WITHOUT ANY WARRANTY; without even the implied warranty of > >> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > >> > + Lesser General Public License for more details. > >> > + > >> > + You should have received a copy of the GNU Lesser General Public > >> > + License along with the GNU C Library; if not, see > >> > + <https://www.gnu.org/licenses/>. */ > >> > + > >> > + > >> > +#include <isa-level.h> > >> > +#if ISA_SHOULD_BUILD (1) > >> > +# define __strlcpy __strlcpy_generic > >> > +# include <string/strlcpy.c> > >> > + > >> > +#endif > >> > diff --git a/sysdeps/x86_64/multiarch/strlcpy.c > b/sysdeps/x86_64/multiarch/strlcpy.c > >> > new file mode 100644 > >> > index 0000000000..ded41fbcfb > >> > --- /dev/null > >> > +++ b/sysdeps/x86_64/multiarch/strlcpy.c > >> > @@ -0,0 +1,36 @@ > >> > +/* Multiple versions of strlcpy. > >> > + All versions must be listed in ifunc-impl-list.c. > >> > + Copyright (C) 2023 Free Software Foundation, Inc. > >> > + This file is part of the GNU C Library. > >> > + > >> > + The GNU C Library is free software; you can redistribute it and/or > >> > + modify it under the terms of the GNU Lesser General Public > >> > + License as published by the Free Software Foundation; either > >> > + version 2.1 of the License, or (at your option) any later version. > >> > + > >> > + The GNU C Library is distributed in the hope that it will be > useful, > >> > + but WITHOUT ANY WARRANTY; without even the implied warranty of > >> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU > >> > + Lesser General Public License for more details. > >> > + > >> > + You should have received a copy of the GNU Lesser General Public > >> > + License along with the GNU C Library; if not, see > >> > + <https://www.gnu.org/licenses/>. */ > >> > + > >> > +/* Define multiple versions only for the definition in libc. */ > >> > +#if IS_IN (libc) > >> > +# define __strlcpy __redirect_strlcpy > >> > +# include <string.h> > >> > +# undef __strlcpy > >> > + > >> > +# define SYMBOL_NAME strlcpy > >> > +# include "ifunc-strlcpy.h" > >> > + > >> > +libc_ifunc_redirected (__redirect_strlcpy, __strlcpy, IFUNC_SELECTOR > ()); > >> > +weak_alias (__strlcpy, strlcpy) > >> > + > >> > +# ifdef SHARED > >> > +__hidden_ver1 (__strlcpy, __GI___strlcpy, __redirect_strlcpy) > >> > + __attribute__ ((visibility ("hidden"))) __attribute_copy__ > (strlcpy); > >> > +# endif > >> > +#endif > >> > diff --git a/sysdeps/x86_64/multiarch/wcslcpy-avx2.S > b/sysdeps/x86_64/multiarch/wcslcpy-avx2.S > >> > new file mode 100644 > >> > index 0000000000..dafc20ded0 > >> > --- /dev/null > >> > +++ b/sysdeps/x86_64/multiarch/wcslcpy-avx2.S > >> > @@ -0,0 +1,4 @@ > >> > +#define STRLCPY __wcslcpy_avx2 > >> > +#define USE_AS_WCSLCPY 1 > >> > + > >> > +#include "strlcpy-avx2.S" > >> > diff --git a/sysdeps/x86_64/multiarch/wcslcpy-generic.c > b/sysdeps/x86_64/multiarch/wcslcpy-generic.c > >> > new file mode 100644 > >> > index 0000000000..ffd3c0e846 > >> > --- /dev/null > >> > +++ b/sysdeps/x86_64/multiarch/wcslcpy-generic.c > >> > @@ -0,0 +1,25 @@ > >> > +/* wcslcpy generic. > >> > + Copyright (C) 2023 Free Software Foundation, Inc. > >> > + This file is part of the GNU C Library. > >> > + > >> > + The GNU C Library is free software; you can redistribute it and/or > >> > + modify it under the terms of the GNU Lesser General Public > >> > + License as published by the Free Software Foundation; either > >> > + version 2.1 of the License, or (at your option) any later version. 
> >> > + > >> > + The GNU C Library is distributed in the hope that it will be > useful, > >> > + but WITHOUT ANY WARRANTY; without even the implied warranty of > >> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > >> > + Lesser General Public License for more details. > >> > + > >> > + You should have received a copy of the GNU Lesser General Public > >> > + License along with the GNU C Library; if not, see > >> > + <https://www.gnu.org/licenses/>. */ > >> > + > >> > + > >> > +#include <isa-level.h> > >> > +#if ISA_SHOULD_BUILD (1) > >> > +# define __wcslcpy __wcslcpy_generic > >> > +# include <wcsmbs/wcslcpy.c> > >> > + > >> > +#endif > >> > diff --git a/sysdeps/x86_64/multiarch/wcslcpy.c > b/sysdeps/x86_64/multiarch/wcslcpy.c > >> > new file mode 100644 > >> > index 0000000000..371ef9626c > >> > --- /dev/null > >> > +++ b/sysdeps/x86_64/multiarch/wcslcpy.c > >> > @@ -0,0 +1,35 @@ > >> > +/* Multiple versions of wcslcpy. > >> > + All versions must be listed in ifunc-impl-list.c. > >> > + Copyright (C) 2023 Free Software Foundation, Inc. > >> > + This file is part of the GNU C Library. > >> > + > >> > + The GNU C Library is free software; you can redistribute it and/or > >> > + modify it under the terms of the GNU Lesser General Public > >> > + License as published by the Free Software Foundation; either > >> > + version 2.1 of the License, or (at your option) any later version. > >> > + > >> > + The GNU C Library is distributed in the hope that it will be > useful, > >> > + but WITHOUT ANY WARRANTY; without even the implied warranty of > >> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > >> > + Lesser General Public License for more details. > >> > + > >> > + You should have received a copy of the GNU Lesser General Public > >> > + License along with the GNU C Library; if not, see > >> > + <https://www.gnu.org/licenses/>. */ > >> > + > >> > +/* Define multiple versions only for the definition in libc. 
*/ > >> > +#if IS_IN (libc) > >> > +# define __wcslcpy __redirect_wcslcpy > >> > +# include <wchar.h> > >> > +# undef __wcslcpy > >> > + > >> > +# define SYMBOL_NAME wcslcpy > >> > +# include "ifunc-strlcpy.h" > >> > + > >> > +libc_ifunc_redirected (__redirect_wcslcpy, __wcslcpy, IFUNC_SELECTOR > ()); > >> > +weak_alias (__wcslcpy, wcslcpy) > >> > +# ifdef SHARED > >> > +__hidden_ver1 (__wcslcpy, __GI___wcslcpy, __redirect_wcslcpy) > >> > + __attribute__((visibility ("hidden"))) __attribute_copy__ > (wcslcpy); > >> > +# endif > >> > +#endif > >> > -- > >> > 2.38.1 > >> > > ^ permalink raw reply [flat|nested] 24+ messages in thread
* Re: [PATCH] x86_64: Implement AVX2 version of strlcpy/wcslcpy function 2023-07-03 1:03 ` Sunil Pandey @ 2023-07-03 1:47 ` Noah Goldstein 0 siblings, 0 replies; 24+ messages in thread From: Noah Goldstein @ 2023-07-03 1:47 UTC (permalink / raw) To: Sunil Pandey; +Cc: libc-alpha, hjl.tools On Sun, Jul 2, 2023 at 8:04 PM Sunil Pandey <skpgkp2@gmail.com> wrote: > > > > On Sun, Jul 2, 2023 at 11:54 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: >> >> On Sun, Jul 2, 2023 at 1:38 PM Sunil Pandey <skpgkp2@gmail.com> wrote: >> > >> > >> > >> > On Sun, Jul 2, 2023 at 10:03 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: >> >> >> >> On Fri, Jun 30, 2023 at 3:48 PM Sunil K Pandey via Libc-alpha >> >> <libc-alpha@sourceware.org> wrote: >> >> > >> >> > This patch optimizes strlcpy/wsclcpy string functions for AVX2. >> >> > --- >> >> > sysdeps/x86_64/multiarch/Makefile | 4 + >> >> > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 18 + >> >> > sysdeps/x86_64/multiarch/ifunc-strlcpy.h | 34 ++ >> >> > sysdeps/x86_64/multiarch/strlcpy-avx2.S | 446 +++++++++++++++++++++ >> >> > sysdeps/x86_64/multiarch/strlcpy-generic.c | 25 ++ >> >> > sysdeps/x86_64/multiarch/strlcpy.c | 36 ++ >> >> > sysdeps/x86_64/multiarch/wcslcpy-avx2.S | 4 + >> >> > sysdeps/x86_64/multiarch/wcslcpy-generic.c | 25 ++ >> >> > sysdeps/x86_64/multiarch/wcslcpy.c | 35 ++ >> >> > 9 files changed, 627 insertions(+) >> >> > create mode 100644 sysdeps/x86_64/multiarch/ifunc-strlcpy.h >> >> > create mode 100644 sysdeps/x86_64/multiarch/strlcpy-avx2.S >> >> > create mode 100644 sysdeps/x86_64/multiarch/strlcpy-generic.c >> >> > create mode 100644 sysdeps/x86_64/multiarch/strlcpy.c >> >> > create mode 100644 sysdeps/x86_64/multiarch/wcslcpy-avx2.S >> >> > create mode 100644 sysdeps/x86_64/multiarch/wcslcpy-generic.c >> >> > create mode 100644 sysdeps/x86_64/multiarch/wcslcpy.c >> >> > >> >> > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile >> >> > index e1e894c963..7e3fc081df 100644 
>> >> > --- a/sysdeps/x86_64/multiarch/Makefile >> >> > +++ b/sysdeps/x86_64/multiarch/Makefile >> >> > @@ -82,6 +82,8 @@ sysdep_routines += \ >> >> > strcpy-sse2 \ >> >> > strcpy-sse2-unaligned \ >> >> > strcspn-sse4 \ >> >> > + strlcpy-avx2 \ >> >> > + strlcpy-generic \ >> >> > strlen-avx2 \ >> >> > strlen-avx2-rtm \ >> >> > strlen-evex \ >> >> > @@ -153,6 +155,8 @@ sysdep_routines += \ >> >> > wcscpy-evex \ >> >> > wcscpy-generic \ >> >> > wcscpy-ssse3 \ >> >> > + wcslcpy-avx2 \ >> >> > + wcslcpy-generic \ >> >> > wcslen-avx2 \ >> >> > wcslen-avx2-rtm \ >> >> > wcslen-evex \ >> >> > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c >> >> > index 5427ff1907..9928dee187 100644 >> >> > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c >> >> > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c >> >> > @@ -751,6 +751,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, >> >> > 1, >> >> > __strncat_sse2_unaligned)) >> >> > >> >> > + /* Support sysdeps/x86_64/multiarch/strlcpy.c. */ >> >> > + IFUNC_IMPL (i, name, strlcpy, >> >> > + X86_IFUNC_IMPL_ADD_V3 (array, i, strlcpy, >> >> > + CPU_FEATURE_USABLE (AVX2), >> >> > + __strlcpy_avx2) >> >> > + X86_IFUNC_IMPL_ADD_V1 (array, i, strlcpy, >> >> > + 1, >> >> > + __strlcpy_generic)) >> >> > + >> >> > /* Support sysdeps/x86_64/multiarch/strncpy.c. */ >> >> > IFUNC_IMPL (i, name, strncpy, >> >> > X86_IFUNC_IMPL_ADD_V4 (array, i, strncpy, >> >> > @@ -917,6 +926,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, >> >> > 1, >> >> > __wcscpy_generic)) >> >> > >> >> > + /* Support sysdeps/x86_64/multiarch/wcslcpy.c. */ >> >> > + IFUNC_IMPL (i, name, wcslcpy, >> >> > + X86_IFUNC_IMPL_ADD_V3 (array, i, wcslcpy, >> >> > + CPU_FEATURE_USABLE (AVX2), >> >> > + __wcslcpy_avx2) >> >> > + X86_IFUNC_IMPL_ADD_V1 (array, i, wcslcpy, >> >> > + 1, >> >> > + __wcslcpy_generic)) >> >> > + >> >> > /* Support sysdeps/x86_64/multiarch/wcsncpy.c. 
*/ >> >> > IFUNC_IMPL (i, name, wcsncpy, >> >> > X86_IFUNC_IMPL_ADD_V4 (array, i, wcsncpy, >> >> > diff --git a/sysdeps/x86_64/multiarch/ifunc-strlcpy.h b/sysdeps/x86_64/multiarch/ifunc-strlcpy.h >> >> > new file mode 100644 >> >> > index 0000000000..982a30d15b >> >> > --- /dev/null >> >> > +++ b/sysdeps/x86_64/multiarch/ifunc-strlcpy.h >> >> > @@ -0,0 +1,34 @@ >> >> > +/* Common definition for ifunc selections. >> >> > + All versions must be listed in ifunc-impl-list.c. >> >> > + Copyright (C) 2023 Free Software Foundation, Inc. >> >> > + This file is part of the GNU C Library. >> >> > + >> >> > + The GNU C Library is free software; you can redistribute it and/or >> >> > + modify it under the terms of the GNU Lesser General Public >> >> > + License as published by the Free Software Foundation; either >> >> > + version 2.1 of the License, or (at your option) any later version. >> >> > + >> >> > + The GNU C Library is distributed in the hope that it will be useful, >> >> > + but WITHOUT ANY WARRANTY; without even the implied warranty of >> >> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >> >> > + Lesser General Public License for more details. >> >> > + >> >> > + You should have received a copy of the GNU Lesser General Public >> >> > + License along with the GNU C Library; if not, see >> >> > + <https://www.gnu.org/licenses/>. 
*/ >> >> > + >> >> > +#include <init-arch.h> >> >> > + >> >> > +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; >> >> > +extern __typeof (REDIRECT_NAME) OPTIMIZE (generic) attribute_hidden; >> >> > + >> >> > +static inline void * >> >> > +IFUNC_SELECTOR (void) >> >> > +{ >> >> > + const struct cpu_features *cpu_features = __get_cpu_features (); >> >> > + >> >> > + if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX2)) >> >> > + return OPTIMIZE (avx2); >> >> > + >> >> > + return OPTIMIZE (generic); >> >> > +} >> >> > diff --git a/sysdeps/x86_64/multiarch/strlcpy-avx2.S b/sysdeps/x86_64/multiarch/strlcpy-avx2.S >> >> > new file mode 100644 >> >> > index 0000000000..cf54b1e990 >> >> > --- /dev/null >> >> > +++ b/sysdeps/x86_64/multiarch/strlcpy-avx2.S >> >> > @@ -0,0 +1,446 @@ >> >> > +/* Strlcpy/wcslcpy optimized with AVX2. >> >> > + Copyright (C) 2023 Free Software Foundation, Inc. >> >> > + This file is part of the GNU C Library. >> >> > + >> >> > + The GNU C Library is free software; you can redistribute it and/or >> >> > + modify it under the terms of the GNU Lesser General Public >> >> > + License as published by the Free Software Foundation; either >> >> > + version 2.1 of the License, or (at your option) any later version. >> >> > + >> >> > + The GNU C Library is distributed in the hope that it will be useful, >> >> > + but WITHOUT ANY WARRANTY; without even the implied warranty of >> >> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >> >> > + Lesser General Public License for more details. >> >> > + >> >> > + You should have received a copy of the GNU Lesser General Public >> >> > + License along with the GNU C Library; if not, see >> >> > + <https://www.gnu.org/licenses/>. 
*/ >> >> > + >> >> > +#include <isa-level.h> >> >> > + >> >> > +#if ISA_SHOULD_BUILD (3) >> >> > + >> >> > +# include <sysdep.h> >> >> > + >> >> > +# ifndef VEC_SIZE >> >> > +# include "x86-avx-vecs.h" >> >> > +# endif >> >> > + >> >> > +# ifndef STRLCPY >> >> > +# define STRLCPY __strlcpy_avx2 >> >> > +# endif >> >> > + >> >> > + >> >> > +# ifdef USE_AS_WCSLCPY >> >> > +# define CHAR_SIZE 4 >> >> > +# define MOVU movl >> >> > +# define VPCMPEQ vpcmpeqd >> >> > +# define VPMINU vpminud >> >> > +# else >> >> > +# define CHAR_SIZE 1 >> >> > +# define MOVU movb >> >> > +# define VPCMPEQ vpcmpeqb >> >> > +# define VPMINU vpminub >> >> > +# endif >> >> > + >> >> > +# define PMOVMSK vpmovmskb >> >> > +# define PAGE_SIZE 4096 >> >> > +# define VEC_SIZE 32 >> >> > +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) >> >> > + >> >> > + .section SECTION(.text),"ax",@progbits >> >> > +/* Aligning entry point to 64 byte, provides better performance for >> >> > + one vector length string. */ >> >> > + >> >> > +ENTRY_P2ALIGN (STRLCPY, 6) >> >> > +# ifdef __ILP32__ >> >> > + /* Clear the upper 32 bits. */ >> >> > + movl %edx, %edx >> >> > +# endif >> >> > + >> >> > + /* Zero out vector register for end of string comparison. */ >> >> > + vpxor %VMM(0), %VMM(0), %VMM(0) >> >> > + /* Save source pointer for return calculation. */ >> >> > + mov %rsi, %r8 >> >> > + mov %esi, %eax >> >> > + sall $20, %eax >> >> > + cmpl $((PAGE_SIZE - (VEC_SIZE)) << 20), %eax >> >> > + ja L(page_cross) >> >> > + >> >> > +L(page_cross_continue): >> >> > + /* Load first vector. */ >> >> > + VMOVU (%rsi), %VMM(1) >> >> > + VPCMPEQ %VMM(0), %VMM(1), %VMM(2) >> >> > + PMOVMSK %VMM(2), %eax >> >> > + test %eax, %eax >> >> > + jnz L(ret_vec_x1) >> >> > + >> >> > + test %rdx, %rdx >> >> > + jz L(continue_second_vector) >> >> > + >> >> > + /* Check whether we can copy full vector. */ >> >> > + cmp $CHAR_PER_VEC, %rdx >> >> > + jbe L(page_cross_small_vec_copy) >> >> > + /* Copy first vector. 
*/ >> >> > + VMOVU %VMM(1), (%rdi) >> >> > + sub $CHAR_PER_VEC, %rdx >> >> > + >> >> > +L(continue_second_vector): >> >> > + /* Align RSI pointer and adjust RDI based on offset. */ >> >> > + mov %rsi, %rax >> >> > + and $-VEC_SIZE, %rsi >> >> > + sub %rsi, %rax >> >> > + sub %rax, %rdi >> >> > + >> >> > + /* Check if string already copied N char, and RDX is 0. */ >> >> > + test %rdx, %rdx >> >> > + jz L(skip_copy_alignment_fix) >> >> > + >> >> > + /* Adjust RDX for copy alignment fix. */ >> >> > +# ifdef USE_AS_WCSLCPY >> >> > + shr $2, %rax >> >> > +# endif >> >> > + add %rax, %rdx >> >> > + >> >> > +L(skip_copy_alignment_fix): >> >> > + /* Load second vector. */ >> >> > + VMOVA (VEC_SIZE * 1)(%rsi), %VMM(1) >> >> > + VPCMPEQ %VMM(0), %VMM(1), %VMM(2) >> >> > + vptest %VMM(2), %VMM(2) >> >> > + jnz L(ret_vec_x2) >> >> > + >> >> > + /* Skip copy if RDX is 0. */ >> >> > + test %rdx, %rdx >> >> > + jz L(continue_third_vector) >> >> > + >> >> > + /* Jump below/equal(instead of below) used here, because last >> >> > + copy chracter must be NULL. */ >> >> > + cmp $CHAR_PER_VEC, %rdx >> >> > + jbe L(partial_copy_second_vector) >> >> > + >> >> > + sub $CHAR_PER_VEC, %rdx >> >> > + /* Copy second vector. */ >> >> > + VMOVU %VMM(1), (VEC_SIZE * 1)(%rdi) >> >> > + >> >> > +L(continue_third_vector): >> >> > + /* Load third vector. */ >> >> > + VMOVA (VEC_SIZE * 2)(%rsi), %VMM(1) >> >> > + VPCMPEQ %VMM(0), %VMM(1), %VMM(2) >> >> > + vptest %VMM(2), %VMM(2) >> >> > + jnz L(ret_vec_x3) >> >> > + >> >> > + /* Skip copy if RDX is 0. */ >> >> > + test %rdx, %rdx >> >> > + jz L(continue_fourth_vector) >> >> > + >> >> > + cmp $CHAR_PER_VEC, %rdx >> >> > + jbe L(partial_copy_third_vector) >> >> > + >> >> > + sub $CHAR_PER_VEC, %rdx >> >> > + /* Copy third vector. */ >> >> > + VMOVU %VMM(1), (VEC_SIZE * 2)(%rdi) >> >> > + >> >> > +L(continue_fourth_vector): >> >> > + /* Load fourth vector. 
*/ >> >> > + VMOVA (VEC_SIZE * 3)(%rsi), %VMM(1) >> >> > + VPCMPEQ %VMM(0), %VMM(1), %VMM(2) >> >> > + vptest %VMM(2), %VMM(2) >> >> > + jnz L(ret_vec_x4) >> >> > + >> >> > + /* Skip copy if RDX is 0. */ >> >> > + test %rdx, %rdx >> >> > + jz L(loop_4x_align) >> >> > + >> >> > + cmp $CHAR_PER_VEC, %rdx >> >> > + jbe L(partial_copy_fourth_vector) >> >> > + >> >> > + sub $CHAR_PER_VEC, %rdx >> >> > + /* Copy fourth vector. */ >> >> > + VMOVU %VMM(1), (VEC_SIZE * 3)(%rdi) >> >> > + >> >> > + >> >> > +L(loop_4x_align): >> >> > + /* Jump to loop if RSI is already 4 vector align. */ >> >> > + test $(VEC_SIZE * 4 - 1), %esi >> >> > + jz L(loop_4x_read) >> >> > + >> >> > + mov %rsi, %rcx >> >> > + >> >> > + /* Align RSI to 4x vector. */ >> >> > + and $(VEC_SIZE * -4), %rsi >> >> > + sub %rsi, %rcx >> >> > + >> >> > + /* Adjust RDI for RSI alignment fix. */ >> >> > + sub %rcx, %rdi >> >> > + >> >> > + /* Jump to loop if RDX is 0. */ >> >> > + test %rdx, %rdx >> >> > + jz L(loop_4x_read) >> >> > + >> >> > +# ifdef USE_AS_WCSLCPY >> >> > + shr $2, %rcx >> >> > +# endif >> >> > + >> >> > + /* Adjust RDX for RSI alignment fix. */ >> >> > + add %rcx, %rdx >> >> > + jmp L(loop_4x_read) >> >> > + >> >> > + .p2align 4,,6 >> >> > +L(loop_4x_vec): >> >> > + /* Skip copy if RDX is 0. 
*/ >> >> > + test %rdx, %rdx >> >> > + jz L(loop_partial_copy_return) >> >> > + cmp $(CHAR_PER_VEC * 4), %rdx >> >> > + jbe L(loop_partial_copy) >> >> > + VMOVU %VMM(1), (VEC_SIZE * 4)(%rdi) >> >> > + VMOVU %VMM(2), (VEC_SIZE * 5)(%rdi) >> >> > + VMOVU %VMM(3), (VEC_SIZE * 6)(%rdi) >> >> > + VMOVU %VMM(4), (VEC_SIZE * 7)(%rdi) >> >> > + sub $(CHAR_PER_VEC * 4), %rdx >> >> > + >> >> > +L(loop_partial_copy_return): >> >> > + sub $(VEC_SIZE * -4), %rsi >> >> > + sub $(VEC_SIZE * -4), %rdi >> >> > + >> >> > +L(loop_4x_read): >> >> > + VMOVA (VEC_SIZE * 4)(%rsi), %VMM(1) >> >> > + VMOVA (VEC_SIZE * 5)(%rsi), %VMM(2) >> >> > + VMOVA (VEC_SIZE * 6)(%rsi), %VMM(3) >> >> > + VMOVA (VEC_SIZE * 7)(%rsi), %VMM(4) >> >> > + VPMINU %VMM(1), %VMM(2), %VMM(5) >> >> > + VPMINU %VMM(3), %VMM(4), %VMM(6) >> >> > + VPMINU %VMM(5), %VMM(6), %VMM(7) >> >> > + VPCMPEQ %VMM(0), %VMM(7), %VMM(7) >> >> > + vptest %VMM(7), %VMM(7) >> >> > + >> >> > + jz L(loop_4x_vec) >> >> > + >> >> > + /* Check if string ends in first vector or second vector. */ >> >> > + lea (VEC_SIZE * 4)(%rsi), %rax >> >> > + sub %r8, %rax >> >> > +# ifdef USE_AS_WCSLCPY >> >> > + shr $2, %rax >> >> > +# endif >> >> > + xor %r10, %r10 >> >> > + VPCMPEQ %VMM(0), %VMM(5), %VMM(6) >> >> > + vptest %VMM(6), %VMM(6) >> >> > + jnz L(endloop) >> >> > + sub $(CHAR_PER_VEC * -2), %rax >> >> > + mov $(CHAR_PER_VEC * 2), %r10 >> >> > + VMOVA %VMM(3), %VMM(1) >> >> > + VMOVA %VMM(4), %VMM(2) >> >> > + >> >> > +L(endloop): >> >> > + VPCMPEQ %VMM(0), %VMM(1), %VMM(1) >> >> > + VPCMPEQ %VMM(0), %VMM(2), %VMM(2) >> >> > + PMOVMSK %VMM(1), %rcx >> >> > + PMOVMSK %VMM(2), %r9 >> >> > + shlq $32, %r9 >> >> > + orq %r9, %rcx >> >> > + bsf %rcx, %rcx >> >> > + /* Shift RCX by 2, VPMOVMSK has only byte version. */ >> >> > +# ifdef USE_AS_WCSLCPY >> >> > + shr $2, %rcx >> >> > +# endif >> >> > + /* At this point RAX has length to return. 
*/ >> >> > + add %rcx, %rax >> >> > + test %rdx, %rdx >> >> > + jz L(ret) >> >> > + >> >> > + /* Add 1 to account for NULL character in RDX comparison. */ >> >> > + lea 1(%r10, %rcx), %rcx >> >> > + cmp %rdx, %rcx >> >> > + cmovb %rcx, %rdx >> >> > + >> >> > +L(loop_partial_copy): >> >> > + cmp $(CHAR_PER_VEC * 2), %rdx >> >> > + jbe L(loop_partial_first_half) >> >> > + /* Reload first 2 vector. */ >> >> > + VMOVA (VEC_SIZE * 4)(%rsi), %VMM(1) >> >> > + VMOVA (VEC_SIZE * 5)(%rsi), %VMM(2) >> >> > + VMOVU %VMM(1), (VEC_SIZE * 4)(%rdi) >> >> > + VMOVU %VMM(2), (VEC_SIZE * 5)(%rdi) >> >> > + >> >> > +L(loop_partial_first_half): >> >> > + /* Go back 2 vector from last and use overlapping copy. >> >> > + (VEC_SIZE * 4 - VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE) >> >> > + (VEC_SIZE * 4 - VEC_SIZE * 1)(%rsi, %rdx, CHAR_SIZE) >> >> > + */ >> >> > + VMOVU (VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %VMM(3) >> >> > + VMOVU (VEC_SIZE * 3)(%rsi, %rdx, CHAR_SIZE), %VMM(4) >> >> > + VMOVU %VMM(3), (VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE) >> >> > + VMOVU %VMM(4), (VEC_SIZE * 3)(%rdi, %rdx, CHAR_SIZE) >> >> > + MOVU $0, (VEC_SIZE * 4 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE) >> >> > + xor %rdx, %rdx >> >> > + vptest %VMM(7), %VMM(7) >> >> > + jz L(loop_partial_copy_return) >> >> > + ret >> >> > + >> >> > + .p2align 4 >> >> > +L(page_cross): >> >> > + mov %rsi, %rcx >> >> > + mov %rsi, %r11 >> >> > + and $-VEC_SIZE, %r11 >> >> > + and $(VEC_SIZE - 1), %rcx >> >> > + VMOVA (%r11), %VMM(1) >> >> > + VPCMPEQ %VMM(0), %VMM(1), %VMM(2) >> >> > + PMOVMSK %VMM(2), %eax >> >> > + shr %cl, %eax >> >> > + jz L(page_cross_continue) >> >> > + >> >> > +L(ret_vec_x1): >> >> > + bsf %eax, %eax >> >> > +# ifdef USE_AS_WCSLCPY >> >> > + shr $2, %eax >> >> > +# endif >> >> > + /* Increment by 1 to account for NULL char. 
*/ >> >> > + lea 1(%eax), %ecx >> >> > + cmp %rdx, %rcx >> >> > + cmovb %rcx, %rdx >> >> > + test %rdx, %rdx >> >> > + jz L(ret) >> >> > + >> >> > +L(page_cross_small_vec_copy): >> >> > + cmp $(16 / CHAR_SIZE), %rdx >> >> > + jbe L(copy_8_byte_scalar) >> >> > + VMOVU (%rsi), %VMM_128(1) >> >> > + VMOVU -16(%rsi, %rdx, CHAR_SIZE), %VMM_128(3) >> >> > + VMOVU %VMM_128(1), (%rdi) >> >> > + VMOVU %VMM_128(3), -16(%rdi, %rdx, CHAR_SIZE) >> >> > + MOVU $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) >> >> > + xor %rdx, %rdx >> >> > + vptest %VMM(2), %VMM(2) >> >> > + jz L(continue_second_vector) >> >> > + ret >> >> > + >> >> > +L(copy_8_byte_scalar): >> >> > + cmp $(8 / CHAR_SIZE), %rdx >> >> > + jbe L(copy_4_byte_scalar) >> >> > + movq (%rsi), %r10 >> >> > + movq -8(%rsi, %rdx, CHAR_SIZE), %r11 >> >> > + movq %r10, (%rdi) >> >> > + movq %r11, -8(%rdi, %rdx, CHAR_SIZE) >> >> > + MOVU $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) >> >> > + xor %edx, %edx >> >> > + vptest %VMM(2), %VMM(2) >> >> > + jz L(continue_second_vector) >> >> > + ret >> >> > + >> >> > +L(copy_4_byte_scalar): >> >> > +# ifndef USE_AS_WCSLCPY >> >> > + cmp $4, %rdx >> >> > + jbe L(copy_2_byte_scalar) >> >> > +# endif >> >> > + movl (%rsi), %r10d >> >> > + movl -4(%rsi, %rdx, CHAR_SIZE), %r11d >> >> > + movl %r10d, (%rdi) >> >> > + movl %r11d, -4(%rdi, %rdx, CHAR_SIZE) >> >> > + MOVU $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) >> >> > + xor %edx, %edx >> >> > + vptest %VMM(2), %VMM(2) >> >> > + jz L(continue_second_vector) >> >> > + ret >> >> > + >> >> > +# ifndef USE_AS_WCSLCPY >> >> > +L(copy_2_byte_scalar): >> >> > + cmp $2, %rdx >> >> > + jbe L(copy_1_byte_scalar) >> >> > + movw (%rsi), %r10w >> >> > + movw -(CHAR_SIZE * 3)(%rsi, %rdx, CHAR_SIZE), %r11w >> >> > + movw %r10w, (%rdi) >> >> > + movw %r11w, -(CHAR_SIZE * 3)(%rdi, %rdx, CHAR_SIZE) >> >> > + MOVU $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) >> >> > + xor %edx, %edx >> >> > + vptest %VMM(2), %VMM(2) >> >> > + jz L(continue_second_vector) >> >> > 
+ ret >> >> > + >> >> > +L(copy_1_byte_scalar): >> >> > + MOVU (%rsi), %r10b >> >> > + MOVU %r10b, (%rdi) >> >> > + MOVU $0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) >> >> > + xor %edx, %edx >> >> > + vptest %VMM(2), %VMM(2) >> >> > + jz L(continue_second_vector) >> >> > + ret >> >> > +# endif >> >> > + >> >> > +L(ret_vec_x2): >> >> > + PMOVMSK %VMM(2), %rax >> >> > + bsf %rax, %rcx >> >> > + /* Calculate return value. */ >> >> > + lea VEC_SIZE(%rsi, %rcx), %rax >> >> > + sub %r8, %rax >> >> > +# ifdef USE_AS_WCSLCPY >> >> > + shr $2, %rax >> >> > + shr $2, %rcx >> >> > +# endif >> >> > + inc %rcx >> >> > + test %rdx, %rdx >> >> > + jz L(ret) >> >> > + cmp %rdx, %rcx >> >> > + cmovb %rcx, %rdx >> >> > + >> >> > +L(partial_copy_second_vector): >> >> > + VMOVU (%rsi, %rdx, CHAR_SIZE), %VMM(1) >> >> > + VMOVU %VMM(1), (%rdi, %rdx, CHAR_SIZE) >> >> > + MOVU $0, (VEC_SIZE - CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) >> >> > + xor %edx, %edx >> >> > + vptest %VMM(2), %VMM(2) >> >> > + jz L(continue_third_vector) >> >> > + >> >> > +L(ret): >> >> > + ret >> >> > + >> >> > +L(ret_vec_x3): >> >> > + PMOVMSK %VMM(2), %rax >> >> > + bsf %rax, %rcx >> >> > + /* Calculate return value. */ >> >> > + lea (VEC_SIZE * 2)(%rsi, %rcx), %rax >> >> > + sub %r8, %rax >> >> > +# ifdef USE_AS_WCSLCPY >> >> > + shr $2, %rax >> >> > + shr $2, %rcx >> >> > +# endif >> >> > + inc %rcx >> >> > + test %rdx, %rdx >> >> > + jz L(ret) >> >> > + cmp %rdx, %rcx >> >> > + cmovb %rcx, %rdx >> >> > + >> >> > +L(partial_copy_third_vector): >> >> > + VMOVU (VEC_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM(1) >> >> > + VMOVU %VMM(1), (VEC_SIZE)(%rdi, %rdx, CHAR_SIZE) >> >> > + MOVU $0, ((VEC_SIZE * 2) - CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) >> >> > + xor %edx, %edx >> >> > + vptest %VMM(2), %VMM(2) >> >> > + jz L(continue_fourth_vector) >> >> > + ret >> >> > + >> >> > +L(ret_vec_x4): >> >> > + PMOVMSK %VMM(2), %rax >> >> > + bsf %rax, %rcx >> >> > + /* Calculate return value. 
*/ >> >> > + lea (VEC_SIZE * 3)(%rsi, %rcx), %rax >> >> > + sub %r8, %rax >> >> > +# ifdef USE_AS_WCSLCPY >> >> > + shr $2, %rax >> >> > + shr $2, %rcx >> >> > +# endif >> >> > + inc %rcx >> >> > + test %rdx, %rdx >> >> > + jz L(ret) >> >> > + cmp %rdx, %rcx >> >> > + cmovb %rcx, %rdx >> >> > + >> >> > +L(partial_copy_fourth_vector): >> >> > + VMOVU (VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %VMM(1) >> >> > + VMOVU %VMM(1), (VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE) >> >> > + MOVU $0, ((VEC_SIZE * 3) - CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE) >> >> > + xor %edx, %edx >> >> > + vptest %VMM(2), %VMM(2) >> >> > + jz L(continue_fourth_vector) >> >> > + ret >> >> > + >> >> > +END (STRLCPY) >> >> >> >> Is strlcpy/strlcat integratable with existing strncat impl? Had >> >> figured they would >> >> fit in the same file. >> > >> > >> > Hi Noah, >> > >> > It may not be a good idea to put strlcpy/strlcat in the existing strncpy/strnat impl file, >> > as strlcpy/strlcat functions are associated with GLIBC_2.38 ABI. >> > >> Well, we can put the impl there and include it from another to manage >> any special >> link cases. > > > Due to ABI, none of strlcpy/strlcat changes can go in the glibc version earlier than 2.38, > to avoid any future strncpy backporting complications, it is better to keep them in separate > files for now. > I get that, but can't we just have an impl file that implements all the functions logic. It would only build strl* if its included to (similar to how strlen avx512 impl is currently setup). >> >> > --Sunil >> > >> >> >> >> > +#endif >> >> > diff --git a/sysdeps/x86_64/multiarch/strlcpy-generic.c b/sysdeps/x86_64/multiarch/strlcpy-generic.c >> >> > new file mode 100644 >> >> > index 0000000000..eee3b7b086 >> >> > --- /dev/null >> >> > +++ b/sysdeps/x86_64/multiarch/strlcpy-generic.c >> >> > @@ -0,0 +1,25 @@ >> >> > +/* strlcpy generic. >> >> > + Copyright (C) 2023 Free Software Foundation, Inc. >> >> > + This file is part of the GNU C Library. 
>> >> > + >> >> > + The GNU C Library is free software; you can redistribute it and/or >> >> > + modify it under the terms of the GNU Lesser General Public >> >> > + License as published by the Free Software Foundation; either >> >> > + version 2.1 of the License, or (at your option) any later version. >> >> > + >> >> > + The GNU C Library is distributed in the hope that it will be useful, >> >> > + but WITHOUT ANY WARRANTY; without even the implied warranty of >> >> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >> >> > + Lesser General Public License for more details. >> >> > + >> >> > + You should have received a copy of the GNU Lesser General Public >> >> > + License along with the GNU C Library; if not, see >> >> > + <https://www.gnu.org/licenses/>. */ >> >> > + >> >> > + >> >> > +#include <isa-level.h> >> >> > +#if ISA_SHOULD_BUILD (1) >> >> > +# define __strlcpy __strlcpy_generic >> >> > +# include <string/strlcpy.c> >> >> > + >> >> > +#endif >> >> > diff --git a/sysdeps/x86_64/multiarch/strlcpy.c b/sysdeps/x86_64/multiarch/strlcpy.c >> >> > new file mode 100644 >> >> > index 0000000000..ded41fbcfb >> >> > --- /dev/null >> >> > +++ b/sysdeps/x86_64/multiarch/strlcpy.c >> >> > @@ -0,0 +1,36 @@ >> >> > +/* Multiple versions of strlcpy. >> >> > + All versions must be listed in ifunc-impl-list.c. >> >> > + Copyright (C) 2023 Free Software Foundation, Inc. >> >> > + This file is part of the GNU C Library. >> >> > + >> >> > + The GNU C Library is free software; you can redistribute it and/or >> >> > + modify it under the terms of the GNU Lesser General Public >> >> > + License as published by the Free Software Foundation; either >> >> > + version 2.1 of the License, or (at your option) any later version. >> >> > + >> >> > + The GNU C Library is distributed in the hope that it will be useful, >> >> > + but WITHOUT ANY WARRANTY; without even the implied warranty of >> >> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU >> >> > + Lesser General Public License for more details. >> >> > + >> >> > + You should have received a copy of the GNU Lesser General Public >> >> > + License along with the GNU C Library; if not, see >> >> > + <https://www.gnu.org/licenses/>. */ >> >> > + >> >> > +/* Define multiple versions only for the definition in libc. */ >> >> > +#if IS_IN (libc) >> >> > +# define __strlcpy __redirect_strlcpy >> >> > +# include <string.h> >> >> > +# undef __strlcpy >> >> > + >> >> > +# define SYMBOL_NAME strlcpy >> >> > +# include "ifunc-strlcpy.h" >> >> > + >> >> > +libc_ifunc_redirected (__redirect_strlcpy, __strlcpy, IFUNC_SELECTOR ()); >> >> > +weak_alias (__strlcpy, strlcpy) >> >> > + >> >> > +# ifdef SHARED >> >> > +__hidden_ver1 (__strlcpy, __GI___strlcpy, __redirect_strlcpy) >> >> > + __attribute__ ((visibility ("hidden"))) __attribute_copy__ (strlcpy); >> >> > +# endif >> >> > +#endif >> >> > diff --git a/sysdeps/x86_64/multiarch/wcslcpy-avx2.S b/sysdeps/x86_64/multiarch/wcslcpy-avx2.S >> >> > new file mode 100644 >> >> > index 0000000000..dafc20ded0 >> >> > --- /dev/null >> >> > +++ b/sysdeps/x86_64/multiarch/wcslcpy-avx2.S >> >> > @@ -0,0 +1,4 @@ >> >> > +#define STRLCPY __wcslcpy_avx2 >> >> > +#define USE_AS_WCSLCPY 1 >> >> > + >> >> > +#include "strlcpy-avx2.S" >> >> > diff --git a/sysdeps/x86_64/multiarch/wcslcpy-generic.c b/sysdeps/x86_64/multiarch/wcslcpy-generic.c >> >> > new file mode 100644 >> >> > index 0000000000..ffd3c0e846 >> >> > --- /dev/null >> >> > +++ b/sysdeps/x86_64/multiarch/wcslcpy-generic.c >> >> > @@ -0,0 +1,25 @@ >> >> > +/* wcslcpy generic. >> >> > + Copyright (C) 2023 Free Software Foundation, Inc. >> >> > + This file is part of the GNU C Library. 
>> >> > + >> >> > + The GNU C Library is free software; you can redistribute it and/or >> >> > + modify it under the terms of the GNU Lesser General Public >> >> > + License as published by the Free Software Foundation; either >> >> > + version 2.1 of the License, or (at your option) any later version. >> >> > + >> >> > + The GNU C Library is distributed in the hope that it will be useful, >> >> > + but WITHOUT ANY WARRANTY; without even the implied warranty of >> >> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU >> >> > + Lesser General Public License for more details. >> >> > + >> >> > + You should have received a copy of the GNU Lesser General Public >> >> > + License along with the GNU C Library; if not, see >> >> > + <https://www.gnu.org/licenses/>. */ >> >> > + >> >> > + >> >> > +#include <isa-level.h> >> >> > +#if ISA_SHOULD_BUILD (1) >> >> > +# define __wcslcpy __wcslcpy_generic >> >> > +# include <wcsmbs/wcslcpy.c> >> >> > + >> >> > +#endif >> >> > diff --git a/sysdeps/x86_64/multiarch/wcslcpy.c b/sysdeps/x86_64/multiarch/wcslcpy.c >> >> > new file mode 100644 >> >> > index 0000000000..371ef9626c >> >> > --- /dev/null >> >> > +++ b/sysdeps/x86_64/multiarch/wcslcpy.c >> >> > @@ -0,0 +1,35 @@ >> >> > +/* Multiple versions of wcslcpy. >> >> > + All versions must be listed in ifunc-impl-list.c. >> >> > + Copyright (C) 2023 Free Software Foundation, Inc. >> >> > + This file is part of the GNU C Library. >> >> > + >> >> > + The GNU C Library is free software; you can redistribute it and/or >> >> > + modify it under the terms of the GNU Lesser General Public >> >> > + License as published by the Free Software Foundation; either >> >> > + version 2.1 of the License, or (at your option) any later version. >> >> > + >> >> > + The GNU C Library is distributed in the hope that it will be useful, >> >> > + but WITHOUT ANY WARRANTY; without even the implied warranty of >> >> > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU >> >> > + Lesser General Public License for more details. >> >> > + >> >> > + You should have received a copy of the GNU Lesser General Public >> >> > + License along with the GNU C Library; if not, see >> >> > + <https://www.gnu.org/licenses/>. */ >> >> > + >> >> > +/* Define multiple versions only for the definition in libc. */ >> >> > +#if IS_IN (libc) >> >> > +# define __wcslcpy __redirect_wcslcpy >> >> > +# include <wchar.h> >> >> > +# undef __wcslcpy >> >> > + >> >> > +# define SYMBOL_NAME wcslcpy >> >> > +# include "ifunc-strlcpy.h" >> >> > + >> >> > +libc_ifunc_redirected (__redirect_wcslcpy, __wcslcpy, IFUNC_SELECTOR ()); >> >> > +weak_alias (__wcslcpy, wcslcpy) >> >> > +# ifdef SHARED >> >> > +__hidden_ver1 (__wcslcpy, __GI___wcslcpy, __redirect_wcslcpy) >> >> > + __attribute__((visibility ("hidden"))) __attribute_copy__ (wcslcpy); >> >> > +# endif >> >> > +#endif >> >> > -- >> >> > 2.38.1 >> >> > ^ permalink raw reply [flat|nested] 24+ messages in thread
end of thread, other threads:[~2023-07-04 7:45 UTC | newest] Thread overview: 24+ messages (download: mbox.gz / follow: Atom feed) -- links below jump to the message on this page -- 2023-06-30 20:48 [PATCH] x86_64: Implement AVX2 version of strlcpy/wcslcpy function Sunil K Pandey 2023-06-30 21:04 ` Noah Goldstein 2023-06-30 21:27 ` Paul Eggert 2023-06-30 22:21 ` Sunil Pandey 2023-06-30 23:22 ` Noah Goldstein 2023-06-30 23:27 ` Noah Goldstein 2023-07-03 16:30 ` Paul Eggert 2023-07-03 18:40 ` Noah Goldstein 2023-07-03 18:54 ` Adhemerval Zanella Netto 2023-07-03 21:14 ` Paul Eggert 2023-07-03 22:04 ` Gabriel Ravier 2023-07-03 23:12 ` Paul Eggert 2023-07-04 7:45 ` Andreas Schwab 2023-07-03 12:55 ` Adhemerval Zanella Netto 2023-07-01 9:41 ` Florian Weimer 2023-07-02 1:22 ` Noah Goldstein 2023-07-02 6:51 ` Florian Weimer 2023-07-02 16:55 ` Noah Goldstein 2023-07-02 17:02 ` Florian Weimer 2023-07-02 17:03 ` Noah Goldstein 2023-07-02 18:37 ` Sunil Pandey 2023-07-02 18:54 ` Noah Goldstein 2023-07-03 1:03 ` Sunil Pandey 2023-07-03 1:47 ` Noah Goldstein
This is a public inbox, see mirroring instructions for how to clone and mirror all data and code used for this inbox; as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).