public inbox for libc-alpha@sourceware.org
 help / color / mirror / Atom feed
* [PATCH] x86_64: Implement AVX2 version of strlcpy/wcslcpy function
@ 2023-06-30 20:48 Sunil K Pandey
  2023-06-30 21:04 ` Noah Goldstein
  2023-07-02 17:03 ` Noah Goldstein
  0 siblings, 2 replies; 24+ messages in thread
From: Sunil K Pandey @ 2023-06-30 20:48 UTC (permalink / raw)
  To: libc-alpha; +Cc: hjl.tools

This patch optimizes the strlcpy/wcslcpy string functions for AVX2.
---
 sysdeps/x86_64/multiarch/Makefile          |   4 +
 sysdeps/x86_64/multiarch/ifunc-impl-list.c |  18 +
 sysdeps/x86_64/multiarch/ifunc-strlcpy.h   |  34 ++
 sysdeps/x86_64/multiarch/strlcpy-avx2.S    | 446 +++++++++++++++++++++
 sysdeps/x86_64/multiarch/strlcpy-generic.c |  25 ++
 sysdeps/x86_64/multiarch/strlcpy.c         |  36 ++
 sysdeps/x86_64/multiarch/wcslcpy-avx2.S    |   4 +
 sysdeps/x86_64/multiarch/wcslcpy-generic.c |  25 ++
 sysdeps/x86_64/multiarch/wcslcpy.c         |  35 ++
 9 files changed, 627 insertions(+)
 create mode 100644 sysdeps/x86_64/multiarch/ifunc-strlcpy.h
 create mode 100644 sysdeps/x86_64/multiarch/strlcpy-avx2.S
 create mode 100644 sysdeps/x86_64/multiarch/strlcpy-generic.c
 create mode 100644 sysdeps/x86_64/multiarch/strlcpy.c
 create mode 100644 sysdeps/x86_64/multiarch/wcslcpy-avx2.S
 create mode 100644 sysdeps/x86_64/multiarch/wcslcpy-generic.c
 create mode 100644 sysdeps/x86_64/multiarch/wcslcpy.c

diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index e1e894c963..7e3fc081df 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -82,6 +82,8 @@ sysdep_routines += \
   strcpy-sse2 \
   strcpy-sse2-unaligned \
   strcspn-sse4 \
+  strlcpy-avx2 \
+  strlcpy-generic \
   strlen-avx2 \
   strlen-avx2-rtm \
   strlen-evex \
@@ -153,6 +155,8 @@ sysdep_routines += \
   wcscpy-evex \
   wcscpy-generic \
   wcscpy-ssse3 \
+  wcslcpy-avx2 \
+  wcslcpy-generic \
   wcslen-avx2 \
   wcslen-avx2-rtm \
   wcslen-evex \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 5427ff1907..9928dee187 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -751,6 +751,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 				     1,
 				     __strncat_sse2_unaligned))
 
+  /* Support sysdeps/x86_64/multiarch/strlcpy.c.  */
+  IFUNC_IMPL (i, name, strlcpy,
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, strlcpy,
+				     CPU_FEATURE_USABLE (AVX2),
+				     __strlcpy_avx2)
+	      X86_IFUNC_IMPL_ADD_V1 (array, i, strlcpy,
+				     1,
+				     __strlcpy_generic))
+
   /* Support sysdeps/x86_64/multiarch/strncpy.c.  */
   IFUNC_IMPL (i, name, strncpy,
 	      X86_IFUNC_IMPL_ADD_V4 (array, i, strncpy,
@@ -917,6 +926,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 				     1,
 				     __wcscpy_generic))
 
+  /* Support sysdeps/x86_64/multiarch/wcslcpy.c.  */
+  IFUNC_IMPL (i, name, wcslcpy,
+	      X86_IFUNC_IMPL_ADD_V3 (array, i, wcslcpy,
+				     CPU_FEATURE_USABLE (AVX2),
+				     __wcslcpy_avx2)
+	      X86_IFUNC_IMPL_ADD_V1 (array, i, wcslcpy,
+				     1,
+				     __wcslcpy_generic))
+
   /* Support sysdeps/x86_64/multiarch/wcsncpy.c.  */
   IFUNC_IMPL (i, name, wcsncpy,
 	      X86_IFUNC_IMPL_ADD_V4 (array, i, wcsncpy,
diff --git a/sysdeps/x86_64/multiarch/ifunc-strlcpy.h b/sysdeps/x86_64/multiarch/ifunc-strlcpy.h
new file mode 100644
index 0000000000..982a30d15b
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/ifunc-strlcpy.h
@@ -0,0 +1,34 @@
+/* Common definition for ifunc selections.
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2023 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <init-arch.h>
+
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (generic) attribute_hidden;
+
+static inline void *
+IFUNC_SELECTOR (void)
+{
+  const struct cpu_features *cpu_features = __get_cpu_features ();
+
+  if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX2))
+    return OPTIMIZE (avx2);
+
+  return OPTIMIZE (generic);
+}
diff --git a/sysdeps/x86_64/multiarch/strlcpy-avx2.S b/sysdeps/x86_64/multiarch/strlcpy-avx2.S
new file mode 100644
index 0000000000..cf54b1e990
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strlcpy-avx2.S
@@ -0,0 +1,446 @@
+/* Strlcpy/wcslcpy optimized with AVX2.
+   Copyright (C) 2023 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <isa-level.h>
+
+#if ISA_SHOULD_BUILD (3)
+
+# include <sysdep.h>
+
+# ifndef VEC_SIZE
+#  include "x86-avx-vecs.h"
+# endif
+
+# ifndef STRLCPY
+#  define STRLCPY	__strlcpy_avx2
+# endif
+
+
+# ifdef USE_AS_WCSLCPY
+#  define CHAR_SIZE	4
+#  define MOVU		movl
+#  define VPCMPEQ	vpcmpeqd
+#  define VPMINU	vpminud
+# else
+#  define CHAR_SIZE	1
+#  define MOVU		movb
+#  define VPCMPEQ	vpcmpeqb
+#  define VPMINU	vpminub
+# endif
+
+# define PMOVMSK	vpmovmskb
+# define PAGE_SIZE	4096
+# define VEC_SIZE	32
+# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
+
+	.section SECTION(.text),"ax",@progbits
+/* Aligning the entry point to 64 bytes provides better performance
+   for strings that fit in one vector.  */
+
+ENTRY_P2ALIGN (STRLCPY, 6)
+# ifdef __ILP32__
+	/* Clear the upper 32 bits.  */
+	movl    %edx, %edx
+# endif
+
+	/* Zero out vector register for end of string comparison. */
+	vpxor	%VMM(0), %VMM(0), %VMM(0)
+	/* Save source pointer for return calculation.  */
+	mov	%rsi, %r8
+	mov	%esi, %eax
+	sall	$20, %eax
+	cmpl	$((PAGE_SIZE - (VEC_SIZE)) << 20), %eax
+	ja	L(page_cross)
+
+L(page_cross_continue):
+	/* Load first vector.  */
+	VMOVU	(%rsi), %VMM(1)
+	VPCMPEQ %VMM(0), %VMM(1), %VMM(2)
+	PMOVMSK %VMM(2), %eax
+	test	%eax, %eax
+	jnz	L(ret_vec_x1)
+
+	test	%rdx, %rdx
+	jz	L(continue_second_vector)
+
+	/* Check whether we can copy full vector.  */
+	cmp	$CHAR_PER_VEC, %rdx
+	jbe	L(page_cross_small_vec_copy)
+	/* Copy first vector.  */
+	VMOVU	%VMM(1), (%rdi)
+	sub	$CHAR_PER_VEC, %rdx
+
+L(continue_second_vector):
+	/* Align RSI pointer and adjust RDI based on offset.  */
+	mov	%rsi, %rax
+	and	$-VEC_SIZE, %rsi
+	sub	%rsi, %rax
+	sub	%rax, %rdi
+
+	/* Check if string already copied N char, and RDX is 0.  */
+	test	%rdx, %rdx
+	jz	L(skip_copy_alignment_fix)
+
+	/* Adjust RDX for copy alignment fix.  */
+# ifdef USE_AS_WCSLCPY
+	shr	$2, %rax
+# endif
+	add	%rax, %rdx
+
+L(skip_copy_alignment_fix):
+	/* Load second vector.  */
+	VMOVA	(VEC_SIZE * 1)(%rsi), %VMM(1)
+	VPCMPEQ	%VMM(0), %VMM(1), %VMM(2)
+	vptest	%VMM(2), %VMM(2)
+	jnz	L(ret_vec_x2)
+
+	/* Skip copy if RDX is 0.  */
+	test	%rdx, %rdx
+	jz	L(continue_third_vector)
+
+	/* Jump on below-or-equal (instead of below) here, because the
+	   last copied character must be the null terminator.  */
+	cmp	$CHAR_PER_VEC, %rdx
+	jbe	L(partial_copy_second_vector)
+
+	sub	$CHAR_PER_VEC, %rdx
+	/* Copy second vector.  */
+	VMOVU	%VMM(1), (VEC_SIZE * 1)(%rdi)
+
+L(continue_third_vector):
+	/* Load third vector.  */
+	VMOVA	(VEC_SIZE * 2)(%rsi), %VMM(1)
+	VPCMPEQ	%VMM(0), %VMM(1), %VMM(2)
+	vptest	%VMM(2), %VMM(2)
+	jnz	L(ret_vec_x3)
+
+	/* Skip copy if RDX is 0.  */
+	test	%rdx, %rdx
+	jz	L(continue_fourth_vector)
+
+	cmp	$CHAR_PER_VEC, %rdx
+	jbe	L(partial_copy_third_vector)
+
+	sub	$CHAR_PER_VEC, %rdx
+	/* Copy third vector.  */
+	VMOVU	%VMM(1), (VEC_SIZE * 2)(%rdi)
+
+L(continue_fourth_vector):
+	/* Load fourth vector.  */
+	VMOVA	(VEC_SIZE * 3)(%rsi), %VMM(1)
+	VPCMPEQ	%VMM(0), %VMM(1), %VMM(2)
+	vptest	%VMM(2), %VMM(2)
+	jnz	L(ret_vec_x4)
+
+	/* Skip copy if RDX is 0.  */
+	test	%rdx, %rdx
+	jz	L(loop_4x_align)
+
+	cmp	$CHAR_PER_VEC, %rdx
+	jbe	L(partial_copy_fourth_vector)
+
+	sub	$CHAR_PER_VEC, %rdx
+	/* Copy fourth vector.  */
+	VMOVU	%VMM(1), (VEC_SIZE * 3)(%rdi)
+
+
+L(loop_4x_align):
+	/* Jump to the loop if RSI is already aligned to 4 vectors.  */
+	test	$(VEC_SIZE * 4 - 1), %esi
+	jz	L(loop_4x_read)
+
+	mov	%rsi, %rcx
+
+	/* Align RSI to 4x vector.  */
+	and	$(VEC_SIZE * -4), %rsi
+	sub	%rsi, %rcx
+
+	/* Adjust RDI for RSI alignment fix.  */
+	sub	%rcx, %rdi
+
+	/* Jump to loop if RDX is 0.  */
+	test    %rdx, %rdx
+	jz	L(loop_4x_read)
+
+# ifdef USE_AS_WCSLCPY
+	shr	$2, %rcx
+# endif
+
+	/* Adjust RDX for RSI alignment fix.  */
+	add	%rcx, %rdx
+	jmp	L(loop_4x_read)
+
+	.p2align 4,,6
+L(loop_4x_vec):
+	/* Skip copy if RDX is 0.  */
+	test	%rdx, %rdx
+	jz	L(loop_partial_copy_return)
+	cmp	$(CHAR_PER_VEC * 4), %rdx
+	jbe	L(loop_partial_copy)
+	VMOVU	%VMM(1), (VEC_SIZE * 4)(%rdi)
+	VMOVU	%VMM(2), (VEC_SIZE * 5)(%rdi)
+	VMOVU	%VMM(3), (VEC_SIZE * 6)(%rdi)
+	VMOVU	%VMM(4), (VEC_SIZE * 7)(%rdi)
+	sub	$(CHAR_PER_VEC * 4), %rdx
+
+L(loop_partial_copy_return):
+	sub	$(VEC_SIZE * -4), %rsi
+	sub	$(VEC_SIZE * -4), %rdi
+
+L(loop_4x_read):
+	VMOVA	(VEC_SIZE * 4)(%rsi), %VMM(1)
+	VMOVA	(VEC_SIZE * 5)(%rsi), %VMM(2)
+	VMOVA	(VEC_SIZE * 6)(%rsi), %VMM(3)
+	VMOVA	(VEC_SIZE * 7)(%rsi), %VMM(4)
+	VPMINU	%VMM(1), %VMM(2), %VMM(5)
+	VPMINU	%VMM(3), %VMM(4), %VMM(6)
+	VPMINU	%VMM(5), %VMM(6), %VMM(7)
+	VPCMPEQ	%VMM(0), %VMM(7), %VMM(7)
+	vptest	%VMM(7), %VMM(7)
+
+	jz	L(loop_4x_vec)
+
+	/* Check if string ends in first vector or second vector.  */
+	lea	(VEC_SIZE * 4)(%rsi), %rax
+	sub	%r8, %rax
+# ifdef USE_AS_WCSLCPY
+	shr	$2, %rax
+# endif
+	xor	%r10, %r10
+	VPCMPEQ	%VMM(0), %VMM(5), %VMM(6)
+	vptest	%VMM(6), %VMM(6)
+	jnz	L(endloop)
+	sub	$(CHAR_PER_VEC * -2), %rax
+	mov	$(CHAR_PER_VEC * 2), %r10
+	VMOVA	%VMM(3), %VMM(1)
+	VMOVA	%VMM(4), %VMM(2)
+
+L(endloop):
+	VPCMPEQ	%VMM(0), %VMM(1), %VMM(1)
+	VPCMPEQ	%VMM(0), %VMM(2), %VMM(2)
+	PMOVMSK %VMM(1), %rcx
+	PMOVMSK %VMM(2), %r9
+	shlq	$32, %r9
+	orq	%r9, %rcx
+	bsf	%rcx, %rcx
+	/* Shift RCX right by 2: vpmovmskb gives a byte index, not a char index.  */
+# ifdef USE_AS_WCSLCPY
+	shr	$2, %rcx
+# endif
+	/* At this point RAX has length to return.  */
+	add	%rcx, %rax
+	test	%rdx, %rdx
+	jz	L(ret)
+
+	/* Add 1 to account for NULL character in RDX comparison.  */
+	lea	1(%r10, %rcx), %rcx
+	cmp	%rdx, %rcx
+	cmovb	%rcx, %rdx
+
+L(loop_partial_copy):
+	cmp	$(CHAR_PER_VEC * 2), %rdx
+	jbe	L(loop_partial_first_half)
+	/* Reload first 2 vector.  */
+	VMOVA	(VEC_SIZE * 4)(%rsi), %VMM(1)
+	VMOVA	(VEC_SIZE * 5)(%rsi), %VMM(2)
+	VMOVU	%VMM(1), (VEC_SIZE * 4)(%rdi)
+	VMOVU	%VMM(2), (VEC_SIZE * 5)(%rdi)
+
+L(loop_partial_first_half):
+	/* Go back 2 vector from last and use overlapping copy.
+	   (VEC_SIZE * 4 - VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE)
+	   (VEC_SIZE * 4 - VEC_SIZE * 1)(%rsi, %rdx, CHAR_SIZE)
+	 */
+	VMOVU	(VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %VMM(3)
+	VMOVU	(VEC_SIZE * 3)(%rsi, %rdx, CHAR_SIZE), %VMM(4)
+	VMOVU	%VMM(3), (VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE)
+	VMOVU	%VMM(4), (VEC_SIZE * 3)(%rdi, %rdx, CHAR_SIZE)
+	MOVU	$0, (VEC_SIZE * 4 - CHAR_SIZE)(%rdi, %rdx, CHAR_SIZE)
+	xor	%rdx, %rdx
+	vptest	%VMM(7), %VMM(7)
+	jz	L(loop_partial_copy_return)
+	ret
+
+	.p2align 4
+L(page_cross):
+	mov	%rsi, %rcx
+	mov	%rsi, %r11
+	and	$-VEC_SIZE, %r11
+	and	$(VEC_SIZE - 1), %rcx
+	VMOVA	(%r11), %VMM(1)
+	VPCMPEQ	%VMM(0), %VMM(1), %VMM(2)
+	PMOVMSK %VMM(2), %eax
+	shr	%cl, %eax
+	jz	L(page_cross_continue)
+
+L(ret_vec_x1):
+	bsf	%eax, %eax
+# ifdef USE_AS_WCSLCPY
+	shr	$2, %eax
+# endif
+	/* Increment by 1 to account for NULL char.  */
+	lea	1(%eax), %ecx
+	cmp	%rdx, %rcx
+	cmovb	%rcx, %rdx
+	test	%rdx, %rdx
+	jz	L(ret)
+
+L(page_cross_small_vec_copy):
+	cmp	$(16 / CHAR_SIZE), %rdx
+	jbe	L(copy_8_byte_scalar)
+	VMOVU	(%rsi), %VMM_128(1)
+	VMOVU	-16(%rsi, %rdx, CHAR_SIZE), %VMM_128(3)
+	VMOVU	%VMM_128(1), (%rdi)
+	VMOVU	%VMM_128(3), -16(%rdi, %rdx, CHAR_SIZE)
+	MOVU	$0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE)
+	xor	%rdx, %rdx
+	vptest	%VMM(2), %VMM(2)
+	jz	L(continue_second_vector)
+	ret
+
+L(copy_8_byte_scalar):
+	cmp	$(8 / CHAR_SIZE), %rdx
+	jbe	L(copy_4_byte_scalar)
+	movq	(%rsi), %r10
+	movq	-8(%rsi, %rdx, CHAR_SIZE), %r11
+	movq	%r10, (%rdi)
+	movq	%r11, -8(%rdi, %rdx, CHAR_SIZE)
+	MOVU	$0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE)
+	xor	%edx, %edx
+	vptest	%VMM(2), %VMM(2)
+	jz	L(continue_second_vector)
+	ret
+
+L(copy_4_byte_scalar):
+# ifndef USE_AS_WCSLCPY
+	cmp	$4, %rdx
+	jbe	L(copy_2_byte_scalar)
+# endif
+	movl	(%rsi), %r10d
+	movl	-4(%rsi, %rdx, CHAR_SIZE), %r11d
+	movl	%r10d, (%rdi)
+	movl	%r11d, -4(%rdi, %rdx, CHAR_SIZE)
+	MOVU	$0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE)
+	xor	%edx, %edx
+	vptest	%VMM(2), %VMM(2)
+	jz	L(continue_second_vector)
+	ret
+
+# ifndef USE_AS_WCSLCPY
+L(copy_2_byte_scalar):
+	cmp	$2, %rdx
+	jbe	L(copy_1_byte_scalar)
+	movw	(%rsi), %r10w
+	movw	-(CHAR_SIZE * 3)(%rsi, %rdx, CHAR_SIZE), %r11w
+	movw	%r10w, (%rdi)
+	movw	%r11w, -(CHAR_SIZE * 3)(%rdi, %rdx, CHAR_SIZE)
+	MOVU	$0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE)
+	xor	%edx, %edx
+	vptest	%VMM(2), %VMM(2)
+	jz	L(continue_second_vector)
+	ret
+
+L(copy_1_byte_scalar):
+	MOVU	(%rsi), %r10b
+	MOVU	%r10b, (%rdi)
+	MOVU	$0, -(CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE)
+	xor	%edx, %edx
+	vptest	%VMM(2), %VMM(2)
+	jz	L(continue_second_vector)
+	ret
+# endif
+
+L(ret_vec_x2):
+	PMOVMSK %VMM(2), %rax
+	bsf	%rax, %rcx
+	/* Calculate return value.  */
+	lea	VEC_SIZE(%rsi, %rcx), %rax
+	sub	%r8, %rax
+# ifdef USE_AS_WCSLCPY
+	shr	$2, %rax
+	shr	$2, %rcx
+# endif
+	inc	%rcx
+	test	%rdx, %rdx
+	jz	L(ret)
+	cmp	%rdx, %rcx
+	cmovb	%rcx, %rdx
+
+L(partial_copy_second_vector):
+	VMOVU	(%rsi, %rdx, CHAR_SIZE), %VMM(1)
+	VMOVU	%VMM(1), (%rdi, %rdx, CHAR_SIZE)
+	MOVU	$0, (VEC_SIZE - CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE)
+	xor	%edx, %edx
+	vptest	%VMM(2), %VMM(2)
+	jz	L(continue_third_vector)
+
+L(ret):
+	ret
+
+L(ret_vec_x3):
+	PMOVMSK %VMM(2), %rax
+	bsf	%rax, %rcx
+	/* Calculate return value.  */
+	lea	(VEC_SIZE * 2)(%rsi, %rcx), %rax
+	sub	%r8, %rax
+# ifdef USE_AS_WCSLCPY
+	shr	$2, %rax
+	shr	$2, %rcx
+# endif
+	inc	%rcx
+	test	%rdx, %rdx
+	jz	L(ret)
+	cmp	%rdx, %rcx
+	cmovb	%rcx, %rdx
+
+L(partial_copy_third_vector):
+	VMOVU	(VEC_SIZE)(%rsi, %rdx, CHAR_SIZE), %VMM(1)
+	VMOVU	%VMM(1), (VEC_SIZE)(%rdi, %rdx, CHAR_SIZE)
+	MOVU	$0, ((VEC_SIZE * 2) - CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE)
+	xor	%edx, %edx
+	vptest	%VMM(2), %VMM(2)
+	jz	L(continue_fourth_vector)
+	ret
+
+L(ret_vec_x4):
+	PMOVMSK %VMM(2), %rax
+	bsf	%rax, %rcx
+	/* Calculate return value.  */
+	lea	(VEC_SIZE * 3)(%rsi, %rcx), %rax
+	sub	%r8, %rax
+# ifdef USE_AS_WCSLCPY
+	shr	$2, %rax
+	shr	$2, %rcx
+# endif
+	inc	%rcx
+	test	%rdx, %rdx
+	jz	L(ret)
+	cmp	%rdx, %rcx
+	cmovb	%rcx, %rdx
+
+L(partial_copy_fourth_vector):	/* Copy the final RDX chars, ending in vector 4.  */
+	VMOVU	(VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %VMM(1)
+	VMOVU	%VMM(1), (VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE)
+	MOVU	$0, ((VEC_SIZE * 3) - CHAR_SIZE * 1)(%rdi, %rdx, CHAR_SIZE)
+	xor	%edx, %edx	/* RDX = 0: later vectors skip their copy step.  */
+	vptest	%VMM(2), %VMM(2)	/* ZF set when vector 4 had no null.  */
+	jz	L(loop_4x_align)	/* Vector 4 already checked; do not reload it.  */
+	ret
+
+END (STRLCPY)
+#endif
diff --git a/sysdeps/x86_64/multiarch/strlcpy-generic.c b/sysdeps/x86_64/multiarch/strlcpy-generic.c
new file mode 100644
index 0000000000..eee3b7b086
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strlcpy-generic.c
@@ -0,0 +1,25 @@
+/* strlcpy generic.
+   Copyright (C) 2023 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+
+#include <isa-level.h>
+#if ISA_SHOULD_BUILD (1)
+# define __strlcpy  __strlcpy_generic
+# include <string/strlcpy.c>
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/strlcpy.c b/sysdeps/x86_64/multiarch/strlcpy.c
new file mode 100644
index 0000000000..ded41fbcfb
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strlcpy.c
@@ -0,0 +1,36 @@
+/* Multiple versions of strlcpy.
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2023 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* Define multiple versions only for the definition in libc.  */
+#if IS_IN (libc)
+# define __strlcpy __redirect_strlcpy
+# include <string.h>
+# undef __strlcpy
+
+# define SYMBOL_NAME strlcpy
+# include "ifunc-strlcpy.h"
+
+libc_ifunc_redirected (__redirect_strlcpy, __strlcpy, IFUNC_SELECTOR ());
+weak_alias (__strlcpy, strlcpy)
+
+# ifdef SHARED
+__hidden_ver1 (__strlcpy, __GI___strlcpy, __redirect_strlcpy)
+  __attribute__ ((visibility ("hidden"))) __attribute_copy__ (strlcpy);
+# endif
+#endif
diff --git a/sysdeps/x86_64/multiarch/wcslcpy-avx2.S b/sysdeps/x86_64/multiarch/wcslcpy-avx2.S
new file mode 100644
index 0000000000..dafc20ded0
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcslcpy-avx2.S
@@ -0,0 +1,4 @@
+#define STRLCPY	__wcslcpy_avx2
+#define USE_AS_WCSLCPY 1
+
+#include "strlcpy-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wcslcpy-generic.c b/sysdeps/x86_64/multiarch/wcslcpy-generic.c
new file mode 100644
index 0000000000..ffd3c0e846
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcslcpy-generic.c
@@ -0,0 +1,25 @@
+/* wcslcpy generic.
+   Copyright (C) 2023 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+
+#include <isa-level.h>
+#if ISA_SHOULD_BUILD (1)
+# define __wcslcpy  __wcslcpy_generic
+# include <wcsmbs/wcslcpy.c>
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/wcslcpy.c b/sysdeps/x86_64/multiarch/wcslcpy.c
new file mode 100644
index 0000000000..371ef9626c
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcslcpy.c
@@ -0,0 +1,35 @@
+/* Multiple versions of wcslcpy.
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2023 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* Define multiple versions only for the definition in libc.  */
+#if IS_IN (libc)
+# define __wcslcpy __redirect_wcslcpy
+# include <wchar.h>
+# undef __wcslcpy
+
+# define SYMBOL_NAME wcslcpy
+# include "ifunc-strlcpy.h"
+
+libc_ifunc_redirected (__redirect_wcslcpy, __wcslcpy, IFUNC_SELECTOR ());
+weak_alias (__wcslcpy, wcslcpy)
+# ifdef SHARED
+__hidden_ver1 (__wcslcpy, __GI___wcslcpy, __redirect_wcslcpy)
+  __attribute__((visibility ("hidden"))) __attribute_copy__ (wcslcpy);
+# endif
+#endif
-- 
2.38.1


^ permalink raw reply	[flat|nested] 24+ messages in thread

end of thread, other threads:[~2023-07-04  7:45 UTC | newest]

Thread overview: 24+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2023-06-30 20:48 [PATCH] x86_64: Implement AVX2 version of strlcpy/wcslcpy function Sunil K Pandey
2023-06-30 21:04 ` Noah Goldstein
2023-06-30 21:27   ` Paul Eggert
2023-06-30 22:21     ` Sunil Pandey
2023-06-30 23:22       ` Noah Goldstein
2023-06-30 23:27         ` Noah Goldstein
2023-07-03 16:30       ` Paul Eggert
2023-07-03 18:40         ` Noah Goldstein
2023-07-03 18:54           ` Adhemerval Zanella Netto
2023-07-03 21:14           ` Paul Eggert
2023-07-03 22:04             ` Gabriel Ravier
2023-07-03 23:12               ` Paul Eggert
2023-07-04  7:45                 ` Andreas Schwab
2023-07-03 12:55     ` Adhemerval Zanella Netto
2023-07-01  9:41   ` Florian Weimer
2023-07-02  1:22     ` Noah Goldstein
2023-07-02  6:51       ` Florian Weimer
2023-07-02 16:55         ` Noah Goldstein
2023-07-02 17:02           ` Florian Weimer
2023-07-02 17:03 ` Noah Goldstein
2023-07-02 18:37   ` Sunil Pandey
2023-07-02 18:54     ` Noah Goldstein
2023-07-03  1:03       ` Sunil Pandey
2023-07-03  1:47         ` Noah Goldstein

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).