public inbox for libc-alpha@sourceware.org
* [PATCH v2] x86-64: Optimize bzero
@ 2022-02-08 22:43 H.J. Lu
  2022-02-08 23:56 ` Noah Goldstein
  2022-02-09 11:41 ` Adhemerval Zanella
  0 siblings, 2 replies; 39+ messages in thread
From: H.J. Lu @ 2022-02-08 22:43 UTC (permalink / raw)
  To: libc-alpha

Rebase against the current master branch.

--
Zero is by far the most common value passed to memset (99%+ of calls for
Python3 and GCC).

bzero can be slightly more optimized for this case by using a zero-idiom
xor for broadcasting the set value to a register (vector or GPR).
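
As a sketch (not part of the patch): with SSE2, memset has to broadcast the
set byte across the vector register before storing, while bzero only needs a
zero idiom.  The memset side below paraphrases MEMSET_SET_VEC0_AND_SET_RETURN
and drops the return-value setup; it is illustrative only.

	/* memset(dst, c, n): broadcast the set byte in %esi to every
	   byte of %xmm0 (SSE2 sketch).  */
	movd	%esi, %xmm0
	punpcklbw %xmm0, %xmm0
	punpcklwd %xmm0, %xmm0
	pshufd	$0, %xmm0, %xmm0

	/* bzero(dst, n): the value is known to be zero, so one
	   zero-idiom xor (BZERO_ZERO_VEC0) replaces the whole
	   broadcast.  */
	pxor	%xmm0, %xmm0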

Co-developed-by: Noah Goldstein <goldstein.w.n@gmail.com>
---
 sysdeps/x86_64/memset.S                       |   8 ++
 sysdeps/x86_64/multiarch/Makefile             |   1 +
 sysdeps/x86_64/multiarch/bzero.c              | 106 +++++++++++++++++
 sysdeps/x86_64/multiarch/ifunc-impl-list.c    |  42 +++++++
 .../memset-avx2-unaligned-erms-rtm.S          |   1 +
 .../multiarch/memset-avx2-unaligned-erms.S    |   6 +
 .../multiarch/memset-avx512-unaligned-erms.S  |   3 +
 .../multiarch/memset-evex-unaligned-erms.S    |   3 +
 .../multiarch/memset-sse2-unaligned-erms.S    |   1 +
 .../multiarch/memset-vec-unaligned-erms.S     | 110 ++++++++++++++----
 10 files changed, 256 insertions(+), 25 deletions(-)
 create mode 100644 sysdeps/x86_64/multiarch/bzero.c

diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
index 3f0517bbfc..af26e9cedc 100644
--- a/sysdeps/x86_64/memset.S
+++ b/sysdeps/x86_64/memset.S
@@ -35,6 +35,9 @@
   punpcklwd %xmm0, %xmm0; \
   pshufd $0, %xmm0, %xmm0
 
+# define BZERO_ZERO_VEC0() \
+  pxor %xmm0, %xmm0
+
 # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
   movd d, %xmm0; \
   pshufd $0, %xmm0, %xmm0; \
@@ -53,6 +56,10 @@
 # define MEMSET_SYMBOL(p,s)	memset
 #endif
 
+#ifndef BZERO_SYMBOL
+# define BZERO_SYMBOL(p,s)	__bzero
+#endif
+
 #ifndef WMEMSET_SYMBOL
 # define WMEMSET_CHK_SYMBOL(p,s) p
 # define WMEMSET_SYMBOL(p,s)	__wmemset
@@ -63,6 +70,7 @@
 libc_hidden_builtin_def (memset)
 
 #if IS_IN (libc)
+weak_alias (__bzero, bzero)
 libc_hidden_def (__wmemset)
 weak_alias (__wmemset, wmemset)
 libc_hidden_weak (wmemset)
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 4274bfdd0d..e7b413edad 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -1,6 +1,7 @@
 ifeq ($(subdir),string)
 
 sysdep_routines += \
+  bzero \
   memchr-avx2 \
   memchr-avx2-rtm \
   memchr-evex \
diff --git a/sysdeps/x86_64/multiarch/bzero.c b/sysdeps/x86_64/multiarch/bzero.c
new file mode 100644
index 0000000000..58a14b2c33
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/bzero.c
@@ -0,0 +1,106 @@
+/* Multiple versions of bzero.
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* Define multiple versions only for the definition in libc.  */
+#if IS_IN (libc)
+# define __bzero __redirect___bzero
+# include <string.h>
+# undef __bzero
+
+# define SYMBOL_NAME __bzero
+# include <init-arch.h>
+
+extern __typeof (REDIRECT_NAME) OPTIMIZE1 (sse2_unaligned)
+  attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE1 (sse2_unaligned_erms)
+  attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_erms)
+  attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_rtm)
+  attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_erms_rtm)
+  attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE1 (evex_unaligned)
+  attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE1 (evex_unaligned_erms)
+  attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx512_unaligned)
+  attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx512_unaligned_erms)
+  attribute_hidden;
+
+static inline void *
+IFUNC_SELECTOR (void)
+{
+  const struct cpu_features* cpu_features = __get_cpu_features ();
+
+  if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
+      && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
+    {
+      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+          && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
+          && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
+	{
+	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+	    return OPTIMIZE1 (avx512_unaligned_erms);
+
+	  return OPTIMIZE1 (avx512_unaligned);
+	}
+    }
+
+  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2))
+    {
+      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+          && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
+          && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
+	{
+	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+	    return OPTIMIZE1 (evex_unaligned_erms);
+
+	  return OPTIMIZE1 (evex_unaligned);
+	}
+
+      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+	{
+	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+	    return OPTIMIZE1 (avx2_unaligned_erms_rtm);
+
+	  return OPTIMIZE1 (avx2_unaligned_rtm);
+	}
+
+      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+	{
+	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+	    return OPTIMIZE1 (avx2_unaligned_erms);
+
+	  return OPTIMIZE1 (avx2_unaligned);
+	}
+    }
+
+  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+    return OPTIMIZE1 (sse2_unaligned_erms);
+
+  return OPTIMIZE1 (sse2_unaligned);
+}
+
+libc_ifunc_redirected (__redirect___bzero, __bzero, IFUNC_SELECTOR ());
+
+weak_alias (__bzero, bzero)
+#endif
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 68a56797d4..a594f4176e 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -300,6 +300,48 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __memset_avx512_no_vzeroupper)
 	     )
 
+  /* Support sysdeps/x86_64/multiarch/bzero.c.  */
+  IFUNC_IMPL (i, name, bzero,
+	      IFUNC_IMPL_ADD (array, i, bzero, 1,
+			      __bzero_sse2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, bzero, 1,
+			      __bzero_sse2_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, bzero,
+			      CPU_FEATURE_USABLE (AVX2),
+			      __bzero_avx2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, bzero,
+			      CPU_FEATURE_USABLE (AVX2),
+			      __bzero_avx2_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, bzero,
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (RTM)),
+			      __bzero_avx2_unaligned_rtm)
+	      IFUNC_IMPL_ADD (array, i, bzero,
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (RTM)),
+			      __bzero_avx2_unaligned_erms_rtm)
+	      IFUNC_IMPL_ADD (array, i, bzero,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+			      __bzero_evex_unaligned)
+	      IFUNC_IMPL_ADD (array, i, bzero,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+			      __bzero_evex_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, bzero,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+			      __bzero_avx512_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, bzero,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+			      __bzero_avx512_unaligned)
+	     )
+
   /* Support sysdeps/x86_64/multiarch/rawmemchr.c.  */
   IFUNC_IMPL (i, name, rawmemchr,
 	      IFUNC_IMPL_ADD (array, i, rawmemchr,
diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
index 8ac3e479bb..5a5ee6f672 100644
--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
+++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
@@ -5,6 +5,7 @@
 
 #define SECTION(p) p##.avx.rtm
 #define MEMSET_SYMBOL(p,s)	p##_avx2_##s##_rtm
+#define BZERO_SYMBOL(p,s)	p##_avx2_##s##_rtm
 #define WMEMSET_SYMBOL(p,s)	p##_avx2_##s##_rtm
 
 #include "memset-avx2-unaligned-erms.S"
diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
index c0bf2875d0..a093a2831f 100644
--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
@@ -14,6 +14,9 @@
   vmovd d, %xmm0; \
   movq r, %rax;
 
+# define BZERO_ZERO_VEC0() \
+  vpxor %xmm0, %xmm0, %xmm0
+
 # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
   MEMSET_SET_VEC0_AND_SET_RETURN(d, r)
 
@@ -29,6 +32,9 @@
 # ifndef MEMSET_SYMBOL
 #  define MEMSET_SYMBOL(p,s)	p##_avx2_##s
 # endif
+# ifndef BZERO_SYMBOL
+#  define BZERO_SYMBOL(p,s)	p##_avx2_##s
+# endif
 # ifndef WMEMSET_SYMBOL
 #  define WMEMSET_SYMBOL(p,s)	p##_avx2_##s
 # endif
diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
index 5241216a77..727c92133a 100644
--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
@@ -19,6 +19,9 @@
   vpbroadcastb d, %VEC0; \
   movq r, %rax
 
+# define BZERO_ZERO_VEC0() \
+  vpxorq %XMM0, %XMM0, %XMM0
+
 # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
   vpbroadcastd d, %VEC0; \
   movq r, %rax
diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
index 6370021506..5d8fa78f05 100644
--- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
@@ -19,6 +19,9 @@
   vpbroadcastb d, %VEC0; \
   movq r, %rax
 
+# define BZERO_ZERO_VEC0() \
+  vpxorq %XMM0, %XMM0, %XMM0
+
 # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
   vpbroadcastd d, %VEC0; \
   movq r, %rax
diff --git a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
index 8a6f0c561a..329c58ee46 100644
--- a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
@@ -22,6 +22,7 @@
 
 #if IS_IN (libc)
 # define MEMSET_SYMBOL(p,s)	p##_sse2_##s
+# define BZERO_SYMBOL(p,s)	MEMSET_SYMBOL (p, s)
 # define WMEMSET_SYMBOL(p,s)	p##_sse2_##s
 
 # ifdef SHARED
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index 1b502b78e4..7c94fcdae1 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -26,6 +26,10 @@
 
 #include <sysdep.h>
 
+#ifndef BZERO_SYMBOL
+# define BZERO_SYMBOL(p,s)		MEMSET_SYMBOL (p, s)
+#endif
+
 #ifndef MEMSET_CHK_SYMBOL
 # define MEMSET_CHK_SYMBOL(p,s)		MEMSET_SYMBOL(p, s)
 #endif
@@ -87,6 +91,18 @@
 # define XMM_SMALL	0
 #endif
 
+#ifdef USE_LESS_VEC_MASK_STORE
+# define SET_REG64	rcx
+# define SET_REG32	ecx
+# define SET_REG16	cx
+# define SET_REG8	cl
+#else
+# define SET_REG64	rsi
+# define SET_REG32	esi
+# define SET_REG16	si
+# define SET_REG8	sil
+#endif
+
 #define PAGE_SIZE 4096
 
 /* Macro to calculate size of small memset block for aligning
@@ -96,18 +112,6 @@
 
 #ifndef SECTION
 # error SECTION is not defined!
-#endif
-
-	.section SECTION(.text),"ax",@progbits
-#if VEC_SIZE == 16 && IS_IN (libc)
-ENTRY (__bzero)
-	mov	%RDI_LP, %RAX_LP /* Set return value.  */
-	mov	%RSI_LP, %RDX_LP /* Set n.  */
-	xorl	%esi, %esi
-	pxor	%XMM0, %XMM0
-	jmp	L(entry_from_bzero)
-END (__bzero)
-weak_alias (__bzero, bzero)
 #endif
 
 #if IS_IN (libc)
@@ -123,12 +127,37 @@ ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
 	WMEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
 	WMEMSET_VDUP_TO_VEC0_LOW()
 	cmpq	$VEC_SIZE, %rdx
-	jb	L(less_vec_no_vdup)
+	jb	L(less_vec_from_wmemset)
 	WMEMSET_VDUP_TO_VEC0_HIGH()
 	jmp	L(entry_from_wmemset)
 END (WMEMSET_SYMBOL (__wmemset, unaligned))
 #endif
 
+ENTRY (BZERO_SYMBOL(__bzero, unaligned))
+#if VEC_SIZE > 16
+	BZERO_ZERO_VEC0 ()
+#endif
+	mov	%RDI_LP, %RAX_LP
+	mov	%RSI_LP, %RDX_LP
+#ifndef USE_LESS_VEC_MASK_STORE
+	xorl	%esi, %esi
+#endif
+	cmp	$VEC_SIZE, %RDX_LP
+	jb	L(less_vec_no_vdup)
+#ifdef USE_LESS_VEC_MASK_STORE
+	xorl	%esi, %esi
+#endif
+#if VEC_SIZE <= 16
+	BZERO_ZERO_VEC0 ()
+#endif
+	cmp	$(VEC_SIZE * 2), %RDX_LP
+	ja	L(more_2x_vec)
+	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
+	VMOVU	%VEC(0), (%rdi)
+	VMOVU	%VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
+	VZEROUPPER_RETURN
+END (BZERO_SYMBOL(__bzero, unaligned))
+
 #if defined SHARED && IS_IN (libc)
 ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
 	cmp	%RDX_LP, %RCX_LP
@@ -142,7 +171,6 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned))
 	/* Clear the upper 32 bits.  */
 	mov	%edx, %edx
 # endif
-L(entry_from_bzero):
 	cmpq	$VEC_SIZE, %rdx
 	jb	L(less_vec)
 	MEMSET_VDUP_TO_VEC0_HIGH()
@@ -187,6 +215,31 @@ END (__memset_erms)
 END (MEMSET_SYMBOL (__memset, erms))
 # endif
 
+ENTRY_P2ALIGN (BZERO_SYMBOL(__bzero, unaligned_erms), 6)
+# if VEC_SIZE > 16
+	BZERO_ZERO_VEC0 ()
+# endif
+	mov	%RDI_LP, %RAX_LP
+	mov	%RSI_LP, %RDX_LP
+# ifndef USE_LESS_VEC_MASK_STORE
+	xorl	%esi, %esi
+# endif
+	cmp	$VEC_SIZE, %RDX_LP
+	jb	L(less_vec_no_vdup)
+# ifdef USE_LESS_VEC_MASK_STORE
+	xorl	%esi, %esi
+# endif
+# if VEC_SIZE <= 16
+	BZERO_ZERO_VEC0 ()
+# endif
+	cmp	$(VEC_SIZE * 2), %RDX_LP
+	ja	L(stosb_more_2x_vec)
+	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
+	VMOVU	%VEC(0), (%rdi)
+	VMOVU	%VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
+	VZEROUPPER_RETURN
+END (BZERO_SYMBOL(__bzero, unaligned_erms))
+
 # if defined SHARED && IS_IN (libc)
 ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
 	cmp	%RDX_LP, %RCX_LP
@@ -229,6 +282,7 @@ L(last_2x_vec):
 	.p2align 4,, 10
 L(less_vec):
 L(less_vec_no_vdup):
+L(less_vec_from_wmemset):
 	/* Less than 1 VEC.  */
 # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
 #  error Unsupported VEC_SIZE!
@@ -374,8 +428,11 @@ L(less_vec):
 	/* Broadcast esi to partial register (i.e VEC_SIZE == 32 broadcast to
 	   xmm). This is only does anything for AVX2.  */
 	MEMSET_VDUP_TO_VEC0_LOW ()
+L(less_vec_from_wmemset):
+#if VEC_SIZE > 16
 L(less_vec_no_vdup):
 #endif
+#endif
 L(cross_page):
 #if VEC_SIZE > 32
 	cmpl	$32, %edx
@@ -386,7 +443,10 @@ L(cross_page):
 	jge	L(between_16_31)
 #endif
 #ifndef USE_XMM_LESS_VEC
-	MOVQ	%XMM0, %rcx
+	MOVQ	%XMM0, %SET_REG64
+#endif
+#if VEC_SIZE <= 16
+L(less_vec_no_vdup):
 #endif
 	cmpl	$8, %edx
 	jge	L(between_8_15)
@@ -395,7 +455,7 @@ L(cross_page):
 	cmpl	$1, %edx
 	jg	L(between_2_3)
 	jl	L(between_0_0)
-	movb	%sil, (%LESS_VEC_REG)
+	movb	%SET_REG8, (%LESS_VEC_REG)
 L(between_0_0):
 	ret
 
@@ -428,8 +488,8 @@ L(between_8_15):
 	MOVQ	%XMM0, (%rdi)
 	MOVQ	%XMM0, -8(%rdi, %rdx)
 #else
-	movq	%rcx, (%LESS_VEC_REG)
-	movq	%rcx, -8(%LESS_VEC_REG, %rdx)
+	movq	%SET_REG64, (%LESS_VEC_REG)
+	movq	%SET_REG64, -8(%LESS_VEC_REG, %rdx)
 #endif
 	ret
 
@@ -442,8 +502,8 @@ L(between_4_7):
 	MOVD	%XMM0, (%rdi)
 	MOVD	%XMM0, -4(%rdi, %rdx)
 #else
-	movl	%ecx, (%LESS_VEC_REG)
-	movl	%ecx, -4(%LESS_VEC_REG, %rdx)
+	movl	%SET_REG32, (%LESS_VEC_REG)
+	movl	%SET_REG32, -4(%LESS_VEC_REG, %rdx)
 #endif
 	ret
 
@@ -452,12 +512,12 @@ L(between_4_7):
 L(between_2_3):
 	/* From 2 to 3.  No branch when size == 2.  */
 #ifdef USE_XMM_LESS_VEC
-	movb	%sil, (%rdi)
-	movb	%sil, 1(%rdi)
-	movb	%sil, -1(%rdi, %rdx)
+	movb	%SET_REG8, (%rdi)
+	movb	%SET_REG8, 1(%rdi)
+	movb	%SET_REG8, -1(%rdi, %rdx)
 #else
-	movw	%cx, (%LESS_VEC_REG)
-	movb	%sil, -1(%LESS_VEC_REG, %rdx)
+	movw	%SET_REG16, (%LESS_VEC_REG)
+	movb	%SET_REG8, -1(%LESS_VEC_REG, %rdx)
 #endif
 	ret
 END (MEMSET_SYMBOL (__memset, unaligned_erms))
-- 
2.34.1


* Re: [PATCH v2] x86-64: Optimize bzero
  2022-02-08 22:43 [PATCH v2] x86-64: Optimize bzero H.J. Lu
@ 2022-02-08 23:56 ` Noah Goldstein
  2022-02-09 11:41 ` Adhemerval Zanella
  1 sibling, 0 replies; 39+ messages in thread
From: Noah Goldstein @ 2022-02-08 23:56 UTC (permalink / raw)
  To: H.J. Lu; +Cc: GNU C Library

On Tue, Feb 8, 2022 at 4:43 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> Rebase against the current master branch.
>
> --
> Zero is by far the most common value passed to memset (99%+ of calls for
> Python3 and GCC).
>
> bzero can be slightly more optimized for this case by using a zero-idiom
> xor for broadcasting the set value to a register (vector or GPR).
>
> Co-developed-by: Noah Goldstein <goldstein.w.n@gmail.com>
> ---
>  sysdeps/x86_64/memset.S                       |   8 ++
>  sysdeps/x86_64/multiarch/Makefile             |   1 +
>  sysdeps/x86_64/multiarch/bzero.c              | 106 +++++++++++++++++
>  sysdeps/x86_64/multiarch/ifunc-impl-list.c    |  42 +++++++
>  .../memset-avx2-unaligned-erms-rtm.S          |   1 +
>  .../multiarch/memset-avx2-unaligned-erms.S    |   6 +
>  .../multiarch/memset-avx512-unaligned-erms.S  |   3 +
>  .../multiarch/memset-evex-unaligned-erms.S    |   3 +
>  .../multiarch/memset-sse2-unaligned-erms.S    |   1 +
>  .../multiarch/memset-vec-unaligned-erms.S     | 110 ++++++++++++++----
>  10 files changed, 256 insertions(+), 25 deletions(-)
>  create mode 100644 sysdeps/x86_64/multiarch/bzero.c
>
> diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
> index 3f0517bbfc..af26e9cedc 100644
> --- a/sysdeps/x86_64/memset.S
> +++ b/sysdeps/x86_64/memset.S
> @@ -35,6 +35,9 @@
>    punpcklwd %xmm0, %xmm0; \
>    pshufd $0, %xmm0, %xmm0
>
> +# define BZERO_ZERO_VEC0() \
> +  pxor %xmm0, %xmm0
> +
>  # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
>    movd d, %xmm0; \
>    pshufd $0, %xmm0, %xmm0; \
> @@ -53,6 +56,10 @@
>  # define MEMSET_SYMBOL(p,s)    memset
>  #endif
>
> +#ifndef BZERO_SYMBOL
> +# define BZERO_SYMBOL(p,s)     __bzero
> +#endif
> +
>  #ifndef WMEMSET_SYMBOL
>  # define WMEMSET_CHK_SYMBOL(p,s) p
>  # define WMEMSET_SYMBOL(p,s)   __wmemset
> @@ -63,6 +70,7 @@
>  libc_hidden_builtin_def (memset)
>
>  #if IS_IN (libc)
> +weak_alias (__bzero, bzero)
>  libc_hidden_def (__wmemset)
>  weak_alias (__wmemset, wmemset)
>  libc_hidden_weak (wmemset)
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index 4274bfdd0d..e7b413edad 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -1,6 +1,7 @@
>  ifeq ($(subdir),string)
>
>  sysdep_routines += \
> +  bzero \
>    memchr-avx2 \
>    memchr-avx2-rtm \
>    memchr-evex \
> diff --git a/sysdeps/x86_64/multiarch/bzero.c b/sysdeps/x86_64/multiarch/bzero.c
> new file mode 100644
> index 0000000000..58a14b2c33
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/bzero.c
> @@ -0,0 +1,106 @@
> +/* Multiple versions of bzero.
> +   All versions must be listed in ifunc-impl-list.c.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +/* Define multiple versions only for the definition in libc.  */
> +#if IS_IN (libc)
> +# define __bzero __redirect___bzero
> +# include <string.h>
> +# undef __bzero
> +
> +# define SYMBOL_NAME __bzero
> +# include <init-arch.h>
> +
> +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (sse2_unaligned)
> +  attribute_hidden;
> +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (sse2_unaligned_erms)
> +  attribute_hidden;
> +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned) attribute_hidden;
> +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_erms)
> +  attribute_hidden;
> +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_rtm)
> +  attribute_hidden;
> +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_erms_rtm)
> +  attribute_hidden;
> +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (evex_unaligned)
> +  attribute_hidden;
> +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (evex_unaligned_erms)
> +  attribute_hidden;
> +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx512_unaligned)
> +  attribute_hidden;
> +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx512_unaligned_erms)
> +  attribute_hidden;
> +
> +static inline void *
> +IFUNC_SELECTOR (void)
> +{
> +  const struct cpu_features* cpu_features = __get_cpu_features ();
> +
> +  if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
> +      && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
> +    {
> +      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
> +          && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
> +          && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
> +       {
> +         if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> +           return OPTIMIZE1 (avx512_unaligned_erms);
> +
> +         return OPTIMIZE1 (avx512_unaligned);
> +       }
> +    }
> +
> +  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2))
> +    {
> +      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
> +          && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
> +          && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
> +       {
> +         if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> +           return OPTIMIZE1 (evex_unaligned_erms);
> +
> +         return OPTIMIZE1 (evex_unaligned);
> +       }
> +
> +      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
> +       {
> +         if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> +           return OPTIMIZE1 (avx2_unaligned_erms_rtm);
> +
> +         return OPTIMIZE1 (avx2_unaligned_rtm);
> +       }
> +
> +      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
> +       {
> +         if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> +           return OPTIMIZE1 (avx2_unaligned_erms);
> +
> +         return OPTIMIZE1 (avx2_unaligned);
> +       }
> +    }
> +
> +  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> +    return OPTIMIZE1 (sse2_unaligned_erms);
> +
> +  return OPTIMIZE1 (sse2_unaligned);
> +}
> +
> +libc_ifunc_redirected (__redirect___bzero, __bzero, IFUNC_SELECTOR ());
> +
> +weak_alias (__bzero, bzero)
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index 68a56797d4..a594f4176e 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -300,6 +300,48 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>                               __memset_avx512_no_vzeroupper)
>              )
>
> +  /* Support sysdeps/x86_64/multiarch/bzero.c.  */
> +  IFUNC_IMPL (i, name, bzero,
> +             IFUNC_IMPL_ADD (array, i, bzero, 1,
> +                             __bzero_sse2_unaligned)
> +             IFUNC_IMPL_ADD (array, i, bzero, 1,
> +                             __bzero_sse2_unaligned_erms)
> +             IFUNC_IMPL_ADD (array, i, bzero,
> +                             CPU_FEATURE_USABLE (AVX2),
> +                             __bzero_avx2_unaligned)
> +             IFUNC_IMPL_ADD (array, i, bzero,
> +                             CPU_FEATURE_USABLE (AVX2),
> +                             __bzero_avx2_unaligned_erms)
> +             IFUNC_IMPL_ADD (array, i, bzero,
> +                             (CPU_FEATURE_USABLE (AVX2)
> +                              && CPU_FEATURE_USABLE (RTM)),
> +                             __bzero_avx2_unaligned_rtm)
> +             IFUNC_IMPL_ADD (array, i, bzero,
> +                             (CPU_FEATURE_USABLE (AVX2)
> +                              && CPU_FEATURE_USABLE (RTM)),
> +                             __bzero_avx2_unaligned_erms_rtm)
> +             IFUNC_IMPL_ADD (array, i, bzero,
> +                             (CPU_FEATURE_USABLE (AVX512VL)
> +                              && CPU_FEATURE_USABLE (AVX512BW)
> +                              && CPU_FEATURE_USABLE (BMI2)),
> +                             __bzero_evex_unaligned)
> +             IFUNC_IMPL_ADD (array, i, bzero,
> +                             (CPU_FEATURE_USABLE (AVX512VL)
> +                              && CPU_FEATURE_USABLE (AVX512BW)
> +                              && CPU_FEATURE_USABLE (BMI2)),
> +                             __bzero_evex_unaligned_erms)
> +             IFUNC_IMPL_ADD (array, i, bzero,
> +                             (CPU_FEATURE_USABLE (AVX512VL)
> +                              && CPU_FEATURE_USABLE (AVX512BW)
> +                              && CPU_FEATURE_USABLE (BMI2)),
> +                             __bzero_avx512_unaligned_erms)
> +             IFUNC_IMPL_ADD (array, i, bzero,
> +                             (CPU_FEATURE_USABLE (AVX512VL)
> +                              && CPU_FEATURE_USABLE (AVX512BW)
> +                              && CPU_FEATURE_USABLE (BMI2)),
> +                             __bzero_avx512_unaligned)
> +            )
> +
>    /* Support sysdeps/x86_64/multiarch/rawmemchr.c.  */
>    IFUNC_IMPL (i, name, rawmemchr,
>               IFUNC_IMPL_ADD (array, i, rawmemchr,
> diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
> index 8ac3e479bb..5a5ee6f672 100644
> --- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
> +++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
> @@ -5,6 +5,7 @@
>
>  #define SECTION(p) p##.avx.rtm
>  #define MEMSET_SYMBOL(p,s)     p##_avx2_##s##_rtm
> +#define BZERO_SYMBOL(p,s)      p##_avx2_##s##_rtm
>  #define WMEMSET_SYMBOL(p,s)    p##_avx2_##s##_rtm
>
>  #include "memset-avx2-unaligned-erms.S"
> diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> index c0bf2875d0..a093a2831f 100644
> --- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> @@ -14,6 +14,9 @@
>    vmovd d, %xmm0; \
>    movq r, %rax;
>
> +# define BZERO_ZERO_VEC0() \
> +  vpxor %xmm0, %xmm0, %xmm0
> +
>  # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
>    MEMSET_SET_VEC0_AND_SET_RETURN(d, r)
>
> @@ -29,6 +32,9 @@
>  # ifndef MEMSET_SYMBOL
>  #  define MEMSET_SYMBOL(p,s)   p##_avx2_##s
>  # endif
> +# ifndef BZERO_SYMBOL
> +#  define BZERO_SYMBOL(p,s)    p##_avx2_##s
> +# endif
>  # ifndef WMEMSET_SYMBOL
>  #  define WMEMSET_SYMBOL(p,s)  p##_avx2_##s
>  # endif
> diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> index 5241216a77..727c92133a 100644
> --- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> @@ -19,6 +19,9 @@
>    vpbroadcastb d, %VEC0; \
>    movq r, %rax
>
> +# define BZERO_ZERO_VEC0() \
> +  vpxorq %XMM0, %XMM0, %XMM0
> +
>  # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
>    vpbroadcastd d, %VEC0; \
>    movq r, %rax
> diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> index 6370021506..5d8fa78f05 100644
> --- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> @@ -19,6 +19,9 @@
>    vpbroadcastb d, %VEC0; \
>    movq r, %rax
>
> +# define BZERO_ZERO_VEC0() \
> +  vpxorq %XMM0, %XMM0, %XMM0
> +
>  # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
>    vpbroadcastd d, %VEC0; \
>    movq r, %rax
> diff --git a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
> index 8a6f0c561a..329c58ee46 100644
> --- a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
> @@ -22,6 +22,7 @@
>
>  #if IS_IN (libc)
>  # define MEMSET_SYMBOL(p,s)    p##_sse2_##s
> +# define BZERO_SYMBOL(p,s)     MEMSET_SYMBOL (p, s)
>  # define WMEMSET_SYMBOL(p,s)   p##_sse2_##s
>
>  # ifdef SHARED
> diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> index 1b502b78e4..7c94fcdae1 100644
> --- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> @@ -26,6 +26,10 @@
>
>  #include <sysdep.h>
>
> +#ifndef BZERO_SYMBOL
> +# define BZERO_SYMBOL(p,s)             MEMSET_SYMBOL (p, s)
> +#endif
> +
>  #ifndef MEMSET_CHK_SYMBOL
>  # define MEMSET_CHK_SYMBOL(p,s)                MEMSET_SYMBOL(p, s)
>  #endif
> @@ -87,6 +91,18 @@
>  # define XMM_SMALL     0
>  #endif
>
> +#ifdef USE_LESS_VEC_MASK_STORE
> +# define SET_REG64     rcx
> +# define SET_REG32     ecx
> +# define SET_REG16     cx
> +# define SET_REG8      cl
> +#else
> +# define SET_REG64     rsi
> +# define SET_REG32     esi
> +# define SET_REG16     si
> +# define SET_REG8      sil
> +#endif
> +
>  #define PAGE_SIZE 4096
>
>  /* Macro to calculate size of small memset block for aligning
> @@ -96,18 +112,6 @@
>
>  #ifndef SECTION
>  # error SECTION is not defined!
> -#endif
> -
> -       .section SECTION(.text),"ax",@progbits
> -#if VEC_SIZE == 16 && IS_IN (libc)
> -ENTRY (__bzero)
> -       mov     %RDI_LP, %RAX_LP /* Set return value.  */
> -       mov     %RSI_LP, %RDX_LP /* Set n.  */
> -       xorl    %esi, %esi
> -       pxor    %XMM0, %XMM0
> -       jmp     L(entry_from_bzero)
> -END (__bzero)
> -weak_alias (__bzero, bzero)
>  #endif
>
>  #if IS_IN (libc)
> @@ -123,12 +127,37 @@ ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
>         WMEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
>         WMEMSET_VDUP_TO_VEC0_LOW()
>         cmpq    $VEC_SIZE, %rdx
> -       jb      L(less_vec_no_vdup)
> +       jb      L(less_vec_from_wmemset)
>         WMEMSET_VDUP_TO_VEC0_HIGH()
>         jmp     L(entry_from_wmemset)
>  END (WMEMSET_SYMBOL (__wmemset, unaligned))
>  #endif
>
> +ENTRY (BZERO_SYMBOL(__bzero, unaligned))
> +#if VEC_SIZE > 16
> +       BZERO_ZERO_VEC0 ()
> +#endif
> +       mov     %RDI_LP, %RAX_LP
> +       mov     %RSI_LP, %RDX_LP
> +#ifndef USE_LESS_VEC_MASK_STORE
> +       xorl    %esi, %esi
> +#endif
> +       cmp     $VEC_SIZE, %RDX_LP
> +       jb      L(less_vec_no_vdup)
> +#ifdef USE_LESS_VEC_MASK_STORE
> +       xorl    %esi, %esi
> +#endif
> +#if VEC_SIZE <= 16
> +       BZERO_ZERO_VEC0 ()
> +#endif
> +       cmp     $(VEC_SIZE * 2), %RDX_LP
> +       ja      L(more_2x_vec)
> +       /* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
> +       VMOVU   %VEC(0), (%rdi)
> +       VMOVU   %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
> +       VZEROUPPER_RETURN
> +END (BZERO_SYMBOL(__bzero, unaligned))
> +
>  #if defined SHARED && IS_IN (libc)
>  ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
>         cmp     %RDX_LP, %RCX_LP
> @@ -142,7 +171,6 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned))
>         /* Clear the upper 32 bits.  */
>         mov     %edx, %edx
>  # endif
> -L(entry_from_bzero):
>         cmpq    $VEC_SIZE, %rdx
>         jb      L(less_vec)
>         MEMSET_VDUP_TO_VEC0_HIGH()
> @@ -187,6 +215,31 @@ END (__memset_erms)
>  END (MEMSET_SYMBOL (__memset, erms))
>  # endif
>
> +ENTRY_P2ALIGN (BZERO_SYMBOL(__bzero, unaligned_erms), 6)
> +# if VEC_SIZE > 16
> +       BZERO_ZERO_VEC0 ()
> +# endif
> +       mov     %RDI_LP, %RAX_LP
> +       mov     %RSI_LP, %RDX_LP
> +# ifndef USE_LESS_VEC_MASK_STORE
> +       xorl    %esi, %esi
> +# endif
> +       cmp     $VEC_SIZE, %RDX_LP
> +       jb      L(less_vec_no_vdup)
> +# ifdef USE_LESS_VEC_MASK_STORE
> +       xorl    %esi, %esi
> +# endif
> +# if VEC_SIZE <= 16
> +       BZERO_ZERO_VEC0 ()
> +# endif
> +       cmp     $(VEC_SIZE * 2), %RDX_LP
> +       ja      L(stosb_more_2x_vec)
> +       /* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
> +       VMOVU   %VEC(0), (%rdi)
> +       VMOVU   %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
> +       VZEROUPPER_RETURN
> +END (BZERO_SYMBOL(__bzero, unaligned_erms))
> +
>  # if defined SHARED && IS_IN (libc)
>  ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
>         cmp     %RDX_LP, %RCX_LP
> @@ -229,6 +282,7 @@ L(last_2x_vec):
>         .p2align 4,, 10
>  L(less_vec):
>  L(less_vec_no_vdup):
> +L(less_vec_from_wmemset):
>         /* Less than 1 VEC.  */
>  # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
>  #  error Unsupported VEC_SIZE!
> @@ -374,8 +428,11 @@ L(less_vec):
>         /* Broadcast esi to partial register (i.e VEC_SIZE == 32 broadcast to
>            xmm). This is only does anything for AVX2.  */
>         MEMSET_VDUP_TO_VEC0_LOW ()
> +L(less_vec_from_wmemset):
> +#if VEC_SIZE > 16
>  L(less_vec_no_vdup):
>  #endif
> +#endif
>  L(cross_page):
>  #if VEC_SIZE > 32
>         cmpl    $32, %edx
> @@ -386,7 +443,10 @@ L(cross_page):
>         jge     L(between_16_31)
>  #endif
>  #ifndef USE_XMM_LESS_VEC
> -       MOVQ    %XMM0, %rcx
> +       MOVQ    %XMM0, %SET_REG64
> +#endif
> +#if VEC_SIZE <= 16
> +L(less_vec_no_vdup):
>  #endif
>         cmpl    $8, %edx
>         jge     L(between_8_15)
> @@ -395,7 +455,7 @@ L(cross_page):
>         cmpl    $1, %edx
>         jg      L(between_2_3)
>         jl      L(between_0_0)
> -       movb    %sil, (%LESS_VEC_REG)
> +       movb    %SET_REG8, (%LESS_VEC_REG)
>  L(between_0_0):
>         ret
>
> @@ -428,8 +488,8 @@ L(between_8_15):
>         MOVQ    %XMM0, (%rdi)
>         MOVQ    %XMM0, -8(%rdi, %rdx)
>  #else
> -       movq    %rcx, (%LESS_VEC_REG)
> -       movq    %rcx, -8(%LESS_VEC_REG, %rdx)
> +       movq    %SET_REG64, (%LESS_VEC_REG)
> +       movq    %SET_REG64, -8(%LESS_VEC_REG, %rdx)
>  #endif
>         ret
>
> @@ -442,8 +502,8 @@ L(between_4_7):
>         MOVD    %XMM0, (%rdi)
>         MOVD    %XMM0, -4(%rdi, %rdx)
>  #else
> -       movl    %ecx, (%LESS_VEC_REG)
> -       movl    %ecx, -4(%LESS_VEC_REG, %rdx)
> +       movl    %SET_REG32, (%LESS_VEC_REG)
> +       movl    %SET_REG32, -4(%LESS_VEC_REG, %rdx)
>  #endif
>         ret
>
> @@ -452,12 +512,12 @@ L(between_4_7):
>  L(between_2_3):
>         /* From 2 to 3.  No branch when size == 2.  */
>  #ifdef USE_XMM_LESS_VEC
> -       movb    %sil, (%rdi)
> -       movb    %sil, 1(%rdi)
> -       movb    %sil, -1(%rdi, %rdx)
> +       movb    %SET_REG8, (%rdi)
> +       movb    %SET_REG8, 1(%rdi)
> +       movb    %SET_REG8, -1(%rdi, %rdx)
>  #else
> -       movw    %cx, (%LESS_VEC_REG)
> -       movb    %sil, -1(%LESS_VEC_REG, %rdx)
> +       movw    %SET_REG16, (%LESS_VEC_REG)
> +       movb    %SET_REG8, -1(%LESS_VEC_REG, %rdx)
>  #endif
>         ret
>  END (MEMSET_SYMBOL (__memset, unaligned_erms))
> --
> 2.34.1
>

LGTM.

* Re: [PATCH v2] x86-64: Optimize bzero
  2022-02-08 22:43 [PATCH v2] x86-64: Optimize bzero H.J. Lu
  2022-02-08 23:56 ` Noah Goldstein
@ 2022-02-09 11:41 ` Adhemerval Zanella
  2022-02-09 22:14   ` Noah Goldstein
  1 sibling, 1 reply; 39+ messages in thread
From: Adhemerval Zanella @ 2022-02-09 11:41 UTC (permalink / raw)
  To: H.J. Lu, libc-alpha, Wilco Dijkstra



On 08/02/2022 19:43, H.J. Lu via Libc-alpha wrote:
> Rebase against the current master branch.
> 
> --
> Zero is by far the most common value passed to memset (99%+ of calls for
> Python3 and GCC).
> 
> bzero can be slightly more optimized for this case by using a zero-idiom
> xor for broadcasting the set value to a register (vector or GPR).
> 
> Co-developed-by: Noah Goldstein <goldstein.w.n@gmail.com>

Is it really worth it to resurrect bzero with all these ifunc variants?
Would Python3/GCC or any other programs start to replace memset with bzero
for the sake of this optimization?

I agree with Wilco that the gains are marginal in this case.

> ---
>  sysdeps/x86_64/memset.S                       |   8 ++
>  sysdeps/x86_64/multiarch/Makefile             |   1 +
>  sysdeps/x86_64/multiarch/bzero.c              | 106 +++++++++++++++++
>  sysdeps/x86_64/multiarch/ifunc-impl-list.c    |  42 +++++++
>  .../memset-avx2-unaligned-erms-rtm.S          |   1 +
>  .../multiarch/memset-avx2-unaligned-erms.S    |   6 +
>  .../multiarch/memset-avx512-unaligned-erms.S  |   3 +
>  .../multiarch/memset-evex-unaligned-erms.S    |   3 +
>  .../multiarch/memset-sse2-unaligned-erms.S    |   1 +
>  .../multiarch/memset-vec-unaligned-erms.S     | 110 ++++++++++++++----
>  10 files changed, 256 insertions(+), 25 deletions(-)
>  create mode 100644 sysdeps/x86_64/multiarch/bzero.c
> 
> diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
> index 3f0517bbfc..af26e9cedc 100644
> --- a/sysdeps/x86_64/memset.S
> +++ b/sysdeps/x86_64/memset.S
> @@ -35,6 +35,9 @@
>    punpcklwd %xmm0, %xmm0; \
>    pshufd $0, %xmm0, %xmm0
>  
> +# define BZERO_ZERO_VEC0() \
> +  pxor %xmm0, %xmm0
> +
>  # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
>    movd d, %xmm0; \
>    pshufd $0, %xmm0, %xmm0; \
> @@ -53,6 +56,10 @@
>  # define MEMSET_SYMBOL(p,s)	memset
>  #endif
>  
> +#ifndef BZERO_SYMBOL
> +# define BZERO_SYMBOL(p,s)	__bzero
> +#endif
> +
>  #ifndef WMEMSET_SYMBOL
>  # define WMEMSET_CHK_SYMBOL(p,s) p
>  # define WMEMSET_SYMBOL(p,s)	__wmemset
> @@ -63,6 +70,7 @@
>  libc_hidden_builtin_def (memset)
>  
>  #if IS_IN (libc)
> +weak_alias (__bzero, bzero)
>  libc_hidden_def (__wmemset)
>  weak_alias (__wmemset, wmemset)
>  libc_hidden_weak (wmemset)
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index 4274bfdd0d..e7b413edad 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -1,6 +1,7 @@
>  ifeq ($(subdir),string)
>  
>  sysdep_routines += \
> +  bzero \
>    memchr-avx2 \
>    memchr-avx2-rtm \
>    memchr-evex \
> diff --git a/sysdeps/x86_64/multiarch/bzero.c b/sysdeps/x86_64/multiarch/bzero.c
> new file mode 100644
> index 0000000000..58a14b2c33
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/bzero.c
> @@ -0,0 +1,106 @@
> +/* Multiple versions of bzero.
> +   All versions must be listed in ifunc-impl-list.c.
> +   Copyright (C) 2022 Free Software Foundation, Inc.
> +   This file is part of the GNU C Library.
> +
> +   The GNU C Library is free software; you can redistribute it and/or
> +   modify it under the terms of the GNU Lesser General Public
> +   License as published by the Free Software Foundation; either
> +   version 2.1 of the License, or (at your option) any later version.
> +
> +   The GNU C Library is distributed in the hope that it will be useful,
> +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> +   Lesser General Public License for more details.
> +
> +   You should have received a copy of the GNU Lesser General Public
> +   License along with the GNU C Library; if not, see
> +   <https://www.gnu.org/licenses/>.  */
> +
> +/* Define multiple versions only for the definition in libc.  */
> +#if IS_IN (libc)
> +# define __bzero __redirect___bzero
> +# include <string.h>
> +# undef __bzero
> +
> +# define SYMBOL_NAME __bzero
> +# include <init-arch.h>
> +
> +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (sse2_unaligned)
> +  attribute_hidden;
> +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (sse2_unaligned_erms)
> +  attribute_hidden;
> +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned) attribute_hidden;
> +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_erms)
> +  attribute_hidden;
> +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_rtm)
> +  attribute_hidden;
> +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_erms_rtm)
> +  attribute_hidden;
> +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (evex_unaligned)
> +  attribute_hidden;
> +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (evex_unaligned_erms)
> +  attribute_hidden;
> +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx512_unaligned)
> +  attribute_hidden;
> +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx512_unaligned_erms)
> +  attribute_hidden;
> +
> +static inline void *
> +IFUNC_SELECTOR (void)
> +{
> +  const struct cpu_features* cpu_features = __get_cpu_features ();
> +
> +  if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
> +      && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
> +    {
> +      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
> +          && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
> +          && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
> +	{
> +	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> +	    return OPTIMIZE1 (avx512_unaligned_erms);
> +
> +	  return OPTIMIZE1 (avx512_unaligned);
> +	}
> +    }
> +
> +  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2))
> +    {
> +      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
> +          && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
> +          && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
> +	{
> +	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> +	    return OPTIMIZE1 (evex_unaligned_erms);
> +
> +	  return OPTIMIZE1 (evex_unaligned);
> +	}
> +
> +      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
> +	{
> +	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> +	    return OPTIMIZE1 (avx2_unaligned_erms_rtm);
> +
> +	  return OPTIMIZE1 (avx2_unaligned_rtm);
> +	}
> +
> +      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
> +	{
> +	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> +	    return OPTIMIZE1 (avx2_unaligned_erms);
> +
> +	  return OPTIMIZE1 (avx2_unaligned);
> +	}
> +    }
> +
> +  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> +    return OPTIMIZE1 (sse2_unaligned_erms);
> +
> +  return OPTIMIZE1 (sse2_unaligned);
> +}
> +
> +libc_ifunc_redirected (__redirect___bzero, __bzero, IFUNC_SELECTOR ());
> +
> +weak_alias (__bzero, bzero)
> +#endif
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index 68a56797d4..a594f4176e 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -300,6 +300,48 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>  			      __memset_avx512_no_vzeroupper)
>  	     )
>  
> +  /* Support sysdeps/x86_64/multiarch/bzero.c.  */
> +  IFUNC_IMPL (i, name, bzero,
> +	      IFUNC_IMPL_ADD (array, i, bzero, 1,
> +			      __bzero_sse2_unaligned)
> +	      IFUNC_IMPL_ADD (array, i, bzero, 1,
> +			      __bzero_sse2_unaligned_erms)
> +	      IFUNC_IMPL_ADD (array, i, bzero,
> +			      CPU_FEATURE_USABLE (AVX2),
> +			      __bzero_avx2_unaligned)
> +	      IFUNC_IMPL_ADD (array, i, bzero,
> +			      CPU_FEATURE_USABLE (AVX2),
> +			      __bzero_avx2_unaligned_erms)
> +	      IFUNC_IMPL_ADD (array, i, bzero,
> +			      (CPU_FEATURE_USABLE (AVX2)
> +			       && CPU_FEATURE_USABLE (RTM)),
> +			      __bzero_avx2_unaligned_rtm)
> +	      IFUNC_IMPL_ADD (array, i, bzero,
> +			      (CPU_FEATURE_USABLE (AVX2)
> +			       && CPU_FEATURE_USABLE (RTM)),
> +			      __bzero_avx2_unaligned_erms_rtm)
> +	      IFUNC_IMPL_ADD (array, i, bzero,
> +			      (CPU_FEATURE_USABLE (AVX512VL)
> +			       && CPU_FEATURE_USABLE (AVX512BW)
> +			       && CPU_FEATURE_USABLE (BMI2)),
> +			      __bzero_evex_unaligned)
> +	      IFUNC_IMPL_ADD (array, i, bzero,
> +			      (CPU_FEATURE_USABLE (AVX512VL)
> +			       && CPU_FEATURE_USABLE (AVX512BW)
> +			       && CPU_FEATURE_USABLE (BMI2)),
> +			      __bzero_evex_unaligned_erms)
> +	      IFUNC_IMPL_ADD (array, i, bzero,
> +			      (CPU_FEATURE_USABLE (AVX512VL)
> +			       && CPU_FEATURE_USABLE (AVX512BW)
> +			       && CPU_FEATURE_USABLE (BMI2)),
> +			      __bzero_avx512_unaligned_erms)
> +	      IFUNC_IMPL_ADD (array, i, bzero,
> +			      (CPU_FEATURE_USABLE (AVX512VL)
> +			       && CPU_FEATURE_USABLE (AVX512BW)
> +			       && CPU_FEATURE_USABLE (BMI2)),
> +			      __bzero_avx512_unaligned)
> +	     )
> +
>    /* Support sysdeps/x86_64/multiarch/rawmemchr.c.  */
>    IFUNC_IMPL (i, name, rawmemchr,
>  	      IFUNC_IMPL_ADD (array, i, rawmemchr,
> diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
> index 8ac3e479bb..5a5ee6f672 100644
> --- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
> +++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
> @@ -5,6 +5,7 @@
>  
>  #define SECTION(p) p##.avx.rtm
>  #define MEMSET_SYMBOL(p,s)	p##_avx2_##s##_rtm
> +#define BZERO_SYMBOL(p,s)	p##_avx2_##s##_rtm
>  #define WMEMSET_SYMBOL(p,s)	p##_avx2_##s##_rtm
>  
>  #include "memset-avx2-unaligned-erms.S"
> diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> index c0bf2875d0..a093a2831f 100644
> --- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> @@ -14,6 +14,9 @@
>    vmovd d, %xmm0; \
>    movq r, %rax;
>  
> +# define BZERO_ZERO_VEC0() \
> +  vpxor %xmm0, %xmm0, %xmm0
> +
>  # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
>    MEMSET_SET_VEC0_AND_SET_RETURN(d, r)
>  
> @@ -29,6 +32,9 @@
>  # ifndef MEMSET_SYMBOL
>  #  define MEMSET_SYMBOL(p,s)	p##_avx2_##s
>  # endif
> +# ifndef BZERO_SYMBOL
> +#  define BZERO_SYMBOL(p,s)	p##_avx2_##s
> +# endif
>  # ifndef WMEMSET_SYMBOL
>  #  define WMEMSET_SYMBOL(p,s)	p##_avx2_##s
>  # endif
> diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> index 5241216a77..727c92133a 100644
> --- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> @@ -19,6 +19,9 @@
>    vpbroadcastb d, %VEC0; \
>    movq r, %rax
>  
> +# define BZERO_ZERO_VEC0() \
> +  vpxorq %XMM0, %XMM0, %XMM0
> +
>  # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
>    vpbroadcastd d, %VEC0; \
>    movq r, %rax
> diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> index 6370021506..5d8fa78f05 100644
> --- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> @@ -19,6 +19,9 @@
>    vpbroadcastb d, %VEC0; \
>    movq r, %rax
>  
> +# define BZERO_ZERO_VEC0() \
> +  vpxorq %XMM0, %XMM0, %XMM0
> +
>  # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
>    vpbroadcastd d, %VEC0; \
>    movq r, %rax
> diff --git a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
> index 8a6f0c561a..329c58ee46 100644
> --- a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
> @@ -22,6 +22,7 @@
>  
>  #if IS_IN (libc)
>  # define MEMSET_SYMBOL(p,s)	p##_sse2_##s
> +# define BZERO_SYMBOL(p,s)	MEMSET_SYMBOL (p, s)
>  # define WMEMSET_SYMBOL(p,s)	p##_sse2_##s
>  
>  # ifdef SHARED
> diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> index 1b502b78e4..7c94fcdae1 100644
> --- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> @@ -26,6 +26,10 @@
>  
>  #include <sysdep.h>
>  
> +#ifndef BZERO_SYMBOL
> +# define BZERO_SYMBOL(p,s)		MEMSET_SYMBOL (p, s)
> +#endif
> +
>  #ifndef MEMSET_CHK_SYMBOL
>  # define MEMSET_CHK_SYMBOL(p,s)		MEMSET_SYMBOL(p, s)
>  #endif
> @@ -87,6 +91,18 @@
>  # define XMM_SMALL	0
>  #endif
>  
> +#ifdef USE_LESS_VEC_MASK_STORE
> +# define SET_REG64	rcx
> +# define SET_REG32	ecx
> +# define SET_REG16	cx
> +# define SET_REG8	cl
> +#else
> +# define SET_REG64	rsi
> +# define SET_REG32	esi
> +# define SET_REG16	si
> +# define SET_REG8	sil
> +#endif
> +
>  #define PAGE_SIZE 4096
>  
>  /* Macro to calculate size of small memset block for aligning
> @@ -96,18 +112,6 @@
>  
>  #ifndef SECTION
>  # error SECTION is not defined!
> -#endif
> -
> -	.section SECTION(.text),"ax",@progbits
> -#if VEC_SIZE == 16 && IS_IN (libc)
> -ENTRY (__bzero)
> -	mov	%RDI_LP, %RAX_LP /* Set return value.  */
> -	mov	%RSI_LP, %RDX_LP /* Set n.  */
> -	xorl	%esi, %esi
> -	pxor	%XMM0, %XMM0
> -	jmp	L(entry_from_bzero)
> -END (__bzero)
> -weak_alias (__bzero, bzero)
>  #endif
>  
>  #if IS_IN (libc)
> @@ -123,12 +127,37 @@ ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
>  	WMEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
>  	WMEMSET_VDUP_TO_VEC0_LOW()
>  	cmpq	$VEC_SIZE, %rdx
> -	jb	L(less_vec_no_vdup)
> +	jb	L(less_vec_from_wmemset)
>  	WMEMSET_VDUP_TO_VEC0_HIGH()
>  	jmp	L(entry_from_wmemset)
>  END (WMEMSET_SYMBOL (__wmemset, unaligned))
>  #endif
>  
> +ENTRY (BZERO_SYMBOL(__bzero, unaligned))
> +#if VEC_SIZE > 16
> +	BZERO_ZERO_VEC0 ()
> +#endif
> +	mov	%RDI_LP, %RAX_LP
> +	mov	%RSI_LP, %RDX_LP
> +#ifndef USE_LESS_VEC_MASK_STORE
> +	xorl	%esi, %esi
> +#endif
> +	cmp	$VEC_SIZE, %RDX_LP
> +	jb	L(less_vec_no_vdup)
> +#ifdef USE_LESS_VEC_MASK_STORE
> +	xorl	%esi, %esi
> +#endif
> +#if VEC_SIZE <= 16
> +	BZERO_ZERO_VEC0 ()
> +#endif
> +	cmp	$(VEC_SIZE * 2), %RDX_LP
> +	ja	L(more_2x_vec)
> +	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
> +	VMOVU	%VEC(0), (%rdi)
> +	VMOVU	%VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
> +	VZEROUPPER_RETURN
> +END (BZERO_SYMBOL(__bzero, unaligned))
> +
>  #if defined SHARED && IS_IN (libc)
>  ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
>  	cmp	%RDX_LP, %RCX_LP
> @@ -142,7 +171,6 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned))
>  	/* Clear the upper 32 bits.  */
>  	mov	%edx, %edx
>  # endif
> -L(entry_from_bzero):
>  	cmpq	$VEC_SIZE, %rdx
>  	jb	L(less_vec)
>  	MEMSET_VDUP_TO_VEC0_HIGH()
> @@ -187,6 +215,31 @@ END (__memset_erms)
>  END (MEMSET_SYMBOL (__memset, erms))
>  # endif
>  
> +ENTRY_P2ALIGN (BZERO_SYMBOL(__bzero, unaligned_erms), 6)
> +# if VEC_SIZE > 16
> +	BZERO_ZERO_VEC0 ()
> +# endif
> +	mov	%RDI_LP, %RAX_LP
> +	mov	%RSI_LP, %RDX_LP
> +# ifndef USE_LESS_VEC_MASK_STORE
> +	xorl	%esi, %esi
> +# endif
> +	cmp	$VEC_SIZE, %RDX_LP
> +	jb	L(less_vec_no_vdup)
> +# ifdef USE_LESS_VEC_MASK_STORE
> +	xorl	%esi, %esi
> +# endif
> +# if VEC_SIZE <= 16
> +	BZERO_ZERO_VEC0 ()
> +# endif
> +	cmp	$(VEC_SIZE * 2), %RDX_LP
> +	ja	L(stosb_more_2x_vec)
> +	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
> +	VMOVU	%VEC(0), (%rdi)
> +	VMOVU	%VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
> +	VZEROUPPER_RETURN
> +END (BZERO_SYMBOL(__bzero, unaligned_erms))
> +
>  # if defined SHARED && IS_IN (libc)
>  ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
>  	cmp	%RDX_LP, %RCX_LP
> @@ -229,6 +282,7 @@ L(last_2x_vec):
>  	.p2align 4,, 10
>  L(less_vec):
>  L(less_vec_no_vdup):
> +L(less_vec_from_wmemset):
>  	/* Less than 1 VEC.  */
>  # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
>  #  error Unsupported VEC_SIZE!
> @@ -374,8 +428,11 @@ L(less_vec):
>  	/* Broadcast esi to partial register (i.e VEC_SIZE == 32 broadcast to
>  	   xmm). This is only does anything for AVX2.  */
>  	MEMSET_VDUP_TO_VEC0_LOW ()
> +L(less_vec_from_wmemset):
> +#if VEC_SIZE > 16
>  L(less_vec_no_vdup):
>  #endif
> +#endif
>  L(cross_page):
>  #if VEC_SIZE > 32
>  	cmpl	$32, %edx
> @@ -386,7 +443,10 @@ L(cross_page):
>  	jge	L(between_16_31)
>  #endif
>  #ifndef USE_XMM_LESS_VEC
> -	MOVQ	%XMM0, %rcx
> +	MOVQ	%XMM0, %SET_REG64
> +#endif
> +#if VEC_SIZE <= 16
> +L(less_vec_no_vdup):
>  #endif
>  	cmpl	$8, %edx
>  	jge	L(between_8_15)
> @@ -395,7 +455,7 @@ L(cross_page):
>  	cmpl	$1, %edx
>  	jg	L(between_2_3)
>  	jl	L(between_0_0)
> -	movb	%sil, (%LESS_VEC_REG)
> +	movb	%SET_REG8, (%LESS_VEC_REG)
>  L(between_0_0):
>  	ret
>  
> @@ -428,8 +488,8 @@ L(between_8_15):
>  	MOVQ	%XMM0, (%rdi)
>  	MOVQ	%XMM0, -8(%rdi, %rdx)
>  #else
> -	movq	%rcx, (%LESS_VEC_REG)
> -	movq	%rcx, -8(%LESS_VEC_REG, %rdx)
> +	movq	%SET_REG64, (%LESS_VEC_REG)
> +	movq	%SET_REG64, -8(%LESS_VEC_REG, %rdx)
>  #endif
>  	ret
>  
> @@ -442,8 +502,8 @@ L(between_4_7):
>  	MOVD	%XMM0, (%rdi)
>  	MOVD	%XMM0, -4(%rdi, %rdx)
>  #else
> -	movl	%ecx, (%LESS_VEC_REG)
> -	movl	%ecx, -4(%LESS_VEC_REG, %rdx)
> +	movl	%SET_REG32, (%LESS_VEC_REG)
> +	movl	%SET_REG32, -4(%LESS_VEC_REG, %rdx)
>  #endif
>  	ret
>  
> @@ -452,12 +512,12 @@ L(between_4_7):
>  L(between_2_3):
>  	/* From 2 to 3.  No branch when size == 2.  */
>  #ifdef USE_XMM_LESS_VEC
> -	movb	%sil, (%rdi)
> -	movb	%sil, 1(%rdi)
> -	movb	%sil, -1(%rdi, %rdx)
> +	movb	%SET_REG8, (%rdi)
> +	movb	%SET_REG8, 1(%rdi)
> +	movb	%SET_REG8, -1(%rdi, %rdx)
>  #else
> -	movw	%cx, (%LESS_VEC_REG)
> -	movb	%sil, -1(%LESS_VEC_REG, %rdx)
> +	movw	%SET_REG16, (%LESS_VEC_REG)
> +	movb	%SET_REG8, -1(%LESS_VEC_REG, %rdx)
>  #endif
>  	ret
>  END (MEMSET_SYMBOL (__memset, unaligned_erms))


* Re: [PATCH v2] x86-64: Optimize bzero
  2022-02-09 11:41 ` Adhemerval Zanella
@ 2022-02-09 22:14   ` Noah Goldstein
  2022-02-10 12:35     ` Adhemerval Zanella
  0 siblings, 1 reply; 39+ messages in thread
From: Noah Goldstein @ 2022-02-09 22:14 UTC (permalink / raw)
  To: Adhemerval Zanella; +Cc: H.J. Lu, GNU C Library, Wilco Dijkstra

On Wed, Feb 9, 2022 at 5:41 AM Adhemerval Zanella via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
>
>
> On 08/02/2022 19:43, H.J. Lu via Libc-alpha wrote:
> > Rebase against the current master branch.
> >
> > --
> > Zero is by far the most common value passed to memset (99%+ of calls for
> > Python3 and GCC).
> >
> > bzero can be slightly more optimized for this case by using a zero-idiom
> > xor for broadcasting the set value to a register (vector or GPR).
> >
> > Co-developed-by: Noah Goldstein <goldstein.w.n@gmail.com>
>
> Is it really worth it to resurrect bzero with all these ifunc variants?
> Would Python3/GCC or any other programs start to replace memset with bzero
> for the sake of this optimization?
>
> I agree with Wilco that the gains are marginal in this case.

The cost is only one cache line, and it doesn't interfere with memset at
all, so it's unlikely to cause any problems.

The saving is in skipping the lane-crossing broadcast, which is on the
critical path for memsets in [VEC_SIZE, 2 * VEC_SIZE] (think 32-64 bytes).
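
Roughly, for the AVX2 variant (a simplified sketch, not the exact code
from the patch):

	/* memset (AVX2): the set byte has to be forwarded through a
	   lane-crossing vpbroadcastb before the first store can
	   complete.  */
	vmovd	%esi, %xmm0
	vpbroadcastb %xmm0, %ymm0
	vmovdqu	%ymm0, (%rdi)

	/* bzero (AVX2): vpxor of a register with itself is a zero
	   idiom, so %ymm0 is available right away and the broadcast
	   latency disappears from the store's dependency chain.  */
	vpxor	%xmm0, %xmm0, %xmm0
	vmovdqu	%ymm0, (%rdi)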

The EVEX and AVX512 variants also benefit slightly: because they use
predicated execution for sizes in [0, VEC_SIZE], the gain there is only in
throughput, since the critical path through the mask construction is longer
than the vector set-up.
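
For reference, the masked path looks roughly like this (a sketch assuming a
32-byte vector; the real USE_LESS_VEC_MASK_STORE code differs in register
choice and surrounding page-cross checks):

	/* Predicated store for a size in [0, VEC_SIZE]: build a mask
	   with the low %edx bits set, then store under that mask.  The
	   mov/bzhi/kmov chain bounds the latency, so having %ymm0
	   zeroed early mostly saves uops, not cycles.  */
	movl	$-1, %ecx
	bzhil	%edx, %ecx, %ecx
	kmovd	%ecx, %k1
	vmovdqu8 %ymm0, (%rdi){%k1}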

Agreed, it's not clear whether it's worth starting to replace memset calls
with bzero calls, but at the very least this will improve existing code
that uses bzero.


>
> > ---
> >  sysdeps/x86_64/memset.S                       |   8 ++
> >  sysdeps/x86_64/multiarch/Makefile             |   1 +
> >  sysdeps/x86_64/multiarch/bzero.c              | 106 +++++++++++++++++
> >  sysdeps/x86_64/multiarch/ifunc-impl-list.c    |  42 +++++++
> >  .../memset-avx2-unaligned-erms-rtm.S          |   1 +
> >  .../multiarch/memset-avx2-unaligned-erms.S    |   6 +
> >  .../multiarch/memset-avx512-unaligned-erms.S  |   3 +
> >  .../multiarch/memset-evex-unaligned-erms.S    |   3 +
> >  .../multiarch/memset-sse2-unaligned-erms.S    |   1 +
> >  .../multiarch/memset-vec-unaligned-erms.S     | 110 ++++++++++++++----
> >  10 files changed, 256 insertions(+), 25 deletions(-)
> >  create mode 100644 sysdeps/x86_64/multiarch/bzero.c
> >
> > diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
> > index 3f0517bbfc..af26e9cedc 100644
> > --- a/sysdeps/x86_64/memset.S
> > +++ b/sysdeps/x86_64/memset.S
> > @@ -35,6 +35,9 @@
> >    punpcklwd %xmm0, %xmm0; \
> >    pshufd $0, %xmm0, %xmm0
> >
> > +# define BZERO_ZERO_VEC0() \
> > +  pxor %xmm0, %xmm0
> > +
> >  # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> >    movd d, %xmm0; \
> >    pshufd $0, %xmm0, %xmm0; \
> > @@ -53,6 +56,10 @@
> >  # define MEMSET_SYMBOL(p,s)  memset
> >  #endif
> >
> > +#ifndef BZERO_SYMBOL
> > +# define BZERO_SYMBOL(p,s)   __bzero
> > +#endif
> > +
> >  #ifndef WMEMSET_SYMBOL
> >  # define WMEMSET_CHK_SYMBOL(p,s) p
> >  # define WMEMSET_SYMBOL(p,s) __wmemset
> > @@ -63,6 +70,7 @@
> >  libc_hidden_builtin_def (memset)
> >
> >  #if IS_IN (libc)
> > +weak_alias (__bzero, bzero)
> >  libc_hidden_def (__wmemset)
> >  weak_alias (__wmemset, wmemset)
> >  libc_hidden_weak (wmemset)
> > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> > index 4274bfdd0d..e7b413edad 100644
> > --- a/sysdeps/x86_64/multiarch/Makefile
> > +++ b/sysdeps/x86_64/multiarch/Makefile
> > @@ -1,6 +1,7 @@
> >  ifeq ($(subdir),string)
> >
> >  sysdep_routines += \
> > +  bzero \
> >    memchr-avx2 \
> >    memchr-avx2-rtm \
> >    memchr-evex \
> > diff --git a/sysdeps/x86_64/multiarch/bzero.c b/sysdeps/x86_64/multiarch/bzero.c
> > new file mode 100644
> > index 0000000000..58a14b2c33
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/bzero.c
> > @@ -0,0 +1,106 @@
> > +/* Multiple versions of bzero.
> > +   All versions must be listed in ifunc-impl-list.c.
> > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +/* Define multiple versions only for the definition in libc.  */
> > +#if IS_IN (libc)
> > +# define __bzero __redirect___bzero
> > +# include <string.h>
> > +# undef __bzero
> > +
> > +# define SYMBOL_NAME __bzero
> > +# include <init-arch.h>
> > +
> > +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (sse2_unaligned)
> > +  attribute_hidden;
> > +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (sse2_unaligned_erms)
> > +  attribute_hidden;
> > +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned) attribute_hidden;
> > +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_erms)
> > +  attribute_hidden;
> > +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_rtm)
> > +  attribute_hidden;
> > +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_erms_rtm)
> > +  attribute_hidden;
> > +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (evex_unaligned)
> > +  attribute_hidden;
> > +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (evex_unaligned_erms)
> > +  attribute_hidden;
> > +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx512_unaligned)
> > +  attribute_hidden;
> > +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx512_unaligned_erms)
> > +  attribute_hidden;
> > +
> > +static inline void *
> > +IFUNC_SELECTOR (void)
> > +{
> > +  const struct cpu_features* cpu_features = __get_cpu_features ();
> > +
> > +  if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
> > +      && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
> > +    {
> > +      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
> > +          && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
> > +          && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
> > +     {
> > +       if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> > +         return OPTIMIZE1 (avx512_unaligned_erms);
> > +
> > +       return OPTIMIZE1 (avx512_unaligned);
> > +     }
> > +    }
> > +
> > +  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2))
> > +    {
> > +      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
> > +          && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
> > +          && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
> > +     {
> > +       if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> > +         return OPTIMIZE1 (evex_unaligned_erms);
> > +
> > +       return OPTIMIZE1 (evex_unaligned);
> > +     }
> > +
> > +      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
> > +     {
> > +       if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> > +         return OPTIMIZE1 (avx2_unaligned_erms_rtm);
> > +
> > +       return OPTIMIZE1 (avx2_unaligned_rtm);
> > +     }
> > +
> > +      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
> > +     {
> > +       if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> > +         return OPTIMIZE1 (avx2_unaligned_erms);
> > +
> > +       return OPTIMIZE1 (avx2_unaligned);
> > +     }
> > +    }
> > +
> > +  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> > +    return OPTIMIZE1 (sse2_unaligned_erms);
> > +
> > +  return OPTIMIZE1 (sse2_unaligned);
> > +}
> > +
> > +libc_ifunc_redirected (__redirect___bzero, __bzero, IFUNC_SELECTOR ());
> > +
> > +weak_alias (__bzero, bzero)
> > +#endif
> > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > index 68a56797d4..a594f4176e 100644
> > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > @@ -300,6 +300,48 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> >                             __memset_avx512_no_vzeroupper)
> >            )
> >
> > +  /* Support sysdeps/x86_64/multiarch/bzero.c.  */
> > +  IFUNC_IMPL (i, name, bzero,
> > +           IFUNC_IMPL_ADD (array, i, bzero, 1,
> > +                           __bzero_sse2_unaligned)
> > +           IFUNC_IMPL_ADD (array, i, bzero, 1,
> > +                           __bzero_sse2_unaligned_erms)
> > +           IFUNC_IMPL_ADD (array, i, bzero,
> > +                           CPU_FEATURE_USABLE (AVX2),
> > +                           __bzero_avx2_unaligned)
> > +           IFUNC_IMPL_ADD (array, i, bzero,
> > +                           CPU_FEATURE_USABLE (AVX2),
> > +                           __bzero_avx2_unaligned_erms)
> > +           IFUNC_IMPL_ADD (array, i, bzero,
> > +                           (CPU_FEATURE_USABLE (AVX2)
> > +                            && CPU_FEATURE_USABLE (RTM)),
> > +                           __bzero_avx2_unaligned_rtm)
> > +           IFUNC_IMPL_ADD (array, i, bzero,
> > +                           (CPU_FEATURE_USABLE (AVX2)
> > +                            && CPU_FEATURE_USABLE (RTM)),
> > +                           __bzero_avx2_unaligned_erms_rtm)
> > +           IFUNC_IMPL_ADD (array, i, bzero,
> > +                           (CPU_FEATURE_USABLE (AVX512VL)
> > +                            && CPU_FEATURE_USABLE (AVX512BW)
> > +                            && CPU_FEATURE_USABLE (BMI2)),
> > +                           __bzero_evex_unaligned)
> > +           IFUNC_IMPL_ADD (array, i, bzero,
> > +                           (CPU_FEATURE_USABLE (AVX512VL)
> > +                            && CPU_FEATURE_USABLE (AVX512BW)
> > +                            && CPU_FEATURE_USABLE (BMI2)),
> > +                           __bzero_evex_unaligned_erms)
> > +           IFUNC_IMPL_ADD (array, i, bzero,
> > +                           (CPU_FEATURE_USABLE (AVX512VL)
> > +                            && CPU_FEATURE_USABLE (AVX512BW)
> > +                            && CPU_FEATURE_USABLE (BMI2)),
> > +                           __bzero_avx512_unaligned_erms)
> > +           IFUNC_IMPL_ADD (array, i, bzero,
> > +                           (CPU_FEATURE_USABLE (AVX512VL)
> > +                            && CPU_FEATURE_USABLE (AVX512BW)
> > +                            && CPU_FEATURE_USABLE (BMI2)),
> > +                           __bzero_avx512_unaligned)
> > +          )
> > +
> >    /* Support sysdeps/x86_64/multiarch/rawmemchr.c.  */
> >    IFUNC_IMPL (i, name, rawmemchr,
> >             IFUNC_IMPL_ADD (array, i, rawmemchr,
> > diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
> > index 8ac3e479bb..5a5ee6f672 100644
> > --- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
> > +++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
> > @@ -5,6 +5,7 @@
> >
> >  #define SECTION(p) p##.avx.rtm
> >  #define MEMSET_SYMBOL(p,s)   p##_avx2_##s##_rtm
> > +#define BZERO_SYMBOL(p,s)    p##_avx2_##s##_rtm
> >  #define WMEMSET_SYMBOL(p,s)  p##_avx2_##s##_rtm
> >
> >  #include "memset-avx2-unaligned-erms.S"
> > diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> > index c0bf2875d0..a093a2831f 100644
> > --- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> > +++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> > @@ -14,6 +14,9 @@
> >    vmovd d, %xmm0; \
> >    movq r, %rax;
> >
> > +# define BZERO_ZERO_VEC0() \
> > +  vpxor %xmm0, %xmm0, %xmm0
> > +
> >  # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> >    MEMSET_SET_VEC0_AND_SET_RETURN(d, r)
> >
> > @@ -29,6 +32,9 @@
> >  # ifndef MEMSET_SYMBOL
> >  #  define MEMSET_SYMBOL(p,s) p##_avx2_##s
> >  # endif
> > +# ifndef BZERO_SYMBOL
> > +#  define BZERO_SYMBOL(p,s)  p##_avx2_##s
> > +# endif
> >  # ifndef WMEMSET_SYMBOL
> >  #  define WMEMSET_SYMBOL(p,s)        p##_avx2_##s
> >  # endif
> > diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> > index 5241216a77..727c92133a 100644
> > --- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> > +++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> > @@ -19,6 +19,9 @@
> >    vpbroadcastb d, %VEC0; \
> >    movq r, %rax
> >
> > +# define BZERO_ZERO_VEC0() \
> > +  vpxorq %XMM0, %XMM0, %XMM0
> > +
> >  # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> >    vpbroadcastd d, %VEC0; \
> >    movq r, %rax
> > diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> > index 6370021506..5d8fa78f05 100644
> > --- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> > +++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> > @@ -19,6 +19,9 @@
> >    vpbroadcastb d, %VEC0; \
> >    movq r, %rax
> >
> > +# define BZERO_ZERO_VEC0() \
> > +  vpxorq %XMM0, %XMM0, %XMM0
> > +
> >  # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> >    vpbroadcastd d, %VEC0; \
> >    movq r, %rax
> > diff --git a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
> > index 8a6f0c561a..329c58ee46 100644
> > --- a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
> > +++ b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
> > @@ -22,6 +22,7 @@
> >
> >  #if IS_IN (libc)
> >  # define MEMSET_SYMBOL(p,s)  p##_sse2_##s
> > +# define BZERO_SYMBOL(p,s)   MEMSET_SYMBOL (p, s)
> >  # define WMEMSET_SYMBOL(p,s) p##_sse2_##s
> >
> >  # ifdef SHARED
> > diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> > index 1b502b78e4..7c94fcdae1 100644
> > --- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> > +++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> > @@ -26,6 +26,10 @@
> >
> >  #include <sysdep.h>
> >
> > +#ifndef BZERO_SYMBOL
> > +# define BZERO_SYMBOL(p,s)           MEMSET_SYMBOL (p, s)
> > +#endif
> > +
> >  #ifndef MEMSET_CHK_SYMBOL
> >  # define MEMSET_CHK_SYMBOL(p,s)              MEMSET_SYMBOL(p, s)
> >  #endif
> > @@ -87,6 +91,18 @@
> >  # define XMM_SMALL   0
> >  #endif
> >
> > +#ifdef USE_LESS_VEC_MASK_STORE
> > +# define SET_REG64   rcx
> > +# define SET_REG32   ecx
> > +# define SET_REG16   cx
> > +# define SET_REG8    cl
> > +#else
> > +# define SET_REG64   rsi
> > +# define SET_REG32   esi
> > +# define SET_REG16   si
> > +# define SET_REG8    sil
> > +#endif
> > +
> >  #define PAGE_SIZE 4096
> >
> >  /* Macro to calculate size of small memset block for aligning
> > @@ -96,18 +112,6 @@
> >
> >  #ifndef SECTION
> >  # error SECTION is not defined!
> > -#endif
> > -
> > -     .section SECTION(.text),"ax",@progbits
> > -#if VEC_SIZE == 16 && IS_IN (libc)
> > -ENTRY (__bzero)
> > -     mov     %RDI_LP, %RAX_LP /* Set return value.  */
> > -     mov     %RSI_LP, %RDX_LP /* Set n.  */
> > -     xorl    %esi, %esi
> > -     pxor    %XMM0, %XMM0
> > -     jmp     L(entry_from_bzero)
> > -END (__bzero)
> > -weak_alias (__bzero, bzero)
> >  #endif
> >
> >  #if IS_IN (libc)
> > @@ -123,12 +127,37 @@ ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
> >       WMEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
> >       WMEMSET_VDUP_TO_VEC0_LOW()
> >       cmpq    $VEC_SIZE, %rdx
> > -     jb      L(less_vec_no_vdup)
> > +     jb      L(less_vec_from_wmemset)
> >       WMEMSET_VDUP_TO_VEC0_HIGH()
> >       jmp     L(entry_from_wmemset)
> >  END (WMEMSET_SYMBOL (__wmemset, unaligned))
> >  #endif
> >
> > +ENTRY (BZERO_SYMBOL(__bzero, unaligned))
> > +#if VEC_SIZE > 16
> > +     BZERO_ZERO_VEC0 ()
> > +#endif
> > +     mov     %RDI_LP, %RAX_LP
> > +     mov     %RSI_LP, %RDX_LP
> > +#ifndef USE_LESS_VEC_MASK_STORE
> > +     xorl    %esi, %esi
> > +#endif
> > +     cmp     $VEC_SIZE, %RDX_LP
> > +     jb      L(less_vec_no_vdup)
> > +#ifdef USE_LESS_VEC_MASK_STORE
> > +     xorl    %esi, %esi
> > +#endif
> > +#if VEC_SIZE <= 16
> > +     BZERO_ZERO_VEC0 ()
> > +#endif
> > +     cmp     $(VEC_SIZE * 2), %RDX_LP
> > +     ja      L(more_2x_vec)
> > +     /* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
> > +     VMOVU   %VEC(0), (%rdi)
> > +     VMOVU   %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
> > +     VZEROUPPER_RETURN
> > +END (BZERO_SYMBOL(__bzero, unaligned))
> > +
> >  #if defined SHARED && IS_IN (libc)
> >  ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
> >       cmp     %RDX_LP, %RCX_LP
> > @@ -142,7 +171,6 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned))
> >       /* Clear the upper 32 bits.  */
> >       mov     %edx, %edx
> >  # endif
> > -L(entry_from_bzero):
> >       cmpq    $VEC_SIZE, %rdx
> >       jb      L(less_vec)
> >       MEMSET_VDUP_TO_VEC0_HIGH()
> > @@ -187,6 +215,31 @@ END (__memset_erms)
> >  END (MEMSET_SYMBOL (__memset, erms))
> >  # endif
> >
> > +ENTRY_P2ALIGN (BZERO_SYMBOL(__bzero, unaligned_erms), 6)
> > +# if VEC_SIZE > 16
> > +     BZERO_ZERO_VEC0 ()
> > +# endif
> > +     mov     %RDI_LP, %RAX_LP
> > +     mov     %RSI_LP, %RDX_LP
> > +# ifndef USE_LESS_VEC_MASK_STORE
> > +     xorl    %esi, %esi
> > +# endif
> > +     cmp     $VEC_SIZE, %RDX_LP
> > +     jb      L(less_vec_no_vdup)
> > +# ifdef USE_LESS_VEC_MASK_STORE
> > +     xorl    %esi, %esi
> > +# endif
> > +# if VEC_SIZE <= 16
> > +     BZERO_ZERO_VEC0 ()
> > +# endif
> > +     cmp     $(VEC_SIZE * 2), %RDX_LP
> > +     ja      L(stosb_more_2x_vec)
> > +     /* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
> > +     VMOVU   %VEC(0), (%rdi)
> > +     VMOVU   %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
> > +     VZEROUPPER_RETURN
> > +END (BZERO_SYMBOL(__bzero, unaligned_erms))
> > +
> >  # if defined SHARED && IS_IN (libc)
> >  ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
> >       cmp     %RDX_LP, %RCX_LP
> > @@ -229,6 +282,7 @@ L(last_2x_vec):
> >       .p2align 4,, 10
> >  L(less_vec):
> >  L(less_vec_no_vdup):
> > +L(less_vec_from_wmemset):
> >       /* Less than 1 VEC.  */
> >  # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
> >  #  error Unsupported VEC_SIZE!
> > @@ -374,8 +428,11 @@ L(less_vec):
> >       /* Broadcast esi to partial register (i.e VEC_SIZE == 32 broadcast to
> >          xmm). This is only does anything for AVX2.  */
> >       MEMSET_VDUP_TO_VEC0_LOW ()
> > +L(less_vec_from_wmemset):
> > +#if VEC_SIZE > 16
> >  L(less_vec_no_vdup):
> >  #endif
> > +#endif
> >  L(cross_page):
> >  #if VEC_SIZE > 32
> >       cmpl    $32, %edx
> > @@ -386,7 +443,10 @@ L(cross_page):
> >       jge     L(between_16_31)
> >  #endif
> >  #ifndef USE_XMM_LESS_VEC
> > -     MOVQ    %XMM0, %rcx
> > +     MOVQ    %XMM0, %SET_REG64
> > +#endif
> > +#if VEC_SIZE <= 16
> > +L(less_vec_no_vdup):
> >  #endif
> >       cmpl    $8, %edx
> >       jge     L(between_8_15)
> > @@ -395,7 +455,7 @@ L(cross_page):
> >       cmpl    $1, %edx
> >       jg      L(between_2_3)
> >       jl      L(between_0_0)
> > -     movb    %sil, (%LESS_VEC_REG)
> > +     movb    %SET_REG8, (%LESS_VEC_REG)
> >  L(between_0_0):
> >       ret
> >
> > @@ -428,8 +488,8 @@ L(between_8_15):
> >       MOVQ    %XMM0, (%rdi)
> >       MOVQ    %XMM0, -8(%rdi, %rdx)
> >  #else
> > -     movq    %rcx, (%LESS_VEC_REG)
> > -     movq    %rcx, -8(%LESS_VEC_REG, %rdx)
> > +     movq    %SET_REG64, (%LESS_VEC_REG)
> > +     movq    %SET_REG64, -8(%LESS_VEC_REG, %rdx)
> >  #endif
> >       ret
> >
> > @@ -442,8 +502,8 @@ L(between_4_7):
> >       MOVD    %XMM0, (%rdi)
> >       MOVD    %XMM0, -4(%rdi, %rdx)
> >  #else
> > -     movl    %ecx, (%LESS_VEC_REG)
> > -     movl    %ecx, -4(%LESS_VEC_REG, %rdx)
> > +     movl    %SET_REG32, (%LESS_VEC_REG)
> > +     movl    %SET_REG32, -4(%LESS_VEC_REG, %rdx)
> >  #endif
> >       ret
> >
> > @@ -452,12 +512,12 @@ L(between_4_7):
> >  L(between_2_3):
> >       /* From 2 to 3.  No branch when size == 2.  */
> >  #ifdef USE_XMM_LESS_VEC
> > -     movb    %sil, (%rdi)
> > -     movb    %sil, 1(%rdi)
> > -     movb    %sil, -1(%rdi, %rdx)
> > +     movb    %SET_REG8, (%rdi)
> > +     movb    %SET_REG8, 1(%rdi)
> > +     movb    %SET_REG8, -1(%rdi, %rdx)
> >  #else
> > -     movw    %cx, (%LESS_VEC_REG)
> > -     movb    %sil, -1(%LESS_VEC_REG, %rdx)
> > +     movw    %SET_REG16, (%LESS_VEC_REG)
> > +     movb    %SET_REG8, -1(%LESS_VEC_REG, %rdx)
> >  #endif
> >       ret
> >  END (MEMSET_SYMBOL (__memset, unaligned_erms))

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH v2] x86-64: Optimize bzero
  2022-02-09 22:14   ` Noah Goldstein
@ 2022-02-10 12:35     ` Adhemerval Zanella
  2022-02-10 13:01       ` Wilco Dijkstra
  0 siblings, 1 reply; 39+ messages in thread
From: Adhemerval Zanella @ 2022-02-10 12:35 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: H.J. Lu, GNU C Library, Wilco Dijkstra



On 09/02/2022 19:14, Noah Goldstein wrote:
> On Wed, Feb 9, 2022 at 5:41 AM Adhemerval Zanella via Libc-alpha
> <libc-alpha@sourceware.org> wrote:
>>
>>
>>
>> On 08/02/2022 19:43, H.J. Lu via Libc-alpha wrote:
>>> Rebase against the current master branch.
>>>
>>> --
>>> memset with zero as the value to set is by far the majority value (99%+
>>> for Python3 and GCC).
>>>
>>> bzero can be slightly more optimized for this case by using a zero-idiom
>>> xor for broadcasting the set value to a register (vector or GPR).
>>>
>>> Co-developed-by: Noah Goldstein <goldstein.w.n@gmail.com>
>>
>> Is it really worth it to resurrect bzero with these multiple ifunc variants?
>> Would Python3/GCC or any other programs start to replace memset with bzero
>> for the sake of this optimization?
>>
>> I agree with Wilco that the gains are marginal in this case.
> 
> The cost is only 1 cache line and it doesn't interfere with memset at all
> so it's unlikely to cause any problems.
> 
> The saving is in the lane-cross broadcast which is on the critical
> path for memsets in [VEC_SIZE, 2 * VEC_SIZE] (think 32-64).
> 
> As well for EVEX + AVX512, because it uses predicate execution for
> [0, VEC_SIZE] there is a slight benefit there (although only in throughput
> because the critical path in mask construction is longer than the lane
> VEC setup).
> 
> Agreed it's not clear if it's worth it to start replacing memset calls with
> bzero calls, but at the very least this will improve existing code that
> uses bzero.
> 

My point is this is a lot of code and infrastructure for a symbol marked
as legacy for POSIX.1-2001 and removed on POSIX.1-2008 for the sake of
marginal gains in specific cases.

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH v2] x86-64: Optimize bzero
  2022-02-10 12:35     ` Adhemerval Zanella
@ 2022-02-10 13:01       ` Wilco Dijkstra
  2022-02-10 13:10         ` Adhemerval Zanella
                           ` (2 more replies)
  0 siblings, 3 replies; 39+ messages in thread
From: Wilco Dijkstra @ 2022-02-10 13:01 UTC (permalink / raw)
  To: Adhemerval Zanella, Noah Goldstein; +Cc: H.J. Lu, GNU C Library

Hi,

>> The saving is in the lane-cross broadcast which is on the critical
>> path for memsets in [VEC_SIZE, 2 * VEC_SIZE] (think 32-64).

What is the speedup in eg. bench-memset? Generally the OoO engine will
be able to hide a small increase in latency, so I'd be surprised it shows up
as a significant gain.

If you can show a good speedup in an important application (or benchmark
like SPEC2017) then it may be worth pursuing. However there are other
optimization opportunities that may be easier or give a larger benefit.

>> Agreed it's not clear if it's worth it to start replacing memset calls with
>> bzero calls, but at the very least this will improve existing code that
>> uses bzero.

No code uses bzero, no compiler emits bzero. It died 2 decades ago...

> My point is this is a lot of code and infrastructure for a symbol marked
> as legacy for POSIX.1-2001 and removed on POSIX.1-2008 for the sake of
> marginal gains in specific cases.

Indeed, what we really should discuss is how to remove the last traces of
bcopy and bcmp from GLIBC. Do we need to keep a compatibility symbol
or could we just get rid of it altogether?

Cheers,
Wilco

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH v2] x86-64: Optimize bzero
  2022-02-10 13:01       ` Wilco Dijkstra
@ 2022-02-10 13:10         ` Adhemerval Zanella
  2022-02-10 13:16           ` Adhemerval Zanella
  2022-02-10 13:17           ` Wilco Dijkstra
  2022-02-10 18:28         ` Noah Goldstein
  2022-02-10 18:35         ` Noah Goldstein
  2 siblings, 2 replies; 39+ messages in thread
From: Adhemerval Zanella @ 2022-02-10 13:10 UTC (permalink / raw)
  To: Wilco Dijkstra, Noah Goldstein; +Cc: H.J. Lu, GNU C Library



On 10/02/2022 10:01, Wilco Dijkstra wrote:
> Hi,
> 
>>> The saving is in the lane-cross broadcast which is on the critical
>>> path for memsets in [VEC_SIZE, 2 * VEC_SIZE] (think 32-64).
> 
> What is the speedup in eg. bench-memset? Generally the OoO engine will
> be able to hide a small increase in latency, so I'd be surprised it shows up
> as a significant gain.
> 
> If you can show a good speedup in an important application (or benchmark
> like SPEC2017) then it may be worth pursuing. However there are other
> optimization opportunities that may be easier or give a larger benefit.
> 
>>> Agreed it's not clear if it's worth it to start replacing memset calls with
>>> bzero calls, but at the very least this will improve existing code that
>>> uses bzero.
> 
> No code uses bzero, no compiler emits bzero. It died 2 decades ago...
> 
>> My point is this is a lot of code and infrastructure for a symbol marked
>> as legacy for POSIX.1-2001 and removed on POSIX.1-2008 for the sake of
>> marginal gains in specific cases.
> 
> Indeed, what we really should discuss is how to remove the last traces of
> bcopy and bcmp from GLIBC. Do we need to keep a compatibility symbol
> or could we just get rid of it altogether?

We need to keep the symbols as-is afaiu, since callers might still target
old POSIX where the symbol is defined as supported.  We might add either a
compiler or a linker warning stating the symbol is deprecated, but imho it
would just be better if we stop trying to micro-optimize it and just use
the generic interface (which calls memset).

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH v2] x86-64: Optimize bzero
  2022-02-10 13:10         ` Adhemerval Zanella
@ 2022-02-10 13:16           ` Adhemerval Zanella
  2022-02-10 13:17           ` Wilco Dijkstra
  1 sibling, 0 replies; 39+ messages in thread
From: Adhemerval Zanella @ 2022-02-10 13:16 UTC (permalink / raw)
  To: Wilco Dijkstra, Noah Goldstein; +Cc: H.J. Lu, GNU C Library



On 10/02/2022 10:10, Adhemerval Zanella wrote:
> 
> 
> On 10/02/2022 10:01, Wilco Dijkstra wrote:
>> Hi,
>>
>>>> The saving is in the lane-cross broadcast which is on the critical
>>>> path for memsets in [VEC_SIZE, 2 * VEC_SIZE] (think 32-64).
>>
>> What is the speedup in eg. bench-memset? Generally the OoO engine will
>> be able to hide a small increase in latency, so I'd be surprised it shows up
>> as a significant gain.
>>
>> If you can show a good speedup in an important application (or benchmark
>> like SPEC2017) then it may be worth pursuing. However there are other
>> optimization opportunities that may be easier or give a larger benefit.
>>
>>>> Agreed it's not clear if it's worth it to start replacing memset calls with
>>>> bzero calls, but at the very least this will improve existing code that
>>>> uses bzero.
>>
>> No code uses bzero, no compiler emits bzero. It died 2 decades ago...
>>
>>> My point is this is a lot of code and infrastructure for a symbol marked
>>> as legacy for POSIX.1-2001 and removed on POSIX.1-2008 for the sake of
>>> marginal gains in specific cases.
>>
>> Indeed, what we really should discuss is how to remove the last traces of
>> bcopy and bcmp from GLIBC. Do we need to keep a compatibility symbol
>> or could we just get rid of it altogether?
> 
> We need to keep the symbols as-is afaiu, since callers might still target
> old POSIX where the symbol is defined as supported.  We might add either
> compiler or linker warning stating the symbols is deprecated, but imho it
> would just be better if we stop trying to microoptimize it and just use
> the generic interface (which call memset).

And it makes even more sense since gcc by default either generates a memset
call or expands bzero inline, so there is even less point in adding either an
optimized version of it or ifunc variants that call memset (to avoid the
intra-PLT call).
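
As an illustration of that default lowering (a sketch, not part of the patch
or this thread), a trivial translation unit that calls bzero can be inspected;
on a typical x86-64 gcc the generated assembly references memset, not bzero,
so the bzero ifunc variants are never reached.  The file name is made up:

$ cat calls_bzero.c
#include <strings.h>

void
clear (char *buf, unsigned long n)
{
  /* With default options gcc treats bzero as a builtin and expands it
     to memset (buf, 0, n).  */
  bzero (buf, n);
}
$ gcc -O2 -S calls_bzero.c -o - | grep -iE 'bzero|memset'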

I will probably clean up all the bzero optimizations we have; it is
unfortunate that this patch went in without further discussion.

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH v2] x86-64: Optimize bzero
  2022-02-10 13:10         ` Adhemerval Zanella
  2022-02-10 13:16           ` Adhemerval Zanella
@ 2022-02-10 13:17           ` Wilco Dijkstra
  2022-02-10 13:22             ` Adhemerval Zanella
  1 sibling, 1 reply; 39+ messages in thread
From: Wilco Dijkstra @ 2022-02-10 13:17 UTC (permalink / raw)
  To: Adhemerval Zanella, Noah Goldstein; +Cc: H.J. Lu, GNU C Library

Hi,

> We need to keep the symbols as-is afaiu, since callers might still target
> old POSIX where the symbol is defined as supported.  We might add either
> compiler or linker warning stating the symbols is deprecated, but imho it
> would just be better if we stop trying to microoptimize it and just use
> the generic interface (which call memset).

Compilers have been doing that forever - if you use bzero GCC and LLVM
emit a call to memset. No new calls are emitted to these functions, so there
is no point in having target-specific optimization at all.

However we could disallow use of these functions at all in future GLIBCs.

Cheers,
Wilco

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH v2] x86-64: Optimize bzero
  2022-02-10 13:17           ` Wilco Dijkstra
@ 2022-02-10 13:22             ` Adhemerval Zanella
  2022-02-10 17:50               ` Alejandro Colomar (man-pages)
  0 siblings, 1 reply; 39+ messages in thread
From: Adhemerval Zanella @ 2022-02-10 13:22 UTC (permalink / raw)
  To: Wilco Dijkstra, Noah Goldstein; +Cc: H.J. Lu, GNU C Library



On 10/02/2022 10:17, Wilco Dijkstra wrote:
> Hi,
> 
>> We need to keep the symbols as-is afaiu, since callers might still target
>> old POSIX where the symbol is defined as supported.  We might add either
>> compiler or linker warning stating the symbols is deprecated, but imho it
>> would just be better if we stop trying to microoptimize it and just use
>> the generic interface (which call memset).
> 
> Compilers have been doing that forever - if you use bzero GCC and LLVM
> emit a call to memset. No new calls are emitted to these functions, so there
> is no point in having target-specific optimization at all.
> 
> However we could disallow use of these functions at all in future GLIBCs.

Unfortunately we cannot stop providing such symbols unless we stop being
compatible with legacy POSIX.  And I think we already do not use these
internally.

Also, we could start to remove the other bstring functions as well, bcmp and
bcopy.

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH v2] x86-64: Optimize bzero
  2022-02-10 13:22             ` Adhemerval Zanella
@ 2022-02-10 17:50               ` Alejandro Colomar (man-pages)
  2022-02-10 19:19                 ` Wilco Dijkstra
  2022-02-10 19:42                 ` Adhemerval Zanella
  0 siblings, 2 replies; 39+ messages in thread
From: Alejandro Colomar (man-pages) @ 2022-02-10 17:50 UTC (permalink / raw)
  To: Adhemerval Zanella, Wilco Dijkstra, Noah Goldstein, H.J. Lu; +Cc: GNU C Library

Hi,


On 2/10/22 14:16, Adhemerval Zanella via Libc-alpha wrote:
> On 10/02/2022 10:10, Adhemerval Zanella wrote:
>> On 10/02/2022 10:01, Wilco Dijkstra wrote:
>>>>> Agreed it's not clear if it's worth it to start replacing memset calls with
>>>>> bzero calls, but at the very least this will improve existing code that
>>>>> uses bzero.
>>>
>>> No code uses bzero, no compiler emits bzero. It died 2 decades ago...

Sorry to ruin your day, but there's a bzero(3) user here :)

There are rational reasons to prefer bzero(3) due to its better
interface, and only one to prefer memset(3): standards people decided to
remove bzero(3).  See <https://stackoverflow.com/a/17097978/6872717>.

Consider the following quote from "UNIX Network Programming" by Stevens
et al., Section 1.2 (emphasis added):
> `bzero` is not an ANSI C function. It is derived from early Berkely
> networking code. Nevertheless, we use it throughout the text, instead
> of the ANSI C `memset` function, because `bzero` is easier to remember
> (with only two arguments) than `memset` (with three arguments). Almost
> every vendor that supports the sockets API also provides `bzero`, and
> if not, we provide a macro definition in our `unp.h` header.
>
> Indeed, **the author of TCPv3 [TCP/IP Illustrated, Volume 3 - Stevens
> 1996] made the mistake of swapping the second
> and third arguments to `memset` in 10 occurrences in the first
> printing**. A C compiler cannot catch this error because both arguments
> are of the same type. (Actually, the second argument is an `int` and
> the third argument is `size_t`, which is typically an `unsigned int`,
> but the values specified, 0 and 16, respectively, are still acceptable
> for the other type of argument.) The call to `memset` still worked,
> because only a few of the socket functions actually require that the
> final 8 bytes of an Internet socket address structure be set to 0.
> Nevertheless, it was an error, and one that could be avoided by using
> `bzero`, because swapping the two arguments to `bzero` will always be
> caught by the C compiler if function prototypes are used.

I consistently use bzero(3), and dislike the interface of memset(3) for
zeroing memory.  I checked how many memset(3) calls there are in a
codebase of mine, and there is exactly 1 call to memset(3), for setting
an array representing a large bitfield to 1s: memset(arr, UCHAR_MAX,
sizeof(arr)).  And there are 41 calls to bzero(3).
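
The argument-swap hazard described in the quote can be shown with a small
sketch (an illustration, not code from this thread; the struct is made up).
With prototypes in scope, the swapped bzero call is an integer/pointer
mismatch and gets diagnosed, while the swapped memset call is type-correct
and silently clears zero bytes:

#include <string.h>
#include <strings.h>

struct conn { int fd; char addr[16]; };

void
init_conn (struct conn *c)
{
  /* memset (c, sizeof (*c), 0);  -- 2nd/3rd args swapped: type-correct,
     accepted by the language, but it sets zero bytes (the TCPv3 bug;
     some compilers now have heuristic warnings for this pattern).  */
  memset (c, 0, sizeof (*c));

  /* bzero (sizeof (*c), c);      -- the same swap with bzero is an
     integer/pointer mismatch, rejected once <strings.h> declares the
     prototype.  */
  bzero (c, sizeof (*c));
}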

>>>
>>>> My point is this is a lot of code and infrastructure for a symbol marked
>>>> as legacy for POSIX.1-2001 and removed on POSIX.1-2008 for the sake of
>>>> marginal gains in specific cases.
>>>
>>> Indeed, what we really should discuss is how to remove the last traces of
>>> bcopy and bcmp from GLIBC. Do we need to keep a compatibility symbol
>>> or could we just get rid of it altogether?

I think it's sad that POSIX removed bzero(3).  In the end, people need
to zero memory, and there's no simpler interface than bzero(3) for that.
 memset(3) has a useless extra parameter.  Even if you just do a simple
wrapper as the following, which is no big overhead for glibc I guess,
you would be improving (or not worsening) my and hopefully others' lives:

static inline void bzero(void *s, size_t n)
{
	memset(s, 0, n);
}

Is that really a pain to maintain?

If libc ever removes bzero(3), it's not like I'm going to start using
memset(3).  I'll just define it myself.  That's not an improvement, but
the opposite, IMO.

Ideally, POSIX would reconsider some day, and reincorporate bzero(3),
but I don't expect that any soon :(.

>>
>> We need to keep the symbols as-is afaiu, since callers might still target
>> old POSIX where the symbol is defined as supported.  We might add either
>> compiler or linker warning stating the symbols is deprecated, but imho it
>> would just be better if we stop trying to microoptimize it and just use
>> the generic interface (which call memset).

No please; I know they are deprecated and explicitly want to use them.
I don't need some extra spurious warning.

Other projects that I have touched (including nginx and shadow-utils)
seem to hold a similar opinion to mine: they define memzero() or
some other similar name, with an interface identical to bzero(3) (just a
different name to avoid problems), to wrap either bzero(3) or memset(3),
depending on what is available.

Thanks,

Alex

-- 
Alejandro Colomar
Linux man-pages comaintainer; https://www.kernel.org/doc/man-pages/
http://www.alejandro-colomar.es/

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH v2] x86-64: Optimize bzero
  2022-02-10 13:01       ` Wilco Dijkstra
  2022-02-10 13:10         ` Adhemerval Zanella
@ 2022-02-10 18:28         ` Noah Goldstein
  2022-02-10 18:35         ` Noah Goldstein
  2 siblings, 0 replies; 39+ messages in thread
From: Noah Goldstein @ 2022-02-10 18:28 UTC (permalink / raw)
  To: Wilco Dijkstra; +Cc: Adhemerval Zanella, H.J. Lu, GNU C Library

On Thu, Feb 10, 2022 at 7:02 AM Wilco Dijkstra <Wilco.Dijkstra@arm.com> wrote:
>
> Hi,
>
> >> The saving is in the lane-cross broadcast which is on the critical
> >> path for memsets in [VEC_SIZE, 2 * VEC_SIZE] (think 32-64).
>
> What is the speedup in eg. bench-memset? Generally the OoO engine will
> be able to hide a small increase in latency, so I'd be surprised it shows up
> as a significant gain.
>
> If you can show a good speedup in an important application (or benchmark
> like SPEC2017) then it may be worth pursuing. However there are other
> optimization opportunities that may be easier or give a larger benefit.

I very much doubt there is any benefit on SPEC or elsewhere, unless the
compiler decided to build zeros with long dependency chains instead of an
xor.

>
> >> Agreed it's not clear if it's worth it to start replacing memset calls with
> >> bzero calls, but at the very least this will improve existing code that
> >> uses bzero.
>
> No code uses bzero, no compiler emits bzero. It died 2 decades ago...
>
> > My point is this is a lot of code and infrastructure for a symbol marked
> > as legacy for POSIX.1-2001 and removed on POSIX.1-2008 for the sake of
> > marginal gains in specific cases.
>
> Indeed, what we really should discuss is how to remove the last traces of
> bcopy and bcmp from GLIBC. Do we need to keep a compatibility symbol
> or could we just get rid of it altogether?
>
> Cheers,
> Wilco

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH v2] x86-64: Optimize bzero
  2022-02-10 13:01       ` Wilco Dijkstra
  2022-02-10 13:10         ` Adhemerval Zanella
  2022-02-10 18:28         ` Noah Goldstein
@ 2022-02-10 18:35         ` Noah Goldstein
  2 siblings, 0 replies; 39+ messages in thread
From: Noah Goldstein @ 2022-02-10 18:35 UTC (permalink / raw)
  To: Wilco Dijkstra; +Cc: Adhemerval Zanella, H.J. Lu, GNU C Library

On Thu, Feb 10, 2022 at 7:02 AM Wilco Dijkstra <Wilco.Dijkstra@arm.com> wrote:
>
> Hi,
>
> >> The saving is in the lane-cross broadcast which is on the critical
> >> path for memsets in [VEC_SIZE, 2 * VEC_SIZE] (think 32-64).
>
> What is the speedup in eg. bench-memset? Generally the OoO engine will
> be able to hide a small increase in latency, so I'd be surprised it shows up
> as a significant gain.

Well, comparing the previous sse2 bzero against the avx2/evex/avx512 versions
there is obviously a speedup. Comparing memset-${version} vs bzero-${version},
it's ambiguous whether there is any benefit.

>
> If you can show a good speedup in an important application (or benchmark
> like SPEC2017) then it may be worth pursuing. However there are other
> optimization opportunities that may be easier or give a larger benefit.
>
> >> Agreed it's not clear if it's worth it to start replacing memset calls with
> >> bzero calls, but at the very least this will improve existing code that
> >> uses bzero.
>
> No code uses bzero, no compiler emits bzero. It died 2 decades ago...
>
> > My point is this is a lot of code and infrastructure for a symbol marked
> > as legacy for POSIX.1-2001 and removed on POSIX.1-2008 for the sake of
> > marginal gains in specific cases.
>
> Indeed, what we really should discuss is how to remove the last traces of
> bcopy and bcmp from GLIBC. Do we need to keep a compatibility symbol
> or could we just get rid of it altogether?
>
> Cheers,
> Wilco

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH v2] x86-64: Optimize bzero
  2022-02-10 17:50               ` Alejandro Colomar (man-pages)
@ 2022-02-10 19:19                 ` Wilco Dijkstra
  2022-02-10 20:27                   ` Alejandro Colomar (man-pages)
  2022-02-10 19:42                 ` Adhemerval Zanella
  1 sibling, 1 reply; 39+ messages in thread
From: Wilco Dijkstra @ 2022-02-10 19:19 UTC (permalink / raw)
  To: Alejandro Colomar (man-pages),
	Adhemerval Zanella, Noah Goldstein, H.J. Lu
  Cc: GNU C Library

Hi Alex,

Don't worry, you're not a bzero user - just check your binaries and
I bet you will find zero calls to bzero!

GCC/LLVM avoid calls to bzero, bcmp, bcopy and mempcpy. The goal is to
avoid unnecessary proliferation of almost identical string functions where
there is absolutely no performance benefit. In fact this duplication is typically
a net loss, partly due to reducing optimization effort and the extra runtime
overhead due to having multiple copies of the same function in caches and
predictors.

However if you care about having a standardized call to zero memory, just
propose it for the next C/C++ standard. Given we will not agree on naming,
we could just add memzero, memclr, memset_zero and bcopy as synonyms
for memset.

Cheers,
Wilco

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH v2] x86-64: Optimize bzero
  2022-02-10 17:50               ` Alejandro Colomar (man-pages)
  2022-02-10 19:19                 ` Wilco Dijkstra
@ 2022-02-10 19:42                 ` Adhemerval Zanella
  1 sibling, 0 replies; 39+ messages in thread
From: Adhemerval Zanella @ 2022-02-10 19:42 UTC (permalink / raw)
  To: Alejandro Colomar (man-pages), Wilco Dijkstra, Noah Goldstein, H.J. Lu
  Cc: GNU C Library



On 10/02/2022 14:50, Alejandro Colomar (man-pages) wrote:
> Hi,
> 
> 
> On 2/10/22 14:16, Adhemerval Zanella via Libc-alpha wrote:
>> On 10/02/2022 10:10, Adhemerval Zanella wrote:
>>> On 10/02/2022 10:01, Wilco Dijkstra wrote:
>>>>>> Agreed it's not clear if it's worth it to start replacing memset calls with
>>>>>> bzero calls, but at the very least this will improve existing code that
>>>>>> uses bzero.
>>>>
>>>> No code uses bzero, no compiler emits bzero. It died 2 decades ago...
> 
> Sorry to ruin your day, but there's a bzero(3) user here :)
> 
> There are rational reasons to prefer bzero(3) due to it's better
> interface, and only one to prefer memset(3): standards people decided to
> remove bzero(3).  See <https://stackoverflow.com/a/17097978/6872717>.
> 
> Consider the following quote from "UNIX Network Programming" by Stevens
> et al., Section 1.2 (emphasis added):
>> `bzero` is not an ANSI C function. It is derived from early Berkely
>> networking code. Nevertheless, we use it throughout the text, instead
>> of the ANSI C `memset` function, because `bzero` is easier to remember
>> (with only two arguments) than `memset` (with three arguments). Almost
>> every vendor that supports the sockets API also provides `bzero`, and
>> if not, we provide a macro definition in our `unp.h` header.
>>
>> Indeed, **the author of TCPv3 [TCP/IP Illustrated, Volume 3 - Stevens
>> 1996] made the mistake of swapping the second
>> and third arguments to `memset` in 10 occurrences in the first
>> printing**. A C compiler cannot catch this error because both arguments
>> are of the same type. (Actually, the second argument is an `int` and
>> the third argument is `size_t`, which is typically an `unsigned int`,
>> but the values specified, 0 and 16, respectively, are still acceptable
>> for the other type of argument.) The call to `memset` still worked,
>> because only a few of the socket functions actually require that the
>> final 8 bytes of an Internet socket address structure be set to 0.
>> Nevertheless, it was an error, and one that could be avoided by using
>> `bzero`, because swapping the two arguments to `bzero` will always be
>> caught by the C compiler if function prototypes are used.
> 
> I consistently use bzero(3), and dislike the interface of memset(3) for
> zeroing memory.  I checked how many memset(3) calls there are in a
> codebase of mine, and there is exactly 1 call to memset(3), for setting
> an array representing a large bitfield to 1s: memset(arr, UCHAR_MAX,
> sizeof(arr)).  And there are 41 calls to bzero(3).

The bzero symbol won't go away, since glibc will continue to support old POSIX
interfaces, but Wilco has a point: it is an interface that has been
deprecated for more than 2 decades and usually only BSD-like code
still keeps using it (explicit_bzero originated from OpenBSD).

> 
>>>>
>>>>> My point is this is a lot of code and infrastructure for a symbol
> marked
>>>>> as legacy for POSIX.1-2001 and removed on POSIX.1-2008 for the sake of
>>>>> marginal gains in specific cases.
>>>>
>>>> Indeed, what we really should discuss is how to remove the last
> traces of
>>>> bcopy and bcmp from GLIBC. Do we need to keep a compatibility symbol
>>>> or could we just get rid of it altogether?
> 
> I think it's sad that POSIX removed bzero(3).  In the end, people need
> to zero memory, and there's no simpler interface than bzero(3) for that.
>  memset(3) has a useless extra parameter.  Even if you just do a simple
> wrapper as the following, which is no big overhead for glibc I guess,
> you would be improving (or not worsening) my and hopefully others' lives:
> 
> static inline bzero(s, n)
> {
> 	memset(s, 0, n);
> }
> 
> Is that really a pain to maintain?

Yes, this is *exactly* what we are trying to remove, and I will oppose
reintroducing it after all the cleanups we have done on this front (for
instance 18b10de7ced9e9).

Besides, your code is most likely calling memset under the hood: gcc by
default converts bzero calls to memset.  It seems to be really tricky to
actually force gcc to emit a bzero call to libc; I could only make it work
by not including string.h, adding an explicit prototype for bzero, and
compiling with -fno-builtin:

$ cat bzero.c 
#include <stddef.h>
extern void bzero (void *, size_t s);

int main (int argc, char *argv[])
{
  char b[1024];
  bzero (b, sizeof b);
  return 0;
}
$ gcc -Wall -fno-builtin bzero.c -o test && objdump -R test | grep -w bzero
0000000000003fd0 R_X86_64_JUMP_SLOT  bzero@GLIBC_2.2.5

> 
> If libc ever removes bzero(3), it's not like I'm going to start using
> memset(3).  I'll just define it myself.  That's not an improvement, but
> the opposite, IMO.
> 
> Ideally, POSIX would reconsider some day, and reincorporate bzero(3),
> but I don't expect that any soon :(.

I really think it is unlikely, but it is just my view.

> 
>>>
>>> We need to keep the symbols as-is afaiu, since callers might still target
>>> old POSIX where the symbol is defined as supported.  We might add either
>>> compiler or linker warning stating the symbols is deprecated, but imho it
>>> would just be better if we stop trying to microoptimize it and just use
>>> the generic interface (which call memset).
> 
> No please, I know they are deprecated, and explicitly want to use it.
> Don't need some extra spurious warning.
> 
> Other projects that I have touched (including nginx and shadow-utils),
> seem to have some similar opinion to me, and they define memzero() or
> some other similar name, with an interface identical to bzero(3) (just a
> different name to avoid problems), to wrap either bzero(3) or memset(3),
> depending on what is available.

We are discussing different subjects here: what I want is to remove the
glibc *internal* optimization for bzero, which is essentially an
implementation detail.  At first glance it would change performance;
however, gcc works hard at replacing bzero/bcmp/bcopy with their
str* counterparts, so it is highly unlikely that newer binaries will
actually call bzero.

I did an experiment on my system and indeed there is no calls to bzero.

$ cat count_bzero.sh
#!/bin/bash

files=`IFS=':';for i in $PATH; do test -d "$i" && find "$i" -maxdepth 1 -executable -type f; done`
total=0
for file in $files; do
  bzero=`objdump -R $file 2>&1`
  if [ $? -eq 0 ]; then
    ncalls=`echo $bzero | grep -w bzero | wc -l`
    ((total=total+ncalls))
    if [ $ncalls -gt 0 ]; then
      echo "$file: $ncalls"
    fi
  fi
done
echo "TOTAL=$total"
$ ./count_bzero.sh
TOTAL=0



^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH v2] x86-64: Optimize bzero
  2022-02-10 19:19                 ` Wilco Dijkstra
@ 2022-02-10 20:27                   ` Alejandro Colomar (man-pages)
  2022-02-10 20:42                     ` Adhemerval Zanella
  0 siblings, 1 reply; 39+ messages in thread
From: Alejandro Colomar (man-pages) @ 2022-02-10 20:27 UTC (permalink / raw)
  To: Wilco Dijkstra, Adhemerval Zanella, Noah Goldstein, H.J. Lu; +Cc: GNU C Library

Hi Wilco and Adhemerval,


On 2/10/22 14:01, Wilco Dijkstra via Libc-alpha wrote:
> No code uses bzero, no compiler emits bzero. It died 2 decades ago...
>
>> My point is this is a lot of code and infrastructure for a symbol marked
>> as legacy for POSIX.1-2001 and removed on POSIX.1-2008 for the sake of
>> marginal gains in specific cases.

That was what triggered my fears.  I, as a user, not a compiler or libc
writer, am not really interested in the implementation details, as long
as the implementation does the magic for me.  I want to be able to keep
using bzero(3) in source code, but if the compiler transforms it into
memset(3) (or a for loop), I don't even want to know.



On 2/10/22 20:19, Wilco Dijkstra wrote:
> Hi Alex,
> 
> Don't worry, you're not a bzero user - just check your binaries and
> I bet you will find zero calls to bzero!
> 

:)

> GCC/LLVM avoid calls to bzero, bcmp, bcopy and mempcpy. The goal is to
> avoid unnecessary proliferation of almost identical string functions where
> there is absolutely no performance benefit. In fact this duplication is typically
> a net loss, partly due to reducing optimization effort and the extra runtime
> overhead due to having multiple copies of the same function in caches and
> predictors.
> 
> However if you care about having a standardized call to zero memory, just
> propose it for the next C/C++ standard. Given we will not agree on naming,
> we could just add memzero, memclr, memset_zero and bcopy as synonyms
> for memset.

Hmm, I'll do so.  Will try to press bzero(3) into C3x.

On 2/10/22 20:42, Adhemerval Zanella wrote:
> The bzero won't go away since glibc will continue to support old POSIX
> interfaces, but Wilco has a point: it is an interface that has been
> deprecated for more than 2 decades and usually only BSD-like code
> still keeps using it (explicit_bzero was initialized from openbsd).
> [...]>>
>> static inline bzero(s, n)
>> {
>> 	memset(s, 0, n);
>> }
>>
>> Is that really a pain to maintain?
>
> Yes, this is *exactly* what we are trying to remove and I will oppose to
> reintroduce it after all the cleanups we did in this front (for instance
> 18b10de7ced9e9).
>
> Besides your code most likely is calling memset under the hoods, gcc by
> default converts bzero calls to memset.  It seems to be really tricky to
> actually force gcc emit a bzero call to libc, I would could make by not
> including string.h, adding an explicit prototype to bzero, along with
> -fno-builtin:

I guess that it's GCC that defines that (or equivalent) code, instead of
glibc.  As a user-space programmer, that's fine by me :-).

>> If libc ever removes bzero(3), it's not like I'm going to start using
>> memset(3).  I'll just define it myself.  That's not an improvement, but
>> the opposite, IMO.
>>
>> Ideally, POSIX would reconsider some day, and reincorporate bzero(3),
>> but I don't expect that any soon :(.
>
> I really think it is unlikely, but it is just my view.
> [...]>> Other projects that I have touched (including nginx and
>> shadow-utils),
>> seem to have some similar opinion to me, and they define memzero() or
>> some other similar name, with an interface identical to bzero(3) (just a
>> different name to avoid problems), to wrap either bzero(3) or memset(3),
>> depending on what is available.
>
> We are discussing different subjects here: what I want is to remove the
> glibc *internal* optimization for bzero, which is essentially an
> implementation detail.  In a first glance it would change performance,
> however gcc does a hard job replacing bzero/bcmp/bcopy with their
> str* counterparts, so it highly unlike that newer binaries will actually
> call bzero.

Okay, then yes, go ahead and remove bzero(3) from glibc if GCC will
continue supporting it.  Just remember that some users keep writing and
wanting to write bzero(3) instead of memset(3) in their .c files, so
it's far from being dead in source code.

Cheers,

Alex

-- 
Alejandro Colomar
Linux man-pages comaintainer; https://www.kernel.org/doc/man-pages/
http://www.alejandro-colomar.es/

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH v2] x86-64: Optimize bzero
  2022-02-10 20:27                   ` Alejandro Colomar (man-pages)
@ 2022-02-10 20:42                     ` Adhemerval Zanella
  2022-02-10 21:07                       ` Patrick McGehearty
  2022-02-10 22:00                       ` Alejandro Colomar (man-pages)
  0 siblings, 2 replies; 39+ messages in thread
From: Adhemerval Zanella @ 2022-02-10 20:42 UTC (permalink / raw)
  To: Alejandro Colomar (man-pages), Wilco Dijkstra, Noah Goldstein, H.J. Lu
  Cc: GNU C Library



On 10/02/2022 17:27, Alejandro Colomar (man-pages) wrote:
>> We are discussing different subjects here: what I want is to remove the
>> glibc *internal* optimization for bzero, which is essentially an
>> implementation detail.  In a first glance it would change performance,
>> however gcc does a hard job replacing bzero/bcmp/bcopy with their
>> str* counterparts, so it highly unlike that newer binaries will actually
>> call bzero.
> 
> Okay, then yes, go ahead and remove bzero(3) from glibc if GCC will
> continue supporting it.  Just remember that some users keep writing and
> wanting to write bzero(3) instead of memset(3) in their .c files, so
> it's far from being dead in source code.

Again, I am not proposing to *remove* bzero, but rather the internal
optimizations that currently only add code complexity and maintenance
burden.  My patchset [1] will keep the ABI as-is; the difference is that
bcopy and bzero will use the default implementation on all architectures.

[1] https://patchwork.sourceware.org/project/glibc/list/?series=7243

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH v2] x86-64: Optimize bzero
  2022-02-10 20:42                     ` Adhemerval Zanella
@ 2022-02-10 21:07                       ` Patrick McGehearty
  2022-02-11 13:01                         ` Adhemerval Zanella
  2022-02-10 22:00                       ` Alejandro Colomar (man-pages)
  1 sibling, 1 reply; 39+ messages in thread
From: Patrick McGehearty @ 2022-02-10 21:07 UTC (permalink / raw)
  To: libc-alpha

Just as another point of information, Solaris libc implemented
bzero by moving the arguments around appropriately and then jumping to
memset. No one noticed enough to file a complaint. Of course,
short fixed-length bzero was handled with inline stores of zero
by the compiler. For long vector bzeroing, the overhead was
negligible.

When certain Sparc hardware implementations provided faster methods
for zeroing a cache line at a time on cache line boundaries,
memset added a single test for zero, if and only if the length passed
to memset was over a threshold that seemed likely to make it
worthwhile to use the faster method. The principal advantage
of the fast zeroing operation is that it did not require data
to move from memory to cache before writing zeros to memory,
protecting cache locality in the face of large block zeroing.
I was responsible for much of that optimization effort.
Whether that optimization was really worth it is open for debate
for a variety of reasons that I won't go into just now.

Apps still used bzero or memset(target,zero,length) according to
their preferences, but the code was unified under memset.

I am inclined to agree with keeping bzero in the API for
compatibility with old code/old binaries/old programmers. :-)

Using shared memset code for the implementation of bzero
is worthwhile for reducing future maintenance costs.

- Patrick McGehearty
former Sparc/Solaris performance tuning person



On 2/10/2022 2:42 PM, Adhemerval Zanella via Libc-alpha wrote:
>
> On 10/02/2022 17:27, Alejandro Colomar (man-pages) wrote:
>>> We are discussing different subjects here: what I want is to remove the
>>> glibc *internal* optimization for bzero, which is essentially an
>>> implementation detail.  In a first glance it would change performance,
>>> however gcc does a hard job replacing bzero/bcmp/bcopy with their
>>> str* counterparts, so it highly unlike that newer binaries will actually
>>> call bzero.
>> Okay, then yes, go ahead and remove bzero(3) from glibc if GCC will
>> continue supporting it.  Just remember that some users keep writing and
>> wanting to write bzero(3) instead of memset(3) in their .c files, so
>> it's far from being dead in source code.
> Again, I am not proposing to *remove* bzero, but rather the internal
> optimizations that currently only adds code complexity and maintenance
> burden.  My patchset [1] will keep the ABI as-is, the difference is
> bcopy and bzero will use the default implementation on all architectures.
>
> [1] https://patchwork.sourceware.org/project/glibc/list/?series=7243


^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH v2] x86-64: Optimize bzero
  2022-02-10 20:42                     ` Adhemerval Zanella
  2022-02-10 21:07                       ` Patrick McGehearty
@ 2022-02-10 22:00                       ` Alejandro Colomar (man-pages)
  1 sibling, 0 replies; 39+ messages in thread
From: Alejandro Colomar (man-pages) @ 2022-02-10 22:00 UTC (permalink / raw)
  To: Adhemerval Zanella, Wilco Dijkstra, Noah Goldstein, H.J. Lu; +Cc: GNU C Library

On 2/10/22 21:42, Adhemerval Zanella wrote:
> Again, I am not proposing to *remove* bzero, but rather the internal
> optimizations that currently only adds code complexity and maintenance
> burden.  My patchset [1] will keep the ABI as-is, the difference is
> bcopy and bzero will use the default implementation on all architectures.

That's good.

Thanks,

Alex

-- 
Alejandro Colomar
Linux man-pages comaintainer; https://www.kernel.org/doc/man-pages/
http://www.alejandro-colomar.es/

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH v2] x86-64: Optimize bzero
  2022-02-10 21:07                       ` Patrick McGehearty
@ 2022-02-11 13:01                         ` Adhemerval Zanella
  2022-02-12 23:46                           ` Noah Goldstein
  0 siblings, 1 reply; 39+ messages in thread
From: Adhemerval Zanella @ 2022-02-11 13:01 UTC (permalink / raw)
  To: Patrick McGehearty, libc-alpha



On 10/02/2022 18:07, Patrick McGehearty via Libc-alpha wrote:
> Just as another point of information, Solaris libc implemented
> bzero as moving arguments around appropriately then jumping to
> memset. Noone noticed enough to file a complaint. Of course,
> short fixed-length bzero was handled with in line stores of zero
> by the compiler. For long vector bzeroing, the overhead was
> negligible.
> 
> When certain Sparc hardware implementations provided faster methods
> for zeroing a cache line at a time on cache line boundaries,
> memset added a single test for zero ifandonlyif the length of code
> to memset was over a threshold that seemed likely to make it
> worthwhile to use the faster method. The principal advantage
> of the fast zeroing operation is that it did not require data
> to move from memory to cache before writing zeros to memory,
> protecting cache locality in the face of large block zeroing.
> I was responsible for much of that optimization effort.
> Whether that optimization was really worth it is open for debate
> for a variety of reasons that I won't go into just now.

Afaik this is pretty much what optimized memset implementations do, if
the architecture allows it.  For instance, aarch64 uses 'dc zva' for
sizes larger than 256 bytes and powerpc uses dcbz with a similar
strategy.
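
Roughly, the shape is something like the C sketch below (made-up
threshold/line-size constants and a stand-in zero_cache_lines() for the
architecture-specific instruction; not the actual glibc code):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

#define ZERO_THRESHOLD 256   /* hypothetical cut-over point */
#define LINE_SIZE 64         /* hypothetical cache-line size */

/* Stand-in for the cache-line zeroing primitive ('dc zva' on aarch64,
   dcbz on powerpc); a plain loop here so the sketch builds anywhere.
   P must be LINE_SIZE-aligned and N a multiple of LINE_SIZE.  */
static void
zero_cache_lines (unsigned char *p, size_t n)
{
  for (size_t i = 0; i < n; i++)
    p[i] = 0;
}

void *
sketch_memset (void *dst, int c, size_t n)
{
  unsigned char *p = dst;
  if (c == 0 && n >= ZERO_THRESHOLD)
    {
      /* Zero the unaligned head with ordinary stores, then hand the
         aligned middle to the cache-line primitive.  */
      size_t head = (-(uintptr_t) p) & (LINE_SIZE - 1);
      memset (p, 0, head);
      p += head;
      n -= head;
      size_t middle = n & ~(size_t) (LINE_SIZE - 1);
      zero_cache_lines (p, middle);
      p += middle;
      n -= middle;
    }
  memset (p, c, n);
  return dst;
}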

> 
> Apps still used bzero or memset(target,zero,length) according to
> their preferences, but the code was unified under memset.
> 
> I am inclined to agree with keeping bzero in the API for
> compatibility with old code/old binaries/old programmers. :-)

The main driver for removing the bzero internal implementation is just
that *currently* gcc does not generate bzero calls by default
(I couldn't find a single binary that calls bzero on my system).

So to actually see any performance advantage from the optimized bzero,
we would need to revisit the gcc optimization that transforms bzero
into memset (which would need to be applied on a per-architecture
basis), and it seems highly unlikely the gcc maintainers would accept
that.

Some time ago LLVM tried to do something similar with bcmp, but in the
end it was not a good idea to reuse an already defined symbol and it
ended up with __memcmp_eq instead.

> 
> Using shared memset code for the implementation of bzero
> is worthwhile for reducing future maintenance costs.
> 
> - Patrick McGehearty
> former Sparc/Solaris performance tuning person
> 
> 
> 
> On 2/10/2022 2:42 PM, Adhemerval Zanella via Libc-alpha wrote:
>>
>> On 10/02/2022 17:27, Alejandro Colomar (man-pages) wrote:
>>>> We are discussing different subjects here: what I want is to remove the
>>>> glibc *internal* optimization for bzero, which is essentially an
>>>> implementation detail.  In a first glance it would change performance,
>>>> however gcc does a hard job replacing bzero/bcmp/bcopy with their
>>>> str* counterparts, so it highly unlike that newer binaries will actually
>>>> call bzero.
>>> Okay, then yes, go ahead and remove bzero(3) from glibc if GCC will
>>> continue supporting it.  Just remember that some users keep writing and
>>> wanting to write bzero(3) instead of memset(3) in their .c files, so
>>> it's far from being dead in source code.
>> Again, I am not proposing to *remove* bzero, but rather the internal
>> optimizations that currently only adds code complexity and maintenance
>> burden.  My patchset [1] will keep the ABI as-is, the difference is
>> bcopy and bzero will use the default implementation on all architectures.
>>
>> [1] https://patchwork.sourceware.org/project/glibc/list/?series=7243
> 

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH v2] x86-64: Optimize bzero
  2022-02-11 13:01                         ` Adhemerval Zanella
@ 2022-02-12 23:46                           ` Noah Goldstein
  2022-02-14 12:07                             ` Adhemerval Zanella
  0 siblings, 1 reply; 39+ messages in thread
From: Noah Goldstein @ 2022-02-12 23:46 UTC (permalink / raw)
  To: Adhemerval Zanella; +Cc: Patrick McGehearty, GNU C Library

On Fri, Feb 11, 2022 at 7:01 AM Adhemerval Zanella via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
>
>
> On 10/02/2022 18:07, Patrick McGehearty via Libc-alpha wrote:
> > Just as another point of information, Solaris libc implemented
> > bzero as moving arguments around appropriately then jumping to
> > memset. Noone noticed enough to file a complaint. Of course,
> > short fixed-length bzero was handled with in line stores of zero
> > by the compiler. For long vector bzeroing, the overhead was
> > negligible.
> >
> > When certain Sparc hardware implementations provided faster methods
> > for zeroing a cache line at a time on cache line boundaries,
> > memset added a single test for zero ifandonlyif the length of code
> > to memset was over a threshold that seemed likely to make it
> > worthwhile to use the faster method. The principal advantage
> > of the fast zeroing operation is that it did not require data
> > to move from memory to cache before writing zeros to memory,
> > protecting cache locality in the face of large block zeroing.
> > I was responsible for much of that optimization effort.
> > Whether that optimization was really worth it is open for debate
> > for a variety of reasons that I won't go into just now.
>
> Afaik this is pretty much what optimized memset implementations
> does, if architecture allows it. For instance, aarch64 uses
> 'dc zva' for sizes larger than 256 and powerpc uses dcbz with a
> similar strategy.
>
> >
> > Apps still used bzero or memset(target,zero,length) according to
> > their preferences, but the code was unified under memset.
> >
> > I am inclined to agree with keeping bzero in the API for
> > compatibility with old code/old binaries/old programmers. :-)
>
> The main driver to remove the bzero internal implementation is just
> the *currently* gcc just do not generate bzero calls as default
> (I couldn't find a single binary that calls bzero in my system).

Does it make sense then to add '__memsetzero' so that we can have
a function optimized for setting zero?
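
Something along these lines, interface-wise (hypothetical, nothing of
this exists today; a generic fallback could simply defer to memset):

#include <string.h>

/* No value argument, so an optimized version never has to broadcast
   the fill byte; the generic version is trivially memset.  */
void *
__memsetzero (void *s, size_t n)
{
  return memset (s, 0, n);
}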

>
> So to actually see any performance advantage from the optimized
> bzero, we will need to reevaluate the gcc optimization to transform
> it on memset (which will need to be applied per-architecture base)
> which I seem highly unlikely gcc maintainer will accept it.
>
> Some time ago LLVM tried to do something similar to bcmp, but in the
> end it was not a good idea to use an already define symbol and it
> ended up with __memcmp_eq instead.
>
> >
> > Using shared memset code for the implementation of bzero
> > is worthwhile for reducing future maintenance costs.
> >
> > - Patrick McGehearty
> > former Sparc/Solaris performance tuning person
> >
> >
> >
> > On 2/10/2022 2:42 PM, Adhemerval Zanella via Libc-alpha wrote:
> >>
> >> On 10/02/2022 17:27, Alejandro Colomar (man-pages) wrote:
> >>>> We are discussing different subjects here: what I want is to remove the
> >>>> glibc *internal* optimization for bzero, which is essentially an
> >>>> implementation detail.  In a first glance it would change performance,
> >>>> however gcc does a hard job replacing bzero/bcmp/bcopy with their
> >>>> str* counterparts, so it highly unlike that newer binaries will actually
> >>>> call bzero.
> >>> Okay, then yes, go ahead and remove bzero(3) from glibc if GCC will
> >>> continue supporting it.  Just remember that some users keep writing and
> >>> wanting to write bzero(3) instead of memset(3) in their .c files, so
> >>> it's far from being dead in source code.
> >> Again, I am not proposing to *remove* bzero, but rather the internal
> >> optimizations that currently only adds code complexity and maintenance
> >> burden.  My patchset [1] will keep the ABI as-is, the difference is
> >> bcopy and bzero will use the default implementation on all architectures.
> >>
> >> [1] https://patchwork.sourceware.org/project/glibc/list/?series=7243
> >

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH v2] x86-64: Optimize bzero
  2022-02-12 23:46                           ` Noah Goldstein
@ 2022-02-14 12:07                             ` Adhemerval Zanella
  2022-02-14 12:41                               ` Noah Goldstein
  0 siblings, 1 reply; 39+ messages in thread
From: Adhemerval Zanella @ 2022-02-14 12:07 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: Patrick McGehearty, GNU C Library



On 12/02/2022 20:46, Noah Goldstein wrote:
> On Fri, Feb 11, 2022 at 7:01 AM Adhemerval Zanella via Libc-alpha
> <libc-alpha@sourceware.org> wrote:
>>
>>
>>
>> On 10/02/2022 18:07, Patrick McGehearty via Libc-alpha wrote:
>>> Just as another point of information, Solaris libc implemented
>>> bzero as moving arguments around appropriately then jumping to
>>> memset. Noone noticed enough to file a complaint. Of course,
>>> short fixed-length bzero was handled with in line stores of zero
>>> by the compiler. For long vector bzeroing, the overhead was
>>> negligible.
>>>
>>> When certain Sparc hardware implementations provided faster methods
>>> for zeroing a cache line at a time on cache line boundaries,
>>> memset added a single test for zero ifandonlyif the length of code
>>> to memset was over a threshold that seemed likely to make it
>>> worthwhile to use the faster method. The principal advantage
>>> of the fast zeroing operation is that it did not require data
>>> to move from memory to cache before writing zeros to memory,
>>> protecting cache locality in the face of large block zeroing.
>>> I was responsible for much of that optimization effort.
>>> Whether that optimization was really worth it is open for debate
>>> for a variety of reasons that I won't go into just now.
>>
>> Afaik this is pretty much what optimized memset implementations
>> does, if architecture allows it. For instance, aarch64 uses
>> 'dc zva' for sizes larger than 256 and powerpc uses dcbz with a
>> similar strategy.
>>
>>>
>>> Apps still used bzero or memset(target,zero,length) according to
>>> their preferences, but the code was unified under memset.
>>>
>>> I am inclined to agree with keeping bzero in the API for
>>> compatibility with old code/old binaries/old programmers. :-)
>>
>> The main driver to remove the bzero internal implementation is just
>> the *currently* gcc just do not generate bzero calls as default
>> (I couldn't find a single binary that calls bzero in my system).
> 
> Does it make sense then to add '__memsetzero' so that we can have
> a function optimized for setting zero?

Will it really be a huge gain, or just a micro-optimization that adds
a bunch more ifunc variants along with the maintenance cost associated
with them?

My understanding is that __memsetzero would maybe yield some gain in
the store mask generation (some architectures have a zero register or
an instruction to generate one); however, it would still need to follow
the same strategy as memset of using the architecture-specific
instructions that optimize cache utilization (dc zva, dcbz).

So it would mostly require a lot of arch-specific code to share the
memset code with __memsetzero (to avoid increasing code size), and I am
not sure this is really a gain in the long term.

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH v2] x86-64: Optimize bzero
  2022-02-14 12:07                             ` Adhemerval Zanella
@ 2022-02-14 12:41                               ` Noah Goldstein
  2022-02-14 14:07                                 ` Adhemerval Zanella
  0 siblings, 1 reply; 39+ messages in thread
From: Noah Goldstein @ 2022-02-14 12:41 UTC (permalink / raw)
  To: Adhemerval Zanella; +Cc: Patrick McGehearty, GNU C Library

On Mon, Feb 14, 2022 at 6:07 AM Adhemerval Zanella
<adhemerval.zanella@linaro.org> wrote:
>
>
>
> On 12/02/2022 20:46, Noah Goldstein wrote:
> > On Fri, Feb 11, 2022 at 7:01 AM Adhemerval Zanella via Libc-alpha
> > <libc-alpha@sourceware.org> wrote:
> >>
> >>
> >>
> >> On 10/02/2022 18:07, Patrick McGehearty via Libc-alpha wrote:
> >>> Just as another point of information, Solaris libc implemented
> >>> bzero as moving arguments around appropriately then jumping to
> >>> memset. Noone noticed enough to file a complaint. Of course,
> >>> short fixed-length bzero was handled with in line stores of zero
> >>> by the compiler. For long vector bzeroing, the overhead was
> >>> negligible.
> >>>
> >>> When certain Sparc hardware implementations provided faster methods
> >>> for zeroing a cache line at a time on cache line boundaries,
> >>> memset added a single test for zero ifandonlyif the length of code
> >>> to memset was over a threshold that seemed likely to make it
> >>> worthwhile to use the faster method. The principal advantage
> >>> of the fast zeroing operation is that it did not require data
> >>> to move from memory to cache before writing zeros to memory,
> >>> protecting cache locality in the face of large block zeroing.
> >>> I was responsible for much of that optimization effort.
> >>> Whether that optimization was really worth it is open for debate
> >>> for a variety of reasons that I won't go into just now.
> >>
> >> Afaik this is pretty much what optimized memset implementations
> >> does, if architecture allows it. For instance, aarch64 uses
> >> 'dc zva' for sizes larger than 256 and powerpc uses dcbz with a
> >> similar strategy.
> >>
> >>>
> >>> Apps still used bzero or memset(target,zero,length) according to
> >>> their preferences, but the code was unified under memset.
> >>>
> >>> I am inclined to agree with keeping bzero in the API for
> >>> compatibility with old code/old binaries/old programmers. :-)
> >>
> >> The main driver to remove the bzero internal implementation is just
> >> the *currently* gcc just do not generate bzero calls as default
> >> (I couldn't find a single binary that calls bzero in my system).
> >
> > Does it make sense then to add '__memsetzero' so that we can have
> > a function optimized for setting zero?
>
> Will it be really a huge gain instead of a microoptimization that will
> just a bunch of more ifunc variants along with the maintenance cost
> associated with this?
Is there any way it can be set up so that one C implementation can
cover all the arches that just want to leave `__memsetzero` as an alias
of `memset`?  I know the incompatible interfaces make that hard, but
would a weak static inline in string.h work?

For some of the shorter control flows (which are generally small sizes
and very hot) we saw reasonable benefits on x86_64.

The most significant was the EVEX/AVX2 [32, 64] case, where it netted
us ~25% higher throughput.  This is a pretty hot size range, so it may
be worth it.
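
Roughly where that win comes from, sketched with AVX2 intrinsics (my
illustration of the idea, not the actual glibc code path): the zero
case materializes the store vector with a zero idiom, while an
arbitrary byte needs a move plus a broadcast first.

#include <immintrin.h>   /* build with -mavx2 */

/* Store 64 zero bytes: the fill vector comes from a zero idiom (vpxor),
   which needs no input value at all.  */
void
set64_zero (void *p)
{
  __m256i v = _mm256_setzero_si256 ();
  _mm256_storeu_si256 ((__m256i *) p, v);
  _mm256_storeu_si256 ((__m256i *) ((char *) p + 32), v);
}

/* Store 64 copies of an arbitrary byte: the fill vector needs a GPR to
   vector move plus a broadcast (vmovd + vpbroadcastb) before the
   stores.  */
void
set64_value (void *p, int c)
{
  __m256i v = _mm256_set1_epi8 ((char) c);
  _mm256_storeu_si256 ((__m256i *) p, v);
  _mm256_storeu_si256 ((__m256i *) ((char *) p + 32), v);
}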

>
> My understanding is __memsetzero would maybe yield some gain in the
> store mask generation (some architecture might have a zero register
> or some instruction to generate one), however it would require to
> use the same strategy as memset to use specific architecture instruction
> that optimize cache utilization (dc zva, dcbz).
>
> So it would mostly require a lot of arch-specific code to to share
> the memset code with __memsetzero (to avoid increasing code size),
> so I am not sure if this is really a gain in the long term.

It's worth noting that between the two `memset` is the cold function
and `__memsetzero` is the hot one. Based on profiles of GCC11 and
Python3.7.7 setting zero covers 99%+ cases.

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH v2] x86-64: Optimize bzero
  2022-02-14 12:41                               ` Noah Goldstein
@ 2022-02-14 14:07                                 ` Adhemerval Zanella
  2022-02-14 15:03                                   ` H.J. Lu
  0 siblings, 1 reply; 39+ messages in thread
From: Adhemerval Zanella @ 2022-02-14 14:07 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: Patrick McGehearty, GNU C Library



On 14/02/2022 09:41, Noah Goldstein wrote:
> On Mon, Feb 14, 2022 at 6:07 AM Adhemerval Zanella
> <adhemerval.zanella@linaro.org> wrote:
>>
>>
>>
>> On 12/02/2022 20:46, Noah Goldstein wrote:
>>> On Fri, Feb 11, 2022 at 7:01 AM Adhemerval Zanella via Libc-alpha
>>> <libc-alpha@sourceware.org> wrote:
>>>>
>>>>
>>>>
>>>> On 10/02/2022 18:07, Patrick McGehearty via Libc-alpha wrote:
>>>>> Just as another point of information, Solaris libc implemented
>>>>> bzero as moving arguments around appropriately then jumping to
>>>>> memset. Noone noticed enough to file a complaint. Of course,
>>>>> short fixed-length bzero was handled with in line stores of zero
>>>>> by the compiler. For long vector bzeroing, the overhead was
>>>>> negligible.
>>>>>
>>>>> When certain Sparc hardware implementations provided faster methods
>>>>> for zeroing a cache line at a time on cache line boundaries,
>>>>> memset added a single test for zero ifandonlyif the length of code
>>>>> to memset was over a threshold that seemed likely to make it
>>>>> worthwhile to use the faster method. The principal advantage
>>>>> of the fast zeroing operation is that it did not require data
>>>>> to move from memory to cache before writing zeros to memory,
>>>>> protecting cache locality in the face of large block zeroing.
>>>>> I was responsible for much of that optimization effort.
>>>>> Whether that optimization was really worth it is open for debate
>>>>> for a variety of reasons that I won't go into just now.
>>>>
>>>> Afaik this is pretty much what optimized memset implementations
>>>> does, if architecture allows it. For instance, aarch64 uses
>>>> 'dc zva' for sizes larger than 256 and powerpc uses dcbz with a
>>>> similar strategy.
>>>>
>>>>>
>>>>> Apps still used bzero or memset(target,zero,length) according to
>>>>> their preferences, but the code was unified under memset.
>>>>>
>>>>> I am inclined to agree with keeping bzero in the API for
>>>>> compatibility with old code/old binaries/old programmers. :-)
>>>>
>>>> The main driver to remove the bzero internal implementation is just
>>>> the *currently* gcc just do not generate bzero calls as default
>>>> (I couldn't find a single binary that calls bzero in my system).
>>>
>>> Does it make sense then to add '__memsetzero' so that we can have
>>> a function optimized for setting zero?
>>
>> Will it be really a huge gain instead of a microoptimization that will
>> just a bunch of more ifunc variants along with the maintenance cost
>> associated with this?
> Is there any way it can be setup so that one C impl can cover all the
> arch that want to just leave `__memsetzero` as an alias to `memset`?
> I know they have incompatible interfaces that make it hard but would
> a weak static inline in string.h work?
> 
> For some of the shorter control flows (which are generally small sizes
> and very hot) we saw reasonable benefits on x86_64.
> 
> The most significant was the EVEX/AVX2 [32, 64] case where it net
> us ~25% throughput. This is a pretty hot set value so it may be worth it.

With different prototypes and semantics we won't be able to define an
alias.  What we used to do, but moved away from in recent versions, was
to define a static inline function that glues the two functions
together when optimization is enabled.
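
For reference, such glue could look roughly like this (a hypothetical
sketch, not a quote of any past glibc header):

#include <string.h>

/* When optimizing, give the compiler an inline bzero that is just a
   memset call, so no separately optimized bzero symbol is needed.  */
#if defined __OPTIMIZE__
static __inline__ void
bzero_inline (void *s, size_t n)
{
  memset (s, 0, n);
}
# define bzero(s, n) bzero_inline ((s), (n))
#endif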

> 
>>
>> My understanding is __memsetzero would maybe yield some gain in the
>> store mask generation (some architecture might have a zero register
>> or some instruction to generate one), however it would require to
>> use the same strategy as memset to use specific architecture instruction
>> that optimize cache utilization (dc zva, dcbz).
>>
>> So it would mostly require a lot of arch-specific code to to share
>> the memset code with __memsetzero (to avoid increasing code size),
>> so I am not sure if this is really a gain in the long term.
> 
> It's worth noting that between the two `memset` is the cold function
> and `__memsetzero` is the hot one. Based on profiles of GCC11 and
> Python3.7.7 setting zero covers 99%+ cases.

This is very workload specific, and I think with more advanced compiler
optimizations like LTO and PGO such calls could most likely be
optimized by the compiler itself (either by inlining or by creating a
synthetic function to handle it).

What I worry about is that such symbols might end up like the ARM AEABI
memcpy variants, which were added as a way to optimize when the
alignment is known to be a multiple of the word size, but ended up not
being implemented and also not being generated by the compiler (at
least not gcc).

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH v2] x86-64: Optimize bzero
  2022-02-14 14:07                                 ` Adhemerval Zanella
@ 2022-02-14 15:03                                   ` H.J. Lu
  2022-05-04  6:35                                     ` Sunil Pandey
  0 siblings, 1 reply; 39+ messages in thread
From: H.J. Lu @ 2022-02-14 15:03 UTC (permalink / raw)
  To: Adhemerval Zanella; +Cc: Noah Goldstein, GNU C Library

On Mon, Feb 14, 2022 at 6:07 AM Adhemerval Zanella via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
>
>
> On 14/02/2022 09:41, Noah Goldstein wrote:
> > On Mon, Feb 14, 2022 at 6:07 AM Adhemerval Zanella
> > <adhemerval.zanella@linaro.org> wrote:
> >>
> >>
> >>
> >> On 12/02/2022 20:46, Noah Goldstein wrote:
> >>> On Fri, Feb 11, 2022 at 7:01 AM Adhemerval Zanella via Libc-alpha
> >>> <libc-alpha@sourceware.org> wrote:
> >>>>
> >>>>
> >>>>
> >>>> On 10/02/2022 18:07, Patrick McGehearty via Libc-alpha wrote:
> >>>>> Just as another point of information, Solaris libc implemented
> >>>>> bzero as moving arguments around appropriately then jumping to
> >>>>> memset. Noone noticed enough to file a complaint. Of course,
> >>>>> short fixed-length bzero was handled with in line stores of zero
> >>>>> by the compiler. For long vector bzeroing, the overhead was
> >>>>> negligible.
> >>>>>
> >>>>> When certain Sparc hardware implementations provided faster methods
> >>>>> for zeroing a cache line at a time on cache line boundaries,
> >>>>> memset added a single test for zero ifandonlyif the length of code
> >>>>> to memset was over a threshold that seemed likely to make it
> >>>>> worthwhile to use the faster method. The principal advantage
> >>>>> of the fast zeroing operation is that it did not require data
> >>>>> to move from memory to cache before writing zeros to memory,
> >>>>> protecting cache locality in the face of large block zeroing.
> >>>>> I was responsible for much of that optimization effort.
> >>>>> Whether that optimization was really worth it is open for debate
> >>>>> for a variety of reasons that I won't go into just now.
> >>>>
> >>>> Afaik this is pretty much what optimized memset implementations
> >>>> does, if architecture allows it. For instance, aarch64 uses
> >>>> 'dc zva' for sizes larger than 256 and powerpc uses dcbz with a
> >>>> similar strategy.
> >>>>
> >>>>>
> >>>>> Apps still used bzero or memset(target,zero,length) according to
> >>>>> their preferences, but the code was unified under memset.
> >>>>>
> >>>>> I am inclined to agree with keeping bzero in the API for
> >>>>> compatibility with old code/old binaries/old programmers. :-)
> >>>>
> >>>> The main driver to remove the bzero internal implementation is just
> >>>> the *currently* gcc just do not generate bzero calls as default
> >>>> (I couldn't find a single binary that calls bzero in my system).
> >>>
> >>> Does it make sense then to add '__memsetzero' so that we can have
> >>> a function optimized for setting zero?
> >>
> >> Will it be really a huge gain instead of a microoptimization that will
> >> just a bunch of more ifunc variants along with the maintenance cost
> >> associated with this?
> > Is there any way it can be setup so that one C impl can cover all the
> > arch that want to just leave `__memsetzero` as an alias to `memset`?
> > I know they have incompatible interfaces that make it hard but would
> > a weak static inline in string.h work?
> >
> > For some of the shorter control flows (which are generally small sizes
> > and very hot) we saw reasonable benefits on x86_64.
> >
> > The most significant was the EVEX/AVX2 [32, 64] case where it net
> > us ~25% throughput. This is a pretty hot set value so it may be worth it.
>
> With different prototypes and semantics we won't be able to define an
> alias. What we used to do, but we move away in recent version, was to
> define static inline function that glue the two function if optimization
> is set.

I have

/* NB: bzero returns void and __memsetzero returns void *.  */
asm (".weak bzero");
asm ("bzero = __memsetzero");
asm (".global __bzero");
asm ("__bzero = __memsetzero");

> >
> >>
> >> My understanding is __memsetzero would maybe yield some gain in the
> >> store mask generation (some architecture might have a zero register
> >> or some instruction to generate one), however it would require to
> >> use the same strategy as memset to use specific architecture instruction
> >> that optimize cache utilization (dc zva, dcbz).
> >>
> >> So it would mostly require a lot of arch-specific code to to share
> >> the memset code with __memsetzero (to avoid increasing code size),
> >> so I am not sure if this is really a gain in the long term.
> >
> > It's worth noting that between the two `memset` is the cold function
> > and `__memsetzero` is the hot one. Based on profiles of GCC11 and
> > Python3.7.7 setting zero covers 99%+ cases.
>
> This is very workload specific and I think with more advance compiler
> optimization like LTO and PGO such calls could most likely being
> optimized by the compiler itself (either by inline or by create a
> synthetic function to handle it).
>
> What I worried is such symbols might ended up as the AEBI memcpy variants
> that was added as way to optimize when alignment is know to be multiple
> of words, but it ended up not being implemented and also not being generated
> by the compiler (at least not gcc).



-- 
H.J.

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH v2] x86-64: Optimize bzero
  2022-02-14 15:03                                   ` H.J. Lu
@ 2022-05-04  6:35                                     ` Sunil Pandey
  2022-05-04 12:52                                       ` Adhemerval Zanella
  0 siblings, 1 reply; 39+ messages in thread
From: Sunil Pandey @ 2022-05-04  6:35 UTC (permalink / raw)
  To: H.J. Lu, Libc-stable Mailing List; +Cc: Adhemerval Zanella, GNU C Library

On Mon, Feb 14, 2022 at 7:04 AM H.J. Lu via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> On Mon, Feb 14, 2022 at 6:07 AM Adhemerval Zanella via Libc-alpha
> <libc-alpha@sourceware.org> wrote:
> >
> >
> >
> > On 14/02/2022 09:41, Noah Goldstein wrote:
> > > On Mon, Feb 14, 2022 at 6:07 AM Adhemerval Zanella
> > > <adhemerval.zanella@linaro.org> wrote:
> > >>
> > >>
> > >>
> > >> On 12/02/2022 20:46, Noah Goldstein wrote:
> > >>> On Fri, Feb 11, 2022 at 7:01 AM Adhemerval Zanella via Libc-alpha
> > >>> <libc-alpha@sourceware.org> wrote:
> > >>>>
> > >>>>
> > >>>>
> > >>>> On 10/02/2022 18:07, Patrick McGehearty via Libc-alpha wrote:
> > >>>>> Just as another point of information, Solaris libc implemented
> > >>>>> bzero as moving arguments around appropriately then jumping to
> > >>>>> memset. Noone noticed enough to file a complaint. Of course,
> > >>>>> short fixed-length bzero was handled with in line stores of zero
> > >>>>> by the compiler. For long vector bzeroing, the overhead was
> > >>>>> negligible.
> > >>>>>
> > >>>>> When certain Sparc hardware implementations provided faster methods
> > >>>>> for zeroing a cache line at a time on cache line boundaries,
> > >>>>> memset added a single test for zero ifandonlyif the length of code
> > >>>>> to memset was over a threshold that seemed likely to make it
> > >>>>> worthwhile to use the faster method. The principal advantage
> > >>>>> of the fast zeroing operation is that it did not require data
> > >>>>> to move from memory to cache before writing zeros to memory,
> > >>>>> protecting cache locality in the face of large block zeroing.
> > >>>>> I was responsible for much of that optimization effort.
> > >>>>> Whether that optimization was really worth it is open for debate
> > >>>>> for a variety of reasons that I won't go into just now.
> > >>>>
> > >>>> Afaik this is pretty much what optimized memset implementations
> > >>>> does, if architecture allows it. For instance, aarch64 uses
> > >>>> 'dc zva' for sizes larger than 256 and powerpc uses dcbz with a
> > >>>> similar strategy.
> > >>>>
> > >>>>>
> > >>>>> Apps still used bzero or memset(target,zero,length) according to
> > >>>>> their preferences, but the code was unified under memset.
> > >>>>>
> > >>>>> I am inclined to agree with keeping bzero in the API for
> > >>>>> compatibility with old code/old binaries/old programmers. :-)
> > >>>>
> > >>>> The main driver to remove the bzero internal implementation is just
> > >>>> the *currently* gcc just do not generate bzero calls as default
> > >>>> (I couldn't find a single binary that calls bzero in my system).
> > >>>
> > >>> Does it make sense then to add '__memsetzero' so that we can have
> > >>> a function optimized for setting zero?
> > >>
> > >> Will it be really a huge gain instead of a microoptimization that will
> > >> just a bunch of more ifunc variants along with the maintenance cost
> > >> associated with this?
> > > Is there any way it can be setup so that one C impl can cover all the
> > > arch that want to just leave `__memsetzero` as an alias to `memset`?
> > > I know they have incompatible interfaces that make it hard but would
> > > a weak static inline in string.h work?
> > >
> > > For some of the shorter control flows (which are generally small sizes
> > > and very hot) we saw reasonable benefits on x86_64.
> > >
> > > The most significant was the EVEX/AVX2 [32, 64] case where it net
> > > us ~25% throughput. This is a pretty hot set value so it may be worth it.
> >
> > With different prototypes and semantics we won't be able to define an
> > alias. What we used to do, but we move away in recent version, was to
> > define static inline function that glue the two function if optimization
> > is set.
>
> I have
>
> /* NB: bzero returns void and __memsetzero returns void *.  */
> asm (".weak bzero");
> asm ("bzero = __memsetzero");
> asm (".global __bzero");
> asm ("__bzero = __memsetzero");
>
> > >
> > >>
> > >> My understanding is __memsetzero would maybe yield some gain in the
> > >> store mask generation (some architecture might have a zero register
> > >> or some instruction to generate one), however it would require to
> > >> use the same strategy as memset to use specific architecture instruction
> > >> that optimize cache utilization (dc zva, dcbz).
> > >>
> > >> So it would mostly require a lot of arch-specific code to to share
> > >> the memset code with __memsetzero (to avoid increasing code size),
> > >> so I am not sure if this is really a gain in the long term.
> > >
> > > It's worth noting that between the two `memset` is the cold function
> > > and `__memsetzero` is the hot one. Based on profiles of GCC11 and
> > > Python3.7.7 setting zero covers 99%+ cases.
> >
> > This is very workload specific and I think with more advance compiler
> > optimization like LTO and PGO such calls could most likely being
> > optimized by the compiler itself (either by inline or by create a
> > synthetic function to handle it).
> >
> > What I worried is such symbols might ended up as the AEBI memcpy variants
> > that was added as way to optimize when alignment is know to be multiple
> > of words, but it ended up not being implemented and also not being generated
> > by the compiler (at least not gcc).
>
>
>
> --
> H.J.

I would like to backport this patch to release branches.
Any comments or objections?

--Sunil

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH v2] x86-64: Optimize bzero
  2022-05-04  6:35                                     ` Sunil Pandey
@ 2022-05-04 12:52                                       ` Adhemerval Zanella
  2022-05-04 14:50                                         ` H.J. Lu
  0 siblings, 1 reply; 39+ messages in thread
From: Adhemerval Zanella @ 2022-05-04 12:52 UTC (permalink / raw)
  To: Sunil Pandey, H.J. Lu, Libc-stable Mailing List; +Cc: GNU C Library



On 04/05/2022 03:35, Sunil Pandey wrote:
> On Mon, Feb 14, 2022 at 7:04 AM H.J. Lu via Libc-alpha
> <libc-alpha@sourceware.org> wrote:
>>
>> On Mon, Feb 14, 2022 at 6:07 AM Adhemerval Zanella via Libc-alpha
>> <libc-alpha@sourceware.org> wrote:
>>>
>>>
>>>
>>> On 14/02/2022 09:41, Noah Goldstein wrote:
>>>> On Mon, Feb 14, 2022 at 6:07 AM Adhemerval Zanella
>>>> <adhemerval.zanella@linaro.org> wrote:
>>>>>
>>>>>
>>>>>
>>>>> On 12/02/2022 20:46, Noah Goldstein wrote:
>>>>>> On Fri, Feb 11, 2022 at 7:01 AM Adhemerval Zanella via Libc-alpha
>>>>>> <libc-alpha@sourceware.org> wrote:
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>> On 10/02/2022 18:07, Patrick McGehearty via Libc-alpha wrote:
>>>>>>>> Just as another point of information, Solaris libc implemented
>>>>>>>> bzero as moving arguments around appropriately then jumping to
>>>>>>>> memset. Noone noticed enough to file a complaint. Of course,
>>>>>>>> short fixed-length bzero was handled with in line stores of zero
>>>>>>>> by the compiler. For long vector bzeroing, the overhead was
>>>>>>>> negligible.
>>>>>>>>
>>>>>>>> When certain Sparc hardware implementations provided faster methods
>>>>>>>> for zeroing a cache line at a time on cache line boundaries,
>>>>>>>> memset added a single test for zero ifandonlyif the length of code
>>>>>>>> to memset was over a threshold that seemed likely to make it
>>>>>>>> worthwhile to use the faster method. The principal advantage
>>>>>>>> of the fast zeroing operation is that it did not require data
>>>>>>>> to move from memory to cache before writing zeros to memory,
>>>>>>>> protecting cache locality in the face of large block zeroing.
>>>>>>>> I was responsible for much of that optimization effort.
>>>>>>>> Whether that optimization was really worth it is open for debate
>>>>>>>> for a variety of reasons that I won't go into just now.
>>>>>>>
>>>>>>> Afaik this is pretty much what optimized memset implementations
>>>>>>> does, if architecture allows it. For instance, aarch64 uses
>>>>>>> 'dc zva' for sizes larger than 256 and powerpc uses dcbz with a
>>>>>>> similar strategy.
>>>>>>>
>>>>>>>>
>>>>>>>> Apps still used bzero or memset(target,zero,length) according to
>>>>>>>> their preferences, but the code was unified under memset.
>>>>>>>>
>>>>>>>> I am inclined to agree with keeping bzero in the API for
>>>>>>>> compatibility with old code/old binaries/old programmers. :-)
>>>>>>>
>>>>>>> The main driver to remove the bzero internal implementation is just
>>>>>>> the *currently* gcc just do not generate bzero calls as default
>>>>>>> (I couldn't find a single binary that calls bzero in my system).
>>>>>>
>>>>>> Does it make sense then to add '__memsetzero' so that we can have
>>>>>> a function optimized for setting zero?
>>>>>
>>>>> Will it be really a huge gain instead of a microoptimization that will
>>>>> just a bunch of more ifunc variants along with the maintenance cost
>>>>> associated with this?
>>>> Is there any way it can be setup so that one C impl can cover all the
>>>> arch that want to just leave `__memsetzero` as an alias to `memset`?
>>>> I know they have incompatible interfaces that make it hard but would
>>>> a weak static inline in string.h work?
>>>>
>>>> For some of the shorter control flows (which are generally small sizes
>>>> and very hot) we saw reasonable benefits on x86_64.
>>>>
>>>> The most significant was the EVEX/AVX2 [32, 64] case where it net
>>>> us ~25% throughput. This is a pretty hot set value so it may be worth it.
>>>
>>> With different prototypes and semantics we won't be able to define an
>>> alias. What we used to do, but we move away in recent version, was to
>>> define static inline function that glue the two function if optimization
>>> is set.
>>
>> I have
>>
>> /* NB: bzero returns void and __memsetzero returns void *.  */
>> asm (".weak bzero");
>> asm ("bzero = __memsetzero");
>> asm (".global __bzero");
>> asm ("__bzero = __memsetzero");
>>
>>>>
>>>>>
>>>>> My understanding is __memsetzero would maybe yield some gain in the
>>>>> store mask generation (some architecture might have a zero register
>>>>> or some instruction to generate one), however it would require to
>>>>> use the same strategy as memset to use specific architecture instruction
>>>>> that optimize cache utilization (dc zva, dcbz).
>>>>>
>>>>> So it would mostly require a lot of arch-specific code to to share
>>>>> the memset code with __memsetzero (to avoid increasing code size),
>>>>> so I am not sure if this is really a gain in the long term.
>>>>
>>>> It's worth noting that between the two `memset` is the cold function
>>>> and `__memsetzero` is the hot one. Based on profiles of GCC11 and
>>>> Python3.7.7 setting zero covers 99%+ cases.
>>>
>>> This is very workload specific and I think with more advance compiler
>>> optimization like LTO and PGO such calls could most likely being
>>> optimized by the compiler itself (either by inline or by create a
>>> synthetic function to handle it).
>>>
>>> What I worried is such symbols might ended up as the AEBI memcpy variants
>>> that was added as way to optimize when alignment is know to be multiple
>>> of words, but it ended up not being implemented and also not being generated
>>> by the compiler (at least not gcc).
>>
>>
>>
>> --
>> H.J.
> 
> I would like to backport this patch to release branches.
> Any comments or objections?

Nothing really against it, but as per the previous discussion we had on
this mailing list, optimizing bzero does not yield much gain compared to
memset (the compiler won't generate the libcall for loop transformations,
among other shortcomings).  My idea is to follow the other
architectures and just remove the x86_64 bzero optimizations.
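
For example (my illustration of that point, assuming gcc's loop
distribution / idiom recognition): a zeroing loop like the one below is
normally replaced by a memset libcall, and no option makes the compiler
emit a bzero call instead.

#include <stddef.h>

/* With gcc -O3 (-ftree-loop-distribute-patterns) this loop is usually
   recognized as a block clear and compiled into a call to memset.  */
void
clear_buffer (unsigned char *p, size_t n)
{
  for (size_t i = 0; i < n; i++)
    p[i] = 0;
}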

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH v2] x86-64: Optimize bzero
  2022-05-04 12:52                                       ` Adhemerval Zanella
@ 2022-05-04 14:50                                         ` H.J. Lu
  2022-05-04 14:54                                           ` Adhemerval Zanella
  0 siblings, 1 reply; 39+ messages in thread
From: H.J. Lu @ 2022-05-04 14:50 UTC (permalink / raw)
  To: Adhemerval Zanella; +Cc: Sunil Pandey, Libc-stable Mailing List, GNU C Library

On Wed, May 4, 2022 at 5:52 AM Adhemerval Zanella
<adhemerval.zanella@linaro.org> wrote:
>
>
>
> On 04/05/2022 03:35, Sunil Pandey wrote:
> > On Mon, Feb 14, 2022 at 7:04 AM H.J. Lu via Libc-alpha
> > <libc-alpha@sourceware.org> wrote:
> >>
> >> On Mon, Feb 14, 2022 at 6:07 AM Adhemerval Zanella via Libc-alpha
> >> <libc-alpha@sourceware.org> wrote:
> >>>
> >>>
> >>>
> >>> On 14/02/2022 09:41, Noah Goldstein wrote:
> >>>> On Mon, Feb 14, 2022 at 6:07 AM Adhemerval Zanella
> >>>> <adhemerval.zanella@linaro.org> wrote:
> >>>>>
> >>>>>
> >>>>>
> >>>>> On 12/02/2022 20:46, Noah Goldstein wrote:
> >>>>>> On Fri, Feb 11, 2022 at 7:01 AM Adhemerval Zanella via Libc-alpha
> >>>>>> <libc-alpha@sourceware.org> wrote:
> >>>>>>>
> >>>>>>>
> >>>>>>>
> >>>>>>> On 10/02/2022 18:07, Patrick McGehearty via Libc-alpha wrote:
> >>>>>>>> Just as another point of information, Solaris libc implemented
> >>>>>>>> bzero as moving arguments around appropriately then jumping to
> >>>>>>>> memset. Noone noticed enough to file a complaint. Of course,
> >>>>>>>> short fixed-length bzero was handled with in line stores of zero
> >>>>>>>> by the compiler. For long vector bzeroing, the overhead was
> >>>>>>>> negligible.
> >>>>>>>>
> >>>>>>>> When certain Sparc hardware implementations provided faster methods
> >>>>>>>> for zeroing a cache line at a time on cache line boundaries,
> >>>>>>>> memset added a single test for zero ifandonlyif the length of code
> >>>>>>>> to memset was over a threshold that seemed likely to make it
> >>>>>>>> worthwhile to use the faster method. The principal advantage
> >>>>>>>> of the fast zeroing operation is that it did not require data
> >>>>>>>> to move from memory to cache before writing zeros to memory,
> >>>>>>>> protecting cache locality in the face of large block zeroing.
> >>>>>>>> I was responsible for much of that optimization effort.
> >>>>>>>> Whether that optimization was really worth it is open for debate
> >>>>>>>> for a variety of reasons that I won't go into just now.
> >>>>>>>
> >>>>>>> Afaik this is pretty much what optimized memset implementations
> >>>>>>> does, if architecture allows it. For instance, aarch64 uses
> >>>>>>> 'dc zva' for sizes larger than 256 and powerpc uses dcbz with a
> >>>>>>> similar strategy.
> >>>>>>>
> >>>>>>>>
> >>>>>>>> Apps still used bzero or memset(target,zero,length) according to
> >>>>>>>> their preferences, but the code was unified under memset.
> >>>>>>>>
> >>>>>>>> I am inclined to agree with keeping bzero in the API for
> >>>>>>>> compatibility with old code/old binaries/old programmers. :-)
> >>>>>>>
> >>>>>>> The main driver to remove the bzero internal implementation is just
> >>>>>>> the *currently* gcc just do not generate bzero calls as default
> >>>>>>> (I couldn't find a single binary that calls bzero in my system).
> >>>>>>
> >>>>>> Does it make sense then to add '__memsetzero' so that we can have
> >>>>>> a function optimized for setting zero?
> >>>>>
> >>>>> Will it be really a huge gain instead of a microoptimization that will
> >>>>> just a bunch of more ifunc variants along with the maintenance cost
> >>>>> associated with this?
> >>>> Is there any way it can be setup so that one C impl can cover all the
> >>>> arch that want to just leave `__memsetzero` as an alias to `memset`?
> >>>> I know they have incompatible interfaces that make it hard but would
> >>>> a weak static inline in string.h work?
> >>>>
> >>>> For some of the shorter control flows (which are generally small sizes
> >>>> and very hot) we saw reasonable benefits on x86_64.
> >>>>
> >>>> The most significant was the EVEX/AVX2 [32, 64] case where it net
> >>>> us ~25% throughput. This is a pretty hot set value so it may be worth it.
> >>>
> >>> With different prototypes and semantics we won't be able to define an
> >>> alias. What we used to do, but we move away in recent version, was to
> >>> define static inline function that glue the two function if optimization
> >>> is set.
> >>
> >> I have
> >>
> >> /* NB: bzero returns void and __memsetzero returns void *.  */
> >> asm (".weak bzero");
> >> asm ("bzero = __memsetzero");
> >> asm (".global __bzero");
> >> asm ("__bzero = __memsetzero");
> >>
> >>>>
> >>>>>
> >>>>> My understanding is __memsetzero would maybe yield some gain in the
> >>>>> store mask generation (some architecture might have a zero register
> >>>>> or some instruction to generate one), however it would require to
> >>>>> use the same strategy as memset to use specific architecture instruction
> >>>>> that optimize cache utilization (dc zva, dcbz).
> >>>>>
> >>>>> So it would mostly require a lot of arch-specific code to to share
> >>>>> the memset code with __memsetzero (to avoid increasing code size),
> >>>>> so I am not sure if this is really a gain in the long term.
> >>>>
> >>>> It's worth noting that between the two `memset` is the cold function
> >>>> and `__memsetzero` is the hot one. Based on profiles of GCC11 and
> >>>> Python3.7.7 setting zero covers 99%+ cases.
> >>>
> >>> This is very workload specific and I think with more advance compiler
> >>> optimization like LTO and PGO such calls could most likely being
> >>> optimized by the compiler itself (either by inline or by create a
> >>> synthetic function to handle it).
> >>>
> >>> What I worried is such symbols might ended up as the AEBI memcpy variants
> >>> that was added as way to optimize when alignment is know to be multiple
> >>> of words, but it ended up not being implemented and also not being generated
> >>> by the compiler (at least not gcc).
> >>
> >>
> >>
> >> --
> >> H.J.
> >
> > I would like to backport this patch to release branches.
> > Any comments or objections?
>
> Nothing really against, but as previous discussion we had on this maillist optimizing
> bzero does not yield much gain compared to memset (compiler won't generate libcall
> for loop transformation, among other shortcomings). My idea is to follow other
> architecture and just remove all x86_64 optimizations.

We'd like to reduce the differences between master and release branches to help
future backports to release branches.

-- 
H.J.

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH v2] x86-64: Optimize bzero
  2022-05-04 14:50                                         ` H.J. Lu
@ 2022-05-04 14:54                                           ` Adhemerval Zanella
  0 siblings, 0 replies; 39+ messages in thread
From: Adhemerval Zanella @ 2022-05-04 14:54 UTC (permalink / raw)
  To: H.J. Lu; +Cc: Sunil Pandey, Libc-stable Mailing List, GNU C Library



On 04/05/2022 11:50, H.J. Lu wrote:
> On Wed, May 4, 2022 at 5:52 AM Adhemerval Zanella
> <adhemerval.zanella@linaro.org> wrote:
>>
>>
>>
>> On 04/05/2022 03:35, Sunil Pandey wrote:
>>> On Mon, Feb 14, 2022 at 7:04 AM H.J. Lu via Libc-alpha
>>> <libc-alpha@sourceware.org> wrote:
>>>>
>>>> On Mon, Feb 14, 2022 at 6:07 AM Adhemerval Zanella via Libc-alpha
>>>> <libc-alpha@sourceware.org> wrote:
>>>>>
>>>>>
>>>>>
>>>>> On 14/02/2022 09:41, Noah Goldstein wrote:
>>>>>> On Mon, Feb 14, 2022 at 6:07 AM Adhemerval Zanella
>>>>>> <adhemerval.zanella@linaro.org> wrote:
>>>>>>>
>>>>>>>
>>>>>>>
>>>>>>> On 12/02/2022 20:46, Noah Goldstein wrote:
>>>>>>>> On Fri, Feb 11, 2022 at 7:01 AM Adhemerval Zanella via Libc-alpha
>>>>>>>> <libc-alpha@sourceware.org> wrote:
>>>>>>>>>
>>>>>>>>>
>>>>>>>>>
>>>>>>>>> On 10/02/2022 18:07, Patrick McGehearty via Libc-alpha wrote:
>>>>>>>>>> Just as another point of information, Solaris libc implemented
>>>>>>>>>> bzero as moving arguments around appropriately then jumping to
>>>>>>>>>> memset. Noone noticed enough to file a complaint. Of course,
>>>>>>>>>> short fixed-length bzero was handled with in line stores of zero
>>>>>>>>>> by the compiler. For long vector bzeroing, the overhead was
>>>>>>>>>> negligible.
>>>>>>>>>>
>>>>>>>>>> When certain Sparc hardware implementations provided faster methods
>>>>>>>>>> for zeroing a cache line at a time on cache line boundaries,
>>>>>>>>>> memset added a single test for zero ifandonlyif the length of code
>>>>>>>>>> to memset was over a threshold that seemed likely to make it
>>>>>>>>>> worthwhile to use the faster method. The principal advantage
>>>>>>>>>> of the fast zeroing operation is that it did not require data
>>>>>>>>>> to move from memory to cache before writing zeros to memory,
>>>>>>>>>> protecting cache locality in the face of large block zeroing.
>>>>>>>>>> I was responsible for much of that optimization effort.
>>>>>>>>>> Whether that optimization was really worth it is open for debate
>>>>>>>>>> for a variety of reasons that I won't go into just now.
>>>>>>>>>
>>>>>>>>> Afaik this is pretty much what optimized memset implementations
>>>>>>>>> does, if architecture allows it. For instance, aarch64 uses
>>>>>>>>> 'dc zva' for sizes larger than 256 and powerpc uses dcbz with a
>>>>>>>>> similar strategy.
>>>>>>>>>
>>>>>>>>>>
>>>>>>>>>> Apps still used bzero or memset(target,zero,length) according to
>>>>>>>>>> their preferences, but the code was unified under memset.
>>>>>>>>>>
>>>>>>>>>> I am inclined to agree with keeping bzero in the API for
>>>>>>>>>> compatibility with old code/old binaries/old programmers. :-)
>>>>>>>>>
>>>>>>>>> The main driver to remove the bzero internal implementation is just
>>>>>>>>> the *currently* gcc just do not generate bzero calls as default
>>>>>>>>> (I couldn't find a single binary that calls bzero in my system).
>>>>>>>>
>>>>>>>> Does it make sense then to add '__memsetzero' so that we can have
>>>>>>>> a function optimized for setting zero?
>>>>>>>
>>>>>>> Will it be really a huge gain instead of a microoptimization that will
>>>>>>> just a bunch of more ifunc variants along with the maintenance cost
>>>>>>> associated with this?
>>>>>> Is there any way it can be setup so that one C impl can cover all the
>>>>>> arch that want to just leave `__memsetzero` as an alias to `memset`?
>>>>>> I know they have incompatible interfaces that make it hard but would
>>>>>> a weak static inline in string.h work?
>>>>>>
>>>>>> For some of the shorter control flows (which are generally small sizes
>>>>>> and very hot) we saw reasonable benefits on x86_64.
>>>>>>
>>>>>> The most significant was the EVEX/AVX2 [32, 64] case where it net
>>>>>> us ~25% throughput. This is a pretty hot set value so it may be worth it.
>>>>>
>>>>> With different prototypes and semantics we won't be able to define an
>>>>> alias. What we used to do, but we move away in recent version, was to
>>>>> define static inline function that glue the two function if optimization
>>>>> is set.
>>>>
>>>> I have
>>>>
>>>> /* NB: bzero returns void and __memsetzero returns void *.  */
>>>> asm (".weak bzero");
>>>> asm ("bzero = __memsetzero");
>>>> asm (".global __bzero");
>>>> asm ("__bzero = __memsetzero");
>>>>
>>>>>>
>>>>>>>
>>>>>>> My understanding is __memsetzero would maybe yield some gain in the
>>>>>>> store mask generation (some architecture might have a zero register
>>>>>>> or some instruction to generate one), however it would require to
>>>>>>> use the same strategy as memset to use specific architecture instruction
>>>>>>> that optimize cache utilization (dc zva, dcbz).
>>>>>>>
>>>>>>> So it would mostly require a lot of arch-specific code to to share
>>>>>>> the memset code with __memsetzero (to avoid increasing code size),
>>>>>>> so I am not sure if this is really a gain in the long term.
>>>>>>
>>>>>> It's worth noting that between the two `memset` is the cold function
>>>>>> and `__memsetzero` is the hot one. Based on profiles of GCC11 and
>>>>>> Python3.7.7 setting zero covers 99%+ cases.
>>>>>
>>>>> This is very workload specific and I think with more advance compiler
>>>>> optimization like LTO and PGO such calls could most likely being
>>>>> optimized by the compiler itself (either by inline or by create a
>>>>> synthetic function to handle it).
>>>>>
>>>>> What I worried is such symbols might ended up as the AEBI memcpy variants
>>>>> that was added as way to optimize when alignment is know to be multiple
>>>>> of words, but it ended up not being implemented and also not being generated
>>>>> by the compiler (at least not gcc).
>>>>
>>>>
>>>>
>>>> --
>>>> H.J.
>>>
>>> I would like to backport this patch to release branches.
>>> Any comments or objections?
>>
>> Nothing really against, but as previous discussion we had on this maillist optimizing
>> bzero does not yield much gain compared to memset (compiler won't generate libcall
>> for loop transformation, among other shortcomings). My idea is to follow other
>> architecture and just remove all x86_64 optimizations.
> 
> We'd like to reduce the differences between master and release branches to help
> future backports to release branches.
> 

Ok, fair enough. 

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH v2] x86-64: Optimize bzero
  2022-02-24 23:21       ` Noah Goldstein
@ 2022-02-25 17:37         ` Noah Goldstein
  0 siblings, 0 replies; 39+ messages in thread
From: Noah Goldstein @ 2022-02-25 17:37 UTC (permalink / raw)
  To: Wilco Dijkstra; +Cc: GNU C Library, Adhemerval Zanella, H.J. Lu

On Thu, Feb 24, 2022 at 5:21 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Thu, Feb 24, 2022 at 4:58 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > On Thu, Feb 24, 2022 at 7:16 AM Wilco Dijkstra <Wilco.Dijkstra@arm.com> wrote:
> > >
> > > Hi Noah,
> > >
> > > > The performance improvement for the build/link step for varying amount of
> > > > small functions per file was consistently in the ~.8% range. Not mind blowing
> > > > but I believe its a genuine improvement.
> > >
> > > I don't see how it's possible to get anywhere near 0.8%. I tried compiling a file with
> > > 10000 empty functions, and the latest __memset_exex_unaligned_erms takes about
> > > 1.16% of total time.
> >
> > Smart sanity check I'll start using.
> >
> > What method are you using for getting total function call overhead?
> >
> > Using `perf record` + `pref report` and see a fair amount of variance but much
> > higher memset overheader (counting `_*unaligned_erms` and `*_unaligned`)
> > in `cc1` and `as`.
> >
> > From average of 3 runs compiling file with 1/10/100/1000 functions I get:
> >
> > 1: 4.04%
> > 10: 3.94%
> > 100: 2.86%
> > 1000: 2.68%
> >
> > So its slightly less insane, arguing for the following speedups:
> >
> > 1: ~15%
> > 10: ~15%
> > 100: ~25% <--- this makes little to no sense
> > 1000: ~15%
> >
> > personally agree with you that those numbers seem to high though.
> >
> > In the best case micro-benchmark that stressed the p5 bottleneck
> > this is about what we see.
> >
> > >
> > > There are 81.5 million calls to memset in 48 billion cycles for this benchmark. That
> > > means 6.8 cycles per memset call on average. A 0.8% speedup would require making
> > > each memset 4.7 cycles faster, and that's not possible with bzero.
> > >
> > > To verify whether vpbroadcastb is a bottleneck I repeated it 16 times. This increased
> > > the memset percentage to 1.86%, however the total cycles didn't change measurably.
> > >
> > > I'm not sure how you're measuring this, but it's clear what you're seeing is not a
> > > speedup from bzero.
>
> Bah, youre right. The the non-memsetzero GCC  was choosing my systems
> memset implementation (2.31 + avx2).
>
> Sorry, I'll rerun tonight and post an update.

After rerunning with the same `memset` version for both the
`__memsetzero` and `memset` runs, I see no measurable performance
improvement.

> > >
> > > Cheers,
> > > Wilco

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH v2] x86-64: Optimize bzero
  2022-02-25 13:51       ` Wilco Dijkstra
@ 2022-02-25 17:35         ` Noah Goldstein
  0 siblings, 0 replies; 39+ messages in thread
From: Noah Goldstein @ 2022-02-25 17:35 UTC (permalink / raw)
  To: Wilco Dijkstra; +Cc: GNU C Library, Adhemerval Zanella, H.J. Lu

On Fri, Feb 25, 2022 at 7:51 AM Wilco Dijkstra <Wilco.Dijkstra@arm.com> wrote:
>
> Hi Noah,
>
> > What method are you using for getting total function call overhead?
> >
> > Using `perf record` + `pref report` and see a fair amount of variance but much
> > higher memset overheader (counting `_*unaligned_erms` and `*_unaligned`)
> > in `cc1` and `as`.
> >
> > From average of 3 runs compiling file with 1/10/100/1000 functions I get:
>
> For accurate profiles you need to run for longer and increase the sampling rate.

Ran with `-F max`.  I would say the error bar is still ~.5% (which is
[12, 25]% of the overall overhead, so not insignificant).

> The 1000 function version takes less than a second to compile, which is why I
> went for 10000 functions at 20000 samples per second.
>
> I used LD_PRELOAD to use the latest version of __memset_exex_unaligned_erms.
> You can also trivially collect statistics of memset this way. Eg. in this benchmark
> 75% of memset is <= 32 bytes and average memset size 44 bytes, so it has a far
> higher percentage of really small memsets compared to normal compilation.

Does that mean you manually inserted timers to get total function call overhead?

But I see; I was just curious whether there was some simple audit more
precise than `perf record` that didn't require building custom
infrastructure.

>
> Cheers,
> Wilco

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH v2] x86-64: Optimize bzero
  2022-02-24 22:58     ` Noah Goldstein
  2022-02-24 23:21       ` Noah Goldstein
@ 2022-02-25 13:51       ` Wilco Dijkstra
  2022-02-25 17:35         ` Noah Goldstein
  1 sibling, 1 reply; 39+ messages in thread
From: Wilco Dijkstra @ 2022-02-25 13:51 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Adhemerval Zanella, H.J. Lu

Hi Noah,

> What method are you using for getting total function call overhead?
>
> Using `perf record` + `pref report` and see a fair amount of variance but much
> higher memset overheader (counting `_*unaligned_erms` and `*_unaligned`)
> in `cc1` and `as`.
>
> From average of 3 runs compiling file with 1/10/100/1000 functions I get:

For accurate profiles you need to run for longer and increase the sampling rate.
The 1000 function version takes less than a second to compile, which is why I
went for 10000 functions at 20000 samples per second.

I used LD_PRELOAD to use the latest version of __memset_evex_unaligned_erms.
You can also trivially collect memset statistics this way.  E.g. in this
benchmark 75% of memset calls are <= 32 bytes and the average memset size
is 44 bytes, so it has a far higher percentage of really small memsets
than normal compilation.
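
For anyone who wants to reproduce this, a minimal interposer could look
roughly like the sketch below (my guess at such a setup, with arbitrary
size buckets; not the actual harness used here):

/* memset_stats.c
   Build: gcc -O2 -fPIC -shared -o memset_stats.so memset_stats.c -ldl
   Use:   LD_PRELOAD=./memset_stats.so gcc -c test.c  */
#define _GNU_SOURCE
#include <dlfcn.h>
#include <stddef.h>
#include <stdio.h>

static void *(*real_memset) (void *, int, size_t);
/* Counters are not thread-safe; good enough for cc1/as.  */
static unsigned long long calls, bytes, small;

void *
memset (void *s, int c, size_t n)
{
  if (real_memset == NULL)
    {
      /* dlsym itself may call memset; serve such re-entrant calls with
         a byte loop until the real symbol is resolved.  */
      static int resolving;
      if (resolving)
        {
          unsigned char *p = s;
          for (size_t i = 0; i < n; i++)
            p[i] = (unsigned char) c;
          return s;
        }
      resolving = 1;
      real_memset = (void *(*) (void *, int, size_t)) dlsym (RTLD_NEXT, "memset");
      resolving = 0;
    }
  calls++;
  bytes += n;
  if (n <= 32)
    small++;
  return real_memset (s, c, n);
}

static void __attribute__ ((destructor))
report (void)
{
  fprintf (stderr, "memset: %llu calls, %llu of them <= 32 bytes, avg %llu bytes\n",
           calls, small, calls ? bytes / calls : 0);
}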

Cheers,
Wilco

^ permalink raw reply	[flat|nested] 39+ messages in thread

* Re: [PATCH v2] x86-64: Optimize bzero
  2022-02-24 22:58     ` Noah Goldstein
@ 2022-02-24 23:21       ` Noah Goldstein
  2022-02-25 17:37         ` Noah Goldstein
  2022-02-25 13:51       ` Wilco Dijkstra
  1 sibling, 1 reply; 39+ messages in thread
From: Noah Goldstein @ 2022-02-24 23:21 UTC (permalink / raw)
  To: Wilco Dijkstra; +Cc: GNU C Library, Adhemerval Zanella, H.J. Lu

On Thu, Feb 24, 2022 at 4:58 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Thu, Feb 24, 2022 at 7:16 AM Wilco Dijkstra <Wilco.Dijkstra@arm.com> wrote:
> >
> > Hi Noah,
> >
> > > The performance improvement for the build/link step for a varying number of
> > > small functions per file was consistently in the ~0.8% range. Not mind-blowing,
> > > but I believe it's a genuine improvement.
> >
> > I don't see how it's possible to get anywhere near 0.8%. I tried compiling a file with
> > 10000 empty functions, and the latest __memset_evex_unaligned_erms takes about
> > 1.16% of total time.
>
> Smart sanity check I'll start using.
>
> What method are you using for getting total function call overhead?
>
> Using `perf record` + `perf report` I see a fair amount of variance but much
> higher memset overhead (counting `_*unaligned_erms` and `*_unaligned`)
> in `cc1` and `as`.
>
> From average of 3 runs compiling file with 1/10/100/1000 functions I get:
>
> 1: 4.04%
> 10: 3.94%
> 100: 2.86%
> 1000: 2.68%
>
> So it's slightly less insane, arguing for the following speedups:
>
> 1: ~15%
> 10: ~15%
> 100: ~25% <--- this makes little to no sense
> 1000: ~15%
>
> I personally agree with you that those numbers seem too high, though.
>
> In the best-case micro-benchmark that stressed the p5 bottleneck,
> this is about what we see.
>
> >
> > There are 81.5 million calls to memset in 48 billion cycles for this benchmark. That
> > means 6.8 cycles per memset call on average. A 0.8% speedup would require making
> > each memset 4.7 cycles faster, and that's not possible with bzero.
> >
> > To verify whether vpbroadcastb is a bottleneck I repeated it 16 times. This increased
> > the memset percentage to 1.86%, however the total cycles didn't change measurably.
> >
> > I'm not sure how you're measuring this, but it's clear what you're seeing is not a
> > speedup from bzero.

Bah, you're right. The non-memsetzero GCC was choosing my system's
memset implementation (2.31 + avx2).

Sorry, I'll rerun tonight and post an update.
> >
> > Cheers,
> > Wilco

* Re: [PATCH v2] x86-64: Optimize bzero
  2022-02-24 13:16   ` Wilco Dijkstra
  2022-02-24 15:48     ` H.J. Lu
@ 2022-02-24 22:58     ` Noah Goldstein
  2022-02-24 23:21       ` Noah Goldstein
  2022-02-25 13:51       ` Wilco Dijkstra
  1 sibling, 2 replies; 39+ messages in thread
From: Noah Goldstein @ 2022-02-24 22:58 UTC (permalink / raw)
  To: Wilco Dijkstra; +Cc: GNU C Library, Adhemerval Zanella, H.J. Lu

On Thu, Feb 24, 2022 at 7:16 AM Wilco Dijkstra <Wilco.Dijkstra@arm.com> wrote:
>
> Hi Noah,
>
> > The performance improvement for the build/link step for a varying number of
> > small functions per file was consistently in the ~0.8% range. Not mind-blowing,
> > but I believe it's a genuine improvement.
>
> I don't see how it's possible to get anywhere near 0.8%. I tried compiling a file with
> 10000 empty functions, and the latest __memset_evex_unaligned_erms takes about
> 1.16% of total time.

Smart sanity check I'll start using.

What method are you using for getting total function call overhead?

Using `perf record` + `perf report` I see a fair amount of variance but much
higher memset overhead (counting `_*unaligned_erms` and `*_unaligned`)
in `cc1` and `as`.

From average of 3 runs compiling file with 1/10/100/1000 functions I get:

1: 4.04%
10: 3.94%
100: 2.86%
1000: 2.68%

So it's slightly less insane, arguing for the following speedups:

1: ~15%
10: ~15%
100: ~25% <--- this makes little to no sense
1000: ~15%

I personally agree with you that those numbers seem too high, though.

In the best-case micro-benchmark that stressed the p5 bottleneck,
this is about what we see.

>
> There are 81.5 million calls to memset in 48 billion cycles for this benchmark. That
> means 6.8 cycles per memset call on average. A 0.8% speedup would require making
> each memset 4.7 cycles faster, and that's not possible with bzero.
>
> To verify whether vpbroadcastb is a bottleneck I repeated it 16 times. This increased
> the memset percentage to 1.86%, however the total cycles didn't change measurably.
>
> I'm not sure how you're measuring this, but it's clear what you're seeing is not a
> speedup from bzero.
>
> Cheers,
> Wilco

* Re: [PATCH v2] x86-64: Optimize bzero
  2022-02-24 13:16   ` Wilco Dijkstra
@ 2022-02-24 15:48     ` H.J. Lu
  2022-02-24 22:58     ` Noah Goldstein
  1 sibling, 0 replies; 39+ messages in thread
From: H.J. Lu @ 2022-02-24 15:48 UTC (permalink / raw)
  To: Wilco Dijkstra; +Cc: Noah Goldstein, GNU C Library, Adhemerval Zanella

On Thu, Feb 24, 2022 at 5:16 AM Wilco Dijkstra <Wilco.Dijkstra@arm.com> wrote:
>
> Hi Noah,
>
> > The performance improvement for the build/link step for a varying number of
> > small functions per file was consistently in the ~0.8% range. Not mind-blowing,
> > but I believe it's a genuine improvement.
>
> I don't see how it's possible to get anywhere near 0.8%. I tried compiling a file with
> 10000 empty functions, and the latest __memset_evex_unaligned_erms takes about
> 1.16% of total time.
>
> There are 81.5 million calls to memset in 48 billion cycles for this benchmark. That
> means 6.8 cycles per memset call on average. A 0.8% speedup would require making
> each memset 4.7 cycles faster, and that's not possible with bzero.
>
> To verify whether vpbroadcastb is a bottleneck I repeated it 16 times. This increased
> the memset percentage to 1.86%, however the total cycles didn't change measurably.
>
> I'm not sure how you're measuring this, but it's clear what you're seeing is not a
> speedup from bzero.

We are trying to improve memset with a zero value.  The speedup will be
visible only for small sizes (< 32 bytes).
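
As a rough illustration in intrinsics terms (a sketch only, not the patch's
actual code, with made-up function names): a general memset has to broadcast
the fill byte into a vector, while a zero fill can use the xor zero idiom,
which recent cores handle at register rename for essentially no execution
cost.

```
/* Compile with -mavx2.  Illustrative only.  */
#include <immintrin.h>

static void
fill32_generic (void *dst, int c)
{
  /* Broadcast the byte: typically vmovd + vpbroadcastb, which goes
     through the shuffle port.  */
  __m256i v = _mm256_set1_epi8 ((char) c);
  _mm256_storeu_si256 ((__m256i *) dst, v);
}

static void
fill32_zero (void *dst)
{
  /* Zero idiom: vpxor of a register with itself, no broadcast needed.  */
  __m256i v = _mm256_setzero_si256 ();
  _mm256_storeu_si256 ((__m256i *) dst, v);
}
```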

-- 
H.J.

* Re: [PATCH v2] x86-64: Optimize bzero
  2022-02-23  8:12 ` Noah Goldstein
  2022-02-23 12:09   ` Adhemerval Zanella
@ 2022-02-24 13:16   ` Wilco Dijkstra
  2022-02-24 15:48     ` H.J. Lu
  2022-02-24 22:58     ` Noah Goldstein
  1 sibling, 2 replies; 39+ messages in thread
From: Wilco Dijkstra @ 2022-02-24 13:16 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GNU C Library, Adhemerval Zanella, H.J. Lu

Hi Noah,

> The performance improvement for the build/link step for a varying number of
> small functions per file was consistently in the ~0.8% range. Not mind-blowing,
> but I believe it's a genuine improvement.

I don't see how it's possible to get anywhere near 0.8%. I tried compiling a file with
10000 empty functions, and the latest __memset_evex_unaligned_erms takes about
1.16% of total time. 

There are 81.5 million calls to memset in 48 billion cycles for this benchmark. That
means 6.8 cycles per memset call on average. A 0.8% speedup would require making
each memset 4.7 cycles faster, and that's not possible with bzero.
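
Working the numbers through explicitly:

    memset share of runtime:  1.16% of 48e9 cycles  ~= 557M cycles
    cycles per call:          557M / 81.5M calls    ~= 6.8 cycles
    0.8% of total runtime:    0.8% of 48e9 cycles   ~= 384M cycles
    saving needed per call:   384M / 81.5M calls    ~= 4.7 cycles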

To verify whether vpbroadcastb is a bottleneck I repeated it 16 times. This increased
the memset percentage to 1.86%, however the total cycles didn't change measurably.

I'm not sure how you're measuring this, but it's clear what you're seeing is not a
speedup from bzero.

Cheers,
Wilco

* Re: [PATCH v2] x86-64: Optimize bzero
  2022-02-23  8:12 ` Noah Goldstein
@ 2022-02-23 12:09   ` Adhemerval Zanella
  2022-02-24 13:16   ` Wilco Dijkstra
  1 sibling, 0 replies; 39+ messages in thread
From: Adhemerval Zanella @ 2022-02-23 12:09 UTC (permalink / raw)
  To: Noah Goldstein, Wilco Dijkstra; +Cc: GNU C Library, H.J. Lu



On 23/02/2022 05:12, Noah Goldstein wrote:
> On Tue, Feb 15, 2022 at 7:38 AM Wilco Dijkstra <Wilco.Dijkstra@arm.com> wrote:
>>
>> Hi,
>>
>>> Is there any way it can be set up so that one C impl can cover all the
>>> arch that want to just leave `__memsetzero` as an alias to `memset`?
>>> I know they have incompatible interfaces that make it hard but would
>>> a weak static inline in string.h work?
>>
>> No that won't work. A C implementation similar to current string/bzero.c
>> adds unacceptable overhead (since most targets just implement memset and
>> will continue to do so). An inline function in string.h would introduce target
>> hacks in our headers, something we've been working hard to remove over the
>> years.
>>
>> The only reasonable option is a target specific optimization in GCC and LLVM
>> so that memsetzero is only emitted when it is known an optimized GLIBC
>> implementation exists (similar to mempcpy).
>>
>>> It's worth noting that between the two `memset` is the cold function
>>> and `__memsetzero` is the hot one. Based on profiles of GCC11 and
>>> Python3.7.7 setting zero covers 99%+ cases.
>>
>> There is no doubt memset of zero is by far the most common. What is in doubt
>> is whether micro-optimizing is worth it on modern cores. Does Python speed up
>> by a measurable amount if you use memsetzero?
> 
> Ran a few benchmarks for GCC/Python3.7
> 
> There is no measurable benefit using '__memsetzero' in Python3.7
> 
> For GCC there are some cases where there is a consistent speedup
> though it's not universal.
> 
> Times are geomean (N=30) of memsetzero / memset
> (1.0 means no difference, less than 1 means improvement, greater than
> 1 regression).
> 
>  Size, N Funcs,  Type, memsetzero / memset
> small,       1, bench,             0.99986
> small,       1, build,             0.99378
> small,       1,  link,             0.99241
> small,      10, bench,             0.99712
> small,      10, build,             0.99393
> small,      10,  link,             0.99245
> small,     100, bench,             0.99659
> small,     100, build,             0.99271
> small,     100,  link,             0.99227
> small,     250, bench,             1.00195
> small,     250, build,             0.99609
> small,     250,  link,             0.99744
> large,     N/A, bench,             0.99930
> 
> 
> The "small" size basically means the file was filled with essentially empty
> functions i.e
> ```
> int foo(void) { return 0; }
> ```
> 
> N Funcs refers to the number of these functions per file, so small-250 would
> be 250 empty functions per file.
> 
> Bench recompiled the same file 100x times
> Build compiled all the files
> Link linked all the files with a main that emitted 1x call per function
> 
> The "large" size was a realistic file someone might compile (in this case
> a freeze of sqlite3.c).
> 
> The performance improvement for the build/link step for a varying number of
> small functions per file was consistently in the ~0.8% range. Not mind-blowing,
> but I believe it's a genuine improvement.
> 
> I don't think this shows expected GCC usage is going to be faster, but
> I do think it shows that the effects of this change could be noticeable in an
> application.
> 

I hardly consider this marginal improvement a good reason to add a libc
symbol, especially because it is unlikely most architectures will ever provide
an optimized version of this (the aarch64 maintainer said they are not looking
forward to it) and newer architecture extensions (such as s390 mvcle) or
compiler optimizations (such as PGO or LTO) might remove the function call
altogether.

> NB: I'm not exactly certain why 'bench' doesn't follow the same trend
> as build/link.
> The only thing I notice is that 'bench' takes longer (implemented in a Makefile
> loop), so possibly the '+ c' term just dampens any performance differences.
> The math for this doesn't work out 100%, so there is still a bit to be
> skeptical of.

* Re: [PATCH v2] x86-64: Optimize bzero
  2022-02-15 13:38 Wilco Dijkstra
@ 2022-02-23  8:12 ` Noah Goldstein
  2022-02-23 12:09   ` Adhemerval Zanella
  2022-02-24 13:16   ` Wilco Dijkstra
  0 siblings, 2 replies; 39+ messages in thread
From: Noah Goldstein @ 2022-02-23  8:12 UTC (permalink / raw)
  To: Wilco Dijkstra; +Cc: GNU C Library, Adhemerval Zanella, H.J. Lu

On Tue, Feb 15, 2022 at 7:38 AM Wilco Dijkstra <Wilco.Dijkstra@arm.com> wrote:
>
> Hi,
>
> > Is there any way it can be set up so that one C impl can cover all the
> > arch that want to just leave `__memsetzero` as an alias to `memset`?
> > I know they have incompatible interfaces that make it hard but would
> > a weak static inline in string.h work?
>
> No that won't work. A C implementation similar to current string/bzero.c
> adds unacceptable overhead (since most targets just implement memset and
> will continue to do so). An inline function in string.h would introduce target
> hacks in our headers, something we've been working hard to remove over the
> years.
>
> The only reasonable option is a target specific optimization in GCC and LLVM
> so that memsetzero is only emitted when it is known an optimized GLIBC
> implementation exists (similar to mempcpy).
>
> > It's worth noting that between the two `memset` is the cold function
> > and `__memsetzero` is the hot one. Based on profiles of GCC11 and
> > Python3.7.7 setting zero covers 99%+ cases.
>
> There is no doubt memset of zero is by far the most common. What is in doubt
> is whether micro-optimizing is worth it on modern cores. Does Python speed up
> by a measurable amount if you use memsetzero?

Ran a few benchmarks for GCC/Python3.7

There is no measurable benefit using '__memsetzero' in Python3.7

For GCC there are some cases where there is a consistent speedup
though it's not universal.

Times are geomean (N=30) of memsetzero / memset
(1.0 means no difference, less than 1 means improvement, greater than
1 regression).

 Size, N Funcs,  Type, memsetzero / memset
small,       1, bench,             0.99986
small,       1, build,             0.99378
small,       1,  link,             0.99241
small,      10, bench,             0.99712
small,      10, build,             0.99393
small,      10,  link,             0.99245
small,     100, bench,             0.99659
small,     100, build,             0.99271
small,     100,  link,             0.99227
small,     250, bench,             1.00195
small,     250, build,             0.99609
small,     250,  link,             0.99744
large,     N/A, bench,             0.99930


The "small" size basically means the file was filled with essentially empty
functions i.e
```
int foo(void) { return 0; }
```

N Funcs refers to the number of these functions per file, so small-250 would
be 250 empty functions per file.
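
(A file like that could be produced with a trivial generator along these
lines; this is a hypothetical sketch, not the script actually used, and the
foo%d naming is only an assumption to keep the functions distinct.)

```
/* gen_small.c: emit N essentially empty functions, one per line.
   Usage: ./gen_small 250 > small_250.c  */
#include <stdio.h>
#include <stdlib.h>

int
main (int argc, char **argv)
{
  int n = argc > 1 ? atoi (argv[1]) : 100;
  for (int i = 0; i < n; i++)
    printf ("int foo%d (void) { return 0; }\n", i);
  return 0;
}
```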

Bench recompiled the same file 100x times
Build compiled all the files
Link linked all the files with a main that emitted 1x call per function

The "large" size was a realistic file someone might compile (in this case
a freeze of sqlite3.c).

The performance improvement for the build/link step for a varying number of
small functions per file was consistently in the ~0.8% range. Not mind-blowing,
but I believe it's a genuine improvement.

I don't think this shows expected GCC usage is going to be faster, but
I do think it shows that the effects of this change could be noticeable in an
application.

NB: I'm not exactly certain why 'bench' doesn't follow the same trend
as build/link.
The only thing I notice is that 'bench' takes longer (implemented in a Makefile
loop), so possibly the '+ c' term just dampens any performance differences.
The math for this doesn't work out 100%, so there is still a bit to be
skeptical of.


>
> Cheers,
> Wilco

* [PATCH v2] x86-64: Optimize bzero
@ 2022-02-15 13:38 Wilco Dijkstra
  2022-02-23  8:12 ` Noah Goldstein
  0 siblings, 1 reply; 39+ messages in thread
From: Wilco Dijkstra @ 2022-02-15 13:38 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: 'GNU C Library', Adhemerval Zanella, H.J. Lu

Hi,

> Is there any way it can be set up so that one C impl can cover all the
> arch that want to just leave `__memsetzero` as an alias to `memset`?
> I know they have incompatible interfaces that make it hard but would
> a weak static inline in string.h work?

No that won't work. A C implementation similar to current string/bzero.c
adds unacceptable overhead (since most targets just implement memset and
will continue to do so). An inline function in string.h would introduce target
hacks in our headers, something we've been working hard to remove over the
years.
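
(For concreteness, the string.h idea under discussion would look roughly like
the following; a hypothetical sketch, not code anyone posted, with the
__memsetzero name taken from the discussion above.)

```
/* Hypothetical generic fallback living in string.h, where memset and
   size_t are already declared; targets with an optimized entry point
   would have to override it, which is exactly the kind of target-specific
   header hack being objected to here.  */
static __inline void
__memsetzero (void *__dest, size_t __n)
{
  memset (__dest, 0, __n);
}
```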

The only reasonable option is a target specific optimization in GCC and LLVM
so that memsetzero is only emitted when it is known an optimized GLIBC
implementation exists (similar to mempcpy).

> It's worth noting that between the two `memset` is the cold function
> and `__memsetzero` is the hot one. Based on profiles of GCC11 and
> Python3.7.7 setting zero covers 99%+ cases.

There is no doubt memset of zero is by far the most common. What is in doubt
is whether micro-optimizing is worth it on modern cores. Does Python speed up
by a measurable amount if you use memsetzero?

Cheers,
Wilco

end of thread, other threads:[~2022-05-04 14:54 UTC | newest]

Thread overview: 39+ messages
2022-02-08 22:43 [PATCH v2] x86-64: Optimize bzero H.J. Lu
2022-02-08 23:56 ` Noah Goldstein
2022-02-09 11:41 ` Adhemerval Zanella
2022-02-09 22:14   ` Noah Goldstein
2022-02-10 12:35     ` Adhemerval Zanella
2022-02-10 13:01       ` Wilco Dijkstra
2022-02-10 13:10         ` Adhemerval Zanella
2022-02-10 13:16           ` Adhemerval Zanella
2022-02-10 13:17           ` Wilco Dijkstra
2022-02-10 13:22             ` Adhemerval Zanella
2022-02-10 17:50               ` Alejandro Colomar (man-pages)
2022-02-10 19:19                 ` Wilco Dijkstra
2022-02-10 20:27                   ` Alejandro Colomar (man-pages)
2022-02-10 20:42                     ` Adhemerval Zanella
2022-02-10 21:07                       ` Patrick McGehearty
2022-02-11 13:01                         ` Adhemerval Zanella
2022-02-12 23:46                           ` Noah Goldstein
2022-02-14 12:07                             ` Adhemerval Zanella
2022-02-14 12:41                               ` Noah Goldstein
2022-02-14 14:07                                 ` Adhemerval Zanella
2022-02-14 15:03                                   ` H.J. Lu
2022-05-04  6:35                                     ` Sunil Pandey
2022-05-04 12:52                                       ` Adhemerval Zanella
2022-05-04 14:50                                         ` H.J. Lu
2022-05-04 14:54                                           ` Adhemerval Zanella
2022-02-10 22:00                       ` Alejandro Colomar (man-pages)
2022-02-10 19:42                 ` Adhemerval Zanella
2022-02-10 18:28         ` Noah Goldstein
2022-02-10 18:35         ` Noah Goldstein
2022-02-15 13:38 Wilco Dijkstra
2022-02-23  8:12 ` Noah Goldstein
2022-02-23 12:09   ` Adhemerval Zanella
2022-02-24 13:16   ` Wilco Dijkstra
2022-02-24 15:48     ` H.J. Lu
2022-02-24 22:58     ` Noah Goldstein
2022-02-24 23:21       ` Noah Goldstein
2022-02-25 17:37         ` Noah Goldstein
2022-02-25 13:51       ` Wilco Dijkstra
2022-02-25 17:35         ` Noah Goldstein
