public inbox for libc-alpha@sourceware.org
* [PATCH] x86-64: Optimize memset for zeroing
@ 2021-12-31 18:20 H.J. Lu
From: H.J. Lu @ 2021-12-31 18:20 UTC
  To: libc-alpha; +Cc: Noah Goldstein, arjan

Update MEMSET_VDUP_TO_VEC0_AND_SET_RETURN to use PXOR, which has lower
latency and higher throughput than VPBROADCAST, when the fill value is
the constant zero.  Since the most common use of memset is to zero a
block of memory, the branch predictor makes the compare/jmp essentially
free, so the PXOR is effectively executed unconditionally.
---
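For reference, the dispatch the updated macros implement reduces to the
following minimal, standalone sketch (hypothetical memset_vdup_sketch
label and the plain SSE2 sequence; not the actual glibc entry code):
test the fill byte, zero %xmm0 with PXOR on the common zero path, and
fall back to the byte-broadcast sequence only for non-zero values.

	.text
	.globl	memset_vdup_sketch
	.type	memset_vdup_sketch, @function
memset_vdup_sketch:
	movq	%rdi, %rax		/* memset returns the destination.  */
	testl	%esi, %esi		/* Is the fill value zero?  */
	jnz	1f			/* Rare path: non-zero fill.  */
	pxor	%xmm0, %xmm0		/* Common path: zero %xmm0 cheaply.  */
	jmp	2f
1:	/* Non-zero fill: broadcast the low byte of %esi to all lanes.  */
	movd	%esi, %xmm0
	punpcklbw %xmm0, %xmm0
	punpcklwd %xmm0, %xmm0
	pshufd	$0, %xmm0, %xmm0
2:	/* %xmm0 now holds the fill pattern; the real entry points
	   continue with the store loop here.  */
	ret
	.size	memset_vdup_sketch, .-memset_vdup_sketch

In the patch itself the non-zero path is split out into the new
MEMSET_VDUP_TO_VEC0/WMEMSET_VDUP_TO_VEC0 macros and reached through the
local label 1:, so the zero fast path pays only a single
predicted-not-taken jnz.
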
 sysdeps/x86_64/memset.S                            | 14 ++++++++++++--
 .../x86_64/multiarch/memset-avx2-unaligned-erms.S  | 14 ++++++++++++--
 .../multiarch/memset-avx512-unaligned-erms.S       | 10 ++++++++++
 .../x86_64/multiarch/memset-evex-unaligned-erms.S  | 10 ++++++++++
 .../x86_64/multiarch/memset-vec-unaligned-erms.S   | 13 +++++++++++++
 5 files changed, 57 insertions(+), 4 deletions(-)

diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
index 0137eba4cd..513f9c703d 100644
--- a/sysdeps/x86_64/memset.S
+++ b/sysdeps/x86_64/memset.S
@@ -29,15 +29,25 @@
 #define VMOVA     movaps
 
 #define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
-  movd d, %xmm0; \
   movq r, %rax; \
+  testl d, d; \
+  jnz 1f; \
+  pxor %xmm0, %xmm0
+
+# define MEMSET_VDUP_TO_VEC0(d) \
+  movd d, %xmm0; \
   punpcklbw %xmm0, %xmm0; \
   punpcklwd %xmm0, %xmm0; \
   pshufd $0, %xmm0, %xmm0
 
 #define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
-  movd d, %xmm0; \
   movq r, %rax; \
+  testl d, d; \
+  jnz 1f; \
+  pxor %xmm0, %xmm0
+
+# define WMEMSET_VDUP_TO_VEC0(d) \
+  movd d, %xmm0; \
   pshufd $0, %xmm0, %xmm0
 
 #define SECTION(p)		p
diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
index 1af668af0a..8004a27750 100644
--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
@@ -11,13 +11,23 @@
 # define VMOVA     vmovdqa
 
 # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
-  vmovd d, %xmm0; \
   movq r, %rax; \
+  testl d, d; \
+  jnz 1f; \
+  vpxor %xmm0, %xmm0, %xmm0
+
+# define MEMSET_VDUP_TO_VEC0(d) \
+  vmovd d, %xmm0; \
   vpbroadcastb %xmm0, %ymm0
 
 # define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
-  vmovd d, %xmm0; \
   movq r, %rax; \
+  testl d, d; \
+  jnz 1f; \
+  vpxor %xmm0, %xmm0, %xmm0
+
+# define WMEMSET_VDUP_TO_VEC0(d) \
+  vmovd d, %xmm0; \
   vpbroadcastd %xmm0, %ymm0
 
 # ifndef SECTION
diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
index f14d6f8493..61ff9ccf6f 100644
--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
@@ -17,10 +17,20 @@
 
 # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
   movq r, %rax; \
+  testl d, d; \
+  jnz 1f; \
+  vpxorq %XMM0, %XMM0, %XMM0
+
+# define MEMSET_VDUP_TO_VEC0(d) \
   vpbroadcastb d, %VEC0
 
 # define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
   movq r, %rax; \
+  testl d, d; \
+  jnz 1f; \
+  vpxorq %XMM0, %XMM0, %XMM0
+
+# define WMEMSET_VDUP_TO_VEC0(d) \
   vpbroadcastd d, %VEC0
 
 # define SECTION(p)		p##.evex512
diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
index 64b09e77cc..85544fb0fc 100644
--- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
@@ -17,10 +17,20 @@
 
 # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
   movq r, %rax; \
+  testl d, d; \
+  jnz 1f; \
+  vpxorq %XMM0, %XMM0, %XMM0
+
+# define MEMSET_VDUP_TO_VEC0(d) \
   vpbroadcastb d, %VEC0
 
 # define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
   movq r, %rax; \
+  testl d, d; \
+  jnz 1f; \
+  vpxorq %XMM0, %XMM0, %XMM0
+
+# define WMEMSET_VDUP_TO_VEC0(d) \
   vpbroadcastd d, %VEC0
 
 # define SECTION(p)		p##.evex
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index e723413a66..4ca34a19ba 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -112,6 +112,9 @@ ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
 	shl	$2, %RDX_LP
 	WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
 	jmp	L(entry_from_bzero)
+1:
+	WMEMSET_VDUP_TO_VEC0 (%esi)
+	jmp	L(entry_from_bzero)
 END (WMEMSET_SYMBOL (__wmemset, unaligned))
 #endif
 
@@ -124,6 +127,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
 
 ENTRY (MEMSET_SYMBOL (__memset, unaligned))
 	MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
+2:
 # ifdef __ILP32__
 	/* Clear the upper 32 bits.  */
 	mov	%edx, %edx
@@ -137,6 +141,10 @@ L(entry_from_bzero):
 	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
 	VMOVU	%VEC(0), (%rdi)
 	VZEROUPPER_RETURN
+
+1:
+	MEMSET_VDUP_TO_VEC0 (%esi)
+	jmp	2b
 #if defined USE_MULTIARCH && IS_IN (libc)
 END (MEMSET_SYMBOL (__memset, unaligned))
 
@@ -180,6 +188,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
 
 ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
 	MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
+2:
 # ifdef __ILP32__
 	/* Clear the upper 32 bits.  */
 	mov	%edx, %edx
@@ -193,6 +202,10 @@ ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
 	VMOVU	%VEC(0), (%rax)
 	VMOVU	%VEC(0), -VEC_SIZE(%rax, %rdx)
 	VZEROUPPER_RETURN
+
+1:
+	MEMSET_VDUP_TO_VEC0 (%esi)
+	jmp	2b
 #endif
 
 	.p2align 4,, 10
-- 
2.33.1



Thread overview: 14+ messages
2021-12-31 18:20 [PATCH] x86-64: Optimize memset for zeroing H.J. Lu
2021-12-31 20:21 ` Noah Goldstein
2021-12-31 20:35   ` H.J. Lu
2021-12-31 20:43     ` Florian Weimer
2021-12-31 20:52       ` H.J. Lu
2021-12-31 21:02         ` Florian Weimer
2021-12-31 21:15           ` Noah Goldstein
2021-12-31 22:05             ` Florian Weimer
2021-12-31 22:14     ` Noah Goldstein
2021-12-31 22:19       ` Noah Goldstein
2021-12-31 22:21         ` H.J. Lu
2022-01-02 16:01   ` Cristian Rodríguez
2022-01-03 20:09   ` Patrick McGehearty
2022-01-03 21:34     ` Noah Goldstein
