public inbox for libc-alpha@sourceware.org
* [PATCH] x86-64: Optimize memset for zeroing
@ 2021-12-31 18:20 H.J. Lu
  2021-12-31 20:21 ` Noah Goldstein
  0 siblings, 1 reply; 14+ messages in thread
From: H.J. Lu @ 2021-12-31 18:20 UTC (permalink / raw)
  To: libc-alpha; +Cc: Noah Goldstein, arjan

Update MEMSET_VDUP_TO_VEC0_AND_SET_RETURN to use PXOR, which has lower
latency and higher throughput than VPBROADCAST, for a zero constant.
Since the most common use of memset is to zero a block of memory, the
branch predictor makes the compare/jmp essentially free, and PXOR is
in effect executed unconditionally.
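
As a rough C/intrinsics sketch of what the new entry sequence amounts to
(illustrative only -- the function name and the scalar tail loop are invented
here, and the real code is the hand-written assembly below):

  #include <emmintrin.h>
  #include <stddef.h>

  static void *
  memset_sse2_sketch (void *dst, int c, size_t n)
  {
    __m128i pattern;

    if (c == 0)
      /* pxor %xmm0, %xmm0: lower latency and higher throughput than the
         broadcast sequence below.  */
      pattern = _mm_setzero_si128 ();
    else
      /* Stands in for movd + punpcklbw + punpcklwd + pshufd.  */
      pattern = _mm_set1_epi8 ((char) c);

    unsigned char *p = dst;
    size_t i = 0;
    for (; i + 16 <= n; i += 16)
      _mm_storeu_si128 ((__m128i *) (p + i), pattern);
    for (; i < n; i++)
      p[i] = (unsigned char) c;
    return dst;
  }

The point is that the common c == 0 case reaches the store loop through a
single PXOR instead of the four-instruction broadcast.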
---
 sysdeps/x86_64/memset.S                            | 14 ++++++++++++--
 .../x86_64/multiarch/memset-avx2-unaligned-erms.S  | 14 ++++++++++++--
 .../multiarch/memset-avx512-unaligned-erms.S       | 10 ++++++++++
 .../x86_64/multiarch/memset-evex-unaligned-erms.S  | 10 ++++++++++
 .../x86_64/multiarch/memset-vec-unaligned-erms.S   | 13 +++++++++++++
 5 files changed, 57 insertions(+), 4 deletions(-)

diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
index 0137eba4cd..513f9c703d 100644
--- a/sysdeps/x86_64/memset.S
+++ b/sysdeps/x86_64/memset.S
@@ -29,15 +29,25 @@
 #define VMOVA     movaps
 
 #define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
-  movd d, %xmm0; \
   movq r, %rax; \
+  testl d, d; \
+  jnz 1f; \
+  pxor %xmm0, %xmm0
+
+# define MEMSET_VDUP_TO_VEC0(d) \
+  movd d, %xmm0; \
   punpcklbw %xmm0, %xmm0; \
   punpcklwd %xmm0, %xmm0; \
   pshufd $0, %xmm0, %xmm0
 
 #define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
-  movd d, %xmm0; \
   movq r, %rax; \
+  testl d, d; \
+  jnz 1f; \
+  pxor %xmm0, %xmm0
+
+# define WMEMSET_VDUP_TO_VEC0(d) \
+  movd d, %xmm0; \
   pshufd $0, %xmm0, %xmm0
 
 #define SECTION(p)		p
diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
index 1af668af0a..8004a27750 100644
--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
@@ -11,13 +11,23 @@
 # define VMOVA     vmovdqa
 
 # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
-  vmovd d, %xmm0; \
   movq r, %rax; \
+  testl d, d; \
+  jnz 1f; \
+  vpxor %xmm0, %xmm0, %xmm0
+
+# define MEMSET_VDUP_TO_VEC0(d) \
+  vmovd d, %xmm0; \
   vpbroadcastb %xmm0, %ymm0
 
 # define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
-  vmovd d, %xmm0; \
   movq r, %rax; \
+  testl d, d; \
+  jnz 1f; \
+  vpxor %xmm0, %xmm0, %xmm0
+
+# define WMEMSET_VDUP_TO_VEC0(d) \
+  vmovd d, %xmm0; \
   vpbroadcastd %xmm0, %ymm0
 
 # ifndef SECTION
diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
index f14d6f8493..61ff9ccf6f 100644
--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
@@ -17,10 +17,20 @@
 
 # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
   movq r, %rax; \
+  testl d, d; \
+  jnz 1f; \
+  vpxorq %XMM0, %XMM0, %XMM0
+
+# define MEMSET_VDUP_TO_VEC0(d) \
   vpbroadcastb d, %VEC0
 
 # define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
   movq r, %rax; \
+  testl d, d; \
+  jnz 1f; \
+  vpxorq %XMM0, %XMM0, %XMM0
+
+# define WMEMSET_VDUP_TO_VEC0(d) \
   vpbroadcastd d, %VEC0
 
 # define SECTION(p)		p##.evex512
diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
index 64b09e77cc..85544fb0fc 100644
--- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
@@ -17,10 +17,20 @@
 
 # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
   movq r, %rax; \
+  testl d, d; \
+  jnz 1f; \
+  vpxorq %XMM0, %XMM0, %XMM0
+
+# define MEMSET_VDUP_TO_VEC0(d) \
   vpbroadcastb d, %VEC0
 
 # define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
   movq r, %rax; \
+  testl d, d; \
+  jnz 1f; \
+  vpxorq %XMM0, %XMM0, %XMM0
+
+# define WMEMSET_VDUP_TO_VEC0(d) \
   vpbroadcastd d, %VEC0
 
 # define SECTION(p)		p##.evex
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index e723413a66..4ca34a19ba 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -112,6 +112,9 @@ ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
 	shl	$2, %RDX_LP
 	WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
 	jmp	L(entry_from_bzero)
+1:
+	WMEMSET_VDUP_TO_VEC0 (%esi)
+	jmp	L(entry_from_bzero)
 END (WMEMSET_SYMBOL (__wmemset, unaligned))
 #endif
 
@@ -124,6 +127,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
 
 ENTRY (MEMSET_SYMBOL (__memset, unaligned))
 	MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
+2:
 # ifdef __ILP32__
 	/* Clear the upper 32 bits.  */
 	mov	%edx, %edx
@@ -137,6 +141,10 @@ L(entry_from_bzero):
 	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
 	VMOVU	%VEC(0), (%rdi)
 	VZEROUPPER_RETURN
+
+1:
+	MEMSET_VDUP_TO_VEC0 (%esi)
+	jmp	2b
 #if defined USE_MULTIARCH && IS_IN (libc)
 END (MEMSET_SYMBOL (__memset, unaligned))
 
@@ -180,6 +188,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
 
 ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
 	MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
+2:
 # ifdef __ILP32__
 	/* Clear the upper 32 bits.  */
 	mov	%edx, %edx
@@ -193,6 +202,10 @@ ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
 	VMOVU	%VEC(0), (%rax)
 	VMOVU	%VEC(0), -VEC_SIZE(%rax, %rdx)
 	VZEROUPPER_RETURN
+
+1:
+	MEMSET_VDUP_TO_VEC0 (%esi)
+	jmp	2b
 #endif
 
 	.p2align 4,, 10
-- 
2.33.1



* Re: [PATCH] x86-64: Optimize memset for zeroing
  2021-12-31 18:20 [PATCH] x86-64: Optimize memset for zeroing H.J. Lu
@ 2021-12-31 20:21 ` Noah Goldstein
  2021-12-31 20:35   ` H.J. Lu
                     ` (2 more replies)
  0 siblings, 3 replies; 14+ messages in thread
From: Noah Goldstein @ 2021-12-31 20:21 UTC (permalink / raw)
  To: H.J. Lu; +Cc: GNU C Library, Arjan van de Ven

On Fri, Dec 31, 2021 at 12:20 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> Update MEMSET_VDUP_TO_VEC0_AND_SET_RETURN to use PXOR, which has lower
> latency and higher throughput than VPBROADCAST, for a zero constant.
> Since the most common use of memset is to zero a block of memory, the
> branch predictor makes the compare/jmp essentially free, and PXOR is
> in effect executed unconditionally.

Any benchmark results? Is the broadcast on the critical path for any size?

Also, I imagine that for the vast majority of memset calls the zero value is
known at compile time.

I think it might make more sense to give bzero() the fall-through path instead
and add a patch to GCC to prefer bzero over memset.
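
A sketch of that suggestion (illustrative only; GCC does not currently
prefer bzero, and the function below is just an example call site):

  #include <string.h>
  #include <strings.h>	/* bzero */

  /* The idea: a call written as memset (buf, 0, n) would be emitted by
     the compiler as bzero (buf, n), and bzero would then own the
     fall-through path in the memset assembly.  */
  void
  clear_buffer (void *buf, size_t n)
  {
    bzero (buf, n);	/* stands in for memset (buf, 0, n) */
  }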


> ---
>  sysdeps/x86_64/memset.S                            | 14 ++++++++++++--
>  .../x86_64/multiarch/memset-avx2-unaligned-erms.S  | 14 ++++++++++++--
>  .../multiarch/memset-avx512-unaligned-erms.S       | 10 ++++++++++
>  .../x86_64/multiarch/memset-evex-unaligned-erms.S  | 10 ++++++++++
>  .../x86_64/multiarch/memset-vec-unaligned-erms.S   | 13 +++++++++++++
>  5 files changed, 57 insertions(+), 4 deletions(-)
>
> diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
> index 0137eba4cd..513f9c703d 100644
> --- a/sysdeps/x86_64/memset.S
> +++ b/sysdeps/x86_64/memset.S
> @@ -29,15 +29,25 @@
>  #define VMOVA     movaps
>
>  #define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> -  movd d, %xmm0; \
>    movq r, %rax; \
> +  testl d, d; \
> +  jnz 1f; \
> +  pxor %xmm0, %xmm0
> +
> +# define MEMSET_VDUP_TO_VEC0(d) \
> +  movd d, %xmm0; \
>    punpcklbw %xmm0, %xmm0; \
>    punpcklwd %xmm0, %xmm0; \
>    pshufd $0, %xmm0, %xmm0
>
>  #define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> -  movd d, %xmm0; \
>    movq r, %rax; \
> +  testl d, d; \
> +  jnz 1f; \
> +  pxor %xmm0, %xmm0
> +
> +# define WMEMSET_VDUP_TO_VEC0(d) \
> +  movd d, %xmm0; \
>    pshufd $0, %xmm0, %xmm0
>
>  #define SECTION(p)             p
> diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> index 1af668af0a..8004a27750 100644
> --- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> @@ -11,13 +11,23 @@
>  # define VMOVA     vmovdqa
>
>  # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> -  vmovd d, %xmm0; \
>    movq r, %rax; \
> +  testl d, d; \
> +  jnz 1f; \
> +  vpxor %xmm0, %xmm0, %xmm0
> +
> +# define MEMSET_VDUP_TO_VEC0(d) \
> +  vmovd d, %xmm0; \
>    vpbroadcastb %xmm0, %ymm0
>
>  # define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> -  vmovd d, %xmm0; \
>    movq r, %rax; \
> +  testl d, d; \
> +  jnz 1f; \
> +  vpxor %xmm0, %xmm0, %xmm0
> +
> +# define WMEMSET_VDUP_TO_VEC0(d) \
> +  vmovd d, %xmm0; \
>    vpbroadcastd %xmm0, %ymm0
>
>  # ifndef SECTION
> diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> index f14d6f8493..61ff9ccf6f 100644
> --- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> @@ -17,10 +17,20 @@
>
>  # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
>    movq r, %rax; \
> +  testl d, d; \
> +  jnz 1f; \
> +  vpxorq %XMM0, %XMM0, %XMM0
> +
> +# define MEMSET_VDUP_TO_VEC0(d) \
>    vpbroadcastb d, %VEC0
>
>  # define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
>    movq r, %rax; \
> +  testl d, d; \
> +  jnz 1f; \
> +  vpxorq %XMM0, %XMM0, %XMM0
> +
> +# define WMEMSET_VDUP_TO_VEC0(d) \
>    vpbroadcastd d, %VEC0
>
>  # define SECTION(p)            p##.evex512
> diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> index 64b09e77cc..85544fb0fc 100644
> --- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> @@ -17,10 +17,20 @@
>
>  # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
>    movq r, %rax; \
> +  testl d, d; \
> +  jnz 1f; \
> +  vpxorq %XMM0, %XMM0, %XMM0
> +
> +# define MEMSET_VDUP_TO_VEC0(d) \
>    vpbroadcastb d, %VEC0
>
>  # define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
>    movq r, %rax; \
> +  testl d, d; \
> +  jnz 1f; \
> +  vpxorq %XMM0, %XMM0, %XMM0
> +
> +# define WMEMSET_VDUP_TO_VEC0(d) \
>    vpbroadcastd d, %VEC0
>
>  # define SECTION(p)            p##.evex
> diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> index e723413a66..4ca34a19ba 100644
> --- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> @@ -112,6 +112,9 @@ ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
>         shl     $2, %RDX_LP
>         WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
>         jmp     L(entry_from_bzero)
> +1:
> +       WMEMSET_VDUP_TO_VEC0 (%esi)
> +       jmp     L(entry_from_bzero)
>  END (WMEMSET_SYMBOL (__wmemset, unaligned))
>  #endif
>
> @@ -124,6 +127,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
>
>  ENTRY (MEMSET_SYMBOL (__memset, unaligned))
>         MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
> +2:
>  # ifdef __ILP32__
>         /* Clear the upper 32 bits.  */
>         mov     %edx, %edx
> @@ -137,6 +141,10 @@ L(entry_from_bzero):
>         VMOVU   %VEC(0), -VEC_SIZE(%rdi,%rdx)
>         VMOVU   %VEC(0), (%rdi)
>         VZEROUPPER_RETURN
> +
> +1:
> +       MEMSET_VDUP_TO_VEC0 (%esi)
> +       jmp     2b
>  #if defined USE_MULTIARCH && IS_IN (libc)
>  END (MEMSET_SYMBOL (__memset, unaligned))
>
> @@ -180,6 +188,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
>
>  ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
>         MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
> +2:
>  # ifdef __ILP32__
>         /* Clear the upper 32 bits.  */
>         mov     %edx, %edx
> @@ -193,6 +202,10 @@ ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
>         VMOVU   %VEC(0), (%rax)
>         VMOVU   %VEC(0), -VEC_SIZE(%rax, %rdx)
>         VZEROUPPER_RETURN
> +
> +1:
> +       MEMSET_VDUP_TO_VEC0 (%esi)
> +       jmp     2b
>  #endif
>
>         .p2align 4,, 10
> --
> 2.33.1
>


* Re: [PATCH] x86-64: Optimize memset for zeroing
  2021-12-31 20:21 ` Noah Goldstein
@ 2021-12-31 20:35   ` H.J. Lu
  2021-12-31 20:43     ` Florian Weimer
  2021-12-31 22:14     ` Noah Goldstein
  2022-01-02 16:01   ` Cristian Rodríguez
  2022-01-03 20:09   ` Patrick McGehearty
  2 siblings, 2 replies; 14+ messages in thread
From: H.J. Lu @ 2021-12-31 20:35 UTC (permalink / raw)
  To: Noah Goldstein, GCC Development; +Cc: GNU C Library, Arjan van de Ven

On Fri, Dec 31, 2021 at 12:21 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Fri, Dec 31, 2021 at 12:20 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > Update MEMSET_VDUP_TO_VEC0_AND_SET_RETURN to use PXOR, which has lower
> > latency and higher throughput than VPBROADCAST, for a zero constant.
> > Since the most common use of memset is to zero a block of memory, the
> > branch predictor makes the compare/jmp essentially free, and PXOR is
> > in effect executed unconditionally.
>
> Any benchmark results? Is the broadcast on the critical path for any size?

Can you run your workloads to see how many memset calls are zeroing?

> Also imagine the vast majority of memset zero are compile time known.
>
> I think it might make more sense to give bzero() the fall-through instead and

bzero is an alias of SSE2 memset in glibc.   Should we add __memsetzero
like __memcmpeq?  It should be almost free in glibc.  GCC can use
__memsetzero if it is available.
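
An interface sketch of that idea (hypothetical: __memsetzero does not exist
in glibc yet, the prototype is only a guess modeled on memset, and the
trivial definition below merely stands in for an optimized entry point):

  #include <stddef.h>
  #include <string.h>

  /* Hypothetical entry point; in glibc it would be optimized assembly
     rather than this wrapper.  */
  void *
  __memsetzero (void *dest, size_t n)
  {
    return memset (dest, 0, n);
  }

  /* A compiler that knows the prototype could lower memset (p, 0, n)
     to __memsetzero (p, n) at call sites like this one.  */
  void
  clear (void *p, size_t n)
  {
    __memsetzero (p, n);
  }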

> add a patch in GCC to prefer bzero > memset.
>
>
> > ---
> >  sysdeps/x86_64/memset.S                            | 14 ++++++++++++--
> >  .../x86_64/multiarch/memset-avx2-unaligned-erms.S  | 14 ++++++++++++--
> >  .../multiarch/memset-avx512-unaligned-erms.S       | 10 ++++++++++
> >  .../x86_64/multiarch/memset-evex-unaligned-erms.S  | 10 ++++++++++
> >  .../x86_64/multiarch/memset-vec-unaligned-erms.S   | 13 +++++++++++++
> >  5 files changed, 57 insertions(+), 4 deletions(-)
> >
> > diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
> > index 0137eba4cd..513f9c703d 100644
> > --- a/sysdeps/x86_64/memset.S
> > +++ b/sysdeps/x86_64/memset.S
> > @@ -29,15 +29,25 @@
> >  #define VMOVA     movaps
> >
> >  #define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> > -  movd d, %xmm0; \
> >    movq r, %rax; \
> > +  testl d, d; \
> > +  jnz 1f; \
> > +  pxor %xmm0, %xmm0
> > +
> > +# define MEMSET_VDUP_TO_VEC0(d) \
> > +  movd d, %xmm0; \
> >    punpcklbw %xmm0, %xmm0; \
> >    punpcklwd %xmm0, %xmm0; \
> >    pshufd $0, %xmm0, %xmm0
> >
> >  #define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> > -  movd d, %xmm0; \
> >    movq r, %rax; \
> > +  testl d, d; \
> > +  jnz 1f; \
> > +  pxor %xmm0, %xmm0
> > +
> > +# define WMEMSET_VDUP_TO_VEC0(d) \
> > +  movd d, %xmm0; \
> >    pshufd $0, %xmm0, %xmm0
> >
> >  #define SECTION(p)             p
> > diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> > index 1af668af0a..8004a27750 100644
> > --- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> > +++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> > @@ -11,13 +11,23 @@
> >  # define VMOVA     vmovdqa
> >
> >  # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> > -  vmovd d, %xmm0; \
> >    movq r, %rax; \
> > +  testl d, d; \
> > +  jnz 1f; \
> > +  vpxor %xmm0, %xmm0, %xmm0
> > +
> > +# define MEMSET_VDUP_TO_VEC0(d) \
> > +  vmovd d, %xmm0; \
> >    vpbroadcastb %xmm0, %ymm0
> >
> >  # define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> > -  vmovd d, %xmm0; \
> >    movq r, %rax; \
> > +  testl d, d; \
> > +  jnz 1f; \
> > +  vpxor %xmm0, %xmm0, %xmm0
> > +
> > +# define WMEMSET_VDUP_TO_VEC0(d) \
> > +  vmovd d, %xmm0; \
> >    vpbroadcastd %xmm0, %ymm0
> >
> >  # ifndef SECTION
> > diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> > index f14d6f8493..61ff9ccf6f 100644
> > --- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> > +++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> > @@ -17,10 +17,20 @@
> >
> >  # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> >    movq r, %rax; \
> > +  testl d, d; \
> > +  jnz 1f; \
> > +  vpxorq %XMM0, %XMM0, %XMM0
> > +
> > +# define MEMSET_VDUP_TO_VEC0(d) \
> >    vpbroadcastb d, %VEC0
> >
> >  # define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> >    movq r, %rax; \
> > +  testl d, d; \
> > +  jnz 1f; \
> > +  vpxorq %XMM0, %XMM0, %XMM0
> > +
> > +# define WMEMSET_VDUP_TO_VEC0(d) \
> >    vpbroadcastd d, %VEC0
> >
> >  # define SECTION(p)            p##.evex512
> > diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> > index 64b09e77cc..85544fb0fc 100644
> > --- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> > +++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> > @@ -17,10 +17,20 @@
> >
> >  # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> >    movq r, %rax; \
> > +  testl d, d; \
> > +  jnz 1f; \
> > +  vpxorq %XMM0, %XMM0, %XMM0
> > +
> > +# define MEMSET_VDUP_TO_VEC0(d) \
> >    vpbroadcastb d, %VEC0
> >
> >  # define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> >    movq r, %rax; \
> > +  testl d, d; \
> > +  jnz 1f; \
> > +  vpxorq %XMM0, %XMM0, %XMM0
> > +
> > +# define WMEMSET_VDUP_TO_VEC0(d) \
> >    vpbroadcastd d, %VEC0
> >
> >  # define SECTION(p)            p##.evex
> > diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> > index e723413a66..4ca34a19ba 100644
> > --- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> > +++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> > @@ -112,6 +112,9 @@ ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
> >         shl     $2, %RDX_LP
> >         WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
> >         jmp     L(entry_from_bzero)
> > +1:
> > +       WMEMSET_VDUP_TO_VEC0 (%esi)
> > +       jmp     L(entry_from_bzero)
> >  END (WMEMSET_SYMBOL (__wmemset, unaligned))
> >  #endif
> >
> > @@ -124,6 +127,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
> >
> >  ENTRY (MEMSET_SYMBOL (__memset, unaligned))
> >         MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
> > +2:
> >  # ifdef __ILP32__
> >         /* Clear the upper 32 bits.  */
> >         mov     %edx, %edx
> > @@ -137,6 +141,10 @@ L(entry_from_bzero):
> >         VMOVU   %VEC(0), -VEC_SIZE(%rdi,%rdx)
> >         VMOVU   %VEC(0), (%rdi)
> >         VZEROUPPER_RETURN
> > +
> > +1:
> > +       MEMSET_VDUP_TO_VEC0 (%esi)
> > +       jmp     2b
> >  #if defined USE_MULTIARCH && IS_IN (libc)
> >  END (MEMSET_SYMBOL (__memset, unaligned))
> >
> > @@ -180,6 +188,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
> >
> >  ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
> >         MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
> > +2:
> >  # ifdef __ILP32__
> >         /* Clear the upper 32 bits.  */
> >         mov     %edx, %edx
> > @@ -193,6 +202,10 @@ ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
> >         VMOVU   %VEC(0), (%rax)
> >         VMOVU   %VEC(0), -VEC_SIZE(%rax, %rdx)
> >         VZEROUPPER_RETURN
> > +
> > +1:
> > +       MEMSET_VDUP_TO_VEC0 (%esi)
> > +       jmp     2b
> >  #endif
> >
> >         .p2align 4,, 10
> > --
> > 2.33.1
> >



-- 
H.J.


* Re: [PATCH] x86-64: Optimize memset for zeroing
  2021-12-31 20:35   ` H.J. Lu
@ 2021-12-31 20:43     ` Florian Weimer
  2021-12-31 20:52       ` H.J. Lu
  2021-12-31 22:14     ` Noah Goldstein
  1 sibling, 1 reply; 14+ messages in thread
From: Florian Weimer @ 2021-12-31 20:43 UTC (permalink / raw)
  To: H.J. Lu via Libc-alpha
  Cc: Noah Goldstein, GCC Development, H.J. Lu, Arjan van de Ven

* H. J. Lu via Libc-alpha:

> bzero is an alias of SSE2 memset in glibc.   Should we add __memsetzero
> like __memcmpeq?  It should be almost free in glibc.  GCC can use
> __memsetzero if it is available.

bzero does not have the interface ambiguity that bcmp has.  So the
only reason for not using it would be namespace cleanliness.


* Re: [PATCH] x86-64: Optimize memset for zeroing
  2021-12-31 20:43     ` Florian Weimer
@ 2021-12-31 20:52       ` H.J. Lu
  2021-12-31 21:02         ` Florian Weimer
  0 siblings, 1 reply; 14+ messages in thread
From: H.J. Lu @ 2021-12-31 20:52 UTC (permalink / raw)
  To: Florian Weimer
  Cc: H.J. Lu via Libc-alpha, Noah Goldstein, GCC Development,
	Arjan van de Ven

On Fri, Dec 31, 2021 at 12:43 PM Florian Weimer <fw@deneb.enyo.de> wrote:
>
> * H. J. Lu via Libc-alpha:
>
> > bzero is an alias of SSE2 memset in glibc.   Should we add __memsetzero
> > like __memcmpeq?  It should be almost free in glibc.  GCC can use
> > __memsetzero if it is available.
>
> bzero does not have the interface ambiguity that bcmp has.  So the
> only reason for not using it would be namespace cleanliness.

bzero isn't a standard C function and it isn't optimized like memset
in glibc.

-- 
H.J.


* Re: [PATCH] x86-64: Optimize memset for zeroing
  2021-12-31 20:52       ` H.J. Lu
@ 2021-12-31 21:02         ` Florian Weimer
  2021-12-31 21:15           ` Noah Goldstein
  0 siblings, 1 reply; 14+ messages in thread
From: Florian Weimer @ 2021-12-31 21:02 UTC (permalink / raw)
  To: H.J. Lu
  Cc: H.J. Lu via Libc-alpha, Noah Goldstein, GCC Development,
	Arjan van de Ven

* H. J. Lu:

> On Fri, Dec 31, 2021 at 12:43 PM Florian Weimer <fw@deneb.enyo.de> wrote:
>>
>> * H. J. Lu via Libc-alpha:
>>
>> > bzero is an alias of SSE2 memset in glibc.   Should we add __memsetzero
>> > like __memcmpeq?  It should be almost free in glibc.  GCC can use
>> > __memsetzero if it is available.
>>
>> bzero does not have the interface ambiguity that bcmp has.  So the
>> only reason for not using it would be namespace cleanliness.
>
> bzero isn't a standard C function and it isn't optimized like memset
> in glibc.

GCC already uses non-standard functions whose names are not
implementation-defined for optimization purposes if a suitable
prototype is available.  stpcpy is an example, for:

  strcpy (a, b);
  return a + strlen (a);
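
Concretely, with the prototype from <string.h> in scope, GCC can compile
that sequence as a single stpcpy call, e.g.:

  #include <string.h>

  /* At -O2 GCC may emit this as a tail call to stpcpy instead of
     strcpy followed by strlen.  */
  char *
  copy_and_return_end (char *a, const char *b)
  {
    strcpy (a, b);
    return a + strlen (a);
  }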


* Re: [PATCH] x86-64: Optimize memset for zeroing
  2021-12-31 21:02         ` Florian Weimer
@ 2021-12-31 21:15           ` Noah Goldstein
  2021-12-31 22:05             ` Florian Weimer
  0 siblings, 1 reply; 14+ messages in thread
From: Noah Goldstein @ 2021-12-31 21:15 UTC (permalink / raw)
  To: Florian Weimer
  Cc: H.J. Lu, H.J. Lu via Libc-alpha, GCC Development, Arjan van de Ven

On Fri, Dec 31, 2021 at 3:02 PM Florian Weimer <fw@deneb.enyo.de> wrote:
>
> * H. J. Lu:
>
> > On Fri, Dec 31, 2021 at 12:43 PM Florian Weimer <fw@deneb.enyo.de> wrote:
> >>
> >> * H. J. Lu via Libc-alpha:
> >>
> >> > bzero is an alias of SSE2 memset in glibc.   Should we add __memsetzero
> >> > like __memcmpeq?  It should be almost free in glibc.  GCC can use
> >> > __memsetzero if it is available.

I think __memsetzero makes sense.
> >>
> >> bzero does not have the interface ambiguity that bcmp has.  So the
> >> only reason for not using it would be namespace cleanliness.
> >
> > bzero isn't a standard C function and it isn't optimized like memset
> > in glibc.

It could be an issue if the "optimization" was made and then the
binary was run with an older version of glibc that was still using
the aliasing bzero.  It would end up being a deoptimization then.


>
> GCC already uses non-standard functions whose names are not
> implementation-defined for optimization purposes if a suitable
> prototype is available.  stpcpy is an example, for:
>
>   strcpy (a, b);
>   return a + strlen (a);


* Re: [PATCH] x86-64: Optimize memset for zeroing
  2021-12-31 21:15           ` Noah Goldstein
@ 2021-12-31 22:05             ` Florian Weimer
  0 siblings, 0 replies; 14+ messages in thread
From: Florian Weimer @ 2021-12-31 22:05 UTC (permalink / raw)
  To: Noah Goldstein
  Cc: H.J. Lu, H.J. Lu via Libc-alpha, GCC Development, Arjan van de Ven

* Noah Goldstein:

>> >> bzero does not have the interface ambiguity that bcmp has.  So the
>> >> only reason for not using it would be namespace cleanliness.
>> >
>> > bzero isn't a standard C function and it isn't optimized like memset
>> > in glibc.
>
> It could be an issue if the "optimization" was made and then the
> binary was run with an older version of glibc that was still using
> the aliasing bzero.  It would end up being a deoptimization then.

That's a fair point.


* Re: [PATCH] x86-64: Optimize memset for zeroing
  2021-12-31 20:35   ` H.J. Lu
  2021-12-31 20:43     ` Florian Weimer
@ 2021-12-31 22:14     ` Noah Goldstein
  2021-12-31 22:19       ` Noah Goldstein
  1 sibling, 1 reply; 14+ messages in thread
From: Noah Goldstein @ 2021-12-31 22:14 UTC (permalink / raw)
  To: H.J. Lu; +Cc: GCC Development, GNU C Library, Arjan van de Ven

On Fri, Dec 31, 2021 at 2:36 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Fri, Dec 31, 2021 at 12:21 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > On Fri, Dec 31, 2021 at 12:20 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > Update MEMSET_VDUP_TO_VEC0_AND_SET_RETURN to use PXOR, which has lower
> > > latency and higher throughput than VPBROADCAST, for a zero constant.
> > > Since the most common use of memset is to zero a block of memory, the
> > > branch predictor makes the compare/jmp essentially free, and PXOR is
> > > in effect executed unconditionally.
> >
> > Any benchmark results? Is the broadcast on the critical path for any size?
>
> Can you run your workloads to see how many memset calls are zeroing?

Python 3.7.7 running pyperf: 99.6% of calls are zeroing.
GCC 11.2 compiling LLVM: 99.1% of calls are zeroing.
>
> > Also imagine the vast majority of memset zero are compile time known.
> >
> > I think it might make more sense to give bzero() the fall-through instead and
>
> bzero is an alias of SSE2 memset in glibc.   Should we add __memsetzero
> like __memcmpeq?  It should be almost free in glibc.  GCC can use
> __memsetzero if it is available.
>
> > add a patch in GCC to prefer bzero > memset.
> >
> >
> > > ---
> > >  sysdeps/x86_64/memset.S                            | 14 ++++++++++++--
> > >  .../x86_64/multiarch/memset-avx2-unaligned-erms.S  | 14 ++++++++++++--
> > >  .../multiarch/memset-avx512-unaligned-erms.S       | 10 ++++++++++
> > >  .../x86_64/multiarch/memset-evex-unaligned-erms.S  | 10 ++++++++++
> > >  .../x86_64/multiarch/memset-vec-unaligned-erms.S   | 13 +++++++++++++
> > >  5 files changed, 57 insertions(+), 4 deletions(-)
> > >
> > > diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
> > > index 0137eba4cd..513f9c703d 100644
> > > --- a/sysdeps/x86_64/memset.S
> > > +++ b/sysdeps/x86_64/memset.S
> > > @@ -29,15 +29,25 @@
> > >  #define VMOVA     movaps
> > >
> > >  #define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> > > -  movd d, %xmm0; \
> > >    movq r, %rax; \
> > > +  testl d, d; \
> > > +  jnz 1f; \
> > > +  pxor %xmm0, %xmm0
> > > +
> > > +# define MEMSET_VDUP_TO_VEC0(d) \
> > > +  movd d, %xmm0; \
> > >    punpcklbw %xmm0, %xmm0; \
> > >    punpcklwd %xmm0, %xmm0; \
> > >    pshufd $0, %xmm0, %xmm0
> > >
> > >  #define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> > > -  movd d, %xmm0; \
> > >    movq r, %rax; \
> > > +  testl d, d; \
> > > +  jnz 1f; \
> > > +  pxor %xmm0, %xmm0
> > > +
> > > +# define WMEMSET_VDUP_TO_VEC0(d) \
> > > +  movd d, %xmm0; \
> > >    pshufd $0, %xmm0, %xmm0
> > >
> > >  #define SECTION(p)             p
> > > diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> > > index 1af668af0a..8004a27750 100644
> > > --- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> > > +++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> > > @@ -11,13 +11,23 @@
> > >  # define VMOVA     vmovdqa
> > >
> > >  # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> > > -  vmovd d, %xmm0; \
> > >    movq r, %rax; \
> > > +  testl d, d; \
> > > +  jnz 1f; \
> > > +  vpxor %xmm0, %xmm0, %xmm0
> > > +
> > > +# define MEMSET_VDUP_TO_VEC0(d) \
> > > +  vmovd d, %xmm0; \
> > >    vpbroadcastb %xmm0, %ymm0
> > >
> > >  # define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> > > -  vmovd d, %xmm0; \
> > >    movq r, %rax; \
> > > +  testl d, d; \
> > > +  jnz 1f; \
> > > +  vpxor %xmm0, %xmm0, %xmm0
> > > +
> > > +# define WMEMSET_VDUP_TO_VEC0(d) \
> > > +  vmovd d, %xmm0; \
> > >    vpbroadcastd %xmm0, %ymm0
> > >
> > >  # ifndef SECTION
> > > diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> > > index f14d6f8493..61ff9ccf6f 100644
> > > --- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> > > +++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> > > @@ -17,10 +17,20 @@
> > >
> > >  # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> > >    movq r, %rax; \
> > > +  testl d, d; \
> > > +  jnz 1f; \
> > > +  vpxorq %XMM0, %XMM0, %XMM0
> > > +
> > > +# define MEMSET_VDUP_TO_VEC0(d) \
> > >    vpbroadcastb d, %VEC0
> > >
> > >  # define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> > >    movq r, %rax; \
> > > +  testl d, d; \
> > > +  jnz 1f; \
> > > +  vpxorq %XMM0, %XMM0, %XMM0
> > > +
> > > +# define WMEMSET_VDUP_TO_VEC0(d) \
> > >    vpbroadcastd d, %VEC0
> > >
> > >  # define SECTION(p)            p##.evex512
> > > diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> > > index 64b09e77cc..85544fb0fc 100644
> > > --- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> > > +++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> > > @@ -17,10 +17,20 @@
> > >
> > >  # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> > >    movq r, %rax; \
> > > +  testl d, d; \
> > > +  jnz 1f; \
> > > +  vpxorq %XMM0, %XMM0, %XMM0
> > > +
> > > +# define MEMSET_VDUP_TO_VEC0(d) \
> > >    vpbroadcastb d, %VEC0
> > >
> > >  # define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> > >    movq r, %rax; \
> > > +  testl d, d; \
> > > +  jnz 1f; \
> > > +  vpxorq %XMM0, %XMM0, %XMM0
> > > +
> > > +# define WMEMSET_VDUP_TO_VEC0(d) \
> > >    vpbroadcastd d, %VEC0
> > >
> > >  # define SECTION(p)            p##.evex
> > > diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> > > index e723413a66..4ca34a19ba 100644
> > > --- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> > > +++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> > > @@ -112,6 +112,9 @@ ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
> > >         shl     $2, %RDX_LP
> > >         WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
> > >         jmp     L(entry_from_bzero)
> > > +1:
> > > +       WMEMSET_VDUP_TO_VEC0 (%esi)
> > > +       jmp     L(entry_from_bzero)
> > >  END (WMEMSET_SYMBOL (__wmemset, unaligned))
> > >  #endif
> > >
> > > @@ -124,6 +127,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
> > >
> > >  ENTRY (MEMSET_SYMBOL (__memset, unaligned))
> > >         MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
> > > +2:
> > >  # ifdef __ILP32__
> > >         /* Clear the upper 32 bits.  */
> > >         mov     %edx, %edx
> > > @@ -137,6 +141,10 @@ L(entry_from_bzero):
> > >         VMOVU   %VEC(0), -VEC_SIZE(%rdi,%rdx)
> > >         VMOVU   %VEC(0), (%rdi)
> > >         VZEROUPPER_RETURN
> > > +
> > > +1:
> > > +       MEMSET_VDUP_TO_VEC0 (%esi)
> > > +       jmp     2b
> > >  #if defined USE_MULTIARCH && IS_IN (libc)
> > >  END (MEMSET_SYMBOL (__memset, unaligned))
> > >
> > > @@ -180,6 +188,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
> > >
> > >  ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
> > >         MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
> > > +2:
> > >  # ifdef __ILP32__
> > >         /* Clear the upper 32 bits.  */
> > >         mov     %edx, %edx
> > > @@ -193,6 +202,10 @@ ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
> > >         VMOVU   %VEC(0), (%rax)
> > >         VMOVU   %VEC(0), -VEC_SIZE(%rax, %rdx)
> > >         VZEROUPPER_RETURN
> > > +
> > > +1:
> > > +       MEMSET_VDUP_TO_VEC0 (%esi)
> > > +       jmp     2b
> > >  #endif
> > >
> > >         .p2align 4,, 10
> > > --
> > > 2.33.1
> > >
>
>
>
> --
> H.J.


* Re: [PATCH] x86-64: Optimize memset for zeroing
  2021-12-31 22:14     ` Noah Goldstein
@ 2021-12-31 22:19       ` Noah Goldstein
  2021-12-31 22:21         ` H.J. Lu
  0 siblings, 1 reply; 14+ messages in thread
From: Noah Goldstein @ 2021-12-31 22:19 UTC (permalink / raw)
  To: H.J. Lu; +Cc: GCC Development, GNU C Library, Arjan van de Ven

On Fri, Dec 31, 2021 at 4:14 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Fri, Dec 31, 2021 at 2:36 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >
> > On Fri, Dec 31, 2021 at 12:21 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > >
> > > On Fri, Dec 31, 2021 at 12:20 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > >
> > > > Update MEMSET_VDUP_TO_VEC0_AND_SET_RETURN to use PXOR, which has lower
> > > > latency and higher throughput than VPBROADCAST, for a zero constant.
> > > > Since the most common use of memset is to zero a block of memory, the
> > > > branch predictor makes the compare/jmp essentially free, and PXOR is
> > > > in effect executed unconditionally.
> > >
> > > Any benchmark results? Is the broadcast on the critical path for any size?
> >
> > Can you run your workloads to see how many memset calls are zeroing?
>
> Python3.7.7 running pyperf 99.6% of calls are zero.
> GCC11.2 compiling llvm 99.1% of calls are zero.

I like the idea of this optimization, I just don't think we want to implement
it with a branch like this.  Even though it will be hyperpredictable under
heavy usage, the extra interference will add a second branch to the first
BTB prediction and will likely incur more misses than the rates above
suggest, because the BHT entry may be overwritten by other branches in the
application between calls.

'__memsetzero' makes sense to me; then we can just organize the code so
that __memsetzero gets the fall-through path.
> >
> > > Also imagine the vast majority of memset zero are compile time known.
> > >
> > > I think it might make more sense to give bzero() the fall-through instead and
> >
> > bzero is an alias of SSE2 memset in glibc.   Should we add __memsetzero
> > like __memcmpeq?  It should be almost free in glibc.  GCC can use
> > __memsetzero if it is available.
> >
> > > add a patch in GCC to prefer bzero > memset.
> > >
> > >
> > > > ---
> > > >  sysdeps/x86_64/memset.S                            | 14 ++++++++++++--
> > > >  .../x86_64/multiarch/memset-avx2-unaligned-erms.S  | 14 ++++++++++++--
> > > >  .../multiarch/memset-avx512-unaligned-erms.S       | 10 ++++++++++
> > > >  .../x86_64/multiarch/memset-evex-unaligned-erms.S  | 10 ++++++++++
> > > >  .../x86_64/multiarch/memset-vec-unaligned-erms.S   | 13 +++++++++++++
> > > >  5 files changed, 57 insertions(+), 4 deletions(-)
> > > >
> > > > diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
> > > > index 0137eba4cd..513f9c703d 100644
> > > > --- a/sysdeps/x86_64/memset.S
> > > > +++ b/sysdeps/x86_64/memset.S
> > > > @@ -29,15 +29,25 @@
> > > >  #define VMOVA     movaps
> > > >
> > > >  #define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> > > > -  movd d, %xmm0; \
> > > >    movq r, %rax; \
> > > > +  testl d, d; \
> > > > +  jnz 1f; \
> > > > +  pxor %xmm0, %xmm0
> > > > +
> > > > +# define MEMSET_VDUP_TO_VEC0(d) \
> > > > +  movd d, %xmm0; \
> > > >    punpcklbw %xmm0, %xmm0; \
> > > >    punpcklwd %xmm0, %xmm0; \
> > > >    pshufd $0, %xmm0, %xmm0
> > > >
> > > >  #define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> > > > -  movd d, %xmm0; \
> > > >    movq r, %rax; \
> > > > +  testl d, d; \
> > > > +  jnz 1f; \
> > > > +  pxor %xmm0, %xmm0
> > > > +
> > > > +# define WMEMSET_VDUP_TO_VEC0(d) \
> > > > +  movd d, %xmm0; \
> > > >    pshufd $0, %xmm0, %xmm0
> > > >
> > > >  #define SECTION(p)             p
> > > > diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> > > > index 1af668af0a..8004a27750 100644
> > > > --- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> > > > +++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> > > > @@ -11,13 +11,23 @@
> > > >  # define VMOVA     vmovdqa
> > > >
> > > >  # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> > > > -  vmovd d, %xmm0; \
> > > >    movq r, %rax; \
> > > > +  testl d, d; \
> > > > +  jnz 1f; \
> > > > +  vpxor %xmm0, %xmm0, %xmm0
> > > > +
> > > > +# define MEMSET_VDUP_TO_VEC0(d) \
> > > > +  vmovd d, %xmm0; \
> > > >    vpbroadcastb %xmm0, %ymm0
> > > >
> > > >  # define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> > > > -  vmovd d, %xmm0; \
> > > >    movq r, %rax; \
> > > > +  testl d, d; \
> > > > +  jnz 1f; \
> > > > +  vpxor %xmm0, %xmm0, %xmm0
> > > > +
> > > > +# define WMEMSET_VDUP_TO_VEC0(d) \
> > > > +  vmovd d, %xmm0; \
> > > >    vpbroadcastd %xmm0, %ymm0
> > > >
> > > >  # ifndef SECTION
> > > > diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> > > > index f14d6f8493..61ff9ccf6f 100644
> > > > --- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> > > > +++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> > > > @@ -17,10 +17,20 @@
> > > >
> > > >  # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> > > >    movq r, %rax; \
> > > > +  testl d, d; \
> > > > +  jnz 1f; \
> > > > +  vpxorq %XMM0, %XMM0, %XMM0
> > > > +
> > > > +# define MEMSET_VDUP_TO_VEC0(d) \
> > > >    vpbroadcastb d, %VEC0
> > > >
> > > >  # define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> > > >    movq r, %rax; \
> > > > +  testl d, d; \
> > > > +  jnz 1f; \
> > > > +  vpxorq %XMM0, %XMM0, %XMM0
> > > > +
> > > > +# define WMEMSET_VDUP_TO_VEC0(d) \
> > > >    vpbroadcastd d, %VEC0
> > > >
> > > >  # define SECTION(p)            p##.evex512
> > > > diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> > > > index 64b09e77cc..85544fb0fc 100644
> > > > --- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> > > > +++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> > > > @@ -17,10 +17,20 @@
> > > >
> > > >  # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> > > >    movq r, %rax; \
> > > > +  testl d, d; \
> > > > +  jnz 1f; \
> > > > +  vpxorq %XMM0, %XMM0, %XMM0
> > > > +
> > > > +# define MEMSET_VDUP_TO_VEC0(d) \
> > > >    vpbroadcastb d, %VEC0
> > > >
> > > >  # define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> > > >    movq r, %rax; \
> > > > +  testl d, d; \
> > > > +  jnz 1f; \
> > > > +  vpxorq %XMM0, %XMM0, %XMM0
> > > > +
> > > > +# define WMEMSET_VDUP_TO_VEC0(d) \
> > > >    vpbroadcastd d, %VEC0
> > > >
> > > >  # define SECTION(p)            p##.evex
> > > > diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> > > > index e723413a66..4ca34a19ba 100644
> > > > --- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> > > > +++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> > > > @@ -112,6 +112,9 @@ ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
> > > >         shl     $2, %RDX_LP
> > > >         WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
> > > >         jmp     L(entry_from_bzero)
> > > > +1:
> > > > +       WMEMSET_VDUP_TO_VEC0 (%esi)
> > > > +       jmp     L(entry_from_bzero)
> > > >  END (WMEMSET_SYMBOL (__wmemset, unaligned))
> > > >  #endif
> > > >
> > > > @@ -124,6 +127,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
> > > >
> > > >  ENTRY (MEMSET_SYMBOL (__memset, unaligned))
> > > >         MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
> > > > +2:
> > > >  # ifdef __ILP32__
> > > >         /* Clear the upper 32 bits.  */
> > > >         mov     %edx, %edx
> > > > @@ -137,6 +141,10 @@ L(entry_from_bzero):
> > > >         VMOVU   %VEC(0), -VEC_SIZE(%rdi,%rdx)
> > > >         VMOVU   %VEC(0), (%rdi)
> > > >         VZEROUPPER_RETURN
> > > > +
> > > > +1:
> > > > +       MEMSET_VDUP_TO_VEC0 (%esi)
> > > > +       jmp     2b
> > > >  #if defined USE_MULTIARCH && IS_IN (libc)
> > > >  END (MEMSET_SYMBOL (__memset, unaligned))
> > > >
> > > > @@ -180,6 +188,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
> > > >
> > > >  ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
> > > >         MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
> > > > +2:
> > > >  # ifdef __ILP32__
> > > >         /* Clear the upper 32 bits.  */
> > > >         mov     %edx, %edx
> > > > @@ -193,6 +202,10 @@ ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
> > > >         VMOVU   %VEC(0), (%rax)
> > > >         VMOVU   %VEC(0), -VEC_SIZE(%rax, %rdx)
> > > >         VZEROUPPER_RETURN
> > > > +
> > > > +1:
> > > > +       MEMSET_VDUP_TO_VEC0 (%esi)
> > > > +       jmp     2b
> > > >  #endif
> > > >
> > > >         .p2align 4,, 10
> > > > --
> > > > 2.33.1
> > > >
> >
> >
> >
> > --
> > H.J.


* Re: [PATCH] x86-64: Optimize memset for zeroing
  2021-12-31 22:19       ` Noah Goldstein
@ 2021-12-31 22:21         ` H.J. Lu
  0 siblings, 0 replies; 14+ messages in thread
From: H.J. Lu @ 2021-12-31 22:21 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: GCC Development, GNU C Library, Arjan van de Ven

On Fri, Dec 31, 2021 at 2:19 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> On Fri, Dec 31, 2021 at 4:14 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > On Fri, Dec 31, 2021 at 2:36 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > On Fri, Dec 31, 2021 at 12:21 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > > >
> > > > On Fri, Dec 31, 2021 at 12:20 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > > > >
> > > > > Update MEMSET_VDUP_TO_VEC0_AND_SET_RETURN to use PXOR, which has lower
> > > > > latency and higher throughput than VPBROADCAST, for a zero constant.
> > > > > Since the most common use of memset is to zero a block of memory, the
> > > > > branch predictor makes the compare/jmp essentially free, and PXOR is
> > > > > in effect executed unconditionally.
> > > >
> > > > Any benchmark results? Is the broadcast on the critical path for any size?
> > >
> > > Can you run your workloads to see how many memset calls are zeroing?
> >
> > Python3.7.7 running pyperf 99.6% of calls are zero.
> > GCC11.2 compiling llvm 99.1% of calls are zero.
>
> I like the idea of this optimization, just don't think we want to implement
> it with a branch like this. Even though it will be hyperpredictable under
> heavy usage, its extra interference, will add a second branch to the
> first BTB prediction, and will likely incur more misses than
> that rates above because the BHT entry may be overwritten by other
> branches in the application between calls.
>
>  '__memsetzero' makes sense to me and then we
> can just organize the code so that __memsetzero gets the fallthrough
> path.

We can do it for glibc 2.36.

> > >
> > > > Also imagine the vast majority of memset zero are compile time known.
> > > >
> > > > I think it might make more sense to give bzero() the fall-through instead and
> > >
> > > bzero is an alias of SSE2 memset in glibc.   Should we add __memsetzero
> > > like __memcmpeq?  It should be almost free in glibc.  GCC can use
> > > __memsetzero if it is available.
> > >
> > > > add a patch in GCC to prefer bzero > memset.
> > > >
> > > >
> > > > > ---
> > > > >  sysdeps/x86_64/memset.S                            | 14 ++++++++++++--
> > > > >  .../x86_64/multiarch/memset-avx2-unaligned-erms.S  | 14 ++++++++++++--
> > > > >  .../multiarch/memset-avx512-unaligned-erms.S       | 10 ++++++++++
> > > > >  .../x86_64/multiarch/memset-evex-unaligned-erms.S  | 10 ++++++++++
> > > > >  .../x86_64/multiarch/memset-vec-unaligned-erms.S   | 13 +++++++++++++
> > > > >  5 files changed, 57 insertions(+), 4 deletions(-)
> > > > >
> > > > > diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
> > > > > index 0137eba4cd..513f9c703d 100644
> > > > > --- a/sysdeps/x86_64/memset.S
> > > > > +++ b/sysdeps/x86_64/memset.S
> > > > > @@ -29,15 +29,25 @@
> > > > >  #define VMOVA     movaps
> > > > >
> > > > >  #define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> > > > > -  movd d, %xmm0; \
> > > > >    movq r, %rax; \
> > > > > +  testl d, d; \
> > > > > +  jnz 1f; \
> > > > > +  pxor %xmm0, %xmm0
> > > > > +
> > > > > +# define MEMSET_VDUP_TO_VEC0(d) \
> > > > > +  movd d, %xmm0; \
> > > > >    punpcklbw %xmm0, %xmm0; \
> > > > >    punpcklwd %xmm0, %xmm0; \
> > > > >    pshufd $0, %xmm0, %xmm0
> > > > >
> > > > >  #define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> > > > > -  movd d, %xmm0; \
> > > > >    movq r, %rax; \
> > > > > +  testl d, d; \
> > > > > +  jnz 1f; \
> > > > > +  pxor %xmm0, %xmm0
> > > > > +
> > > > > +# define WMEMSET_VDUP_TO_VEC0(d) \
> > > > > +  movd d, %xmm0; \
> > > > >    pshufd $0, %xmm0, %xmm0
> > > > >
> > > > >  #define SECTION(p)             p
> > > > > diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> > > > > index 1af668af0a..8004a27750 100644
> > > > > --- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> > > > > +++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> > > > > @@ -11,13 +11,23 @@
> > > > >  # define VMOVA     vmovdqa
> > > > >
> > > > >  # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> > > > > -  vmovd d, %xmm0; \
> > > > >    movq r, %rax; \
> > > > > +  testl d, d; \
> > > > > +  jnz 1f; \
> > > > > +  vpxor %xmm0, %xmm0, %xmm0
> > > > > +
> > > > > +# define MEMSET_VDUP_TO_VEC0(d) \
> > > > > +  vmovd d, %xmm0; \
> > > > >    vpbroadcastb %xmm0, %ymm0
> > > > >
> > > > >  # define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> > > > > -  vmovd d, %xmm0; \
> > > > >    movq r, %rax; \
> > > > > +  testl d, d; \
> > > > > +  jnz 1f; \
> > > > > +  vpxor %xmm0, %xmm0, %xmm0
> > > > > +
> > > > > +# define WMEMSET_VDUP_TO_VEC0(d) \
> > > > > +  vmovd d, %xmm0; \
> > > > >    vpbroadcastd %xmm0, %ymm0
> > > > >
> > > > >  # ifndef SECTION
> > > > > diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> > > > > index f14d6f8493..61ff9ccf6f 100644
> > > > > --- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> > > > > +++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> > > > > @@ -17,10 +17,20 @@
> > > > >
> > > > >  # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> > > > >    movq r, %rax; \
> > > > > +  testl d, d; \
> > > > > +  jnz 1f; \
> > > > > +  vpxorq %XMM0, %XMM0, %XMM0
> > > > > +
> > > > > +# define MEMSET_VDUP_TO_VEC0(d) \
> > > > >    vpbroadcastb d, %VEC0
> > > > >
> > > > >  # define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> > > > >    movq r, %rax; \
> > > > > +  testl d, d; \
> > > > > +  jnz 1f; \
> > > > > +  vpxorq %XMM0, %XMM0, %XMM0
> > > > > +
> > > > > +# define WMEMSET_VDUP_TO_VEC0(d) \
> > > > >    vpbroadcastd d, %VEC0
> > > > >
> > > > >  # define SECTION(p)            p##.evex512
> > > > > diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> > > > > index 64b09e77cc..85544fb0fc 100644
> > > > > --- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> > > > > +++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> > > > > @@ -17,10 +17,20 @@
> > > > >
> > > > >  # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> > > > >    movq r, %rax; \
> > > > > +  testl d, d; \
> > > > > +  jnz 1f; \
> > > > > +  vpxorq %XMM0, %XMM0, %XMM0
> > > > > +
> > > > > +# define MEMSET_VDUP_TO_VEC0(d) \
> > > > >    vpbroadcastb d, %VEC0
> > > > >
> > > > >  # define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> > > > >    movq r, %rax; \
> > > > > +  testl d, d; \
> > > > > +  jnz 1f; \
> > > > > +  vpxorq %XMM0, %XMM0, %XMM0
> > > > > +
> > > > > +# define WMEMSET_VDUP_TO_VEC0(d) \
> > > > >    vpbroadcastd d, %VEC0
> > > > >
> > > > >  # define SECTION(p)            p##.evex
> > > > > diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> > > > > index e723413a66..4ca34a19ba 100644
> > > > > --- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> > > > > +++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> > > > > @@ -112,6 +112,9 @@ ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
> > > > >         shl     $2, %RDX_LP
> > > > >         WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
> > > > >         jmp     L(entry_from_bzero)
> > > > > +1:
> > > > > +       WMEMSET_VDUP_TO_VEC0 (%esi)
> > > > > +       jmp     L(entry_from_bzero)
> > > > >  END (WMEMSET_SYMBOL (__wmemset, unaligned))
> > > > >  #endif
> > > > >
> > > > > @@ -124,6 +127,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
> > > > >
> > > > >  ENTRY (MEMSET_SYMBOL (__memset, unaligned))
> > > > >         MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
> > > > > +2:
> > > > >  # ifdef __ILP32__
> > > > >         /* Clear the upper 32 bits.  */
> > > > >         mov     %edx, %edx
> > > > > @@ -137,6 +141,10 @@ L(entry_from_bzero):
> > > > >         VMOVU   %VEC(0), -VEC_SIZE(%rdi,%rdx)
> > > > >         VMOVU   %VEC(0), (%rdi)
> > > > >         VZEROUPPER_RETURN
> > > > > +
> > > > > +1:
> > > > > +       MEMSET_VDUP_TO_VEC0 (%esi)
> > > > > +       jmp     2b
> > > > >  #if defined USE_MULTIARCH && IS_IN (libc)
> > > > >  END (MEMSET_SYMBOL (__memset, unaligned))
> > > > >
> > > > > @@ -180,6 +188,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
> > > > >
> > > > >  ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
> > > > >         MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
> > > > > +2:
> > > > >  # ifdef __ILP32__
> > > > >         /* Clear the upper 32 bits.  */
> > > > >         mov     %edx, %edx
> > > > > @@ -193,6 +202,10 @@ ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
> > > > >         VMOVU   %VEC(0), (%rax)
> > > > >         VMOVU   %VEC(0), -VEC_SIZE(%rax, %rdx)
> > > > >         VZEROUPPER_RETURN
> > > > > +
> > > > > +1:
> > > > > +       MEMSET_VDUP_TO_VEC0 (%esi)
> > > > > +       jmp     2b
> > > > >  #endif
> > > > >
> > > > >         .p2align 4,, 10
> > > > > --
> > > > > 2.33.1
> > > > >
> > >
> > >
> > >
> > > --
> > > H.J.



-- 
H.J.


* Re: [PATCH] x86-64: Optimize memset for zeroing
  2021-12-31 20:21 ` Noah Goldstein
  2021-12-31 20:35   ` H.J. Lu
@ 2022-01-02 16:01   ` Cristian Rodríguez
  2022-01-03 20:09   ` Patrick McGehearty
  2 siblings, 0 replies; 14+ messages in thread
From: Cristian Rodríguez @ 2022-01-02 16:01 UTC (permalink / raw)
  To: Noah Goldstein; +Cc: H.J. Lu, Arjan van de Ven, GNU C Library

On Fri, Dec 31, 2021 at 5:21 PM Noah Goldstein via Libc-alpha <
libc-alpha@sourceware.org> wrote:

>
> Any benchmark results? Is the broadcast on the critical path for any size?
>
> Also imagine the vast majority of memset zero are compile time known.
>


...and what if GCC were to transform a memset of foo to zero into foo = {}; ?
IIRC there was a bug report open about that.
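
For example (illustrative; whether GCC should or does perform this rewrite
is exactly the open question referred to above):

  #include <string.h>

  struct big { char bytes[4096]; } foo;

  /* What the source typically says...  */
  void
  zero_with_memset (void)
  {
    memset (&foo, 0, sizeof foo);
  }

  /* ...and the aggregate-assignment form the transformation would target
     (C23 allows foo = (struct big) {}; older C spells it with {0}).  */
  void
  zero_with_assignment (void)
  {
    foo = (struct big) {0};
  }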


* Re: [PATCH] x86-64: Optimize memset for zeroing
  2021-12-31 20:21 ` Noah Goldstein
  2021-12-31 20:35   ` H.J. Lu
  2022-01-02 16:01   ` Cristian Rodríguez
@ 2022-01-03 20:09   ` Patrick McGehearty
  2022-01-03 21:34     ` Noah Goldstein
  2 siblings, 1 reply; 14+ messages in thread
From: Patrick McGehearty @ 2022-01-03 20:09 UTC (permalink / raw)
  To: libc-alpha


On 12/31/2021 2:21 PM, Noah Goldstein via Libc-alpha wrote:
> On Fri, Dec 31, 2021 at 12:20 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>> Update MEMSET_VDUP_TO_VEC0_AND_SET_RETURN to use PXOR, which has lower
>> latency and higher throughput than VPBROADCAST, for a zero constant.
>> Since the most common use of memset is to zero a block of memory, the
>> branch predictor makes the compare/jmp essentially free, and PXOR is
>> in effect executed unconditionally.
> Any benchmark results? Is the broadcast on the critical path for any size?
>
> Also imagine the vast majority of memset zero are compile time known.
>
> I think it might make more sense to give bzero() the fall-through instead and
> add a patch in GCC to prefer bzero > memset.

My experience with memset (target, zero, len) in other environments is that
when zero is known to be zero and len is known to be modest at compile time,
the compiler will simply inline suitable store or clear instructions to the
target address.

If the len is less than multiple cache lines, the performance difference
between setting a register to zero and storing the register repeatedly vs
having architecture-specific instructions for clearing cache lines (or
similar) was negligible.

The real performance advantage for having separate code for bzero vs memset
is when you are clearing large data structures (i.e. pages in the kernel or
big blocks of workspace in apps).  That is the case that any bzero-equivalent
optimizations should be focused on.  One test near the beginning of memset
(either the very first test or after it is determined that len is not small)
can split off to bzero-specific code instead of the usual memset code.
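
A small example of the first point (illustrative; exact code generation
depends on the compiler and options):

  #include <string.h>

  struct point { double x, y, z; };

  /* With the length known and small at compile time, compilers normally
     expand this inline as a few zeroing stores; no call to the library
     memset (or bzero) is made at all.  */
  void
  init_point (struct point *p)
  {
    memset (p, 0, sizeof *p);
  }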

- patrick

>
>
>> ---
>>   sysdeps/x86_64/memset.S                            | 14 ++++++++++++--
>>   .../x86_64/multiarch/memset-avx2-unaligned-erms.S  | 14 ++++++++++++--
>>   .../multiarch/memset-avx512-unaligned-erms.S       | 10 ++++++++++
>>   .../x86_64/multiarch/memset-evex-unaligned-erms.S  | 10 ++++++++++
>>   .../x86_64/multiarch/memset-vec-unaligned-erms.S   | 13 +++++++++++++
>>   5 files changed, 57 insertions(+), 4 deletions(-)
>>
>> diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
>> index 0137eba4cd..513f9c703d 100644
>> --- a/sysdeps/x86_64/memset.S
>> +++ b/sysdeps/x86_64/memset.S
>> @@ -29,15 +29,25 @@
>>   #define VMOVA     movaps
>>
>>   #define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
>> -  movd d, %xmm0; \
>>     movq r, %rax; \
>> +  testl d, d; \
>> +  jnz 1f; \
>> +  pxor %xmm0, %xmm0
>> +
>> +# define MEMSET_VDUP_TO_VEC0(d) \
>> +  movd d, %xmm0; \
>>     punpcklbw %xmm0, %xmm0; \
>>     punpcklwd %xmm0, %xmm0; \
>>     pshufd $0, %xmm0, %xmm0
>>
>>   #define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
>> -  movd d, %xmm0; \
>>     movq r, %rax; \
>> +  testl d, d; \
>> +  jnz 1f; \
>> +  pxor %xmm0, %xmm0
>> +
>> +# define WMEMSET_VDUP_TO_VEC0(d) \
>> +  movd d, %xmm0; \
>>     pshufd $0, %xmm0, %xmm0
>>
>>   #define SECTION(p)             p
>> diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
>> index 1af668af0a..8004a27750 100644
>> --- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
>> +++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
>> @@ -11,13 +11,23 @@
>>   # define VMOVA     vmovdqa
>>
>>   # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
>> -  vmovd d, %xmm0; \
>>     movq r, %rax; \
>> +  testl d, d; \
>> +  jnz 1f; \
>> +  vpxor %xmm0, %xmm0, %xmm0
>> +
>> +# define MEMSET_VDUP_TO_VEC0(d) \
>> +  vmovd d, %xmm0; \
>>     vpbroadcastb %xmm0, %ymm0
>>
>>   # define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
>> -  vmovd d, %xmm0; \
>>     movq r, %rax; \
>> +  testl d, d; \
>> +  jnz 1f; \
>> +  vpxor %xmm0, %xmm0, %xmm0
>> +
>> +# define WMEMSET_VDUP_TO_VEC0(d) \
>> +  vmovd d, %xmm0; \
>>     vpbroadcastd %xmm0, %ymm0
>>
>>   # ifndef SECTION
>> diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
>> index f14d6f8493..61ff9ccf6f 100644
>> --- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
>> +++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
>> @@ -17,10 +17,20 @@
>>
>>   # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
>>     movq r, %rax; \
>> +  testl d, d; \
>> +  jnz 1f; \
>> +  vpxorq %XMM0, %XMM0, %XMM0
>> +
>> +# define MEMSET_VDUP_TO_VEC0(d) \
>>     vpbroadcastb d, %VEC0
>>
>>   # define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
>>     movq r, %rax; \
>> +  testl d, d; \
>> +  jnz 1f; \
>> +  vpxorq %XMM0, %XMM0, %XMM0
>> +
>> +# define WMEMSET_VDUP_TO_VEC0(d) \
>>     vpbroadcastd d, %VEC0
>>
>>   # define SECTION(p)            p##.evex512
>> diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
>> index 64b09e77cc..85544fb0fc 100644
>> --- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
>> +++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
>> @@ -17,10 +17,20 @@
>>
>>   # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
>>     movq r, %rax; \
>> +  testl d, d; \
>> +  jnz 1f; \
>> +  vpxorq %XMM0, %XMM0, %XMM0
>> +
>> +# define MEMSET_VDUP_TO_VEC0(d) \
>>     vpbroadcastb d, %VEC0
>>
>>   # define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
>>     movq r, %rax; \
>> +  testl d, d; \
>> +  jnz 1f; \
>> +  vpxorq %XMM0, %XMM0, %XMM0
>> +
>> +# define WMEMSET_VDUP_TO_VEC0(d) \
>>     vpbroadcastd d, %VEC0
>>
>>   # define SECTION(p)            p##.evex
>> diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
>> index e723413a66..4ca34a19ba 100644
>> --- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
>> +++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
>> @@ -112,6 +112,9 @@ ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
>>          shl     $2, %RDX_LP
>>          WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
>>          jmp     L(entry_from_bzero)
>> +1:
>> +       WMEMSET_VDUP_TO_VEC0 (%esi)
>> +       jmp     L(entry_from_bzero)
>>   END (WMEMSET_SYMBOL (__wmemset, unaligned))
>>   #endif
>>
>> @@ -124,6 +127,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
>>
>>   ENTRY (MEMSET_SYMBOL (__memset, unaligned))
>>          MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
>> +2:
>>   # ifdef __ILP32__
>>          /* Clear the upper 32 bits.  */
>>          mov     %edx, %edx
>> @@ -137,6 +141,10 @@ L(entry_from_bzero):
>>          VMOVU   %VEC(0), -VEC_SIZE(%rdi,%rdx)
>>          VMOVU   %VEC(0), (%rdi)
>>          VZEROUPPER_RETURN
>> +
>> +1:
>> +       MEMSET_VDUP_TO_VEC0 (%esi)
>> +       jmp     2b
>>   #if defined USE_MULTIARCH && IS_IN (libc)
>>   END (MEMSET_SYMBOL (__memset, unaligned))
>>
>> @@ -180,6 +188,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
>>
>>   ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
>>          MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
>> +2:
>>   # ifdef __ILP32__
>>          /* Clear the upper 32 bits.  */
>>          mov     %edx, %edx
>> @@ -193,6 +202,10 @@ ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
>>          VMOVU   %VEC(0), (%rax)
>>          VMOVU   %VEC(0), -VEC_SIZE(%rax, %rdx)
>>          VZEROUPPER_RETURN
>> +
>> +1:
>> +       MEMSET_VDUP_TO_VEC0 (%esi)
>> +       jmp     2b
>>   #endif
>>
>>          .p2align 4,, 10
>> --
>> 2.33.1
>>


^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [PATCH] x86-64: Optimize memset for zeroing
  2022-01-03 20:09   ` Patrick McGehearty
@ 2022-01-03 21:34     ` Noah Goldstein
  0 siblings, 0 replies; 14+ messages in thread
From: Noah Goldstein @ 2022-01-03 21:34 UTC (permalink / raw)
  To: Patrick McGehearty; +Cc: GNU C Library

On Mon, Jan 3, 2022 at 2:09 PM Patrick McGehearty via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
>
> On 12/31/2021 2:21 PM, Noah Goldstein via Libc-alpha wrote:
> > On Fri, Dec 31, 2021 at 12:20 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> >> Update MEMSET_VDUP_TO_VEC0_AND_SET_RETURN to use PXOR, which has lower
> >> lantency and higher throughput than VPBROADCAST, for zero constant.
> >> Since the most common usage of memset is to zero a block of memory, the
> >> branch predictor will make the compare/jmp basically free and PXOR is
> >> almost like being executed unconditionally.
> > Any benchmark results? Is the broadcast on the critical path for any size?
> >
> > Also imagine the vast majority of memset zero are compile time known.
> >
> > I think it might make more sense to give bzero() the fall-through instead and
> > add a patch in GCC to prefer bzero > memset.
>
> My experience with memset (target, zero, len) in other environments is that when the fill value is known to be zero and len is known to be modest at compile time, the compiler will simply inline suitable store or clear instructions to the target address.
>
> If the len is less than multiple cache lines, the performance difference between setting a register to zero and storing the register repeatedly, versus having architecture-specific instructions for clearing cache lines (or similar), was negligible.
>
> The real performance advantage of having separate code for bzero vs. memset comes when you are clearing large data structures (e.g. pages in the kernel or big blocks of workspace in applications). That is the case any bzero-equivalent optimization should focus on. One test near the beginning of memset (either the very first test, or after it is determined that len is not small) can split off to bzero-specific code instead of the usual memset code.

For large sizes, the effect of optimizing out one branch at the beginning should also be negligible. Unless there is a better method of zeroing memory than `rep stosb` or the 4x VEC loop, I think large sizes are going to be relatively unaffected by this change.

I think it is small sizes with a non-constant length where this has a chance of mattering. For the SSE2/AVX implementations that seems plausible, because the broadcast sequence has high enough latency that it could still be in flight for copies in the [32, 64] or [16, 31] byte ranges.
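
A purely illustrative sketch of that case (the function, names and size range are made up, not from any benchmark in this thread):

#include <string.h>
#include <stddef.h>
#include <stdint.h>

/* Zeroing small, runtime-sized buffers: the length is not a
   compile-time constant, so the calls reach libc's memset, and for
   fills this short the broadcast (or PXOR) that builds the vector
   register feeds directly into the first stores.  */
void
clear_messages (uint8_t **msgs, const size_t *lens, size_t n)
{
  for (size_t i = 0; i < n; i++)
    if (lens[i] >= 16 && lens[i] <= 64)
      memset (msgs[i], 0, lens[i]);
}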
>
> - patrick
>
> >
> >
> >> ---
> >>   sysdeps/x86_64/memset.S                            | 14 ++++++++++++--
> >>   .../x86_64/multiarch/memset-avx2-unaligned-erms.S  | 14 ++++++++++++--
> >>   .../multiarch/memset-avx512-unaligned-erms.S       | 10 ++++++++++
> >>   .../x86_64/multiarch/memset-evex-unaligned-erms.S  | 10 ++++++++++
> >>   .../x86_64/multiarch/memset-vec-unaligned-erms.S   | 13 +++++++++++++
> >>   5 files changed, 57 insertions(+), 4 deletions(-)
> >>
> >> diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
> >> index 0137eba4cd..513f9c703d 100644
> >> --- a/sysdeps/x86_64/memset.S
> >> +++ b/sysdeps/x86_64/memset.S
> >> @@ -29,15 +29,25 @@
> >>   #define VMOVA     movaps
> >>
> >>   #define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> >> -  movd d, %xmm0; \
> >>     movq r, %rax; \
> >> +  testl d, d; \
> >> +  jnz 1f; \
> >> +  pxor %xmm0, %xmm0
> >> +
> >> +# define MEMSET_VDUP_TO_VEC0(d) \
> >> +  movd d, %xmm0; \
> >>     punpcklbw %xmm0, %xmm0; \
> >>     punpcklwd %xmm0, %xmm0; \
> >>     pshufd $0, %xmm0, %xmm0
> >>
> >>   #define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> >> -  movd d, %xmm0; \
> >>     movq r, %rax; \
> >> +  testl d, d; \
> >> +  jnz 1f; \
> >> +  pxor %xmm0, %xmm0
> >> +
> >> +# define WMEMSET_VDUP_TO_VEC0(d) \
> >> +  movd d, %xmm0; \
> >>     pshufd $0, %xmm0, %xmm0
> >>
> >>   #define SECTION(p)             p
> >> diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> >> index 1af668af0a..8004a27750 100644
> >> --- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> >> +++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> >> @@ -11,13 +11,23 @@
> >>   # define VMOVA     vmovdqa
> >>
> >>   # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> >> -  vmovd d, %xmm0; \
> >>     movq r, %rax; \
> >> +  testl d, d; \
> >> +  jnz 1f; \
> >> +  vpxor %xmm0, %xmm0, %xmm0
> >> +
> >> +# define MEMSET_VDUP_TO_VEC0(d) \
> >> +  vmovd d, %xmm0; \
> >>     vpbroadcastb %xmm0, %ymm0
> >>
> >>   # define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> >> -  vmovd d, %xmm0; \
> >>     movq r, %rax; \
> >> +  testl d, d; \
> >> +  jnz 1f; \
> >> +  vpxor %xmm0, %xmm0, %xmm0
> >> +
> >> +# define WMEMSET_VDUP_TO_VEC0(d) \
> >> +  vmovd d, %xmm0; \
> >>     vpbroadcastd %xmm0, %ymm0
> >>
> >>   # ifndef SECTION
> >> diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> >> index f14d6f8493..61ff9ccf6f 100644
> >> --- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> >> +++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> >> @@ -17,10 +17,20 @@
> >>
> >>   # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> >>     movq r, %rax; \
> >> +  testl d, d; \
> >> +  jnz 1f; \
> >> +  vpxorq %XMM0, %XMM0, %XMM0
> >> +
> >> +# define MEMSET_VDUP_TO_VEC0(d) \
> >>     vpbroadcastb d, %VEC0
> >>
> >>   # define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> >>     movq r, %rax; \
> >> +  testl d, d; \
> >> +  jnz 1f; \
> >> +  vpxorq %XMM0, %XMM0, %XMM0
> >> +
> >> +# define WMEMSET_VDUP_TO_VEC0(d) \
> >>     vpbroadcastd d, %VEC0
> >>
> >>   # define SECTION(p)            p##.evex512
> >> diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> >> index 64b09e77cc..85544fb0fc 100644
> >> --- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> >> +++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> >> @@ -17,10 +17,20 @@
> >>
> >>   # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> >>     movq r, %rax; \
> >> +  testl d, d; \
> >> +  jnz 1f; \
> >> +  vpxorq %XMM0, %XMM0, %XMM0
> >> +
> >> +# define MEMSET_VDUP_TO_VEC0(d) \
> >>     vpbroadcastb d, %VEC0
> >>
> >>   # define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> >>     movq r, %rax; \
> >> +  testl d, d; \
> >> +  jnz 1f; \
> >> +  vpxorq %XMM0, %XMM0, %XMM0
> >> +
> >> +# define WMEMSET_VDUP_TO_VEC0(d) \
> >>     vpbroadcastd d, %VEC0
> >>
> >>   # define SECTION(p)            p##.evex
> >> diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> >> index e723413a66..4ca34a19ba 100644
> >> --- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> >> +++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> >> @@ -112,6 +112,9 @@ ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
> >>          shl     $2, %RDX_LP
> >>          WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
> >>          jmp     L(entry_from_bzero)
> >> +1:
> >> +       WMEMSET_VDUP_TO_VEC0 (%esi)
> >> +       jmp     L(entry_from_bzero)
> >>   END (WMEMSET_SYMBOL (__wmemset, unaligned))
> >>   #endif
> >>
> >> @@ -124,6 +127,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
> >>
> >>   ENTRY (MEMSET_SYMBOL (__memset, unaligned))
> >>          MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
> >> +2:
> >>   # ifdef __ILP32__
> >>          /* Clear the upper 32 bits.  */
> >>          mov     %edx, %edx
> >> @@ -137,6 +141,10 @@ L(entry_from_bzero):
> >>          VMOVU   %VEC(0), -VEC_SIZE(%rdi,%rdx)
> >>          VMOVU   %VEC(0), (%rdi)
> >>          VZEROUPPER_RETURN
> >> +
> >> +1:
> >> +       MEMSET_VDUP_TO_VEC0 (%esi)
> >> +       jmp     2b
> >>   #if defined USE_MULTIARCH && IS_IN (libc)
> >>   END (MEMSET_SYMBOL (__memset, unaligned))
> >>
> >> @@ -180,6 +188,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
> >>
> >>   ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
> >>          MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
> >> +2:
> >>   # ifdef __ILP32__
> >>          /* Clear the upper 32 bits.  */
> >>          mov     %edx, %edx
> >> @@ -193,6 +202,10 @@ ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
> >>          VMOVU   %VEC(0), (%rax)
> >>          VMOVU   %VEC(0), -VEC_SIZE(%rax, %rdx)
> >>          VZEROUPPER_RETURN
> >> +
> >> +1:
> >> +       MEMSET_VDUP_TO_VEC0 (%esi)
> >> +       jmp     2b
> >>   #endif
> >>
> >>          .p2align 4,, 10
> >> --
> >> 2.33.1
> >>
>

^ permalink raw reply	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2022-01-03 21:34 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2021-12-31 18:20 [PATCH] x86-64: Optimize memset for zeroing H.J. Lu
2021-12-31 20:21 ` Noah Goldstein
2021-12-31 20:35   ` H.J. Lu
2021-12-31 20:43     ` Florian Weimer
2021-12-31 20:52       ` H.J. Lu
2021-12-31 21:02         ` Florian Weimer
2021-12-31 21:15           ` Noah Goldstein
2021-12-31 22:05             ` Florian Weimer
2021-12-31 22:14     ` Noah Goldstein
2021-12-31 22:19       ` Noah Goldstein
2021-12-31 22:21         ` H.J. Lu
2022-01-02 16:01   ` Cristian Rodríguez
2022-01-03 20:09   ` Patrick McGehearty
2022-01-03 21:34     ` Noah Goldstein
