From: Noah Goldstein <goldstein.w.n@gmail.com>
To: "H.J. Lu" <hjl.tools@gmail.com>
Cc: GCC Development <gcc@gcc.gnu.org>,
GNU C Library <libc-alpha@sourceware.org>,
Arjan van de Ven <arjan@linux.intel.com>
Subject: Re: [PATCH] x86-64: Optimize memset for zeroing
Date: Fri, 31 Dec 2021 16:14:43 -0600
Message-ID: <CAFUsyf+0MbZvvF-aQpCk6SO6dp=TwYEw3oSCLTbL8Qs8c3tQ6A@mail.gmail.com>
In-Reply-To: <CAMe9rOq0cW28i-_DeLfdxD3HDRTXy17ie9ZC1NxsqfRLjBz4Yw@mail.gmail.com>
On Fri, Dec 31, 2021 at 2:36 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> On Fri, Dec 31, 2021 at 12:21 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > On Fri, Dec 31, 2021 at 12:20 PM H.J. Lu <hjl.tools@gmail.com> wrote:
> > >
> > > Update MEMSET_VDUP_TO_VEC0_AND_SET_RETURN to use PXOR, which has lower
> > > latency and higher throughput than VPBROADCAST, for the zero constant.
> > > Since the most common use of memset is to zero a block of memory, the
> > > branch predictor makes the compare/jmp essentially free, so the PXOR is
> > > almost as cheap as if it were executed unconditionally.
> >
> > Any benchmark results? Is the broadcast on the critical path for any size?
>
> Can you run your workloads to see how many memset calls are zeroing?
Python 3.7.7 running pyperf: 99.6% of memset calls are zeroing.
GCC 11.2 compiling LLVM: 99.1% of memset calls are zeroing.
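
For anyone who wants to collect this kind of count, a minimal sketch of
an LD_PRELOAD interposer follows. This is not the tool I used, just an
illustration, and it only sees calls that actually reach the PLT
(inlined or builtin-expanded memsets are missed):

/* zero-count.c: hypothetical interposer counting zeroing memsets.
   Build: gcc -O2 -fno-tree-loop-distribute-patterns -shared -fPIC \
            zero-count.c -o zero-count.so -ldl
   (the extra flag keeps GCC from turning the fallback loop below
   back into a memset call)
   Run:   LD_PRELOAD=./zero-count.so <workload>  */
#define _GNU_SOURCE
#include <dlfcn.h>
#include <stddef.h>
#include <stdio.h>

static unsigned long total, zeroing;
static void *(*real_memset) (void *, int, size_t);

void *
memset (void *s, int c, size_t n)
{
  if (!real_memset)
    {
      /* dlsym may itself call memset; fall back to a byte loop
         until the real symbol is resolved to avoid recursing.  */
      static volatile int resolving;
      if (resolving)
	{
	  unsigned char *p = s;
	  while (n--)
	    *p++ = (unsigned char) c;
	  return s;
	}
      resolving = 1;
      real_memset = dlsym (RTLD_NEXT, "memset");
      resolving = 0;
    }
  __atomic_add_fetch (&total, 1, __ATOMIC_RELAXED);
  if (c == 0)
    __atomic_add_fetch (&zeroing, 1, __ATOMIC_RELAXED);
  return real_memset (s, c, n);
}

static void __attribute__ ((destructor))
report (void)
{
  fprintf (stderr, "memset: %lu of %lu calls zeroing\n", zeroing, total);
}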
>
> > Also, I imagine the vast majority of zeroing memset calls have a
> > compile-time-known zero argument.
> >
> > I think it might make more sense to give bzero() the fall-through instead and
>
> bzero is an alias of SSE2 memset in glibc. Should we add __memsetzero
> like __memcmpeq? It should be almost free in glibc. GCC can use
> __memsetzero if it is available.
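
If we go that route, the interface would presumably just drop the value
argument. A hypothetical sketch (neither the symbol nor the prototype
exists in glibc yet, this is only the suggestion above):

/* Hypothetical declaration for the proposed __memsetzero.  */
void *__memsetzero (void *s, size_t n);

/* A compiler that knows the symbol is available could then lower
   memset (p, 0, n) to __memsetzero (p, n), keeping the common zero
   case off the broadcast path entirely.  */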
>
> > add a patch in GCC to prefer bzero over memset.
> >
> >
> > > ---
> > > sysdeps/x86_64/memset.S | 14 ++++++++++++--
> > > .../x86_64/multiarch/memset-avx2-unaligned-erms.S | 14 ++++++++++++--
> > > .../multiarch/memset-avx512-unaligned-erms.S | 10 ++++++++++
> > > .../x86_64/multiarch/memset-evex-unaligned-erms.S | 10 ++++++++++
> > > .../x86_64/multiarch/memset-vec-unaligned-erms.S | 13 +++++++++++++
> > > 5 files changed, 57 insertions(+), 4 deletions(-)
> > >
> > > diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
> > > index 0137eba4cd..513f9c703d 100644
> > > --- a/sysdeps/x86_64/memset.S
> > > +++ b/sysdeps/x86_64/memset.S
> > > @@ -29,15 +29,25 @@
> > > #define VMOVA movaps
> > >
> > > #define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> > > - movd d, %xmm0; \
> > > movq r, %rax; \
> > > + testl d, d; \
> > > + jnz 1f; \
> > > + pxor %xmm0, %xmm0
> > > +
> > > +# define MEMSET_VDUP_TO_VEC0(d) \
> > > + movd d, %xmm0; \
> > > punpcklbw %xmm0, %xmm0; \
> > > punpcklwd %xmm0, %xmm0; \
> > > pshufd $0, %xmm0, %xmm0
> > >
> > > #define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> > > - movd d, %xmm0; \
> > > movq r, %rax; \
> > > + testl d, d; \
> > > + jnz 1f; \
> > > + pxor %xmm0, %xmm0
> > > +
> > > +# define WMEMSET_VDUP_TO_VEC0(d) \
> > > + movd d, %xmm0; \
> > > pshufd $0, %xmm0, %xmm0
> > >
> > > #define SECTION(p) p
> > > diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> > > index 1af668af0a..8004a27750 100644
> > > --- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> > > +++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> > > @@ -11,13 +11,23 @@
> > > # define VMOVA vmovdqa
> > >
> > > # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> > > - vmovd d, %xmm0; \
> > > movq r, %rax; \
> > > + testl d, d; \
> > > + jnz 1f; \
> > > + vpxor %xmm0, %xmm0, %xmm0
> > > +
> > > +# define MEMSET_VDUP_TO_VEC0(d) \
> > > + vmovd d, %xmm0; \
> > > vpbroadcastb %xmm0, %ymm0
> > >
> > > # define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> > > - vmovd d, %xmm0; \
> > > movq r, %rax; \
> > > + testl d, d; \
> > > + jnz 1f; \
> > > + vpxor %xmm0, %xmm0, %xmm0
> > > +
> > > +# define WMEMSET_VDUP_TO_VEC0(d) \
> > > + vmovd d, %xmm0; \
> > > vpbroadcastd %xmm0, %ymm0
> > >
> > > # ifndef SECTION
> > > diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> > > index f14d6f8493..61ff9ccf6f 100644
> > > --- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> > > +++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> > > @@ -17,10 +17,20 @@
> > >
> > > # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> > > movq r, %rax; \
> > > + testl d, d; \
> > > + jnz 1f; \
> > > + vpxorq %XMM0, %XMM0, %XMM0
> > > +
> > > +# define MEMSET_VDUP_TO_VEC0(d) \
> > > vpbroadcastb d, %VEC0
> > >
> > > # define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> > > movq r, %rax; \
> > > + testl d, d; \
> > > + jnz 1f; \
> > > + vpxorq %XMM0, %XMM0, %XMM0
> > > +
> > > +# define WMEMSET_VDUP_TO_VEC0(d) \
> > > vpbroadcastd d, %VEC0
> > >
> > > # define SECTION(p) p##.evex512
> > > diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> > > index 64b09e77cc..85544fb0fc 100644
> > > --- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> > > +++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> > > @@ -17,10 +17,20 @@
> > >
> > > # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> > > movq r, %rax; \
> > > + testl d, d; \
> > > + jnz 1f; \
> > > + vpxorq %XMM0, %XMM0, %XMM0
> > > +
> > > +# define MEMSET_VDUP_TO_VEC0(d) \
> > > vpbroadcastb d, %VEC0
> > >
> > > # define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> > > movq r, %rax; \
> > > + testl d, d; \
> > > + jnz 1f; \
> > > + vpxorq %XMM0, %XMM0, %XMM0
> > > +
> > > +# define WMEMSET_VDUP_TO_VEC0(d) \
> > > vpbroadcastd d, %VEC0
> > >
> > > # define SECTION(p) p##.evex
> > > diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> > > index e723413a66..4ca34a19ba 100644
> > > --- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> > > +++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> > > @@ -112,6 +112,9 @@ ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
> > > shl $2, %RDX_LP
> > > WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
> > > jmp L(entry_from_bzero)
> > > +1:
> > > + WMEMSET_VDUP_TO_VEC0 (%esi)
> > > + jmp L(entry_from_bzero)
> > > END (WMEMSET_SYMBOL (__wmemset, unaligned))
> > > #endif
> > >
> > > @@ -124,6 +127,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
> > >
> > > ENTRY (MEMSET_SYMBOL (__memset, unaligned))
> > > MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
> > > +2:
> > > # ifdef __ILP32__
> > > /* Clear the upper 32 bits. */
> > > mov %edx, %edx
> > > @@ -137,6 +141,10 @@ L(entry_from_bzero):
> > > VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx)
> > > VMOVU %VEC(0), (%rdi)
> > > VZEROUPPER_RETURN
> > > +
> > > +1:
> > > + MEMSET_VDUP_TO_VEC0 (%esi)
> > > + jmp 2b
> > > #if defined USE_MULTIARCH && IS_IN (libc)
> > > END (MEMSET_SYMBOL (__memset, unaligned))
> > >
> > > @@ -180,6 +188,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
> > >
> > > ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
> > > MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
> > > +2:
> > > # ifdef __ILP32__
> > > /* Clear the upper 32 bits. */
> > > mov %edx, %edx
> > > @@ -193,6 +202,10 @@ ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
> > > VMOVU %VEC(0), (%rax)
> > > VMOVU %VEC(0), -VEC_SIZE(%rax, %rdx)
> > > VZEROUPPER_RETURN
> > > +
> > > +1:
> > > + MEMSET_VDUP_TO_VEC0 (%esi)
> > > + jmp 2b
> > > #endif
> > >
> > > .p2align 4,, 10
> > > --
> > > 2.33.1
> > >
>
>
>
> --
> H.J.
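
For anyone following the numeric labels in the patch, the control flow
it adds is roughly the following. This is a compilable C sketch of my
reading of the assembly, not actual glibc code; vec_t stands in for
%VEC0:

#include <stddef.h>

typedef struct { unsigned char b[16]; } vec_t;

static void *
memset_dispatch_sketch (void *dest, int c, size_t n)
{
  void *ret = dest;                 /* movq r, %rax */
  vec_t v;
  if (c == 0)                       /* testl d, d; jnz 1f */
    for (size_t i = 0; i < sizeof v.b; i++)
      v.b[i] = 0;                   /* pxor %xmm0, %xmm0 */
  else                              /* out-of-line broadcast at 1: */
    for (size_t i = 0; i < sizeof v.b; i++)
      v.b[i] = (unsigned char) c;   /* MEMSET_VDUP_TO_VEC0; jmp 2b */
  /* Shared store sequence at label 2: writes v over dest[0..n).  */
  unsigned char *p = dest;
  for (size_t i = 0; i < n; i++)
    p[i] = v.b[i % sizeof v.b];
  return ret;
}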