public inbox for libc-alpha@sourceware.org
 help / color / mirror / Atom feed
From: Noah Goldstein <goldstein.w.n@gmail.com>
To: "H.J. Lu" <hjl.tools@gmail.com>
Cc: GNU C Library <libc-alpha@sourceware.org>,
	Arjan van de Ven <arjan@linux.intel.com>
Subject: Re: [PATCH] x86-64: Optimize memset for zeroing
Date: Fri, 31 Dec 2021 14:21:18 -0600	[thread overview]
Message-ID: <CAFUsyfJ5wHJHkfcL84j+GLCX-Dba4qVt_RuCpu=aLy7-_YbHLA@mail.gmail.com> (raw)
In-Reply-To: <20211231182010.107040-1-hjl.tools@gmail.com>

On Fri, Dec 31, 2021 at 12:20 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> Update MEMSET_VDUP_TO_VEC0_AND_SET_RETURN to use PXOR, which has lower
> latency and higher throughput than VPBROADCAST, for zero constant.
> Since the most common usage of memset is to zero a block of memory, the
> branch predictor will make the compare/jmp basically free and PXOR is
> almost like being executed unconditionally.

Any benchmark results? Is the broadcast on the critical path for any size?

Also, I imagine the vast majority of memset zeros are compile-time known.

I think it might make more sense to give bzero() the fall-through instead and
add a patch in GCC to prefer bzero > memset.


> ---
>  sysdeps/x86_64/memset.S                            | 14 ++++++++++++--
>  .../x86_64/multiarch/memset-avx2-unaligned-erms.S  | 14 ++++++++++++--
>  .../multiarch/memset-avx512-unaligned-erms.S       | 10 ++++++++++
>  .../x86_64/multiarch/memset-evex-unaligned-erms.S  | 10 ++++++++++
>  .../x86_64/multiarch/memset-vec-unaligned-erms.S   | 13 +++++++++++++
>  5 files changed, 57 insertions(+), 4 deletions(-)
>
> diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
> index 0137eba4cd..513f9c703d 100644
> --- a/sysdeps/x86_64/memset.S
> +++ b/sysdeps/x86_64/memset.S
> @@ -29,15 +29,25 @@
>  #define VMOVA     movaps
>
>  #define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> -  movd d, %xmm0; \
>    movq r, %rax; \
> +  testl d, d; \
> +  jnz 1f; \
> +  pxor %xmm0, %xmm0
> +
> +# define MEMSET_VDUP_TO_VEC0(d) \
> +  movd d, %xmm0; \
>    punpcklbw %xmm0, %xmm0; \
>    punpcklwd %xmm0, %xmm0; \
>    pshufd $0, %xmm0, %xmm0
>
>  #define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> -  movd d, %xmm0; \
>    movq r, %rax; \
> +  testl d, d; \
> +  jnz 1f; \
> +  pxor %xmm0, %xmm0
> +
> +# define WMEMSET_VDUP_TO_VEC0(d) \
> +  movd d, %xmm0; \
>    pshufd $0, %xmm0, %xmm0
>
>  #define SECTION(p)             p
> diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> index 1af668af0a..8004a27750 100644
> --- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> @@ -11,13 +11,23 @@
>  # define VMOVA     vmovdqa
>
>  # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> -  vmovd d, %xmm0; \
>    movq r, %rax; \
> +  testl d, d; \
> +  jnz 1f; \
> +  vpxor %xmm0, %xmm0, %xmm0
> +
> +# define MEMSET_VDUP_TO_VEC0(d) \
> +  vmovd d, %xmm0; \
>    vpbroadcastb %xmm0, %ymm0
>
>  # define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> -  vmovd d, %xmm0; \
>    movq r, %rax; \
> +  testl d, d; \
> +  jnz 1f; \
> +  vpxor %xmm0, %xmm0, %xmm0
> +
> +# define WMEMSET_VDUP_TO_VEC0(d) \
> +  vmovd d, %xmm0; \
>    vpbroadcastd %xmm0, %ymm0
>
>  # ifndef SECTION
> diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> index f14d6f8493..61ff9ccf6f 100644
> --- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> @@ -17,10 +17,20 @@
>
>  # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
>    movq r, %rax; \
> +  testl d, d; \
> +  jnz 1f; \
> +  vpxorq %XMM0, %XMM0, %XMM0
> +
> +# define MEMSET_VDUP_TO_VEC0(d) \
>    vpbroadcastb d, %VEC0
>
>  # define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
>    movq r, %rax; \
> +  testl d, d; \
> +  jnz 1f; \
> +  vpxorq %XMM0, %XMM0, %XMM0
> +
> +# define WMEMSET_VDUP_TO_VEC0(d) \
>    vpbroadcastd d, %VEC0
>
>  # define SECTION(p)            p##.evex512
> diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> index 64b09e77cc..85544fb0fc 100644
> --- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> @@ -17,10 +17,20 @@
>
>  # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
>    movq r, %rax; \
> +  testl d, d; \
> +  jnz 1f; \
> +  vpxorq %XMM0, %XMM0, %XMM0
> +
> +# define MEMSET_VDUP_TO_VEC0(d) \
>    vpbroadcastb d, %VEC0
>
>  # define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
>    movq r, %rax; \
> +  testl d, d; \
> +  jnz 1f; \
> +  vpxorq %XMM0, %XMM0, %XMM0
> +
> +# define WMEMSET_VDUP_TO_VEC0(d) \
>    vpbroadcastd d, %VEC0
>
>  # define SECTION(p)            p##.evex
> diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> index e723413a66..4ca34a19ba 100644
> --- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> @@ -112,6 +112,9 @@ ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
>         shl     $2, %RDX_LP
>         WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
>         jmp     L(entry_from_bzero)
> +1:
> +       WMEMSET_VDUP_TO_VEC0 (%esi)
> +       jmp     L(entry_from_bzero)
>  END (WMEMSET_SYMBOL (__wmemset, unaligned))
>  #endif
>
> @@ -124,6 +127,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
>
>  ENTRY (MEMSET_SYMBOL (__memset, unaligned))
>         MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
> +2:
>  # ifdef __ILP32__
>         /* Clear the upper 32 bits.  */
>         mov     %edx, %edx
> @@ -137,6 +141,10 @@ L(entry_from_bzero):
>         VMOVU   %VEC(0), -VEC_SIZE(%rdi,%rdx)
>         VMOVU   %VEC(0), (%rdi)
>         VZEROUPPER_RETURN
> +
> +1:
> +       MEMSET_VDUP_TO_VEC0 (%esi)
> +       jmp     2b
>  #if defined USE_MULTIARCH && IS_IN (libc)
>  END (MEMSET_SYMBOL (__memset, unaligned))
>
> @@ -180,6 +188,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
>
>  ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
>         MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
> +2:
>  # ifdef __ILP32__
>         /* Clear the upper 32 bits.  */
>         mov     %edx, %edx
> @@ -193,6 +202,10 @@ ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
>         VMOVU   %VEC(0), (%rax)
>         VMOVU   %VEC(0), -VEC_SIZE(%rax, %rdx)
>         VZEROUPPER_RETURN
> +
> +1:
> +       MEMSET_VDUP_TO_VEC0 (%esi)
> +       jmp     2b
>  #endif
>
>         .p2align 4,, 10
> --
> 2.33.1
>

  reply	other threads:[~2021-12-31 20:21 UTC|newest]

Thread overview: 14+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-12-31 18:20 H.J. Lu
2021-12-31 20:21 ` Noah Goldstein [this message]
2021-12-31 20:35   ` H.J. Lu
2021-12-31 20:43     ` Florian Weimer
2021-12-31 20:52       ` H.J. Lu
2021-12-31 21:02         ` Florian Weimer
2021-12-31 21:15           ` Noah Goldstein
2021-12-31 22:05             ` Florian Weimer
2021-12-31 22:14     ` Noah Goldstein
2021-12-31 22:19       ` Noah Goldstein
2021-12-31 22:21         ` H.J. Lu
2022-01-02 16:01   ` Cristian Rodríguez
2022-01-03 20:09   ` Patrick McGehearty
2022-01-03 21:34     ` Noah Goldstein

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to='CAFUsyfJ5wHJHkfcL84j+GLCX-Dba4qVt_RuCpu=aLy7-_YbHLA@mail.gmail.com' \
    --to=goldstein.w.n@gmail.com \
    --cc=arjan@linux.intel.com \
    --cc=hjl.tools@gmail.com \
    --cc=libc-alpha@sourceware.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).