public inbox for libc-alpha@sourceware.org
From: Patrick McGehearty <patrick.mcgehearty@oracle.com>
To: libc-alpha@sourceware.org
Subject: Re: [PATCH] x86-64: Optimize memset for zeroing
Date: Mon, 3 Jan 2022 14:09:15 -0600
Message-ID: <f792de70-a2c4-7c8e-857b-aba662cd39c0@oracle.com>
In-Reply-To: <CAFUsyfJ5wHJHkfcL84j+GLCX-Dba4qVt_RuCpu=aLy7-_YbHLA@mail.gmail.com>


On 12/31/2021 2:21 PM, Noah Goldstein via Libc-alpha wrote:
> On Fri, Dec 31, 2021 at 12:20 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>> Update MEMSET_VDUP_TO_VEC0_AND_SET_RETURN to use PXOR, which has lower
>> latency and higher throughput than VPBROADCAST, for a zero constant.
>> Since the most common use of memset is to zero a block of memory, the
>> branch predictor makes the compare/jmp essentially free and PXOR is,
>> in effect, executed unconditionally.
> Any benchmark results? Is the broadcast on the critical path for any size?
>
> Also, I imagine that for the vast majority of memset-to-zero calls the
> zero is known at compile time.
>
> I think it might make more sense to give bzero() the fall-through instead
> and add a patch to GCC to prefer bzero over memset.

My experience with memset (target, zero, len) in other environments is
that when the value is known to be zero and len is known to be modest
at compile time, the compiler will simply inline suitable store or
clear instructions to the target address.
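
To illustrate that case, a minimal C sketch (the struct and its fields
are hypothetical); with both the value and the length known at compile
time, an optimizing compiler will normally expand the call into a few
inline zero stores instead of calling memset:

  #include <string.h>

  struct conn_state {        /* hypothetical small structure */
      int fd;
      unsigned flags;
      char buf[48];
  };

  void reset_conn(struct conn_state *c)
  {
      /* Value (0) and length (sizeof *c) are compile-time constants,
         so this typically becomes a handful of inline stores rather
         than a library call.  */
      memset(c, 0, sizeof *c);
  }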

If len is less than a few cache lines, the performance difference
between setting a register to zero and storing it repeatedly vs. using
architecture-specific instructions for clearing cache lines (or
similar) was negligible.
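
As a rough way to see that for small lengths, one could time something
like the loop below; the buffer size and iteration count are arbitrary
choices, the empty asm is a GCC/Clang-style barrier to keep the stores
from being optimized away, and the numbers will of course vary by
machine:

  #include <stdio.h>
  #include <string.h>
  #include <time.h>

  #define BUF_SIZE 192          /* a few cache lines */
  #define ITERS    10000000L

  static double now_sec(void)
  {
      struct timespec ts;
      clock_gettime(CLOCK_MONOTONIC, &ts);
      return ts.tv_sec + ts.tv_nsec * 1e-9;
  }

  int main(void)
  {
      static unsigned char buf[BUF_SIZE];
      double t0;

      /* Variant 1: a plain loop the compiler can turn into repeated
         stores of a zeroed (vector) register.  */
      t0 = now_sec();
      for (long i = 0; i < ITERS; i++) {
          for (size_t j = 0; j < BUF_SIZE; j++)
              buf[j] = 0;
          __asm__ volatile ("" : : "r" (buf) : "memory");
      }
      printf("store loop: %.3f s\n", now_sec() - t0);

      /* Variant 2: library memset, which may use wider or
         architecture-specific stores internally.  */
      t0 = now_sec();
      for (long i = 0; i < ITERS; i++) {
          memset(buf, 0, BUF_SIZE);
          __asm__ volatile ("" : : "r" (buf) : "memory");
      }
      printf("memset:     %.3f s\n", now_sec() - t0);

      return 0;
  }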

The real performance advantage of having separate code for bzero vs.
memset comes when clearing large data structures (e.g. pages in the
kernel or big blocks of workspace in apps). That is the case any
bzero-equivalent optimization should focus on. One test near the
beginning of memset (either the very first test, or after it is
determined that len is not small) can split off to bzero-specific code
instead of the usual memset code.
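
In C terms, that split could look roughly like the sketch below (the
threshold value and the zero_large helper are hypothetical stand-ins
for whatever large-block clearing path the architecture provides):

  #include <stddef.h>
  #include <string.h>

  #define ZERO_LARGE_THRESHOLD 4096   /* hypothetical cutover point */

  /* Stand-in for an architecture-specific large-block zeroing routine
     (cache-line clears, non-temporal stores, etc.).  */
  static void *zero_large(void *dst, size_t n)
  {
      return memset(dst, 0, n);
  }

  void *memset_with_zero_split(void *dst, int c, size_t n)
  {
      /* One early test diverts large zeroing requests to the
         bzero-style path; everything else takes the usual memset
         code.  */
      if (c == 0 && n >= ZERO_LARGE_THRESHOLD)
          return zero_large(dst, n);
      return memset(dst, c, n);
  }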

- patrick

>
>
>> ---
>>   sysdeps/x86_64/memset.S                            | 14 ++++++++++++--
>>   .../x86_64/multiarch/memset-avx2-unaligned-erms.S  | 14 ++++++++++++--
>>   .../multiarch/memset-avx512-unaligned-erms.S       | 10 ++++++++++
>>   .../x86_64/multiarch/memset-evex-unaligned-erms.S  | 10 ++++++++++
>>   .../x86_64/multiarch/memset-vec-unaligned-erms.S   | 13 +++++++++++++
>>   5 files changed, 57 insertions(+), 4 deletions(-)
>>
>> diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
>> index 0137eba4cd..513f9c703d 100644
>> --- a/sysdeps/x86_64/memset.S
>> +++ b/sysdeps/x86_64/memset.S
>> @@ -29,15 +29,25 @@
>>   #define VMOVA     movaps
>>
>>   #define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
>> -  movd d, %xmm0; \
>>     movq r, %rax; \
>> +  testl d, d; \
>> +  jnz 1f; \
>> +  pxor %xmm0, %xmm0
>> +
>> +# define MEMSET_VDUP_TO_VEC0(d) \
>> +  movd d, %xmm0; \
>>     punpcklbw %xmm0, %xmm0; \
>>     punpcklwd %xmm0, %xmm0; \
>>     pshufd $0, %xmm0, %xmm0
>>
>>   #define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
>> -  movd d, %xmm0; \
>>     movq r, %rax; \
>> +  testl d, d; \
>> +  jnz 1f; \
>> +  pxor %xmm0, %xmm0
>> +
>> +# define WMEMSET_VDUP_TO_VEC0(d) \
>> +  movd d, %xmm0; \
>>     pshufd $0, %xmm0, %xmm0
>>
>>   #define SECTION(p)             p
>> diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
>> index 1af668af0a..8004a27750 100644
>> --- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
>> +++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
>> @@ -11,13 +11,23 @@
>>   # define VMOVA     vmovdqa
>>
>>   # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
>> -  vmovd d, %xmm0; \
>>     movq r, %rax; \
>> +  testl d, d; \
>> +  jnz 1f; \
>> +  vpxor %xmm0, %xmm0, %xmm0
>> +
>> +# define MEMSET_VDUP_TO_VEC0(d) \
>> +  vmovd d, %xmm0; \
>>     vpbroadcastb %xmm0, %ymm0
>>
>>   # define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
>> -  vmovd d, %xmm0; \
>>     movq r, %rax; \
>> +  testl d, d; \
>> +  jnz 1f; \
>> +  vpxor %xmm0, %xmm0, %xmm0
>> +
>> +# define WMEMSET_VDUP_TO_VEC0(d) \
>> +  vmovd d, %xmm0; \
>>     vpbroadcastd %xmm0, %ymm0
>>
>>   # ifndef SECTION
>> diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
>> index f14d6f8493..61ff9ccf6f 100644
>> --- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
>> +++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
>> @@ -17,10 +17,20 @@
>>
>>   # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
>>     movq r, %rax; \
>> +  testl d, d; \
>> +  jnz 1f; \
>> +  vpxorq %XMM0, %XMM0, %XMM0
>> +
>> +# define MEMSET_VDUP_TO_VEC0(d) \
>>     vpbroadcastb d, %VEC0
>>
>>   # define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
>>     movq r, %rax; \
>> +  testl d, d; \
>> +  jnz 1f; \
>> +  vpxorq %XMM0, %XMM0, %XMM0
>> +
>> +# define WMEMSET_VDUP_TO_VEC0(d) \
>>     vpbroadcastd d, %VEC0
>>
>>   # define SECTION(p)            p##.evex512
>> diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
>> index 64b09e77cc..85544fb0fc 100644
>> --- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
>> +++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
>> @@ -17,10 +17,20 @@
>>
>>   # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
>>     movq r, %rax; \
>> +  testl d, d; \
>> +  jnz 1f; \
>> +  vpxorq %XMM0, %XMM0, %XMM0
>> +
>> +# define MEMSET_VDUP_TO_VEC0(d) \
>>     vpbroadcastb d, %VEC0
>>
>>   # define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
>>     movq r, %rax; \
>> +  testl d, d; \
>> +  jnz 1f; \
>> +  vpxorq %XMM0, %XMM0, %XMM0
>> +
>> +# define WMEMSET_VDUP_TO_VEC0(d) \
>>     vpbroadcastd d, %VEC0
>>
>>   # define SECTION(p)            p##.evex
>> diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
>> index e723413a66..4ca34a19ba 100644
>> --- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
>> +++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
>> @@ -112,6 +112,9 @@ ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
>>          shl     $2, %RDX_LP
>>          WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
>>          jmp     L(entry_from_bzero)
>> +1:
>> +       WMEMSET_VDUP_TO_VEC0 (%esi)
>> +       jmp     L(entry_from_bzero)
>>   END (WMEMSET_SYMBOL (__wmemset, unaligned))
>>   #endif
>>
>> @@ -124,6 +127,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
>>
>>   ENTRY (MEMSET_SYMBOL (__memset, unaligned))
>>          MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
>> +2:
>>   # ifdef __ILP32__
>>          /* Clear the upper 32 bits.  */
>>          mov     %edx, %edx
>> @@ -137,6 +141,10 @@ L(entry_from_bzero):
>>          VMOVU   %VEC(0), -VEC_SIZE(%rdi,%rdx)
>>          VMOVU   %VEC(0), (%rdi)
>>          VZEROUPPER_RETURN
>> +
>> +1:
>> +       MEMSET_VDUP_TO_VEC0 (%esi)
>> +       jmp     2b
>>   #if defined USE_MULTIARCH && IS_IN (libc)
>>   END (MEMSET_SYMBOL (__memset, unaligned))
>>
>> @@ -180,6 +188,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
>>
>>   ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
>>          MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
>> +2:
>>   # ifdef __ILP32__
>>          /* Clear the upper 32 bits.  */
>>          mov     %edx, %edx
>> @@ -193,6 +202,10 @@ ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
>>          VMOVU   %VEC(0), (%rax)
>>          VMOVU   %VEC(0), -VEC_SIZE(%rax, %rdx)
>>          VZEROUPPER_RETURN
>> +
>> +1:
>> +       MEMSET_VDUP_TO_VEC0 (%esi)
>> +       jmp     2b
>>   #endif
>>
>>          .p2align 4,, 10
>> --
>> 2.33.1
>>


