From: Noah Goldstein
Date: Fri, 31 Dec 2021 16:19:44 -0600
Subject: Re: [PATCH] x86-64: Optimize memset for zeroing
To: "H.J. Lu"
Cc: GCC Development, GNU C Library, Arjan van de Ven

On Fri, Dec 31, 2021 at 4:14 PM Noah Goldstein wrote:
>
> On Fri, Dec 31, 2021 at 2:36 PM H.J. Lu wrote:
> >
> > On Fri, Dec 31, 2021 at 12:21 PM Noah Goldstein wrote:
> > >
> > > On Fri, Dec 31, 2021 at 12:20 PM H.J. Lu wrote:
> > > >
> > > > Update MEMSET_VDUP_TO_VEC0_AND_SET_RETURN to use PXOR, which has lower
> > > > latency and higher throughput than VPBROADCAST, for zero constant.
> > > > Since the most common usage of memset is to zero a block of memory, the
> > > > branch predictor will make the compare/jmp basically free and PXOR is
> > > > almost like being executed unconditionally.
> > >
> > > Any benchmark results?  Is the broadcast on the critical path for any size?
> >
> > Can you run your workloads to see how many memset calls are zeroing?
>
> Python3.7.7 running pyperf: 99.6% of calls are zero.
> GCC11.2 compiling llvm: 99.1% of calls are zero.

I like the idea of this optimization, but I don't think we want to
implement it with a branch like this.  Even though the branch will be
highly predictable under heavy usage, it adds extra interference: it
adds a second branch to the first BTB prediction, and it will likely
incur more misses than the rates above suggest because the BHT entry
may be overwritten by other branches in the application between calls.
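
To make the branch placement concrete, here is a rough C/intrinsics
analogue of what the patched SSE2 macros end up doing.  This is only an
illustrative sketch (toy_memset is a made-up name), not the actual
glibc code:

/* Sketch only: C/intrinsics analogue of the patched SSE2 path, showing
   where the new value test sits.  Not the glibc implementation.  */
#include <emmintrin.h>
#include <stddef.h>

void *
toy_memset (void *dst, int c, size_t n)
{
  __m128i v;
  if (c == 0)                        /* the test/jnz the patch adds */
    v = _mm_setzero_si128 ();        /* pxor %xmm0, %xmm0 */
  else
    v = _mm_set1_epi8 ((char) c);    /* movd + punpcklbw/punpcklwd + pshufd */

  unsigned char *p = dst;
  for (; n >= 16; n -= 16, p += 16)  /* fill 16-byte blocks */
    _mm_storeu_si128 ((__m128i *) p, v);
  while (n--)                        /* byte tail */
    *p++ = (unsigned char) c;
  return dst;
}

Every call, zero or not, pays for that extra conditional at entry; with
a separate zeroing entry point the zero case can simply take the
fall-through path and the test disappears.
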
'__memsetzero' makes sense to me and then we can just organize the code
so that __memsetzero gets the fallthrough path.

> >
> > > Also imagine the vast majority of memset zero are compile time known.
> > >
> > > I think it might make more sense to give bzero() the fall-through instead and
> >
> > bzero is an alias of SSE2 memset in glibc.  Should we add __memsetzero
> > like __memcmpeq?  It should be almost free in glibc.  GCC can use
> > __memsetzero if it is available.
> >
> > > add a patch in GCC to prefer bzero > memset.
> >
> > > >
> > > > ---
> > > > sysdeps/x86_64/memset.S | 14 ++++++++++++--
> > > > .../x86_64/multiarch/memset-avx2-unaligned-erms.S | 14 ++++++++++++--
> > > > .../multiarch/memset-avx512-unaligned-erms.S | 10 ++++++++++
> > > > .../x86_64/multiarch/memset-evex-unaligned-erms.S | 10 ++++++++++
> > > > .../x86_64/multiarch/memset-vec-unaligned-erms.S | 13 +++++++++++++
> > > > 5 files changed, 57 insertions(+), 4 deletions(-)
> > > >
> > > > diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
> > > > index 0137eba4cd..513f9c703d 100644
> > > > --- a/sysdeps/x86_64/memset.S
> > > > +++ b/sysdeps/x86_64/memset.S
> > > > @@ -29,15 +29,25 @@
> > > > #define VMOVA movaps
> > > >
> > > > #define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> > > > - movd d, %xmm0; \
> > > > movq r, %rax; \
> > > > + testl d, d; \
> > > > + jnz 1f; \
> > > > + pxor %xmm0, %xmm0
> > > > +
> > > > +# define MEMSET_VDUP_TO_VEC0(d) \
> > > > + movd d, %xmm0; \
> > > > punpcklbw %xmm0, %xmm0; \
> > > > punpcklwd %xmm0, %xmm0; \
> > > > pshufd $0, %xmm0, %xmm0
> > > >
> > > > #define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> > > > - movd d, %xmm0; \
> > > > movq r, %rax; \
> > > > + testl d, d; \
> > > > + jnz 1f; \
> > > > + pxor %xmm0, %xmm0
> > > > +
> > > > +# define WMEMSET_VDUP_TO_VEC0(d) \
> > > > + movd d, %xmm0; \
> > > > pshufd $0, %xmm0, %xmm0
> > > >
> > > > #define SECTION(p) p
> > > > diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> > > > index 1af668af0a..8004a27750 100644
> > > > --- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> > > > +++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> > > > @@ -11,13 +11,23 @@
> > > > # define VMOVA vmovdqa
> > > >
> > > > # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> > > > - vmovd d, %xmm0; \
> > > > movq r, %rax; \
> > > > + testl d, d; \
> > > > + jnz 1f; \
> > > > + vpxor %xmm0, %xmm0, %xmm0
> > > > +
> > > > +# define MEMSET_VDUP_TO_VEC0(d) \
> > > > + vmovd d, %xmm0; \
> > > > vpbroadcastb %xmm0, %ymm0
> > > >
> > > > # define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> > > > - vmovd d, %xmm0; \
> > > > movq r, %rax; \
> > > > + testl d, d; \
> > > > + jnz 1f; \
> > > > + vpxor %xmm0, %xmm0, %xmm0
> > > > +
> > > > +# define WMEMSET_VDUP_TO_VEC0(d) \
> > > > + vmovd d, %xmm0; \
> > > > vpbroadcastd %xmm0, %ymm0
> > > >
> > > > # ifndef SECTION
> > > > diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> > > > index f14d6f8493..61ff9ccf6f 100644
> > > > --- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> > > > +++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> > > > @@ -17,10 +17,20 @@
> > > >
> > > > # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> > > > movq r, %rax; \
> > > > + testl d, d; \
> > > > + jnz 1f; \
> > > > + vpxorq %XMM0, %XMM0, %XMM0
> > > > +
> > > > +# define MEMSET_VDUP_TO_VEC0(d) \
> > > > vpbroadcastb d, %VEC0
> > > >
> > > > # define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> > > > movq r, %rax; \
> > > > + testl d, d; \
> > > > + jnz 1f; \
> > > > + vpxorq %XMM0, %XMM0, %XMM0
> > > > +
> > > > +# define WMEMSET_VDUP_TO_VEC0(d) \
> > > > vpbroadcastd d, %VEC0
> > > >
> > > > # define SECTION(p) p##.evex512
> > > > diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> > > > index 64b09e77cc..85544fb0fc 100644
> > > > --- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> > > > +++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> > > > @@ -17,10 +17,20 @@
> > > >
> > > > # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> > > > movq r, %rax; \
> > > > + testl d, d; \
> > > > + jnz 1f; \
> > > > + vpxorq %XMM0, %XMM0, %XMM0
> > > > +
> > > > +# define MEMSET_VDUP_TO_VEC0(d) \
> > > > vpbroadcastb d, %VEC0
> > > >
> > > > # define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> > > > movq r, %rax; \
> > > > + testl d, d; \
> > > > + jnz 1f; \
> > > > + vpxorq %XMM0, %XMM0, %XMM0
> > > > +
> > > > +# define WMEMSET_VDUP_TO_VEC0(d) \
> > > > vpbroadcastd d, %VEC0
> > > >
> > > > # define SECTION(p) p##.evex
> > > > diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> > > > index e723413a66..4ca34a19ba 100644
> > > > --- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> > > > +++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> > > > @@ -112,6 +112,9 @@ ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
> > > > shl $2, %RDX_LP
> > > > WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
> > > > jmp L(entry_from_bzero)
> > > > +1:
> > > > + WMEMSET_VDUP_TO_VEC0 (%esi)
> > > > + jmp L(entry_from_bzero)
> > > > END (WMEMSET_SYMBOL (__wmemset, unaligned))
> > > > #endif
> > > >
> > > > @@ -124,6 +127,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
> > > >
> > > > ENTRY (MEMSET_SYMBOL (__memset, unaligned))
> > > > MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
> > > > +2:
> > > > # ifdef __ILP32__
> > > > /* Clear the upper 32 bits. */
> > > > mov %edx, %edx
> > > > @@ -137,6 +141,10 @@ L(entry_from_bzero):
> > > > VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx)
> > > > VMOVU %VEC(0), (%rdi)
> > > > VZEROUPPER_RETURN
> > > > +
> > > > +1:
> > > > + MEMSET_VDUP_TO_VEC0 (%esi)
> > > > + jmp 2b
> > > > #if defined USE_MULTIARCH && IS_IN (libc)
> > > > END (MEMSET_SYMBOL (__memset, unaligned))
> > > >
> > > > @@ -180,6 +188,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
> > > >
> > > > ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
> > > > MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
> > > > +2:
> > > > # ifdef __ILP32__
> > > > /* Clear the upper 32 bits. */
> > > > mov %edx, %edx
> > > > @@ -193,6 +202,10 @@ ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
> > > > VMOVU %VEC(0), (%rax)
> > > > VMOVU %VEC(0), -VEC_SIZE(%rax, %rdx)
> > > > VZEROUPPER_RETURN
> > > > +
> > > > +1:
> > > > + MEMSET_VDUP_TO_VEC0 (%esi)
> > > > + jmp 2b
> > > > #endif
> > > >
> > > > .p2align 4,, 10
> > > > --
> > > > 2.33.1
> > > >
> > > >
> >
> > --
> > H.J.
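
For concreteness, the GCC/caller side could then look roughly like the
sketch below.  This is purely illustrative: __memsetzero is the entry
point proposed above and does not exist yet, and memset_dispatch is a
made-up wrapper name.

/* Sketch only: route compile-time-constant zero fills to a dedicated
   zeroing entry point.  __memsetzero is the proposed, not-yet-existing
   glibc symbol.  */
#include <string.h>

extern void *__memsetzero (void *dest, size_t n);

static inline void *
memset_dispatch (void *dest, int c, size_t n)
{
  if (__builtin_constant_p (c) && c == 0)
    return __memsetzero (dest, n);   /* zero case, no value broadcast */
  return memset (dest, c, n);        /* general case, existing memset */
}

With something like that in place, the zero case never even reaches a
value-test branch at run time.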