From: Noah Goldstein
Date: Fri, 31 Dec 2021 16:14:43 -0600
Subject: Re: [PATCH] x86-64: Optimize memset for zeroing
To: "H.J. Lu"
Cc: GCC Development, GNU C Library, Arjan van de Ven
References: <20211231182010.107040-1-hjl.tools@gmail.com>

On Fri, Dec 31, 2021 at 2:36 PM H.J. Lu wrote:
>
> On Fri, Dec 31, 2021 at 12:21 PM Noah Goldstein wrote:
> >
> > On Fri, Dec 31, 2021 at 12:20 PM H.J. Lu wrote:
> > >
> > > Update MEMSET_VDUP_TO_VEC0_AND_SET_RETURN to use PXOR, which has lower
> > > latency and higher throughput than VPBROADCAST, for zero constant.
> > > Since the most common usage of memset is to zero a block of memory, the
> > > branch predictor will make the compare/jmp basically free and PXOR is
> > > almost like being executed unconditionally.
> >
> > Any benchmark results? Is the broadcast on the critical path for any size?
>
> Can you run your workloads to see how many memset calls are zeroing?

Python 3.7.7 running pyperf: 99.6% of memset calls are zeroing.
GCC 11.2 compiling LLVM: 99.1% of memset calls are zeroing.

>
> > Also imagine the vast majority of memset zeros are compile-time known.
> >
> > I think it might make more sense to give bzero() the fall-through instead and
>
> bzero is an alias of SSE2 memset in glibc.  Should we add __memsetzero
> like __memcmpeq?  It should be almost free in glibc.  GCC can use
> __memsetzero if it is available.
>
> > add a patch in GCC to prefer bzero > memset.
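
For anyone who wants to check the ratio on other workloads, a rough
LD_PRELOAD counter along the lines below works.  This is only an
illustrative sketch (not part of the patch, and the file/function names
are made up); it also misses memset calls that the compiler inlines or
expands as a builtin.

/* count-memset-zero.c: hypothetical LD_PRELOAD helper, illustrative only.
   Build: gcc -O2 -fno-builtin -shared -fPIC count-memset-zero.c \
            -o count-memset-zero.so -ldl
   Run:   LD_PRELOAD=./count-memset-zero.so <workload>
   Caveats: the counters and one-time setup are not thread-safe, and
   memset calls expanded inline by the compiler are not seen at all.  */
#define _GNU_SOURCE
#include <dlfcn.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static unsigned long long zero_calls, nonzero_calls;

static void
report (void)
{
  fprintf (stderr, "memset: %llu zeroing, %llu non-zeroing calls\n",
           zero_calls, nonzero_calls);
}

void *
memset (void *dst, int c, size_t n)
{
  static void *(*real_memset) (void *, int, size_t);
  static int resolving, registered;

  if (real_memset == NULL)
    {
      if (resolving)
        {
          /* dlsym itself may call memset; fall back to a byte loop.  */
          volatile unsigned char *p = dst;
          while (n--)
            *p++ = (unsigned char) c;
          return dst;
        }
      resolving = 1;
      real_memset = (void *(*) (void *, int, size_t)) dlsym (RTLD_NEXT, "memset");
      resolving = 0;
    }

  if (!registered)
    {
      registered = 1;
      atexit (report);
    }

  if (c == 0)
    zero_calls++;
  else
    nonzero_calls++;

  return real_memset (dst, c, n);
}
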
> > > > > > > --- > > > sysdeps/x86_64/memset.S | 14 ++++++++++++-- > > > .../x86_64/multiarch/memset-avx2-unaligned-erms.S | 14 ++++++++++++-- > > > .../multiarch/memset-avx512-unaligned-erms.S | 10 ++++++++++ > > > .../x86_64/multiarch/memset-evex-unaligned-erms.S | 10 ++++++++++ > > > .../x86_64/multiarch/memset-vec-unaligned-erms.S | 13 +++++++++++++ > > > 5 files changed, 57 insertions(+), 4 deletions(-) > > > > > > diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S > > > index 0137eba4cd..513f9c703d 100644 > > > --- a/sysdeps/x86_64/memset.S > > > +++ b/sysdeps/x86_64/memset.S > > > @@ -29,15 +29,25 @@ > > > #define VMOVA movaps > > > > > > #define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ > > > - movd d, %xmm0; \ > > > movq r, %rax; \ > > > + testl d, d; \ > > > + jnz 1f; \ > > > + pxor %xmm0, %xmm0 > > > + > > > +# define MEMSET_VDUP_TO_VEC0(d) \ > > > + movd d, %xmm0; \ > > > punpcklbw %xmm0, %xmm0; \ > > > punpcklwd %xmm0, %xmm0; \ > > > pshufd $0, %xmm0, %xmm0 > > > > > > #define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ > > > - movd d, %xmm0; \ > > > movq r, %rax; \ > > > + testl d, d; \ > > > + jnz 1f; \ > > > + pxor %xmm0, %xmm0 > > > + > > > +# define WMEMSET_VDUP_TO_VEC0(d) \ > > > + movd d, %xmm0; \ > > > pshufd $0, %xmm0, %xmm0 > > > > > > #define SECTION(p) p > > > diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S > > > index 1af668af0a..8004a27750 100644 > > > --- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S > > > +++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S > > > @@ -11,13 +11,23 @@ > > > # define VMOVA vmovdqa > > > > > > # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ > > > - vmovd d, %xmm0; \ > > > movq r, %rax; \ > > > + testl d, d; \ > > > + jnz 1f; \ > > > + vpxor %xmm0, %xmm0, %xmm0 > > > + > > > +# define MEMSET_VDUP_TO_VEC0(d) \ > > > + vmovd d, %xmm0; \ > > > vpbroadcastb %xmm0, %ymm0 > > > > > > # define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ > > > - vmovd d, %xmm0; \ > > > movq r, %rax; \ > > > + testl d, d; \ > > > + jnz 1f; \ > > > + vpxor %xmm0, %xmm0, %xmm0 > > > + > > > +# define WMEMSET_VDUP_TO_VEC0(d) \ > > > + vmovd d, %xmm0; \ > > > vpbroadcastd %xmm0, %ymm0 > > > > > > # ifndef SECTION > > > diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S > > > index f14d6f8493..61ff9ccf6f 100644 > > > --- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S > > > +++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S > > > @@ -17,10 +17,20 @@ > > > > > > # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ > > > movq r, %rax; \ > > > + testl d, d; \ > > > + jnz 1f; \ > > > + vpxorq %XMM0, %XMM0, %XMM0 > > > + > > > +# define MEMSET_VDUP_TO_VEC0(d) \ > > > vpbroadcastb d, %VEC0 > > > > > > # define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ > > > movq r, %rax; \ > > > + testl d, d; \ > > > + jnz 1f; \ > > > + vpxorq %XMM0, %XMM0, %XMM0 > > > + > > > +# define WMEMSET_VDUP_TO_VEC0(d) \ > > > vpbroadcastd d, %VEC0 > > > > > > # define SECTION(p) p##.evex512 > > > diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S > > > index 64b09e77cc..85544fb0fc 100644 > > > --- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S > > > +++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S > > > @@ -17,10 +17,20 @@ > > > > > > # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ > > > movq r, 
> > > +  testl d, d; \
> > > +  jnz 1f; \
> > > +  vpxorq %XMM0, %XMM0, %XMM0
> > > +
> > > +# define MEMSET_VDUP_TO_VEC0(d) \
> > >    vpbroadcastb d, %VEC0
> > >
> > >  # define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> > >    movq r, %rax; \
> > > +  testl d, d; \
> > > +  jnz 1f; \
> > > +  vpxorq %XMM0, %XMM0, %XMM0
> > > +
> > > +# define WMEMSET_VDUP_TO_VEC0(d) \
> > >    vpbroadcastd d, %VEC0
> > >
> > >  # define SECTION(p)           p##.evex
> > > diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> > > index e723413a66..4ca34a19ba 100644
> > > --- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> > > +++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> > > @@ -112,6 +112,9 @@ ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
> > >    shl    $2, %RDX_LP
> > >    WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
> > >    jmp    L(entry_from_bzero)
> > > +1:
> > > +  WMEMSET_VDUP_TO_VEC0 (%esi)
> > > +  jmp    L(entry_from_bzero)
> > >  END (WMEMSET_SYMBOL (__wmemset, unaligned))
> > >  #endif
> > >
> > > @@ -124,6 +127,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
> > >
> > >  ENTRY (MEMSET_SYMBOL (__memset, unaligned))
> > >    MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
> > > +2:
> > >  # ifdef __ILP32__
> > >    /* Clear the upper 32 bits.  */
> > >    mov    %edx, %edx
> > > @@ -137,6 +141,10 @@ L(entry_from_bzero):
> > >    VMOVU  %VEC(0), -VEC_SIZE(%rdi,%rdx)
> > >    VMOVU  %VEC(0), (%rdi)
> > >    VZEROUPPER_RETURN
> > > +
> > > +1:
> > > +  MEMSET_VDUP_TO_VEC0 (%esi)
> > > +  jmp    2b
> > >  #if defined USE_MULTIARCH && IS_IN (libc)
> > >  END (MEMSET_SYMBOL (__memset, unaligned))
> > >
> > > @@ -180,6 +188,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
> > >
> > >  ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
> > >    MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
> > > +2:
> > >  # ifdef __ILP32__
> > >    /* Clear the upper 32 bits.  */
> > >    mov    %edx, %edx
> > > @@ -193,6 +202,10 @@ ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
> > >    VMOVU  %VEC(0), (%rax)
> > >    VMOVU  %VEC(0), -VEC_SIZE(%rax, %rdx)
> > >    VZEROUPPER_RETURN
> > > +
> > > +1:
> > > +  MEMSET_VDUP_TO_VEC0 (%esi)
> > > +  jmp    2b
> > >  #endif
> > >
> > >    .p2align 4,, 10
> > > --
> > > 2.33.1
> > >
> >
>
> --
> H.J.
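
For readers skimming the archive, the dispatch the patch adds at the top
of the memset entry points is roughly equivalent to the following C-level
sketch.  It is illustrative only (not the glibc implementation, and the
function name is made up); the store loop is just a stand-in for the real
size/alignment dispatch in memset-vec-unaligned-erms.S.

/* memset_dispatch_sketch: hypothetical rendering of the patch's control
   flow.  The return value is set first, then the fill byte is tested:
   the common c == 0 case materializes the vector with PXOR and never
   pays the broadcast latency; the rare non-zero case branches to the
   broadcast sequence.  Requires SSE2, the x86-64 baseline.  */
#include <emmintrin.h>
#include <stddef.h>

void *
memset_dispatch_sketch (void *dst, int c, size_t n)
{
  void *ret = dst;                  /* movq r, %rax  */
  __m128i v;

  if (c == 0)                       /* testl d, d; jnz 1f  */
    v = _mm_setzero_si128 ();       /* pxor %xmm0, %xmm0  */
  else
    v = _mm_set1_epi8 ((char) c);   /* 1: punpcklbw/punpcklwd/pshufd
                                       (or vpbroadcastb), then jmp back  */

  /* Stand-in store loop; the real code picks store paths by size and
     alignment.  */
  unsigned char *p = dst;
  size_t i = 0;
  for (; i + 16 <= n; i += 16)
    _mm_storeu_si128 ((__m128i *) (p + i), v);
  for (; i < n; i++)
    p[i] = (unsigned char) c;

  return ret;
}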