From: Noah Goldstein
Date: Mon, 3 Jan 2022 15:34:30 -0600
Subject: Re: [PATCH] x86-64: Optimize memset for zeroing
To: Patrick McGehearty
Cc: GNU C Library

On Mon, Jan 3, 2022 at 2:09 PM Patrick McGehearty via Libc-alpha wrote:
>
>
> On 12/31/2021 2:21 PM, Noah Goldstein via Libc-alpha wrote:
> > On Fri, Dec 31, 2021 at 12:20 PM H.J. Lu wrote:
> >> Update MEMSET_VDUP_TO_VEC0_AND_SET_RETURN to use PXOR, which has lower
> >> latency and higher throughput than VPBROADCAST, for a zero constant.
> >> Since the most common usage of memset is to zero a block of memory, the
> >> branch predictor will make the compare/jmp basically free and PXOR is
> >> almost like being executed unconditionally.
> > Any benchmark results? Is the broadcast on the critical path for any size?
> >
> > Also, I imagine the vast majority of memset-zero calls are compile-time
> > known.
> >
> > I think it might make more sense to give bzero() the fall-through instead
> > and add a patch in GCC to prefer bzero > memset.
> My experience with memset (target, zero, len) in other environments is
> that when the fill value is known to be zero and len is known to be
> modest at compile time, the compiler will simply inline suitable store
> or clear instructions at the target address.
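
As a small illustration of that (a hypothetical example, not from the
patch): when both the value and the length are compile-time constants,
GCC and clang at -O2 normally expand the call into a few direct stores,
so the library memset is never reached at all.

    #include <string.h>

    struct request { int id; char hdr[32]; };

    void
    init_request (struct request *r)
    {
      r->id = 0;
      /* Value and length are both compile-time constants, so the
         compiler typically lowers this to a couple of vector stores
         (or plain 8-byte stores) rather than a call to memset.  */
      memset (r->hdr, 0, sizeof r->hdr);
    }
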
>
> If the len is less than multiple cache lines, the performance difference
> between setting a register to zero and storing the register repeatedly
> vs having architecture specific instructions for clearing cache lines
> (or similar) was negligible.
>
> The real performance advantage for having separate code for bzero vs
> memset is when you are clearing large data structures (i.e. pages in
> the kernel or big blocks of workspace in apps). That is the case that
> any bzero equivalent optimizations should be focused on.
> One test near the beginning of memset (either the very first test or
> after it is determined that len is not small) can split off to bzero
> specific code instead of the usual memset code.

If it's a large size, the effect of optimizing out a branch at the
beginning should also be negligible. Unless there is a better method of
zeroing memory than `rep stosb` or the 4x VEC loop, I think large sizes
are going to be relatively unaffected by this change.

I think it's small sizes where the length is non-constant that this has
a chance of mattering. For the SSE2/AVX implementations that seems
plausible, because the broadcast logic has high enough latency that it
could still be in flight for copies in [32, 64] or [16, 31].

>
> - patrick
>
>
>
> >> ---
> >>  sysdeps/x86_64/memset.S                            | 14 ++++++++++++--
> >>  .../x86_64/multiarch/memset-avx2-unaligned-erms.S  | 14 ++++++++++++--
> >>  .../multiarch/memset-avx512-unaligned-erms.S       | 10 ++++++++++
> >>  .../x86_64/multiarch/memset-evex-unaligned-erms.S  | 10 ++++++++++
> >>  .../x86_64/multiarch/memset-vec-unaligned-erms.S   | 13 +++++++++++++
> >>  5 files changed, 57 insertions(+), 4 deletions(-)
> >>
> >> diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
> >> index 0137eba4cd..513f9c703d 100644
> >> --- a/sysdeps/x86_64/memset.S
> >> +++ b/sysdeps/x86_64/memset.S
> >> @@ -29,15 +29,25 @@
> >>  #define VMOVA     movaps
> >>
> >>  #define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> >> -  movd d, %xmm0; \
> >>    movq r, %rax; \
> >> +  testl d, d; \
> >> +  jnz 1f; \
> >> +  pxor %xmm0, %xmm0
> >> +
> >> +# define MEMSET_VDUP_TO_VEC0(d) \
> >> +  movd d, %xmm0; \
> >>    punpcklbw %xmm0, %xmm0; \
> >>    punpcklwd %xmm0, %xmm0; \
> >>    pshufd $0, %xmm0, %xmm0
> >>
> >>  #define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> >> -  movd d, %xmm0; \
> >>    movq r, %rax; \
> >> +  testl d, d; \
> >> +  jnz 1f; \
> >> +  pxor %xmm0, %xmm0
> >> +
> >> +# define WMEMSET_VDUP_TO_VEC0(d) \
> >> +  movd d, %xmm0; \
> >>    pshufd $0, %xmm0, %xmm0
> >>
> >>  #define SECTION(p)                p
> >> diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> >> index 1af668af0a..8004a27750 100644
> >> --- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> >> +++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> >> @@ -11,13 +11,23 @@
> >>  # define VMOVA     vmovdqa
> >>
> >>  # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> >> -  vmovd d, %xmm0; \
> >>    movq r, %rax; \
> >> +  testl d, d; \
> >> +  jnz 1f; \
> >> +  vpxor %xmm0, %xmm0, %xmm0
> >> +
> >> +# define MEMSET_VDUP_TO_VEC0(d) \
> >> +  vmovd d, %xmm0; \
> >>    vpbroadcastb %xmm0, %ymm0
> >>
> >>  # define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> >> -  vmovd d, %xmm0; \
> >>    movq r, %rax; \
> >> +  testl d, d; \
> >> +  jnz 1f; \
> >> +  vpxor %xmm0, %xmm0, %xmm0
> >> +
> >> +# define WMEMSET_VDUP_TO_VEC0(d) \
> >> +  vmovd d, %xmm0; \
> >>    vpbroadcastd %xmm0, %ymm0
> >>
> >>  # ifndef SECTION
> >> diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> >> index f14d6f8493..61ff9ccf6f 100644
> >> --- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> >> +++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> >> @@ -17,10 +17,20 @@
> >>
> >>  # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> >>    movq r, %rax; \
> >> +  testl d, d; \
> >> +  jnz 1f; \
> >> +  vpxorq %XMM0, %XMM0, %XMM0
> >> +
> >> +# define MEMSET_VDUP_TO_VEC0(d) \
> >>    vpbroadcastb d, %VEC0
> >>
> >>  # define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> >>    movq r, %rax; \
> >> +  testl d, d; \
> >> +  jnz 1f; \
> >> +  vpxorq %XMM0, %XMM0, %XMM0
> >> +
> >> +# define WMEMSET_VDUP_TO_VEC0(d) \
> >>    vpbroadcastd d, %VEC0
> >>
> >>  # define SECTION(p)               p##.evex512
> >> diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> >> index 64b09e77cc..85544fb0fc 100644
> >> --- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> >> +++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> >> @@ -17,10 +17,20 @@
> >>
> >>  # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> >>    movq r, %rax; \
> >> +  testl d, d; \
> >> +  jnz 1f; \
> >> +  vpxorq %XMM0, %XMM0, %XMM0
> >> +
> >> +# define MEMSET_VDUP_TO_VEC0(d) \
> >>    vpbroadcastb d, %VEC0
> >>
> >>  # define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> >>    movq r, %rax; \
> >> +  testl d, d; \
> >> +  jnz 1f; \
> >> +  vpxorq %XMM0, %XMM0, %XMM0
> >> +
> >> +# define WMEMSET_VDUP_TO_VEC0(d) \
> >>    vpbroadcastd d, %VEC0
> >>
> >>  # define SECTION(p)               p##.evex
> >> diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> >> index e723413a66..4ca34a19ba 100644
> >> --- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> >> +++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> >> @@ -112,6 +112,9 @@ ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
> >>    shl $2, %RDX_LP
> >>    WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
> >>    jmp L(entry_from_bzero)
> >> +1:
> >> +  WMEMSET_VDUP_TO_VEC0 (%esi)
> >> +  jmp L(entry_from_bzero)
> >>  END (WMEMSET_SYMBOL (__wmemset, unaligned))
> >>  #endif
> >>
> >> @@ -124,6 +127,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
> >>
> >>  ENTRY (MEMSET_SYMBOL (__memset, unaligned))
> >>    MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
> >> +2:
> >>  # ifdef __ILP32__
> >>    /* Clear the upper 32 bits.  */
> >>    mov %edx, %edx
> >> @@ -137,6 +141,10 @@ L(entry_from_bzero):
> >>    VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx)
> >>    VMOVU %VEC(0), (%rdi)
> >>    VZEROUPPER_RETURN
> >> +
> >> +1:
> >> +  MEMSET_VDUP_TO_VEC0 (%esi)
> >> +  jmp 2b
> >>  #if defined USE_MULTIARCH && IS_IN (libc)
> >>  END (MEMSET_SYMBOL (__memset, unaligned))
> >>
> >> @@ -180,6 +188,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
> >>
> >>  ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
> >>    MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
> >> +2:
> >>  # ifdef __ILP32__
> >>    /* Clear the upper 32 bits.  */
> >>    mov %edx, %edx
> >> @@ -193,6 +202,10 @@ ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
> >>    VMOVU %VEC(0), (%rax)
> >>    VMOVU %VEC(0), -VEC_SIZE(%rax, %rdx)
> >>    VZEROUPPER_RETURN
> >> +
> >> +1:
> >> +  MEMSET_VDUP_TO_VEC0 (%esi)
> >> +  jmp 2b
> >>  #endif
> >>
> >>    .p2align 4,, 10
> >> --
> >> 2.33.1
> >>
>
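
At the C level, the dispatch the patch adds in the
MEMSET_VDUP_TO_VEC0_AND_SET_RETURN macros corresponds roughly to the
following (a hypothetical sketch using AVX2 intrinsics, not the actual
glibc code; the function name is made up for illustration):

    #include <immintrin.h>

    /* Build the fill vector for memset.  The zero path skips the
       vmovd + vpbroadcastb sequence: _mm256_setzero_si256 typically
       compiles to a single vpxor, which is the PXOR/VPXOR fast path
       from the patch.  The test is well-predicted for the common
       memset (p, 0, n) case, so it is close to free.  */
    static inline __m256i
    memset_fill_vec (int c)
    {
      if (c == 0)
        return _mm256_setzero_si256 ();     /* vpxor %xmm0, %xmm0, %xmm0 */
      return _mm256_set1_epi8 ((char) c);   /* vmovd + vpbroadcastb */
    }

Whether that saving shows up outside a microbenchmark is exactly the
question above: the broadcast only matters if it is still on the
critical path for small, non-constant-length calls.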