From mboxrd@z Thu Jan 1 00:00:00 1970
From: Noah Goldstein
Date: Wed, 9 Feb 2022 16:14:02 -0600
Subject: Re: [PATCH v2] x86-64: Optimize bzero
To: Adhemerval Zanella
Cc: "H.J. Lu", GNU C Library, Wilco Dijkstra
References: <20220208224319.40271-1-hjl.tools@gmail.com>
List-Id: Libc-alpha mailing list

On Wed, Feb 9, 2022 at 5:41 AM Adhemerval Zanella via Libc-alpha wrote:
>
>
>
> On 08/02/2022 19:43, H.J. Lu via Libc-alpha wrote:
> > Rebase against the current master branch.
> >
> > --
> > memset with zero as the value to set is by far the majority value (99%+
> > for Python3 and GCC).
> >
> > bzero can be slightly more optimized for this case by using a zero-idiom
> > xor for broadcasting the set value to a register (vector or GPR).
> >
> > Co-developed-by: Noah Goldstein
>
> Is it really worth it to resurrect bzero with these multiple ifunc variants?
> Would Python3/GCC or any programs start to replace memset with bzero for
> the sake of this optimization?
>
> I agree with Wilco that the gains are marginal in this case.

The cost is only one cache line, and it doesn't interfere with memset at
all, so it's unlikely to cause any problems.

The saving is in the lane-cross broadcast, which is on the critical path
for memsets in [VEC_SIZE, 2 * VEC_SIZE] (think 32-64 bytes). As well, for
EVEX + AVX512, because they use predicated execution for [0, VEC_SIZE],
there is a slight benefit there (although only in throughput, because the
critical path through mask construction is longer than the lane VEC setup).
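To make the zero-idiom and mask-store points concrete, here is a rough
sketch in C intrinsics rather than the actual glibc assembly; the function
names (fill32, zero32, zero_upto_32) are invented for illustration, and it
assumes a machine with AVX2, AVX512VL/AVX512BW and BMI2, built with
something like gcc -O2 -mavx2 -mavx512vl -mavx512bw -mbmi2:

#include <immintrin.h>
#include <stddef.h>

static void
fill32 (void *p, int c)
{
  /* General memset path: the run-time byte has to be broadcast across the
     vector (typically vmovd + vpbroadcastb), and that broadcast sits on
     the critical path before the first store can issue.  */
  __m256i v = _mm256_set1_epi8 ((char) c);
  _mm256_storeu_si256 ((__m256i *) p, v);
}

static void
zero32 (void *p)
{
  /* bzero path: xor-ing a register with itself is a zeroing idiom with no
     input dependency and no lane-cross shuffle.  */
  _mm256_storeu_si256 ((__m256i *) p, _mm256_setzero_si256 ());
}

static void
zero_upto_32 (void *p, size_t n)
{
  /* EVEX/AVX512-style small-size path: build a byte mask from the length
     with bzhi and do a single predicated store.  Assumes n <= 32.  */
  __mmask32 k = (__mmask32) _bzhi_u32 (~0U, (unsigned int) n);
  _mm256_mask_storeu_epi8 (p, k, _mm256_setzero_si256 ());
}

The xor zeroing idiom is recognized at register rename on recent x86 cores,
so the zero case never waits on the vmovd + vpbroadcastb pair; that is the
latency the bzero entry points shave off in the [VEC_SIZE, 2 * VEC_SIZE]
range, while the masked store covers [0, VEC_SIZE] without a branch.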
Agreed it's not clear if it's worth it to start replacing memset calls with
bzero calls, but at the very least this will improve existing code that uses
bzero.

>
> > ---
> >  sysdeps/x86_64/memset.S | 8 ++
> >  sysdeps/x86_64/multiarch/Makefile | 1 +
> >  sysdeps/x86_64/multiarch/bzero.c | 106 +++++++++++++++++
> >  sysdeps/x86_64/multiarch/ifunc-impl-list.c | 42 +++++++
> >  .../memset-avx2-unaligned-erms-rtm.S | 1 +
> >  .../multiarch/memset-avx2-unaligned-erms.S | 6 +
> >  .../multiarch/memset-avx512-unaligned-erms.S | 3 +
> >  .../multiarch/memset-evex-unaligned-erms.S | 3 +
> >  .../multiarch/memset-sse2-unaligned-erms.S | 1 +
> >  .../multiarch/memset-vec-unaligned-erms.S | 110 ++++++++++++++----
> >  10 files changed, 256 insertions(+), 25 deletions(-)
> >  create mode 100644 sysdeps/x86_64/multiarch/bzero.c
> >
> > diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
> > index 3f0517bbfc..af26e9cedc 100644
> > --- a/sysdeps/x86_64/memset.S
> > +++ b/sysdeps/x86_64/memset.S
> > @@ -35,6 +35,9 @@
> >    punpcklwd %xmm0, %xmm0; \
> >    pshufd $0, %xmm0, %xmm0
> >
> > +# define BZERO_ZERO_VEC0() \
> > +  pxor %xmm0, %xmm0
> > +
> >  # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> >    movd d, %xmm0; \
> >    pshufd $0, %xmm0, %xmm0; \
> > @@ -53,6 +56,10 @@
> >  # define MEMSET_SYMBOL(p,s) memset
> >  #endif
> >
> > +#ifndef BZERO_SYMBOL
> > +# define BZERO_SYMBOL(p,s) __bzero
> > +#endif
> > +
> >  #ifndef WMEMSET_SYMBOL
> >  # define WMEMSET_CHK_SYMBOL(p,s) p
> >  # define WMEMSET_SYMBOL(p,s) __wmemset
> > @@ -63,6 +70,7 @@
> >  libc_hidden_builtin_def (memset)
> >
> >  #if IS_IN (libc)
> > +weak_alias (__bzero, bzero)
> >  libc_hidden_def (__wmemset)
> >  weak_alias (__wmemset, wmemset)
> >  libc_hidden_weak (wmemset)
> > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> > index 4274bfdd0d..e7b413edad 100644
> > --- a/sysdeps/x86_64/multiarch/Makefile
> > +++ b/sysdeps/x86_64/multiarch/Makefile
> > @@ -1,6 +1,7 @@
> >  ifeq ($(subdir),string)
> >
> >  sysdep_routines += \
> > +  bzero \
> >    memchr-avx2 \
> >    memchr-avx2-rtm \
> >    memchr-evex \
> > diff --git a/sysdeps/x86_64/multiarch/bzero.c b/sysdeps/x86_64/multiarch/bzero.c
> > new file mode 100644
> > index 0000000000..58a14b2c33
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/bzero.c
> > @@ -0,0 +1,106 @@
> > +/* Multiple versions of bzero.
> > +   All versions must be listed in ifunc-impl-list.c.
> > +   Copyright (C) 2022 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   .  */
> > +
> > +/* Define multiple versions only for the definition in libc.  */
> > +#if IS_IN (libc)
> > +# define __bzero __redirect___bzero
> > +# include
> > +# undef __bzero
> > +
> > +# define SYMBOL_NAME __bzero
> > +# include
> > +
> > +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (sse2_unaligned)
> > +  attribute_hidden;
> > +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (sse2_unaligned_erms)
> > +  attribute_hidden;
> > +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned) attribute_hidden;
> > +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_erms)
> > +  attribute_hidden;
> > +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_rtm)
> > +  attribute_hidden;
> > +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_erms_rtm)
> > +  attribute_hidden;
> > +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (evex_unaligned)
> > +  attribute_hidden;
> > +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (evex_unaligned_erms)
> > +  attribute_hidden;
> > +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx512_unaligned)
> > +  attribute_hidden;
> > +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx512_unaligned_erms)
> > +  attribute_hidden;
> > +
> > +static inline void *
> > +IFUNC_SELECTOR (void)
> > +{
> > +  const struct cpu_features* cpu_features = __get_cpu_features ();
> > +
> > +  if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
> > +      && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
> > +    {
> > +      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
> > +          && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
> > +          && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
> > +        {
> > +          if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> > +            return OPTIMIZE1 (avx512_unaligned_erms);
> > +
> > +          return OPTIMIZE1 (avx512_unaligned);
> > +        }
> > +    }
> > +
> > +  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2))
> > +    {
> > +      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
> > +          && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
> > +          && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
> > +        {
> > +          if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> > +            return OPTIMIZE1 (evex_unaligned_erms);
> > +
> > +          return OPTIMIZE1 (evex_unaligned);
> > +        }
> > +
> > +      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
> > +        {
> > +          if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> > +            return OPTIMIZE1 (avx2_unaligned_erms_rtm);
> > +
> > +          return OPTIMIZE1 (avx2_unaligned_rtm);
> > +        }
> > +
> > +      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
> > +        {
> > +          if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> > +            return OPTIMIZE1 (avx2_unaligned_erms);
> > +
> > +          return OPTIMIZE1 (avx2_unaligned);
> > +        }
> > +    }
> > +
> > +  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> > +    return OPTIMIZE1 (sse2_unaligned_erms);
> > +
> > +  return OPTIMIZE1 (sse2_unaligned);
> > +}
> > +
> > +libc_ifunc_redirected (__redirect___bzero, __bzero, IFUNC_SELECTOR ());
> > +
> > +weak_alias (__bzero, bzero)
> > +#endif
> > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > index 68a56797d4..a594f4176e 100644
> > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > @@ -300,6 +300,48 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> >                        __memset_avx512_no_vzeroupper)
> >               )
> >
> > +  /* Support sysdeps/x86_64/multiarch/bzero.c.  */
> > +  IFUNC_IMPL (i, name, bzero,
> > +              IFUNC_IMPL_ADD (array, i, bzero, 1,
> > +                              __bzero_sse2_unaligned)
> > +              IFUNC_IMPL_ADD (array, i, bzero, 1,
> > +                              __bzero_sse2_unaligned_erms)
> > +              IFUNC_IMPL_ADD (array, i, bzero,
> > +                              CPU_FEATURE_USABLE (AVX2),
> > +                              __bzero_avx2_unaligned)
> > +              IFUNC_IMPL_ADD (array, i, bzero,
> > +                              CPU_FEATURE_USABLE (AVX2),
> > +                              __bzero_avx2_unaligned_erms)
> > +              IFUNC_IMPL_ADD (array, i, bzero,
> > +                              (CPU_FEATURE_USABLE (AVX2)
> > +                               && CPU_FEATURE_USABLE (RTM)),
> > +                              __bzero_avx2_unaligned_rtm)
> > +              IFUNC_IMPL_ADD (array, i, bzero,
> > +                              (CPU_FEATURE_USABLE (AVX2)
> > +                               && CPU_FEATURE_USABLE (RTM)),
> > +                              __bzero_avx2_unaligned_erms_rtm)
> > +              IFUNC_IMPL_ADD (array, i, bzero,
> > +                              (CPU_FEATURE_USABLE (AVX512VL)
> > +                               && CPU_FEATURE_USABLE (AVX512BW)
> > +                               && CPU_FEATURE_USABLE (BMI2)),
> > +                              __bzero_evex_unaligned)
> > +              IFUNC_IMPL_ADD (array, i, bzero,
> > +                              (CPU_FEATURE_USABLE (AVX512VL)
> > +                               && CPU_FEATURE_USABLE (AVX512BW)
> > +                               && CPU_FEATURE_USABLE (BMI2)),
> > +                              __bzero_evex_unaligned_erms)
> > +              IFUNC_IMPL_ADD (array, i, bzero,
> > +                              (CPU_FEATURE_USABLE (AVX512VL)
> > +                               && CPU_FEATURE_USABLE (AVX512BW)
> > +                               && CPU_FEATURE_USABLE (BMI2)),
> > +                              __bzero_avx512_unaligned_erms)
> > +              IFUNC_IMPL_ADD (array, i, bzero,
> > +                              (CPU_FEATURE_USABLE (AVX512VL)
> > +                               && CPU_FEATURE_USABLE (AVX512BW)
> > +                               && CPU_FEATURE_USABLE (BMI2)),
> > +                              __bzero_avx512_unaligned)
> > +             )
> > +
> >    /* Support sysdeps/x86_64/multiarch/rawmemchr.c.  */
> >    IFUNC_IMPL (i, name, rawmemchr,
> >                IFUNC_IMPL_ADD (array, i, rawmemchr,
> > diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
> > index 8ac3e479bb..5a5ee6f672 100644
> > --- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
> > +++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
> > @@ -5,6 +5,7 @@
> >
> >  #define SECTION(p) p##.avx.rtm
> >  #define MEMSET_SYMBOL(p,s) p##_avx2_##s##_rtm
> > +#define BZERO_SYMBOL(p,s) p##_avx2_##s##_rtm
> >  #define WMEMSET_SYMBOL(p,s) p##_avx2_##s##_rtm
> >
> >  #include "memset-avx2-unaligned-erms.S"
> > diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> > index c0bf2875d0..a093a2831f 100644
> > --- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> > +++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> > @@ -14,6 +14,9 @@
> >    vmovd d, %xmm0; \
> >    movq r, %rax;
> >
> > +# define BZERO_ZERO_VEC0() \
> > +  vpxor %xmm0, %xmm0, %xmm0
> > +
> >  # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> >    MEMSET_SET_VEC0_AND_SET_RETURN(d, r)
> >
> > @@ -29,6 +32,9 @@
> >  # ifndef MEMSET_SYMBOL
> >  #  define MEMSET_SYMBOL(p,s) p##_avx2_##s
> >  # endif
> > +# ifndef BZERO_SYMBOL
> > +#  define BZERO_SYMBOL(p,s) p##_avx2_##s
> > +# endif
> >  # ifndef WMEMSET_SYMBOL
> >  #  define WMEMSET_SYMBOL(p,s) p##_avx2_##s
> >  # endif
> > diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> > index 5241216a77..727c92133a 100644
> > --- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> > +++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> > @@ -19,6 +19,9 @@
> >    vpbroadcastb d, %VEC0; \
> >    movq r, %rax
> >
> > +# define BZERO_ZERO_VEC0() \
> > +  vpxorq %XMM0, %XMM0, %XMM0
> > +
> >  # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> >    vpbroadcastd d, %VEC0; \
> >    movq r, %rax
> > diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> > index 6370021506..5d8fa78f05 100644
> > --- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> > +++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> > @@ -19,6 +19,9 @@
> >    vpbroadcastb d, %VEC0; \
> >    movq r, %rax
> >
> > +# define BZERO_ZERO_VEC0() \
> > +  vpxorq %XMM0, %XMM0, %XMM0
> > +
> >  # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> >    vpbroadcastd d, %VEC0; \
> >    movq r, %rax
> > diff --git a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
> > index 8a6f0c561a..329c58ee46 100644
> > --- a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
> > +++ b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
> > @@ -22,6 +22,7 @@
> >
> >  #if IS_IN (libc)
> >  # define MEMSET_SYMBOL(p,s) p##_sse2_##s
> > +# define BZERO_SYMBOL(p,s) MEMSET_SYMBOL (p, s)
> >  # define WMEMSET_SYMBOL(p,s) p##_sse2_##s
> >
> >  # ifdef SHARED
> > diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> > index 1b502b78e4..7c94fcdae1 100644
> > --- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> > +++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> > @@ -26,6 +26,10 @@
> >
> >  #include
> >
> > +#ifndef BZERO_SYMBOL
> > +# define BZERO_SYMBOL(p,s) MEMSET_SYMBOL (p, s)
> > +#endif
> > +
> >  #ifndef MEMSET_CHK_SYMBOL
> >  # define MEMSET_CHK_SYMBOL(p,s) MEMSET_SYMBOL(p, s)
> >  #endif
> > @@ -87,6 +91,18 @@
> >  # define XMM_SMALL 0
> >  #endif
> >
> > +#ifdef USE_LESS_VEC_MASK_STORE
> > +# define SET_REG64 rcx
> > +# define SET_REG32 ecx
> > +# define SET_REG16 cx
> > +# define SET_REG8 cl
> > +#else
> > +# define SET_REG64 rsi
> > +# define SET_REG32 esi
> > +# define SET_REG16 si
> > +# define SET_REG8 sil
> > +#endif
> > +
> >  #define PAGE_SIZE 4096
> >
> >  /* Macro to calculate size of small memset block for aligning
> > @@ -96,18 +112,6 @@
> >
> >  #ifndef SECTION
> >  # error SECTION is not defined!
> > -#endif
> > -
> > -  .section SECTION(.text),"ax",@progbits
> > -#if VEC_SIZE == 16 && IS_IN (libc)
> > -ENTRY (__bzero)
> > -  mov %RDI_LP, %RAX_LP /* Set return value. */
> > -  mov %RSI_LP, %RDX_LP /* Set n. */
> > -  xorl %esi, %esi
> > -  pxor %XMM0, %XMM0
> > -  jmp L(entry_from_bzero)
> > -END (__bzero)
> > -weak_alias (__bzero, bzero)
> >  #endif
> >
> >  #if IS_IN (libc)
> > @@ -123,12 +127,37 @@ ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
> >    WMEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
> >    WMEMSET_VDUP_TO_VEC0_LOW()
> >    cmpq $VEC_SIZE, %rdx
> > -  jb L(less_vec_no_vdup)
> > +  jb L(less_vec_from_wmemset)
> >    WMEMSET_VDUP_TO_VEC0_HIGH()
> >    jmp L(entry_from_wmemset)
> >  END (WMEMSET_SYMBOL (__wmemset, unaligned))
> >  #endif
> >
> > +ENTRY (BZERO_SYMBOL(__bzero, unaligned))
> > +#if VEC_SIZE > 16
> > +  BZERO_ZERO_VEC0 ()
> > +#endif
> > +  mov %RDI_LP, %RAX_LP
> > +  mov %RSI_LP, %RDX_LP
> > +#ifndef USE_LESS_VEC_MASK_STORE
> > +  xorl %esi, %esi
> > +#endif
> > +  cmp $VEC_SIZE, %RDX_LP
> > +  jb L(less_vec_no_vdup)
> > +#ifdef USE_LESS_VEC_MASK_STORE
> > +  xorl %esi, %esi
> > +#endif
> > +#if VEC_SIZE <= 16
> > +  BZERO_ZERO_VEC0 ()
> > +#endif
> > +  cmp $(VEC_SIZE * 2), %RDX_LP
> > +  ja L(more_2x_vec)
> > +  /* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
> > +  VMOVU %VEC(0), (%rdi)
> > +  VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
> > +  VZEROUPPER_RETURN
> > +END (BZERO_SYMBOL(__bzero, unaligned))
> > +
> >  #if defined SHARED && IS_IN (libc)
> >  ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
> >    cmp %RDX_LP, %RCX_LP
> > @@ -142,7 +171,6 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned))
> >    /* Clear the upper 32 bits.  */
> >    mov %edx, %edx
> >  # endif
> > -L(entry_from_bzero):
> >    cmpq $VEC_SIZE, %rdx
> >    jb L(less_vec)
> >    MEMSET_VDUP_TO_VEC0_HIGH()
> > @@ -187,6 +215,31 @@ END (__memset_erms)
> >  END (MEMSET_SYMBOL (__memset, erms))
> >  # endif
> >
> > +ENTRY_P2ALIGN (BZERO_SYMBOL(__bzero, unaligned_erms), 6)
> > +# if VEC_SIZE > 16
> > +  BZERO_ZERO_VEC0 ()
> > +# endif
> > +  mov %RDI_LP, %RAX_LP
> > +  mov %RSI_LP, %RDX_LP
> > +# ifndef USE_LESS_VEC_MASK_STORE
> > +  xorl %esi, %esi
> > +# endif
> > +  cmp $VEC_SIZE, %RDX_LP
> > +  jb L(less_vec_no_vdup)
> > +# ifdef USE_LESS_VEC_MASK_STORE
> > +  xorl %esi, %esi
> > +# endif
> > +# if VEC_SIZE <= 16
> > +  BZERO_ZERO_VEC0 ()
> > +# endif
> > +  cmp $(VEC_SIZE * 2), %RDX_LP
> > +  ja L(stosb_more_2x_vec)
> > +  /* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
> > +  VMOVU %VEC(0), (%rdi)
> > +  VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
> > +  VZEROUPPER_RETURN
> > +END (BZERO_SYMBOL(__bzero, unaligned_erms))
> > +
> >  # if defined SHARED && IS_IN (libc)
> >  ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
> >    cmp %RDX_LP, %RCX_LP
> > @@ -229,6 +282,7 @@ L(last_2x_vec):
> >    .p2align 4,, 10
> >  L(less_vec):
> >  L(less_vec_no_vdup):
> > +L(less_vec_from_wmemset):
> >    /* Less than 1 VEC.  */
> >  # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
> >  # error Unsupported VEC_SIZE!
> > @@ -374,8 +428,11 @@ L(less_vec):
> >    /* Broadcast esi to partial register (i.e VEC_SIZE == 32 broadcast to
> >       xmm). This is only does anything for AVX2.  */
> >    MEMSET_VDUP_TO_VEC0_LOW ()
> > +L(less_vec_from_wmemset):
> > +#if VEC_SIZE > 16
> >  L(less_vec_no_vdup):
> >  #endif
> > +#endif
> >  L(cross_page):
> >  #if VEC_SIZE > 32
> >    cmpl $32, %edx
> > @@ -386,7 +443,10 @@ L(cross_page):
> >    jge L(between_16_31)
> >  #endif
> >  #ifndef USE_XMM_LESS_VEC
> > -  MOVQ %XMM0, %rcx
> > +  MOVQ %XMM0, %SET_REG64
> > +#endif
> > +#if VEC_SIZE <= 16
> > +L(less_vec_no_vdup):
> >  #endif
> >    cmpl $8, %edx
> >    jge L(between_8_15)
> > @@ -395,7 +455,7 @@
> >    cmpl $1, %edx
> >    jg L(between_2_3)
> >    jl L(between_0_0)
> > -  movb %sil, (%LESS_VEC_REG)
> > +  movb %SET_REG8, (%LESS_VEC_REG)
> >  L(between_0_0):
> >    ret
> >
> > @@ -428,8 +488,8 @@ L(between_8_15):
> >    MOVQ %XMM0, (%rdi)
> >    MOVQ %XMM0, -8(%rdi, %rdx)
> >  #else
> > -  movq %rcx, (%LESS_VEC_REG)
> > -  movq %rcx, -8(%LESS_VEC_REG, %rdx)
> > +  movq %SET_REG64, (%LESS_VEC_REG)
> > +  movq %SET_REG64, -8(%LESS_VEC_REG, %rdx)
> >  #endif
> >    ret
> >
> > @@ -442,8 +502,8 @@ L(between_4_7):
> >    MOVD %XMM0, (%rdi)
> >    MOVD %XMM0, -4(%rdi, %rdx)
> >  #else
> > -  movl %ecx, (%LESS_VEC_REG)
> > -  movl %ecx, -4(%LESS_VEC_REG, %rdx)
> > +  movl %SET_REG32, (%LESS_VEC_REG)
> > +  movl %SET_REG32, -4(%LESS_VEC_REG, %rdx)
> >  #endif
> >    ret
> >
> > @@ -452,12 +512,12 @@ L(between_4_7):
> >  L(between_2_3):
> >    /* From 2 to 3.  No branch when size == 2.  */
> >  #ifdef USE_XMM_LESS_VEC
> > -  movb %sil, (%rdi)
> > -  movb %sil, 1(%rdi)
> > -  movb %sil, -1(%rdi, %rdx)
> > +  movb %SET_REG8, (%rdi)
> > +  movb %SET_REG8, 1(%rdi)
> > +  movb %SET_REG8, -1(%rdi, %rdx)
> >  #else
> > -  movw %cx, (%LESS_VEC_REG)
> > -  movb %sil, -1(%LESS_VEC_REG, %rdx)
> > +  movw %SET_REG16, (%LESS_VEC_REG)
> > +  movb %SET_REG8, -1(%LESS_VEC_REG, %rdx)
> >  #endif
> >    ret
> >  END (MEMSET_SYMBOL (__memset, unaligned_erms))