From: Noah Goldstein <goldstein.w.n@gmail.com>
To: "H.J. Lu" <hjl.tools@gmail.com>
Cc: GNU C Library <libc-alpha@sourceware.org>
Subject: Re: [PATCH v2] x86-64: Optimize bzero
Date: Tue, 8 Feb 2022 17:56:31 -0600
Message-ID: <CAFUsyfJvBmS6fYhV08-+rh7WOJ7Cdk8SXR64uJUXJsSZ9E=14w@mail.gmail.com>
In-Reply-To: <20220208224319.40271-1-hjl.tools@gmail.com>
On Tue, Feb 8, 2022 at 4:43 PM H.J. Lu <hjl.tools@gmail.com> wrote:
>
> Rebase against the current master branch.
>
> --
> Zero is by far the most common value passed to memset (99%+ of calls
> for both Python3 and GCC).
>
> bzero can be optimized slightly for this case by using a zero-idiom
> xor to broadcast the set value to a register (vector or GPR).
>
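The zero-idiom point is worth spelling out for anyone skimming.  The
generic memset entry has to splat the byte from %esi before it can
store anything, while the zero case collapses to a single
dependency-breaking instruction.  Roughly, for the SSE2 variant
(mirroring the macros touched below):

	/* memset: broadcast the set value from %esi.  */
	movd	%esi, %xmm0
	punpcklbw %xmm0, %xmm0
	punpcklwd %xmm0, %xmm0
	pshufd	$0, %xmm0, %xmm0

	/* bzero: zero idiom; no input dependency, and recognized
	   at register rename on most modern cores.  */
	pxor	%xmm0, %xmm0
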
> Co-developed-by: Noah Goldstein <goldstein.w.n@gmail.com>
> ---
> sysdeps/x86_64/memset.S | 8 ++
> sysdeps/x86_64/multiarch/Makefile | 1 +
> sysdeps/x86_64/multiarch/bzero.c | 106 +++++++++++++++++
> sysdeps/x86_64/multiarch/ifunc-impl-list.c | 42 +++++++
> .../memset-avx2-unaligned-erms-rtm.S | 1 +
> .../multiarch/memset-avx2-unaligned-erms.S | 6 +
> .../multiarch/memset-avx512-unaligned-erms.S | 3 +
> .../multiarch/memset-evex-unaligned-erms.S | 3 +
> .../multiarch/memset-sse2-unaligned-erms.S | 1 +
> .../multiarch/memset-vec-unaligned-erms.S | 110 ++++++++++++++----
> 10 files changed, 256 insertions(+), 25 deletions(-)
> create mode 100644 sysdeps/x86_64/multiarch/bzero.c
>
> diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
> index 3f0517bbfc..af26e9cedc 100644
> --- a/sysdeps/x86_64/memset.S
> +++ b/sysdeps/x86_64/memset.S
> @@ -35,6 +35,9 @@
> punpcklwd %xmm0, %xmm0; \
> pshufd $0, %xmm0, %xmm0
>
> +# define BZERO_ZERO_VEC0() \
> + pxor %xmm0, %xmm0
> +
> # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> movd d, %xmm0; \
> pshufd $0, %xmm0, %xmm0; \
> @@ -53,6 +56,10 @@
> # define MEMSET_SYMBOL(p,s) memset
> #endif
>
> +#ifndef BZERO_SYMBOL
> +# define BZERO_SYMBOL(p,s) __bzero
> +#endif
> +
> #ifndef WMEMSET_SYMBOL
> # define WMEMSET_CHK_SYMBOL(p,s) p
> # define WMEMSET_SYMBOL(p,s) __wmemset
> @@ -63,6 +70,7 @@
> libc_hidden_builtin_def (memset)
>
> #if IS_IN (libc)
> +weak_alias (__bzero, bzero)
> libc_hidden_def (__wmemset)
> weak_alias (__wmemset, wmemset)
> libc_hidden_weak (wmemset)
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index 4274bfdd0d..e7b413edad 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -1,6 +1,7 @@
> ifeq ($(subdir),string)
>
> sysdep_routines += \
> + bzero \
> memchr-avx2 \
> memchr-avx2-rtm \
> memchr-evex \
> diff --git a/sysdeps/x86_64/multiarch/bzero.c b/sysdeps/x86_64/multiarch/bzero.c
> new file mode 100644
> index 0000000000..58a14b2c33
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/bzero.c
> @@ -0,0 +1,106 @@
> +/* Multiple versions of bzero.
> + All versions must be listed in ifunc-impl-list.c.
> + Copyright (C) 2022 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +/* Define multiple versions only for the definition in libc. */
> +#if IS_IN (libc)
> +# define __bzero __redirect___bzero
> +# include <string.h>
> +# undef __bzero
> +
> +# define SYMBOL_NAME __bzero
> +# include <init-arch.h>
> +
> +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (sse2_unaligned)
> + attribute_hidden;
> +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (sse2_unaligned_erms)
> + attribute_hidden;
> +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned) attribute_hidden;
> +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_erms)
> + attribute_hidden;
> +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_rtm)
> + attribute_hidden;
> +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_erms_rtm)
> + attribute_hidden;
> +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (evex_unaligned)
> + attribute_hidden;
> +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (evex_unaligned_erms)
> + attribute_hidden;
> +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx512_unaligned)
> + attribute_hidden;
> +extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx512_unaligned_erms)
> + attribute_hidden;
> +
> +static inline void *
> +IFUNC_SELECTOR (void)
> +{
> + const struct cpu_features* cpu_features = __get_cpu_features ();
> +
> + if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
> + && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
> + {
> + if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
> + && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
> + && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
> + {
> + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> + return OPTIMIZE1 (avx512_unaligned_erms);
> +
> + return OPTIMIZE1 (avx512_unaligned);
> + }
> + }
> +
> + if (CPU_FEATURE_USABLE_P (cpu_features, AVX2))
> + {
> + if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
> + && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
> + && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
> + {
> + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> + return OPTIMIZE1 (evex_unaligned_erms);
> +
> + return OPTIMIZE1 (evex_unaligned);
> + }
> +
> + if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
> + {
> + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> + return OPTIMIZE1 (avx2_unaligned_erms_rtm);
> +
> + return OPTIMIZE1 (avx2_unaligned_rtm);
> + }
> +
> + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
> + {
> + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> + return OPTIMIZE1 (avx2_unaligned_erms);
> +
> + return OPTIMIZE1 (avx2_unaligned);
> + }
> + }
> +
> + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> + return OPTIMIZE1 (sse2_unaligned_erms);
> +
> + return OPTIMIZE1 (sse2_unaligned);
> +}
> +
> +libc_ifunc_redirected (__redirect___bzero, __bzero, IFUNC_SELECTOR ());
> +
> +weak_alias (__bzero, bzero)
> +#endif
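
This looks like a direct mirror of memset's selector, which is what
you want.  For readers unfamiliar with the ifunc machinery:
libc_ifunc_redirected runs IFUNC_SELECTOR once, when the symbol is
resolved, so there is no per-call dispatch cost.  A rough sketch of
the net effect (the variant named is illustrative; which one you get
depends on the CPU checks above):

	#include <strings.h>

	char buf[256];
	/* bzero binds once to one concrete implementation, e.g.
	   __bzero_evex_unaligned_erms; later calls jump straight
	   to it.  */
	bzero (buf, sizeof buf);
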
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index 68a56797d4..a594f4176e 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -300,6 +300,48 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> __memset_avx512_no_vzeroupper)
> )
>
> + /* Support sysdeps/x86_64/multiarch/bzero.c. */
> + IFUNC_IMPL (i, name, bzero,
> + IFUNC_IMPL_ADD (array, i, bzero, 1,
> + __bzero_sse2_unaligned)
> + IFUNC_IMPL_ADD (array, i, bzero, 1,
> + __bzero_sse2_unaligned_erms)
> + IFUNC_IMPL_ADD (array, i, bzero,
> + CPU_FEATURE_USABLE (AVX2),
> + __bzero_avx2_unaligned)
> + IFUNC_IMPL_ADD (array, i, bzero,
> + CPU_FEATURE_USABLE (AVX2),
> + __bzero_avx2_unaligned_erms)
> + IFUNC_IMPL_ADD (array, i, bzero,
> + (CPU_FEATURE_USABLE (AVX2)
> + && CPU_FEATURE_USABLE (RTM)),
> + __bzero_avx2_unaligned_rtm)
> + IFUNC_IMPL_ADD (array, i, bzero,
> + (CPU_FEATURE_USABLE (AVX2)
> + && CPU_FEATURE_USABLE (RTM)),
> + __bzero_avx2_unaligned_erms_rtm)
> + IFUNC_IMPL_ADD (array, i, bzero,
> + (CPU_FEATURE_USABLE (AVX512VL)
> + && CPU_FEATURE_USABLE (AVX512BW)
> + && CPU_FEATURE_USABLE (BMI2)),
> + __bzero_evex_unaligned)
> + IFUNC_IMPL_ADD (array, i, bzero,
> + (CPU_FEATURE_USABLE (AVX512VL)
> + && CPU_FEATURE_USABLE (AVX512BW)
> + && CPU_FEATURE_USABLE (BMI2)),
> + __bzero_evex_unaligned_erms)
> + IFUNC_IMPL_ADD (array, i, bzero,
> + (CPU_FEATURE_USABLE (AVX512VL)
> + && CPU_FEATURE_USABLE (AVX512BW)
> + && CPU_FEATURE_USABLE (BMI2)),
> + __bzero_avx512_unaligned_erms)
> + IFUNC_IMPL_ADD (array, i, bzero,
> + (CPU_FEATURE_USABLE (AVX512VL)
> + && CPU_FEATURE_USABLE (AVX512BW)
> + && CPU_FEATURE_USABLE (BMI2)),
> + __bzero_avx512_unaligned)
> + )
> +
> /* Support sysdeps/x86_64/multiarch/rawmemchr.c. */
> IFUNC_IMPL (i, name, rawmemchr,
> IFUNC_IMPL_ADD (array, i, rawmemchr,
> diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
> index 8ac3e479bb..5a5ee6f672 100644
> --- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
> +++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
> @@ -5,6 +5,7 @@
>
> #define SECTION(p) p##.avx.rtm
> #define MEMSET_SYMBOL(p,s) p##_avx2_##s##_rtm
> +#define BZERO_SYMBOL(p,s) p##_avx2_##s##_rtm
> #define WMEMSET_SYMBOL(p,s) p##_avx2_##s##_rtm
>
> #include "memset-avx2-unaligned-erms.S"
> diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> index c0bf2875d0..a093a2831f 100644
> --- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> @@ -14,6 +14,9 @@
> vmovd d, %xmm0; \
> movq r, %rax;
>
> +# define BZERO_ZERO_VEC0() \
> + vpxor %xmm0, %xmm0, %xmm0
> +
> # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> MEMSET_SET_VEC0_AND_SET_RETURN(d, r)
>
> @@ -29,6 +32,9 @@
> # ifndef MEMSET_SYMBOL
> # define MEMSET_SYMBOL(p,s) p##_avx2_##s
> # endif
> +# ifndef BZERO_SYMBOL
> +# define BZERO_SYMBOL(p,s) p##_avx2_##s
> +# endif
> # ifndef WMEMSET_SYMBOL
> # define WMEMSET_SYMBOL(p,s) p##_avx2_##s
> # endif
> diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> index 5241216a77..727c92133a 100644
> --- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> @@ -19,6 +19,9 @@
> vpbroadcastb d, %VEC0; \
> movq r, %rax
>
> +# define BZERO_ZERO_VEC0() \
> + vpxorq %XMM0, %XMM0, %XMM0
> +
> # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> vpbroadcastd d, %VEC0; \
> movq r, %rax
> diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> index 6370021506..5d8fa78f05 100644
> --- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> @@ -19,6 +19,9 @@
> vpbroadcastb d, %VEC0; \
> movq r, %rax
>
> +# define BZERO_ZERO_VEC0() \
> + vpxorq %XMM0, %XMM0, %XMM0
> +
> # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> vpbroadcastd d, %VEC0; \
> movq r, %rax
> diff --git a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
> index 8a6f0c561a..329c58ee46 100644
> --- a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
> @@ -22,6 +22,7 @@
>
> #if IS_IN (libc)
> # define MEMSET_SYMBOL(p,s) p##_sse2_##s
> +# define BZERO_SYMBOL(p,s) MEMSET_SYMBOL (p, s)
> # define WMEMSET_SYMBOL(p,s) p##_sse2_##s
>
> # ifdef SHARED
> diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> index 1b502b78e4..7c94fcdae1 100644
> --- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> +++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> @@ -26,6 +26,10 @@
>
> #include <sysdep.h>
>
> +#ifndef BZERO_SYMBOL
> +# define BZERO_SYMBOL(p,s) MEMSET_SYMBOL (p, s)
> +#endif
> +
> #ifndef MEMSET_CHK_SYMBOL
> # define MEMSET_CHK_SYMBOL(p,s) MEMSET_SYMBOL(p, s)
> #endif
> @@ -87,6 +91,18 @@
> # define XMM_SMALL 0
> #endif
>
> +#ifdef USE_LESS_VEC_MASK_STORE
> +# define SET_REG64 rcx
> +# define SET_REG32 ecx
> +# define SET_REG16 cx
> +# define SET_REG8 cl
> +#else
> +# define SET_REG64 rsi
> +# define SET_REG32 esi
> +# define SET_REG16 si
> +# define SET_REG8 sil
> +#endif
> +
> #define PAGE_SIZE 4096
>
> /* Macro to calculate size of small memset block for aligning
> @@ -96,18 +112,6 @@
>
> #ifndef SECTION
> # error SECTION is not defined!
> -#endif
> -
> - .section SECTION(.text),"ax",@progbits
> -#if VEC_SIZE == 16 && IS_IN (libc)
> -ENTRY (__bzero)
> - mov %RDI_LP, %RAX_LP /* Set return value. */
> - mov %RSI_LP, %RDX_LP /* Set n. */
> - xorl %esi, %esi
> - pxor %XMM0, %XMM0
> - jmp L(entry_from_bzero)
> -END (__bzero)
> -weak_alias (__bzero, bzero)
> #endif
>
> #if IS_IN (libc)
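
Dropping the old VEC_SIZE == 16-only __bzero (which just set up its
registers and jumped into the memset body via L(entry_from_bzero)) in
favor of a BZERO_SYMBOL entry in every variant looks right; it is what
makes the per-ISA ifunc dispatch in bzero.c possible in the first
place.
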
> @@ -123,12 +127,37 @@ ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
> WMEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
> WMEMSET_VDUP_TO_VEC0_LOW()
> cmpq $VEC_SIZE, %rdx
> - jb L(less_vec_no_vdup)
> + jb L(less_vec_from_wmemset)
> WMEMSET_VDUP_TO_VEC0_HIGH()
> jmp L(entry_from_wmemset)
> END (WMEMSET_SYMBOL (__wmemset, unaligned))
> #endif
>
> +ENTRY (BZERO_SYMBOL(__bzero, unaligned))
> +#if VEC_SIZE > 16
> + BZERO_ZERO_VEC0 ()
> +#endif
> + mov %RDI_LP, %RAX_LP
> + mov %RSI_LP, %RDX_LP
> +#ifndef USE_LESS_VEC_MASK_STORE
> + xorl %esi, %esi
> +#endif
> + cmp $VEC_SIZE, %RDX_LP
> + jb L(less_vec_no_vdup)
> +#ifdef USE_LESS_VEC_MASK_STORE
> + xorl %esi, %esi
> +#endif
> +#if VEC_SIZE <= 16
> + BZERO_ZERO_VEC0 ()
> +#endif
> + cmp $(VEC_SIZE * 2), %RDX_LP
> + ja L(more_2x_vec)
> + /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */
> + VMOVU %VEC(0), (%rdi)
> + VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
> + VZEROUPPER_RETURN
> +END (BZERO_SYMBOL(__bzero, unaligned))
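
The branch-free tail here is nice.  Worked example with VEC_SIZE == 32
and n == 40: the first VMOVU writes bytes [0, 31] at %rdi, the second
writes bytes [8, 39] at %rdi + %rdx - 32, and the 24-byte overlap is
harmless because both stores write the same (zero) data.
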
> +
> #if defined SHARED && IS_IN (libc)
> ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
> cmp %RDX_LP, %RCX_LP
> @@ -142,7 +171,6 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned))
> /* Clear the upper 32 bits. */
> mov %edx, %edx
> # endif
> -L(entry_from_bzero):
> cmpq $VEC_SIZE, %rdx
> jb L(less_vec)
> MEMSET_VDUP_TO_VEC0_HIGH()
> @@ -187,6 +215,31 @@ END (__memset_erms)
> END (MEMSET_SYMBOL (__memset, erms))
> # endif
>
> +ENTRY_P2ALIGN (BZERO_SYMBOL(__bzero, unaligned_erms), 6)
> +# if VEC_SIZE > 16
> + BZERO_ZERO_VEC0 ()
> +# endif
> + mov %RDI_LP, %RAX_LP
> + mov %RSI_LP, %RDX_LP
> +# ifndef USE_LESS_VEC_MASK_STORE
> + xorl %esi, %esi
> +# endif
> + cmp $VEC_SIZE, %RDX_LP
> + jb L(less_vec_no_vdup)
> +# ifdef USE_LESS_VEC_MASK_STORE
> + xorl %esi, %esi
> +# endif
> +# if VEC_SIZE <= 16
> + BZERO_ZERO_VEC0 ()
> +# endif
> + cmp $(VEC_SIZE * 2), %RDX_LP
> + ja L(stosb_more_2x_vec)
> + /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */
> + VMOVU %VEC(0), (%rdi)
> + VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
> + VZEROUPPER_RETURN
> +END (BZERO_SYMBOL(__bzero, unaligned_erms))
> +
> # if defined SHARED && IS_IN (libc)
> ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
> cmp %RDX_LP, %RCX_LP
> @@ -229,6 +282,7 @@ L(last_2x_vec):
> .p2align 4,, 10
> L(less_vec):
> L(less_vec_no_vdup):
> +L(less_vec_from_wmemset):
> /* Less than 1 VEC. */
> # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
> # error Unsupported VEC_SIZE!
> @@ -374,8 +428,11 @@ L(less_vec):
> /* Broadcast esi to partial register (i.e. VEC_SIZE == 32 broadcast to
> xmm). This only does anything for AVX2. */
> MEMSET_VDUP_TO_VEC0_LOW ()
> +L(less_vec_from_wmemset):
> +#if VEC_SIZE > 16
> L(less_vec_no_vdup):
> #endif
> +#endif
> L(cross_page):
> #if VEC_SIZE > 32
> cmpl $32, %edx
> @@ -386,7 +443,10 @@ L(cross_page):
> jge L(between_16_31)
> #endif
> #ifndef USE_XMM_LESS_VEC
> - MOVQ %XMM0, %rcx
> + MOVQ %XMM0, %SET_REG64
> +#endif
> +#if VEC_SIZE <= 16
> +L(less_vec_no_vdup):
> #endif
> cmpl $8, %edx
> jge L(between_8_15)
> @@ -395,7 +455,7 @@ L(cross_page):
> cmpl $1, %edx
> jg L(between_2_3)
> jl L(between_0_0)
> - movb %sil, (%LESS_VEC_REG)
> + movb %SET_REG8, (%LESS_VEC_REG)
> L(between_0_0):
> ret
>
> @@ -428,8 +488,8 @@ L(between_8_15):
> MOVQ %XMM0, (%rdi)
> MOVQ %XMM0, -8(%rdi, %rdx)
> #else
> - movq %rcx, (%LESS_VEC_REG)
> - movq %rcx, -8(%LESS_VEC_REG, %rdx)
> + movq %SET_REG64, (%LESS_VEC_REG)
> + movq %SET_REG64, -8(%LESS_VEC_REG, %rdx)
> #endif
> ret
>
> @@ -442,8 +502,8 @@ L(between_4_7):
> MOVD %XMM0, (%rdi)
> MOVD %XMM0, -4(%rdi, %rdx)
> #else
> - movl %ecx, (%LESS_VEC_REG)
> - movl %ecx, -4(%LESS_VEC_REG, %rdx)
> + movl %SET_REG32, (%LESS_VEC_REG)
> + movl %SET_REG32, -4(%LESS_VEC_REG, %rdx)
> #endif
> ret
>
> @@ -452,12 +512,12 @@ L(between_4_7):
> L(between_2_3):
> /* From 2 to 3. No branch when size == 2. */
> #ifdef USE_XMM_LESS_VEC
> - movb %sil, (%rdi)
> - movb %sil, 1(%rdi)
> - movb %sil, -1(%rdi, %rdx)
> + movb %SET_REG8, (%rdi)
> + movb %SET_REG8, 1(%rdi)
> + movb %SET_REG8, -1(%rdi, %rdx)
> #else
> - movw %cx, (%LESS_VEC_REG)
> - movb %sil, -1(%LESS_VEC_REG, %rdx)
> + movw %SET_REG16, (%LESS_VEC_REG)
> + movb %SET_REG8, -1(%LESS_VEC_REG, %rdx)
> #endif
> ret
> END (MEMSET_SYMBOL (__memset, unaligned_erms))
> --
> 2.34.1
>
LGTM.