From: Sunil Pandey
Date: Tue, 3 May 2022 22:46:36 -0700
Subject: Re: [PATCH v2] x86: Improve vec generation in memset-vec-unaligned-erms.S
To: Noah Goldstein, Libc-stable Mailing List
Cc: "H.J. Lu", GNU C Library

On Sun, Feb 6, 2022 at 7:48 PM Noah Goldstein via Libc-alpha wrote:
>
> On Sun, Feb 6, 2022 at 10:29 AM H.J. Lu wrote:
> >
> > On Sat, Feb 5, 2022 at 10:54 PM Noah Goldstein wrote:
> > >
> > > No bug.
> > >
> > > Split vec generation into multiple steps. This allows the
> > > broadcast in AVX2 to use 'xmm' registers for the L(less_vec)
> > > case. This saves an expensive lane-cross instruction and removes
> > > the need for 'vzeroupper'.
> > >
> > > For SSE2 replace 2x 'punpck' instructions with zero-idiom 'pxor' for
> > > byte broadcast.
> > >
> > > Results for memset-avx2 small (geomean of N = 20 benchset runs).
> > >
> > > size, New Time, Old Time, New / Old
> > >    0,    4.100,    3.831,     0.934
> > >    1,    5.074,    4.399,     0.867
> > >    2,    4.433,    4.411,     0.995
> > >    4,    4.487,    4.415,     0.984
> > >    8,    4.454,    4.396,     0.987
> > >   16,    4.502,    4.443,     0.987
> > >
> > > All relevant string/wcsmbs tests are passing.
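As a rough illustration of the broadcast sequences the commit message
describes, here is a minimal C-intrinsics sketch (illustrative only, not
the glibc code; the function names are invented for the example, and it
assumes a compiler building with AVX2 enabled):

/* Hypothetical sketch of the byte-broadcast variants discussed above.
   Build with e.g. "gcc -O2 -mavx2 splat.c" (AVX2 implies the SSSE3
   pshufb used below).  */
#include <immintrin.h>
#include <stdio.h>

/* Old SSE2 sequence: movd + punpcklbw + punpcklwd + pshufd $0.  */
static __m128i
splat_byte_punpck (unsigned char c)
{
  __m128i v = _mm_cvtsi32_si128 (c);
  v = _mm_unpacklo_epi8 (v, v);         /* punpcklbw */
  v = _mm_unpacklo_epi16 (v, v);        /* punpcklwd */
  return _mm_shuffle_epi32 (v, 0);      /* pshufd $0 */
}

/* New sequence: a zero-idiom pxor provides an all-zero shuffle mask and
   a single pshufb broadcasts byte 0 to all 16 lanes.  */
static __m128i
splat_byte_pshufb (unsigned char c)
{
  __m128i v = _mm_cvtsi32_si128 (c);
  return _mm_shuffle_epi8 (v, _mm_setzero_si128 ());  /* pxor + pshufb */
}

/* AVX2 split: the short (less than one VEC) path can broadcast into xmm
   only, never touching the ymm upper half, so no vzeroupper is needed;
   the lane-crossing ymm broadcast runs only when a full vector will
   actually be stored.  */
static __m128i
splat_byte_xmm (unsigned char c)
{
  return _mm_broadcastb_epi8 (_mm_cvtsi32_si128 (c));     /* vpbroadcastb %xmm, %xmm */
}

static __m256i
splat_byte_ymm (unsigned char c)
{
  return _mm256_broadcastb_epi8 (_mm_cvtsi32_si128 (c));  /* vpbroadcastb %xmm, %ymm */
}

int
main (void)
{
  unsigned char a[16], b[16], x[16], y[32];
  _mm_storeu_si128 ((__m128i *) a, splat_byte_punpck (0xab));
  _mm_storeu_si128 ((__m128i *) b, splat_byte_pshufb (0xab));
  _mm_storeu_si128 ((__m128i *) x, splat_byte_xmm (0xab));
  _mm256_storeu_si256 ((__m256i *) y, splat_byte_ymm (0xab));
  /* All four variants fill every byte with the same value.  */
  printf ("%02x %02x %02x %02x\n", a[15], b[15], x[15], y[31]);
  return 0;
}

The xmm-only variant is what matters for the L(less_vec) path: if the upper
half of the ymm register is never written, there is no dirty AVX upper state
to clean up, which is why the patch can drop the vzeroupper there.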
> > > ---
> > > sysdeps/x86_64/memset.S | 21 ++-
> > > .../multiarch/memset-avx2-unaligned-erms.S | 18 +-
> > > .../multiarch/memset-avx512-unaligned-erms.S | 18 +-
> > > .../multiarch/memset-evex-unaligned-erms.S | 18 +-
> > > .../multiarch/memset-vec-unaligned-erms.S | 164 +++++++++++-------
> > > 5 files changed, 152 insertions(+), 87 deletions(-)
> > >
> > > diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
> > > index 65c09bd0ac..ccf036be53 100644
> > > --- a/sysdeps/x86_64/memset.S
> > > +++ b/sysdeps/x86_64/memset.S
> > > @@ -28,17 +28,22 @@
> > > #define VMOVU movups
> > > #define VMOVA movaps
> > >
> > > -#define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> > > +# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> > > movd d, %xmm0; \
> > > - movq r, %rax; \
> > > - punpcklbw %xmm0, %xmm0; \
> > > - punpcklwd %xmm0, %xmm0; \
> > > - pshufd $0, %xmm0, %xmm0
> > > + pxor %xmm1, %xmm1; \
> > > + pshufb %xmm1, %xmm0; \
> > > + movq r, %rax
> > >
> > > -#define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> > > +# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> > > movd d, %xmm0; \
> > > - movq r, %rax; \
> > > - pshufd $0, %xmm0, %xmm0
> > > + pshufd $0, %xmm0, %xmm0; \
> > > + movq r, %rax
> > > +
> > > +# define MEMSET_VDUP_TO_VEC0_HIGH()
> > > +# define MEMSET_VDUP_TO_VEC0_LOW()
> > > +
> > > +# define WMEMSET_VDUP_TO_VEC0_HIGH()
> > > +# define WMEMSET_VDUP_TO_VEC0_LOW()
> > >
> > > #define SECTION(p) p
> > >
> > > diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> > > index 1af668af0a..c0bf2875d0 100644
> > > --- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> > > +++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
> > > @@ -10,15 +10,18 @@
> > > # define VMOVU vmovdqu
> > > # define VMOVA vmovdqa
> > >
> > > -# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> > > +# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> > > vmovd d, %xmm0; \
> > > - movq r, %rax; \
> > > - vpbroadcastb %xmm0, %ymm0
> > > + movq r, %rax;
> > >
> > > -# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> > > - vmovd d, %xmm0; \
> > > - movq r, %rax; \
> > > - vpbroadcastd %xmm0, %ymm0
> > > +# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> > > + MEMSET_SET_VEC0_AND_SET_RETURN(d, r)
> > > +
> > > +# define MEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastb %xmm0, %ymm0
> > > +# define MEMSET_VDUP_TO_VEC0_LOW() vpbroadcastb %xmm0, %xmm0
> > > +
> > > +# define WMEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastd %xmm0, %ymm0
> > > +# define WMEMSET_VDUP_TO_VEC0_LOW() vpbroadcastd %xmm0, %xmm0
> > >
> > > # ifndef SECTION
> > > # define SECTION(p) p##.avx
> > > @@ -30,5 +33,6 @@
> > > # define WMEMSET_SYMBOL(p,s) p##_avx2_##s
> > > # endif
> > >
> > > +# define USE_XMM_LESS_VEC
> > > # include "memset-vec-unaligned-erms.S"
> > > #endif
> > > diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> > > index f14d6f8493..5241216a77 100644
> > > --- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> > > +++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
> > > @@ -15,13 +15,19 @@
> > >
> > > # define VZEROUPPER
> > >
> > > -# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> > > - movq r, %rax; \
> > > - vpbroadcastb d, %VEC0
> > > +# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> > > + vpbroadcastb d, %VEC0; \
> > > + movq r, %rax
> > >
> > > -# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> > > - movq r, %rax; \
> > > - vpbroadcastd d, %VEC0
> > > +# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> > > + vpbroadcastd d, %VEC0; \
> > > + movq r, %rax
> > > +
> > > +# define MEMSET_VDUP_TO_VEC0_HIGH()
> > > +# define MEMSET_VDUP_TO_VEC0_LOW()
> > > +
> > > +# define WMEMSET_VDUP_TO_VEC0_HIGH()
> > > +# define WMEMSET_VDUP_TO_VEC0_LOW()
> > >
> > > # define SECTION(p) p##.evex512
> > > # define MEMSET_SYMBOL(p,s) p##_avx512_##s
> > > diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> > > index 64b09e77cc..6370021506 100644
> > > --- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> > > +++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
> > > @@ -15,13 +15,19 @@
> > >
> > > # define VZEROUPPER
> > >
> > > -# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> > > - movq r, %rax; \
> > > - vpbroadcastb d, %VEC0
> > > +# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> > > + vpbroadcastb d, %VEC0; \
> > > + movq r, %rax
> > >
> > > -# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
> > > - movq r, %rax; \
> > > - vpbroadcastd d, %VEC0
> > > +# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
> > > + vpbroadcastd d, %VEC0; \
> > > + movq r, %rax
> > > +
> > > +# define MEMSET_VDUP_TO_VEC0_HIGH()
> > > +# define MEMSET_VDUP_TO_VEC0_LOW()
> > > +
> > > +# define WMEMSET_VDUP_TO_VEC0_HIGH()
> > > +# define WMEMSET_VDUP_TO_VEC0_LOW()
> > >
> > > # define SECTION(p) p##.evex
> > > # define MEMSET_SYMBOL(p,s) p##_evex_##s
> > > diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> > > index 1e0511c79a..1b502b78e4 100644
> > > --- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> > > +++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
> > > @@ -58,8 +58,10 @@
> > > #ifndef MOVQ
> > > # if VEC_SIZE > 16
> > > # define MOVQ vmovq
> > > +# define MOVD vmovd
> > > # else
> > > # define MOVQ movq
> > > +# define MOVD movd
> > > # endif
> > > #endif
> > >
> > > @@ -72,9 +74,17 @@
> > > #if defined USE_WITH_EVEX || defined USE_WITH_AVX512
> > > # define END_REG rcx
> > > # define LOOP_REG rdi
> > > +# define LESS_VEC_REG rax
> > > #else
> > > # define END_REG rdi
> > > # define LOOP_REG rdx
> > > +# define LESS_VEC_REG rdi
> > > +#endif
> > > +
> > > +#ifdef USE_XMM_LESS_VEC
> > > +# define XMM_SMALL 1
> > > +#else
> > > +# define XMM_SMALL 0
> > > #endif
> > >
> > > #define PAGE_SIZE 4096
> > > @@ -110,8 +120,12 @@ END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
> > >
> > > ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
> > > shl $2, %RDX_LP
> > > - WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
> > > - jmp L(entry_from_bzero)
> > > + WMEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
> > > + WMEMSET_VDUP_TO_VEC0_LOW()
> > > + cmpq $VEC_SIZE, %rdx
> > > + jb L(less_vec_no_vdup)
> > > + WMEMSET_VDUP_TO_VEC0_HIGH()
> > > + jmp L(entry_from_wmemset)
> > > END (WMEMSET_SYMBOL (__wmemset, unaligned))
> > > #endif
> > >
> > > @@ -123,7 +137,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
> > > #endif
> > >
> > > ENTRY (MEMSET_SYMBOL (__memset, unaligned))
> > > - MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
> > > + MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
> > > # ifdef __ILP32__
> > > /* Clear the upper 32 bits. */
> > > mov %edx, %edx
> > > @@ -131,6 +145,8 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned))
> > > L(entry_from_bzero):
> > > cmpq $VEC_SIZE, %rdx
> > > jb L(less_vec)
> > > + MEMSET_VDUP_TO_VEC0_HIGH()
> > > +L(entry_from_wmemset):
> > > cmpq $(VEC_SIZE * 2), %rdx
> > > ja L(more_2x_vec)
> > > /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
> > > @@ -179,27 +195,27 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
> > > # endif
> > >
> > > ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
> > > - MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
> > > + MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
> > > # ifdef __ILP32__
> > > /* Clear the upper 32 bits. */
> > > mov %edx, %edx
> > > # endif
> > > cmp $VEC_SIZE, %RDX_LP
> > > jb L(less_vec)
> > > + MEMSET_VDUP_TO_VEC0_HIGH ()
> > > cmp $(VEC_SIZE * 2), %RDX_LP
> > > ja L(stosb_more_2x_vec)
> > > - /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE.
> > > - */
> > > - VMOVU %VEC(0), (%rax)
> > > - VMOVU %VEC(0), -VEC_SIZE(%rax, %rdx)
> > > + /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
> > > + VMOVU %VEC(0), (%rdi)
> > > + VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
> > > VZEROUPPER_RETURN
> > > #endif
> > >
> > > - .p2align 4,, 10
> > > + .p2align 4,, 4
> > > L(last_2x_vec):
> > > #ifdef USE_LESS_VEC_MASK_STORE
> > > - VMOVU %VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%rcx)
> > > - VMOVU %VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%rcx)
> > > + VMOVU %VEC(0), (VEC_SIZE * -2)(%rdi, %rdx)
> > > + VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
> > > #else
> > > VMOVU %VEC(0), (VEC_SIZE * -2)(%rdi)
> > > VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi)
> > > @@ -212,6 +228,7 @@ L(last_2x_vec):
> > > #ifdef USE_LESS_VEC_MASK_STORE
> > > .p2align 4,, 10
> > > L(less_vec):
> > > +L(less_vec_no_vdup):
> > > /* Less than 1 VEC. */
> > > # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
> > > # error Unsupported VEC_SIZE!
> > > @@ -262,28 +279,18 @@ L(stosb_more_2x_vec):
> > > /* Fallthrough goes to L(loop_4x_vec). Tests for memset (2x, 4x]
> > > and (4x, 8x] jump to target. */
> > > L(more_2x_vec):
> > > -
> > > - /* Two different methods of setting up pointers / compare. The
> > > - two methods are based on the fact that EVEX/AVX512 mov
> > > - instructions take more bytes then AVX2/SSE2 mov instructions. As
> > > - well that EVEX/AVX512 machines also have fast LEA_BID. Both
> > > - setup and END_REG to avoid complex address mode. For EVEX/AVX512
> > > - this saves code size and keeps a few targets in one fetch block.
> > > - For AVX2/SSE2 this helps prevent AGU bottlenecks. */
> > > -#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
> > > - /* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 +
> > > - LOOP_4X_OFFSET) with LEA_BID. */
> > > -
> > > - /* END_REG is rcx for EVEX/AVX512. */
> > > - leaq -(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG
> > > -#endif
> > > -
> > > - /* Stores to first 2x VEC before cmp as any path forward will
> > > - require it. */
> > > - VMOVU %VEC(0), (%rax)
> > > - VMOVU %VEC(0), VEC_SIZE(%rax)
> > > + /* Store next 2x vec regardless. */
> > > + VMOVU %VEC(0), (%rdi)
> > > + VMOVU %VEC(0), (VEC_SIZE * 1)(%rdi)
> > >
> > >
> > > + /* Two different methods of setting up pointers / compare. The two
> > > + methods are based on the fact that EVEX/AVX512 mov instructions take
> > > + more bytes then AVX2/SSE2 mov instructions. As well that EVEX/AVX512
> > > + machines also have fast LEA_BID. Both setup and END_REG to avoid complex
> > > + address mode. For EVEX/AVX512 this saves code size and keeps a few
> > > + targets in one fetch block. For AVX2/SSE2 this helps prevent AGU
> > > + bottlenecks. */
> > > #if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512)
> > > /* If AVX2/SSE2 compute END_REG (rdi) with ALU. */
> > > addq %rdx, %END_REG
> > > @@ -292,6 +299,15 @@ L(more_2x_vec):
> > > cmpq $(VEC_SIZE * 4), %rdx
> > > jbe L(last_2x_vec)
> > >
> > > +
> > > +#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
> > > + /* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 + LOOP_4X_OFFSET) with
> > > + LEA_BID. */
> > > +
> > > + /* END_REG is rcx for EVEX/AVX512. */
> > > + leaq -(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG
> > > +#endif
> > > +
> > > /* Store next 2x vec regardless. */
> > > VMOVU %VEC(0), (VEC_SIZE * 2)(%rax)
> > > VMOVU %VEC(0), (VEC_SIZE * 3)(%rax)
> > > @@ -355,65 +371,93 @@ L(stosb_local):
> > > /* Define L(less_vec) only if not otherwise defined. */
> > > .p2align 4
> > > L(less_vec):
> > > + /* Broadcast esi to partial register (i.e VEC_SIZE == 32 broadcast to
> > > + xmm). This is only does anything for AVX2. */
> > > + MEMSET_VDUP_TO_VEC0_LOW ()
> > > +L(less_vec_no_vdup):
> > > #endif
> > > L(cross_page):
> > > #if VEC_SIZE > 32
> > > cmpl $32, %edx
> > > - jae L(between_32_63)
> > > + jge L(between_32_63)
> > > #endif
> > > #if VEC_SIZE > 16
> > > cmpl $16, %edx
> > > - jae L(between_16_31)
> > > + jge L(between_16_31)
> > > +#endif
> > > +#ifndef USE_XMM_LESS_VEC
> > > + MOVQ %XMM0, %rcx
> > > #endif
> > > - MOVQ %XMM0, %rdi
> > > cmpl $8, %edx
> > > - jae L(between_8_15)
> > > + jge L(between_8_15)
> > > cmpl $4, %edx
> > > - jae L(between_4_7)
> > > + jge L(between_4_7)
> > > cmpl $1, %edx
> > > - ja L(between_2_3)
> > > - jb L(return)
> > > - movb %sil, (%rax)
> > > - VZEROUPPER_RETURN
> > > + jg L(between_2_3)
> > > + jl L(between_0_0)
> > > + movb %sil, (%LESS_VEC_REG)
> > > +L(between_0_0):
> > > + ret
> > >
> > > - /* Align small targets only if not doing so would cross a fetch
> > > - line. */
> > > + /* Align small targets only if not doing so would cross a fetch line.
> > > + */
> > > #if VEC_SIZE > 32
> > > .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
> > > /* From 32 to 63. No branch when size == 32. */
> > > L(between_32_63):
> > > - VMOVU %YMM0, (%rax)
> > > - VMOVU %YMM0, -32(%rax, %rdx)
> > > + VMOVU %YMM0, (%LESS_VEC_REG)
> > > + VMOVU %YMM0, -32(%LESS_VEC_REG, %rdx)
> > > VZEROUPPER_RETURN
> > > #endif
> > >
> > > #if VEC_SIZE >= 32
> > > - .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
> > > + .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, 1)
> > > L(between_16_31):
> > > /* From 16 to 31. No branch when size == 16. */
> > > - VMOVU %XMM0, (%rax)
> > > - VMOVU %XMM0, -16(%rax, %rdx)
> > > - VZEROUPPER_RETURN
> > > + VMOVU %XMM0, (%LESS_VEC_REG)
> > > + VMOVU %XMM0, -16(%LESS_VEC_REG, %rdx)
> > > + ret
> > > #endif
> > >
> > > - .p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE)
> > > + /* Move size is 3 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2.
> > > + */
> > > + .p2align 4,, SMALL_MEMSET_ALIGN(3 + XMM_SMALL, 1)
> > > L(between_8_15):
> > > /* From 8 to 15. No branch when size == 8. */
> > > - movq %rdi, (%rax)
> > > - movq %rdi, -8(%rax, %rdx)
> > > - VZEROUPPER_RETURN
> > > +#ifdef USE_XMM_LESS_VEC
> > > + MOVQ %XMM0, (%rdi)
> > > + MOVQ %XMM0, -8(%rdi, %rdx)
> > > +#else
> > > + movq %rcx, (%LESS_VEC_REG)
> > > + movq %rcx, -8(%LESS_VEC_REG, %rdx)
> > > +#endif
> > > + ret
> > >
> > > - .p2align 4,, SMALL_MEMSET_ALIGN(2, RET_SIZE)
> > > + /* Move size is 2 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2.
> > > + */
> > > + .p2align 4,, SMALL_MEMSET_ALIGN(2 << XMM_SMALL, 1)
> > > L(between_4_7):
> > > /* From 4 to 7. No branch when size == 4. */
> > > - movl %edi, (%rax)
> > > - movl %edi, -4(%rax, %rdx)
> > > - VZEROUPPER_RETURN
> > > +#ifdef USE_XMM_LESS_VEC
> > > + MOVD %XMM0, (%rdi)
> > > + MOVD %XMM0, -4(%rdi, %rdx)
> > > +#else
> > > + movl %ecx, (%LESS_VEC_REG)
> > > + movl %ecx, -4(%LESS_VEC_REG, %rdx)
> > > +#endif
> > > + ret
> > >
> > > - .p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE)
> > > + /* 4 * XMM_SMALL for the third mov for AVX2. */
> > > + .p2align 4,, 4 * XMM_SMALL + SMALL_MEMSET_ALIGN(3, 1)
> > > L(between_2_3):
> > > /* From 2 to 3. No branch when size == 2. */
> > > - movw %di, (%rax)
> > > - movb %dil, -1(%rax, %rdx)
> > > - VZEROUPPER_RETURN
> > > +#ifdef USE_XMM_LESS_VEC
> > > + movb %sil, (%rdi)
> > > + movb %sil, 1(%rdi)
> > > + movb %sil, -1(%rdi, %rdx)
> > > +#else
> > > + movw %cx, (%LESS_VEC_REG)
> > > + movb %sil, -1(%LESS_VEC_REG, %rdx)
> > > +#endif
> > > + ret
> > > END (MEMSET_SYMBOL (__memset, unaligned_erms))
> > > --
> > > 2.25.1
> > >
> >
> > LGTM.
> >
> > Reviewed-by: H.J. Lu
> >
> > Thanks.
>
> Thanks pushed.
> >
> > --
> > H.J.

I would like to backport this patch to release branches.
Any comments or objections?

--Sunil