From mboxrd@z Thu Jan 1 00:00:00 1970
MIME-Version: 1.0
References: <20221014164008.1325863-1-goldstein.w.n@gmail.com> <20221015002100.129511-1-goldstein.w.n@gmail.com> <20221015002100.129511-4-goldstein.w.n@gmail.com>
In-Reply-To: <20221015002100.129511-4-goldstein.w.n@gmail.com>
From: "H.J. Lu"
Lu" Date: Fri, 14 Oct 2022 19:53:47 -0700 Message-ID: Subject: Re: [PATCH v9 4/6] x86: Update memset to use new VEC macros To: Noah Goldstein Cc: libc-alpha@sourceware.org, carlos@systemhalted.org Content-Type: text/plain; charset="UTF-8" X-Spam-Status: No, score=-3022.6 required=5.0 tests=BAYES_00,DKIM_SIGNED,DKIM_VALID,DKIM_VALID_AU,DKIM_VALID_EF,FREEMAIL_FROM,GIT_PATCH_0,RCVD_IN_DNSWL_NONE,SPF_HELO_NONE,SPF_PASS,TXREP,URIBL_BLACK autolearn=ham autolearn_force=no version=3.4.6 X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on server2.sourceware.org List-Id: On Fri, Oct 14, 2022 at 5:21 PM Noah Goldstein wrote: > > Replace %VEC(n) -> %VMM(n) > > This commit does not change libc.so > > Tested build on x86-64 > --- > .../memset-avx2-unaligned-erms-rtm.S | 8 +-- > .../multiarch/memset-avx2-unaligned-erms.S | 14 +--- > .../multiarch/memset-avx512-unaligned-erms.S | 20 +----- > .../multiarch/memset-evex-unaligned-erms.S | 20 +----- > .../multiarch/memset-sse2-unaligned-erms.S | 10 +-- > .../multiarch/memset-vec-unaligned-erms.S | 70 ++++++++----------- > 6 files changed, 43 insertions(+), 99 deletions(-) > > diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S > index 8ac3e479bb..bc8605faf3 100644 > --- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S > +++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S > @@ -1,10 +1,6 @@ > -#define ZERO_UPPER_VEC_REGISTERS_RETURN \ > - ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST > +#include "x86-avx-rtm-vecs.h" > > -#define VZEROUPPER_RETURN jmp L(return) > - > -#define SECTION(p) p##.avx.rtm > #define MEMSET_SYMBOL(p,s) p##_avx2_##s##_rtm > #define WMEMSET_SYMBOL(p,s) p##_avx2_##s##_rtm > > -#include "memset-avx2-unaligned-erms.S" > +# include "memset-avx2-unaligned-erms.S" > diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S > index a9054a9122..47cf5072a4 100644 > --- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S > +++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S > @@ -4,14 +4,9 @@ > > # define USE_WITH_AVX2 1 > > -# define VEC_SIZE 32 > -# define MOV_SIZE 4 > -# define RET_SIZE 4 > - > -# define VEC(i) ymm##i > - > -# define VMOVU vmovdqu > -# define VMOVA vmovdqa > +# ifndef VEC_SIZE > +# include "x86-avx-vecs.h" > +# endif > > # define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ > vmovd d, %xmm0; \ > @@ -26,9 +21,6 @@ > # define WMEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastd %xmm0, %ymm0 > # define WMEMSET_VDUP_TO_VEC0_LOW() vpbroadcastd %xmm0, %xmm0 > > -# ifndef SECTION > -# define SECTION(p) p##.avx > -# endif > # ifndef MEMSET_SYMBOL > # define MEMSET_SYMBOL(p,s) p##_avx2_##s > # endif > diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S > index 47623b8ee8..84145b6c27 100644 > --- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S > +++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S > @@ -4,26 +4,14 @@ > > # define USE_WITH_AVX512 1 > > -# define VEC_SIZE 64 > -# define MOV_SIZE 6 > -# define RET_SIZE 1 > - > -# define XMM0 xmm16 > -# define YMM0 ymm16 > -# define VEC0 zmm16 > -# define VEC(i) VEC##i > - > -# define VMOVU vmovdqu64 > -# define VMOVA vmovdqa64 > - > -# define VZEROUPPER > +# include "x86-evex512-vecs.h" > > # define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ > - vpbroadcastb d, %VEC0; \ > + vpbroadcastb d, %VMM(0); \ > movq r, %rax > > # define 
WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ > - vpbroadcastd d, %VEC0; \ > + vpbroadcastd d, %VMM(0); \ > movq r, %rax > > # define MEMSET_VDUP_TO_VEC0_HIGH() > @@ -32,8 +20,6 @@ > # define WMEMSET_VDUP_TO_VEC0_HIGH() > # define WMEMSET_VDUP_TO_VEC0_LOW() > > -# define SECTION(p) p##.evex512 > - > #ifndef MEMSET_SYMBOL > # define MEMSET_SYMBOL(p,s) p##_avx512_##s > #endif > diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S > index ac4b2d2d50..1f03b26bf8 100644 > --- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S > +++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S > @@ -4,26 +4,14 @@ > > # define USE_WITH_EVEX 1 > > -# define VEC_SIZE 32 > -# define MOV_SIZE 6 > -# define RET_SIZE 1 > - > -# define XMM0 xmm16 > -# define YMM0 ymm16 > -# define VEC0 ymm16 > -# define VEC(i) VEC##i > - > -# define VMOVU vmovdqu64 > -# define VMOVA vmovdqa64 > - > -# define VZEROUPPER > +# include "x86-evex256-vecs.h" > > # define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ > - vpbroadcastb d, %VEC0; \ > + vpbroadcastb d, %VMM(0); \ > movq r, %rax > > # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ > - vpbroadcastd d, %VEC0; \ > + vpbroadcastd d, %VMM(0); \ > movq r, %rax > > # define MEMSET_VDUP_TO_VEC0_HIGH() > @@ -32,8 +20,6 @@ > # define WMEMSET_VDUP_TO_VEC0_HIGH() > # define WMEMSET_VDUP_TO_VEC0_LOW() > > -# define SECTION(p) p##.evex > - > #ifndef MEMSET_SYMBOL > # define MEMSET_SYMBOL(p,s) p##_evex_##s > #endif > diff --git a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S > index 44f9b8888b..34b245d8ca 100644 > --- a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S > +++ b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S > @@ -26,13 +26,7 @@ > # include > # define USE_WITH_SSE2 1 > > -# define VEC_SIZE 16 > -# define MOV_SIZE 3 > -# define RET_SIZE 1 > - > -# define VEC(i) xmm##i > -# define VMOVU movups > -# define VMOVA movaps > +# include "x86-sse2-vecs.h" > > # define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ > movd d, %xmm0; \ > @@ -52,8 +46,6 @@ > # define WMEMSET_VDUP_TO_VEC0_HIGH() > # define WMEMSET_VDUP_TO_VEC0_LOW() > > -# define SECTION(p) p > - > # ifndef MEMSET_SYMBOL > # define MEMSET_SYMBOL(p,s) p##_sse2_##s > # endif > diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S > index 905d0fa464..03de0ab907 100644 > --- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S > +++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S > @@ -34,14 +34,6 @@ > # define WMEMSET_CHK_SYMBOL(p,s) WMEMSET_SYMBOL(p, s) > #endif > > -#ifndef XMM0 > -# define XMM0 xmm0 > -#endif > - > -#ifndef YMM0 > -# define YMM0 ymm0 > -#endif > - > #ifndef VZEROUPPER > # if VEC_SIZE > 16 > # define VZEROUPPER vzeroupper > @@ -150,8 +142,8 @@ L(entry_from_wmemset): > cmpq $(VEC_SIZE * 2), %rdx > ja L(more_2x_vec) > /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ > - VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx) > - VMOVU %VEC(0), (%rdi) > + VMOVU %VMM(0), -VEC_SIZE(%rdi,%rdx) > + VMOVU %VMM(0), (%rdi) > VZEROUPPER_RETURN > #if defined USE_MULTIARCH && IS_IN (libc) > END (MEMSET_SYMBOL (__memset, unaligned)) > @@ -175,19 +167,19 @@ ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6) > cmp $(VEC_SIZE * 2), %RDX_LP > ja L(stosb_more_2x_vec) > /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. 
*/ > - VMOVU %VEC(0), (%rdi) > - VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx) > + VMOVU %VMM(0), (%rdi) > + VMOVU %VMM(0), (VEC_SIZE * -1)(%rdi, %rdx) > VZEROUPPER_RETURN > #endif > > .p2align 4,, 4 > L(last_2x_vec): > #ifdef USE_LESS_VEC_MASK_STORE > - VMOVU %VEC(0), (VEC_SIZE * -2)(%rdi, %rdx) > - VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx) > + VMOVU %VMM(0), (VEC_SIZE * -2)(%rdi, %rdx) > + VMOVU %VMM(0), (VEC_SIZE * -1)(%rdi, %rdx) > #else > - VMOVU %VEC(0), (VEC_SIZE * -2)(%rdi) > - VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi) > + VMOVU %VMM(0), (VEC_SIZE * -2)(%rdi) > + VMOVU %VMM(0), (VEC_SIZE * -1)(%rdi) > #endif > VZEROUPPER_RETURN > > @@ -221,7 +213,7 @@ L(less_vec_from_wmemset): > bzhil %edx, %ecx, %ecx > kmovd %ecx, %k1 > # endif > - vmovdqu8 %VEC(0), (%rax){%k1} > + vmovdqu8 %VMM(0), (%rax){%k1} > VZEROUPPER_RETURN > > # if defined USE_MULTIARCH && IS_IN (libc) > @@ -249,8 +241,8 @@ L(stosb_more_2x_vec): > and (4x, 8x] jump to target. */ > L(more_2x_vec): > /* Store next 2x vec regardless. */ > - VMOVU %VEC(0), (%rdi) > - VMOVU %VEC(0), (VEC_SIZE * 1)(%rdi) > + VMOVU %VMM(0), (%rdi) > + VMOVU %VMM(0), (VEC_SIZE * 1)(%rdi) > > > /* Two different methods of setting up pointers / compare. The two > @@ -278,8 +270,8 @@ L(more_2x_vec): > #endif > > /* Store next 2x vec regardless. */ > - VMOVU %VEC(0), (VEC_SIZE * 2)(%rax) > - VMOVU %VEC(0), (VEC_SIZE * 3)(%rax) > + VMOVU %VMM(0), (VEC_SIZE * 2)(%rax) > + VMOVU %VMM(0), (VEC_SIZE * 3)(%rax) > > > #if defined USE_WITH_EVEX || defined USE_WITH_AVX512 > @@ -304,20 +296,20 @@ L(more_2x_vec): > andq $(VEC_SIZE * -2), %LOOP_REG > .p2align 4 > L(loop): > - VMOVA %VEC(0), LOOP_4X_OFFSET(%LOOP_REG) > - VMOVA %VEC(0), (VEC_SIZE + LOOP_4X_OFFSET)(%LOOP_REG) > - VMOVA %VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%LOOP_REG) > - VMOVA %VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%LOOP_REG) > + VMOVA %VMM(0), LOOP_4X_OFFSET(%LOOP_REG) > + VMOVA %VMM(0), (VEC_SIZE + LOOP_4X_OFFSET)(%LOOP_REG) > + VMOVA %VMM(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%LOOP_REG) > + VMOVA %VMM(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%LOOP_REG) > subq $-(VEC_SIZE * 4), %LOOP_REG > cmpq %END_REG, %LOOP_REG > jb L(loop) > .p2align 4,, MOV_SIZE > L(last_4x_vec): > - VMOVU %VEC(0), LOOP_4X_OFFSET(%END_REG) > - VMOVU %VEC(0), (VEC_SIZE + LOOP_4X_OFFSET)(%END_REG) > - VMOVU %VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%END_REG) > - VMOVU %VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%END_REG) > -L(return): > + VMOVU %VMM(0), LOOP_4X_OFFSET(%END_REG) > + VMOVU %VMM(0), (VEC_SIZE + LOOP_4X_OFFSET)(%END_REG) > + VMOVU %VMM(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%END_REG) > + VMOVU %VMM(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%END_REG) > +L(return_vzeroupper): > #if VEC_SIZE > 16 > ZERO_UPPER_VEC_REGISTERS_RETURN > #else > @@ -355,7 +347,7 @@ L(cross_page): > jge L(between_16_31) > #endif > #ifndef USE_XMM_LESS_VEC > - MOVQ %XMM0, %SET_REG64 > + MOVQ %VMM_128(0), %SET_REG64 > #endif > cmpl $8, %edx > jge L(between_8_15) > @@ -374,8 +366,8 @@ L(between_0_0): > .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE) > /* From 32 to 63. No branch when size == 32. */ > L(between_32_63): > - VMOVU %YMM0, (%LESS_VEC_REG) > - VMOVU %YMM0, -32(%LESS_VEC_REG, %rdx) > + VMOVU %VMM_256(0), (%LESS_VEC_REG) > + VMOVU %VMM_256(0), -32(%LESS_VEC_REG, %rdx) > VZEROUPPER_RETURN > #endif > > @@ -383,8 +375,8 @@ L(between_32_63): > .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, 1) > L(between_16_31): > /* From 16 to 31. No branch when size == 16. 
*/ > - VMOVU %XMM0, (%LESS_VEC_REG) > - VMOVU %XMM0, -16(%LESS_VEC_REG, %rdx) > + VMOVU %VMM_128(0), (%LESS_VEC_REG) > + VMOVU %VMM_128(0), -16(%LESS_VEC_REG, %rdx) > ret > #endif > > @@ -394,8 +386,8 @@ L(between_16_31): > L(between_8_15): > /* From 8 to 15. No branch when size == 8. */ > #ifdef USE_XMM_LESS_VEC > - MOVQ %XMM0, (%rdi) > - MOVQ %XMM0, -8(%rdi, %rdx) > + MOVQ %VMM_128(0), (%rdi) > + MOVQ %VMM_128(0), -8(%rdi, %rdx) > #else > movq %SET_REG64, (%LESS_VEC_REG) > movq %SET_REG64, -8(%LESS_VEC_REG, %rdx) > @@ -408,8 +400,8 @@ L(between_8_15): > L(between_4_7): > /* From 4 to 7. No branch when size == 4. */ > #ifdef USE_XMM_LESS_VEC > - MOVD %XMM0, (%rdi) > - MOVD %XMM0, -4(%rdi, %rdx) > + MOVD %VMM_128(0), (%rdi) > + MOVD %VMM_128(0), -4(%rdi, %rdx) > #else > movl %SET_REG32, (%LESS_VEC_REG) > movl %SET_REG32, -4(%LESS_VEC_REG, %rdx) > -- > 2.34.1 > LGTM. Thanks. -- H.J.
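A note for readers skimming the quoted diff: the change replaces each flavor's hard-coded register names (VEC(i) as ymm##i, VEC0 as zmm16, XMM0/YMM0, and so on, visible in the removed lines) with a shared VMM(n) / VMM_256(n) / VMM_128(n) family pulled in from the new x86-*-vecs.h headers. The snippet below is a minimal, hypothetical C model of that index-to-register mapping, not the actual glibc header contents; the real headers also appear to steer the EVEX/AVX-512 variants onto registers 16 and up, as the deleted xmm16/ymm16/zmm16 defines suggest.

/* Illustrative sketch only -- NOT the actual glibc headers.  A simplified
   C model of the VMM(n)/VMM_256(n)/VMM_128(n) naming scheme the patch
   switches to: one vector index, with the register class picked by
   VEC_SIZE.  */
#include <stdio.h>

#define VEC_SIZE 32                /* stand-in for including "x86-avx-vecs.h" */

#if VEC_SIZE == 64
# define VMM(i)     "zmm" #i       /* full-width vector i */
#elif VEC_SIZE == 32
# define VMM(i)     "ymm" #i
#else
# define VMM(i)     "xmm" #i
#endif
#define VMM_256(i)  "ymm" #i       /* explicit 256-bit view of vector i */
#define VMM_128(i)  "xmm" #i       /* explicit 128-bit view of vector i */

int
main (void)
{
  /* In this model, a store written once against VMM(0) (as in
     "VMOVU %VMM(0), (%rdi)") resolves to %ymm0 when VEC_SIZE is 32
     and to %zmm0 when it is 64.  */
  printf ("VMM(0)     -> %%%s\n", VMM (0));
  printf ("VMM_256(0) -> %%%s\n", VMM_256 (0));
  printf ("VMM_128(0) -> %%%s\n", VMM_128 (0));
  return 0;
}

Rebuilding with a different VEC_SIZE changes only the header-selected width, which is what lets memset-vec-unaligned-erms.S keep a single copy of the store sequences while the avx2/evex/avx512 wrappers shrink to a header include plus the SET_VEC0 helpers.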