From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from mail-yb1-xb31.google.com (mail-yb1-xb31.google.com [IPv6:2607:f8b0:4864:20::b31]) by sourceware.org (Postfix) with ESMTPS id 8F222386CE5D for ; Wed, 29 Jun 2022 23:09:19 +0000 (GMT) DMARC-Filter: OpenDMARC Filter v1.4.1 sourceware.org 8F222386CE5D Received: by mail-yb1-xb31.google.com with SMTP id g4so19106951ybg.9 for ; Wed, 29 Jun 2022 16:09:19 -0700 (PDT) X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20210112; h=x-gm-message-state:mime-version:references:in-reply-to:from:date :message-id:subject:to:cc; bh=1XqolQnHq1XjZZCA2UehtujobSZqISMt8waDs+jJuRA=; b=yzPu2OnaJ0ZOnDQDdc93t/b7Pxxkmpc6SdUHxk7i1iMeUEVkHIdesXy1vvyRmXP0QE 6QD/FXg6DqYN8jMLmQiOwG5RPja6jaNp1PO6ncvuc69wuI+PgKQv6GUIH7kWYN0DN1Br Zbf1sunAyjDT21RpOrFCvBtc1bbiXT+V2eVCuiqol0Cvu60s62utASP62N9f8iy9ZBmv xIBn+kZKMkXfpwqyqMlFkT3+muPYyjeZ+Y8q1BgIqzkcesiHzn8HBbGoDFHBBWGByW7M HZBS3UlUEhop9VUVYN4O6rKBSac4P6CaB3ZyAyF+VC5PF2dh95/Ee/MSMAv/FKd7qps4 B+ZQ== X-Gm-Message-State: AJIora9YOJ3Wb75OQJS5VA5/FIAboipb76iGOgteEwtzOfrx+9b5r9dM jqa8OjwBXkfRQ+8xjw0prb4Qqo/6EsZgWVX7WNpV8DzBeow= X-Google-Smtp-Source: AGRyM1sxIxuoe3CuRmc1/9qZLQQs+jqra7KRazQJZnYWTsYgdLDM66vg6E9V5nw2Lch86OGlTqdtP7Y2f3zjtU8Xogw= X-Received: by 2002:a25:f30a:0:b0:66c:9632:342b with SMTP id c10-20020a25f30a000000b0066c9632342bmr6086354ybs.283.1656544158871; Wed, 29 Jun 2022 16:09:18 -0700 (PDT) MIME-Version: 1.0 References: <20220628152735.17863-1-goldstein.w.n@gmail.com> <20220629221204.1242709-1-goldstein.w.n@gmail.com> <20220629221204.1242709-2-goldstein.w.n@gmail.com> In-Reply-To: From: Noah Goldstein Date: Wed, 29 Jun 2022 16:09:08 -0700 Message-ID: Subject: Re: [PATCH v2 2/3] x86: Move and slightly improve memset_erms To: "H.J. 
Lu" Cc: GNU C Library , "Carlos O'Donell" Content-Type: text/plain; charset="UTF-8" X-Spam-Status: No, score=-8.4 required=5.0 tests=BAYES_00, DKIM_SIGNED, DKIM_VALID, DKIM_VALID_AU, DKIM_VALID_EF, FREEMAIL_FROM, GIT_PATCH_0, RCVD_IN_DNSWL_NONE, SPF_HELO_NONE, SPF_PASS, TXREP, T_SCC_BODY_TEXT_LINE autolearn=ham autolearn_force=no version=3.4.6 X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on server2.sourceware.org X-BeenThere: libc-alpha@sourceware.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Libc-alpha mailing list List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Wed, 29 Jun 2022 23:09:21 -0000 On Wed, Jun 29, 2022 at 3:19 PM H.J. Lu wrote: > > On Wed, Jun 29, 2022 at 3:12 PM Noah Goldstein wrote: > > > > Implementation wise: > > 1. Remove the VZEROUPPER as memset_{impl}_unaligned_erms does not > > use the L(stosb) label that was previously defined. > > > > 2. Don't give the hotpath (fallthrough) to zero size. > > > > Code positioning wise: > > > > Move memset_{chk}_erms to its own file. Leaving it in between the > > memset_{impl}_unaligned both adds unnecessary complexity to the > > file and wastes space in a relatively hot cache section. 
> > --- > > sysdeps/x86_64/multiarch/Makefile | 1 + > > sysdeps/x86_64/multiarch/memset-erms.S | 25 +++++++++++++++ > > .../multiarch/memset-vec-unaligned-erms.S | 31 ------------------- > > 3 files changed, 26 insertions(+), 31 deletions(-) > > create mode 100644 sysdeps/x86_64/multiarch/memset-erms.S > > > > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile > > index 62a4d96fb8..18cea04423 100644 > > --- a/sysdeps/x86_64/multiarch/Makefile > > +++ b/sysdeps/x86_64/multiarch/Makefile > > @@ -30,6 +30,7 @@ sysdep_routines += \ > > memset-avx2-unaligned-erms-rtm \ > > memset-avx512-no-vzeroupper \ > > memset-avx512-unaligned-erms \ > > + memset-erms \ > > memset-evex-unaligned-erms \ > > memset-sse2-unaligned-erms \ > > rawmemchr-avx2 \ > > diff --git a/sysdeps/x86_64/multiarch/memset-erms.S b/sysdeps/x86_64/multiarch/memset-erms.S > > new file mode 100644 > > index 0000000000..1fce0c9fcc > > --- /dev/null > > +++ b/sysdeps/x86_64/multiarch/memset-erms.S > > @@ -0,0 +1,25 @@ > > Need copyright notice. Fixed in V3. > > > +#include <sysdep.h> > > + > > +#if defined USE_MULTIARCH && IS_IN (libc) > > + .text > > +ENTRY (__memset_chk_erms) > > + cmp %RDX_LP, %RCX_LP > > + jb HIDDEN_JUMPTARGET (__chk_fail) > > +END (__memset_chk_erms) > > + > > +/* Only used to measure performance of REP STOSB. */ > > +ENTRY (__memset_erms) > > + /* Skip zero length. 
*/ > > + test %RDX_LP, %RDX_LP > > + jz L(stosb_return_zero) > > + mov %RDX_LP, %RCX_LP > > + movzbl %sil, %eax > > + mov %RDI_LP, %RDX_LP > > + rep stosb > > + mov %RDX_LP, %RAX_LP > > + ret > > +L(stosb_return_zero): > > + movq %rdi, %rax > > + ret > > +END (__memset_erms) > > +#endif > > diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S > > index abc12d9cda..905d0fa464 100644 > > --- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S > > +++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S > > @@ -156,37 +156,6 @@ L(entry_from_wmemset): > > #if defined USE_MULTIARCH && IS_IN (libc) > > END (MEMSET_SYMBOL (__memset, unaligned)) > > > > -# if VEC_SIZE == 16 > > -ENTRY (__memset_chk_erms) > > - cmp %RDX_LP, %RCX_LP > > - jb HIDDEN_JUMPTARGET (__chk_fail) > > -END (__memset_chk_erms) > > - > > -/* Only used to measure performance of REP STOSB. */ > > -ENTRY (__memset_erms) > > - /* Skip zero length. */ > > - test %RDX_LP, %RDX_LP > > - jnz L(stosb) > > - movq %rdi, %rax > > - ret > > -# else > > -/* Provide a hidden symbol to debugger. */ > > - .hidden MEMSET_SYMBOL (__memset, erms) > > -ENTRY (MEMSET_SYMBOL (__memset, erms)) > > -# endif > > -L(stosb): > > - mov %RDX_LP, %RCX_LP > > - movzbl %sil, %eax > > - mov %RDI_LP, %RDX_LP > > - rep stosb > > - mov %RDX_LP, %RAX_LP > > - VZEROUPPER_RETURN > > -# if VEC_SIZE == 16 > > -END (__memset_erms) > > -# else > > -END (MEMSET_SYMBOL (__memset, erms)) > > -# endif > > - > > # if defined SHARED && IS_IN (libc) > > ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms)) > > cmp %RDX_LP, %RCX_LP > > -- > > 2.34.1 > > > > > -- > H.J.