From: "H.J. Lu"
Date: Fri, 14 Oct 2022 19:58:09 -0700
Subject: Re: [PATCH v9 6/6] x86: Update strlen-evex-base to use new reg/vec macros.
To: Noah Goldstein
Cc: libc-alpha@sourceware.org, carlos@systemhalted.org

On Fri, Oct 14, 2022 at 5:21 PM Noah Goldstein wrote:
>
> To avoid duplicating the VMM / GPR / mask insn macros in all incoming
> evex512 files, use the macros defined in 'reg-macros.h' and
> '{vec}-macros.h'.
>
> This commit does not change libc.so.
>
> Tested build on x86-64.
> ---
>  sysdeps/x86_64/multiarch/strlen-evex-base.S | 116 +++++++-------------
>  sysdeps/x86_64/multiarch/strlen-evex512.S   |   4 +-
>  2 files changed, 44 insertions(+), 76 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/strlen-evex-base.S b/sysdeps/x86_64/multiarch/strlen-evex-base.S
> index 418e9f8411..c832b15a48 100644
> --- a/sysdeps/x86_64/multiarch/strlen-evex-base.S
> +++ b/sysdeps/x86_64/multiarch/strlen-evex-base.S
> @@ -36,42 +36,10 @@
>  # define CHAR_SIZE	1
>  # endif
>
> -# define XMM0		xmm16
>  # define PAGE_SIZE	4096
>  # define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
>
> -# if VEC_SIZE == 64
> -# define KMOV		kmovq
> -# define KORTEST	kortestq
> -# define RAX		rax
> -# define RCX		rcx
> -# define RDX		rdx
> -# define SHR		shrq
> -# define TEXTSUFFIX	evex512
> -# define VMM0		zmm16
> -# define VMM1		zmm17
> -# define VMM2		zmm18
> -# define VMM3		zmm19
> -# define VMM4		zmm20
> -# define VMOVA		vmovdqa64
> -# elif VEC_SIZE == 32
> -/* Currently Unused. */
> -# define KMOV		kmovd
> -# define KORTEST	kortestd
> -# define RAX		eax
> -# define RCX		ecx
> -# define RDX		edx
> -# define SHR		shrl
> -# define TEXTSUFFIX	evex256
> -# define VMM0		ymm16
> -# define VMM1		ymm17
> -# define VMM2		ymm18
> -# define VMM3		ymm19
> -# define VMM4		ymm20
> -# define VMOVA		vmovdqa32
> -# endif
> -
> -	.section .text.TEXTSUFFIX, "ax", @progbits
> +	.section SECTION(.text),"ax",@progbits
>  	/* Aligning entry point to 64 byte, provides better performance for
>  	   one vector length string. */
>  ENTRY_P2ALIGN (STRLEN, 6)
> @@ -86,18 +54,18 @@ ENTRY_P2ALIGN (STRLEN, 6)
>  # endif
>
>  	movl	%edi, %eax
> -	vpxorq	%XMM0, %XMM0, %XMM0
> +	vpxorq	%VMM_128(0), %VMM_128(0), %VMM_128(0)
>  	andl	$(PAGE_SIZE - 1), %eax
>  	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
>  	ja	L(page_cross)
>
>  	/* Compare [w]char for null, mask bit will be set for match. */
> -	VPCMP	$0, (%rdi), %VMM0, %k0
> -	KMOV	%k0, %RAX
> -	test	%RAX, %RAX
> +	VPCMP	$0, (%rdi), %VMM(0), %k0
> +	KMOV	%k0, %VRAX
> +	test	%VRAX, %VRAX
>  	jz	L(align_more)
>
> -	bsf	%RAX, %RAX
> +	bsf	%VRAX, %VRAX
>  # ifdef USE_AS_STRNLEN
>  	cmpq	%rsi, %rax
>  	cmovnb	%rsi, %rax
> @@ -120,7 +88,7 @@ L(align_more):
>  	movq	%rax, %rdx
>  	subq	%rdi, %rdx
>  # ifdef USE_AS_WCSLEN
> -	SHR	$2, %RDX
> +	shr	$2, %VRDX
>  # endif
>  	/* At this point rdx contains [w]chars already compared. */
>  	subq	%rsi, %rdx
> @@ -131,9 +99,9 @@ L(align_more):
>  # endif
>
>  	/* Loop unroll 4 times for 4 vector loop. */
> -	VPCMP	$0, (%rax), %VMM0, %k0
> -	KMOV	%k0, %RCX
> -	test	%RCX, %RCX
> +	VPCMP	$0, (%rax), %VMM(0), %k0
> +	KMOV	%k0, %VRCX
> +	test	%VRCX, %VRCX
>  	jnz	L(ret_vec_x1)
>
>  # ifdef USE_AS_STRNLEN
> @@ -141,9 +109,9 @@ L(align_more):
>  	jbe	L(ret_max)
>  # endif
>
> -	VPCMP	$0, VEC_SIZE(%rax), %VMM0, %k0
> -	KMOV	%k0, %RCX
> -	test	%RCX, %RCX
> +	VPCMP	$0, VEC_SIZE(%rax), %VMM(0), %k0
> +	KMOV	%k0, %VRCX
> +	test	%VRCX, %VRCX
>  	jnz	L(ret_vec_x2)
>
>  # ifdef USE_AS_STRNLEN
> @@ -151,9 +119,9 @@ L(align_more):
>  	jbe	L(ret_max)
>  # endif
>
> -	VPCMP	$0, (VEC_SIZE * 2)(%rax), %VMM0, %k0
> -	KMOV	%k0, %RCX
> -	test	%RCX, %RCX
> +	VPCMP	$0, (VEC_SIZE * 2)(%rax), %VMM(0), %k0
> +	KMOV	%k0, %VRCX
> +	test	%VRCX, %VRCX
>  	jnz	L(ret_vec_x3)
>
>  # ifdef USE_AS_STRNLEN
> @@ -161,9 +129,9 @@ L(align_more):
>  	jbe	L(ret_max)
>  # endif
>
> -	VPCMP	$0, (VEC_SIZE * 3)(%rax), %VMM0, %k0
> -	KMOV	%k0, %RCX
> -	test	%RCX, %RCX
> +	VPCMP	$0, (VEC_SIZE * 3)(%rax), %VMM(0), %k0
> +	KMOV	%k0, %VRCX
> +	test	%VRCX, %VRCX
>  	jnz	L(ret_vec_x4)
>
>  # ifdef USE_AS_STRNLEN
> @@ -179,7 +147,7 @@ L(align_more):
>  # ifdef USE_AS_STRNLEN
>  	subq	%rax, %rcx
>  # ifdef USE_AS_WCSLEN
> -	SHR	$2, %RCX
> +	shr	$2, %VRCX
>  # endif
>  	/* rcx contains number of [w]char will be recompared due to
>  	   alignment fixes. rdx must be incremented by rcx to offset
> @@ -199,42 +167,42 @@ L(loop_entry):
>  # endif
>  	/* VPMINU and VPCMP combination provide better performance as
>  	   compared to alternative combinations. */
> -	VMOVA	(VEC_SIZE * 4)(%rax), %VMM1
> -	VPMINU	(VEC_SIZE * 5)(%rax), %VMM1, %VMM2
> -	VMOVA	(VEC_SIZE * 6)(%rax), %VMM3
> -	VPMINU	(VEC_SIZE * 7)(%rax), %VMM3, %VMM4
> +	VMOVA	(VEC_SIZE * 4)(%rax), %VMM(1)
> +	VPMINU	(VEC_SIZE * 5)(%rax), %VMM(1), %VMM(2)
> +	VMOVA	(VEC_SIZE * 6)(%rax), %VMM(3)
> +	VPMINU	(VEC_SIZE * 7)(%rax), %VMM(3), %VMM(4)
>
> -	VPTESTN	%VMM2, %VMM2, %k0
> -	VPTESTN	%VMM4, %VMM4, %k1
> +	VPTESTN	%VMM(2), %VMM(2), %k0
> +	VPTESTN	%VMM(4), %VMM(4), %k1
>
>  	subq	$-(VEC_SIZE * 4), %rax
>  	KORTEST	%k0, %k1
>  	jz	L(loop)
>
> -	VPTESTN	%VMM1, %VMM1, %k2
> -	KMOV	%k2, %RCX
> -	test	%RCX, %RCX
> +	VPTESTN	%VMM(1), %VMM(1), %k2
> +	KMOV	%k2, %VRCX
> +	test	%VRCX, %VRCX
>  	jnz	L(ret_vec_x1)
>
> -	KMOV	%k0, %RCX
> +	KMOV	%k0, %VRCX
>  	/* At this point, if k0 is non zero, null char must be in the
>  	   second vector. */
> -	test	%RCX, %RCX
> +	test	%VRCX, %VRCX
>  	jnz	L(ret_vec_x2)
>
> -	VPTESTN	%VMM3, %VMM3, %k3
> -	KMOV	%k3, %RCX
> -	test	%RCX, %RCX
> +	VPTESTN	%VMM(3), %VMM(3), %k3
> +	KMOV	%k3, %VRCX
> +	test	%VRCX, %VRCX
>  	jnz	L(ret_vec_x3)
>  	/* At this point null [w]char must be in the fourth vector so no
>  	   need to check. */
> -	KMOV	%k1, %RCX
> +	KMOV	%k1, %VRCX
>
>  	/* Fourth, third, second vector terminating are pretty much
>  	   same, implemented this way to avoid branching and reuse code
>  	   from pre loop exit condition. */
>  L(ret_vec_x4):
> -	bsf	%RCX, %RCX
> +	bsf	%VRCX, %VRCX
>  	subq	%rdi, %rax
>  # ifdef USE_AS_WCSLEN
>  	subq	$-(VEC_SIZE * 3), %rax
> @@ -250,7 +218,7 @@ L(ret_vec_x4):
>  	ret
>
>  L(ret_vec_x3):
> -	bsf	%RCX, %RCX
> +	bsf	%VRCX, %VRCX
>  	subq	%rdi, %rax
>  # ifdef USE_AS_WCSLEN
>  	subq	$-(VEC_SIZE * 2), %rax
> @@ -268,7 +236,7 @@ L(ret_vec_x3):
>  L(ret_vec_x2):
>  	subq	$-VEC_SIZE, %rax
>  L(ret_vec_x1):
> -	bsf	%RCX, %RCX
> +	bsf	%VRCX, %VRCX
>  	subq	%rdi, %rax
>  # ifdef USE_AS_WCSLEN
>  	shrq	$2, %rax
> @@ -289,13 +257,13 @@ L(page_cross):
>  	/* ecx contains number of w[char] to be skipped as a result
>  	   of address alignment. */
>  	xorq	%rdi, %rax
> -	VPCMP	$0, (PAGE_SIZE - VEC_SIZE)(%rax), %VMM0, %k0
> -	KMOV	%k0, %RAX
> +	VPCMP	$0, (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(0), %k0
> +	KMOV	%k0, %VRAX
>  	/* Ignore number of character for alignment adjustment. */
> -	SHR	%cl, %RAX
> +	shr	%cl, %VRAX
>  	jz	L(align_more)
>
> -	bsf	%RAX, %RAX
> +	bsf	%VRAX, %VRAX
>  # ifdef USE_AS_STRNLEN
>  	cmpq	%rsi, %rax
>  	cmovnb	%rsi, %rax
> diff --git a/sysdeps/x86_64/multiarch/strlen-evex512.S b/sysdeps/x86_64/multiarch/strlen-evex512.S
> index 116f8981c8..10c3415c8a 100644
> --- a/sysdeps/x86_64/multiarch/strlen-evex512.S
> +++ b/sysdeps/x86_64/multiarch/strlen-evex512.S
> @@ -2,6 +2,6 @@
>  # define STRLEN __strlen_evex512
>  #endif
>
> -#define VEC_SIZE 64
> -
> +#include "x86-evex512-vecs.h"
> +#include "reg-macros.h"
>  #include "strlen-evex-base.S"
> --
> 2.34.1
>

LGTM.

Thanks.

--
H.J.
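
For context, the macro layer the patch switches to works roughly as follows. The sketch below is illustrative only -- it is not the actual contents of glibc's reg-macros.h or the x86-*-vecs.h headers, and the VMM_0, VMM_128_0 and PASTE names are made up for the example -- but it shows how a per-vector-size header plus generic accessors can replace the per-file VMM0..VMM4, RAX/RCX/RDX, KMOV and KORTEST defines that this patch deletes from strlen-evex-base.S:

	/* Hypothetical "x86-evex512-vecs.h": pick the 64-byte vector class
	   once.  An evex256 counterpart would use ymm registers and
	   VEC_SIZE 32.  */
	#define VEC_SIZE	64
	#define VMM_0		zmm16
	#define VMM_1		zmm17
	#define VMM_2		zmm18
	#define VMM_3		zmm19
	#define VMM_4		zmm20
	#define VMM_128_0	xmm16	/* 128-bit view of the same register */

	/* Hypothetical "reg-macros.h": generic accessors plus the
	   width-dependent GPR and mask-instruction names derived from
	   VEC_SIZE.  */
	#define PASTE(a, b)	a##b
	#define VMM(N)		PASTE (VMM_, N)		/* VMM(1) -> zmm17 */
	#define VMM_128(N)	PASTE (VMM_128_, N)	/* VMM_128(0) -> xmm16 */

	#if VEC_SIZE == 64
	# define KMOV		kmovq	/* 64 mask bits, one per byte lane */
	# define KORTEST	kortestq
	# define VRAX		rax	/* GPR wide enough to hold the mask */
	# define VRCX		rcx
	# define VRDX		rdx
	#else	/* VEC_SIZE == 32 */
	# define KMOV		kmovd
	# define KORTEST	kortestd
	# define VRAX		eax
	# define VRCX		ecx
	# define VRDX		edx
	#endif

With a layer like this, a source line such as "KMOV %k0, %VRAX" assembles to "kmovq %k0, %rax" in the evex512 build, and the same strlen-evex-base.S could be rebuilt as an evex256 variant without carrying its own register tables.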