From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from mail-pg1-x52e.google.com (mail-pg1-x52e.google.com [IPv6:2607:f8b0:4864:20::52e]) by sourceware.org (Postfix) with ESMTPS id 266EE385781F for ; Mon, 28 Mar 2022 18:52:36 +0000 (GMT) DMARC-Filter: OpenDMARC Filter v1.4.1 sourceware.org 266EE385781F Received: by mail-pg1-x52e.google.com with SMTP id t13so11671887pgn.8 for ; Mon, 28 Mar 2022 11:52:36 -0700 (PDT) X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20210112; h=x-gm-message-state:mime-version:references:in-reply-to:from:date :message-id:subject:to:cc; bh=xNrKC9eUsDoyZPWGcnMYJhWswoEapjC//zBr/kLKXAk=; b=BDyXNGlS6qhvuFqPrrpi92Qy9WLo0CUx1TeW8U1AvqQuygFtPaALA2mU+TC3PEkJ1I uTjh/1iMjhmGZcRjJAaW8+DfDBJMcf1TMCpPbTBKtT+pfSD4b46jg4kN6NT4uwFBy+25 tv/T7IkgADVc9J0fckohAgqL0jiBikbk7/hDKo3LvNJ7wM4n+8PDjPDSSRr2+0Xgpoo7 fBU5hcg/n+POEcULYT3ezn+BCPG8tBZZJzLsGgERXnlo8hi5N9xD50vfRGtuPI12Zrxb Wg7sCYT9MaZEfI2awbIpT9WyHRoPkMdb6Y6Neoym0HA1R1Z8QZPh0kW0yzA+5VzNvifC uDNg== X-Gm-Message-State: AOAM533byTadb6MLGWs8dntyBLaADGgc24JJMDrMSltyDbp/paeiKYdP bpfXDt5SAFuuQ1McZAZb4YhxOqlzHRqXJDqU+sjDlln40JM= X-Google-Smtp-Source: ABdhPJzAU7/rLX2VaDsKnXseoXC/xMjdSBQNhWJzE3aRf8OVVb90efSCS/UlgcbSDmpqV1Pm0dfM8iV/vWVgJS5H03I= X-Received: by 2002:aa7:8d54:0:b0:4e0:bd6:cfb9 with SMTP id s20-20020aa78d54000000b004e00bd6cfb9mr24663367pfe.60.1648493555118; Mon, 28 Mar 2022 11:52:35 -0700 (PDT) MIME-Version: 1.0 References: <20220325221333.3079015-1-goldstein.w.n@gmail.com> <20220325221333.3079015-2-goldstein.w.n@gmail.com> In-Reply-To: <20220325221333.3079015-2-goldstein.w.n@gmail.com> From: "H.J. 
Lu" Date: Mon, 28 Mar 2022 11:51:59 -0700 Message-ID: Subject: Re: [PATCH v1 2/2] x86: Small improvements for wcslen To: Noah Goldstein Cc: GNU C Library , "Carlos O'Donell" Content-Type: text/plain; charset="UTF-8" X-Spam-Status: No, score=-3026.0 required=5.0 tests=BAYES_00, DKIM_SIGNED, DKIM_VALID, DKIM_VALID_AU, DKIM_VALID_EF, FREEMAIL_FROM, GIT_PATCH_0, RCVD_IN_DNSWL_NONE, SPF_HELO_NONE, SPF_PASS, TXREP, T_SCC_BODY_TEXT_LINE autolearn=ham autolearn_force=no version=3.4.4 X-Spam-Checker-Version: SpamAssassin 3.4.4 (2020-01-24) on server2.sourceware.org X-BeenThere: libc-alpha@sourceware.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Libc-alpha mailing list List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Mon, 28 Mar 2022 18:52:37 -0000 On Fri, Mar 25, 2022 at 3:13 PM Noah Goldstein wrote: > > Just a few QOL changes. > 1. Prefer `add` > `lea` as it has more execution units it can run > on. > 2. Don't break macro-fusion between `test` and `jcc` > 3. Reduce code size by removing gratuitous padding bytes (-90 > bytes). > > geometric_mean(N=20) of all benchmarks New / Original: 0.959 > > All string/memory tests pass. 
> --- > sysdeps/x86_64/wcslen.S | 86 ++++++++++++++++++++--------------------- > 1 file changed, 41 insertions(+), 45 deletions(-) > > diff --git a/sysdeps/x86_64/wcslen.S b/sysdeps/x86_64/wcslen.S > index c9165dbf03..d641141d75 100644 > --- a/sysdeps/x86_64/wcslen.S > +++ b/sysdeps/x86_64/wcslen.S > @@ -40,82 +40,82 @@ ENTRY (__wcslen) > pxor %xmm0, %xmm0 > > lea 32(%rdi), %rax > - lea 16(%rdi), %rcx > + addq $16, %rdi > and $-16, %rax > > pcmpeqd (%rax), %xmm0 > pmovmskb %xmm0, %edx > pxor %xmm1, %xmm1 > + addq $16, %rax > test %edx, %edx > - lea 16(%rax), %rax > jnz L(exit) > > pcmpeqd (%rax), %xmm1 > pmovmskb %xmm1, %edx > pxor %xmm2, %xmm2 > + addq $16, %rax > test %edx, %edx > - lea 16(%rax), %rax > jnz L(exit) > > pcmpeqd (%rax), %xmm2 > pmovmskb %xmm2, %edx > pxor %xmm3, %xmm3 > + addq $16, %rax > test %edx, %edx > - lea 16(%rax), %rax > jnz L(exit) > > pcmpeqd (%rax), %xmm3 > pmovmskb %xmm3, %edx > + addq $16, %rax > test %edx, %edx > - lea 16(%rax), %rax > jnz L(exit) > > pcmpeqd (%rax), %xmm0 > pmovmskb %xmm0, %edx > + addq $16, %rax > test %edx, %edx > - lea 16(%rax), %rax > jnz L(exit) > > pcmpeqd (%rax), %xmm1 > pmovmskb %xmm1, %edx > + addq $16, %rax > test %edx, %edx > - lea 16(%rax), %rax > jnz L(exit) > > pcmpeqd (%rax), %xmm2 > pmovmskb %xmm2, %edx > + addq $16, %rax > test %edx, %edx > - lea 16(%rax), %rax > jnz L(exit) > > pcmpeqd (%rax), %xmm3 > pmovmskb %xmm3, %edx > + addq $16, %rax > test %edx, %edx > - lea 16(%rax), %rax > jnz L(exit) > > pcmpeqd (%rax), %xmm0 > pmovmskb %xmm0, %edx > + addq $16, %rax > test %edx, %edx > - lea 16(%rax), %rax > jnz L(exit) > > pcmpeqd (%rax), %xmm1 > pmovmskb %xmm1, %edx > + addq $16, %rax > test %edx, %edx > - lea 16(%rax), %rax > jnz L(exit) > > pcmpeqd (%rax), %xmm2 > pmovmskb %xmm2, %edx > + addq $16, %rax > test %edx, %edx > - lea 16(%rax), %rax > jnz L(exit) > > pcmpeqd (%rax), %xmm3 > pmovmskb %xmm3, %edx > + addq $16, %rax > test %edx, %edx > - lea 16(%rax), %rax > jnz L(exit) > > and $-0x40, %rax > 
@@ -132,104 +132,100 @@ L(aligned_64_loop): > pminub %xmm0, %xmm2 > pcmpeqd %xmm3, %xmm2 > pmovmskb %xmm2, %edx > + addq $64, %rax > test %edx, %edx > - lea 64(%rax), %rax > jz L(aligned_64_loop) > > pcmpeqd -64(%rax), %xmm3 > pmovmskb %xmm3, %edx > + addq $48, %rdi > test %edx, %edx > - lea 48(%rcx), %rcx > jnz L(exit) > > pcmpeqd %xmm1, %xmm3 > pmovmskb %xmm3, %edx > + addq $-16, %rdi > test %edx, %edx > - lea -16(%rcx), %rcx > jnz L(exit) > > pcmpeqd -32(%rax), %xmm3 > pmovmskb %xmm3, %edx > + addq $-16, %rdi > test %edx, %edx > - lea -16(%rcx), %rcx > jnz L(exit) > > pcmpeqd %xmm6, %xmm3 > pmovmskb %xmm3, %edx > + addq $-16, %rdi > test %edx, %edx > - lea -16(%rcx), %rcx > - jnz L(exit) > - > - jmp L(aligned_64_loop) > + jz L(aligned_64_loop) > > .p2align 4 > L(exit): > - sub %rcx, %rax > + sub %rdi, %rax > shr $2, %rax > test %dl, %dl > jz L(exit_high) > > - mov %dl, %cl > - and $15, %cl > + andl $15, %edx > jz L(exit_1) > ret > > - .p2align 4 > + /* No align here. Naturally aligned % 16 == 1. 
*/ > L(exit_high): > - mov %dh, %ch > - and $15, %ch > + andl $(15 << 8), %edx > jz L(exit_3) > add $2, %rax > ret > > - .p2align 4 > + .p2align 3 > L(exit_1): > add $1, %rax > ret > > - .p2align 4 > + .p2align 3 > L(exit_3): > add $3, %rax > ret > > - .p2align 4 > + .p2align 3 > L(exit_tail0): > - xor %rax, %rax > + xorl %eax, %eax > ret > > - .p2align 4 > + .p2align 3 > L(exit_tail1): > - mov $1, %rax > + movl $1, %eax > ret > > - .p2align 4 > + .p2align 3 > L(exit_tail2): > - mov $2, %rax > + movl $2, %eax > ret > > - .p2align 4 > + .p2align 3 > L(exit_tail3): > - mov $3, %rax > + movl $3, %eax > ret > > - .p2align 4 > + .p2align 3 > L(exit_tail4): > - mov $4, %rax > + movl $4, %eax > ret > > - .p2align 4 > + .p2align 3 > L(exit_tail5): > - mov $5, %rax > + movl $5, %eax > ret > > - .p2align 4 > + .p2align 3 > L(exit_tail6): > - mov $6, %rax > + movl $6, %eax > ret > > - .p2align 4 > + .p2align 3 > L(exit_tail7): > - mov $7, %rax > + movl $7, %eax > ret > > END (__wcslen) > -- > 2.25.1 > LGTM. Reviewed-by: H.J. Lu Thanks. -- H.J.