From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from mail-vs1-xe2b.google.com (mail-vs1-xe2b.google.com [IPv6:2607:f8b0:4864:20::e2b]) by sourceware.org (Postfix) with ESMTPS id B40F73839C43; Thu, 12 May 2022 19:55:48 +0000 (GMT) DMARC-Filter: OpenDMARC Filter v1.4.1 sourceware.org B40F73839C43 Received: by mail-vs1-xe2b.google.com with SMTP id b7so1966898vsq.1; Thu, 12 May 2022 12:55:48 -0700 (PDT) X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20210112; h=x-gm-message-state:mime-version:references:in-reply-to:from:date :message-id:subject:to:cc; bh=sTkY7kImr1F0ShAUUOTQQJppIIkHzfxRvW2JG6WSWic=; b=PhNXKwyQy8miljpen4uhvpsAZwgbOcHZ5etPwz+FfHZVNL4MwtWwoUIoeMJYWgrzZk hKdYVD0KVEKLpxRySiZXF4GWad4ND9foDG/v4YCKkmRSMp55/f/RUtvBf2j6kdFHBauX p7uRlJiWfM5Lj13BkB87cn8A9t7T8VuH8sTI3WpQGBgIdZcX00X4nOROqWwwpAzxe86u 8FU9CoEBeTDUxJKmWuhoj2rixeWhdIrVxdDJqpN5KeODDwwTPlazhDUIuAda/60HVbTQ XmoO0Tub4meE4KQ+7nS2aMMxwV5VfV+2gIMJJzyLsTi8SfppynHlmZpCHoz+AdT/xVQ0 dagg== X-Gm-Message-State: AOAM532eBqsJOq9FVYDjwEx0nwISw2eKLMhqslY6L6yrv2AmWUfdueMw DavnvvCBD6iKmFTtFONaPSH2V3tqL34Q8V9cwA8= X-Google-Smtp-Source: ABdhPJyopzuAQxcy8l5pHlBqdl9bjRQMnFvzsjdtAqJyLJjRv6jD/v8uQSStRmPubUWJPt1LCkvqo6AQrnAarwsWYsI= X-Received: by 2002:a67:ee4f:0:b0:32c:ee75:6e98 with SMTP id g15-20020a67ee4f000000b0032cee756e98mr875019vsp.79.1652385348173; Thu, 12 May 2022 12:55:48 -0700 (PDT) MIME-Version: 1.0 References: <20220325221333.3079015-1-goldstein.w.n@gmail.com> <20220325221333.3079015-2-goldstein.w.n@gmail.com> In-Reply-To: From: Sunil Pandey Date: Thu, 12 May 2022 12:55:12 -0700 Message-ID: Subject: Re: [PATCH v1 2/2] x86: Small improvements for wcslen To: "H.J. 
Lu" , Libc-stable Mailing List Cc: Noah Goldstein , GNU C Library Content-Type: text/plain; charset="UTF-8" X-Spam-Status: No, score=-7.8 required=5.0 tests=BAYES_00, DKIM_SIGNED, DKIM_VALID, DKIM_VALID_AU, DKIM_VALID_EF, FREEMAIL_ENVFROM_END_DIGIT, FREEMAIL_FROM, GIT_PATCH_0, HK_RANDOM_ENVFROM, HK_RANDOM_FROM, RCVD_IN_DNSWL_NONE, SPF_HELO_NONE, SPF_PASS, TXREP, T_SCC_BODY_TEXT_LINE autolearn=ham autolearn_force=no version=3.4.6 X-Spam-Checker-Version: SpamAssassin 3.4.6 (2021-04-09) on server2.sourceware.org X-BeenThere: libc-stable@sourceware.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Libc-stable mailing list List-Unsubscribe: , List-Archive: List-Help: List-Subscribe: , X-List-Received-Date: Thu, 12 May 2022 19:55:50 -0000 On Mon, Mar 28, 2022 at 11:53 AM H.J. Lu via Libc-alpha wrote: > > On Fri, Mar 25, 2022 at 3:13 PM Noah Goldstein wrote: > > > > Just a few QOL changes. > > 1. Prefer `add` > `lea` as it has more execution units it can run > > on. > > 2. Don't break macro-fusion between `test` and `jcc` > > 3. Reduce code size by removing gratuitous padding bytes (-90 > > bytes). > > > > geometric_mean(N=20) of all benchmarks New / Original: 0.959 > > > > All string/memory tests pass. 
> > --- > > sysdeps/x86_64/wcslen.S | 86 ++++++++++++++++++++--------------------- > > 1 file changed, 41 insertions(+), 45 deletions(-) > > > > diff --git a/sysdeps/x86_64/wcslen.S b/sysdeps/x86_64/wcslen.S > > index c9165dbf03..d641141d75 100644 > > --- a/sysdeps/x86_64/wcslen.S > > +++ b/sysdeps/x86_64/wcslen.S > > @@ -40,82 +40,82 @@ ENTRY (__wcslen) > > pxor %xmm0, %xmm0 > > > > lea 32(%rdi), %rax > > - lea 16(%rdi), %rcx > > + addq $16, %rdi > > and $-16, %rax > > > > pcmpeqd (%rax), %xmm0 > > pmovmskb %xmm0, %edx > > pxor %xmm1, %xmm1 > > + addq $16, %rax > > test %edx, %edx > > - lea 16(%rax), %rax > > jnz L(exit) > > > > pcmpeqd (%rax), %xmm1 > > pmovmskb %xmm1, %edx > > pxor %xmm2, %xmm2 > > + addq $16, %rax > > test %edx, %edx > > - lea 16(%rax), %rax > > jnz L(exit) > > > > pcmpeqd (%rax), %xmm2 > > pmovmskb %xmm2, %edx > > pxor %xmm3, %xmm3 > > + addq $16, %rax > > test %edx, %edx > > - lea 16(%rax), %rax > > jnz L(exit) > > > > pcmpeqd (%rax), %xmm3 > > pmovmskb %xmm3, %edx > > + addq $16, %rax > > test %edx, %edx > > - lea 16(%rax), %rax > > jnz L(exit) > > > > pcmpeqd (%rax), %xmm0 > > pmovmskb %xmm0, %edx > > + addq $16, %rax > > test %edx, %edx > > - lea 16(%rax), %rax > > jnz L(exit) > > > > pcmpeqd (%rax), %xmm1 > > pmovmskb %xmm1, %edx > > + addq $16, %rax > > test %edx, %edx > > - lea 16(%rax), %rax > > jnz L(exit) > > > > pcmpeqd (%rax), %xmm2 > > pmovmskb %xmm2, %edx > > + addq $16, %rax > > test %edx, %edx > > - lea 16(%rax), %rax > > jnz L(exit) > > > > pcmpeqd (%rax), %xmm3 > > pmovmskb %xmm3, %edx > > + addq $16, %rax > > test %edx, %edx > > - lea 16(%rax), %rax > > jnz L(exit) > > > > pcmpeqd (%rax), %xmm0 > > pmovmskb %xmm0, %edx > > + addq $16, %rax > > test %edx, %edx > > - lea 16(%rax), %rax > > jnz L(exit) > > > > pcmpeqd (%rax), %xmm1 > > pmovmskb %xmm1, %edx > > + addq $16, %rax > > test %edx, %edx > > - lea 16(%rax), %rax > > jnz L(exit) > > > > pcmpeqd (%rax), %xmm2 > > pmovmskb %xmm2, %edx > > + addq $16, %rax > > test %edx, 
%edx > > - lea 16(%rax), %rax > > jnz L(exit) > > > > pcmpeqd (%rax), %xmm3 > > pmovmskb %xmm3, %edx > > + addq $16, %rax > > test %edx, %edx > > - lea 16(%rax), %rax > > jnz L(exit) > > > > and $-0x40, %rax > > @@ -132,104 +132,100 @@ L(aligned_64_loop): > > pminub %xmm0, %xmm2 > > pcmpeqd %xmm3, %xmm2 > > pmovmskb %xmm2, %edx > > + addq $64, %rax > > test %edx, %edx > > - lea 64(%rax), %rax > > jz L(aligned_64_loop) > > > > pcmpeqd -64(%rax), %xmm3 > > pmovmskb %xmm3, %edx > > + addq $48, %rdi > > test %edx, %edx > > - lea 48(%rcx), %rcx > > jnz L(exit) > > > > pcmpeqd %xmm1, %xmm3 > > pmovmskb %xmm3, %edx > > + addq $-16, %rdi > > test %edx, %edx > > - lea -16(%rcx), %rcx > > jnz L(exit) > > > > pcmpeqd -32(%rax), %xmm3 > > pmovmskb %xmm3, %edx > > + addq $-16, %rdi > > test %edx, %edx > > - lea -16(%rcx), %rcx > > jnz L(exit) > > > > pcmpeqd %xmm6, %xmm3 > > pmovmskb %xmm3, %edx > > + addq $-16, %rdi > > test %edx, %edx > > - lea -16(%rcx), %rcx > > - jnz L(exit) > > - > > - jmp L(aligned_64_loop) > > + jz L(aligned_64_loop) > > > > .p2align 4 > > L(exit): > > - sub %rcx, %rax > > + sub %rdi, %rax > > shr $2, %rax > > test %dl, %dl > > jz L(exit_high) > > > > - mov %dl, %cl > > - and $15, %cl > > + andl $15, %edx > > jz L(exit_1) > > ret > > > > - .p2align 4 > > + /* No align here. Naturally aligned % 16 == 1. 
*/ > > L(exit_high): > > - mov %dh, %ch > > - and $15, %ch > > + andl $(15 << 8), %edx > > jz L(exit_3) > > add $2, %rax > > ret > > > > - .p2align 4 > > + .p2align 3 > > L(exit_1): > > add $1, %rax > > ret > > > > - .p2align 4 > > + .p2align 3 > > L(exit_3): > > add $3, %rax > > ret > > > > - .p2align 4 > > + .p2align 3 > > L(exit_tail0): > > - xor %rax, %rax > > + xorl %eax, %eax > > ret > > > > - .p2align 4 > > + .p2align 3 > > L(exit_tail1): > > - mov $1, %rax > > + movl $1, %eax > > ret > > > > - .p2align 4 > > + .p2align 3 > > L(exit_tail2): > > - mov $2, %rax > > + movl $2, %eax > > ret > > > > - .p2align 4 > > + .p2align 3 > > L(exit_tail3): > > - mov $3, %rax > > + movl $3, %eax > > ret > > > > - .p2align 4 > > + .p2align 3 > > L(exit_tail4): > > - mov $4, %rax > > + movl $4, %eax > > ret > > > > - .p2align 4 > > + .p2align 3 > > L(exit_tail5): > > - mov $5, %rax > > + movl $5, %eax > > ret > > > > - .p2align 4 > > + .p2align 3 > > L(exit_tail6): > > - mov $6, %rax > > + movl $6, %eax > > ret > > > > - .p2align 4 > > + .p2align 3 > > L(exit_tail7): > > - mov $7, %rax > > + movl $7, %eax > > ret > > > > END (__wcslen) > > -- > > 2.25.1 > > > > LGTM. > > Reviewed-by: H.J. Lu > > Thanks. > > -- > H.J. I would like to backport this patch to release branches. Any comments or objections? --Sunil