Re: [PATCH v1 1/2] x86: Small improvements for wcscpy-ssse3

public inbox for libc-alpha@sourceware.org
 help / color / mirror / Atom feed

From: "H.J. Lu" <hjl.tools@gmail.com>
To: Noah Goldstein <goldstein.w.n@gmail.com>
Cc: GNU C Library <libc-alpha@sourceware.org>,
	"Carlos O'Donell" <carlos@systemhalted.org>
Subject: Re: [PATCH v1 1/2] x86: Small improvements for wcscpy-ssse3
Date: Mon, 28 Mar 2022 11:51:33 -0700	[thread overview]
Message-ID: <CAMe9rOrECOhAiiJ8j5Y8kCwjcvuJx0nEmK9s3GF59KirDTbAbg@mail.gmail.com> (raw)
In-Reply-To: <20220325221333.3079015-1-goldstein.w.n@gmail.com>

On Fri, Mar 25, 2022 at 3:13 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Just a few small QOL changes.
>     1. Prefer `add` over `lea`, as `add` can run on more execution
>        units.
>     2. Don't break macro-fusion between `test` and `jcc`: keep the
>        `test` immediately before the branch it feeds.
>
> geometric_mean(N=20) of all benchmarks New / Original: 0.973
>
> All string/memory tests pass.
> ---
>  sysdeps/x86_64/multiarch/wcscpy-ssse3.S | 194 ++++++++++++------------
>  1 file changed, 97 insertions(+), 97 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/wcscpy-ssse3.S b/sysdeps/x86_64/multiarch/wcscpy-ssse3.S
> index 34b09af327..aa2b9d030f 100644
> --- a/sysdeps/x86_64/multiarch/wcscpy-ssse3.S
> +++ b/sysdeps/x86_64/multiarch/wcscpy-ssse3.S
> @@ -52,7 +52,7 @@ ENTRY (__wcscpy_ssse3)
>         jnz     L(CopyFrom1To16Bytes)
>
>         mov     %rdx, %rax
> -       lea     16(%rdx), %rdx
> +       addq    $16, %rdx
>         and     $-16, %rdx
>         sub     %rdx, %rax
>         sub     %rax, %rcx
> @@ -75,55 +75,55 @@ L(Align16Both):
>         movaps  16(%rcx), %xmm2
>         movaps  %xmm1, (%rdx)
>         pcmpeqd %xmm2, %xmm0
> -       pmovmskb %xmm0, %rax
> -       lea     16(%rsi), %rsi
> +       pmovmskb %xmm0, %eax
> +       addq    $16, %rsi
>
> -       test    %rax, %rax
> +       test    %eax, %eax
>         jnz     L(CopyFrom1To16Bytes)
>
>         movaps  16(%rcx, %rsi), %xmm3
>         movaps  %xmm2, (%rdx, %rsi)
>         pcmpeqd %xmm3, %xmm0
> -       pmovmskb %xmm0, %rax
> -       lea     16(%rsi), %rsi
> +       pmovmskb %xmm0, %eax
> +       addq    $16, %rsi
>
> -       test    %rax, %rax
> +       test    %eax, %eax
>         jnz     L(CopyFrom1To16Bytes)
>
>         movaps  16(%rcx, %rsi), %xmm4
>         movaps  %xmm3, (%rdx, %rsi)
>         pcmpeqd %xmm4, %xmm0
> -       pmovmskb %xmm0, %rax
> -       lea     16(%rsi), %rsi
> +       pmovmskb %xmm0, %eax
> +       addq    $16, %rsi
>
> -       test    %rax, %rax
> +       test    %eax, %eax
>         jnz     L(CopyFrom1To16Bytes)
>
>         movaps  16(%rcx, %rsi), %xmm1
>         movaps  %xmm4, (%rdx, %rsi)
>         pcmpeqd %xmm1, %xmm0
> -       pmovmskb %xmm0, %rax
> -       lea     16(%rsi), %rsi
> +       pmovmskb %xmm0, %eax
> +       addq    $16, %rsi
>
> -       test    %rax, %rax
> +       test    %eax, %eax
>         jnz     L(CopyFrom1To16Bytes)
>
>         movaps  16(%rcx, %rsi), %xmm2
>         movaps  %xmm1, (%rdx, %rsi)
>         pcmpeqd %xmm2, %xmm0
> -       pmovmskb %xmm0, %rax
> -       lea     16(%rsi), %rsi
> +       pmovmskb %xmm0, %eax
> +       addq    $16, %rsi
>
> -       test    %rax, %rax
> +       test    %eax, %eax
>         jnz     L(CopyFrom1To16Bytes)
>
>         movaps  16(%rcx, %rsi), %xmm3
>         movaps  %xmm2, (%rdx, %rsi)
>         pcmpeqd %xmm3, %xmm0
> -       pmovmskb %xmm0, %rax
> -       lea     16(%rsi), %rsi
> +       pmovmskb %xmm0, %eax
> +       addq    $16, %rsi
>
> -       test    %rax, %rax
> +       test    %eax, %eax
>         jnz     L(CopyFrom1To16Bytes)
>
>         movaps  %xmm3, (%rdx, %rsi)
> @@ -147,10 +147,10 @@ L(Aligned64Loop):
>         pminub  %xmm7, %xmm3
>         pminub  %xmm2, %xmm3
>         pcmpeqd %xmm0, %xmm3
> -       pmovmskb %xmm3, %rax
> -       lea     64(%rdx), %rdx
> -       lea     64(%rcx), %rcx
> -       test    %rax, %rax
> +       pmovmskb %xmm3, %eax
> +       addq    $64, %rdx
> +       addq    $64, %rcx
> +       testl   %eax, %eax
>         jnz     L(Aligned64Leave)
>         movaps  %xmm4, -64(%rdx)
>         movaps  %xmm5, -48(%rdx)
> @@ -160,32 +160,32 @@ L(Aligned64Loop):
>
>  L(Aligned64Leave):
>         pcmpeqd %xmm4, %xmm0
> -       pmovmskb %xmm0, %rax
> -       test    %rax, %rax
> +       pmovmskb %xmm0, %eax
> +       test    %eax, %eax
>         jnz     L(CopyFrom1To16Bytes)
>
>         pcmpeqd %xmm5, %xmm0
>
> -       pmovmskb %xmm0, %rax
> +       pmovmskb %xmm0, %eax
>         movaps  %xmm4, -64(%rdx)
> -       test    %rax, %rax
> -       lea     16(%rsi), %rsi
> +       addq    $16, %rsi
> +       test    %eax, %eax
>         jnz     L(CopyFrom1To16Bytes)
>
>         pcmpeqd %xmm6, %xmm0
>
> -       pmovmskb %xmm0, %rax
> +       pmovmskb %xmm0, %eax
>         movaps  %xmm5, -48(%rdx)
> -       test    %rax, %rax
> -       lea     16(%rsi), %rsi
> +       addq    $16, %rsi
> +       test    %eax, %eax
>         jnz     L(CopyFrom1To16Bytes)
>
>         movaps  %xmm6, -32(%rdx)
>         pcmpeqd %xmm7, %xmm0
>
> -       pmovmskb %xmm0, %rax
> -       lea     16(%rsi), %rsi
> -       test    %rax, %rax
> +       pmovmskb %xmm0, %eax
> +       addq    $16, %rsi
> +       test    %eax, %eax
>         jnz     L(CopyFrom1To16Bytes)
>
>         mov     $-0x40, %rsi
> @@ -198,10 +198,10 @@ L(Shl4):
>         movaps  12(%rcx), %xmm2
>  L(Shl4Start):
>         pcmpeqd %xmm2, %xmm0
> -       pmovmskb %xmm0, %rax
> +       pmovmskb %xmm0, %eax
>         movaps  %xmm2, %xmm3
>
> -       test    %rax, %rax
> +       test    %eax, %eax
>         jnz     L(Shl4LoopExit)
>
>         palignr $4, %xmm1, %xmm2
> @@ -209,12 +209,12 @@ L(Shl4Start):
>         movaps  28(%rcx), %xmm2
>
>         pcmpeqd %xmm2, %xmm0
> -       lea     16(%rdx), %rdx
> -       pmovmskb %xmm0, %rax
> -       lea     16(%rcx), %rcx
> +       addq    $16, %rdx
> +       pmovmskb %xmm0, %eax
> +       addq    $16, %rcx
>         movaps  %xmm2, %xmm1
>
> -       test    %rax, %rax
> +       test    %eax, %eax
>         jnz     L(Shl4LoopExit)
>
>         palignr $4, %xmm3, %xmm2
> @@ -222,12 +222,12 @@ L(Shl4Start):
>         movaps  28(%rcx), %xmm2
>
>         pcmpeqd %xmm2, %xmm0
> -       lea     16(%rdx), %rdx
> -       pmovmskb %xmm0, %rax
> -       lea     16(%rcx), %rcx
> +       addq    $16, %rdx
> +       pmovmskb %xmm0, %eax
> +       addq    $16, %rcx
>         movaps  %xmm2, %xmm3
>
> -       test    %rax, %rax
> +       test    %eax, %eax
>         jnz     L(Shl4LoopExit)
>
>         palignr $4, %xmm1, %xmm2
> @@ -235,22 +235,22 @@ L(Shl4Start):
>         movaps  28(%rcx), %xmm2
>
>         pcmpeqd %xmm2, %xmm0
> -       lea     16(%rdx), %rdx
> -       pmovmskb %xmm0, %rax
> -       lea     16(%rcx), %rcx
> +       addq    $16, %rdx
> +       pmovmskb %xmm0, %eax
> +       addq    $16, %rcx
>
> -       test    %rax, %rax
> +       test    %eax, %eax
>         jnz     L(Shl4LoopExit)
>
>         palignr $4, %xmm3, %xmm2
>         movaps  %xmm2, (%rdx)
> -       lea     28(%rcx), %rcx
> -       lea     16(%rdx), %rdx
> +       addq    $28, %rcx
> +       addq    $16, %rdx
>
>         mov     %rcx, %rax
>         and     $-0x40, %rcx
>         sub     %rcx, %rax
> -       lea     -12(%rcx), %rcx
> +       addq    $-12, %rcx
>         sub     %rax, %rdx
>
>         movaps  -4(%rcx), %xmm1
> @@ -267,22 +267,22 @@ L(Shl4LoopStart):
>         pminub  %xmm5, %xmm7
>         pminub  %xmm6, %xmm7
>         pcmpeqd %xmm0, %xmm7
> -       pmovmskb %xmm7, %rax
> +       pmovmskb %xmm7, %eax
>         movaps  %xmm5, %xmm7
>         palignr $4, %xmm4, %xmm5
> -       test    %rax, %rax
>         palignr $4, %xmm3, %xmm4
> +       test    %eax, %eax
>         jnz     L(Shl4Start)
>
>         palignr $4, %xmm2, %xmm3
> -       lea     64(%rcx), %rcx
> +       addq    $64, %rcx
>         palignr $4, %xmm1, %xmm2
>         movaps  %xmm7, %xmm1
>         movaps  %xmm5, 48(%rdx)
>         movaps  %xmm4, 32(%rdx)
>         movaps  %xmm3, 16(%rdx)
>         movaps  %xmm2, (%rdx)
> -       lea     64(%rdx), %rdx
> +       addq    $64, %rdx
>         jmp     L(Shl4LoopStart)
>
>  L(Shl4LoopExit):
> @@ -297,10 +297,10 @@ L(Shl8):
>         movaps  8(%rcx), %xmm2
>  L(Shl8Start):
>         pcmpeqd %xmm2, %xmm0
> -       pmovmskb %xmm0, %rax
> +       pmovmskb %xmm0, %eax
>         movaps  %xmm2, %xmm3
>
> -       test    %rax, %rax
> +       test    %eax, %eax
>         jnz     L(Shl8LoopExit)
>
>         palignr $8, %xmm1, %xmm2
> @@ -308,12 +308,12 @@ L(Shl8Start):
>         movaps  24(%rcx), %xmm2
>
>         pcmpeqd %xmm2, %xmm0
> -       lea     16(%rdx), %rdx
> -       pmovmskb %xmm0, %rax
> -       lea     16(%rcx), %rcx
> +       addq    $16, %rdx
> +       pmovmskb %xmm0, %eax
> +       addq    $16, %rcx
>         movaps  %xmm2, %xmm1
>
> -       test    %rax, %rax
> +       test    %eax, %eax
>         jnz     L(Shl8LoopExit)
>
>         palignr $8, %xmm3, %xmm2
> @@ -321,12 +321,12 @@ L(Shl8Start):
>         movaps  24(%rcx), %xmm2
>
>         pcmpeqd %xmm2, %xmm0
> -       lea     16(%rdx), %rdx
> -       pmovmskb %xmm0, %rax
> -       lea     16(%rcx), %rcx
> +       addq    $16, %rdx
> +       pmovmskb %xmm0, %eax
> +       addq    $16, %rcx
>         movaps  %xmm2, %xmm3
>
> -       test    %rax, %rax
> +       test    %eax, %eax
>         jnz     L(Shl8LoopExit)
>
>         palignr $8, %xmm1, %xmm2
> @@ -334,22 +334,22 @@ L(Shl8Start):
>         movaps  24(%rcx), %xmm2
>
>         pcmpeqd %xmm2, %xmm0
> -       lea     16(%rdx), %rdx
> -       pmovmskb %xmm0, %rax
> -       lea     16(%rcx), %rcx
> +       addq    $16, %rdx
> +       pmovmskb %xmm0, %eax
> +       addq    $16, %rcx
>
> -       test    %rax, %rax
> +       test    %eax, %eax
>         jnz     L(Shl8LoopExit)
>
>         palignr $8, %xmm3, %xmm2
>         movaps  %xmm2, (%rdx)
> -       lea     24(%rcx), %rcx
> -       lea     16(%rdx), %rdx
> +       addq    $24, %rcx
> +       addq    $16, %rdx
>
>         mov     %rcx, %rax
>         and     $-0x40, %rcx
>         sub     %rcx, %rax
> -       lea     -8(%rcx), %rcx
> +       addq    $-8, %rcx
>         sub     %rax, %rdx
>
>         movaps  -8(%rcx), %xmm1
> @@ -366,22 +366,22 @@ L(Shl8LoopStart):
>         pminub  %xmm5, %xmm7
>         pminub  %xmm6, %xmm7
>         pcmpeqd %xmm0, %xmm7
> -       pmovmskb %xmm7, %rax
> +       pmovmskb %xmm7, %eax
>         movaps  %xmm5, %xmm7
>         palignr $8, %xmm4, %xmm5
> -       test    %rax, %rax
>         palignr $8, %xmm3, %xmm4
> +       test    %eax, %eax
>         jnz     L(Shl8Start)
>
>         palignr $8, %xmm2, %xmm3
> -       lea     64(%rcx), %rcx
> +       addq    $64, %rcx
>         palignr $8, %xmm1, %xmm2
>         movaps  %xmm7, %xmm1
>         movaps  %xmm5, 48(%rdx)
>         movaps  %xmm4, 32(%rdx)
>         movaps  %xmm3, 16(%rdx)
>         movaps  %xmm2, (%rdx)
> -       lea     64(%rdx), %rdx
> +       addq    $64, %rdx
>         jmp     L(Shl8LoopStart)
>
>  L(Shl8LoopExit):
> @@ -396,10 +396,10 @@ L(Shl12):
>         movaps  4(%rcx), %xmm2
>  L(Shl12Start):
>         pcmpeqd %xmm2, %xmm0
> -       pmovmskb %xmm0, %rax
> +       pmovmskb %xmm0, %eax
>         movaps  %xmm2, %xmm3
>
> -       test    %rax, %rax
> +       test    %eax, %eax
>         jnz     L(Shl12LoopExit)
>
>         palignr $12, %xmm1, %xmm2
> @@ -407,12 +407,12 @@ L(Shl12Start):
>         movaps  20(%rcx), %xmm2
>
>         pcmpeqd %xmm2, %xmm0
> -       lea     16(%rdx), %rdx
> -       pmovmskb %xmm0, %rax
> -       lea     16(%rcx), %rcx
> +       addq    $16, %rdx
> +       pmovmskb %xmm0, %eax
> +       addq    $16, %rcx
>         movaps  %xmm2, %xmm1
>
> -       test    %rax, %rax
> +       test    %eax, %eax
>         jnz     L(Shl12LoopExit)
>
>         palignr $12, %xmm3, %xmm2
> @@ -420,12 +420,12 @@ L(Shl12Start):
>         movaps  20(%rcx), %xmm2
>
>         pcmpeqd %xmm2, %xmm0
> -       lea     16(%rdx), %rdx
> -       pmovmskb %xmm0, %rax
> -       lea     16(%rcx), %rcx
> +       addq    $16, %rdx
> +       pmovmskb %xmm0, %eax
> +       addq    $16, %rcx
>         movaps  %xmm2, %xmm3
>
> -       test    %rax, %rax
> +       test    %eax, %eax
>         jnz     L(Shl12LoopExit)
>
>         palignr $12, %xmm1, %xmm2
> @@ -433,22 +433,22 @@ L(Shl12Start):
>         movaps  20(%rcx), %xmm2
>
>         pcmpeqd %xmm2, %xmm0
> -       lea     16(%rdx), %rdx
> -       pmovmskb %xmm0, %rax
> -       lea     16(%rcx), %rcx
> +       addq    $16, %rdx
> +       pmovmskb %xmm0, %eax
> +       addq    $16, %rcx
>
> -       test    %rax, %rax
> +       test    %eax, %eax
>         jnz     L(Shl12LoopExit)
>
>         palignr $12, %xmm3, %xmm2
>         movaps  %xmm2, (%rdx)
> -       lea     20(%rcx), %rcx
> -       lea     16(%rdx), %rdx
> +       addq    $20, %rcx
> +       addq    $16, %rdx
>
>         mov     %rcx, %rax
>         and     $-0x40, %rcx
>         sub     %rcx, %rax
> -       lea     -4(%rcx), %rcx
> +       addq    $-4, %rcx
>         sub     %rax, %rdx
>
>         movaps  -12(%rcx), %xmm1
> @@ -465,21 +465,21 @@ L(Shl12LoopStart):
>         pminub  %xmm5, %xmm7
>         pminub  %xmm6, %xmm7
>         pcmpeqd %xmm0, %xmm7
> -       pmovmskb %xmm7, %rax
> +       pmovmskb %xmm7, %eax
>         movaps  %xmm5, %xmm7
>         palignr $12, %xmm4, %xmm5
> -       test    %rax, %rax
>         palignr $12, %xmm3, %xmm4
> +       test    %eax, %eax
>         jnz     L(Shl12Start)
>         palignr $12, %xmm2, %xmm3
> -       lea     64(%rcx), %rcx
> +       addq    $64, %rcx
>         palignr $12, %xmm1, %xmm2
>         movaps  %xmm7, %xmm1
>         movaps  %xmm5, 48(%rdx)
>         movaps  %xmm4, 32(%rdx)
>         movaps  %xmm3, 16(%rdx)
>         movaps  %xmm2, (%rdx)
> -       lea     64(%rdx), %rdx
> +       addq    $64, %rdx
>         jmp     L(Shl12LoopStart)
>
>  L(Shl12LoopExit):
> --
> 2.25.1
>

LGTM.

Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

Thanks.

-- 
H.J.

     prev parent reply	other threads:[~2022-03-28 18:52 UTC|newest]

Thread overview: 5+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2022-03-25 22:13 Noah Goldstein
2022-03-25 22:13 ` [PATCH v1 2/2] x86: Small improvements for wcslen Noah Goldstein
2022-03-28 18:51   ` H.J. Lu
2022-05-12 19:55     ` Sunil Pandey
2022-03-28 18:51 ` H.J. Lu [this message]

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=CAMe9rOrECOhAiiJ8j5Y8kCwjcvuJx0nEmK9s3GF59KirDTbAbg@mail.gmail.com \
    --to=hjl.tools@gmail.com \
    --cc=carlos@systemhalted.org \
    --cc=goldstein.w.n@gmail.com \
    --cc=libc-alpha@sourceware.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link

Be sure your reply has a Subject: header at the top and a blank line before the message body.

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).