From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from mail-pf1-x42e.google.com (mail-pf1-x42e.google.com [IPv6:2607:f8b0:4864:20::42e]) by sourceware.org (Postfix) with ESMTPS id B5AB73858C50 for ; Mon, 28 Mar 2022 18:52:10 +0000 (GMT) DMARC-Filter: OpenDMARC Filter v1.4.1 sourceware.org B5AB73858C50 Received: by mail-pf1-x42e.google.com with SMTP id u22so13554322pfg.6 for ; Mon, 28 Mar 2022 11:52:10 -0700 (PDT) X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20210112; h=x-gm-message-state:mime-version:references:in-reply-to:from:date :message-id:subject:to:cc; bh=Ed/9f73jsj94XUNlMY8g5xId6pmKiDlFfW8D7UX3s+Y=; b=nn5PCysC3D3ajjh/kkxGf6NnXRMGSxe/VESEyahzaR9wZuKIYnFNngPWvP9NyEh0yr +JLRrN4Toir/HVVLHrsa/cgEXRBqj8NjT1R8FXtMMCtTqASlWmIqa79S7VU5Nb63zMhF u1dwevtP4l/Ew/0VJzUEqUK2LtF6vyI2dgMnDHWyUXGCS0hGWbj1YLUKXn5RxE+MVZtL r4w0OPWzKyuaBqK4XxTXUGjwidkojqYBN7h+RAasNjJNsX4VWF6WLCLoGOBaF/EOJhNF 9oJCBnoi3I3RURrZDR65R3bzCQ5lRsn1jYA+GrsOO7oJPF1Z1lf7ex4iDCXmswso6bu8 E64Q== X-Gm-Message-State: AOAM530IHxgdaoFW14hl0D/BWZ4HKJ6GrRqmvu4s+L1Slja1cSBsgiQl IwFIJmQgVHdtKSg6WST6gCcsPJ8u7BubM2Si5UlhMJRjMg8= X-Google-Smtp-Source: ABdhPJybygdk68d3rW/n2Ir71jXhdlm5Y+3l+5UTRnIL7OP+KrTGMx3jOegHyUB4Q6D8ryL+MuuDJBONHpHpZLR11Nw= X-Received: by 2002:a63:dd47:0:b0:381:2bb3:86ba with SMTP id g7-20020a63dd47000000b003812bb386bamr11275652pgj.381.1648493529513; Mon, 28 Mar 2022 11:52:09 -0700 (PDT) MIME-Version: 1.0 References: <20220325221333.3079015-1-goldstein.w.n@gmail.com> In-Reply-To: <20220325221333.3079015-1-goldstein.w.n@gmail.com> From: "H.J. 
Lu" Date: Mon, 28 Mar 2022 11:51:33 -0700 Message-ID: Subject: Re: [PATCH v1 1/2] x86: Small improvements for wcscpy-ssse3 To: Noah Goldstein Cc: GNU C Library , "Carlos O'Donell" Content-Type: text/plain; charset="UTF-8" X-Spam-Status: No, score=-3025.6 required=5.0 tests=BAYES_00, DKIM_SIGNED, DKIM_VALID, DKIM_VALID_AU, DKIM_VALID_EF, FREEMAIL_FROM, GIT_PATCH_0, KAM_NUMSUBJECT, RCVD_IN_DNSWL_NONE, SPF_HELO_NONE, SPF_PASS, TXREP, T_SCC_BODY_TEXT_LINE autolearn=ham autolearn_force=no version=3.4.4 X-Spam-Checker-Version: SpamAssassin 3.4.4 (2020-01-24) on server2.sourceware.org X-BeenThere: libc-alpha@sourceware.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Libc-alpha mailing list List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Mon, 28 Mar 2022 18:52:13 -0000 On Fri, Mar 25, 2022 at 3:13 PM Noah Goldstein wrote: > > Just a few small QOL changes. > 1. Prefer `add` over `lea` as it has more execution units it can run > on. > 2. Don't break macro-fusion between `test` and `jcc` > > geometric_mean(N=20) of all benchmarks New / Original: 0.973 > > All string/memory tests pass. 
> --- > sysdeps/x86_64/multiarch/wcscpy-ssse3.S | 194 ++++++++++++------------ > 1 file changed, 97 insertions(+), 97 deletions(-) > > diff --git a/sysdeps/x86_64/multiarch/wcscpy-ssse3.S b/sysdeps/x86_64/multiarch/wcscpy-ssse3.S > index 34b09af327..aa2b9d030f 100644 > --- a/sysdeps/x86_64/multiarch/wcscpy-ssse3.S > +++ b/sysdeps/x86_64/multiarch/wcscpy-ssse3.S > @@ -52,7 +52,7 @@ ENTRY (__wcscpy_ssse3) > jnz L(CopyFrom1To16Bytes) > > mov %rdx, %rax > - lea 16(%rdx), %rdx > + addq $16, %rdx > and $-16, %rdx > sub %rdx, %rax > sub %rax, %rcx > @@ -75,55 +75,55 @@ L(Align16Both): > movaps 16(%rcx), %xmm2 > movaps %xmm1, (%rdx) > pcmpeqd %xmm2, %xmm0 > - pmovmskb %xmm0, %rax > - lea 16(%rsi), %rsi > + pmovmskb %xmm0, %eax > + addq $16, %rsi > > - test %rax, %rax > + test %eax, %eax > jnz L(CopyFrom1To16Bytes) > > movaps 16(%rcx, %rsi), %xmm3 > movaps %xmm2, (%rdx, %rsi) > pcmpeqd %xmm3, %xmm0 > - pmovmskb %xmm0, %rax > - lea 16(%rsi), %rsi > + pmovmskb %xmm0, %eax > + addq $16, %rsi > > - test %rax, %rax > + test %eax, %eax > jnz L(CopyFrom1To16Bytes) > > movaps 16(%rcx, %rsi), %xmm4 > movaps %xmm3, (%rdx, %rsi) > pcmpeqd %xmm4, %xmm0 > - pmovmskb %xmm0, %rax > - lea 16(%rsi), %rsi > + pmovmskb %xmm0, %eax > + addq $16, %rsi > > - test %rax, %rax > + test %eax, %eax > jnz L(CopyFrom1To16Bytes) > > movaps 16(%rcx, %rsi), %xmm1 > movaps %xmm4, (%rdx, %rsi) > pcmpeqd %xmm1, %xmm0 > - pmovmskb %xmm0, %rax > - lea 16(%rsi), %rsi > + pmovmskb %xmm0, %eax > + addq $16, %rsi > > - test %rax, %rax > + test %eax, %eax > jnz L(CopyFrom1To16Bytes) > > movaps 16(%rcx, %rsi), %xmm2 > movaps %xmm1, (%rdx, %rsi) > pcmpeqd %xmm2, %xmm0 > - pmovmskb %xmm0, %rax > - lea 16(%rsi), %rsi > + pmovmskb %xmm0, %eax > + addq $16, %rsi > > - test %rax, %rax > + test %eax, %eax > jnz L(CopyFrom1To16Bytes) > > movaps 16(%rcx, %rsi), %xmm3 > movaps %xmm2, (%rdx, %rsi) > pcmpeqd %xmm3, %xmm0 > - pmovmskb %xmm0, %rax > - lea 16(%rsi), %rsi > + pmovmskb %xmm0, %eax > + addq $16, %rsi > > - test %rax, 
%rax > + test %eax, %eax > jnz L(CopyFrom1To16Bytes) > > movaps %xmm3, (%rdx, %rsi) > @@ -147,10 +147,10 @@ L(Aligned64Loop): > pminub %xmm7, %xmm3 > pminub %xmm2, %xmm3 > pcmpeqd %xmm0, %xmm3 > - pmovmskb %xmm3, %rax > - lea 64(%rdx), %rdx > - lea 64(%rcx), %rcx > - test %rax, %rax > + pmovmskb %xmm3, %eax > + addq $64, %rdx > + addq $64, %rcx > + testl %eax, %eax > jnz L(Aligned64Leave) > movaps %xmm4, -64(%rdx) > movaps %xmm5, -48(%rdx) > @@ -160,32 +160,32 @@ L(Aligned64Loop): > > L(Aligned64Leave): > pcmpeqd %xmm4, %xmm0 > - pmovmskb %xmm0, %rax > - test %rax, %rax > + pmovmskb %xmm0, %eax > + test %eax, %eax > jnz L(CopyFrom1To16Bytes) > > pcmpeqd %xmm5, %xmm0 > > - pmovmskb %xmm0, %rax > + pmovmskb %xmm0, %eax > movaps %xmm4, -64(%rdx) > - test %rax, %rax > - lea 16(%rsi), %rsi > + addq $16, %rsi > + test %eax, %eax > jnz L(CopyFrom1To16Bytes) > > pcmpeqd %xmm6, %xmm0 > > - pmovmskb %xmm0, %rax > + pmovmskb %xmm0, %eax > movaps %xmm5, -48(%rdx) > - test %rax, %rax > - lea 16(%rsi), %rsi > + addq $16, %rsi > + test %eax, %eax > jnz L(CopyFrom1To16Bytes) > > movaps %xmm6, -32(%rdx) > pcmpeqd %xmm7, %xmm0 > > - pmovmskb %xmm0, %rax > - lea 16(%rsi), %rsi > - test %rax, %rax > + pmovmskb %xmm0, %eax > + addq $16, %rsi > + test %eax, %eax > jnz L(CopyFrom1To16Bytes) > > mov $-0x40, %rsi > @@ -198,10 +198,10 @@ L(Shl4): > movaps 12(%rcx), %xmm2 > L(Shl4Start): > pcmpeqd %xmm2, %xmm0 > - pmovmskb %xmm0, %rax > + pmovmskb %xmm0, %eax > movaps %xmm2, %xmm3 > > - test %rax, %rax > + test %eax, %eax > jnz L(Shl4LoopExit) > > palignr $4, %xmm1, %xmm2 > @@ -209,12 +209,12 @@ L(Shl4Start): > movaps 28(%rcx), %xmm2 > > pcmpeqd %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > + addq $16, %rdx > + pmovmskb %xmm0, %eax > + addq $16, %rcx > movaps %xmm2, %xmm1 > > - test %rax, %rax > + test %eax, %eax > jnz L(Shl4LoopExit) > > palignr $4, %xmm3, %xmm2 > @@ -222,12 +222,12 @@ L(Shl4Start): > movaps 28(%rcx), %xmm2 > > pcmpeqd %xmm2, %xmm0 > 
- lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > + addq $16, %rdx > + pmovmskb %xmm0, %eax > + addq $16, %rcx > movaps %xmm2, %xmm3 > > - test %rax, %rax > + test %eax, %eax > jnz L(Shl4LoopExit) > > palignr $4, %xmm1, %xmm2 > @@ -235,22 +235,22 @@ L(Shl4Start): > movaps 28(%rcx), %xmm2 > > pcmpeqd %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > + addq $16, %rdx > + pmovmskb %xmm0, %eax > + addq $16, %rcx > > - test %rax, %rax > + test %eax, %eax > jnz L(Shl4LoopExit) > > palignr $4, %xmm3, %xmm2 > movaps %xmm2, (%rdx) > - lea 28(%rcx), %rcx > - lea 16(%rdx), %rdx > + addq $28, %rcx > + addq $16, %rdx > > mov %rcx, %rax > and $-0x40, %rcx > sub %rcx, %rax > - lea -12(%rcx), %rcx > + addq $-12, %rcx > sub %rax, %rdx > > movaps -4(%rcx), %xmm1 > @@ -267,22 +267,22 @@ L(Shl4LoopStart): > pminub %xmm5, %xmm7 > pminub %xmm6, %xmm7 > pcmpeqd %xmm0, %xmm7 > - pmovmskb %xmm7, %rax > + pmovmskb %xmm7, %eax > movaps %xmm5, %xmm7 > palignr $4, %xmm4, %xmm5 > - test %rax, %rax > palignr $4, %xmm3, %xmm4 > + test %eax, %eax > jnz L(Shl4Start) > > palignr $4, %xmm2, %xmm3 > - lea 64(%rcx), %rcx > + addq $64, %rcx > palignr $4, %xmm1, %xmm2 > movaps %xmm7, %xmm1 > movaps %xmm5, 48(%rdx) > movaps %xmm4, 32(%rdx) > movaps %xmm3, 16(%rdx) > movaps %xmm2, (%rdx) > - lea 64(%rdx), %rdx > + addq $64, %rdx > jmp L(Shl4LoopStart) > > L(Shl4LoopExit): > @@ -297,10 +297,10 @@ L(Shl8): > movaps 8(%rcx), %xmm2 > L(Shl8Start): > pcmpeqd %xmm2, %xmm0 > - pmovmskb %xmm0, %rax > + pmovmskb %xmm0, %eax > movaps %xmm2, %xmm3 > > - test %rax, %rax > + test %eax, %eax > jnz L(Shl8LoopExit) > > palignr $8, %xmm1, %xmm2 > @@ -308,12 +308,12 @@ L(Shl8Start): > movaps 24(%rcx), %xmm2 > > pcmpeqd %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > + addq $16, %rdx > + pmovmskb %xmm0, %eax > + addq $16, %rcx > movaps %xmm2, %xmm1 > > - test %rax, %rax > + test %eax, %eax > jnz L(Shl8LoopExit) > > palignr $8, 
%xmm3, %xmm2 > @@ -321,12 +321,12 @@ L(Shl8Start): > movaps 24(%rcx), %xmm2 > > pcmpeqd %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > + addq $16, %rdx > + pmovmskb %xmm0, %eax > + addq $16, %rcx > movaps %xmm2, %xmm3 > > - test %rax, %rax > + test %eax, %eax > jnz L(Shl8LoopExit) > > palignr $8, %xmm1, %xmm2 > @@ -334,22 +334,22 @@ L(Shl8Start): > movaps 24(%rcx), %xmm2 > > pcmpeqd %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > + addq $16, %rdx > + pmovmskb %xmm0, %eax > + addq $16, %rcx > > - test %rax, %rax > + test %eax, %eax > jnz L(Shl8LoopExit) > > palignr $8, %xmm3, %xmm2 > movaps %xmm2, (%rdx) > - lea 24(%rcx), %rcx > - lea 16(%rdx), %rdx > + addq $24, %rcx > + addq $16, %rdx > > mov %rcx, %rax > and $-0x40, %rcx > sub %rcx, %rax > - lea -8(%rcx), %rcx > + addq $-8, %rcx > sub %rax, %rdx > > movaps -8(%rcx), %xmm1 > @@ -366,22 +366,22 @@ L(Shl8LoopStart): > pminub %xmm5, %xmm7 > pminub %xmm6, %xmm7 > pcmpeqd %xmm0, %xmm7 > - pmovmskb %xmm7, %rax > + pmovmskb %xmm7, %eax > movaps %xmm5, %xmm7 > palignr $8, %xmm4, %xmm5 > - test %rax, %rax > palignr $8, %xmm3, %xmm4 > + test %eax, %eax > jnz L(Shl8Start) > > palignr $8, %xmm2, %xmm3 > - lea 64(%rcx), %rcx > + addq $64, %rcx > palignr $8, %xmm1, %xmm2 > movaps %xmm7, %xmm1 > movaps %xmm5, 48(%rdx) > movaps %xmm4, 32(%rdx) > movaps %xmm3, 16(%rdx) > movaps %xmm2, (%rdx) > - lea 64(%rdx), %rdx > + addq $64, %rdx > jmp L(Shl8LoopStart) > > L(Shl8LoopExit): > @@ -396,10 +396,10 @@ L(Shl12): > movaps 4(%rcx), %xmm2 > L(Shl12Start): > pcmpeqd %xmm2, %xmm0 > - pmovmskb %xmm0, %rax > + pmovmskb %xmm0, %eax > movaps %xmm2, %xmm3 > > - test %rax, %rax > + test %eax, %eax > jnz L(Shl12LoopExit) > > palignr $12, %xmm1, %xmm2 > @@ -407,12 +407,12 @@ L(Shl12Start): > movaps 20(%rcx), %xmm2 > > pcmpeqd %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > + addq $16, %rdx > + pmovmskb %xmm0, %eax > + addq $16, %rcx 
> movaps %xmm2, %xmm1 > > - test %rax, %rax > + test %eax, %eax > jnz L(Shl12LoopExit) > > palignr $12, %xmm3, %xmm2 > @@ -420,12 +420,12 @@ L(Shl12Start): > movaps 20(%rcx), %xmm2 > > pcmpeqd %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > + addq $16, %rdx > + pmovmskb %xmm0, %eax > + addq $16, %rcx > movaps %xmm2, %xmm3 > > - test %rax, %rax > + test %eax, %eax > jnz L(Shl12LoopExit) > > palignr $12, %xmm1, %xmm2 > @@ -433,22 +433,22 @@ L(Shl12Start): > movaps 20(%rcx), %xmm2 > > pcmpeqd %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > + addq $16, %rdx > + pmovmskb %xmm0, %eax > + addq $16, %rcx > > - test %rax, %rax > + test %eax, %eax > jnz L(Shl12LoopExit) > > palignr $12, %xmm3, %xmm2 > movaps %xmm2, (%rdx) > - lea 20(%rcx), %rcx > - lea 16(%rdx), %rdx > + addq $20, %rcx > + addq $16, %rdx > > mov %rcx, %rax > and $-0x40, %rcx > sub %rcx, %rax > - lea -4(%rcx), %rcx > + addq $-4, %rcx > sub %rax, %rdx > > movaps -12(%rcx), %xmm1 > @@ -465,21 +465,21 @@ L(Shl12LoopStart): > pminub %xmm5, %xmm7 > pminub %xmm6, %xmm7 > pcmpeqd %xmm0, %xmm7 > - pmovmskb %xmm7, %rax > + pmovmskb %xmm7, %eax > movaps %xmm5, %xmm7 > palignr $12, %xmm4, %xmm5 > - test %rax, %rax > palignr $12, %xmm3, %xmm4 > + test %eax, %eax > jnz L(Shl12Start) > palignr $12, %xmm2, %xmm3 > - lea 64(%rcx), %rcx > + addq $64, %rcx > palignr $12, %xmm1, %xmm2 > movaps %xmm7, %xmm1 > movaps %xmm5, 48(%rdx) > movaps %xmm4, 32(%rdx) > movaps %xmm3, 16(%rdx) > movaps %xmm2, (%rdx) > - lea 64(%rdx), %rdx > + addq $64, %rdx > jmp L(Shl12LoopStart) > > L(Shl12LoopExit): > -- > 2.25.1 > LGTM. Reviewed-by: H.J. Lu Thanks. -- H.J.