From: Noah Goldstein <goldstein.w.n@gmail.com>
To: Sunil K Pandey <skpgkp2@gmail.com>
Cc: libc-alpha@sourceware.org
Subject: Re: [PATCH] x86-64: Improve evex512 version of strlen functions
Date: Fri, 28 Oct 2022 11:33:54 -0500 [thread overview]
Message-ID: <CAFUsyfKMm3wequaZXF1kZFSuc3nY8NDso1LP1CqBvd7hKW0dFQ@mail.gmail.com> (raw)
In-Reply-To: <20221028154810.1801123-1-skpgkp2@gmail.com>
On Fri, Oct 28, 2022 at 10:49 AM Sunil K Pandey via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> This patch improves the following functionality:
> - Replace VPCMP with VPCMPEQ.
> - Replace page cross check logic with sall.
> - Remove extra lea from align_more.
> - Remove unconditional loop jump.
> - Use bsf to check max length in first vector.
> ---
> sysdeps/x86_64/multiarch/strlen-evex-base.S | 91 +++++++++++++--------
> 1 file changed, 57 insertions(+), 34 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/strlen-evex-base.S b/sysdeps/x86_64/multiarch/strlen-evex-base.S
> index c832b15a48..fd6c770e6e 100644
> --- a/sysdeps/x86_64/multiarch/strlen-evex-base.S
> +++ b/sysdeps/x86_64/multiarch/strlen-evex-base.S
> @@ -25,12 +25,12 @@
> # include <sysdep.h>
>
> # ifdef USE_AS_WCSLEN
> -# define VPCMP vpcmpd
> +# define VPCMPEQ vpcmpeqd
> # define VPTESTN vptestnmd
> # define VPMINU vpminud
> # define CHAR_SIZE 4
> # else
> -# define VPCMP vpcmpb
> +# define VPCMPEQ vpcmpeqb
> # define VPTESTN vptestnmb
> # define VPMINU vpminub
> # define CHAR_SIZE 1
> @@ -55,20 +55,29 @@ ENTRY_P2ALIGN (STRLEN, 6)
>
> movl %edi, %eax
> vpxorq %VMM_128(0), %VMM_128(0), %VMM_128(0)
> - andl $(PAGE_SIZE - 1), %eax
> - cmpl $(PAGE_SIZE - VEC_SIZE), %eax
> + sall $20, %eax
> + cmpl $((PAGE_SIZE - VEC_SIZE) << 20), %eax
> ja L(page_cross)
>
> /* Compare [w]char for null, mask bit will be set for match. */
> - VPCMP $0, (%rdi), %VMM(0), %k0
> + VPCMPEQ (%rdi), %VMM(0), %k0
> +# ifdef USE_AS_STRNLEN
> + KMOV %k0, %VRCX
> + /* Store max length in rax. */
> + mov %rsi, %rax
> + /* If rcx is 0, rax will have max length. We can not use VRCX
> + and VRAX here for evex256 because, upper 32 bits may be
> + undefined for ecx and eax. */
> + bsfq %rcx, %rax
> + cmp $CHAR_PER_VEC, %rax
> + ja L(align_more)
> + cmpq %rax, %rsi
> + cmovb %esi, %eax
> +# else
> KMOV %k0, %VRAX
> test %VRAX, %VRAX
> jz L(align_more)
> -
> bsf %VRAX, %VRAX
> -# ifdef USE_AS_STRNLEN
> - cmpq %rsi, %rax
> - cmovnb %rsi, %rax
> # endif
> ret
>
> @@ -81,25 +90,24 @@ L(ret_max):
> # endif
>
> L(align_more):
> - leaq VEC_SIZE(%rdi), %rax
> + mov %rdi, %rax
> /* Align rax to VEC_SIZE. */
> andq $-VEC_SIZE, %rax
> # ifdef USE_AS_STRNLEN
> - movq %rax, %rdx
> - subq %rdi, %rdx
> + movq %rdi, %rdx
> + subq %rax, %rdx
> # ifdef USE_AS_WCSLEN
> shr $2, %VRDX
> # endif
> /* At this point rdx contains [w]chars already compared. */
> - subq %rsi, %rdx
> - jae L(ret_max)
> - negq %rdx
> + leaq -CHAR_PER_VEC(%rsi, %rdx), %rdx
> /* At this point rdx contains number of w[char] needs to go.
> Now onwards rdx will keep decrementing with each compare. */
> # endif
>
> /* Loop unroll 4 times for 4 vector loop. */
> - VPCMP $0, (%rax), %VMM(0), %k0
> + VPCMPEQ VEC_SIZE(%rax), %VMM(0), %k0
> + subq $-VEC_SIZE, %rax
> KMOV %k0, %VRCX
> test %VRCX, %VRCX
> jnz L(ret_vec_x1)
> @@ -109,7 +117,7 @@ L(align_more):
> jbe L(ret_max)
> # endif
>
> - VPCMP $0, VEC_SIZE(%rax), %VMM(0), %k0
> + VPCMPEQ VEC_SIZE(%rax), %VMM(0), %k0
> KMOV %k0, %VRCX
> test %VRCX, %VRCX
> jnz L(ret_vec_x2)
> @@ -119,7 +127,7 @@ L(align_more):
> jbe L(ret_max)
> # endif
>
> - VPCMP $0, (VEC_SIZE * 2)(%rax), %VMM(0), %k0
> + VPCMPEQ (VEC_SIZE * 2)(%rax), %VMM(0), %k0
> KMOV %k0, %VRCX
> test %VRCX, %VRCX
> jnz L(ret_vec_x3)
> @@ -129,7 +137,7 @@ L(align_more):
> jbe L(ret_max)
> # endif
>
> - VPCMP $0, (VEC_SIZE * 3)(%rax), %VMM(0), %k0
> + VPCMPEQ (VEC_SIZE * 3)(%rax), %VMM(0), %k0
> KMOV %k0, %VRCX
> test %VRCX, %VRCX
> jnz L(ret_vec_x4)
> @@ -155,16 +163,10 @@ L(align_more):
> addq %rcx, %rdx
> /* Need jump as we don't want to add/subtract rdx for first
> iteration of 4 x VEC_SIZE aligned loop. */
> - jmp L(loop_entry)
> # endif
>
> .p2align 4,,11
> L(loop):
> -# ifdef USE_AS_STRNLEN
> - subq $(CHAR_PER_VEC * 4), %rdx
> - jbe L(ret_max)
> -L(loop_entry):
> -# endif
> /* VPMINU and VPCMP combination provide better performance as
> compared to alternative combinations. */
> VMOVA (VEC_SIZE * 4)(%rax), %VMM(1)
> @@ -177,7 +179,18 @@ L(loop_entry):
>
> subq $-(VEC_SIZE * 4), %rax
> KORTEST %k0, %k1
> - jz L(loop)
> +
> +# ifndef USE_AS_STRNLEN
> + jz L(loop)
> +# else
> + jnz L(loopend)
> + subq $(CHAR_PER_VEC * 4), %rdx
> + ja L(loop)
> + mov %rsi, %rax
> + ret
> +# endif
> +
> +L(loopend):
>
> VPTESTN %VMM(1), %VMM(1), %k2
> KMOV %k2, %VRCX
> @@ -249,24 +262,34 @@ L(ret_vec_x1):
> ret
>
> L(page_cross):
> - movl %eax, %ecx
> -# ifdef USE_AS_WCSLEN
> + mov %rdi, %rax
> + movl %edi, %ecx
> andl $(VEC_SIZE - 1), %ecx
> +# ifdef USE_AS_WCSLEN
> sarl $2, %ecx
> # endif
> /* ecx contains number of w[char] to be skipped as a result
> of address alignment. */
> - xorq %rdi, %rax
> - VPCMP $0, (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(0), %k0
> - KMOV %k0, %VRAX
> + andq $-VEC_SIZE, %rax
> + VPCMPEQ (%rax), %VMM(0), %k0
> + KMOV %k0, %VRDX
> /* Ignore number of character for alignment adjustment. */
> - shr %cl, %VRAX
> + shr %cl, %VRDX
> +# ifdef USE_AS_STRNLEN
> + jnz L(page_cross_end)
> + movl $CHAR_PER_VEC, %eax
> + sub %ecx, %eax
> + cmp %rax, %rsi
> + ja L(align_more)
> +# else
> jz L(align_more)
> +# endif
>
> - bsf %VRAX, %VRAX
> +L(page_cross_end):
> + bsf %VRDX, %VRAX
> # ifdef USE_AS_STRNLEN
> cmpq %rsi, %rax
> - cmovnb %rsi, %rax
> + cmovnb %esi, %eax
> # endif
> ret
>
> --
> 2.36.1
>
Can you post some performance numbers?
next prev parent reply other threads:[~2022-10-28 16:34 UTC|newest]
Thread overview: 6+ messages / expand[flat|nested] mbox.gz Atom feed top
2022-10-28 15:48 Sunil K Pandey
2022-10-28 16:33 ` Noah Goldstein [this message]
2022-10-28 17:12 ` Sunil Pandey
2022-10-28 17:23 ` Noah Goldstein
2022-10-30 19:02 ` Sunil Pandey
2022-10-30 19:33 ` Noah Goldstein
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=CAFUsyfKMm3wequaZXF1kZFSuc3nY8NDso1LP1CqBvd7hKW0dFQ@mail.gmail.com \
--to=goldstein.w.n@gmail.com \
--cc=libc-alpha@sourceware.org \
--cc=skpgkp2@gmail.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).