From: Noah Goldstein <goldstein.w.n@gmail.com>
To: Sunil K Pandey <skpgkp2@gmail.com>
Cc: libc-alpha@sourceware.org
Subject: Re: [PATCH] x86-64: Improve evex512 version of strlen functions
Date: Fri, 28 Oct 2022 11:33:54 -0500 [thread overview]
Message-ID: <CAFUsyfKMm3wequaZXF1kZFSuc3nY8NDso1LP1CqBvd7hKW0dFQ@mail.gmail.com> (raw)
In-Reply-To: <20221028154810.1801123-1-skpgkp2@gmail.com>
On Fri, Oct 28, 2022 at 10:49 AM Sunil K Pandey via Libc-alpha
<libc-alpha@sourceware.org> wrote:
>
> This patch improves the following functionality:
> - Replace VPCMP with VPCMPEQ.
> - Replace page cross check logic with sall.
> - Remove extra lea from align_more.
> - Remove unconditional loop jump.
> - Use bsf to check max length in first vector.
> ---
> sysdeps/x86_64/multiarch/strlen-evex-base.S | 91 +++++++++++++--------
> 1 file changed, 57 insertions(+), 34 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/strlen-evex-base.S b/sysdeps/x86_64/multiarch/strlen-evex-base.S
> index c832b15a48..fd6c770e6e 100644
> --- a/sysdeps/x86_64/multiarch/strlen-evex-base.S
> +++ b/sysdeps/x86_64/multiarch/strlen-evex-base.S
> @@ -25,12 +25,12 @@
> # include <sysdep.h>
>
> # ifdef USE_AS_WCSLEN
> -# define VPCMP vpcmpd
> +# define VPCMPEQ vpcmpeqd
> # define VPTESTN vptestnmd
> # define VPMINU vpminud
> # define CHAR_SIZE 4
> # else
> -# define VPCMP vpcmpb
> +# define VPCMPEQ vpcmpeqb
> # define VPTESTN vptestnmb
> # define VPMINU vpminub
> # define CHAR_SIZE 1
> @@ -55,20 +55,29 @@ ENTRY_P2ALIGN (STRLEN, 6)
>
> movl %edi, %eax
> vpxorq %VMM_128(0), %VMM_128(0), %VMM_128(0)
> - andl $(PAGE_SIZE - 1), %eax
> - cmpl $(PAGE_SIZE - VEC_SIZE), %eax
> + sall $20, %eax
> + cmpl $((PAGE_SIZE - VEC_SIZE) << 20), %eax
> ja L(page_cross)
>
> /* Compare [w]char for null, mask bit will be set for match. */
> - VPCMP $0, (%rdi), %VMM(0), %k0
> + VPCMPEQ (%rdi), %VMM(0), %k0
> +# ifdef USE_AS_STRNLEN
> + KMOV %k0, %VRCX
> + /* Store max length in rax. */
> + mov %rsi, %rax
> + /* If rcx is 0, rax will have max length. We can not use VRCX
> + and VRAX here for evex256 because, upper 32 bits may be
> + undefined for ecx and eax. */
> + bsfq %rcx, %rax
> + cmp $CHAR_PER_VEC, %rax
> + ja L(align_more)
> + cmpq %rax, %rsi
> + cmovb %esi, %eax
> +# else
> KMOV %k0, %VRAX
> test %VRAX, %VRAX
> jz L(align_more)
> -
> bsf %VRAX, %VRAX
> -# ifdef USE_AS_STRNLEN
> - cmpq %rsi, %rax
> - cmovnb %rsi, %rax
> # endif
> ret
>
> @@ -81,25 +90,24 @@ L(ret_max):
> # endif
>
> L(align_more):
> - leaq VEC_SIZE(%rdi), %rax
> + mov %rdi, %rax
> /* Align rax to VEC_SIZE. */
> andq $-VEC_SIZE, %rax
> # ifdef USE_AS_STRNLEN
> - movq %rax, %rdx
> - subq %rdi, %rdx
> + movq %rdi, %rdx
> + subq %rax, %rdx
> # ifdef USE_AS_WCSLEN
> shr $2, %VRDX
> # endif
> /* At this point rdx contains [w]chars already compared. */
> - subq %rsi, %rdx
> - jae L(ret_max)
> - negq %rdx
> + leaq -CHAR_PER_VEC(%rsi, %rdx), %rdx
> /* At this point rdx contains number of w[char] needs to go.
> Now onwards rdx will keep decrementing with each compare. */
> # endif
>
> /* Loop unroll 4 times for 4 vector loop. */
> - VPCMP $0, (%rax), %VMM(0), %k0
> + VPCMPEQ VEC_SIZE(%rax), %VMM(0), %k0
> + subq $-VEC_SIZE, %rax
> KMOV %k0, %VRCX
> test %VRCX, %VRCX
> jnz L(ret_vec_x1)
> @@ -109,7 +117,7 @@ L(align_more):
> jbe L(ret_max)
> # endif
>
> - VPCMP $0, VEC_SIZE(%rax), %VMM(0), %k0
> + VPCMPEQ VEC_SIZE(%rax), %VMM(0), %k0
> KMOV %k0, %VRCX
> test %VRCX, %VRCX
> jnz L(ret_vec_x2)
> @@ -119,7 +127,7 @@ L(align_more):
> jbe L(ret_max)
> # endif
>
> - VPCMP $0, (VEC_SIZE * 2)(%rax), %VMM(0), %k0
> + VPCMPEQ (VEC_SIZE * 2)(%rax), %VMM(0), %k0
> KMOV %k0, %VRCX
> test %VRCX, %VRCX
> jnz L(ret_vec_x3)
> @@ -129,7 +137,7 @@ L(align_more):
> jbe L(ret_max)
> # endif
>
> - VPCMP $0, (VEC_SIZE * 3)(%rax), %VMM(0), %k0
> + VPCMPEQ (VEC_SIZE * 3)(%rax), %VMM(0), %k0
> KMOV %k0, %VRCX
> test %VRCX, %VRCX
> jnz L(ret_vec_x4)
> @@ -155,16 +163,10 @@ L(align_more):
> addq %rcx, %rdx
> /* Need jump as we don't want to add/subtract rdx for first
> iteration of 4 x VEC_SIZE aligned loop. */
> - jmp L(loop_entry)
> # endif
>
> .p2align 4,,11
> L(loop):
> -# ifdef USE_AS_STRNLEN
> - subq $(CHAR_PER_VEC * 4), %rdx
> - jbe L(ret_max)
> -L(loop_entry):
> -# endif
> /* VPMINU and VPCMP combination provide better performance as
> compared to alternative combinations. */
> VMOVA (VEC_SIZE * 4)(%rax), %VMM(1)
> @@ -177,7 +179,18 @@ L(loop_entry):
>
> subq $-(VEC_SIZE * 4), %rax
> KORTEST %k0, %k1
> - jz L(loop)
> +
> +# ifndef USE_AS_STRNLEN
> + jz L(loop)
> +# else
> + jnz L(loopend)
> + subq $(CHAR_PER_VEC * 4), %rdx
> + ja L(loop)
> + mov %rsi, %rax
> + ret
> +# endif
> +
> +L(loopend):
>
> VPTESTN %VMM(1), %VMM(1), %k2
> KMOV %k2, %VRCX
> @@ -249,24 +262,34 @@ L(ret_vec_x1):
> ret
>
> L(page_cross):
> - movl %eax, %ecx
> -# ifdef USE_AS_WCSLEN
> + mov %rdi, %rax
> + movl %edi, %ecx
> andl $(VEC_SIZE - 1), %ecx
> +# ifdef USE_AS_WCSLEN
> sarl $2, %ecx
> # endif
> /* ecx contains number of w[char] to be skipped as a result
> of address alignment. */
> - xorq %rdi, %rax
> - VPCMP $0, (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(0), %k0
> - KMOV %k0, %VRAX
> + andq $-VEC_SIZE, %rax
> + VPCMPEQ (%rax), %VMM(0), %k0
> + KMOV %k0, %VRDX
> /* Ignore number of character for alignment adjustment. */
> - shr %cl, %VRAX
> + shr %cl, %VRDX
> +# ifdef USE_AS_STRNLEN
> + jnz L(page_cross_end)
> + movl $CHAR_PER_VEC, %eax
> + sub %ecx, %eax
> + cmp %rax, %rsi
> + ja L(align_more)
> +# else
> jz L(align_more)
> +# endif
>
> - bsf %VRAX, %VRAX
> +L(page_cross_end):
> + bsf %VRDX, %VRAX
> # ifdef USE_AS_STRNLEN
> cmpq %rsi, %rax
> - cmovnb %rsi, %rax
> + cmovnb %esi, %eax
> # endif
> ret
>
> --
> 2.36.1
>
Can you post some performance numbers?
next prev parent reply other threads:[~2022-10-28 16:34 UTC|newest]
Thread overview: 6+ messages / expand[flat|nested] mbox.gz Atom feed top
2022-10-28 15:48 Sunil K Pandey
2022-10-28 16:33 ` Noah Goldstein [this message]
2022-10-28 17:12 ` Sunil Pandey
2022-10-28 17:23 ` Noah Goldstein
2022-10-30 19:02 ` Sunil Pandey
2022-10-30 19:33 ` Noah Goldstein
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=CAFUsyfKMm3wequaZXF1kZFSuc3nY8NDso1LP1CqBvd7hKW0dFQ@mail.gmail.com \
--to=goldstein.w.n@gmail.com \
--cc=libc-alpha@sourceware.org \
--cc=skpgkp2@gmail.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).