From: Noah Goldstein <goldstein.w.n@gmail.com>
To: Sunil Pandey <skpgkp2@gmail.com>
Cc: libc-alpha@sourceware.org
Subject: Re: [PATCH] x86-64: Improve evex512 version of strlen functions
Date: Sun, 30 Oct 2022 14:33:30 -0500
Message-ID: <CAFUsyfL6ENk-+3Fw7erHOQ5HDGR2FfR97E1KVgF0jz=a2HhEug@mail.gmail.com>
In-Reply-To: <CAMAf5_f0J_tVG5xSeDBbrJQT3Bu-KFS012wxbAZhVxD58LC8tw@mail.gmail.com>
On Sun, Oct 30, 2022 at 2:03 PM Sunil Pandey <skpgkp2@gmail.com> wrote:
>
> On Fri, Oct 28, 2022 at 10:24 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> >
> > On Fri, Oct 28, 2022 at 12:12 PM Sunil Pandey <skpgkp2@gmail.com> wrote:
> > >
> > > On Fri, Oct 28, 2022 at 9:34 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> > > >
> > > > On Fri, Oct 28, 2022 at 10:49 AM Sunil K Pandey via Libc-alpha
> > > > <libc-alpha@sourceware.org> wrote:
> > > > >
> > > > > This patch improves the following functionality:
> > > > > - Replace VPCMP with VPCMPEQ.
> > > > > - Replace the page cross check logic with sall.
> > > > > - Remove an extra lea from align_more.
> > > > > - Remove the unconditional loop jump.
> > > > > - Use bsf to check the max length in the first vector.
> > > > > ---
> > > > > sysdeps/x86_64/multiarch/strlen-evex-base.S | 91 +++++++++++++--------
> > > > > 1 file changed, 57 insertions(+), 34 deletions(-)
> > > > >
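A side note for readers on the new page-cross check below: `sall $20'
keeps only the low 12 bits of the address (the offset within a 4 KiB
page) in the high end of eax, so a single unsigned compare replaces the
old andl/cmpl pair.  A rough C model of how I read it (my sketch, not
part of the patch; the PAGE_SIZE and VEC_SIZE values are assumed for
the evex512 build):

#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE 4096
#define VEC_SIZE  64   /* assumed: evex512; the evex256 build uses 32 */

/* Return 1 if a VEC_SIZE-byte load at ADDR would cross a page.  */
static int
page_cross (uint64_t addr)
{
  uint32_t eax = (uint32_t) addr << 20;   /* sall $20, %eax */
  /* cmpl $((PAGE_SIZE - VEC_SIZE) << 20), %eax; ja L(page_cross) */
  return eax > ((uint32_t) (PAGE_SIZE - VEC_SIZE) << 20);
}

int
main (void)
{
  printf ("%d\n", page_cross (0x2fc0));  /* 0: load ends at page end */
  printf ("%d\n", page_cross (0x2fc1));  /* 1: load would cross */
  return 0;
}

This gives the same answer as masking with PAGE_SIZE - 1, since the
shift discards every bit above the page offset.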
> > > > > diff --git a/sysdeps/x86_64/multiarch/strlen-evex-base.S b/sysdeps/x86_64/multiarch/strlen-evex-base.S
> > > > > index c832b15a48..fd6c770e6e 100644
> > > > > --- a/sysdeps/x86_64/multiarch/strlen-evex-base.S
> > > > > +++ b/sysdeps/x86_64/multiarch/strlen-evex-base.S
> > > > > @@ -25,12 +25,12 @@
> > > > > # include <sysdep.h>
> > > > >
> > > > > # ifdef USE_AS_WCSLEN
> > > > > -# define VPCMP vpcmpd
> > > > > +# define VPCMPEQ vpcmpeqd
> > > > > # define VPTESTN vptestnmd
> > > > > # define VPMINU vpminud
> > > > > # define CHAR_SIZE 4
> > > > > # else
> > > > > -# define VPCMP vpcmpb
> > > > > +# define VPCMPEQ vpcmpeqb
> > > > > # define VPTESTN vptestnmb
> > > > > # define VPMINU vpminub
> > > > > # define CHAR_SIZE 1
> > > > > @@ -55,20 +55,29 @@ ENTRY_P2ALIGN (STRLEN, 6)
> > > > >
> > > > > movl %edi, %eax
> > > > > vpxorq %VMM_128(0), %VMM_128(0), %VMM_128(0)
> > > > > - andl $(PAGE_SIZE - 1), %eax
> > > > > - cmpl $(PAGE_SIZE - VEC_SIZE), %eax
> > > > > + sall $20, %eax
> > > > > + cmpl $((PAGE_SIZE - VEC_SIZE) << 20), %eax
> > > > > ja L(page_cross)
> > > > >
> > > > > /* Compare [w]chars for null; mask bits will be set for matches. */
> > > > > - VPCMP $0, (%rdi), %VMM(0), %k0
> > > > > + VPCMPEQ (%rdi), %VMM(0), %k0
> > > > > +# ifdef USE_AS_STRNLEN
> > > > > + KMOV %k0, %VRCX
> > > > > + /* Store max length in rax. */
> > > > > + mov %rsi, %rax
> > > > > + /* If rcx is 0, rax will have the max length. We cannot use
> > > > > + VRCX and VRAX here for evex256 because the upper 32 bits
> > > > > + of rcx and rax may be undefined. */
> > > > > + bsfq %rcx, %rax
> > > > > + cmp $CHAR_PER_VEC, %rax
> > > > > + ja L(align_more)
> > > > > + cmpq %rax, %rsi
> > > > > + cmovb %esi, %eax
> > > > > +# else
> > > > > KMOV %k0, %VRAX
> > > > > test %VRAX, %VRAX
> > > > > jz L(align_more)
> > > > > -
> > > > > bsf %VRAX, %VRAX
> > > > > -# ifdef USE_AS_STRNLEN
> > > > > - cmpq %rsi, %rax
> > > > > - cmovnb %rsi, %rax
> > > > > # endif
> > > > > ret
> > > > >
> > > > > @@ -81,25 +90,24 @@ L(ret_max):
> > > > > # endif
> > > > >
> > > > > L(align_more):
> > > > > - leaq VEC_SIZE(%rdi), %rax
> > > > > + mov %rdi, %rax
> > > > > /* Align rax to VEC_SIZE. */
> > > > > andq $-VEC_SIZE, %rax
> > > > > # ifdef USE_AS_STRNLEN
> > > > > - movq %rax, %rdx
> > > > > - subq %rdi, %rdx
> > > > > + movq %rdi, %rdx
> > > > > + subq %rax, %rdx
> > > > > # ifdef USE_AS_WCSLEN
> > > > > shr $2, %VRDX
> > > > > # endif
> > > > > /* At this point rdx contains the number of [w]chars already compared. */
> > > > > - subq %rsi, %rdx
> > > > > - jae L(ret_max)
> > > > > - negq %rdx
> > > > > + leaq -CHAR_PER_VEC(%rsi, %rdx), %rdx
> > > > > /* At this point rdx contains the number of [w]chars left to compare.
> > > > > From here on, rdx keeps decrementing with each compare. */
> > > > > # endif
> > > > >
> > > > > /* Loop unroll 4 times for 4 vector loop. */
> > > > > - VPCMP $0, (%rax), %VMM(0), %k0
> > > > > + VPCMPEQ VEC_SIZE(%rax), %VMM(0), %k0
> > > > > + subq $-VEC_SIZE, %rax
> > > > > KMOV %k0, %VRCX
> > > > > test %VRCX, %VRCX
> > > > > jnz L(ret_vec_x1)
> > > > > @@ -109,7 +117,7 @@ L(align_more):
> > > > > jbe L(ret_max)
> > > > > # endif
> > > > >
> > > > > - VPCMP $0, VEC_SIZE(%rax), %VMM(0), %k0
> > > > > + VPCMPEQ VEC_SIZE(%rax), %VMM(0), %k0
> > > > > KMOV %k0, %VRCX
> > > > > test %VRCX, %VRCX
> > > > > jnz L(ret_vec_x2)
> > > > > @@ -119,7 +127,7 @@ L(align_more):
> > > > > jbe L(ret_max)
> > > > > # endif
> > > > >
> > > > > - VPCMP $0, (VEC_SIZE * 2)(%rax), %VMM(0), %k0
> > > > > + VPCMPEQ (VEC_SIZE * 2)(%rax), %VMM(0), %k0
> > > > > KMOV %k0, %VRCX
> > > > > test %VRCX, %VRCX
> > > > > jnz L(ret_vec_x3)
> > > > > @@ -129,7 +137,7 @@ L(align_more):
> > > > > jbe L(ret_max)
> > > > > # endif
> > > > >
> > > > > - VPCMP $0, (VEC_SIZE * 3)(%rax), %VMM(0), %k0
> > > > > + VPCMPEQ (VEC_SIZE * 3)(%rax), %VMM(0), %k0
> > > > > KMOV %k0, %VRCX
> > > > > test %VRCX, %VRCX
> > > > > jnz L(ret_vec_x4)
> > > > > @@ -155,16 +163,10 @@ L(align_more):
> > > > > addq %rcx, %rdx
> > > > > /* Need jump as we don't want to add/subtract rdx for first
> > > > > iteration of 4 x VEC_SIZE aligned loop. */
> > > > > - jmp L(loop_entry)
> > > > > # endif
> > > > >
> > > > > .p2align 4,,11
> > > > > L(loop):
> > > > > -# ifdef USE_AS_STRNLEN
> > > > > - subq $(CHAR_PER_VEC * 4), %rdx
> > > > > - jbe L(ret_max)
> > > > > -L(loop_entry):
> > > > > -# endif
> > > > > /* The VPMINU and VPCMP combination provides better performance
> > > > > than the alternative combinations. */
> > > > > VMOVA (VEC_SIZE * 4)(%rax), %VMM(1)
> > > > > @@ -177,7 +179,18 @@ L(loop_entry):
> > > > >
> > > > > subq $-(VEC_SIZE * 4), %rax
> > > > > KORTEST %k0, %k1
> > > > > - jz L(loop)
> > > > > +
> > > > > +# ifndef USE_AS_STRNLEN
> > > > > + jz L(loop)
> > > > > +# else
> > > > > + jnz L(loopend)
> > > > > + subq $(CHAR_PER_VEC * 4), %rdx
> > > > > + ja L(loop)
> > > > > + mov %rsi, %rax
> > > > > + ret
> > > > > +# endif
> > > > > +
> > > > > +L(loopend):
> > > > >
> > > > > VPTESTN %VMM(1), %VMM(1), %k2
> > > > > KMOV %k2, %VRCX
> > > > > @@ -249,24 +262,34 @@ L(ret_vec_x1):
> > > > > ret
> > > > >
> > > > > L(page_cross):
> > > > > - movl %eax, %ecx
> > > > > -# ifdef USE_AS_WCSLEN
> > > > > + mov %rdi, %rax
> > > > > + movl %edi, %ecx
> > > > > andl $(VEC_SIZE - 1), %ecx
> > > > > +# ifdef USE_AS_WCSLEN
> > > > > sarl $2, %ecx
> > > > > # endif
> > > > > /* ecx contains the number of [w]chars to skip as a result
> > > > > of address alignment. */
> > > > > - xorq %rdi, %rax
> > > > > - VPCMP $0, (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(0), %k0
> > > > > - KMOV %k0, %VRAX
> > > > > + andq $-VEC_SIZE, %rax
> > > > > + VPCMPEQ (%rax), %VMM(0), %k0
> > > > > + KMOV %k0, %VRDX
> > > > > /* Ignore the characters skipped for alignment adjustment. */
> > > > > - shr %cl, %VRAX
> > > > > + shr %cl, %VRDX
> > > > > +# ifdef USE_AS_STRNLEN
> > > > > + jnz L(page_cross_end)
> > > > > + movl $CHAR_PER_VEC, %eax
> > > > > + sub %ecx, %eax
> > > > > + cmp %rax, %rsi
> > > > > + ja L(align_more)
> > > > > +# else
> > > > > jz L(align_more)
> > > > > +# endif
> > > > >
> > > > > - bsf %VRAX, %VRAX
> > > > > +L(page_cross_end):
> > > > > + bsf %VRDX, %VRAX
> > > > > # ifdef USE_AS_STRNLEN
> > > > > cmpq %rsi, %rax
> > > > > - cmovnb %rsi, %rax
> > > > > + cmovnb %esi, %eax
> > > > > # endif
> > > > > ret
> > > > >
> > > > > --
> > > > > 2.36.1
> > > > >
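One detail above worth spelling out: the strnlen fast path preloads rax
with the max length and then does `bsfq %rcx, %rax', relying on bsf
leaving its destination unchanged when the source is zero, which the
patch's comment calls out.  A rough C model of my reading of that path
(a sketch only, not the patch; CHAR_PER_VEC is assumed for the evex512
strnlen build):

#include <stdint.h>
#include <stdio.h>

#define CHAR_PER_VEC 64   /* assumed: evex512 strnlen */

/* Return the strnlen result if the first vector decides it, or
   (uint64_t) -1 when the scan must continue at L(align_more).  */
static uint64_t
first_vec (uint64_t mask, uint64_t maxlen)
{
  /* mov %rsi, %rax; bsfq %rcx, %rax: a zero mask leaves rax alone,
     so rax still holds maxlen.  */
  uint64_t pos = mask ? (uint64_t) __builtin_ctzll (mask) : maxlen;
  if (pos > CHAR_PER_VEC)       /* cmp $CHAR_PER_VEC, %rax; ja */
    return (uint64_t) -1;
  /* cmpq %rax, %rsi; cmovb %esi, %eax */
  return maxlen < pos ? maxlen : pos;
}

int
main (void)
{
  printf ("%llu\n", (unsigned long long) first_vec (1ull << 5, 100)); /* 5 */
  printf ("%llu\n", (unsigned long long) first_vec (0, 10));          /* 10 */
  return 0;
}

The 64-bit bsfq matters here: with the 32-bit register forms the upper
halves of rcx and rax would not be reliable, as the patch comment notes.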
> > > >
> > > > Can you post some performance numbers?
> > >
> > > SKX perf data attached.
> >
> > Can you add numbers comparing this version to the previous version?
>
> Here you go.
This change is okay by me.
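For anyone following the loop restructure: the remaining-length subtract
and branch now come after the match test, so the plain strlen build
keeps a single conditional jump in the hot loop.  A toy scalar model of
that ordering, just to show the control flow (toy_strnlen and STEP are
my names, not glibc's; like the real code, a whole step is examined at
once):

#include <stddef.h>
#include <stdio.h>

#define STEP 4   /* stands in for CHAR_PER_VEC * 4 */

static size_t
toy_strnlen (const char *s, size_t maxlen)
{
  size_t done = 0;
  for (;;)
    {
      size_t i;
      /* VMOVA/VPMINU/VPTESTN/KORTEST: scan one step for a NUL.  */
      for (i = 0; i < STEP && s[done + i] != '\0'; i++)
        ;
      if (i < STEP)                 /* jnz L(loopend) */
        return done + i < maxlen ? done + i : maxlen;
      done += STEP;                 /* subq $(CHAR_PER_VEC * 4), %rdx */
      if (done >= maxlen)           /* ja L(loop) not taken */
        return maxlen;              /* mov %rsi, %rax; ret */
    }
}

int
main (void)
{
  printf ("%zu\n", toy_strnlen ("hello", 16)); /* 5 */
  printf ("%zu\n", toy_strnlen ("hello", 3));  /* 3 */
  return 0;
}

Deciding "found a NUL?" before "out of budget?" is what lets the jmp
into the middle of the loop go away.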