On Wed, May 18, 2022 at 1:29 PM Noah Goldstein wrote:
>
> On Wed, May 18, 2022 at 1:59 PM Sunil K Pandey via Libc-alpha
> wrote:
> >
> > This patch implements the following evex512 versions of string functions.
> > Perf gain up to 50% as compared to evex, depending on length and
> > alignment.
>
> Can you include a csv (or any consistent fmt really) somewhere of all
> the benchmarks
> and results of ~10-20 runs and the hardware you're benchmarking on?

Machine: Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz
Fedora 35
Glibc master

Data for 20 iterations of each function is attached; please use any text
editor (e.g. vi) to view it.

> >
> > - String length function using 512 bit vectors.
> > - String N length using 512 bit vectors.
> > - Wide string length using 512 bit vectors.
> > - Wide string N length using 512 bit vectors.
> > ---
> >  sysdeps/x86_64/multiarch/Makefile          |   4 +
> >  sysdeps/x86_64/multiarch/ifunc-impl-list.c |  20 ++
> >  sysdeps/x86_64/multiarch/strlen-evex512.S  | 291 +++++++++++++++++++++
> >  sysdeps/x86_64/multiarch/strnlen-evex512.S |   4 +
> >  sysdeps/x86_64/multiarch/wcslen-evex512.S  |   4 +
> >  sysdeps/x86_64/multiarch/wcsnlen-evex512.S |   5 +
> >  6 files changed, 328 insertions(+)
> >  create mode 100644 sysdeps/x86_64/multiarch/strlen-evex512.S
> >  create mode 100644 sysdeps/x86_64/multiarch/strnlen-evex512.S
> >  create mode 100644 sysdeps/x86_64/multiarch/wcslen-evex512.S
> >  create mode 100644 sysdeps/x86_64/multiarch/wcsnlen-evex512.S
> >
> > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> > index f3ab5e0928..d0869c3ac3 100644
> > --- a/sysdeps/x86_64/multiarch/Makefile
> > +++ b/sysdeps/x86_64/multiarch/Makefile
> > @@ -81,6 +81,7 @@ sysdep_routines += \
> >    strlen-avx2 \
> >    strlen-avx2-rtm \
> >    strlen-evex \
> > +  strlen-evex512 \
> >    strlen-sse2 \
> >    strncase_l-avx2 \
> >    strncase_l-avx2-rtm \
> > @@ -105,6 +106,7 @@ sysdep_routines += \
> >    strnlen-avx2 \
> >    strnlen-avx2-rtm \
> >    strnlen-evex \
> > +  strnlen-evex512 \
> >    strnlen-sse2 \
> >    strpbrk-c \
> >    strpbrk-sse2 \
> > @@ -138,6 +140,7 @@ sysdep_routines += \
> >    wcslen-avx2 \
> >    wcslen-avx2-rtm \
> >    wcslen-evex \
> > +  wcslen-evex512 \
> >    wcslen-sse2 \
> >    wcslen-sse4_1 \
> >    wcsncmp-avx2 \
> > @@ -148,6 +151,7 @@ sysdep_routines += \
> >    wcsnlen-avx2-rtm \
> >    wcsnlen-c \
> >    wcsnlen-evex \
> > +  wcsnlen-evex512 \
> >    wcsnlen-sse4_1 \
> >    wcsrchr-avx2 \
> >    wcsrchr-avx2-rtm \
> > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > index 7218095430..c5cd9466fe 100644
> > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> > @@ -328,6 +328,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> >                             && CPU_FEATURE_USABLE (AVX512BW)
> >                             && CPU_FEATURE_USABLE (BMI2)),
> >                            __strlen_evex)
> > +        IFUNC_IMPL_ADD (array, i, strlen,
> > +                        (CPU_FEATURE_USABLE (AVX512VL)
> > +                         && CPU_FEATURE_USABLE (AVX512BW)
> > +                         && CPU_FEATURE_USABLE (BMI2)),
> > +                        __strlen_evex512)
> >         IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2))
> >
> >    /* Support sysdeps/x86_64/multiarch/strnlen.c.  */
> > @@ -346,6 +351,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> >                             && CPU_FEATURE_USABLE (AVX512BW)
> >                             && CPU_FEATURE_USABLE (BMI2)),
> >                            __strnlen_evex)
> > +        IFUNC_IMPL_ADD (array, i, strnlen,
> > +                        (CPU_FEATURE_USABLE (AVX512VL)
> > +                         && CPU_FEATURE_USABLE (AVX512BW)
> > +                         && CPU_FEATURE_USABLE (BMI2)),
> > +                        __strnlen_evex512)
> >         IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_sse2))
> >
> >    /* Support sysdeps/x86_64/multiarch/stpncpy.c.  */
> > @@ -699,6 +709,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> >                             && CPU_FEATURE_USABLE (AVX512BW)
> >                             && CPU_FEATURE_USABLE (BMI2)),
> >                            __wcslen_evex)
> > +        IFUNC_IMPL_ADD (array, i, wcslen,
> > +                        (CPU_FEATURE_USABLE (AVX512VL)
> > +                         && CPU_FEATURE_USABLE (AVX512BW)
> > +                         && CPU_FEATURE_USABLE (BMI2)),
> > +                        __wcslen_evex512)
> >         IFUNC_IMPL_ADD (array, i, wcslen,
> >                         CPU_FEATURE_USABLE (SSE4_1),
> >                         __wcslen_sse4_1)
> > @@ -720,6 +735,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> >                             && CPU_FEATURE_USABLE (AVX512BW)
> >                             && CPU_FEATURE_USABLE (BMI2)),
> >                            __wcsnlen_evex)
> > +        IFUNC_IMPL_ADD (array, i, wcsnlen,
> > +                        (CPU_FEATURE_USABLE (AVX512VL)
> > +                         && CPU_FEATURE_USABLE (AVX512BW)
> > +                         && CPU_FEATURE_USABLE (BMI2)),
> > +                        __wcsnlen_evex512)
> >         IFUNC_IMPL_ADD (array, i, wcsnlen,
> >                         CPU_FEATURE_USABLE (SSE4_1),
> >                         __wcsnlen_sse4_1)
> > diff --git a/sysdeps/x86_64/multiarch/strlen-evex512.S b/sysdeps/x86_64/multiarch/strlen-evex512.S
> > new file mode 100644
> > index 0000000000..13a6b34615
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/strlen-evex512.S
> > @@ -0,0 +1,291 @@
> > +/* Copyright (C) 2022 Free Software Foundation, Inc.
> > +   This file is part of the GNU C Library.
> > +
> > +   The GNU C Library is free software; you can redistribute it and/or
> > +   modify it under the terms of the GNU Lesser General Public
> > +   License as published by the Free Software Foundation; either
> > +   version 2.1 of the License, or (at your option) any later version.
> > +
> > +   The GNU C Library is distributed in the hope that it will be useful,
> > +   but WITHOUT ANY WARRANTY; without even the implied warranty of
> > +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> > +   Lesser General Public License for more details.
> > +
> > +   You should have received a copy of the GNU Lesser General Public
> > +   License along with the GNU C Library; if not, see
> > +   <https://www.gnu.org/licenses/>.  */
> > +
> > +#if IS_IN (libc)
> > +
> > +# include <sysdep.h>
> > +
> > +# ifndef STRLEN
> > +#  define STRLEN        __strlen_evex512
> > +# endif
> > +
> > +# define VMOVA          vmovdqa64
> > +# ifdef USE_AS_WCSLEN
> > +#  define VPCMP         vpcmpd
> > +#  define VPMINU        vpminud
> > +#  define CHAR_SIZE     4
> > +# else
> > +#  define VPCMP         vpcmpb
> > +#  define VPMINU        vpminub
> > +#  define CHAR_SIZE     1
> > +# endif
> > +
> > +# define XMM0           xmm16
> > +# define ZMM0           zmm16
> > +# define ZMM1           zmm17
> > +# define ZMM2           zmm18
> > +# define ZMM3           zmm19
> > +# define ZMM4           zmm20
> > +# define VEC_SIZE       64
> > +# define PAGE_SIZE      4096
> > +# define CHAR_PER_VEC   (VEC_SIZE / CHAR_SIZE)
>
> Is it possible to integrate this file cleanly with the evex256 version?
> Something similar to what we do for memset/memmove.

Good suggestion, I will look into it. For this first iteration, let's keep it
standalone.
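As context for the ifunc-impl-list.c hunks above: each new IFUNC_IMPL_ADD
entry registers the evex512 variant only when AVX512VL, AVX512BW and BMI2 are
all usable. A rough C sketch of that selection condition is below; cpu_has()
is a made-up stand-in for glibc's CPU_FEATURE_USABLE machinery, and the real
dispatch lives in the ifunc-*.h selectors, not in this form.

#include <stddef.h>
#include <stdbool.h>

typedef size_t (*strlen_fn) (const char *);

extern size_t __strlen_evex512 (const char *);
extern size_t __strlen_sse2 (const char *);

/* Stand-in for CPU_FEATURE_USABLE; not a real glibc interface.  */
extern bool cpu_has (const char *feature);

static strlen_fn
select_strlen (void)
{
  /* Mirrors the guard on the __strlen_evex512 entry above.  */
  if (cpu_has ("AVX512VL") && cpu_has ("AVX512BW") && cpu_has ("BMI2"))
    return __strlen_evex512;
  /* The evex/avx2/sse2 fallbacks would follow the same pattern.  */
  return __strlen_sse2;
}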
> > +
> > +        .section .text.evex512, "ax", @progbits
> > +/* Aligning the entry point to 64 bytes provides better performance for
> > +   one-vector-length strings.  */
> > +ENTRY_P2ALIGN (STRLEN, 6)
> > +# ifdef USE_AS_STRNLEN
> > +        /* Check for zero length.  */
> > +        test    %RSI_LP, %RSI_LP
> > +        jz      L(zero)
> > +#  ifdef __ILP32__
> > +        /* Clear the upper 32 bits.  */
> > +        movl    %esi, %esi
> > +#  endif
> > +# endif
> > +
> > +        movl    %edi, %ecx
> > +        vpxorq  %XMM0, %XMM0, %XMM0
> > +        andl    $(PAGE_SIZE - 1), %ecx
> > +        cmpl    $(PAGE_SIZE - VEC_SIZE), %ecx
> > +        ja      L(page_cross)
> > +
> > +        /* Compare [w]chars for null; the mask bit is set for each match.  */
> > +        VPCMP   $0, (%rdi), %ZMM0, %k0
> > +        kmovq   %k0, %rax
> > +        testq   %rax, %rax
> > +        jz      L(align_more)
> > +
> > +        tzcntq  %rax, %rax
> > +# ifdef USE_AS_STRNLEN
> > +        cmpq    %rsi, %rax
> cmpl
>
> > +        jae     L(ret_max)
> > +# endif
> > +        ret
> > +
> > +# ifdef USE_AS_STRNLEN
> > +        /* eax is used instead of rax to save encoding space.  */
> > +L(zero):
> > +        xorl    %eax, %eax
> > +        ret
> > +# endif
> > +
> > +        /* At this point the maximum length has been reached.  */
> > +# ifdef USE_AS_STRNLEN
> > +L(ret_max):
> > +        movq    %rsi, %rax
> > +        ret
> > +# endif
> > +
> > +L(page_cross):
> Imo unless you need the 2-byte encoding on the jump this should be at
> the end of the
> file as it's expected to not be hot.

One of my goals is to reduce size as much as possible, as long as it doesn't
hurt performance. Keeping the jump target nearby saves a few bytes without
hurting performance.

> > +        andl    $(VEC_SIZE - 1), %ecx
> > +# ifdef USE_AS_WCSLEN
> > +        sarl    $2, %ecx
> > +# endif
> > +        /* ecx contains the number of [w]chars to be skipped as a result
> > +           of address alignment.  */
> > +        movq    %rdi, %rax
> > +        andq    $-VEC_SIZE, %rax
> > +        VPCMP   $0, (%rax), %ZMM0, %k0
> > +        kmovq   %k0, %rax
> > +        /* Ignore the characters that were only compared for alignment
> > +           adjustment.  */
> > +        shrq    %cl, %rax
> > +        jz      L(align_more)
> > +
> > +        tzcntq  %rax, %rax
> > +# ifdef USE_AS_STRNLEN
> > +        cmpq    %rsi, %rax
> > +        jae     L(ret_max)
> > +# endif
> > +        ret
> > +
> > +L(align_more):
> > +        leaq    VEC_SIZE(%rdi), %rax
> > +        /* Align rax to VEC_SIZE.  */
> > +        andq    $-VEC_SIZE, %rax
> > +# ifdef USE_AS_STRNLEN
> > +        movq    %rax, %rdx
> > +        subq    %rdi, %rdx
> > +#  ifdef USE_AS_WCSLEN
> > +        shrq    $2, %rdx
> > +#  endif
> > +        /* At this point rdx contains the number of [w]chars already
> > +           compared.  */
> > +        cmpq    %rsi, %rdx
> > +        jae     L(ret_max)
> > +        subq    %rsi, %rdx
> > +        negq    %rdx
> > +        /* At this point rdx contains the number of [w]chars left to check.
> > +           From now on rdx keeps decrementing with each compare.  */
> > +# endif
> > +
> > +        /* Unrolled 4 times before entering the 4-vector loop.  */
> > +        VPCMP   $0, (%rax), %ZMM0, %k0
> > +        kmovq   %k0, %rcx
> > +        testq   %rcx, %rcx
> > +        jnz     L(first_vector)
> Just to keep consistent with the other files, can you
> rename first_vector/second_vector... to ret_vec_x{N}
> or something like that.

Agreed, will be fixed in v1.

> > +
> > +# ifdef USE_AS_STRNLEN
> > +        subq    $CHAR_PER_VEC, %rdx
> > +        jbe     L(ret_max)
> > +# endif
> > +
> > +        VPCMP   $0, VEC_SIZE(%rax), %ZMM0, %k0
> > +        kmovq   %k0, %rcx
> > +        testq   %rcx, %rcx
> > +        jnz     L(second_vector)
> > +
> > +# ifdef USE_AS_STRNLEN
> > +        subq    $CHAR_PER_VEC, %rdx
> > +        jbe     L(ret_max)
> > +# endif
>
> The evex256 / avx2 versions do a simple check if we will be able
> to do all 4 aligning compares w.o a branch. This saves total
> branches. Why not do something similar here?

It is done this way to reduce size and complexity. If the branch is taken, we
jump to the terminating condition; if it is not taken, there is no impact on
perf.
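To make the control flow discussed above easier to follow, here is a rough
scalar C model of the strlen case (CHAR_SIZE == 1). probe64() stands in for
one VPCMP/kmovq pair, the strnlen bookkeeping (rdx/rsi and L(ret_max)) is
omitted, and the model simply mirrors the vector code's in-page overreads.
All names are invented for illustration; this is exposition, not the
implementation.

#include <stddef.h>
#include <stdint.h>

#define VEC_SIZE  64
#define PAGE_SIZE 4096

/* Bit i is set when p[i] == 0, like vpcmpb $0 against a zeroed register
   followed by kmovq.  Reads all 64 bytes, as the vector code does.  */
static uint64_t
probe64 (const char *p)
{
  uint64_t mask = 0;
  for (int i = 0; i < VEC_SIZE; i++)
    if (p[i] == '\0')
      mask |= (uint64_t) 1 << i;
  return mask;
}

static size_t
strlen_model (const char *s)
{
  uintptr_t page_off = (uintptr_t) s & (PAGE_SIZE - 1);

  if (page_off <= PAGE_SIZE - VEC_SIZE)
    {
      /* The first 64 bytes cannot cross a page: probe them unaligned.  */
      uint64_t m = probe64 (s);
      if (m)
        return (size_t) __builtin_ctzll (m);
    }
  else
    {
      /* Page cross: probe the enclosing aligned vector and shift away the
         bytes before the string start (the shrq %cl step).  */
      const char *a = (const char *) ((uintptr_t) s & ~(uintptr_t) (VEC_SIZE - 1));
      uint64_t m = probe64 (a) >> ((uintptr_t) s & (VEC_SIZE - 1));
      if (m)
        return (size_t) __builtin_ctzll (m);
    }

  /* L(align_more): advance to the next 64-byte boundary, then check four
     vectors individually before entering the 4 * VEC_SIZE loop.  */
  const char *v = (const char *)
    (((uintptr_t) s + VEC_SIZE) & ~(uintptr_t) (VEC_SIZE - 1));
  for (int i = 0; i < 4; i++)
    {
      uint64_t m = probe64 (v + i * VEC_SIZE);
      if (m)
        return (size_t) (v + i * VEC_SIZE - s) + __builtin_ctzll (m);
    }

  /* L(loop): 4 * VEC_SIZE per iteration from a 256-byte aligned base.  The
     VPMINU/kortest trick only decides whether a null was seen; the exact
     vector is identified again after the loop, so re-probing already
     checked, null-free vectors here is harmless.  */
  v = (const char *)
    (((uintptr_t) v + 4 * VEC_SIZE) & ~(uintptr_t) (4 * VEC_SIZE - 1));
  for (;;)
    {
      for (int i = 0; i < 4; i++)
        {
          uint64_t m = probe64 (v + i * VEC_SIZE);
          if (m)
            return (size_t) (v + i * VEC_SIZE - s) + __builtin_ctzll (m);
        }
      v += 4 * VEC_SIZE;
    }
}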
> > +
> > +        VPCMP   $0, (2 * VEC_SIZE)(%rax), %ZMM0, %k0
> > +        kmovq   %k0, %rcx
> > +        testq   %rcx, %rcx
> > +        jnz     L(third_vector)
> > +
> > +# ifdef USE_AS_STRNLEN
> > +        subq    $CHAR_PER_VEC, %rdx
> > +        jbe     L(ret_max)
> > +# endif
> > +
> > +        VPCMP   $0, (3 * VEC_SIZE)(%rax), %ZMM0, %k0
> > +        kmovq   %k0, %rcx
> > +        testq   %rcx, %rcx
> > +        jnz     L(fourth_vector)
> > +
> > +        addq    $(4 * VEC_SIZE), %rax
> > +
> > +# ifdef USE_AS_STRNLEN
> > +        /* Instead of decreasing, rdx is increased to prepare for the
> > +           first loop iteration.  It is incremented 3 times because one
> > +           increment is cancelled by the previous decrement.  */
> > +        addq    $(3 * CHAR_PER_VEC), %rdx
> > +# endif
> > +
> > +        /* If the address is already 4 * VEC_SIZE aligned, go to the
> > +           loop.  */
> > +        testq   $(3 * VEC_SIZE), %rax
> > +        jz      L(loop)
> > +
> > +        movq    %rax, %rcx
> > +
> > +        /* Align the address to 4 * VEC_SIZE for the loop.  */
> > +        andq    $-(4 * VEC_SIZE), %rax
> > +
> > +# ifdef USE_AS_STRNLEN
> > +        subq    %rax, %rcx
> > +#  ifdef USE_AS_WCSLEN
> > +        sarq    $2, %rcx
> > +#  endif
> > +        /* rcx contains the number of [w]chars that will be recompared due
> > +           to the alignment fix.  rdx must be incremented by rcx to offset
> > +           the alignment adjustment.  */
> > +        addq    %rcx, %rdx
> > +# endif
> > +
> > +L(loop):
> > +# ifdef USE_AS_STRNLEN
> > +        subq    $(CHAR_PER_VEC * 4), %rdx
> > +        jbe     L(ret_max)
> We have the potential to overread by 255 bytes. Not a correctness issue
> because we are page aligned, but it seems like a possible perf issue.

Correct, but the overread data will be read from cache, not memory, so the
impact is not significant. This is the cost we have to pay for the 4-vector
alignment.

> > +# endif
> > +        /* The VPMINU and VPCMP combination provides better performance
> > +           than alternative combinations.  */
> > +        VMOVA   (%rax), %ZMM1
> > +        VPMINU  (VEC_SIZE)(%rax), %ZMM1, %ZMM2
> > +        VMOVA   (2 * VEC_SIZE)(%rax), %ZMM3
> > +        VPMINU  (3 * VEC_SIZE)(%rax), %ZMM3, %ZMM4
>
> I think doing 4x in the main loop is probably overkill, no?
> Aligning to 256 is pretty extreme.
>
> Also I don't think the 4x zmm loads can even keep up with
> 2x / cycle, so it seems like it may not be worth wasting up to
> 255 bytes to get it.

The perf numbers look good, so for now it should be OK.

> > +
> > +        VPCMP   $0, %ZMM2, %ZMM0, %k0
> > +        VPCMP   $0, %ZMM4, %ZMM0, %k1
> > +
> > +        addq    $(4 * VEC_SIZE), %rax
> > +        kortestq %k0, %k1
> > +        jz      L(loop)
> > +
> > +        /* A 4-vector subtraction is needed because the address is
> > +           incremented in the loop before the terminating condition is
> > +           checked.  We also want to reuse the exit-condition code before
> > +           and after the loop.  */
> > +        subq    $(4 * VEC_SIZE), %rax
> > +
> > +        VPCMP   $0, %ZMM1, %ZMM0, %k2
> > +        kmovq   %k2, %rcx
> > +        testq   %rcx, %rcx
> > +        jnz     L(first_vector)
> > +
> > +        kmovq   %k0, %rcx
> > +        /* At this point, if k0 is non-zero, the null char must be in the
> > +           second vector.  */
> > +        testq   %rcx, %rcx
> > +        jnz     L(second_vector)
> > +
> > +        VPCMP   $0, %ZMM3, %ZMM0, %k3
> > +        kmovq   %k3, %rcx
> > +        testq   %rcx, %rcx
> > +        jnz     L(third_vector)
> > +        /* At this point the null [w]char must be in the fourth vector, so
> > +           there is no need to check.  */
> > +        kmovq   %k1, %rcx
> > +
> > +        /* The termination code for the fourth, third and second vectors
> > +           is pretty much the same; it is implemented this way to avoid
> > +           branching and to reuse code from the pre-loop exit condition.  */
> > +L(fourth_vector):
> > +        addq    $(3 * VEC_SIZE), %rax
> > +        tzcntq  %rcx, %rcx
> > +        subq    %rdi, %rax
> Can this be hoisted out to the beginning of L(align_more)?
> It seems every return path uses it.
>
It really depends on where control is coming from, so moving it before
L(align_more) would not be correct; or I may be missing something here.
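For reference, every one of these return paths computes the same quantity,
roughly the following (a hedged C sketch; length_from_hit, start, base and
mask are names invented here, with base being the address of the vector in
which the null was found, after the addq above, and mask the corresponding
k-register value):

#include <stddef.h>
#include <stdint.h>

static size_t
length_from_hit (const char *start, const char *base,
                 uint64_t mask, size_t char_size)
{
  /* subq %rdi, %rax (plus sarq $2 for wcslen): bytes before this vector,
     converted to characters ...  */
  size_t chars_before = (size_t) (base - start) / char_size;
  /* ... addq %rcx, %rax: plus the character offset inside the vector.  */
  return chars_before + (size_t) __builtin_ctzll (mask);
}

Because vpcmpd produces one mask bit per dword, tzcnt already yields a
wide-character index in the wcslen case, which is why the sarq $2 is applied
only to the byte distance and not to the tzcnt result.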
> > +# ifdef USE_AS_WCSLEN
> > +        sarq    $2, %rax
> > +# endif
> > +        addq    %rcx, %rax
> If not wcslen, probably faster to use lea instead of 2x add.

I'm not sure whether there would be any significant gain from lea vs. add.
I used add because it's readily available on all ports.

>
> > +# ifdef USE_AS_STRNLEN
> > +        cmpq    %rsi, %rax
> > +        jae     L(ret_max)
> > +# endif
> > +        ret
> > +
> > +L(third_vector):
> > +        addq    $(2 * VEC_SIZE), %rax
> > +        tzcntq  %rcx, %rcx
> > +        subq    %rdi, %rax
> > +# ifdef USE_AS_WCSLEN
> > +        sarq    $2, %rax
> > +# endif
> > +        addq    %rcx, %rax
> > +# ifdef USE_AS_STRNLEN
> > +        cmpq    %rsi, %rax
> > +        jae     L(ret_max)
> > +# endif
> > +        ret
> > +
> > +L(second_vector):
> > +        addq    $VEC_SIZE, %rax
> > +L(first_vector):
> > +        tzcntq  %rcx, %rcx
> > +        subq    %rdi, %rax
> > +# ifdef USE_AS_WCSLEN
> > +        sarq    $2, %rax
> > +# endif
> > +        addq    %rcx, %rax
> > +# ifdef USE_AS_STRNLEN
> > +        cmpq    %rsi, %rax
> > +        jae     L(ret_max)
> > +# endif
> > +        ret
> > +
> > +END (STRLEN)
> > +#endif
> > diff --git a/sysdeps/x86_64/multiarch/strnlen-evex512.S b/sysdeps/x86_64/multiarch/strnlen-evex512.S
> > new file mode 100644
> > index 0000000000..0b7f220214
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/strnlen-evex512.S
> > @@ -0,0 +1,4 @@
> > +#define STRLEN __strnlen_evex512
> > +#define USE_AS_STRNLEN 1
> > +
> > +#include "strlen-evex512.S"
> > diff --git a/sysdeps/x86_64/multiarch/wcslen-evex512.S b/sysdeps/x86_64/multiarch/wcslen-evex512.S
> > new file mode 100644
> > index 0000000000..f59c372b78
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/wcslen-evex512.S
> > @@ -0,0 +1,4 @@
> > +#define STRLEN __wcslen_evex512
> > +#define USE_AS_WCSLEN 1
> > +
> > +#include "strlen-evex512.S"
> > diff --git a/sysdeps/x86_64/multiarch/wcsnlen-evex512.S b/sysdeps/x86_64/multiarch/wcsnlen-evex512.S
> > new file mode 100644
> > index 0000000000..73dcf2f210
> > --- /dev/null
> > +++ b/sysdeps/x86_64/multiarch/wcsnlen-evex512.S
> > @@ -0,0 +1,5 @@
> > +#define STRLEN __wcsnlen_evex512
> > +#define USE_AS_WCSLEN 1
> > +#define USE_AS_STRNLEN 1
> > +
> > +#include "strlen-evex512.S"
> > --
> > 2.35.3
> >
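As a reminder of the semantics the wrapper files above select via STRLEN,
USE_AS_WCSLEN and USE_AS_STRNLEN, a plain C reference (purely illustrative,
not glibc's implementation; the other two variants follow the same pattern)
would be:

#include <stddef.h>
#include <wchar.h>

/* strlen-evex512.S with no extra defines: count chars up to the null.  */
size_t
ref_strlen (const char *s)
{
  size_t n = 0;
  while (s[n] != '\0')
    n++;
  return n;
}

/* wcsnlen-evex512.S (USE_AS_WCSLEN + USE_AS_STRNLEN): wide characters,
   and never look past maxlen; the assembly enforces the cap with the
   rdx/rsi bookkeeping and L(ret_max).  */
size_t
ref_wcsnlen (const wchar_t *s, size_t maxlen)
{
  size_t n = 0;
  while (n < maxlen && s[n] != L'\0')
    n++;
  return n;
}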