public inbox for libc-alpha@sourceware.org
 help / color / mirror / Atom feed
From: Noah Goldstein <goldstein.w.n@gmail.com>
To: GNU C Library <libc-alpha@sourceware.org>
Subject: Re: [PATCH v1 6/6] x86_64: Add evex optimized __memcmpeq in memcmpeq-evex.S
Date: Tue, 26 Oct 2021 21:44:54 -0500	[thread overview]
Message-ID: <CAFUsyfK0dk8im2gWRxaz6u1aWZiZ2z+b3x2J7y0-Q2HNKAKCKQ@mail.gmail.com> (raw)
In-Reply-To: <20211027024323.1199441-6-goldstein.w.n@gmail.com>

[-- Attachment #1: Type: text/plain, Size: 13943 bytes --]

On Tue, Oct 26, 2021 at 9:43 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> No bug. This commit adds new optimized __memcmpeq implementation for
> evex.
>
> The primary optimizations are:
>
> 1) skipping the logic to find the difference of the first mismatched
> byte.
>
> 2) not updating src/dst addresses as the non-equals logic does not
> need to be reused by different areas.
> ---
>  sysdeps/x86_64/multiarch/ifunc-impl-list.c |   1 -
>  sysdeps/x86_64/multiarch/ifunc-memcmpeq.h  |   1 -
>  sysdeps/x86_64/multiarch/memcmpeq-evex.S   | 308 ++++++++++++++++++++-
>  3 files changed, 304 insertions(+), 6 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index 535450f52c..ea8df9f9b9 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -52,7 +52,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>               IFUNC_IMPL_ADD (array, i, __memcmpeq,
>                               (CPU_FEATURE_USABLE (AVX512VL)
>                                && CPU_FEATURE_USABLE (AVX512BW)
> -                   && CPU_FEATURE_USABLE (MOVBE)
>                                && CPU_FEATURE_USABLE (BMI2)),
>                               __memcmpeq_evex)
>               IFUNC_IMPL_ADD (array, i, __memcmpeq, 1, __memcmpeq_sse2))
> diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmpeq.h b/sysdeps/x86_64/multiarch/ifunc-memcmpeq.h
> index e596c5048b..2ea38adf05 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-memcmpeq.h
> +++ b/sysdeps/x86_64/multiarch/ifunc-memcmpeq.h
> @@ -34,7 +34,6 @@ IFUNC_SELECTOR (void)
>        && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
>      {
>        if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
> -         && CPU_FEATURE_USABLE_P (cpu_features, MOVBE)
>           && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
>         return OPTIMIZE1 (evex);
>
> diff --git a/sysdeps/x86_64/multiarch/memcmpeq-evex.S b/sysdeps/x86_64/multiarch/memcmpeq-evex.S
> index 951e1e9560..f27e732036 100644
> --- a/sysdeps/x86_64/multiarch/memcmpeq-evex.S
> +++ b/sysdeps/x86_64/multiarch/memcmpeq-evex.S
> @@ -16,8 +16,308 @@
>     License along with the GNU C Library; if not, see
>     <https://www.gnu.org/licenses/>.  */
>
> -#ifndef MEMCMP
> -# define MEMCMP        __memcmpeq_evex
> -#endif
> +#if IS_IN (libc)
> +
> +/* __memcmpeq is implemented as:
> +   1. Use ymm vector compares when possible. The only case where
> +      vector compares is not possible for when size < VEC_SIZE
> +      and loading from either s1 or s2 would cause a page cross.
> +   2. Use xmm vector compare when size >= 8 bytes.
> +   3. Optimistically compare up to first 4 * VEC_SIZE one at a
> +      to check for early mismatches. Only do this if its guranteed the
> +      work is not wasted.
> +   4. If size is 8 * VEC_SIZE or less, unroll the loop.
> +   5. Compare 4 * VEC_SIZE at a time with the aligned first memory
> +      area.
> +   6. Use 2 vector compares when size is 2 * VEC_SIZE or less.
> +   7. Use 4 vector compares when size is 4 * VEC_SIZE or less.
> +   8. Use 8 vector compares when size is 8 * VEC_SIZE or less.  */
> +
> +# include <sysdep.h>
> +
> +# ifndef MEMCMPEQ
> +#  define MEMCMPEQ     __memcmpeq_evex
> +# endif
> +
> +# define VMOVU vmovdqu64
> +# define VPCMP vpcmpub
> +# define VPTEST        vptestmb
> +
> +# define VEC_SIZE      32
> +# define PAGE_SIZE     4096
> +
> +# define YMM0          ymm16
> +# define YMM1          ymm17
> +# define YMM2          ymm18
> +# define YMM3          ymm19
> +# define YMM4          ymm20
> +# define YMM5          ymm21
> +# define YMM6          ymm22
> +
> +
> +       .section .text.evex, "ax", @progbits
> +ENTRY_P2ALIGN (MEMCMPEQ, 6)
> +# ifdef __ILP32__
> +       /* Clear the upper 32 bits.  */
> +       movl    %edx, %edx
> +# endif
> +       cmp     $VEC_SIZE, %RDX_LP
> +       jb      L(less_vec)
> +
> +       /* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
> +       VMOVU   (%rsi), %YMM1
> +       /* Use compare not equals to directly check for mismatch.  */
> +       VPCMP   $4, (%rdi), %YMM1, %k1
> +       kmovd   %k1, %eax
> +       testl   %eax, %eax
> +       jnz     L(return_neq0)
> +
> +       cmpq    $(VEC_SIZE * 2), %rdx
> +       jbe     L(last_1x_vec)
> +
> +       /* Check second VEC no matter what.  */
> +       VMOVU   VEC_SIZE(%rsi), %YMM2
> +       VPCMP   $4, VEC_SIZE(%rdi), %YMM2, %k1
> +       kmovd   %k1, %eax
> +       testl   %eax, %eax
> +       jnz     L(return_neq0)
> +
> +       /* Less than 4 * VEC.  */
> +       cmpq    $(VEC_SIZE * 4), %rdx
> +       jbe     L(last_2x_vec)
> +
> +       /* Check third and fourth VEC no matter what.  */
> +       VMOVU   (VEC_SIZE * 2)(%rsi), %YMM3
> +       VPCMP   $4, (VEC_SIZE * 2)(%rdi), %YMM3, %k1
> +       kmovd   %k1, %eax
> +       testl   %eax, %eax
> +       jnz     L(return_neq0)
> +
> +       VMOVU   (VEC_SIZE * 3)(%rsi), %YMM4
> +       VPCMP   $4, (VEC_SIZE * 3)(%rdi), %YMM4, %k1
> +       kmovd   %k1, %eax
> +       testl   %eax, %eax
> +       jnz     L(return_neq0)
> +
> +       /* Go to 4x VEC loop.  */
> +       cmpq    $(VEC_SIZE * 8), %rdx
> +       ja      L(more_8x_vec)
> +
> +       /* Handle remainder of size = 4 * VEC + 1 to 8 * VEC without any
> +          branches.  */
> +
> +       VMOVU   -(VEC_SIZE * 4)(%rsi, %rdx), %YMM1
> +       VMOVU   -(VEC_SIZE * 3)(%rsi, %rdx), %YMM2
> +       addq    %rdx, %rdi
> +
> +       /* Wait to load from s1 until addressed adjust due to
> +          unlamination.  */
> +
> +       /* vpxor will be all 0s if s1 and s2 are equal. Otherwise it
> +          will have some 1s.  */
> +       vpxorq  -(VEC_SIZE * 4)(%rdi), %YMM1, %YMM1
> +       /* Ternary logic to xor -(VEC_SIZE * 3)(%rdi) with YMM2 while
> +          oring with YMM1. Result is stored in YMM1.  */
> +       vpternlogd $0xde, -(VEC_SIZE * 3)(%rdi), %YMM1, %YMM2
> +
> +       VMOVU   -(VEC_SIZE * 2)(%rsi, %rdx), %YMM3
> +       vpxorq  -(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
> +       /* Or together YMM1, YMM2, and YMM3 into YMM3.  */
> +       VMOVU   -(VEC_SIZE)(%rsi, %rdx), %YMM4
> +       vpxorq  -(VEC_SIZE)(%rdi), %YMM4, %YMM4
> +
> +       /* Or together YMM2, YMM3, and YMM4 into YMM4.  */
> +       vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
>
> -#include "memcmp-evex-movbe.S"
> +       /* Compare YMM4 with 0. If any 1s s1 and s2 don't match.  */
> +       VPTEST  %YMM4, %YMM4, %k1
> +       kmovd   %k1, %eax
> +L(return_neq0):
> +       ret
> +
> +       /* Fits in padding needed to .p2align 5 L(less_vec).  */
> +L(last_1x_vec):
> +       VMOVU   -(VEC_SIZE * 1)(%rsi, %rdx), %YMM1
> +       VPCMP   $4, -(VEC_SIZE * 1)(%rdi, %rdx), %YMM1, %k1
> +       kmovd   %k1, %eax
> +       ret
> +
> +       /* NB: p2align 5 here will ensure the L(loop_4x_vec) is also 32
> +          byte aligned.  */
> +       .p2align 5
> +L(less_vec):
> +       /* Check if one or less char. This is necessary for size = 0 but
> +          is also faster for size = 1.  */
> +       cmpl    $1, %edx
> +       jbe     L(one_or_less)
> +
> +       /* Check if loading one VEC from either s1 or s2 could cause a
> +          page cross. This can have false positives but is by far the
> +          fastest method.  */
> +       movl    %edi, %eax
> +       orl     %esi, %eax
> +       andl    $(PAGE_SIZE - 1), %eax
> +       cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
> +       jg      L(page_cross_less_vec)
> +
> +       /* No page cross possible.  */
> +       VMOVU   (%rsi), %YMM2
> +       VPCMP   $4, (%rdi), %YMM2, %k1
> +       kmovd   %k1, %eax
> +       /* Result will be zero if s1 and s2 match. Otherwise first set
> +          bit will be first mismatch.  */
> +       bzhil   %edx, %eax, %eax
> +       ret
> +
> +       /* Relatively cold but placing close to L(less_vec) for 2 byte
> +          jump encoding.  */
> +       .p2align 4
> +L(one_or_less):
> +       jb      L(zero)
> +       movzbl  (%rsi), %ecx
> +       movzbl  (%rdi), %eax
> +       subl    %ecx, %eax
> +       /* No ymm register was touched.  */
> +       ret
> +       /* Within the same 16 byte block is L(one_or_less).  */
> +L(zero):
> +       xorl    %eax, %eax
> +       ret
> +
> +       .p2align 4
> +L(last_2x_vec):
> +       VMOVU   -(VEC_SIZE * 2)(%rsi, %rdx), %YMM1
> +       vpxorq  -(VEC_SIZE * 2)(%rdi, %rdx), %YMM1, %YMM1
> +       VMOVU   -(VEC_SIZE * 1)(%rsi, %rdx), %YMM2
> +       vpternlogd $0xde, -(VEC_SIZE * 1)(%rdi, %rdx), %YMM1, %YMM2
> +       VPTEST  %YMM2, %YMM2, %k1
> +       kmovd   %k1, %eax
> +       ret
> +
> +       .p2align 4
> +L(more_8x_vec):
> +       /* Set end of s1 in rdx.  */
> +       leaq    -(VEC_SIZE * 4)(%rdi, %rdx), %rdx
> +       /* rsi stores s2 - s1. This allows loop to only update one
> +          pointer.  */
> +       subq    %rdi, %rsi
> +       /* Align s1 pointer.  */
> +       andq    $-VEC_SIZE, %rdi
> +       /* Adjust because first 4x vec where check already.  */
> +       subq    $-(VEC_SIZE * 4), %rdi
> +       .p2align 4
> +L(loop_4x_vec):
> +       VMOVU   (%rsi, %rdi), %YMM1
> +       vpxorq  (%rdi), %YMM1, %YMM1
> +
> +       VMOVU   VEC_SIZE(%rsi, %rdi), %YMM2
> +       vpternlogd $0xde, (VEC_SIZE)(%rdi), %YMM1, %YMM2
> +
> +       VMOVU   (VEC_SIZE * 2)(%rsi, %rdi), %YMM3
> +       vpxorq  (VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
> +
> +       VMOVU   (VEC_SIZE * 3)(%rsi, %rdi), %YMM4
> +       vpxorq  (VEC_SIZE * 3)(%rdi), %YMM4, %YMM4
> +
> +       vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
> +       VPTEST  %YMM4, %YMM4, %k1
> +       kmovd   %k1, %eax
> +       testl   %eax, %eax
> +       jnz     L(return_neq2)
> +       subq    $-(VEC_SIZE * 4), %rdi
> +       cmpq    %rdx, %rdi
> +       jb      L(loop_4x_vec)
> +
> +       subq    %rdx, %rdi
> +       VMOVU   (VEC_SIZE * 3)(%rsi, %rdx), %YMM4
> +       vpxorq  (VEC_SIZE * 3)(%rdx), %YMM4, %YMM4
> +       /* rdi has 4 * VEC_SIZE - remaining length.  */
> +       cmpl    $(VEC_SIZE * 3), %edi
> +       jae     L(8x_last_1x_vec)
> +       /* Load regardless of branch.  */
> +       VMOVU   (VEC_SIZE * 2)(%rsi, %rdx), %YMM3
> +       /* Ternary logic to xor (VEC_SIZE * 2)(%rdx) with YMM3 while
> +          oring with YMM4. Result is stored in YMM4.  */
> +       vpternlogd $0xf6, (VEC_SIZE * 2)(%rdx), %YMM3, %YMM4
> +       cmpl    $(VEC_SIZE * 2), %edi
> +       jae     L(8x_last_2x_vec)
> +
> +       VMOVU   VEC_SIZE(%rsi, %rdx), %YMM2
> +       vpxorq  VEC_SIZE(%rdx), %YMM2, %YMM2
> +
> +       VMOVU   (%rsi, %rdx), %YMM1
> +       vpxorq  (%rdx), %YMM1, %YMM1
> +
> +       vpternlogd $0xfe, %YMM1, %YMM2, %YMM4
> +L(8x_last_1x_vec):
> +L(8x_last_2x_vec):
> +       VPTEST  %YMM4, %YMM4, %k1
> +       kmovd   %k1, %eax
> +L(return_neq2):
> +       ret
> +
> +       /* Relatively cold case as page cross are unexpected.  */
> +       .p2align 4
> +L(page_cross_less_vec):
> +       cmpl    $16, %edx
> +       jae     L(between_16_31)
> +       cmpl    $8, %edx
> +       ja      L(between_9_15)
> +       cmpl    $4, %edx
> +       jb      L(between_2_3)
> +       /* From 4 to 8 bytes.  No branch when size == 4.  */
> +       movl    (%rdi), %eax
> +       subl    (%rsi), %eax
> +       movl    -4(%rdi, %rdx), %ecx
> +       movl    -4(%rsi, %rdx), %edi
> +       subl    %edi, %ecx
> +       orl     %ecx, %eax
> +       ret
> +
> +       .p2align 4,, 8
> +L(between_16_31):
> +       /* From 16 to 31 bytes.  No branch when size == 16.  */
> +
> +       /* Safe to use xmm[0, 15] as no vzeroupper is needed so RTM safe.
> +        */
> +       vmovdqu (%rsi), %xmm1
> +       vpcmpeqb (%rdi), %xmm1, %xmm1
> +       vmovdqu -16(%rsi, %rdx), %xmm2
> +       vpcmpeqb -16(%rdi, %rdx), %xmm2, %xmm2
> +       vpand   %xmm1, %xmm2, %xmm2
> +       vpmovmskb %xmm2, %eax
> +       notw    %ax
> +       /* No ymm register was touched.  */
> +       ret
> +
> +       .p2align 4,, 8
> +L(between_9_15):
> +       /* From 9 to 15 bytes.  */
> +       movq    (%rdi), %rax
> +       subq    (%rsi), %rax
> +       movq    -8(%rdi, %rdx), %rcx
> +       movq    -8(%rsi, %rdx), %rdi
> +       subq    %rdi, %rcx
> +       orq     %rcx, %rax
> +       /* edx is guranteed to be a non-zero int.  */
> +       cmovnz  %edx, %eax
> +       ret
> +
> +       /* Don't align. This is cold and aligning here will cause code
> +          to spill into next cache line.  */
> +L(between_2_3):
> +       /* From 2 to 3 bytes.  No branch when size == 2.  */
> +       movzwl  (%rdi), %eax
> +       movzwl  (%rsi), %ecx
> +       subl    %ecx, %eax
> +       movzbl  -1(%rdi, %rdx), %ecx
> +       /* All machines that support evex will insert a "merging uop"
> +          avoiding any serious partial register stalls.  */
> +       subb    -1(%rsi, %rdx), %cl
> +       orl     %ecx, %eax
> +       /* No ymm register was touched.  */
> +       ret
> +
> +    /* 4 Bytes from next cache line. */
> +END (MEMCMPEQ)
> +#endif
> --
> 2.25.1
>


Roughly ~0-25% improvement over memcmp. Generally larger improvement
for the smaller size ranges which ultimately are the most important to
opimize for.

Numbers for new implementations attached.

Tests where run on the following CPUs:

Tigerlake: https://ark.intel.com/content/www/us/en/ark/products/208921/intel-core-i7-1165g7-processor-12m-cache-up-to-4-70-ghz-with-ipu.html
Skylake: https://ark.intel.com/content/www/us/en/ark/products/149091/intel-core-i7-8565u-processor-8m-cache-up-to-4-60-ghz.html

Some notes on the numbers.

There are some regressions in the sse2 version. I didn't optimize
these versions beyond defining out obviously irrelivant code for
bcmp. My intuition is that the slowdowns are alignment related. As
well I tested on hardware for which sse2 is outdate, so I am not sure
if these issues would translate to architectures that would actually
use sse2.

The avx2 and evex versions are basically universal improvements for
evex and avx2.

[-- Attachment #2: bcmp-tgl.pdf --]
[-- Type: application/pdf, Size: 223172 bytes --]

[-- Attachment #3: bcmp-skl.pdf --]
[-- Type: application/pdf, Size: 195097 bytes --]

  reply	other threads:[~2021-10-27  2:45 UTC|newest]

Thread overview: 19+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2021-10-27  2:43 [PATCH v1 1/6] String: Add __memcmpeq as build target Noah Goldstein
2021-10-27  2:43 ` [PATCH v1 2/6] Benchtests: Add benchtests for __memcmpeq Noah Goldstein
2021-10-27 12:45   ` H.J. Lu
2021-10-27 16:08     ` Noah Goldstein
2021-10-27 16:07   ` [PATCH v2 " Noah Goldstein
2021-10-27 17:59     ` H.J. Lu
2021-10-27  2:43 ` [PATCH v1 3/6] x86_64: Add support for __memcmpeq using sse2, avx2, and evex Noah Goldstein
2021-10-27 12:47   ` H.J. Lu
2021-10-27  2:43 ` [PATCH v1 4/6] x86_64: Add sse2 optimized __memcmpeq in memcmp-sse2.S Noah Goldstein
2021-10-27 12:48   ` H.J. Lu
2021-10-27  2:43 ` [PATCH v1 5/6] x86_64: Add avx2 optimized __memcmpeq in memcmpeq-avx2.S Noah Goldstein
2021-10-27 12:48   ` H.J. Lu
2021-10-27  2:43 ` [PATCH v1 6/6] x86_64: Add evex optimized __memcmpeq in memcmpeq-evex.S Noah Goldstein
2021-10-27  2:44   ` Noah Goldstein [this message]
2021-10-27 12:49   ` H.J. Lu
2021-10-27 12:42 ` [PATCH v1 1/6] String: Add __memcmpeq as build target H.J. Lu
2021-10-27 18:46   ` Noah Goldstein
2021-10-28 17:57 ` Joseph Myers
2021-10-28 18:25   ` Noah Goldstein

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=CAFUsyfK0dk8im2gWRxaz6u1aWZiZ2z+b3x2J7y0-Q2HNKAKCKQ@mail.gmail.com \
    --to=goldstein.w.n@gmail.com \
    --cc=libc-alpha@sourceware.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).