On Tue, Oct 26, 2021 at 9:43 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> No bug. This commit adds new optimized __memcmpeq implementation for
> evex.
>
> The primary optimizations are:
>
> 1) skipping the logic to find the difference of the first mismatched
> byte.
>
> 2) not updating src/dst addresses as the non-equals logic does not
> need to be reused by different areas.
> ---
>  sysdeps/x86_64/multiarch/ifunc-impl-list.c |   1 -
>  sysdeps/x86_64/multiarch/ifunc-memcmpeq.h  |   1 -
>  sysdeps/x86_64/multiarch/memcmpeq-evex.S   | 308 ++++++++++++++++++++-
>  3 files changed, 304 insertions(+), 6 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index 535450f52c..ea8df9f9b9 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -52,7 +52,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>               IFUNC_IMPL_ADD (array, i, __memcmpeq,
>                               (CPU_FEATURE_USABLE (AVX512VL)
>                                && CPU_FEATURE_USABLE (AVX512BW)
> -                   && CPU_FEATURE_USABLE (MOVBE)
>                                && CPU_FEATURE_USABLE (BMI2)),
>                               __memcmpeq_evex)
>               IFUNC_IMPL_ADD (array, i, __memcmpeq, 1, __memcmpeq_sse2))
> diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmpeq.h b/sysdeps/x86_64/multiarch/ifunc-memcmpeq.h
> index e596c5048b..2ea38adf05 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-memcmpeq.h
> +++ b/sysdeps/x86_64/multiarch/ifunc-memcmpeq.h
> @@ -34,7 +34,6 @@ IFUNC_SELECTOR (void)
>        && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
>      {
>        if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
> -         && CPU_FEATURE_USABLE_P (cpu_features, MOVBE)
>           && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
>         return OPTIMIZE1 (evex);
>
> diff --git a/sysdeps/x86_64/multiarch/memcmpeq-evex.S b/sysdeps/x86_64/multiarch/memcmpeq-evex.S
> index 951e1e9560..f27e732036 100644
> --- a/sysdeps/x86_64/multiarch/memcmpeq-evex.S
> +++ b/sysdeps/x86_64/multiarch/memcmpeq-evex.S
> @@ -16,8 +16,308 @@
>     License along with the GNU C Library; if not, see
>     <https://www.gnu.org/licenses/>.  */
>
> -#ifndef MEMCMP
> -# define MEMCMP        __memcmpeq_evex
> -#endif
> +#if IS_IN (libc)
> +
> +/* __memcmpeq is implemented as:
> +   1. Use ymm vector compares when possible. The only case where
> +      vector compares is not possible for when size < VEC_SIZE
> +      and loading from either s1 or s2 would cause a page cross.
> +   2. Use xmm vector compare when size >= 8 bytes.
> +   3. Optimistically compare up to first 4 * VEC_SIZE one at a
> +      to check for early mismatches. Only do this if its guranteed the
> +      work is not wasted.
> +   4. If size is 8 * VEC_SIZE or less, unroll the loop.
> +   5. Compare 4 * VEC_SIZE at a time with the aligned first memory
> +      area.
> +   6. Use 2 vector compares when size is 2 * VEC_SIZE or less.
> +   7. Use 4 vector compares when size is 4 * VEC_SIZE or less.
> +   8. Use 8 vector compares when size is 8 * VEC_SIZE or less.  */
> +
> +# include <sysdep.h>
> +
> +# ifndef MEMCMPEQ
> +#  define MEMCMPEQ     __memcmpeq_evex
> +# endif
> +
> +# define VMOVU vmovdqu64
> +# define VPCMP vpcmpub
> +# define VPTEST        vptestmb
> +
> +# define VEC_SIZE      32
> +# define PAGE_SIZE     4096
> +
> +# define YMM0          ymm16
> +# define YMM1          ymm17
> +# define YMM2          ymm18
> +# define YMM3          ymm19
> +# define YMM4          ymm20
> +# define YMM5          ymm21
> +# define YMM6          ymm22
> +
> +
> +       .section .text.evex, "ax", @progbits
> +ENTRY_P2ALIGN (MEMCMPEQ, 6)
> +# ifdef __ILP32__
> +       /* Clear the upper 32 bits.  */
> +       movl    %edx, %edx
> +# endif
> +       cmp     $VEC_SIZE, %RDX_LP
> +       jb      L(less_vec)
> +
> +       /* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
> +       VMOVU   (%rsi), %YMM1
> +       /* Use compare not equals to directly check for mismatch.  */
> +       VPCMP   $4, (%rdi), %YMM1, %k1
> +       kmovd   %k1, %eax
> +       testl   %eax, %eax
> +       jnz     L(return_neq0)
> +
> +       cmpq    $(VEC_SIZE * 2), %rdx
> +       jbe     L(last_1x_vec)
> +
> +       /* Check second VEC no matter what.  */
> +       VMOVU   VEC_SIZE(%rsi), %YMM2
> +       VPCMP   $4, VEC_SIZE(%rdi), %YMM2, %k1
> +       kmovd   %k1, %eax
> +       testl   %eax, %eax
> +       jnz     L(return_neq0)
> +
> +       /* Less than 4 * VEC.  */
> +       cmpq    $(VEC_SIZE * 4), %rdx
> +       jbe     L(last_2x_vec)
> +
> +       /* Check third and fourth VEC no matter what.  */
> +       VMOVU   (VEC_SIZE * 2)(%rsi), %YMM3
> +       VPCMP   $4, (VEC_SIZE * 2)(%rdi), %YMM3, %k1
> +       kmovd   %k1, %eax
> +       testl   %eax, %eax
> +       jnz     L(return_neq0)
> +
> +       VMOVU   (VEC_SIZE * 3)(%rsi), %YMM4
> +       VPCMP   $4, (VEC_SIZE * 3)(%rdi), %YMM4, %k1
> +       kmovd   %k1, %eax
> +       testl   %eax, %eax
> +       jnz     L(return_neq0)
> +
> +       /* Go to 4x VEC loop.  */
> +       cmpq    $(VEC_SIZE * 8), %rdx
> +       ja      L(more_8x_vec)
> +
> +       /* Handle remainder of size = 4 * VEC + 1 to 8 * VEC without any
> +          branches.  */
> +
> +       VMOVU   -(VEC_SIZE * 4)(%rsi, %rdx), %YMM1
> +       VMOVU   -(VEC_SIZE * 3)(%rsi, %rdx), %YMM2
> +       addq    %rdx, %rdi
> +
> +       /* Wait to load from s1 until addressed adjust due to
> +          unlamination.  */
> +
> +       /* vpxor will be all 0s if s1 and s2 are equal. Otherwise it
> +          will have some 1s.  */
> +       vpxorq  -(VEC_SIZE * 4)(%rdi), %YMM1, %YMM1
> +       /* Ternary logic to xor -(VEC_SIZE * 3)(%rdi) with YMM2 while
> +          oring with YMM1. Result is stored in YMM1.  */
> +       vpternlogd $0xde, -(VEC_SIZE * 3)(%rdi), %YMM1, %YMM2
> +
> +       VMOVU   -(VEC_SIZE * 2)(%rsi, %rdx), %YMM3
> +       vpxorq  -(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
> +       /* Or together YMM1, YMM2, and YMM3 into YMM3.  */
> +       VMOVU   -(VEC_SIZE)(%rsi, %rdx), %YMM4
> +       vpxorq  -(VEC_SIZE)(%rdi), %YMM4, %YMM4
> +
> +       /* Or together YMM2, YMM3, and YMM4 into YMM4.  */
> +       vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
>
> -#include "memcmp-evex-movbe.S"
> +       /* Compare YMM4 with 0. If any 1s s1 and s2 don't match.  */
> +       VPTEST  %YMM4, %YMM4, %k1
> +       kmovd   %k1, %eax
> +L(return_neq0):
> +       ret
> +
> +       /* Fits in padding needed to .p2align 5 L(less_vec).  */
> +L(last_1x_vec):
> +       VMOVU   -(VEC_SIZE * 1)(%rsi, %rdx), %YMM1
> +       VPCMP   $4, -(VEC_SIZE * 1)(%rdi, %rdx), %YMM1, %k1
> +       kmovd   %k1, %eax
> +       ret
> +
> +       /* NB: p2align 5 here will ensure the L(loop_4x_vec) is also 32
> +          byte aligned.  */
> +       .p2align 5
> +L(less_vec):
> +       /* Check if one or less char. This is necessary for size = 0 but
> +          is also faster for size = 1.  */
> +       cmpl    $1, %edx
> +       jbe     L(one_or_less)
> +
> +       /* Check if loading one VEC from either s1 or s2 could cause a
> +          page cross. This can have false positives but is by far the
> +          fastest method.  */
> +       movl    %edi, %eax
> +       orl     %esi, %eax
> +       andl    $(PAGE_SIZE - 1), %eax
> +       cmpl    $(PAGE_SIZE - VEC_SIZE), %eax
> +       jg      L(page_cross_less_vec)
> +
> +       /* No page cross possible.  */
> +       VMOVU   (%rsi), %YMM2
> +       VPCMP   $4, (%rdi), %YMM2, %k1
> +       kmovd   %k1, %eax
> +       /* Result will be zero if s1 and s2 match. Otherwise first set
> +          bit will be first mismatch.  */
> +       bzhil   %edx, %eax, %eax
> +       ret
> +
> +       /* Relatively cold but placing close to L(less_vec) for 2 byte
> +          jump encoding.  */
> +       .p2align 4
> +L(one_or_less):
> +       jb      L(zero)
> +       movzbl  (%rsi), %ecx
> +       movzbl  (%rdi), %eax
> +       subl    %ecx, %eax
> +       /* No ymm register was touched.  */
> +       ret
> +       /* Within the same 16 byte block is L(one_or_less).  */
> +L(zero):
> +       xorl    %eax, %eax
> +       ret
> +
> +       .p2align 4
> +L(last_2x_vec):
> +       VMOVU   -(VEC_SIZE * 2)(%rsi, %rdx), %YMM1
> +       vpxorq  -(VEC_SIZE * 2)(%rdi, %rdx), %YMM1, %YMM1
> +       VMOVU   -(VEC_SIZE * 1)(%rsi, %rdx), %YMM2
> +       vpternlogd $0xde, -(VEC_SIZE * 1)(%rdi, %rdx), %YMM1, %YMM2
> +       VPTEST  %YMM2, %YMM2, %k1
> +       kmovd   %k1, %eax
> +       ret
> +
> +       .p2align 4
> +L(more_8x_vec):
> +       /* Set end of s1 in rdx.  */
> +       leaq    -(VEC_SIZE * 4)(%rdi, %rdx), %rdx
> +       /* rsi stores s2 - s1. This allows loop to only update one
> +          pointer.  */
> +       subq    %rdi, %rsi
> +       /* Align s1 pointer.  */
> +       andq    $-VEC_SIZE, %rdi
> +       /* Adjust because first 4x vec where check already.  */
> +       subq    $-(VEC_SIZE * 4), %rdi
> +       .p2align 4
> +L(loop_4x_vec):
> +       VMOVU   (%rsi, %rdi), %YMM1
> +       vpxorq  (%rdi), %YMM1, %YMM1
> +
> +       VMOVU   VEC_SIZE(%rsi, %rdi), %YMM2
> +       vpternlogd $0xde, (VEC_SIZE)(%rdi), %YMM1, %YMM2
> +
> +       VMOVU   (VEC_SIZE * 2)(%rsi, %rdi), %YMM3
> +       vpxorq  (VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
> +
> +       VMOVU   (VEC_SIZE * 3)(%rsi, %rdi), %YMM4
> +       vpxorq  (VEC_SIZE * 3)(%rdi), %YMM4, %YMM4
> +
> +       vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
> +       VPTEST  %YMM4, %YMM4, %k1
> +       kmovd   %k1, %eax
> +       testl   %eax, %eax
> +       jnz     L(return_neq2)
> +       subq    $-(VEC_SIZE * 4), %rdi
> +       cmpq    %rdx, %rdi
> +       jb      L(loop_4x_vec)
> +
> +       subq    %rdx, %rdi
> +       VMOVU   (VEC_SIZE * 3)(%rsi, %rdx), %YMM4
> +       vpxorq  (VEC_SIZE * 3)(%rdx), %YMM4, %YMM4
> +       /* rdi has 4 * VEC_SIZE - remaining length.  */
> +       cmpl    $(VEC_SIZE * 3), %edi
> +       jae     L(8x_last_1x_vec)
> +       /* Load regardless of branch.  */
> +       VMOVU   (VEC_SIZE * 2)(%rsi, %rdx), %YMM3
> +       /* Ternary logic to xor (VEC_SIZE * 2)(%rdx) with YMM3 while
> +          oring with YMM4. Result is stored in YMM4.  */
> +       vpternlogd $0xf6, (VEC_SIZE * 2)(%rdx), %YMM3, %YMM4
> +       cmpl    $(VEC_SIZE * 2), %edi
> +       jae     L(8x_last_2x_vec)
> +
> +       VMOVU   VEC_SIZE(%rsi, %rdx), %YMM2
> +       vpxorq  VEC_SIZE(%rdx), %YMM2, %YMM2
> +
> +       VMOVU   (%rsi, %rdx), %YMM1
> +       vpxorq  (%rdx), %YMM1, %YMM1
> +
> +       vpternlogd $0xfe, %YMM1, %YMM2, %YMM4
> +L(8x_last_1x_vec):
> +L(8x_last_2x_vec):
> +       VPTEST  %YMM4, %YMM4, %k1
> +       kmovd   %k1, %eax
> +L(return_neq2):
> +       ret
> +
> +       /* Relatively cold case as page cross are unexpected.  */
> +       .p2align 4
> +L(page_cross_less_vec):
> +       cmpl    $16, %edx
> +       jae     L(between_16_31)
> +       cmpl    $8, %edx
> +       ja      L(between_9_15)
> +       cmpl    $4, %edx
> +       jb      L(between_2_3)
> +       /* From 4 to 8 bytes.  No branch when size == 4.  */
> +       movl    (%rdi), %eax
> +       subl    (%rsi), %eax
> +       movl    -4(%rdi, %rdx), %ecx
> +       movl    -4(%rsi, %rdx), %edi
> +       subl    %edi, %ecx
> +       orl     %ecx, %eax
> +       ret
> +
> +       .p2align 4,, 8
> +L(between_16_31):
> +       /* From 16 to 31 bytes.  No branch when size == 16.  */
> +
> +       /* Safe to use xmm[0, 15] as no vzeroupper is needed so RTM safe.
> +        */
> +       vmovdqu (%rsi), %xmm1
> +       vpcmpeqb (%rdi), %xmm1, %xmm1
> +       vmovdqu -16(%rsi, %rdx), %xmm2
> +       vpcmpeqb -16(%rdi, %rdx), %xmm2, %xmm2
> +       vpand   %xmm1, %xmm2, %xmm2
> +       vpmovmskb %xmm2, %eax
> +       notw    %ax
> +       /* No ymm register was touched.  */
> +       ret
> +
> +       .p2align 4,, 8
> +L(between_9_15):
> +       /* From 9 to 15 bytes.  */
> +       movq    (%rdi), %rax
> +       subq    (%rsi), %rax
> +       movq    -8(%rdi, %rdx), %rcx
> +       movq    -8(%rsi, %rdx), %rdi
> +       subq    %rdi, %rcx
> +       orq     %rcx, %rax
> +       /* edx is guranteed to be a non-zero int.  */
> +       cmovnz  %edx, %eax
> +       ret
> +
> +       /* Don't align. This is cold and aligning here will cause code
> +          to spill into next cache line.  */
> +L(between_2_3):
> +       /* From 2 to 3 bytes.  No branch when size == 2.  */
> +       movzwl  (%rdi), %eax
> +       movzwl  (%rsi), %ecx
> +       subl    %ecx, %eax
> +       movzbl  -1(%rdi, %rdx), %ecx
> +       /* All machines that support evex will insert a "merging uop"
> +          avoiding any serious partial register stalls.  */
> +       subb    -1(%rsi, %rdx), %cl
> +       orl     %ecx, %eax
> +       /* No ymm register was touched.  */
> +       ret
> +
> +    /* 4 Bytes from next cache line. */
> +END (MEMCMPEQ)
> +#endif
> --
> 2.25.1
>


Roughly ~0-25% improvement over memcmp. Generally larger improvement
for the smaller size ranges which ultimately are the most important to
opimize for.

Numbers for new implementations attached.

Tests where run on the following CPUs:

Tigerlake: https://ark.intel.com/content/www/us/en/ark/products/208921/intel-core-i7-1165g7-processor-12m-cache-up-to-4-70-ghz-with-ipu.html
Skylake: https://ark.intel.com/content/www/us/en/ark/products/149091/intel-core-i7-8565u-processor-8m-cache-up-to-4-60-ghz.html

Some notes on the numbers.

There are some regressions in the sse2 version. I didn't optimize
these versions beyond defining out obviously irrelivant code for
bcmp. My intuition is that the slowdowns are alignment related. As
well I tested on hardware for which sse2 is outdate, so I am not sure
if these issues would translate to architectures that would actually
use sse2.

The avx2 and evex versions are basically universal improvements for
evex and avx2.