On Fri, Apr 15, 2022 at 10:33 AM H.J. Lu via Libc-alpha wrote:
>
> On Fri, Apr 15, 2022 at 10:28 AM Noah Goldstein wrote:
> >
> > Code didn't actually use any sse4 instructions since `ptest` was
> > removed in:
> >
> > commit 2f9062d7171850451e6044ef78d91ff8c017b9c0
> > Author: Noah Goldstein
> > Date:   Wed Nov 10 16:18:56 2021 -0600
> >
> >     x86: Shrink memcmp-sse4.S code size
> >
> > The new memcmp-sse2 implementation is also faster.
> >
> > geometric_mean(N=20) of page cross cases SSE2 / SSE4: 0.905
> >
> > Note there are two regressions prefering SSE2 for Size = 1 and Size =
>
> preferring
>
> LGTM with the commit log typo fix.
>
> Reviewed-by: H.J. Lu
>
> Thanks.
>
> > 65.
> >
> > Size = 1:
> > size, align0, align1, ret, New Time/Old Time
> >    1,      1,      1,   0,   1.2
> >    1,      1,      1,   1,   1.197
> >    1,      1,      1,  -1,   1.2
> >
> > This is intentional. Size == 1 is significantly less hot based on
> > profiles of GCC11 and Python3 than sizes [4, 8] (which are made
> > hotter).
> >
> > Python3 Size = 1      -> 13.64%
> > Python3 Size = [4, 8] -> 60.92%
> >
> > GCC11   Size = 1      ->  1.29%
> > GCC11   Size = [4, 8] -> 33.86%
> >
> > size, align0, align1, ret, New Time/Old Time
> >    4,      4,      4,   0,   0.622
> >    4,      4,      4,   1,   0.797
> >    4,      4,      4,  -1,   0.805
> >    5,      5,      5,   0,   0.623
> >    5,      5,      5,   1,   0.777
> >    5,      5,      5,  -1,   0.802
> >    6,      6,      6,   0,   0.625
> >    6,      6,      6,   1,   0.813
> >    6,      6,      6,  -1,   0.788
> >    7,      7,      7,   0,   0.625
> >    7,      7,      7,   1,   0.799
> >    7,      7,      7,  -1,   0.795
> >    8,      8,      8,   0,   0.625
> >    8,      8,      8,   1,   0.848
> >    8,      8,      8,  -1,   0.914
> >    9,      9,      9,   0,   0.625
> >
> > Size = 65:
> > size, align0, align1, ret, New Time/Old Time
> >   65,      0,      0,   0,   1.103
> >   65,      0,      0,   1,   1.216
> >   65,      0,      0,  -1,   1.227
> >   65,     65,      0,   0,   1.091
> >   65,      0,     65,   1,   1.19
> >   65,     65,     65,  -1,   1.215
> >
> > This is because A) the checks in range [65, 96] are now unrolled 2x
> > and B) smaller values <= 16 are now given a hotter path. By contrast
> > the SSE4 version has a branch for Size = 80. The unrolled version
> > gets better performance for returns which need both comparisons.
> >
> > size, align0, align1, ret, New Time/Old Time
> >  128,      4,      8,   0,   0.858
> >  128,      4,      8,   1,   0.879
> >  128,      4,      8,  -1,   0.888
> >
> > As well, outside of microbenchmark environments, where branches are
> > not fully predictable, the branch will have a real cost.
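(Aside, not part of the quoted patch: a minimal, hypothetical C sketch of how the profile weighting above could be turned into a microbenchmark. Only the Python3/GCC11 percentages come from the commit message; the buffer contents, iteration count, and the even split of the [4, 8] bucket across sizes are assumptions for illustration.)

/* Illustrative only -- not part of the patch.  Weights per-size memcmp
   timings by the call-count shares quoted in the commit message.  */
#include <stddef.h>
#include <stdio.h>
#include <string.h>
#include <time.h>

static double
bench_one (size_t size, size_t iters)
{
  static char a[64], b[64];
  memset (a, 0x5a, sizeof a);
  memset (b, 0x5a, sizeof b);

  const char *pa = a, *pb = b;
  volatile int sink = 0;

  struct timespec t0, t1;
  clock_gettime (CLOCK_MONOTONIC, &t0);
  for (size_t i = 0; i < iters; i++)
    {
      /* Hide the pointers from the optimizer so memcmp is not folded away.  */
      __asm__ volatile ("" : "+r" (pa), "+r" (pb));
      sink ^= memcmp (pa, pb, size);
    }
  clock_gettime (CLOCK_MONOTONIC, &t1);
  (void) sink;

  double ns = (t1.tv_sec - t0.tv_sec) * 1e9 + (t1.tv_nsec - t0.tv_nsec);
  return ns / (double) iters;
}

int
main (void)
{
  /* Call-count share per size bucket, from the profiles quoted above.
     The [4, 8] share is split evenly across its five sizes (assumption).  */
  static const struct { size_t size; double py3; double gcc11; } buckets[] = {
    { 1, 0.1364,       0.0129 },
    { 4, 0.6092 / 5.0, 0.3386 / 5.0 },
    { 5, 0.6092 / 5.0, 0.3386 / 5.0 },
    { 6, 0.6092 / 5.0, 0.3386 / 5.0 },
    { 7, 0.6092 / 5.0, 0.3386 / 5.0 },
    { 8, 0.6092 / 5.0, 0.3386 / 5.0 },
  };

  double py3 = 0.0, gcc11 = 0.0;
  for (size_t i = 0; i < sizeof buckets / sizeof buckets[0]; i++)
    {
      double ns = bench_one (buckets[i].size, 1u << 24);
      py3 += ns * buckets[i].py3;
      gcc11 += ns * buckets[i].gcc11;
      printf ("size=%zu: %.3f ns/call\n", buckets[i].size, ns);
    }
  printf ("profile-weighted ns/call: Python3=%.3f  GCC11=%.3f\n", py3, gcc11);
  return 0;
}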
> > --- > > sysdeps/x86_64/multiarch/Makefile | 2 - > > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 4 - > > sysdeps/x86_64/multiarch/ifunc-memcmp.h | 4 - > > sysdeps/x86_64/multiarch/memcmp-sse4.S | 803 --------------------- > > 4 files changed, 813 deletions(-) > > delete mode 100644 sysdeps/x86_64/multiarch/memcmp-sse4.S > > > > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile > > index b573966966..0400ea332b 100644 > > --- a/sysdeps/x86_64/multiarch/Makefile > > +++ b/sysdeps/x86_64/multiarch/Makefile > > @@ -11,7 +11,6 @@ sysdep_routines += \ > > memcmp-avx2-movbe-rtm \ > > memcmp-evex-movbe \ > > memcmp-sse2 \ > > - memcmp-sse4 \ > > memcmpeq-avx2 \ > > memcmpeq-avx2-rtm \ > > memcmpeq-evex \ > > @@ -164,7 +163,6 @@ sysdep_routines += \ > > wmemcmp-avx2-movbe-rtm \ > > wmemcmp-evex-movbe \ > > wmemcmp-sse2 \ > > - wmemcmp-sse4 \ > > # sysdep_routines > > endif > > > > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > index c6008a73ed..a8afcf81bb 100644 > > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > > @@ -96,8 +96,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > && CPU_FEATURE_USABLE (BMI2) > > && CPU_FEATURE_USABLE (MOVBE)), > > __memcmp_evex_movbe) > > - IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSE4_1), > > - __memcmp_sse4_1) > > IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_sse2)) > > > > #ifdef SHARED > > @@ -809,8 +807,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > && CPU_FEATURE_USABLE (BMI2) > > && CPU_FEATURE_USABLE (MOVBE)), > > __wmemcmp_evex_movbe) > > - IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSE4_1), > > - __wmemcmp_sse4_1) > > IFUNC_IMPL_ADD (array, i, wmemcmp, 1, __wmemcmp_sse2)) > > > > /* Support sysdeps/x86_64/multiarch/wmemset.c. */ > > diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h b/sysdeps/x86_64/multiarch/ifunc-memcmp.h > > index 44759a3ad5..c743970fe3 100644 > > --- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h > > +++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h > > @@ -20,7 +20,6 @@ > > # include > > > > extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; > > -extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden; > > extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe) attribute_hidden; > > extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe_rtm) attribute_hidden; > > extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_movbe) attribute_hidden; > > @@ -46,8 +45,5 @@ IFUNC_SELECTOR (void) > > return OPTIMIZE (avx2_movbe); > > } > > > > - if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1)) > > - return OPTIMIZE (sse4_1); > > - > > return OPTIMIZE (sse2); > > } > > diff --git a/sysdeps/x86_64/multiarch/memcmp-sse4.S b/sysdeps/x86_64/multiarch/memcmp-sse4.S > > deleted file mode 100644 > > index cd57c1e2c7..0000000000 > > --- a/sysdeps/x86_64/multiarch/memcmp-sse4.S > > +++ /dev/null > > @@ -1,803 +0,0 @@ > > -/* memcmp with SSE4.1, wmemcmp with SSE4.1 > > - Copyright (C) 2010-2022 Free Software Foundation, Inc. > > - This file is part of the GNU C Library. > > - > > - The GNU C Library is free software; you can redistribute it and/or > > - modify it under the terms of the GNU Lesser General Public > > - License as published by the Free Software Foundation; either > > - version 2.1 of the License, or (at your option) any later version. 
> > - > > - The GNU C Library is distributed in the hope that it will be useful, > > - but WITHOUT ANY WARRANTY; without even the implied warranty of > > - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > > - Lesser General Public License for more details. > > - > > - You should have received a copy of the GNU Lesser General Public > > - License along with the GNU C Library; if not, see > > - . */ > > - > > -#if IS_IN (libc) > > - > > -# include > > - > > -# ifndef MEMCMP > > -# define MEMCMP __memcmp_sse4_1 > > -# endif > > - > > -#ifdef USE_AS_WMEMCMP > > -# define CMPEQ pcmpeqd > > -# define CHAR_SIZE 4 > > -#else > > -# define CMPEQ pcmpeqb > > -# define CHAR_SIZE 1 > > -#endif > > - > > - > > -/* Warning! > > - wmemcmp has to use SIGNED comparison for elements. > > - memcmp has to use UNSIGNED comparison for elemnts. > > -*/ > > - > > - .section .text.sse4.1,"ax",@progbits > > -ENTRY (MEMCMP) > > -# ifdef USE_AS_WMEMCMP > > - shl $2, %RDX_LP > > -# elif defined __ILP32__ > > - /* Clear the upper 32 bits. */ > > - mov %edx, %edx > > -# endif > > - cmp $79, %RDX_LP > > - ja L(79bytesormore) > > - > > - cmp $CHAR_SIZE, %RDX_LP > > - jbe L(firstbyte) > > - > > - /* N in (CHAR_SIZE, 79) bytes. */ > > - cmpl $32, %edx > > - ja L(more_32_bytes) > > - > > - cmpl $16, %edx > > - jae L(16_to_32_bytes) > > - > > -# ifndef USE_AS_WMEMCMP > > - cmpl $8, %edx > > - jae L(8_to_16_bytes) > > - > > - cmpl $4, %edx > > - jb L(2_to_3_bytes) > > - > > - movl (%rdi), %eax > > - movl (%rsi), %ecx > > - > > - bswap %eax > > - bswap %ecx > > - > > - shlq $32, %rax > > - shlq $32, %rcx > > - > > - movl -4(%rdi, %rdx), %edi > > - movl -4(%rsi, %rdx), %esi > > - > > - bswap %edi > > - bswap %esi > > - > > - orq %rdi, %rax > > - orq %rsi, %rcx > > - subq %rcx, %rax > > - cmovne %edx, %eax > > - sbbl %ecx, %ecx > > - orl %ecx, %eax > > - ret > > - > > - .p2align 4,, 8 > > -L(2_to_3_bytes): > > - movzwl (%rdi), %eax > > - movzwl (%rsi), %ecx > > - shll $8, %eax > > - shll $8, %ecx > > - bswap %eax > > - bswap %ecx > > - movzbl -1(%rdi, %rdx), %edi > > - movzbl -1(%rsi, %rdx), %esi > > - orl %edi, %eax > > - orl %esi, %ecx > > - subl %ecx, %eax > > - ret > > - > > - .p2align 4,, 8 > > -L(8_to_16_bytes): > > - movq (%rdi), %rax > > - movq (%rsi), %rcx > > - > > - bswap %rax > > - bswap %rcx > > - > > - subq %rcx, %rax > > - jne L(8_to_16_bytes_done) > > - > > - movq -8(%rdi, %rdx), %rax > > - movq -8(%rsi, %rdx), %rcx > > - > > - bswap %rax > > - bswap %rcx > > - > > - subq %rcx, %rax > > - > > -L(8_to_16_bytes_done): > > - cmovne %edx, %eax > > - sbbl %ecx, %ecx > > - orl %ecx, %eax > > - ret > > -# else > > - xorl %eax, %eax > > - movl (%rdi), %ecx > > - cmpl (%rsi), %ecx > > - jne L(8_to_16_bytes_done) > > - movl 4(%rdi), %ecx > > - cmpl 4(%rsi), %ecx > > - jne L(8_to_16_bytes_done) > > - movl -4(%rdi, %rdx), %ecx > > - cmpl -4(%rsi, %rdx), %ecx > > - jne L(8_to_16_bytes_done) > > - ret > > -# endif > > - > > - .p2align 4,, 3 > > -L(ret_zero): > > - xorl %eax, %eax > > -L(zero): > > - ret > > - > > - .p2align 4,, 8 > > -L(firstbyte): > > - jb L(ret_zero) > > -# ifdef USE_AS_WMEMCMP > > - xorl %eax, %eax > > - movl (%rdi), %ecx > > - cmpl (%rsi), %ecx > > - je L(zero) > > -L(8_to_16_bytes_done): > > - setg %al > > - leal -1(%rax, %rax), %eax > > -# else > > - movzbl (%rdi), %eax > > - movzbl (%rsi), %ecx > > - sub %ecx, %eax > > -# endif > > - ret > > - > > - .p2align 4 > > -L(vec_return_begin_48): > > - addq $16, %rdi > > - addq $16, %rsi > > -L(vec_return_begin_32): > > - bsfl %eax, %eax > > -# 
ifdef USE_AS_WMEMCMP > > - movl 32(%rdi, %rax), %ecx > > - xorl %edx, %edx > > - cmpl 32(%rsi, %rax), %ecx > > - setg %dl > > - leal -1(%rdx, %rdx), %eax > > -# else > > - movzbl 32(%rsi, %rax), %ecx > > - movzbl 32(%rdi, %rax), %eax > > - subl %ecx, %eax > > -# endif > > - ret > > - > > - .p2align 4 > > -L(vec_return_begin_16): > > - addq $16, %rdi > > - addq $16, %rsi > > -L(vec_return_begin): > > - bsfl %eax, %eax > > -# ifdef USE_AS_WMEMCMP > > - movl (%rdi, %rax), %ecx > > - xorl %edx, %edx > > - cmpl (%rsi, %rax), %ecx > > - setg %dl > > - leal -1(%rdx, %rdx), %eax > > -# else > > - movzbl (%rsi, %rax), %ecx > > - movzbl (%rdi, %rax), %eax > > - subl %ecx, %eax > > -# endif > > - ret > > - > > - .p2align 4 > > -L(vec_return_end_16): > > - subl $16, %edx > > -L(vec_return_end): > > - bsfl %eax, %eax > > - addl %edx, %eax > > -# ifdef USE_AS_WMEMCMP > > - movl -16(%rdi, %rax), %ecx > > - xorl %edx, %edx > > - cmpl -16(%rsi, %rax), %ecx > > - setg %dl > > - leal -1(%rdx, %rdx), %eax > > -# else > > - movzbl -16(%rsi, %rax), %ecx > > - movzbl -16(%rdi, %rax), %eax > > - subl %ecx, %eax > > -# endif > > - ret > > - > > - .p2align 4,, 8 > > -L(more_32_bytes): > > - movdqu (%rdi), %xmm0 > > - movdqu (%rsi), %xmm1 > > - CMPEQ %xmm0, %xmm1 > > - pmovmskb %xmm1, %eax > > - incw %ax > > - jnz L(vec_return_begin) > > - > > - movdqu 16(%rdi), %xmm0 > > - movdqu 16(%rsi), %xmm1 > > - CMPEQ %xmm0, %xmm1 > > - pmovmskb %xmm1, %eax > > - incw %ax > > - jnz L(vec_return_begin_16) > > - > > - cmpl $64, %edx > > - jbe L(32_to_64_bytes) > > - movdqu 32(%rdi), %xmm0 > > - movdqu 32(%rsi), %xmm1 > > - CMPEQ %xmm0, %xmm1 > > - pmovmskb %xmm1, %eax > > - incw %ax > > - jnz L(vec_return_begin_32) > > - > > - .p2align 4,, 6 > > -L(32_to_64_bytes): > > - movdqu -32(%rdi, %rdx), %xmm0 > > - movdqu -32(%rsi, %rdx), %xmm1 > > - CMPEQ %xmm0, %xmm1 > > - pmovmskb %xmm1, %eax > > - incw %ax > > - jnz L(vec_return_end_16) > > - > > - movdqu -16(%rdi, %rdx), %xmm0 > > - movdqu -16(%rsi, %rdx), %xmm1 > > - CMPEQ %xmm0, %xmm1 > > - pmovmskb %xmm1, %eax > > - incw %ax > > - jnz L(vec_return_end) > > - ret > > - > > - .p2align 4 > > -L(16_to_32_bytes): > > - movdqu (%rdi), %xmm0 > > - movdqu (%rsi), %xmm1 > > - CMPEQ %xmm0, %xmm1 > > - pmovmskb %xmm1, %eax > > - incw %ax > > - jnz L(vec_return_begin) > > - > > - movdqu -16(%rdi, %rdx), %xmm0 > > - movdqu -16(%rsi, %rdx), %xmm1 > > - CMPEQ %xmm0, %xmm1 > > - pmovmskb %xmm1, %eax > > - incw %ax > > - jnz L(vec_return_end) > > - ret > > - > > - > > - .p2align 4 > > -L(79bytesormore): > > - movdqu (%rdi), %xmm0 > > - movdqu (%rsi), %xmm1 > > - CMPEQ %xmm0, %xmm1 > > - pmovmskb %xmm1, %eax > > - incw %ax > > - jnz L(vec_return_begin) > > - > > - > > - mov %rsi, %rcx > > - and $-16, %rsi > > - add $16, %rsi > > - sub %rsi, %rcx > > - > > - sub %rcx, %rdi > > - add %rcx, %rdx > > - test $0xf, %rdi > > - jz L(2aligned) > > - > > - cmp $128, %rdx > > - ja L(128bytesormore) > > - > > - .p2align 4,, 6 > > -L(less128bytes): > > - movdqu (%rdi), %xmm1 > > - CMPEQ (%rsi), %xmm1 > > - pmovmskb %xmm1, %eax > > - incw %ax > > - jnz L(vec_return_begin) > > - > > - movdqu 16(%rdi), %xmm1 > > - CMPEQ 16(%rsi), %xmm1 > > - pmovmskb %xmm1, %eax > > - incw %ax > > - jnz L(vec_return_begin_16) > > - > > - movdqu 32(%rdi), %xmm1 > > - CMPEQ 32(%rsi), %xmm1 > > - pmovmskb %xmm1, %eax > > - incw %ax > > - jnz L(vec_return_begin_32) > > - > > - movdqu 48(%rdi), %xmm1 > > - CMPEQ 48(%rsi), %xmm1 > > - pmovmskb %xmm1, %eax > > - incw %ax > > - jnz L(vec_return_begin_48) > > - > > - cmp $96, %rdx > > - 
jb L(32_to_64_bytes) > > - > > - addq $64, %rdi > > - addq $64, %rsi > > - subq $64, %rdx > > - > > - .p2align 4,, 6 > > -L(last_64_bytes): > > - movdqu (%rdi), %xmm1 > > - CMPEQ (%rsi), %xmm1 > > - pmovmskb %xmm1, %eax > > - incw %ax > > - jnz L(vec_return_begin) > > - > > - movdqu 16(%rdi), %xmm1 > > - CMPEQ 16(%rsi), %xmm1 > > - pmovmskb %xmm1, %eax > > - incw %ax > > - jnz L(vec_return_begin_16) > > - > > - movdqu -32(%rdi, %rdx), %xmm0 > > - movdqu -32(%rsi, %rdx), %xmm1 > > - CMPEQ %xmm0, %xmm1 > > - pmovmskb %xmm1, %eax > > - incw %ax > > - jnz L(vec_return_end_16) > > - > > - movdqu -16(%rdi, %rdx), %xmm0 > > - movdqu -16(%rsi, %rdx), %xmm1 > > - CMPEQ %xmm0, %xmm1 > > - pmovmskb %xmm1, %eax > > - incw %ax > > - jnz L(vec_return_end) > > - ret > > - > > - .p2align 4 > > -L(128bytesormore): > > - cmp $256, %rdx > > - ja L(unaligned_loop) > > -L(less256bytes): > > - movdqu (%rdi), %xmm1 > > - CMPEQ (%rsi), %xmm1 > > - pmovmskb %xmm1, %eax > > - incw %ax > > - jnz L(vec_return_begin) > > - > > - movdqu 16(%rdi), %xmm1 > > - CMPEQ 16(%rsi), %xmm1 > > - pmovmskb %xmm1, %eax > > - incw %ax > > - jnz L(vec_return_begin_16) > > - > > - movdqu 32(%rdi), %xmm1 > > - CMPEQ 32(%rsi), %xmm1 > > - pmovmskb %xmm1, %eax > > - incw %ax > > - jnz L(vec_return_begin_32) > > - > > - movdqu 48(%rdi), %xmm1 > > - CMPEQ 48(%rsi), %xmm1 > > - pmovmskb %xmm1, %eax > > - incw %ax > > - jnz L(vec_return_begin_48) > > - > > - addq $64, %rdi > > - addq $64, %rsi > > - > > - movdqu (%rdi), %xmm1 > > - CMPEQ (%rsi), %xmm1 > > - pmovmskb %xmm1, %eax > > - incw %ax > > - jnz L(vec_return_begin) > > - > > - movdqu 16(%rdi), %xmm1 > > - CMPEQ 16(%rsi), %xmm1 > > - pmovmskb %xmm1, %eax > > - incw %ax > > - jnz L(vec_return_begin_16) > > - > > - movdqu 32(%rdi), %xmm1 > > - CMPEQ 32(%rsi), %xmm1 > > - pmovmskb %xmm1, %eax > > - incw %ax > > - jnz L(vec_return_begin_32) > > - > > - movdqu 48(%rdi), %xmm1 > > - CMPEQ 48(%rsi), %xmm1 > > - pmovmskb %xmm1, %eax > > - incw %ax > > - jnz L(vec_return_begin_48) > > - > > - addq $-128, %rdx > > - subq $-64, %rsi > > - subq $-64, %rdi > > - > > - cmp $64, %rdx > > - ja L(less128bytes) > > - > > - cmp $32, %rdx > > - ja L(last_64_bytes) > > - > > - movdqu -32(%rdi, %rdx), %xmm0 > > - movdqu -32(%rsi, %rdx), %xmm1 > > - CMPEQ %xmm0, %xmm1 > > - pmovmskb %xmm1, %eax > > - incw %ax > > - jnz L(vec_return_end_16) > > - > > - movdqu -16(%rdi, %rdx), %xmm0 > > - movdqu -16(%rsi, %rdx), %xmm1 > > - CMPEQ %xmm0, %xmm1 > > - pmovmskb %xmm1, %eax > > - incw %ax > > - jnz L(vec_return_end) > > - ret > > - > > - .p2align 4 > > -L(unaligned_loop): > > -# ifdef DATA_CACHE_SIZE_HALF > > - mov $DATA_CACHE_SIZE_HALF, %R8_LP > > -# else > > - mov __x86_data_cache_size_half(%rip), %R8_LP > > -# endif > > - movq %r8, %r9 > > - addq %r8, %r8 > > - addq %r9, %r8 > > - cmpq %r8, %rdx > > - ja L(L2_L3_cache_unaligned) > > - sub $64, %rdx > > - .p2align 4 > > -L(64bytesormore_loop): > > - movdqu (%rdi), %xmm0 > > - movdqu 16(%rdi), %xmm1 > > - movdqu 32(%rdi), %xmm2 > > - movdqu 48(%rdi), %xmm3 > > - > > - CMPEQ (%rsi), %xmm0 > > - CMPEQ 16(%rsi), %xmm1 > > - CMPEQ 32(%rsi), %xmm2 > > - CMPEQ 48(%rsi), %xmm3 > > - > > - pand %xmm0, %xmm1 > > - pand %xmm2, %xmm3 > > - pand %xmm1, %xmm3 > > - > > - pmovmskb %xmm3, %eax > > - incw %ax > > - jnz L(64bytesormore_loop_end) > > - > > - add $64, %rsi > > - add $64, %rdi > > - sub $64, %rdx > > - ja L(64bytesormore_loop) > > - > > - .p2align 4,, 6 > > -L(loop_tail): > > - addq %rdx, %rdi > > - movdqu (%rdi), %xmm0 > > - movdqu 16(%rdi), %xmm1 > > - movdqu 
32(%rdi), %xmm2 > > - movdqu 48(%rdi), %xmm3 > > - > > - addq %rdx, %rsi > > - movdqu (%rsi), %xmm4 > > - movdqu 16(%rsi), %xmm5 > > - movdqu 32(%rsi), %xmm6 > > - movdqu 48(%rsi), %xmm7 > > - > > - CMPEQ %xmm4, %xmm0 > > - CMPEQ %xmm5, %xmm1 > > - CMPEQ %xmm6, %xmm2 > > - CMPEQ %xmm7, %xmm3 > > - > > - pand %xmm0, %xmm1 > > - pand %xmm2, %xmm3 > > - pand %xmm1, %xmm3 > > - > > - pmovmskb %xmm3, %eax > > - incw %ax > > - jnz L(64bytesormore_loop_end) > > - ret > > - > > -L(L2_L3_cache_unaligned): > > - subq $64, %rdx > > - .p2align 4 > > -L(L2_L3_unaligned_128bytes_loop): > > - prefetchnta 0x1c0(%rdi) > > - prefetchnta 0x1c0(%rsi) > > - > > - movdqu (%rdi), %xmm0 > > - movdqu 16(%rdi), %xmm1 > > - movdqu 32(%rdi), %xmm2 > > - movdqu 48(%rdi), %xmm3 > > - > > - CMPEQ (%rsi), %xmm0 > > - CMPEQ 16(%rsi), %xmm1 > > - CMPEQ 32(%rsi), %xmm2 > > - CMPEQ 48(%rsi), %xmm3 > > - > > - pand %xmm0, %xmm1 > > - pand %xmm2, %xmm3 > > - pand %xmm1, %xmm3 > > - > > - pmovmskb %xmm3, %eax > > - incw %ax > > - jnz L(64bytesormore_loop_end) > > - > > - add $64, %rsi > > - add $64, %rdi > > - sub $64, %rdx > > - ja L(L2_L3_unaligned_128bytes_loop) > > - jmp L(loop_tail) > > - > > - > > - /* This case is for machines which are sensitive for unaligned > > - * instructions. */ > > - .p2align 4 > > -L(2aligned): > > - cmp $128, %rdx > > - ja L(128bytesormorein2aligned) > > -L(less128bytesin2aligned): > > - movdqa (%rdi), %xmm1 > > - CMPEQ (%rsi), %xmm1 > > - pmovmskb %xmm1, %eax > > - incw %ax > > - jnz L(vec_return_begin) > > - > > - movdqa 16(%rdi), %xmm1 > > - CMPEQ 16(%rsi), %xmm1 > > - pmovmskb %xmm1, %eax > > - incw %ax > > - jnz L(vec_return_begin_16) > > - > > - movdqa 32(%rdi), %xmm1 > > - CMPEQ 32(%rsi), %xmm1 > > - pmovmskb %xmm1, %eax > > - incw %ax > > - jnz L(vec_return_begin_32) > > - > > - movdqa 48(%rdi), %xmm1 > > - CMPEQ 48(%rsi), %xmm1 > > - pmovmskb %xmm1, %eax > > - incw %ax > > - jnz L(vec_return_begin_48) > > - > > - cmp $96, %rdx > > - jb L(32_to_64_bytes) > > - > > - addq $64, %rdi > > - addq $64, %rsi > > - subq $64, %rdx > > - > > - .p2align 4,, 6 > > -L(aligned_last_64_bytes): > > - movdqa (%rdi), %xmm1 > > - CMPEQ (%rsi), %xmm1 > > - pmovmskb %xmm1, %eax > > - incw %ax > > - jnz L(vec_return_begin) > > - > > - movdqa 16(%rdi), %xmm1 > > - CMPEQ 16(%rsi), %xmm1 > > - pmovmskb %xmm1, %eax > > - incw %ax > > - jnz L(vec_return_begin_16) > > - > > - movdqu -32(%rdi, %rdx), %xmm0 > > - movdqu -32(%rsi, %rdx), %xmm1 > > - CMPEQ %xmm0, %xmm1 > > - pmovmskb %xmm1, %eax > > - incw %ax > > - jnz L(vec_return_end_16) > > - > > - movdqu -16(%rdi, %rdx), %xmm0 > > - movdqu -16(%rsi, %rdx), %xmm1 > > - CMPEQ %xmm0, %xmm1 > > - pmovmskb %xmm1, %eax > > - incw %ax > > - jnz L(vec_return_end) > > - ret > > - > > - .p2align 4 > > -L(128bytesormorein2aligned): > > - cmp $256, %rdx > > - ja L(aligned_loop) > > -L(less256bytesin2alinged): > > - movdqa (%rdi), %xmm1 > > - CMPEQ (%rsi), %xmm1 > > - pmovmskb %xmm1, %eax > > - incw %ax > > - jnz L(vec_return_begin) > > - > > - movdqa 16(%rdi), %xmm1 > > - CMPEQ 16(%rsi), %xmm1 > > - pmovmskb %xmm1, %eax > > - incw %ax > > - jnz L(vec_return_begin_16) > > - > > - movdqa 32(%rdi), %xmm1 > > - CMPEQ 32(%rsi), %xmm1 > > - pmovmskb %xmm1, %eax > > - incw %ax > > - jnz L(vec_return_begin_32) > > - > > - movdqa 48(%rdi), %xmm1 > > - CMPEQ 48(%rsi), %xmm1 > > - pmovmskb %xmm1, %eax > > - incw %ax > > - jnz L(vec_return_begin_48) > > - > > - addq $64, %rdi > > - addq $64, %rsi > > - > > - movdqa (%rdi), %xmm1 > > - CMPEQ (%rsi), %xmm1 > > - pmovmskb %xmm1, %eax > > - 
incw %ax > > - jnz L(vec_return_begin) > > - > > - movdqa 16(%rdi), %xmm1 > > - CMPEQ 16(%rsi), %xmm1 > > - pmovmskb %xmm1, %eax > > - incw %ax > > - jnz L(vec_return_begin_16) > > - > > - movdqa 32(%rdi), %xmm1 > > - CMPEQ 32(%rsi), %xmm1 > > - pmovmskb %xmm1, %eax > > - incw %ax > > - jnz L(vec_return_begin_32) > > - > > - movdqa 48(%rdi), %xmm1 > > - CMPEQ 48(%rsi), %xmm1 > > - pmovmskb %xmm1, %eax > > - incw %ax > > - jnz L(vec_return_begin_48) > > - > > - addq $-128, %rdx > > - subq $-64, %rsi > > - subq $-64, %rdi > > - > > - cmp $64, %rdx > > - ja L(less128bytesin2aligned) > > - > > - cmp $32, %rdx > > - ja L(aligned_last_64_bytes) > > - > > - movdqu -32(%rdi, %rdx), %xmm0 > > - movdqu -32(%rsi, %rdx), %xmm1 > > - CMPEQ %xmm0, %xmm1 > > - pmovmskb %xmm1, %eax > > - incw %ax > > - jnz L(vec_return_end_16) > > - > > - movdqu -16(%rdi, %rdx), %xmm0 > > - movdqu -16(%rsi, %rdx), %xmm1 > > - CMPEQ %xmm0, %xmm1 > > - pmovmskb %xmm1, %eax > > - incw %ax > > - jnz L(vec_return_end) > > - ret > > - > > - .p2align 4 > > -L(aligned_loop): > > -# ifdef DATA_CACHE_SIZE_HALF > > - mov $DATA_CACHE_SIZE_HALF, %R8_LP > > -# else > > - mov __x86_data_cache_size_half(%rip), %R8_LP > > -# endif > > - movq %r8, %r9 > > - addq %r8, %r8 > > - addq %r9, %r8 > > - cmpq %r8, %rdx > > - ja L(L2_L3_cache_aligned) > > - > > - sub $64, %rdx > > - .p2align 4 > > -L(64bytesormore_loopin2aligned): > > - movdqa (%rdi), %xmm0 > > - movdqa 16(%rdi), %xmm1 > > - movdqa 32(%rdi), %xmm2 > > - movdqa 48(%rdi), %xmm3 > > - > > - CMPEQ (%rsi), %xmm0 > > - CMPEQ 16(%rsi), %xmm1 > > - CMPEQ 32(%rsi), %xmm2 > > - CMPEQ 48(%rsi), %xmm3 > > - > > - pand %xmm0, %xmm1 > > - pand %xmm2, %xmm3 > > - pand %xmm1, %xmm3 > > - > > - pmovmskb %xmm3, %eax > > - incw %ax > > - jnz L(64bytesormore_loop_end) > > - add $64, %rsi > > - add $64, %rdi > > - sub $64, %rdx > > - ja L(64bytesormore_loopin2aligned) > > - jmp L(loop_tail) > > - > > -L(L2_L3_cache_aligned): > > - subq $64, %rdx > > - .p2align 4 > > -L(L2_L3_aligned_128bytes_loop): > > - prefetchnta 0x1c0(%rdi) > > - prefetchnta 0x1c0(%rsi) > > - movdqa (%rdi), %xmm0 > > - movdqa 16(%rdi), %xmm1 > > - movdqa 32(%rdi), %xmm2 > > - movdqa 48(%rdi), %xmm3 > > - > > - CMPEQ (%rsi), %xmm0 > > - CMPEQ 16(%rsi), %xmm1 > > - CMPEQ 32(%rsi), %xmm2 > > - CMPEQ 48(%rsi), %xmm3 > > - > > - pand %xmm0, %xmm1 > > - pand %xmm2, %xmm3 > > - pand %xmm1, %xmm3 > > - > > - pmovmskb %xmm3, %eax > > - incw %ax > > - jnz L(64bytesormore_loop_end) > > - > > - addq $64, %rsi > > - addq $64, %rdi > > - subq $64, %rdx > > - ja L(L2_L3_aligned_128bytes_loop) > > - jmp L(loop_tail) > > - > > - .p2align 4 > > -L(64bytesormore_loop_end): > > - pmovmskb %xmm0, %ecx > > - incw %cx > > - jnz L(loop_end_ret) > > - > > - pmovmskb %xmm1, %ecx > > - notw %cx > > - sall $16, %ecx > > - jnz L(loop_end_ret) > > - > > - pmovmskb %xmm2, %ecx > > - notw %cx > > - shlq $32, %rcx > > - jnz L(loop_end_ret) > > - > > - addq $48, %rdi > > - addq $48, %rsi > > - movq %rax, %rcx > > - > > - .p2align 4,, 6 > > -L(loop_end_ret): > > - bsfq %rcx, %rcx > > -# ifdef USE_AS_WMEMCMP > > - movl (%rdi, %rcx), %eax > > - xorl %edx, %edx > > - cmpl (%rsi, %rcx), %eax > > - setg %dl > > - leal -1(%rdx, %rdx), %eax > > -# else > > - movzbl (%rdi, %rcx), %eax > > - movzbl (%rsi, %rcx), %ecx > > - subl %ecx, %eax > > -# endif > > - ret > > -END (MEMCMP) > > -#endif > > -- > > 2.25.1 > > > > > -- > H.J. I would like to backport this patch to release branches. Any comments or objections? Conflict resolution patch attached, --Sunil
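For reference against the ifunc-memcmp.h hunk above: with the sse4_1 tier removed, the selector falls straight through to the SSE2 variant whenever the AVX2/EVEX paths are not usable. Below is a simplified, hypothetical C sketch of that selection order; the real selector uses glibc's cpu_features / CPU_FEATURE_USABLE_P machinery and also handles the RTM and Prefer_No_VZEROUPPER cases, which are omitted here, and the dummy bodies only stand in for the real implementations so the sketch runs on its own.

#include <stdio.h>
#include <string.h>

/* Dummy bodies standing in for the real SIMD implementations.  */
static int dummy_evex_movbe (const void *a, const void *b, size_t n)
{ return memcmp (a, b, n); }
static int dummy_avx2_movbe (const void *a, const void *b, size_t n)
{ return memcmp (a, b, n); }
static int dummy_sse2 (const void *a, const void *b, size_t n)
{ return memcmp (a, b, n); }

typedef int (*memcmp_fn) (const void *, const void *, size_t);

/* Hypothetical stand-in for IFUNC_SELECTOR after this patch: the boolean
   parameters replace the CPU_FEATURE_USABLE_P checks.  */
static memcmp_fn
select_memcmp (int avx2, int bmi2, int movbe, int avx512vl, int avx512bw)
{
  if (avx2 && bmi2 && movbe)
    {
      if (avx512vl && avx512bw)
        return dummy_evex_movbe;
      return dummy_avx2_movbe;
    }

  /* With the sse4_1 tier gone, everything else falls through to SSE2.  */
  return dummy_sse2;
}

int
main (void)
{
  /* Example: a core with AVX2/BMI2/MOVBE but without AVX512VL/BW.  */
  memcmp_fn fn = select_memcmp (1, 1, 1, 0, 0);
  printf ("selected: %s\n",
          fn == dummy_evex_movbe ? "evex_movbe"
          : fn == dummy_avx2_movbe ? "avx2_movbe" : "sse2");
  return 0;
}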