From: Sunil Pandey
Date: Fri, 22 Apr 2022 18:44:32 -0700
Subject: Re: [PATCH v1] x86: Shrink memcmp-sse4.S code size
To: Noah Goldstein, libc-stable@sourceware.org
Cc: "H.J. Lu", GNU C Library

On Wed, Nov 10, 2021 at 6:15 PM Noah Goldstein via Libc-alpha wrote:
>
> On Wed, Nov 10, 2021 at 8:03 PM H.J. Lu wrote:
> >
> > On Wed, Nov 10, 2021 at 4:27 PM Noah Goldstein wrote:
> > >
> > > No bug.
> > >
> > > This implementation refactors memcmp-sse4.S primarily with minimizing
> > > code size in mind. It does this by removing the lookup table logic and
> > > removing the unrolled check from (256, 512] bytes.
> > >
> > > memcmp-sse4 code size reduction : -3487 bytes
> > > wmemcmp-sse4 code size reduction: -1472 bytes
> > >
> > > The current memcmp-sse4.S implementation has a large code size
> > > cost. This has serious adverse effects on the ICache / ITLB. While
> > > in micro-benchmarks the implementation appears fast, traces of
> > > real-world code have shown that the speed in micro-benchmarks does not
> > > translate when the ICache/ITLB are not primed, and that the cost
> > > of the code size has measurable negative effects on overall
> > > application performance.
> > >
> > > See https://research.google/pubs/pub48320/ for more details.
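
For context on "removing the lookup table logic": in the diff below, the small-size cases that previously dispatched through a per-length jump table are handled with a fixed set of possibly overlapping loads from the head and tail of the buffers. A rough C sketch of the idea behind the new L(8_to_16_bytes) path follows; it is an illustration only, not the glibc source (the helper name memcmp_8_to_16 is hypothetical, and it assumes a little-endian target with GCC/Clang's __builtin_bswap64).

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Sketch of the head/tail trick for 8 <= n <= 16: two (possibly
   overlapping) 8-byte loads per buffer replace the old per-length
   jump table entries.  */
static int
memcmp_8_to_16 (const void *p1, const void *p2, size_t n)
{
  const unsigned char *s1 = p1, *s2 = p2;
  uint64_t a, b;

  memcpy (&a, s1, 8);
  memcpy (&b, s2, 8);
  /* bswap so the first byte in memory is most significant; an unsigned
     integer comparison then orders the bytes the way memcmp does.  */
  a = __builtin_bswap64 (a);
  b = __builtin_bswap64 (b);
  if (a != b)
    return a > b ? 1 : -1;

  /* Overlapping load covering the last 8 bytes.  */
  memcpy (&a, s1 + n - 8, 8);
  memcpy (&b, s2 + n - 8, 8);
  a = __builtin_bswap64 (a);
  b = __builtin_bswap64 (b);
  if (a != b)
    return a > b ? 1 : -1;
  return 0;
}

The assembly below reaches the same result without the two explicit comparisons: it subtracts the bswapped words and turns the flags into -1/0/1 with cmovne/sbbl/orl, but the memory-access pattern is the same as in the sketch.
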
> > > --- > > > sysdeps/x86_64/multiarch/memcmp-sse4.S | 2267 +++++++----------------- > > > 1 file changed, 646 insertions(+), 1621 deletions(-) > > > > > > diff --git a/sysdeps/x86_64/multiarch/memcmp-sse4.S b/sysdeps/x86_64/multiarch/memcmp-sse4.S > > > index b82adcd5fa..7686d1aa9e 100644 > > > --- a/sysdeps/x86_64/multiarch/memcmp-sse4.S > > > +++ b/sysdeps/x86_64/multiarch/memcmp-sse4.S > > > @@ -24,14 +24,14 @@ > > > # define MEMCMP __memcmp_sse4_1 > > > # endif > > > > > > -# define JMPTBL(I, B) (I - B) > > > +#ifdef USE_AS_WMEMCMP > > > +# define CMPEQ pcmpeqd > > > +# define CHAR_SIZE 4 > > > +#else > > > +# define CMPEQ pcmpeqb > > > +# define CHAR_SIZE 1 > > > +#endif > > > > > > -# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ > > > - lea TABLE(%rip), %r11; \ > > > - movslq (%r11, INDEX, SCALE), %rcx; \ > > > - add %r11, %rcx; \ > > > - _CET_NOTRACK jmp *%rcx; \ > > > - ud2 > > > > > > /* Warning! > > > wmemcmp has to use SIGNED comparison for elements. > > > @@ -46,33 +46,253 @@ ENTRY (MEMCMP) > > > /* Clear the upper 32 bits. */ > > > mov %edx, %edx > > > # endif > > > - pxor %xmm0, %xmm0 > > > cmp $79, %RDX_LP > > > ja L(79bytesormore) > > > + > > > + cmp $CHAR_SIZE, %RDX_LP > > > + jbe L(firstbyte) > > > + > > > + /* N in (CHAR_SIZE, 79) bytes. */ > > > + cmpl $32, %edx > > > + ja L(more_32_bytes) > > > + > > > + cmpl $16, %edx > > > + jae L(16_to_32_bytes) > > > + > > > # ifndef USE_AS_WMEMCMP > > > - cmp $1, %RDX_LP > > > - je L(firstbyte) > > > + cmpl $8, %edx > > > + jae L(8_to_16_bytes) > > > + > > > + cmpl $4, %edx > > > + jb L(2_to_3_bytes) > > > + > > > + movl (%rdi), %eax > > > + movl (%rsi), %ecx > > > + > > > + bswap %eax > > > + bswap %ecx > > > + > > > + shlq $32, %rax > > > + shlq $32, %rcx > > > + > > > + movl -4(%rdi, %rdx), %edi > > > + movl -4(%rsi, %rdx), %esi > > > + > > > + bswap %edi > > > + bswap %esi > > > + > > > + orq %rdi, %rax > > > + orq %rsi, %rcx > > > + subq %rcx, %rax > > > + cmovne %edx, %eax > > > + sbbl %ecx, %ecx > > > + orl %ecx, %eax > > > + ret > > > + > > > + .p2align 4,, 8 > > > +L(2_to_3_bytes): > > > + movzwl (%rdi), %eax > > > + movzwl (%rsi), %ecx > > > + shll $8, %eax > > > + shll $8, %ecx > > > + bswap %eax > > > + bswap %ecx > > > + movzbl -1(%rdi, %rdx), %edi > > > + movzbl -1(%rsi, %rdx), %esi > > > + orl %edi, %eax > > > + orl %esi, %ecx > > > + subl %ecx, %eax > > > + ret > > > + > > > + .p2align 4,, 8 > > > +L(8_to_16_bytes): > > > + movq (%rdi), %rax > > > + movq (%rsi), %rcx > > > + > > > + bswap %rax > > > + bswap %rcx > > > + > > > + subq %rcx, %rax > > > + jne L(8_to_16_bytes_done) > > > + > > > + movq -8(%rdi, %rdx), %rax > > > + movq -8(%rsi, %rdx), %rcx > > > + > > > + bswap %rax > > > + bswap %rcx > > > + > > > + subq %rcx, %rax > > > + > > > +L(8_to_16_bytes_done): > > > + cmovne %edx, %eax > > > + sbbl %ecx, %ecx > > > + orl %ecx, %eax > > > + ret > > > +# else > > > + xorl %eax, %eax > > > + movl (%rdi), %ecx > > > + cmpl (%rsi), %ecx > > > + jne L(8_to_16_bytes_done) > > > + movl 4(%rdi), %ecx > > > + cmpl 4(%rsi), %ecx > > > + jne L(8_to_16_bytes_done) > > > + movl -4(%rdi, %rdx), %ecx > > > + cmpl -4(%rsi, %rdx), %ecx > > > + jne L(8_to_16_bytes_done) > > > + ret > > > # endif > > > - add %rdx, %rsi > > > - add %rdx, %rdi > > > - BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) > > > > > > -# ifndef USE_AS_WMEMCMP > > > - .p2align 4 > > > + .p2align 4,, 3 > > > +L(ret_zero): > > > + xorl %eax, %eax > > > +L(zero): > > > + ret > > > + > > > + .p2align 4,, 8 > > > L(firstbyte): > > > + jb L(ret_zero) 
> > > +# ifdef USE_AS_WMEMCMP > > > + xorl %eax, %eax > > > + movl (%rdi), %ecx > > > + cmpl (%rsi), %ecx > > > + je L(zero) > > > +L(8_to_16_bytes_done): > > > + setg %al > > > + leal -1(%rax, %rax), %eax > > > +# else > > > movzbl (%rdi), %eax > > > movzbl (%rsi), %ecx > > > sub %ecx, %eax > > > +# endif > > > ret > > > + > > > + .p2align 4 > > > +L(vec_return_begin_48): > > > + addq $16, %rdi > > > + addq $16, %rsi > > > +L(vec_return_begin_32): > > > + bsfl %eax, %eax > > > +# ifdef USE_AS_WMEMCMP > > > + movl 32(%rdi, %rax), %ecx > > > + xorl %edx, %edx > > > + cmpl 32(%rsi, %rax), %ecx > > > + setg %dl > > > + leal -1(%rdx, %rdx), %eax > > > +# else > > > + movzbl 32(%rsi, %rax), %ecx > > > + movzbl 32(%rdi, %rax), %eax > > > + subl %ecx, %eax > > > +# endif > > > + ret > > > + > > > + .p2align 4 > > > +L(vec_return_begin_16): > > > + addq $16, %rdi > > > + addq $16, %rsi > > > +L(vec_return_begin): > > > + bsfl %eax, %eax > > > +# ifdef USE_AS_WMEMCMP > > > + movl (%rdi, %rax), %ecx > > > + xorl %edx, %edx > > > + cmpl (%rsi, %rax), %ecx > > > + setg %dl > > > + leal -1(%rdx, %rdx), %eax > > > +# else > > > + movzbl (%rsi, %rax), %ecx > > > + movzbl (%rdi, %rax), %eax > > > + subl %ecx, %eax > > > +# endif > > > + ret > > > + > > > + .p2align 4 > > > +L(vec_return_end_16): > > > + subl $16, %edx > > > +L(vec_return_end): > > > + bsfl %eax, %eax > > > + addl %edx, %eax > > > +# ifdef USE_AS_WMEMCMP > > > + movl -16(%rdi, %rax), %ecx > > > + xorl %edx, %edx > > > + cmpl -16(%rsi, %rax), %ecx > > > + setg %dl > > > + leal -1(%rdx, %rdx), %eax > > > +# else > > > + movzbl -16(%rsi, %rax), %ecx > > > + movzbl -16(%rdi, %rax), %eax > > > + subl %ecx, %eax > > > # endif > > > + ret > > > + > > > + .p2align 4,, 8 > > > +L(more_32_bytes): > > > + movdqu (%rdi), %xmm0 > > > + movdqu (%rsi), %xmm1 > > > + CMPEQ %xmm0, %xmm1 > > > + pmovmskb %xmm1, %eax > > > + incw %ax > > > + jnz L(vec_return_begin) > > > + > > > + movdqu 16(%rdi), %xmm0 > > > + movdqu 16(%rsi), %xmm1 > > > + CMPEQ %xmm0, %xmm1 > > > + pmovmskb %xmm1, %eax > > > + incw %ax > > > + jnz L(vec_return_begin_16) > > > + > > > + cmpl $64, %edx > > > + jbe L(32_to_64_bytes) > > > + movdqu 32(%rdi), %xmm0 > > > + movdqu 32(%rsi), %xmm1 > > > + CMPEQ %xmm0, %xmm1 > > > + pmovmskb %xmm1, %eax > > > + incw %ax > > > + jnz L(vec_return_begin_32) > > > + > > > + .p2align 4,, 6 > > > +L(32_to_64_bytes): > > > + movdqu -32(%rdi, %rdx), %xmm0 > > > + movdqu -32(%rsi, %rdx), %xmm1 > > > + CMPEQ %xmm0, %xmm1 > > > + pmovmskb %xmm1, %eax > > > + incw %ax > > > + jnz L(vec_return_end_16) > > > + > > > + movdqu -16(%rdi, %rdx), %xmm0 > > > + movdqu -16(%rsi, %rdx), %xmm1 > > > + CMPEQ %xmm0, %xmm1 > > > + pmovmskb %xmm1, %eax > > > + incw %ax > > > + jnz L(vec_return_end) > > > + ret > > > + > > > + .p2align 4 > > > +L(16_to_32_bytes): > > > + movdqu (%rdi), %xmm0 > > > + movdqu (%rsi), %xmm1 > > > + CMPEQ %xmm0, %xmm1 > > > + pmovmskb %xmm1, %eax > > > + incw %ax > > > + jnz L(vec_return_begin) > > > + > > > + movdqu -16(%rdi, %rdx), %xmm0 > > > + movdqu -16(%rsi, %rdx), %xmm1 > > > + CMPEQ %xmm0, %xmm1 > > > + pmovmskb %xmm1, %eax > > > + incw %ax > > > + jnz L(vec_return_end) > > > + ret > > > + > > > > > > .p2align 4 > > > L(79bytesormore): > > > + movdqu (%rdi), %xmm0 > > > movdqu (%rsi), %xmm1 > > > - movdqu (%rdi), %xmm2 > > > - pxor %xmm1, %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(16bytesin256) > > > + CMPEQ %xmm0, %xmm1 > > > + pmovmskb %xmm1, %eax > > > + incw %ax > > > + jnz L(vec_return_begin) > > > + > > > + > > > mov %rsi, %rcx 
> > > and $-16, %rsi > > > add $16, %rsi > > > @@ -85,1694 +305,499 @@ L(79bytesormore): > > > > > > cmp $128, %rdx > > > ja L(128bytesormore) > > > -L(less128bytes): > > > - sub $64, %rdx > > > - > > > - movdqu (%rdi), %xmm2 > > > - pxor (%rsi), %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(16bytesin256) > > > > > > - movdqu 16(%rdi), %xmm2 > > > - pxor 16(%rsi), %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(32bytesin256) > > > - > > > - movdqu 32(%rdi), %xmm2 > > > - pxor 32(%rsi), %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(48bytesin256) > > > - > > > - movdqu 48(%rdi), %xmm2 > > > - pxor 48(%rsi), %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(64bytesin256) > > > - cmp $32, %rdx > > > - jb L(less32bytesin64) > > > - > > > - movdqu 64(%rdi), %xmm2 > > > - pxor 64(%rsi), %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(80bytesin256) > > > - > > > - movdqu 80(%rdi), %xmm2 > > > - pxor 80(%rsi), %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(96bytesin256) > > > - sub $32, %rdx > > > - add $32, %rdi > > > - add $32, %rsi > > > -L(less32bytesin64): > > > - add $64, %rdi > > > - add $64, %rsi > > > - add %rdx, %rsi > > > - add %rdx, %rdi > > > - BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) > > > + .p2align 4,, 6 > > > +L(less128bytes): > > > + movdqu (%rdi), %xmm1 > > > + CMPEQ (%rsi), %xmm1 > > > + pmovmskb %xmm1, %eax > > > + incw %ax > > > + jnz L(vec_return_begin) > > > + > > > + movdqu 16(%rdi), %xmm1 > > > + CMPEQ 16(%rsi), %xmm1 > > > + pmovmskb %xmm1, %eax > > > + incw %ax > > > + jnz L(vec_return_begin_16) > > > + > > > + movdqu 32(%rdi), %xmm1 > > > + CMPEQ 32(%rsi), %xmm1 > > > + pmovmskb %xmm1, %eax > > > + incw %ax > > > + jnz L(vec_return_begin_32) > > > + > > > + movdqu 48(%rdi), %xmm1 > > > + CMPEQ 48(%rsi), %xmm1 > > > + pmovmskb %xmm1, %eax > > > + incw %ax > > > + jnz L(vec_return_begin_48) > > > + > > > + cmp $96, %rdx > > > + jb L(32_to_64_bytes) > > > + > > > + addq $64, %rdi > > > + addq $64, %rsi > > > + subq $64, %rdx > > > + > > > + .p2align 4,, 6 > > > +L(last_64_bytes): > > > + movdqu (%rdi), %xmm1 > > > + CMPEQ (%rsi), %xmm1 > > > + pmovmskb %xmm1, %eax > > > + incw %ax > > > + jnz L(vec_return_begin) > > > + > > > + movdqu 16(%rdi), %xmm1 > > > + CMPEQ 16(%rsi), %xmm1 > > > + pmovmskb %xmm1, %eax > > > + incw %ax > > > + jnz L(vec_return_begin_16) > > > + > > > + movdqu -32(%rdi, %rdx), %xmm0 > > > + movdqu -32(%rsi, %rdx), %xmm1 > > > + CMPEQ %xmm0, %xmm1 > > > + pmovmskb %xmm1, %eax > > > + incw %ax > > > + jnz L(vec_return_end_16) > > > + > > > + movdqu -16(%rdi, %rdx), %xmm0 > > > + movdqu -16(%rsi, %rdx), %xmm1 > > > + CMPEQ %xmm0, %xmm1 > > > + pmovmskb %xmm1, %eax > > > + incw %ax > > > + jnz L(vec_return_end) > > > + ret > > > > > > + .p2align 4 > > > L(128bytesormore): > > > - cmp $512, %rdx > > > - ja L(512bytesormore) > > > cmp $256, %rdx > > > - ja L(less512bytes) > > > + ja L(unaligned_loop) > > > L(less256bytes): > > > - sub $128, %rdx > > > - > > > - movdqu (%rdi), %xmm2 > > > - pxor (%rsi), %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(16bytesin256) > > > - > > > - movdqu 16(%rdi), %xmm2 > > > - pxor 16(%rsi), %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(32bytesin256) > > > - > > > - movdqu 32(%rdi), %xmm2 > > > - pxor 32(%rsi), %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(48bytesin256) > > > - > > > - movdqu 48(%rdi), %xmm2 > > > - pxor 48(%rsi), %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(64bytesin256) > > > - > > > - movdqu 64(%rdi), %xmm2 > > > - pxor 64(%rsi), %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(80bytesin256) > > > 
- > > > - movdqu 80(%rdi), %xmm2 > > > - pxor 80(%rsi), %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(96bytesin256) > > > - > > > - movdqu 96(%rdi), %xmm2 > > > - pxor 96(%rsi), %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(112bytesin256) > > > - > > > - movdqu 112(%rdi), %xmm2 > > > - pxor 112(%rsi), %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(128bytesin256) > > > - > > > - add $128, %rsi > > > - add $128, %rdi > > > - > > > - cmp $64, %rdx > > > - jae L(less128bytes) > > > - > > > - cmp $32, %rdx > > > - jb L(less32bytesin128) > > > - > > > - movdqu (%rdi), %xmm2 > > > - pxor (%rsi), %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(16bytesin256) > > > - > > > - movdqu 16(%rdi), %xmm2 > > > - pxor 16(%rsi), %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(32bytesin256) > > > - sub $32, %rdx > > > - add $32, %rdi > > > - add $32, %rsi > > > -L(less32bytesin128): > > > - add %rdx, %rsi > > > - add %rdx, %rdi > > > - BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) > > > - > > > -L(less512bytes): > > > - sub $256, %rdx > > > - movdqu (%rdi), %xmm2 > > > - pxor (%rsi), %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(16bytesin256) > > > - > > > - movdqu 16(%rdi), %xmm2 > > > - pxor 16(%rsi), %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(32bytesin256) > > > - > > > - movdqu 32(%rdi), %xmm2 > > > - pxor 32(%rsi), %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(48bytesin256) > > > - > > > - movdqu 48(%rdi), %xmm2 > > > - pxor 48(%rsi), %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(64bytesin256) > > > - > > > - movdqu 64(%rdi), %xmm2 > > > - pxor 64(%rsi), %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(80bytesin256) > > > - > > > - movdqu 80(%rdi), %xmm2 > > > - pxor 80(%rsi), %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(96bytesin256) > > > - > > > - movdqu 96(%rdi), %xmm2 > > > - pxor 96(%rsi), %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(112bytesin256) > > > - > > > - movdqu 112(%rdi), %xmm2 > > > - pxor 112(%rsi), %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(128bytesin256) > > > - > > > - movdqu 128(%rdi), %xmm2 > > > - pxor 128(%rsi), %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(144bytesin256) > > > - > > > - movdqu 144(%rdi), %xmm2 > > > - pxor 144(%rsi), %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(160bytesin256) > > > - > > > - movdqu 160(%rdi), %xmm2 > > > - pxor 160(%rsi), %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(176bytesin256) > > > - > > > - movdqu 176(%rdi), %xmm2 > > > - pxor 176(%rsi), %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(192bytesin256) > > > - > > > - movdqu 192(%rdi), %xmm2 > > > - pxor 192(%rsi), %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(208bytesin256) > > > - > > > - movdqu 208(%rdi), %xmm2 > > > - pxor 208(%rsi), %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(224bytesin256) > > > - > > > - movdqu 224(%rdi), %xmm2 > > > - pxor 224(%rsi), %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(240bytesin256) > > > - > > > - movdqu 240(%rdi), %xmm2 > > > - pxor 240(%rsi), %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(256bytesin256) > > > - > > > - add $256, %rsi > > > - add $256, %rdi > > > - > > > - cmp $128, %rdx > > > - jae L(less256bytes) > > > + movdqu (%rdi), %xmm1 > > > + CMPEQ (%rsi), %xmm1 > > > + pmovmskb %xmm1, %eax > > > + incw %ax > > > + jnz L(vec_return_begin) > > > + > > > + movdqu 16(%rdi), %xmm1 > > > + CMPEQ 16(%rsi), %xmm1 > > > + pmovmskb %xmm1, %eax > > > + incw %ax > > > + jnz L(vec_return_begin_16) > > > + > > > + movdqu 32(%rdi), %xmm1 > > > + CMPEQ 32(%rsi), %xmm1 > > > + pmovmskb %xmm1, %eax > > > + incw %ax > > > + jnz 
L(vec_return_begin_32) > > > + > > > + movdqu 48(%rdi), %xmm1 > > > + CMPEQ 48(%rsi), %xmm1 > > > + pmovmskb %xmm1, %eax > > > + incw %ax > > > + jnz L(vec_return_begin_48) > > > + > > > + addq $64, %rdi > > > + addq $64, %rsi > > > + > > > + movdqu (%rdi), %xmm1 > > > + CMPEQ (%rsi), %xmm1 > > > + pmovmskb %xmm1, %eax > > > + incw %ax > > > + jnz L(vec_return_begin) > > > + > > > + movdqu 16(%rdi), %xmm1 > > > + CMPEQ 16(%rsi), %xmm1 > > > + pmovmskb %xmm1, %eax > > > + incw %ax > > > + jnz L(vec_return_begin_16) > > > + > > > + movdqu 32(%rdi), %xmm1 > > > + CMPEQ 32(%rsi), %xmm1 > > > + pmovmskb %xmm1, %eax > > > + incw %ax > > > + jnz L(vec_return_begin_32) > > > + > > > + movdqu 48(%rdi), %xmm1 > > > + CMPEQ 48(%rsi), %xmm1 > > > + pmovmskb %xmm1, %eax > > > + incw %ax > > > + jnz L(vec_return_begin_48) > > > + > > > + addq $-128, %rdx > > > + subq $-64, %rsi > > > + subq $-64, %rdi > > > > > > cmp $64, %rdx > > > - jae L(less128bytes) > > > + ja L(less128bytes) > > > > > > cmp $32, %rdx > > > - jb L(less32bytesin256) > > > - > > > - movdqu (%rdi), %xmm2 > > > - pxor (%rsi), %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(16bytesin256) > > > - > > > - movdqu 16(%rdi), %xmm2 > > > - pxor 16(%rsi), %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(32bytesin256) > > > - sub $32, %rdx > > > - add $32, %rdi > > > - add $32, %rsi > > > -L(less32bytesin256): > > > - add %rdx, %rsi > > > - add %rdx, %rdi > > > - BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) > > > + ja L(last_64_bytes) > > > + > > > + movdqu -32(%rdi, %rdx), %xmm0 > > > + movdqu -32(%rsi, %rdx), %xmm1 > > > + CMPEQ %xmm0, %xmm1 > > > + pmovmskb %xmm1, %eax > > > + incw %ax > > > + jnz L(vec_return_end_16) > > > + > > > + movdqu -16(%rdi, %rdx), %xmm0 > > > + movdqu -16(%rsi, %rdx), %xmm1 > > > + CMPEQ %xmm0, %xmm1 > > > + pmovmskb %xmm1, %eax > > > + incw %ax > > > + jnz L(vec_return_end) > > > + ret > > > > > > .p2align 4 > > > -L(512bytesormore): > > > +L(unaligned_loop): > > > # ifdef DATA_CACHE_SIZE_HALF > > > mov $DATA_CACHE_SIZE_HALF, %R8_LP > > > # else > > > mov __x86_data_cache_size_half(%rip), %R8_LP > > > # endif > > > - mov %r8, %r9 > > > - shr $1, %r8 > > > - add %r9, %r8 > > > - cmp %r8, %rdx > > > - ja L(L2_L3_cache_unaglined) > > > + movq %r8, %r9 > > > + addq %r8, %r8 > > > + addq %r9, %r8 > > > + cmpq %r8, %rdx > > > + ja L(L2_L3_cache_unaligned) > > > sub $64, %rdx > > > .p2align 4 > > > L(64bytesormore_loop): > > > - movdqu (%rdi), %xmm2 > > > - pxor (%rsi), %xmm2 > > > - movdqa %xmm2, %xmm1 > > > + movdqu (%rdi), %xmm0 > > > + movdqu 16(%rdi), %xmm1 > > > + movdqu 32(%rdi), %xmm2 > > > + movdqu 48(%rdi), %xmm3 > > > > > > - movdqu 16(%rdi), %xmm3 > > > - pxor 16(%rsi), %xmm3 > > > - por %xmm3, %xmm1 > > > + CMPEQ (%rsi), %xmm0 > > > + CMPEQ 16(%rsi), %xmm1 > > > + CMPEQ 32(%rsi), %xmm2 > > > + CMPEQ 48(%rsi), %xmm3 > > > > > > - movdqu 32(%rdi), %xmm4 > > > - pxor 32(%rsi), %xmm4 > > > - por %xmm4, %xmm1 > > > + pand %xmm0, %xmm1 > > > + pand %xmm2, %xmm3 > > > + pand %xmm1, %xmm3 > > > > > > - movdqu 48(%rdi), %xmm5 > > > - pxor 48(%rsi), %xmm5 > > > - por %xmm5, %xmm1 > > > + pmovmskb %xmm3, %eax > > > + incw %ax > > > + jnz L(64bytesormore_loop_end) > > > > > > - ptest %xmm1, %xmm0 > > > - jnc L(64bytesormore_loop_end) > > > add $64, %rsi > > > add $64, %rdi > > > sub $64, %rdx > > > - jae L(64bytesormore_loop) > > > + ja L(64bytesormore_loop) > > > > > > - add $64, %rdx > > > - add %rdx, %rsi > > > - add %rdx, %rdi > > > - BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) > > > + .p2align 4,, 6 > > > 
+L(loop_tail): > > > + addq %rdx, %rdi > > > + movdqu (%rdi), %xmm0 > > > + movdqu 16(%rdi), %xmm1 > > > + movdqu 32(%rdi), %xmm2 > > > + movdqu 48(%rdi), %xmm3 > > > + > > > + addq %rdx, %rsi > > > + movdqu (%rsi), %xmm4 > > > + movdqu 16(%rsi), %xmm5 > > > + movdqu 32(%rsi), %xmm6 > > > + movdqu 48(%rsi), %xmm7 > > > + > > > + CMPEQ %xmm4, %xmm0 > > > + CMPEQ %xmm5, %xmm1 > > > + CMPEQ %xmm6, %xmm2 > > > + CMPEQ %xmm7, %xmm3 > > > + > > > + pand %xmm0, %xmm1 > > > + pand %xmm2, %xmm3 > > > + pand %xmm1, %xmm3 > > > + > > > + pmovmskb %xmm3, %eax > > > + incw %ax > > > + jnz L(64bytesormore_loop_end) > > > + ret > > > > > > -L(L2_L3_cache_unaglined): > > > - sub $64, %rdx > > > +L(L2_L3_cache_unaligned): > > > + subq $64, %rdx > > > .p2align 4 > > > L(L2_L3_unaligned_128bytes_loop): > > > prefetchnta 0x1c0(%rdi) > > > prefetchnta 0x1c0(%rsi) > > > - movdqu (%rdi), %xmm2 > > > - pxor (%rsi), %xmm2 > > > - movdqa %xmm2, %xmm1 > > > > > > - movdqu 16(%rdi), %xmm3 > > > - pxor 16(%rsi), %xmm3 > > > - por %xmm3, %xmm1 > > > + movdqu (%rdi), %xmm0 > > > + movdqu 16(%rdi), %xmm1 > > > + movdqu 32(%rdi), %xmm2 > > > + movdqu 48(%rdi), %xmm3 > > > + > > > + CMPEQ (%rsi), %xmm0 > > > + CMPEQ 16(%rsi), %xmm1 > > > + CMPEQ 32(%rsi), %xmm2 > > > + CMPEQ 48(%rsi), %xmm3 > > > > > > - movdqu 32(%rdi), %xmm4 > > > - pxor 32(%rsi), %xmm4 > > > - por %xmm4, %xmm1 > > > + pand %xmm0, %xmm1 > > > + pand %xmm2, %xmm3 > > > + pand %xmm1, %xmm3 > > > > > > - movdqu 48(%rdi), %xmm5 > > > - pxor 48(%rsi), %xmm5 > > > - por %xmm5, %xmm1 > > > + pmovmskb %xmm3, %eax > > > + incw %ax > > > + jnz L(64bytesormore_loop_end) > > > > > > - ptest %xmm1, %xmm0 > > > - jnc L(64bytesormore_loop_end) > > > add $64, %rsi > > > add $64, %rdi > > > sub $64, %rdx > > > - jae L(L2_L3_unaligned_128bytes_loop) > > > + ja L(L2_L3_unaligned_128bytes_loop) > > > + jmp L(loop_tail) > > > > > > - add $64, %rdx > > > - add %rdx, %rsi > > > - add %rdx, %rdi > > > - BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) > > > > > > -/* > > > - * This case is for machines which are sensitive for unaligned instructions. > > > - */ > > > + /* This case is for machines which are sensitive for unaligned > > > + * instructions. 
*/ > > > .p2align 4 > > > L(2aligned): > > > cmp $128, %rdx > > > ja L(128bytesormorein2aligned) > > > L(less128bytesin2aligned): > > > - sub $64, %rdx > > > - > > > - movdqa (%rdi), %xmm2 > > > - pxor (%rsi), %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(16bytesin256) > > > - > > > - movdqa 16(%rdi), %xmm2 > > > - pxor 16(%rsi), %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(32bytesin256) > > > - > > > - movdqa 32(%rdi), %xmm2 > > > - pxor 32(%rsi), %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(48bytesin256) > > > - > > > - movdqa 48(%rdi), %xmm2 > > > - pxor 48(%rsi), %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(64bytesin256) > > > - cmp $32, %rdx > > > - jb L(less32bytesin64in2alinged) > > > - > > > - movdqa 64(%rdi), %xmm2 > > > - pxor 64(%rsi), %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(80bytesin256) > > > - > > > - movdqa 80(%rdi), %xmm2 > > > - pxor 80(%rsi), %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(96bytesin256) > > > - sub $32, %rdx > > > - add $32, %rdi > > > - add $32, %rsi > > > -L(less32bytesin64in2alinged): > > > - add $64, %rdi > > > - add $64, %rsi > > > - add %rdx, %rsi > > > - add %rdx, %rdi > > > - BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) > > > + movdqa (%rdi), %xmm1 > > > + CMPEQ (%rsi), %xmm1 > > > + pmovmskb %xmm1, %eax > > > + incw %ax > > > + jnz L(vec_return_begin) > > > + > > > + movdqa 16(%rdi), %xmm1 > > > + CMPEQ 16(%rsi), %xmm1 > > > + pmovmskb %xmm1, %eax > > > + incw %ax > > > + jnz L(vec_return_begin_16) > > > + > > > + movdqa 32(%rdi), %xmm1 > > > + CMPEQ 32(%rsi), %xmm1 > > > + pmovmskb %xmm1, %eax > > > + incw %ax > > > + jnz L(vec_return_begin_32) > > > + > > > + movdqa 48(%rdi), %xmm1 > > > + CMPEQ 48(%rsi), %xmm1 > > > + pmovmskb %xmm1, %eax > > > + incw %ax > > > + jnz L(vec_return_begin_48) > > > + > > > + cmp $96, %rdx > > > + jb L(32_to_64_bytes) > > > + > > > + addq $64, %rdi > > > + addq $64, %rsi > > > + subq $64, %rdx > > > + > > > + .p2align 4,, 6 > > > +L(aligned_last_64_bytes): > > > + movdqa (%rdi), %xmm1 > > > + CMPEQ (%rsi), %xmm1 > > > + pmovmskb %xmm1, %eax > > > + incw %ax > > > + jnz L(vec_return_begin) > > > + > > > + movdqa 16(%rdi), %xmm1 > > > + CMPEQ 16(%rsi), %xmm1 > > > + pmovmskb %xmm1, %eax > > > + incw %ax > > > + jnz L(vec_return_begin_16) > > > + > > > + movdqu -32(%rdi, %rdx), %xmm0 > > > + movdqu -32(%rsi, %rdx), %xmm1 > > > + CMPEQ %xmm0, %xmm1 > > > + pmovmskb %xmm1, %eax > > > + incw %ax > > > + jnz L(vec_return_end_16) > > > + > > > + movdqu -16(%rdi, %rdx), %xmm0 > > > + movdqu -16(%rsi, %rdx), %xmm1 > > > + CMPEQ %xmm0, %xmm1 > > > + pmovmskb %xmm1, %eax > > > + incw %ax > > > + jnz L(vec_return_end) > > > + ret > > > > > > .p2align 4 > > > L(128bytesormorein2aligned): > > > - cmp $512, %rdx > > > - ja L(512bytesormorein2aligned) > > > cmp $256, %rdx > > > - ja L(256bytesormorein2aligned) > > > + ja L(aligned_loop) > > > L(less256bytesin2alinged): > > > - sub $128, %rdx > > > - > > > - movdqa (%rdi), %xmm2 > > > - pxor (%rsi), %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(16bytesin256) > > > - > > > - movdqa 16(%rdi), %xmm2 > > > - pxor 16(%rsi), %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(32bytesin256) > > > - > > > - movdqa 32(%rdi), %xmm2 > > > - pxor 32(%rsi), %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(48bytesin256) > > > - > > > - movdqa 48(%rdi), %xmm2 > > > - pxor 48(%rsi), %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(64bytesin256) > > > - > > > - movdqa 64(%rdi), %xmm2 > > > - pxor 64(%rsi), %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(80bytesin256) > > > - > > > - movdqa 
80(%rdi), %xmm2 > > > - pxor 80(%rsi), %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(96bytesin256) > > > - > > > - movdqa 96(%rdi), %xmm2 > > > - pxor 96(%rsi), %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(112bytesin256) > > > - > > > - movdqa 112(%rdi), %xmm2 > > > - pxor 112(%rsi), %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(128bytesin256) > > > - > > > - add $128, %rsi > > > - add $128, %rdi > > > + movdqa (%rdi), %xmm1 > > > + CMPEQ (%rsi), %xmm1 > > > + pmovmskb %xmm1, %eax > > > + incw %ax > > > + jnz L(vec_return_begin) > > > + > > > + movdqa 16(%rdi), %xmm1 > > > + CMPEQ 16(%rsi), %xmm1 > > > + pmovmskb %xmm1, %eax > > > + incw %ax > > > + jnz L(vec_return_begin_16) > > > + > > > + movdqa 32(%rdi), %xmm1 > > > + CMPEQ 32(%rsi), %xmm1 > > > + pmovmskb %xmm1, %eax > > > + incw %ax > > > + jnz L(vec_return_begin_32) > > > + > > > + movdqa 48(%rdi), %xmm1 > > > + CMPEQ 48(%rsi), %xmm1 > > > + pmovmskb %xmm1, %eax > > > + incw %ax > > > + jnz L(vec_return_begin_48) > > > + > > > + addq $64, %rdi > > > + addq $64, %rsi > > > + > > > + movdqa (%rdi), %xmm1 > > > + CMPEQ (%rsi), %xmm1 > > > + pmovmskb %xmm1, %eax > > > + incw %ax > > > + jnz L(vec_return_begin) > > > + > > > + movdqa 16(%rdi), %xmm1 > > > + CMPEQ 16(%rsi), %xmm1 > > > + pmovmskb %xmm1, %eax > > > + incw %ax > > > + jnz L(vec_return_begin_16) > > > + > > > + movdqa 32(%rdi), %xmm1 > > > + CMPEQ 32(%rsi), %xmm1 > > > + pmovmskb %xmm1, %eax > > > + incw %ax > > > + jnz L(vec_return_begin_32) > > > + > > > + movdqa 48(%rdi), %xmm1 > > > + CMPEQ 48(%rsi), %xmm1 > > > + pmovmskb %xmm1, %eax > > > + incw %ax > > > + jnz L(vec_return_begin_48) > > > + > > > + addq $-128, %rdx > > > + subq $-64, %rsi > > > + subq $-64, %rdi > > > > > > cmp $64, %rdx > > > - jae L(less128bytesin2aligned) > > > + ja L(less128bytesin2aligned) > > > > > > cmp $32, %rdx > > > - jb L(less32bytesin128in2aligned) > > > - > > > - movdqu (%rdi), %xmm2 > > > - pxor (%rsi), %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(16bytesin256) > > > - > > > - movdqu 16(%rdi), %xmm2 > > > - pxor 16(%rsi), %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(32bytesin256) > > > - sub $32, %rdx > > > - add $32, %rdi > > > - add $32, %rsi > > > -L(less32bytesin128in2aligned): > > > - add %rdx, %rsi > > > - add %rdx, %rdi > > > - BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) > > > - > > > - .p2align 4 > > > -L(256bytesormorein2aligned): > > > - > > > - sub $256, %rdx > > > - movdqa (%rdi), %xmm2 > > > - pxor (%rsi), %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(16bytesin256) > > > - > > > - movdqa 16(%rdi), %xmm2 > > > - pxor 16(%rsi), %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(32bytesin256) > > > - > > > - movdqa 32(%rdi), %xmm2 > > > - pxor 32(%rsi), %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(48bytesin256) > > > - > > > - movdqa 48(%rdi), %xmm2 > > > - pxor 48(%rsi), %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(64bytesin256) > > > - > > > - movdqa 64(%rdi), %xmm2 > > > - pxor 64(%rsi), %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(80bytesin256) > > > - > > > - movdqa 80(%rdi), %xmm2 > > > - pxor 80(%rsi), %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(96bytesin256) > > > - > > > - movdqa 96(%rdi), %xmm2 > > > - pxor 96(%rsi), %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(112bytesin256) > > > - > > > - movdqa 112(%rdi), %xmm2 > > > - pxor 112(%rsi), %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(128bytesin256) > > > - > > > - movdqa 128(%rdi), %xmm2 > > > - pxor 128(%rsi), %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(144bytesin256) > > > - > > > - movdqa 
144(%rdi), %xmm2 > > > - pxor 144(%rsi), %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(160bytesin256) > > > - > > > - movdqa 160(%rdi), %xmm2 > > > - pxor 160(%rsi), %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(176bytesin256) > > > - > > > - movdqa 176(%rdi), %xmm2 > > > - pxor 176(%rsi), %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(192bytesin256) > > > - > > > - movdqa 192(%rdi), %xmm2 > > > - pxor 192(%rsi), %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(208bytesin256) > > > - > > > - movdqa 208(%rdi), %xmm2 > > > - pxor 208(%rsi), %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(224bytesin256) > > > - > > > - movdqa 224(%rdi), %xmm2 > > > - pxor 224(%rsi), %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(240bytesin256) > > > - > > > - movdqa 240(%rdi), %xmm2 > > > - pxor 240(%rsi), %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(256bytesin256) > > > - > > > - add $256, %rsi > > > - add $256, %rdi > > > - > > > - cmp $128, %rdx > > > - jae L(less256bytesin2alinged) > > > - > > > - cmp $64, %rdx > > > - jae L(less128bytesin2aligned) > > > - > > > - cmp $32, %rdx > > > - jb L(less32bytesin256in2alinged) > > > - > > > - movdqa (%rdi), %xmm2 > > > - pxor (%rsi), %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(16bytesin256) > > > - > > > - movdqa 16(%rdi), %xmm2 > > > - pxor 16(%rsi), %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(32bytesin256) > > > - sub $32, %rdx > > > - add $32, %rdi > > > - add $32, %rsi > > > -L(less32bytesin256in2alinged): > > > - add %rdx, %rsi > > > - add %rdx, %rdi > > > - BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) > > > + ja L(aligned_last_64_bytes) > > > + > > > + movdqu -32(%rdi, %rdx), %xmm0 > > > + movdqu -32(%rsi, %rdx), %xmm1 > > > + CMPEQ %xmm0, %xmm1 > > > + pmovmskb %xmm1, %eax > > > + incw %ax > > > + jnz L(vec_return_end_16) > > > + > > > + movdqu -16(%rdi, %rdx), %xmm0 > > > + movdqu -16(%rsi, %rdx), %xmm1 > > > + CMPEQ %xmm0, %xmm1 > > > + pmovmskb %xmm1, %eax > > > + incw %ax > > > + jnz L(vec_return_end) > > > + ret > > > > > > .p2align 4 > > > -L(512bytesormorein2aligned): > > > +L(aligned_loop): > > > # ifdef DATA_CACHE_SIZE_HALF > > > mov $DATA_CACHE_SIZE_HALF, %R8_LP > > > # else > > > mov __x86_data_cache_size_half(%rip), %R8_LP > > > # endif > > > - mov %r8, %r9 > > > - shr $1, %r8 > > > - add %r9, %r8 > > > - cmp %r8, %rdx > > > - ja L(L2_L3_cache_aglined) > > > + movq %r8, %r9 > > > + addq %r8, %r8 > > > + addq %r9, %r8 > > > + cmpq %r8, %rdx > > > + ja L(L2_L3_cache_aligned) > > > > > > sub $64, %rdx > > > .p2align 4 > > > L(64bytesormore_loopin2aligned): > > > - movdqa (%rdi), %xmm2 > > > - pxor (%rsi), %xmm2 > > > - movdqa %xmm2, %xmm1 > > > - > > > - movdqa 16(%rdi), %xmm3 > > > - pxor 16(%rsi), %xmm3 > > > - por %xmm3, %xmm1 > > > + movdqa (%rdi), %xmm0 > > > + movdqa 16(%rdi), %xmm1 > > > + movdqa 32(%rdi), %xmm2 > > > + movdqa 48(%rdi), %xmm3 > > > > > > - movdqa 32(%rdi), %xmm4 > > > - pxor 32(%rsi), %xmm4 > > > - por %xmm4, %xmm1 > > > + CMPEQ (%rsi), %xmm0 > > > + CMPEQ 16(%rsi), %xmm1 > > > + CMPEQ 32(%rsi), %xmm2 > > > + CMPEQ 48(%rsi), %xmm3 > > > > > > - movdqa 48(%rdi), %xmm5 > > > - pxor 48(%rsi), %xmm5 > > > - por %xmm5, %xmm1 > > > + pand %xmm0, %xmm1 > > > + pand %xmm2, %xmm3 > > > + pand %xmm1, %xmm3 > > > > > > - ptest %xmm1, %xmm0 > > > - jnc L(64bytesormore_loop_end) > > > + pmovmskb %xmm3, %eax > > > + incw %ax > > > + jnz L(64bytesormore_loop_end) > > > add $64, %rsi > > > add $64, %rdi > > > sub $64, %rdx > > > - jae L(64bytesormore_loopin2aligned) > > > - > > > - add $64, %rdx > > > - add %rdx, %rsi > > > - 
add %rdx, %rdi > > > - BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) > > > -L(L2_L3_cache_aglined): > > > - sub $64, %rdx > > > + ja L(64bytesormore_loopin2aligned) > > > + jmp L(loop_tail) > > > > > > +L(L2_L3_cache_aligned): > > > + subq $64, %rdx > > > .p2align 4 > > > L(L2_L3_aligned_128bytes_loop): > > > prefetchnta 0x1c0(%rdi) > > > prefetchnta 0x1c0(%rsi) > > > - movdqa (%rdi), %xmm2 > > > - pxor (%rsi), %xmm2 > > > - movdqa %xmm2, %xmm1 > > > - > > > - movdqa 16(%rdi), %xmm3 > > > - pxor 16(%rsi), %xmm3 > > > - por %xmm3, %xmm1 > > > + movdqa (%rdi), %xmm0 > > > + movdqa 16(%rdi), %xmm1 > > > + movdqa 32(%rdi), %xmm2 > > > + movdqa 48(%rdi), %xmm3 > > > > > > - movdqa 32(%rdi), %xmm4 > > > - pxor 32(%rsi), %xmm4 > > > - por %xmm4, %xmm1 > > > + CMPEQ (%rsi), %xmm0 > > > + CMPEQ 16(%rsi), %xmm1 > > > + CMPEQ 32(%rsi), %xmm2 > > > + CMPEQ 48(%rsi), %xmm3 > > > > > > - movdqa 48(%rdi), %xmm5 > > > - pxor 48(%rsi), %xmm5 > > > - por %xmm5, %xmm1 > > > + pand %xmm0, %xmm1 > > > + pand %xmm2, %xmm3 > > > + pand %xmm1, %xmm3 > > > > > > - ptest %xmm1, %xmm0 > > > - jnc L(64bytesormore_loop_end) > > > - add $64, %rsi > > > - add $64, %rdi > > > - sub $64, %rdx > > > - jae L(L2_L3_aligned_128bytes_loop) > > > - > > > - add $64, %rdx > > > - add %rdx, %rsi > > > - add %rdx, %rdi > > > - BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) > > > + pmovmskb %xmm3, %eax > > > + incw %ax > > > + jnz L(64bytesormore_loop_end) > > > > > > + addq $64, %rsi > > > + addq $64, %rdi > > > + subq $64, %rdx > > > + ja L(L2_L3_aligned_128bytes_loop) > > > + jmp L(loop_tail) > > > > > > .p2align 4 > > > L(64bytesormore_loop_end): > > > - add $16, %rdi > > > - add $16, %rsi > > > - ptest %xmm2, %xmm0 > > > - jnc L(16bytes) > > > - > > > - add $16, %rdi > > > - add $16, %rsi > > > - ptest %xmm3, %xmm0 > > > - jnc L(16bytes) > > > - > > > - add $16, %rdi > > > - add $16, %rsi > > > - ptest %xmm4, %xmm0 > > > - jnc L(16bytes) > > > - > > > - add $16, %rdi > > > - add $16, %rsi > > > - jmp L(16bytes) > > > - > > > -L(256bytesin256): > > > - add $256, %rdi > > > - add $256, %rsi > > > - jmp L(16bytes) > > > -L(240bytesin256): > > > - add $240, %rdi > > > - add $240, %rsi > > > - jmp L(16bytes) > > > -L(224bytesin256): > > > - add $224, %rdi > > > - add $224, %rsi > > > - jmp L(16bytes) > > > -L(208bytesin256): > > > - add $208, %rdi > > > - add $208, %rsi > > > - jmp L(16bytes) > > > -L(192bytesin256): > > > - add $192, %rdi > > > - add $192, %rsi > > > - jmp L(16bytes) > > > -L(176bytesin256): > > > - add $176, %rdi > > > - add $176, %rsi > > > - jmp L(16bytes) > > > -L(160bytesin256): > > > - add $160, %rdi > > > - add $160, %rsi > > > - jmp L(16bytes) > > > -L(144bytesin256): > > > - add $144, %rdi > > > - add $144, %rsi > > > - jmp L(16bytes) > > > -L(128bytesin256): > > > - add $128, %rdi > > > - add $128, %rsi > > > - jmp L(16bytes) > > > -L(112bytesin256): > > > - add $112, %rdi > > > - add $112, %rsi > > > - jmp L(16bytes) > > > -L(96bytesin256): > > > - add $96, %rdi > > > - add $96, %rsi > > > - jmp L(16bytes) > > > -L(80bytesin256): > > > - add $80, %rdi > > > - add $80, %rsi > > > - jmp L(16bytes) > > > -L(64bytesin256): > > > - add $64, %rdi > > > - add $64, %rsi > > > - jmp L(16bytes) > > > -L(48bytesin256): > > > - add $16, %rdi > > > - add $16, %rsi > > > -L(32bytesin256): > > > - add $16, %rdi > > > - add $16, %rsi > > > -L(16bytesin256): > > > - add $16, %rdi > > > - add $16, %rsi > > > -L(16bytes): > > > - mov -16(%rdi), %rax > > > - mov -16(%rsi), %rcx > > > - cmp %rax, %rcx > > > - jne 
L(diffin8bytes) > > > -L(8bytes): > > > - mov -8(%rdi), %rax > > > - mov -8(%rsi), %rcx > > > - cmp %rax, %rcx > > > - jne L(diffin8bytes) > > > - xor %eax, %eax > > > - ret > > > - > > > - .p2align 4 > > > -L(12bytes): > > > - mov -12(%rdi), %rax > > > - mov -12(%rsi), %rcx > > > - cmp %rax, %rcx > > > - jne L(diffin8bytes) > > > -L(4bytes): > > > - mov -4(%rsi), %ecx > > > -# ifndef USE_AS_WMEMCMP > > > - mov -4(%rdi), %eax > > > - cmp %eax, %ecx > > > -# else > > > - cmp -4(%rdi), %ecx > > > -# endif > > > - jne L(diffin4bytes) > > > -L(0bytes): > > > - xor %eax, %eax > > > - ret > > > - > > > -# ifndef USE_AS_WMEMCMP > > > -/* unreal case for wmemcmp */ > > > - .p2align 4 > > > -L(65bytes): > > > - movdqu -65(%rdi), %xmm1 > > > - movdqu -65(%rsi), %xmm2 > > > - mov $-65, %dl > > > - pxor %xmm1, %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(less16bytes) > > > -L(49bytes): > > > - movdqu -49(%rdi), %xmm1 > > > - movdqu -49(%rsi), %xmm2 > > > - mov $-49, %dl > > > - pxor %xmm1, %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(less16bytes) > > > -L(33bytes): > > > - movdqu -33(%rdi), %xmm1 > > > - movdqu -33(%rsi), %xmm2 > > > - mov $-33, %dl > > > - pxor %xmm1, %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(less16bytes) > > > -L(17bytes): > > > - mov -17(%rdi), %rax > > > - mov -17(%rsi), %rcx > > > - cmp %rax, %rcx > > > - jne L(diffin8bytes) > > > -L(9bytes): > > > - mov -9(%rdi), %rax > > > - mov -9(%rsi), %rcx > > > - cmp %rax, %rcx > > > - jne L(diffin8bytes) > > > - movzbl -1(%rdi), %eax > > > - movzbl -1(%rsi), %edx > > > - sub %edx, %eax > > > - ret > > > - > > > - .p2align 4 > > > -L(13bytes): > > > - mov -13(%rdi), %rax > > > - mov -13(%rsi), %rcx > > > - cmp %rax, %rcx > > > - jne L(diffin8bytes) > > > - mov -8(%rdi), %rax > > > - mov -8(%rsi), %rcx > > > - cmp %rax, %rcx > > > - jne L(diffin8bytes) > > > - xor %eax, %eax > > > - ret > > > - > > > - .p2align 4 > > > -L(5bytes): > > > - mov -5(%rdi), %eax > > > - mov -5(%rsi), %ecx > > > - cmp %eax, %ecx > > > - jne L(diffin4bytes) > > > - movzbl -1(%rdi), %eax > > > - movzbl -1(%rsi), %edx > > > - sub %edx, %eax > > > - ret > > > - > > > - .p2align 4 > > > -L(66bytes): > > > - movdqu -66(%rdi), %xmm1 > > > - movdqu -66(%rsi), %xmm2 > > > - mov $-66, %dl > > > - pxor %xmm1, %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(less16bytes) > > > -L(50bytes): > > > - movdqu -50(%rdi), %xmm1 > > > - movdqu -50(%rsi), %xmm2 > > > - mov $-50, %dl > > > - pxor %xmm1, %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(less16bytes) > > > -L(34bytes): > > > - movdqu -34(%rdi), %xmm1 > > > - movdqu -34(%rsi), %xmm2 > > > - mov $-34, %dl > > > - pxor %xmm1, %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(less16bytes) > > > -L(18bytes): > > > - mov -18(%rdi), %rax > > > - mov -18(%rsi), %rcx > > > - cmp %rax, %rcx > > > - jne L(diffin8bytes) > > > -L(10bytes): > > > - mov -10(%rdi), %rax > > > - mov -10(%rsi), %rcx > > > - cmp %rax, %rcx > > > - jne L(diffin8bytes) > > > - movzwl -2(%rdi), %eax > > > - movzwl -2(%rsi), %ecx > > > - cmp %cl, %al > > > - jne L(end) > > > - and $0xffff, %eax > > > - and $0xffff, %ecx > > > - sub %ecx, %eax > > > - ret > > > - > > > - .p2align 4 > > > -L(14bytes): > > > - mov -14(%rdi), %rax > > > - mov -14(%rsi), %rcx > > > - cmp %rax, %rcx > > > - jne L(diffin8bytes) > > > - mov -8(%rdi), %rax > > > - mov -8(%rsi), %rcx > > > - cmp %rax, %rcx > > > - jne L(diffin8bytes) > > > - xor %eax, %eax > > > - ret > > > - > > > - .p2align 4 > > > -L(6bytes): > > > - mov -6(%rdi), %eax > > > - mov -6(%rsi), %ecx > > > - cmp %eax, 
%ecx > > > - jne L(diffin4bytes) > > > -L(2bytes): > > > - movzwl -2(%rsi), %ecx > > > - movzwl -2(%rdi), %eax > > > - cmp %cl, %al > > > - jne L(end) > > > - and $0xffff, %eax > > > - and $0xffff, %ecx > > > - sub %ecx, %eax > > > - ret > > > - > > > - .p2align 4 > > > -L(67bytes): > > > - movdqu -67(%rdi), %xmm2 > > > - movdqu -67(%rsi), %xmm1 > > > - mov $-67, %dl > > > - pxor %xmm1, %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(less16bytes) > > > -L(51bytes): > > > - movdqu -51(%rdi), %xmm2 > > > - movdqu -51(%rsi), %xmm1 > > > - mov $-51, %dl > > > - pxor %xmm1, %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(less16bytes) > > > -L(35bytes): > > > - movdqu -35(%rsi), %xmm1 > > > - movdqu -35(%rdi), %xmm2 > > > - mov $-35, %dl > > > - pxor %xmm1, %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(less16bytes) > > > -L(19bytes): > > > - mov -19(%rdi), %rax > > > - mov -19(%rsi), %rcx > > > - cmp %rax, %rcx > > > - jne L(diffin8bytes) > > > -L(11bytes): > > > - mov -11(%rdi), %rax > > > - mov -11(%rsi), %rcx > > > - cmp %rax, %rcx > > > - jne L(diffin8bytes) > > > - mov -4(%rdi), %eax > > > - mov -4(%rsi), %ecx > > > - cmp %eax, %ecx > > > - jne L(diffin4bytes) > > > - xor %eax, %eax > > > - ret > > > - > > > - .p2align 4 > > > -L(15bytes): > > > - mov -15(%rdi), %rax > > > - mov -15(%rsi), %rcx > > > - cmp %rax, %rcx > > > - jne L(diffin8bytes) > > > - mov -8(%rdi), %rax > > > - mov -8(%rsi), %rcx > > > - cmp %rax, %rcx > > > - jne L(diffin8bytes) > > > - xor %eax, %eax > > > - ret > > > - > > > - .p2align 4 > > > -L(7bytes): > > > - mov -7(%rdi), %eax > > > - mov -7(%rsi), %ecx > > > - cmp %eax, %ecx > > > - jne L(diffin4bytes) > > > - mov -4(%rdi), %eax > > > - mov -4(%rsi), %ecx > > > - cmp %eax, %ecx > > > - jne L(diffin4bytes) > > > - xor %eax, %eax > > > - ret > > > - > > > - .p2align 4 > > > -L(3bytes): > > > - movzwl -3(%rdi), %eax > > > - movzwl -3(%rsi), %ecx > > > - cmp %eax, %ecx > > > - jne L(diffin2bytes) > > > -L(1bytes): > > > - movzbl -1(%rdi), %eax > > > - movzbl -1(%rsi), %ecx > > > - sub %ecx, %eax > > > - ret > > > -# endif > > > - > > > - .p2align 4 > > > -L(68bytes): > > > - movdqu -68(%rdi), %xmm2 > > > - movdqu -68(%rsi), %xmm1 > > > - mov $-68, %dl > > > - pxor %xmm1, %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(less16bytes) > > > -L(52bytes): > > > - movdqu -52(%rdi), %xmm2 > > > - movdqu -52(%rsi), %xmm1 > > > - mov $-52, %dl > > > - pxor %xmm1, %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(less16bytes) > > > -L(36bytes): > > > - movdqu -36(%rdi), %xmm2 > > > - movdqu -36(%rsi), %xmm1 > > > - mov $-36, %dl > > > - pxor %xmm1, %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(less16bytes) > > > -L(20bytes): > > > - movdqu -20(%rdi), %xmm2 > > > - movdqu -20(%rsi), %xmm1 > > > - mov $-20, %dl > > > - pxor %xmm1, %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(less16bytes) > > > - mov -4(%rsi), %ecx > > > - > > > -# ifndef USE_AS_WMEMCMP > > > - mov -4(%rdi), %eax > > > - cmp %eax, %ecx > > > -# else > > > - cmp -4(%rdi), %ecx > > > -# endif > > > - jne L(diffin4bytes) > > > - xor %eax, %eax > > > - ret > > > - > > > -# ifndef USE_AS_WMEMCMP > > > -/* unreal cases for wmemcmp */ > > > - .p2align 4 > > > -L(69bytes): > > > - movdqu -69(%rsi), %xmm1 > > > - movdqu -69(%rdi), %xmm2 > > > - mov $-69, %dl > > > - pxor %xmm1, %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(less16bytes) > > > -L(53bytes): > > > - movdqu -53(%rsi), %xmm1 > > > - movdqu -53(%rdi), %xmm2 > > > - mov $-53, %dl > > > - pxor %xmm1, %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(less16bytes) > > > 
-L(37bytes): > > > - movdqu -37(%rsi), %xmm1 > > > - movdqu -37(%rdi), %xmm2 > > > - mov $-37, %dl > > > - pxor %xmm1, %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(less16bytes) > > > -L(21bytes): > > > - movdqu -21(%rsi), %xmm1 > > > - movdqu -21(%rdi), %xmm2 > > > - mov $-21, %dl > > > - pxor %xmm1, %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(less16bytes) > > > - mov -8(%rdi), %rax > > > - mov -8(%rsi), %rcx > > > - cmp %rax, %rcx > > > - jne L(diffin8bytes) > > > - xor %eax, %eax > > > - ret > > > - > > > - .p2align 4 > > > -L(70bytes): > > > - movdqu -70(%rsi), %xmm1 > > > - movdqu -70(%rdi), %xmm2 > > > - mov $-70, %dl > > > - pxor %xmm1, %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(less16bytes) > > > -L(54bytes): > > > - movdqu -54(%rsi), %xmm1 > > > - movdqu -54(%rdi), %xmm2 > > > - mov $-54, %dl > > > - pxor %xmm1, %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(less16bytes) > > > -L(38bytes): > > > - movdqu -38(%rsi), %xmm1 > > > - movdqu -38(%rdi), %xmm2 > > > - mov $-38, %dl > > > - pxor %xmm1, %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(less16bytes) > > > -L(22bytes): > > > - movdqu -22(%rsi), %xmm1 > > > - movdqu -22(%rdi), %xmm2 > > > - mov $-22, %dl > > > - pxor %xmm1, %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(less16bytes) > > > - mov -8(%rdi), %rax > > > - mov -8(%rsi), %rcx > > > - cmp %rax, %rcx > > > - jne L(diffin8bytes) > > > - xor %eax, %eax > > > - ret > > > - > > > - .p2align 4 > > > -L(71bytes): > > > - movdqu -71(%rsi), %xmm1 > > > - movdqu -71(%rdi), %xmm2 > > > - mov $-71, %dl > > > - pxor %xmm1, %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(less16bytes) > > > -L(55bytes): > > > - movdqu -55(%rdi), %xmm2 > > > - movdqu -55(%rsi), %xmm1 > > > - mov $-55, %dl > > > - pxor %xmm1, %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(less16bytes) > > > -L(39bytes): > > > - movdqu -39(%rdi), %xmm2 > > > - movdqu -39(%rsi), %xmm1 > > > - mov $-39, %dl > > > - pxor %xmm1, %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(less16bytes) > > > -L(23bytes): > > > - movdqu -23(%rdi), %xmm2 > > > - movdqu -23(%rsi), %xmm1 > > > - mov $-23, %dl > > > - pxor %xmm1, %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(less16bytes) > > > - mov -8(%rdi), %rax > > > - mov -8(%rsi), %rcx > > > - cmp %rax, %rcx > > > - jne L(diffin8bytes) > > > - xor %eax, %eax > > > - ret > > > -# endif > > > - > > > - .p2align 4 > > > -L(72bytes): > > > - movdqu -72(%rsi), %xmm1 > > > - movdqu -72(%rdi), %xmm2 > > > - mov $-72, %dl > > > - pxor %xmm1, %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(less16bytes) > > > -L(56bytes): > > > - movdqu -56(%rdi), %xmm2 > > > - movdqu -56(%rsi), %xmm1 > > > - mov $-56, %dl > > > - pxor %xmm1, %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(less16bytes) > > > -L(40bytes): > > > - movdqu -40(%rdi), %xmm2 > > > - movdqu -40(%rsi), %xmm1 > > > - mov $-40, %dl > > > - pxor %xmm1, %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(less16bytes) > > > -L(24bytes): > > > - movdqu -24(%rdi), %xmm2 > > > - movdqu -24(%rsi), %xmm1 > > > - mov $-24, %dl > > > - pxor %xmm1, %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(less16bytes) > > > - > > > - mov -8(%rsi), %rcx > > > - mov -8(%rdi), %rax > > > - cmp %rax, %rcx > > > - jne L(diffin8bytes) > > > - xor %eax, %eax > > > - ret > > > - > > > -# ifndef USE_AS_WMEMCMP > > > -/* unreal cases for wmemcmp */ > > > - .p2align 4 > > > -L(73bytes): > > > - movdqu -73(%rsi), %xmm1 > > > - movdqu -73(%rdi), %xmm2 > > > - mov $-73, %dl > > > - pxor %xmm1, %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(less16bytes) > > > -L(57bytes): > > > 
- movdqu -57(%rdi), %xmm2 > > > - movdqu -57(%rsi), %xmm1 > > > - mov $-57, %dl > > > - pxor %xmm1, %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(less16bytes) > > > -L(41bytes): > > > - movdqu -41(%rdi), %xmm2 > > > - movdqu -41(%rsi), %xmm1 > > > - mov $-41, %dl > > > - pxor %xmm1, %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(less16bytes) > > > -L(25bytes): > > > - movdqu -25(%rdi), %xmm2 > > > - movdqu -25(%rsi), %xmm1 > > > - mov $-25, %dl > > > - pxor %xmm1, %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(less16bytes) > > > - mov -9(%rdi), %rax > > > - mov -9(%rsi), %rcx > > > - cmp %rax, %rcx > > > - jne L(diffin8bytes) > > > - movzbl -1(%rdi), %eax > > > - movzbl -1(%rsi), %ecx > > > - sub %ecx, %eax > > > - ret > > > - > > > - .p2align 4 > > > -L(74bytes): > > > - movdqu -74(%rsi), %xmm1 > > > - movdqu -74(%rdi), %xmm2 > > > - mov $-74, %dl > > > - pxor %xmm1, %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(less16bytes) > > > -L(58bytes): > > > - movdqu -58(%rdi), %xmm2 > > > - movdqu -58(%rsi), %xmm1 > > > - mov $-58, %dl > > > - pxor %xmm1, %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(less16bytes) > > > -L(42bytes): > > > - movdqu -42(%rdi), %xmm2 > > > - movdqu -42(%rsi), %xmm1 > > > - mov $-42, %dl > > > - pxor %xmm1, %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(less16bytes) > > > -L(26bytes): > > > - movdqu -26(%rdi), %xmm2 > > > - movdqu -26(%rsi), %xmm1 > > > - mov $-26, %dl > > > - pxor %xmm1, %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(less16bytes) > > > - mov -10(%rdi), %rax > > > - mov -10(%rsi), %rcx > > > - cmp %rax, %rcx > > > - jne L(diffin8bytes) > > > - movzwl -2(%rdi), %eax > > > - movzwl -2(%rsi), %ecx > > > - jmp L(diffin2bytes) > > > - > > > - .p2align 4 > > > -L(75bytes): > > > - movdqu -75(%rsi), %xmm1 > > > - movdqu -75(%rdi), %xmm2 > > > - mov $-75, %dl > > > - pxor %xmm1, %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(less16bytes) > > > -L(59bytes): > > > - movdqu -59(%rdi), %xmm2 > > > - movdqu -59(%rsi), %xmm1 > > > - mov $-59, %dl > > > - pxor %xmm1, %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(less16bytes) > > > -L(43bytes): > > > - movdqu -43(%rdi), %xmm2 > > > - movdqu -43(%rsi), %xmm1 > > > - mov $-43, %dl > > > - pxor %xmm1, %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(less16bytes) > > > -L(27bytes): > > > - movdqu -27(%rdi), %xmm2 > > > - movdqu -27(%rsi), %xmm1 > > > - mov $-27, %dl > > > - pxor %xmm1, %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(less16bytes) > > > - mov -11(%rdi), %rax > > > - mov -11(%rsi), %rcx > > > - cmp %rax, %rcx > > > - jne L(diffin8bytes) > > > - mov -4(%rdi), %eax > > > - mov -4(%rsi), %ecx > > > - cmp %eax, %ecx > > > - jne L(diffin4bytes) > > > - xor %eax, %eax > > > - ret > > > -# endif > > > - .p2align 4 > > > -L(76bytes): > > > - movdqu -76(%rsi), %xmm1 > > > - movdqu -76(%rdi), %xmm2 > > > - mov $-76, %dl > > > - pxor %xmm1, %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(less16bytes) > > > -L(60bytes): > > > - movdqu -60(%rdi), %xmm2 > > > - movdqu -60(%rsi), %xmm1 > > > - mov $-60, %dl > > > - pxor %xmm1, %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(less16bytes) > > > -L(44bytes): > > > - movdqu -44(%rdi), %xmm2 > > > - movdqu -44(%rsi), %xmm1 > > > - mov $-44, %dl > > > - pxor %xmm1, %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(less16bytes) > > > -L(28bytes): > > > - movdqu -28(%rdi), %xmm2 > > > - movdqu -28(%rsi), %xmm1 > > > - mov $-28, %dl > > > - pxor %xmm1, %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(less16bytes) > > > - mov -12(%rdi), %rax > > > - mov -12(%rsi), %rcx > > > - cmp %rax, 
%rcx > > > - jne L(diffin8bytes) > > > - mov -4(%rsi), %ecx > > > -# ifndef USE_AS_WMEMCMP > > > - mov -4(%rdi), %eax > > > - cmp %eax, %ecx > > > -# else > > > - cmp -4(%rdi), %ecx > > > -# endif > > > - jne L(diffin4bytes) > > > - xor %eax, %eax > > > - ret > > > - > > > -# ifndef USE_AS_WMEMCMP > > > -/* unreal cases for wmemcmp */ > > > - .p2align 4 > > > -L(77bytes): > > > - movdqu -77(%rsi), %xmm1 > > > - movdqu -77(%rdi), %xmm2 > > > - mov $-77, %dl > > > - pxor %xmm1, %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(less16bytes) > > > -L(61bytes): > > > - movdqu -61(%rdi), %xmm2 > > > - movdqu -61(%rsi), %xmm1 > > > - mov $-61, %dl > > > - pxor %xmm1, %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(less16bytes) > > > -L(45bytes): > > > - movdqu -45(%rdi), %xmm2 > > > - movdqu -45(%rsi), %xmm1 > > > - mov $-45, %dl > > > - pxor %xmm1, %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(less16bytes) > > > -L(29bytes): > > > - movdqu -29(%rdi), %xmm2 > > > - movdqu -29(%rsi), %xmm1 > > > - mov $-29, %dl > > > - pxor %xmm1, %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(less16bytes) > > > - > > > - mov -13(%rdi), %rax > > > - mov -13(%rsi), %rcx > > > - cmp %rax, %rcx > > > - jne L(diffin8bytes) > > > - > > > - mov -8(%rdi), %rax > > > - mov -8(%rsi), %rcx > > > - cmp %rax, %rcx > > > - jne L(diffin8bytes) > > > - xor %eax, %eax > > > - ret > > > - > > > - .p2align 4 > > > -L(78bytes): > > > - movdqu -78(%rsi), %xmm1 > > > - movdqu -78(%rdi), %xmm2 > > > - mov $-78, %dl > > > - pxor %xmm1, %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(less16bytes) > > > -L(62bytes): > > > - movdqu -62(%rdi), %xmm2 > > > - movdqu -62(%rsi), %xmm1 > > > - mov $-62, %dl > > > - pxor %xmm1, %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(less16bytes) > > > -L(46bytes): > > > - movdqu -46(%rdi), %xmm2 > > > - movdqu -46(%rsi), %xmm1 > > > - mov $-46, %dl > > > - pxor %xmm1, %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(less16bytes) > > > -L(30bytes): > > > - movdqu -30(%rdi), %xmm2 > > > - movdqu -30(%rsi), %xmm1 > > > - mov $-30, %dl > > > - pxor %xmm1, %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(less16bytes) > > > - mov -14(%rdi), %rax > > > - mov -14(%rsi), %rcx > > > - cmp %rax, %rcx > > > - jne L(diffin8bytes) > > > - mov -8(%rdi), %rax > > > - mov -8(%rsi), %rcx > > > - cmp %rax, %rcx > > > - jne L(diffin8bytes) > > > - xor %eax, %eax > > > - ret > > > - > > > - .p2align 4 > > > -L(79bytes): > > > - movdqu -79(%rsi), %xmm1 > > > - movdqu -79(%rdi), %xmm2 > > > - mov $-79, %dl > > > - pxor %xmm1, %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(less16bytes) > > > -L(63bytes): > > > - movdqu -63(%rdi), %xmm2 > > > - movdqu -63(%rsi), %xmm1 > > > - mov $-63, %dl > > > - pxor %xmm1, %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(less16bytes) > > > -L(47bytes): > > > - movdqu -47(%rdi), %xmm2 > > > - movdqu -47(%rsi), %xmm1 > > > - mov $-47, %dl > > > - pxor %xmm1, %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(less16bytes) > > > -L(31bytes): > > > - movdqu -31(%rdi), %xmm2 > > > - movdqu -31(%rsi), %xmm1 > > > - mov $-31, %dl > > > - pxor %xmm1, %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(less16bytes) > > > - mov -15(%rdi), %rax > > > - mov -15(%rsi), %rcx > > > - cmp %rax, %rcx > > > - jne L(diffin8bytes) > > > - mov -8(%rdi), %rax > > > - mov -8(%rsi), %rcx > > > - cmp %rax, %rcx > > > - jne L(diffin8bytes) > > > - xor %eax, %eax > > > - ret > > > -# endif > > > - .p2align 4 > > > -L(64bytes): > > > - movdqu -64(%rdi), %xmm2 > > > - movdqu -64(%rsi), %xmm1 > > > - mov $-64, %dl > > > - pxor %xmm1, %xmm2 > 
> > - ptest %xmm2, %xmm0 > > > - jnc L(less16bytes) > > > -L(48bytes): > > > - movdqu -48(%rdi), %xmm2 > > > - movdqu -48(%rsi), %xmm1 > > > - mov $-48, %dl > > > - pxor %xmm1, %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(less16bytes) > > > -L(32bytes): > > > - movdqu -32(%rdi), %xmm2 > > > - movdqu -32(%rsi), %xmm1 > > > - mov $-32, %dl > > > - pxor %xmm1, %xmm2 > > > - ptest %xmm2, %xmm0 > > > - jnc L(less16bytes) > > > - > > > - mov -16(%rdi), %rax > > > - mov -16(%rsi), %rcx > > > - cmp %rax, %rcx > > > - jne L(diffin8bytes) > > > - > > > - mov -8(%rdi), %rax > > > - mov -8(%rsi), %rcx > > > - cmp %rax, %rcx > > > - jne L(diffin8bytes) > > > - xor %eax, %eax > > > - ret > > > - > > > -/* > > > - * Aligned 8 bytes to avoid 2 branch "taken" in one 16 alinged code block. > > > - */ > > > - .p2align 3 > > > -L(less16bytes): > > > - movsbq %dl, %rdx > > > - mov (%rsi, %rdx), %rcx > > > - mov (%rdi, %rdx), %rax > > > - cmp %rax, %rcx > > > - jne L(diffin8bytes) > > > - mov 8(%rsi, %rdx), %rcx > > > - mov 8(%rdi, %rdx), %rax > > > -L(diffin8bytes): > > > - cmp %eax, %ecx > > > - jne L(diffin4bytes) > > > - shr $32, %rcx > > > - shr $32, %rax > > > - > > > + pmovmskb %xmm0, %ecx > > > + incw %cx > > > + jnz L(loop_end_ret) > > > + > > > + pmovmskb %xmm1, %ecx > > > + notw %cx > > > + sall $16, %ecx > > > + jnz L(loop_end_ret) > > > + > > > + pmovmskb %xmm2, %ecx > > > + notw %cx > > > + shlq $32, %rcx > > > + jnz L(loop_end_ret) > > > + > > > + addq $48, %rdi > > > + addq $48, %rsi > > > + movq %rax, %rcx > > > + > > > + .p2align 4,, 6 > > > +L(loop_end_ret): > > > + bsfq %rcx, %rcx > > > # ifdef USE_AS_WMEMCMP > > > -/* for wmemcmp */ > > > - cmp %eax, %ecx > > > - jne L(diffin4bytes) > > > - xor %eax, %eax > > > - ret > > > -# endif > > > - > > > -L(diffin4bytes): > > > -# ifndef USE_AS_WMEMCMP > > > - cmp %cx, %ax > > > - jne L(diffin2bytes) > > > - shr $16, %ecx > > > - shr $16, %eax > > > -L(diffin2bytes): > > > - cmp %cl, %al > > > - jne L(end) > > > - and $0xffff, %eax > > > - and $0xffff, %ecx > > > - sub %ecx, %eax > > > - ret > > > - > > > - .p2align 4 > > > -L(end): > > > - and $0xff, %eax > > > - and $0xff, %ecx > > > - sub %ecx, %eax > > > - ret > > > + movl (%rdi, %rcx), %eax > > > + xorl %edx, %edx > > > + cmpl (%rsi, %rcx), %eax > > > + setg %dl > > > + leal -1(%rdx, %rdx), %eax > > > # else > > > - > > > -/* for wmemcmp */ > > > - mov $1, %eax > > > - jl L(nequal_bigger) > > > - neg %eax > > > - ret > > > - > > > - .p2align 4 > > > -L(nequal_bigger): > > > - ret > > > - > > > -L(unreal_case): > > > - xor %eax, %eax > > > - ret > > > + movzbl (%rdi, %rcx), %eax > > > + movzbl (%rsi, %rcx), %ecx > > > + subl %ecx, %eax > > > # endif > > > - > > > + ret > > > END (MEMCMP) > > > - > > > - .section .rodata.sse4.1,"a",@progbits > > > - .p2align 3 > > > -# ifndef USE_AS_WMEMCMP > > > -L(table_64bytes): > > > - .int JMPTBL (L(0bytes), L(table_64bytes)) > > > - .int JMPTBL (L(1bytes), L(table_64bytes)) > > > - .int JMPTBL (L(2bytes), L(table_64bytes)) > > > - .int JMPTBL (L(3bytes), L(table_64bytes)) > > > - .int JMPTBL (L(4bytes), L(table_64bytes)) > > > - .int JMPTBL (L(5bytes), L(table_64bytes)) > > > - .int JMPTBL (L(6bytes), L(table_64bytes)) > > > - .int JMPTBL (L(7bytes), L(table_64bytes)) > > > - .int JMPTBL (L(8bytes), L(table_64bytes)) > > > - .int JMPTBL (L(9bytes), L(table_64bytes)) > > > - .int JMPTBL (L(10bytes), L(table_64bytes)) > > > - .int JMPTBL (L(11bytes), L(table_64bytes)) > > > - .int JMPTBL (L(12bytes), L(table_64bytes)) > > > - .int JMPTBL (L(13bytes), 
L(table_64bytes)) > > > - .int JMPTBL (L(14bytes), L(table_64bytes)) > > > - .int JMPTBL (L(15bytes), L(table_64bytes)) > > > - .int JMPTBL (L(16bytes), L(table_64bytes)) > > > - .int JMPTBL (L(17bytes), L(table_64bytes)) > > > - .int JMPTBL (L(18bytes), L(table_64bytes)) > > > - .int JMPTBL (L(19bytes), L(table_64bytes)) > > > - .int JMPTBL (L(20bytes), L(table_64bytes)) > > > - .int JMPTBL (L(21bytes), L(table_64bytes)) > > > - .int JMPTBL (L(22bytes), L(table_64bytes)) > > > - .int JMPTBL (L(23bytes), L(table_64bytes)) > > > - .int JMPTBL (L(24bytes), L(table_64bytes)) > > > - .int JMPTBL (L(25bytes), L(table_64bytes)) > > > - .int JMPTBL (L(26bytes), L(table_64bytes)) > > > - .int JMPTBL (L(27bytes), L(table_64bytes)) > > > - .int JMPTBL (L(28bytes), L(table_64bytes)) > > > - .int JMPTBL (L(29bytes), L(table_64bytes)) > > > - .int JMPTBL (L(30bytes), L(table_64bytes)) > > > - .int JMPTBL (L(31bytes), L(table_64bytes)) > > > - .int JMPTBL (L(32bytes), L(table_64bytes)) > > > - .int JMPTBL (L(33bytes), L(table_64bytes)) > > > - .int JMPTBL (L(34bytes), L(table_64bytes)) > > > - .int JMPTBL (L(35bytes), L(table_64bytes)) > > > - .int JMPTBL (L(36bytes), L(table_64bytes)) > > > - .int JMPTBL (L(37bytes), L(table_64bytes)) > > > - .int JMPTBL (L(38bytes), L(table_64bytes)) > > > - .int JMPTBL (L(39bytes), L(table_64bytes)) > > > - .int JMPTBL (L(40bytes), L(table_64bytes)) > > > - .int JMPTBL (L(41bytes), L(table_64bytes)) > > > - .int JMPTBL (L(42bytes), L(table_64bytes)) > > > - .int JMPTBL (L(43bytes), L(table_64bytes)) > > > - .int JMPTBL (L(44bytes), L(table_64bytes)) > > > - .int JMPTBL (L(45bytes), L(table_64bytes)) > > > - .int JMPTBL (L(46bytes), L(table_64bytes)) > > > - .int JMPTBL (L(47bytes), L(table_64bytes)) > > > - .int JMPTBL (L(48bytes), L(table_64bytes)) > > > - .int JMPTBL (L(49bytes), L(table_64bytes)) > > > - .int JMPTBL (L(50bytes), L(table_64bytes)) > > > - .int JMPTBL (L(51bytes), L(table_64bytes)) > > > - .int JMPTBL (L(52bytes), L(table_64bytes)) > > > - .int JMPTBL (L(53bytes), L(table_64bytes)) > > > - .int JMPTBL (L(54bytes), L(table_64bytes)) > > > - .int JMPTBL (L(55bytes), L(table_64bytes)) > > > - .int JMPTBL (L(56bytes), L(table_64bytes)) > > > - .int JMPTBL (L(57bytes), L(table_64bytes)) > > > - .int JMPTBL (L(58bytes), L(table_64bytes)) > > > - .int JMPTBL (L(59bytes), L(table_64bytes)) > > > - .int JMPTBL (L(60bytes), L(table_64bytes)) > > > - .int JMPTBL (L(61bytes), L(table_64bytes)) > > > - .int JMPTBL (L(62bytes), L(table_64bytes)) > > > - .int JMPTBL (L(63bytes), L(table_64bytes)) > > > - .int JMPTBL (L(64bytes), L(table_64bytes)) > > > - .int JMPTBL (L(65bytes), L(table_64bytes)) > > > - .int JMPTBL (L(66bytes), L(table_64bytes)) > > > - .int JMPTBL (L(67bytes), L(table_64bytes)) > > > - .int JMPTBL (L(68bytes), L(table_64bytes)) > > > - .int JMPTBL (L(69bytes), L(table_64bytes)) > > > - .int JMPTBL (L(70bytes), L(table_64bytes)) > > > - .int JMPTBL (L(71bytes), L(table_64bytes)) > > > - .int JMPTBL (L(72bytes), L(table_64bytes)) > > > - .int JMPTBL (L(73bytes), L(table_64bytes)) > > > - .int JMPTBL (L(74bytes), L(table_64bytes)) > > > - .int JMPTBL (L(75bytes), L(table_64bytes)) > > > - .int JMPTBL (L(76bytes), L(table_64bytes)) > > > - .int JMPTBL (L(77bytes), L(table_64bytes)) > > > - .int JMPTBL (L(78bytes), L(table_64bytes)) > > > - .int JMPTBL (L(79bytes), L(table_64bytes)) > > > -# else > > > -L(table_64bytes): > > > - .int JMPTBL (L(0bytes), L(table_64bytes)) > > > - .int JMPTBL (L(unreal_case), L(table_64bytes)) > > > - .int JMPTBL 
(L(unreal_case), L(table_64bytes)) > > > - .int JMPTBL (L(unreal_case), L(table_64bytes)) > > > - .int JMPTBL (L(4bytes), L(table_64bytes)) > > > - .int JMPTBL (L(unreal_case), L(table_64bytes)) > > > - .int JMPTBL (L(unreal_case), L(table_64bytes)) > > > - .int JMPTBL (L(unreal_case), L(table_64bytes)) > > > - .int JMPTBL (L(8bytes), L(table_64bytes)) > > > - .int JMPTBL (L(unreal_case), L(table_64bytes)) > > > - .int JMPTBL (L(unreal_case), L(table_64bytes)) > > > - .int JMPTBL (L(unreal_case), L(table_64bytes)) > > > - .int JMPTBL (L(12bytes), L(table_64bytes)) > > > - .int JMPTBL (L(unreal_case), L(table_64bytes)) > > > - .int JMPTBL (L(unreal_case), L(table_64bytes)) > > > - .int JMPTBL (L(unreal_case), L(table_64bytes)) > > > - .int JMPTBL (L(16bytes), L(table_64bytes)) > > > - .int JMPTBL (L(unreal_case), L(table_64bytes)) > > > - .int JMPTBL (L(unreal_case), L(table_64bytes)) > > > - .int JMPTBL (L(unreal_case), L(table_64bytes)) > > > - .int JMPTBL (L(20bytes), L(table_64bytes)) > > > - .int JMPTBL (L(unreal_case), L(table_64bytes)) > > > - .int JMPTBL (L(unreal_case), L(table_64bytes)) > > > - .int JMPTBL (L(unreal_case), L(table_64bytes)) > > > - .int JMPTBL (L(24bytes), L(table_64bytes)) > > > - .int JMPTBL (L(unreal_case), L(table_64bytes)) > > > - .int JMPTBL (L(unreal_case), L(table_64bytes)) > > > - .int JMPTBL (L(unreal_case), L(table_64bytes)) > > > - .int JMPTBL (L(28bytes), L(table_64bytes)) > > > - .int JMPTBL (L(unreal_case), L(table_64bytes)) > > > - .int JMPTBL (L(unreal_case), L(table_64bytes)) > > > - .int JMPTBL (L(unreal_case), L(table_64bytes)) > > > - .int JMPTBL (L(32bytes), L(table_64bytes)) > > > - .int JMPTBL (L(unreal_case), L(table_64bytes)) > > > - .int JMPTBL (L(unreal_case), L(table_64bytes)) > > > - .int JMPTBL (L(unreal_case), L(table_64bytes)) > > > - .int JMPTBL (L(36bytes), L(table_64bytes)) > > > - .int JMPTBL (L(unreal_case), L(table_64bytes)) > > > - .int JMPTBL (L(unreal_case), L(table_64bytes)) > > > - .int JMPTBL (L(unreal_case), L(table_64bytes)) > > > - .int JMPTBL (L(40bytes), L(table_64bytes)) > > > - .int JMPTBL (L(unreal_case), L(table_64bytes)) > > > - .int JMPTBL (L(unreal_case), L(table_64bytes)) > > > - .int JMPTBL (L(unreal_case), L(table_64bytes)) > > > - .int JMPTBL (L(44bytes), L(table_64bytes)) > > > - .int JMPTBL (L(unreal_case), L(table_64bytes)) > > > - .int JMPTBL (L(unreal_case), L(table_64bytes)) > > > - .int JMPTBL (L(unreal_case), L(table_64bytes)) > > > - .int JMPTBL (L(48bytes), L(table_64bytes)) > > > - .int JMPTBL (L(unreal_case), L(table_64bytes)) > > > - .int JMPTBL (L(unreal_case), L(table_64bytes)) > > > - .int JMPTBL (L(unreal_case), L(table_64bytes)) > > > - .int JMPTBL (L(52bytes), L(table_64bytes)) > > > - .int JMPTBL (L(unreal_case), L(table_64bytes)) > > > - .int JMPTBL (L(unreal_case), L(table_64bytes)) > > > - .int JMPTBL (L(unreal_case), L(table_64bytes)) > > > - .int JMPTBL (L(56bytes), L(table_64bytes)) > > > - .int JMPTBL (L(unreal_case), L(table_64bytes)) > > > - .int JMPTBL (L(unreal_case), L(table_64bytes)) > > > - .int JMPTBL (L(unreal_case), L(table_64bytes)) > > > - .int JMPTBL (L(60bytes), L(table_64bytes)) > > > - .int JMPTBL (L(unreal_case), L(table_64bytes)) > > > - .int JMPTBL (L(unreal_case), L(table_64bytes)) > > > - .int JMPTBL (L(unreal_case), L(table_64bytes)) > > > - .int JMPTBL (L(64bytes), L(table_64bytes)) > > > - .int JMPTBL (L(unreal_case), L(table_64bytes)) > > > - .int JMPTBL (L(unreal_case), L(table_64bytes)) > > > - .int JMPTBL (L(unreal_case), L(table_64bytes)) > > > - 
.int JMPTBL (L(68bytes), L(table_64bytes)) > > > - .int JMPTBL (L(unreal_case), L(table_64bytes)) > > > - .int JMPTBL (L(unreal_case), L(table_64bytes)) > > > - .int JMPTBL (L(unreal_case), L(table_64bytes)) > > > - .int JMPTBL (L(72bytes), L(table_64bytes)) > > > - .int JMPTBL (L(unreal_case), L(table_64bytes)) > > > - .int JMPTBL (L(unreal_case), L(table_64bytes)) > > > - .int JMPTBL (L(unreal_case), L(table_64bytes)) > > > - .int JMPTBL (L(76bytes), L(table_64bytes)) > > > - .int JMPTBL (L(unreal_case), L(table_64bytes)) > > > - .int JMPTBL (L(unreal_case), L(table_64bytes)) > > > - .int JMPTBL (L(unreal_case), L(table_64bytes)) > > > -# endif > > > #endif > > > -- > > > 2.25.1 > > > > > > > LGTM. > > > > Reviewed-by: H.J. Lu > Thanks. Pushed. > > > > Thanks. > > > > -- > > H.J. I would like to backport this patch to release branches. Any comments or objections? --Sunil