From mboxrd@z Thu Jan 1 00:00:00 1970 Return-Path: Received: from mail-pl1-x635.google.com (mail-pl1-x635.google.com [IPv6:2607:f8b0:4864:20::635]) by sourceware.org (Postfix) with ESMTPS id A34493858C53 for ; Thu, 14 Apr 2022 18:04:57 +0000 (GMT) DMARC-Filter: OpenDMARC Filter v1.4.1 sourceware.org A34493858C53 Received: by mail-pl1-x635.google.com with SMTP id c23so5348346plo.0 for ; Thu, 14 Apr 2022 11:04:57 -0700 (PDT) X-Google-DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=1e100.net; s=20210112; h=x-gm-message-state:mime-version:references:in-reply-to:from:date :message-id:subject:to:cc; bh=bAqWC949wwWK4bep54bxAg+dhgjyZn/dvy7KFFx6Cdw=; b=tBW3yE3lQTI5Agx42S9OsqvZYVdK8DOqiyDqhSp1Z3+Xp0kona5RmJAx15L0w/xPkx 4h09k05KzSzAbAicTMyZs0a/eL3powOCb1nrZDUm1x79YHLxZ5RM/hz8lY9nr+3QPnrW SQTeudNM+zQc7DdY836VBYgrClNrV63Br//EQsbGcJEHYIqNWF4QTnRz3TNvXJF42zLT PJ+JImOz5ZOHn9RR3khpChAn/noXK/5ILwKi2YVr2LmjxufkN/OomCsaMJTWeNt7N0RN zrEJNlsfl2AhS/xy36fbDkRMqPD1PWw5eaLPaAzv1u4T/X6yfn8eGOdQBhC4s6x5P9q2 HG9g== X-Gm-Message-State: AOAM533o8hoLZ5OX7vIbST0ErR52rxA+6aXaSeSp9CrVoMJBh7YguawU UVwjSBokHvCxxIpSnIN0RJHw7jFh9ZsVusqEtmc= X-Google-Smtp-Source: ABdhPJxsIOr4R0i/sgfOgB0ACn2AiOhjLFQlitzG1WJ14TCoIaMXVsjlrLuSJEHYgimSlDYVH3l4pVJuu1RzQfFZlo0= X-Received: by 2002:a17:902:d506:b0:158:7a91:d066 with SMTP id b6-20020a170902d50600b001587a91d066mr17188094plg.102.1649959496301; Thu, 14 Apr 2022 11:04:56 -0700 (PDT) MIME-Version: 1.0 References: <20220325183625.1170867-2-goldstein.w.n@gmail.com> <20220414164739.3146735-1-goldstein.w.n@gmail.com> In-Reply-To: <20220414164739.3146735-1-goldstein.w.n@gmail.com> From: "H.J. Lu" Date: Thu, 14 Apr 2022 11:04:18 -0700 Message-ID: Subject: Re: [PATCH v5 1/6] x86: Remove {w}memcmp-ssse3 To: Noah Goldstein Cc: GNU C Library , "Carlos O'Donell" Content-Type: text/plain; charset="UTF-8" X-Spam-Status: No, score=-3025.5 required=5.0 tests=BAYES_00, DKIM_SIGNED, DKIM_VALID, DKIM_VALID_AU, DKIM_VALID_EF, FREEMAIL_FROM, GIT_PATCH_0, KAM_NUMSUBJECT, KAM_SHORT, RCVD_IN_DNSWL_NONE, SPF_HELO_NONE, SPF_PASS, TXREP, T_SCC_BODY_TEXT_LINE autolearn=ham autolearn_force=no version=3.4.4 X-Spam-Checker-Version: SpamAssassin 3.4.4 (2020-01-24) on server2.sourceware.org X-BeenThere: libc-alpha@sourceware.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: Libc-alpha mailing list List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , X-List-Received-Date: Thu, 14 Apr 2022 18:05:02 -0000 On Thu, Apr 14, 2022 at 9:47 AM Noah Goldstein wrote: > > With SSE2, SSE4.1, AVX2, and EVEX versions very few targets prefer > SSSE3. As a result it is no longer worth it to keep the SSSE3 > versions given the code size cost. > --- > sysdeps/x86_64/multiarch/Makefile | 2 - > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 4 - > sysdeps/x86_64/multiarch/ifunc-memcmp.h | 4 - > sysdeps/x86_64/multiarch/memcmp-ssse3.S | 1992 -------------------- > sysdeps/x86_64/multiarch/wmemcmp-ssse3.S | 4 - > 5 files changed, 2006 deletions(-) > delete mode 100644 sysdeps/x86_64/multiarch/memcmp-ssse3.S > delete mode 100644 sysdeps/x86_64/multiarch/wmemcmp-ssse3.S > > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile > index 6507d1b7fa..51222dfab1 100644 > --- a/sysdeps/x86_64/multiarch/Makefile > +++ b/sysdeps/x86_64/multiarch/Makefile > @@ -12,7 +12,6 @@ sysdep_routines += \ > memcmp-evex-movbe \ > memcmp-sse2 \ > memcmp-sse4 \ > - memcmp-ssse3 \ > memcmpeq-avx2 \ > memcmpeq-avx2-rtm \ > memcmpeq-evex \ > @@ -179,7 +178,6 @@ sysdep_routines += \ > wmemcmp-c \ > wmemcmp-evex-movbe \ > wmemcmp-sse4 \ > - wmemcmp-ssse3 \ > # sysdep_routines > endif > > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > index 40cc6cc49e..f389928a4e 100644 > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > @@ -98,8 +98,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > __memcmp_evex_movbe) > IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSE4_1), > __memcmp_sse4_1) > - IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSSE3), > - __memcmp_ssse3) > IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_sse2)) > > #ifdef SHARED > @@ -844,8 +842,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > __wmemcmp_evex_movbe) > IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSE4_1), > __wmemcmp_sse4_1) > - IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSSE3), > - __wmemcmp_ssse3) > IFUNC_IMPL_ADD (array, i, wmemcmp, 1, __wmemcmp_sse2)) > > /* Support sysdeps/x86_64/multiarch/wmemset.c. */ > diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h b/sysdeps/x86_64/multiarch/ifunc-memcmp.h > index cd12613699..44759a3ad5 100644 > --- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h > +++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h > @@ -20,7 +20,6 @@ > # include > > extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; > -extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; > extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden; > extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe) attribute_hidden; > extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe_rtm) attribute_hidden; > @@ -50,8 +49,5 @@ IFUNC_SELECTOR (void) > if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1)) > return OPTIMIZE (sse4_1); > > - if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3)) > - return OPTIMIZE (ssse3); > - > return OPTIMIZE (sse2); > } > diff --git a/sysdeps/x86_64/multiarch/memcmp-ssse3.S b/sysdeps/x86_64/multiarch/memcmp-ssse3.S > deleted file mode 100644 > index df1b1fc494..0000000000 > --- a/sysdeps/x86_64/multiarch/memcmp-ssse3.S > +++ /dev/null > @@ -1,1992 +0,0 @@ > -/* memcmp with SSSE3, wmemcmp with SSSE3 > - Copyright (C) 2011-2022 Free Software Foundation, Inc. > - This file is part of the GNU C Library. > - > - The GNU C Library is free software; you can redistribute it and/or > - modify it under the terms of the GNU Lesser General Public > - License as published by the Free Software Foundation; either > - version 2.1 of the License, or (at your option) any later version. > - > - The GNU C Library is distributed in the hope that it will be useful, > - but WITHOUT ANY WARRANTY; without even the implied warranty of > - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > - Lesser General Public License for more details. > - > - You should have received a copy of the GNU Lesser General Public > - License along with the GNU C Library; if not, see > - . */ > - > -#if IS_IN (libc) > - > -# include > - > -# ifndef MEMCMP > -# define MEMCMP __memcmp_ssse3 > -# endif > - > -/* Warning! > - wmemcmp has to use SIGNED comparison for elements. > - memcmp has to use UNSIGNED comparison for elemnts. > -*/ > - > - atom_text_section > -ENTRY (MEMCMP) > -# ifdef USE_AS_WMEMCMP > - shl $2, %RDX_LP > - test %RDX_LP, %RDX_LP > - jz L(equal) > -# elif defined __ILP32__ > - /* Clear the upper 32 bits. */ > - mov %edx, %edx > -# endif > - mov %rdx, %rcx > - mov %rdi, %rdx > - cmp $48, %rcx; > - jae L(48bytesormore) /* LEN => 48 */ > - > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > - .p2align 4 > -/* ECX >= 32. */ > -L(48bytesormore): > - movdqu (%rdi), %xmm3 > - movdqu (%rsi), %xmm0 > - pcmpeqb %xmm0, %xmm3 > - pmovmskb %xmm3, %edx > - lea 16(%rdi), %rdi > - lea 16(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(less16bytes) > - mov %edi, %edx > - and $0xf, %edx > - xor %rdx, %rdi > - sub %rdx, %rsi > - add %rdx, %rcx > - mov %esi, %edx > - and $0xf, %edx > - jz L(shr_0) > - xor %rdx, %rsi > - > -# ifndef USE_AS_WMEMCMP > - cmp $8, %edx > - jae L(next_unaligned_table) > - cmp $0, %edx > - je L(shr_0) > - cmp $1, %edx > - je L(shr_1) > - cmp $2, %edx > - je L(shr_2) > - cmp $3, %edx > - je L(shr_3) > - cmp $4, %edx > - je L(shr_4) > - cmp $5, %edx > - je L(shr_5) > - cmp $6, %edx > - je L(shr_6) > - jmp L(shr_7) > - > - .p2align 2 > -L(next_unaligned_table): > - cmp $8, %edx > - je L(shr_8) > - cmp $9, %edx > - je L(shr_9) > - cmp $10, %edx > - je L(shr_10) > - cmp $11, %edx > - je L(shr_11) > - cmp $12, %edx > - je L(shr_12) > - cmp $13, %edx > - je L(shr_13) > - cmp $14, %edx > - je L(shr_14) > - jmp L(shr_15) > -# else > - cmp $0, %edx > - je L(shr_0) > - cmp $4, %edx > - je L(shr_4) > - cmp $8, %edx > - je L(shr_8) > - jmp L(shr_12) > -# endif > - > - .p2align 4 > -L(shr_0): > - cmp $80, %rcx > - lea -48(%rcx), %rcx > - jae L(shr_0_gobble) > - xor %eax, %eax > - movdqa (%rsi), %xmm1 > - pcmpeqb (%rdi), %xmm1 > - movdqa 16(%rsi), %xmm2 > - pcmpeqb 16(%rdi), %xmm2 > - pand %xmm1, %xmm2 > - pmovmskb %xmm2, %edx > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > - .p2align 4 > -L(shr_0_gobble): > - movdqa (%rsi), %xmm0 > - xor %eax, %eax > - pcmpeqb (%rdi), %xmm0 > - sub $32, %rcx > - movdqa 16(%rsi), %xmm2 > - pcmpeqb 16(%rdi), %xmm2 > -L(shr_0_gobble_loop): > - pand %xmm0, %xmm2 > - sub $32, %rcx > - pmovmskb %xmm2, %edx > - movdqa %xmm0, %xmm1 > - movdqa 32(%rsi), %xmm0 > - movdqa 48(%rsi), %xmm2 > - sbb $0xffff, %edx > - pcmpeqb 32(%rdi), %xmm0 > - pcmpeqb 48(%rdi), %xmm2 > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - jz L(shr_0_gobble_loop) > - > - pand %xmm0, %xmm2 > - cmp $0, %rcx > - jge L(next) > - inc %edx > - add $32, %rcx > -L(next): > - test %edx, %edx > - jnz L(exit) > - > - pmovmskb %xmm2, %edx > - movdqa %xmm0, %xmm1 > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > -# ifndef USE_AS_WMEMCMP > - > - .p2align 4 > -L(shr_1): > - cmp $80, %rcx > - lea -48(%rcx), %rcx > - mov %edx, %eax > - jae L(shr_1_gobble) > - > - movdqa 16(%rsi), %xmm1 > - movdqa %xmm1, %xmm2 > - palignr $1, (%rsi), %xmm1 > - pcmpeqb (%rdi), %xmm1 > - > - movdqa 32(%rsi), %xmm3 > - palignr $1, %xmm2, %xmm3 > - pcmpeqb 16(%rdi), %xmm3 > - > - pand %xmm1, %xmm3 > - pmovmskb %xmm3, %edx > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - add $1, %rsi > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > - .p2align 4 > -L(shr_1_gobble): > - sub $32, %rcx > - movdqa 16(%rsi), %xmm0 > - palignr $1, (%rsi), %xmm0 > - pcmpeqb (%rdi), %xmm0 > - > - movdqa 32(%rsi), %xmm3 > - palignr $1, 16(%rsi), %xmm3 > - pcmpeqb 16(%rdi), %xmm3 > - > -L(shr_1_gobble_loop): > - pand %xmm0, %xmm3 > - sub $32, %rcx > - pmovmskb %xmm3, %edx > - movdqa %xmm0, %xmm1 > - > - movdqa 64(%rsi), %xmm3 > - palignr $1, 48(%rsi), %xmm3 > - sbb $0xffff, %edx > - movdqa 48(%rsi), %xmm0 > - palignr $1, 32(%rsi), %xmm0 > - pcmpeqb 32(%rdi), %xmm0 > - lea 32(%rsi), %rsi > - pcmpeqb 48(%rdi), %xmm3 > - > - lea 32(%rdi), %rdi > - jz L(shr_1_gobble_loop) > - pand %xmm0, %xmm3 > - > - cmp $0, %rcx > - jge L(shr_1_gobble_next) > - inc %edx > - add $32, %rcx > -L(shr_1_gobble_next): > - test %edx, %edx > - jnz L(exit) > - > - pmovmskb %xmm3, %edx > - movdqa %xmm0, %xmm1 > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - > - lea 1(%rsi), %rsi > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > - > - .p2align 4 > -L(shr_2): > - cmp $80, %rcx > - lea -48(%rcx), %rcx > - mov %edx, %eax > - jae L(shr_2_gobble) > - > - movdqa 16(%rsi), %xmm1 > - movdqa %xmm1, %xmm2 > - palignr $2, (%rsi), %xmm1 > - pcmpeqb (%rdi), %xmm1 > - > - movdqa 32(%rsi), %xmm3 > - palignr $2, %xmm2, %xmm3 > - pcmpeqb 16(%rdi), %xmm3 > - > - pand %xmm1, %xmm3 > - pmovmskb %xmm3, %edx > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - add $2, %rsi > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > - .p2align 4 > -L(shr_2_gobble): > - sub $32, %rcx > - movdqa 16(%rsi), %xmm0 > - palignr $2, (%rsi), %xmm0 > - pcmpeqb (%rdi), %xmm0 > - > - movdqa 32(%rsi), %xmm3 > - palignr $2, 16(%rsi), %xmm3 > - pcmpeqb 16(%rdi), %xmm3 > - > -L(shr_2_gobble_loop): > - pand %xmm0, %xmm3 > - sub $32, %rcx > - pmovmskb %xmm3, %edx > - movdqa %xmm0, %xmm1 > - > - movdqa 64(%rsi), %xmm3 > - palignr $2, 48(%rsi), %xmm3 > - sbb $0xffff, %edx > - movdqa 48(%rsi), %xmm0 > - palignr $2, 32(%rsi), %xmm0 > - pcmpeqb 32(%rdi), %xmm0 > - lea 32(%rsi), %rsi > - pcmpeqb 48(%rdi), %xmm3 > - > - lea 32(%rdi), %rdi > - jz L(shr_2_gobble_loop) > - pand %xmm0, %xmm3 > - > - cmp $0, %rcx > - jge L(shr_2_gobble_next) > - inc %edx > - add $32, %rcx > -L(shr_2_gobble_next): > - test %edx, %edx > - jnz L(exit) > - > - pmovmskb %xmm3, %edx > - movdqa %xmm0, %xmm1 > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - > - lea 2(%rsi), %rsi > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > - .p2align 4 > -L(shr_3): > - cmp $80, %rcx > - lea -48(%rcx), %rcx > - mov %edx, %eax > - jae L(shr_3_gobble) > - > - movdqa 16(%rsi), %xmm1 > - movdqa %xmm1, %xmm2 > - palignr $3, (%rsi), %xmm1 > - pcmpeqb (%rdi), %xmm1 > - > - movdqa 32(%rsi), %xmm3 > - palignr $3, %xmm2, %xmm3 > - pcmpeqb 16(%rdi), %xmm3 > - > - pand %xmm1, %xmm3 > - pmovmskb %xmm3, %edx > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - add $3, %rsi > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > - .p2align 4 > -L(shr_3_gobble): > - sub $32, %rcx > - movdqa 16(%rsi), %xmm0 > - palignr $3, (%rsi), %xmm0 > - pcmpeqb (%rdi), %xmm0 > - > - movdqa 32(%rsi), %xmm3 > - palignr $3, 16(%rsi), %xmm3 > - pcmpeqb 16(%rdi), %xmm3 > - > -L(shr_3_gobble_loop): > - pand %xmm0, %xmm3 > - sub $32, %rcx > - pmovmskb %xmm3, %edx > - movdqa %xmm0, %xmm1 > - > - movdqa 64(%rsi), %xmm3 > - palignr $3, 48(%rsi), %xmm3 > - sbb $0xffff, %edx > - movdqa 48(%rsi), %xmm0 > - palignr $3, 32(%rsi), %xmm0 > - pcmpeqb 32(%rdi), %xmm0 > - lea 32(%rsi), %rsi > - pcmpeqb 48(%rdi), %xmm3 > - > - lea 32(%rdi), %rdi > - jz L(shr_3_gobble_loop) > - pand %xmm0, %xmm3 > - > - cmp $0, %rcx > - jge L(shr_3_gobble_next) > - inc %edx > - add $32, %rcx > -L(shr_3_gobble_next): > - test %edx, %edx > - jnz L(exit) > - > - pmovmskb %xmm3, %edx > - movdqa %xmm0, %xmm1 > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - > - lea 3(%rsi), %rsi > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > -# endif > - > - .p2align 4 > -L(shr_4): > - cmp $80, %rcx > - lea -48(%rcx), %rcx > - mov %edx, %eax > - jae L(shr_4_gobble) > - > - movdqa 16(%rsi), %xmm1 > - movdqa %xmm1, %xmm2 > - palignr $4, (%rsi), %xmm1 > - pcmpeqb (%rdi), %xmm1 > - > - movdqa 32(%rsi), %xmm3 > - palignr $4, %xmm2, %xmm3 > - pcmpeqb 16(%rdi), %xmm3 > - > - pand %xmm1, %xmm3 > - pmovmskb %xmm3, %edx > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - add $4, %rsi > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > - .p2align 4 > -L(shr_4_gobble): > - sub $32, %rcx > - movdqa 16(%rsi), %xmm0 > - palignr $4, (%rsi), %xmm0 > - pcmpeqb (%rdi), %xmm0 > - > - movdqa 32(%rsi), %xmm3 > - palignr $4, 16(%rsi), %xmm3 > - pcmpeqb 16(%rdi), %xmm3 > - > -L(shr_4_gobble_loop): > - pand %xmm0, %xmm3 > - sub $32, %rcx > - pmovmskb %xmm3, %edx > - movdqa %xmm0, %xmm1 > - > - movdqa 64(%rsi), %xmm3 > - palignr $4, 48(%rsi), %xmm3 > - sbb $0xffff, %edx > - movdqa 48(%rsi), %xmm0 > - palignr $4, 32(%rsi), %xmm0 > - pcmpeqb 32(%rdi), %xmm0 > - lea 32(%rsi), %rsi > - pcmpeqb 48(%rdi), %xmm3 > - > - lea 32(%rdi), %rdi > - jz L(shr_4_gobble_loop) > - pand %xmm0, %xmm3 > - > - cmp $0, %rcx > - jge L(shr_4_gobble_next) > - inc %edx > - add $32, %rcx > -L(shr_4_gobble_next): > - test %edx, %edx > - jnz L(exit) > - > - pmovmskb %xmm3, %edx > - movdqa %xmm0, %xmm1 > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - > - lea 4(%rsi), %rsi > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > -# ifndef USE_AS_WMEMCMP > - > - .p2align 4 > -L(shr_5): > - cmp $80, %rcx > - lea -48(%rcx), %rcx > - mov %edx, %eax > - jae L(shr_5_gobble) > - > - movdqa 16(%rsi), %xmm1 > - movdqa %xmm1, %xmm2 > - palignr $5, (%rsi), %xmm1 > - pcmpeqb (%rdi), %xmm1 > - > - movdqa 32(%rsi), %xmm3 > - palignr $5, %xmm2, %xmm3 > - pcmpeqb 16(%rdi), %xmm3 > - > - pand %xmm1, %xmm3 > - pmovmskb %xmm3, %edx > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - add $5, %rsi > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > - .p2align 4 > -L(shr_5_gobble): > - sub $32, %rcx > - movdqa 16(%rsi), %xmm0 > - palignr $5, (%rsi), %xmm0 > - pcmpeqb (%rdi), %xmm0 > - > - movdqa 32(%rsi), %xmm3 > - palignr $5, 16(%rsi), %xmm3 > - pcmpeqb 16(%rdi), %xmm3 > - > -L(shr_5_gobble_loop): > - pand %xmm0, %xmm3 > - sub $32, %rcx > - pmovmskb %xmm3, %edx > - movdqa %xmm0, %xmm1 > - > - movdqa 64(%rsi), %xmm3 > - palignr $5, 48(%rsi), %xmm3 > - sbb $0xffff, %edx > - movdqa 48(%rsi), %xmm0 > - palignr $5, 32(%rsi), %xmm0 > - pcmpeqb 32(%rdi), %xmm0 > - lea 32(%rsi), %rsi > - pcmpeqb 48(%rdi), %xmm3 > - > - lea 32(%rdi), %rdi > - jz L(shr_5_gobble_loop) > - pand %xmm0, %xmm3 > - > - cmp $0, %rcx > - jge L(shr_5_gobble_next) > - inc %edx > - add $32, %rcx > -L(shr_5_gobble_next): > - test %edx, %edx > - jnz L(exit) > - > - pmovmskb %xmm3, %edx > - movdqa %xmm0, %xmm1 > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - > - lea 5(%rsi), %rsi > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > - .p2align 4 > -L(shr_6): > - cmp $80, %rcx > - lea -48(%rcx), %rcx > - mov %edx, %eax > - jae L(shr_6_gobble) > - > - movdqa 16(%rsi), %xmm1 > - movdqa %xmm1, %xmm2 > - palignr $6, (%rsi), %xmm1 > - pcmpeqb (%rdi), %xmm1 > - > - movdqa 32(%rsi), %xmm3 > - palignr $6, %xmm2, %xmm3 > - pcmpeqb 16(%rdi), %xmm3 > - > - pand %xmm1, %xmm3 > - pmovmskb %xmm3, %edx > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - add $6, %rsi > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > - .p2align 4 > -L(shr_6_gobble): > - sub $32, %rcx > - movdqa 16(%rsi), %xmm0 > - palignr $6, (%rsi), %xmm0 > - pcmpeqb (%rdi), %xmm0 > - > - movdqa 32(%rsi), %xmm3 > - palignr $6, 16(%rsi), %xmm3 > - pcmpeqb 16(%rdi), %xmm3 > - > -L(shr_6_gobble_loop): > - pand %xmm0, %xmm3 > - sub $32, %rcx > - pmovmskb %xmm3, %edx > - movdqa %xmm0, %xmm1 > - > - movdqa 64(%rsi), %xmm3 > - palignr $6, 48(%rsi), %xmm3 > - sbb $0xffff, %edx > - movdqa 48(%rsi), %xmm0 > - palignr $6, 32(%rsi), %xmm0 > - pcmpeqb 32(%rdi), %xmm0 > - lea 32(%rsi), %rsi > - pcmpeqb 48(%rdi), %xmm3 > - > - lea 32(%rdi), %rdi > - jz L(shr_6_gobble_loop) > - pand %xmm0, %xmm3 > - > - cmp $0, %rcx > - jge L(shr_6_gobble_next) > - inc %edx > - add $32, %rcx > -L(shr_6_gobble_next): > - test %edx, %edx > - jnz L(exit) > - > - pmovmskb %xmm3, %edx > - movdqa %xmm0, %xmm1 > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - > - lea 6(%rsi), %rsi > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > - .p2align 4 > -L(shr_7): > - cmp $80, %rcx > - lea -48(%rcx), %rcx > - mov %edx, %eax > - jae L(shr_7_gobble) > - > - movdqa 16(%rsi), %xmm1 > - movdqa %xmm1, %xmm2 > - palignr $7, (%rsi), %xmm1 > - pcmpeqb (%rdi), %xmm1 > - > - movdqa 32(%rsi), %xmm3 > - palignr $7, %xmm2, %xmm3 > - pcmpeqb 16(%rdi), %xmm3 > - > - pand %xmm1, %xmm3 > - pmovmskb %xmm3, %edx > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - add $7, %rsi > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > - .p2align 4 > -L(shr_7_gobble): > - sub $32, %rcx > - movdqa 16(%rsi), %xmm0 > - palignr $7, (%rsi), %xmm0 > - pcmpeqb (%rdi), %xmm0 > - > - movdqa 32(%rsi), %xmm3 > - palignr $7, 16(%rsi), %xmm3 > - pcmpeqb 16(%rdi), %xmm3 > - > -L(shr_7_gobble_loop): > - pand %xmm0, %xmm3 > - sub $32, %rcx > - pmovmskb %xmm3, %edx > - movdqa %xmm0, %xmm1 > - > - movdqa 64(%rsi), %xmm3 > - palignr $7, 48(%rsi), %xmm3 > - sbb $0xffff, %edx > - movdqa 48(%rsi), %xmm0 > - palignr $7, 32(%rsi), %xmm0 > - pcmpeqb 32(%rdi), %xmm0 > - lea 32(%rsi), %rsi > - pcmpeqb 48(%rdi), %xmm3 > - > - lea 32(%rdi), %rdi > - jz L(shr_7_gobble_loop) > - pand %xmm0, %xmm3 > - > - cmp $0, %rcx > - jge L(shr_7_gobble_next) > - inc %edx > - add $32, %rcx > -L(shr_7_gobble_next): > - test %edx, %edx > - jnz L(exit) > - > - pmovmskb %xmm3, %edx > - movdqa %xmm0, %xmm1 > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - > - lea 7(%rsi), %rsi > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > -# endif > - > - .p2align 4 > -L(shr_8): > - cmp $80, %rcx > - lea -48(%rcx), %rcx > - mov %edx, %eax > - jae L(shr_8_gobble) > - > - movdqa 16(%rsi), %xmm1 > - movdqa %xmm1, %xmm2 > - palignr $8, (%rsi), %xmm1 > - pcmpeqb (%rdi), %xmm1 > - > - movdqa 32(%rsi), %xmm3 > - palignr $8, %xmm2, %xmm3 > - pcmpeqb 16(%rdi), %xmm3 > - > - pand %xmm1, %xmm3 > - pmovmskb %xmm3, %edx > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - add $8, %rsi > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > - .p2align 4 > -L(shr_8_gobble): > - sub $32, %rcx > - movdqa 16(%rsi), %xmm0 > - palignr $8, (%rsi), %xmm0 > - pcmpeqb (%rdi), %xmm0 > - > - movdqa 32(%rsi), %xmm3 > - palignr $8, 16(%rsi), %xmm3 > - pcmpeqb 16(%rdi), %xmm3 > - > -L(shr_8_gobble_loop): > - pand %xmm0, %xmm3 > - sub $32, %rcx > - pmovmskb %xmm3, %edx > - movdqa %xmm0, %xmm1 > - > - movdqa 64(%rsi), %xmm3 > - palignr $8, 48(%rsi), %xmm3 > - sbb $0xffff, %edx > - movdqa 48(%rsi), %xmm0 > - palignr $8, 32(%rsi), %xmm0 > - pcmpeqb 32(%rdi), %xmm0 > - lea 32(%rsi), %rsi > - pcmpeqb 48(%rdi), %xmm3 > - > - lea 32(%rdi), %rdi > - jz L(shr_8_gobble_loop) > - pand %xmm0, %xmm3 > - > - cmp $0, %rcx > - jge L(shr_8_gobble_next) > - inc %edx > - add $32, %rcx > -L(shr_8_gobble_next): > - test %edx, %edx > - jnz L(exit) > - > - pmovmskb %xmm3, %edx > - movdqa %xmm0, %xmm1 > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - > - lea 8(%rsi), %rsi > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > -# ifndef USE_AS_WMEMCMP > - > - .p2align 4 > -L(shr_9): > - cmp $80, %rcx > - lea -48(%rcx), %rcx > - mov %edx, %eax > - jae L(shr_9_gobble) > - > - movdqa 16(%rsi), %xmm1 > - movdqa %xmm1, %xmm2 > - palignr $9, (%rsi), %xmm1 > - pcmpeqb (%rdi), %xmm1 > - > - movdqa 32(%rsi), %xmm3 > - palignr $9, %xmm2, %xmm3 > - pcmpeqb 16(%rdi), %xmm3 > - > - pand %xmm1, %xmm3 > - pmovmskb %xmm3, %edx > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - add $9, %rsi > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > - .p2align 4 > -L(shr_9_gobble): > - sub $32, %rcx > - movdqa 16(%rsi), %xmm0 > - palignr $9, (%rsi), %xmm0 > - pcmpeqb (%rdi), %xmm0 > - > - movdqa 32(%rsi), %xmm3 > - palignr $9, 16(%rsi), %xmm3 > - pcmpeqb 16(%rdi), %xmm3 > - > -L(shr_9_gobble_loop): > - pand %xmm0, %xmm3 > - sub $32, %rcx > - pmovmskb %xmm3, %edx > - movdqa %xmm0, %xmm1 > - > - movdqa 64(%rsi), %xmm3 > - palignr $9, 48(%rsi), %xmm3 > - sbb $0xffff, %edx > - movdqa 48(%rsi), %xmm0 > - palignr $9, 32(%rsi), %xmm0 > - pcmpeqb 32(%rdi), %xmm0 > - lea 32(%rsi), %rsi > - pcmpeqb 48(%rdi), %xmm3 > - > - lea 32(%rdi), %rdi > - jz L(shr_9_gobble_loop) > - pand %xmm0, %xmm3 > - > - cmp $0, %rcx > - jge L(shr_9_gobble_next) > - inc %edx > - add $32, %rcx > -L(shr_9_gobble_next): > - test %edx, %edx > - jnz L(exit) > - > - pmovmskb %xmm3, %edx > - movdqa %xmm0, %xmm1 > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - > - lea 9(%rsi), %rsi > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > - .p2align 4 > -L(shr_10): > - cmp $80, %rcx > - lea -48(%rcx), %rcx > - mov %edx, %eax > - jae L(shr_10_gobble) > - > - movdqa 16(%rsi), %xmm1 > - movdqa %xmm1, %xmm2 > - palignr $10, (%rsi), %xmm1 > - pcmpeqb (%rdi), %xmm1 > - > - movdqa 32(%rsi), %xmm3 > - palignr $10, %xmm2, %xmm3 > - pcmpeqb 16(%rdi), %xmm3 > - > - pand %xmm1, %xmm3 > - pmovmskb %xmm3, %edx > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - add $10, %rsi > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > - .p2align 4 > -L(shr_10_gobble): > - sub $32, %rcx > - movdqa 16(%rsi), %xmm0 > - palignr $10, (%rsi), %xmm0 > - pcmpeqb (%rdi), %xmm0 > - > - movdqa 32(%rsi), %xmm3 > - palignr $10, 16(%rsi), %xmm3 > - pcmpeqb 16(%rdi), %xmm3 > - > -L(shr_10_gobble_loop): > - pand %xmm0, %xmm3 > - sub $32, %rcx > - pmovmskb %xmm3, %edx > - movdqa %xmm0, %xmm1 > - > - movdqa 64(%rsi), %xmm3 > - palignr $10, 48(%rsi), %xmm3 > - sbb $0xffff, %edx > - movdqa 48(%rsi), %xmm0 > - palignr $10, 32(%rsi), %xmm0 > - pcmpeqb 32(%rdi), %xmm0 > - lea 32(%rsi), %rsi > - pcmpeqb 48(%rdi), %xmm3 > - > - lea 32(%rdi), %rdi > - jz L(shr_10_gobble_loop) > - pand %xmm0, %xmm3 > - > - cmp $0, %rcx > - jge L(shr_10_gobble_next) > - inc %edx > - add $32, %rcx > -L(shr_10_gobble_next): > - test %edx, %edx > - jnz L(exit) > - > - pmovmskb %xmm3, %edx > - movdqa %xmm0, %xmm1 > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - > - lea 10(%rsi), %rsi > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > - .p2align 4 > -L(shr_11): > - cmp $80, %rcx > - lea -48(%rcx), %rcx > - mov %edx, %eax > - jae L(shr_11_gobble) > - > - movdqa 16(%rsi), %xmm1 > - movdqa %xmm1, %xmm2 > - palignr $11, (%rsi), %xmm1 > - pcmpeqb (%rdi), %xmm1 > - > - movdqa 32(%rsi), %xmm3 > - palignr $11, %xmm2, %xmm3 > - pcmpeqb 16(%rdi), %xmm3 > - > - pand %xmm1, %xmm3 > - pmovmskb %xmm3, %edx > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - add $11, %rsi > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > - .p2align 4 > -L(shr_11_gobble): > - sub $32, %rcx > - movdqa 16(%rsi), %xmm0 > - palignr $11, (%rsi), %xmm0 > - pcmpeqb (%rdi), %xmm0 > - > - movdqa 32(%rsi), %xmm3 > - palignr $11, 16(%rsi), %xmm3 > - pcmpeqb 16(%rdi), %xmm3 > - > -L(shr_11_gobble_loop): > - pand %xmm0, %xmm3 > - sub $32, %rcx > - pmovmskb %xmm3, %edx > - movdqa %xmm0, %xmm1 > - > - movdqa 64(%rsi), %xmm3 > - palignr $11, 48(%rsi), %xmm3 > - sbb $0xffff, %edx > - movdqa 48(%rsi), %xmm0 > - palignr $11, 32(%rsi), %xmm0 > - pcmpeqb 32(%rdi), %xmm0 > - lea 32(%rsi), %rsi > - pcmpeqb 48(%rdi), %xmm3 > - > - lea 32(%rdi), %rdi > - jz L(shr_11_gobble_loop) > - pand %xmm0, %xmm3 > - > - cmp $0, %rcx > - jge L(shr_11_gobble_next) > - inc %edx > - add $32, %rcx > -L(shr_11_gobble_next): > - test %edx, %edx > - jnz L(exit) > - > - pmovmskb %xmm3, %edx > - movdqa %xmm0, %xmm1 > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - > - lea 11(%rsi), %rsi > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > -# endif > - > - .p2align 4 > -L(shr_12): > - cmp $80, %rcx > - lea -48(%rcx), %rcx > - mov %edx, %eax > - jae L(shr_12_gobble) > - > - movdqa 16(%rsi), %xmm1 > - movdqa %xmm1, %xmm2 > - palignr $12, (%rsi), %xmm1 > - pcmpeqb (%rdi), %xmm1 > - > - movdqa 32(%rsi), %xmm3 > - palignr $12, %xmm2, %xmm3 > - pcmpeqb 16(%rdi), %xmm3 > - > - pand %xmm1, %xmm3 > - pmovmskb %xmm3, %edx > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - add $12, %rsi > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > - .p2align 4 > -L(shr_12_gobble): > - sub $32, %rcx > - movdqa 16(%rsi), %xmm0 > - palignr $12, (%rsi), %xmm0 > - pcmpeqb (%rdi), %xmm0 > - > - movdqa 32(%rsi), %xmm3 > - palignr $12, 16(%rsi), %xmm3 > - pcmpeqb 16(%rdi), %xmm3 > - > -L(shr_12_gobble_loop): > - pand %xmm0, %xmm3 > - sub $32, %rcx > - pmovmskb %xmm3, %edx > - movdqa %xmm0, %xmm1 > - > - movdqa 64(%rsi), %xmm3 > - palignr $12, 48(%rsi), %xmm3 > - sbb $0xffff, %edx > - movdqa 48(%rsi), %xmm0 > - palignr $12, 32(%rsi), %xmm0 > - pcmpeqb 32(%rdi), %xmm0 > - lea 32(%rsi), %rsi > - pcmpeqb 48(%rdi), %xmm3 > - > - lea 32(%rdi), %rdi > - jz L(shr_12_gobble_loop) > - pand %xmm0, %xmm3 > - > - cmp $0, %rcx > - jge L(shr_12_gobble_next) > - inc %edx > - add $32, %rcx > -L(shr_12_gobble_next): > - test %edx, %edx > - jnz L(exit) > - > - pmovmskb %xmm3, %edx > - movdqa %xmm0, %xmm1 > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - > - lea 12(%rsi), %rsi > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > -# ifndef USE_AS_WMEMCMP > - > - .p2align 4 > -L(shr_13): > - cmp $80, %rcx > - lea -48(%rcx), %rcx > - mov %edx, %eax > - jae L(shr_13_gobble) > - > - movdqa 16(%rsi), %xmm1 > - movdqa %xmm1, %xmm2 > - palignr $13, (%rsi), %xmm1 > - pcmpeqb (%rdi), %xmm1 > - > - movdqa 32(%rsi), %xmm3 > - palignr $13, %xmm2, %xmm3 > - pcmpeqb 16(%rdi), %xmm3 > - > - pand %xmm1, %xmm3 > - pmovmskb %xmm3, %edx > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - add $13, %rsi > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > - .p2align 4 > -L(shr_13_gobble): > - sub $32, %rcx > - movdqa 16(%rsi), %xmm0 > - palignr $13, (%rsi), %xmm0 > - pcmpeqb (%rdi), %xmm0 > - > - movdqa 32(%rsi), %xmm3 > - palignr $13, 16(%rsi), %xmm3 > - pcmpeqb 16(%rdi), %xmm3 > - > -L(shr_13_gobble_loop): > - pand %xmm0, %xmm3 > - sub $32, %rcx > - pmovmskb %xmm3, %edx > - movdqa %xmm0, %xmm1 > - > - movdqa 64(%rsi), %xmm3 > - palignr $13, 48(%rsi), %xmm3 > - sbb $0xffff, %edx > - movdqa 48(%rsi), %xmm0 > - palignr $13, 32(%rsi), %xmm0 > - pcmpeqb 32(%rdi), %xmm0 > - lea 32(%rsi), %rsi > - pcmpeqb 48(%rdi), %xmm3 > - > - lea 32(%rdi), %rdi > - jz L(shr_13_gobble_loop) > - pand %xmm0, %xmm3 > - > - cmp $0, %rcx > - jge L(shr_13_gobble_next) > - inc %edx > - add $32, %rcx > -L(shr_13_gobble_next): > - test %edx, %edx > - jnz L(exit) > - > - pmovmskb %xmm3, %edx > - movdqa %xmm0, %xmm1 > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - > - lea 13(%rsi), %rsi > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > - .p2align 4 > -L(shr_14): > - cmp $80, %rcx > - lea -48(%rcx), %rcx > - mov %edx, %eax > - jae L(shr_14_gobble) > - > - movdqa 16(%rsi), %xmm1 > - movdqa %xmm1, %xmm2 > - palignr $14, (%rsi), %xmm1 > - pcmpeqb (%rdi), %xmm1 > - > - movdqa 32(%rsi), %xmm3 > - palignr $14, %xmm2, %xmm3 > - pcmpeqb 16(%rdi), %xmm3 > - > - pand %xmm1, %xmm3 > - pmovmskb %xmm3, %edx > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - add $14, %rsi > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > - .p2align 4 > -L(shr_14_gobble): > - sub $32, %rcx > - movdqa 16(%rsi), %xmm0 > - palignr $14, (%rsi), %xmm0 > - pcmpeqb (%rdi), %xmm0 > - > - movdqa 32(%rsi), %xmm3 > - palignr $14, 16(%rsi), %xmm3 > - pcmpeqb 16(%rdi), %xmm3 > - > -L(shr_14_gobble_loop): > - pand %xmm0, %xmm3 > - sub $32, %rcx > - pmovmskb %xmm3, %edx > - movdqa %xmm0, %xmm1 > - > - movdqa 64(%rsi), %xmm3 > - palignr $14, 48(%rsi), %xmm3 > - sbb $0xffff, %edx > - movdqa 48(%rsi), %xmm0 > - palignr $14, 32(%rsi), %xmm0 > - pcmpeqb 32(%rdi), %xmm0 > - lea 32(%rsi), %rsi > - pcmpeqb 48(%rdi), %xmm3 > - > - lea 32(%rdi), %rdi > - jz L(shr_14_gobble_loop) > - pand %xmm0, %xmm3 > - > - cmp $0, %rcx > - jge L(shr_14_gobble_next) > - inc %edx > - add $32, %rcx > -L(shr_14_gobble_next): > - test %edx, %edx > - jnz L(exit) > - > - pmovmskb %xmm3, %edx > - movdqa %xmm0, %xmm1 > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - > - lea 14(%rsi), %rsi > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > - .p2align 4 > -L(shr_15): > - cmp $80, %rcx > - lea -48(%rcx), %rcx > - mov %edx, %eax > - jae L(shr_15_gobble) > - > - movdqa 16(%rsi), %xmm1 > - movdqa %xmm1, %xmm2 > - palignr $15, (%rsi), %xmm1 > - pcmpeqb (%rdi), %xmm1 > - > - movdqa 32(%rsi), %xmm3 > - palignr $15, %xmm2, %xmm3 > - pcmpeqb 16(%rdi), %xmm3 > - > - pand %xmm1, %xmm3 > - pmovmskb %xmm3, %edx > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - add $15, %rsi > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > - .p2align 4 > -L(shr_15_gobble): > - sub $32, %rcx > - movdqa 16(%rsi), %xmm0 > - palignr $15, (%rsi), %xmm0 > - pcmpeqb (%rdi), %xmm0 > - > - movdqa 32(%rsi), %xmm3 > - palignr $15, 16(%rsi), %xmm3 > - pcmpeqb 16(%rdi), %xmm3 > - > -L(shr_15_gobble_loop): > - pand %xmm0, %xmm3 > - sub $32, %rcx > - pmovmskb %xmm3, %edx > - movdqa %xmm0, %xmm1 > - > - movdqa 64(%rsi), %xmm3 > - palignr $15, 48(%rsi), %xmm3 > - sbb $0xffff, %edx > - movdqa 48(%rsi), %xmm0 > - palignr $15, 32(%rsi), %xmm0 > - pcmpeqb 32(%rdi), %xmm0 > - lea 32(%rsi), %rsi > - pcmpeqb 48(%rdi), %xmm3 > - > - lea 32(%rdi), %rdi > - jz L(shr_15_gobble_loop) > - pand %xmm0, %xmm3 > - > - cmp $0, %rcx > - jge L(shr_15_gobble_next) > - inc %edx > - add $32, %rcx > -L(shr_15_gobble_next): > - test %edx, %edx > - jnz L(exit) > - > - pmovmskb %xmm3, %edx > - movdqa %xmm0, %xmm1 > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - > - lea 15(%rsi), %rsi > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > -# endif > - .p2align 4 > -L(exit): > - pmovmskb %xmm1, %r8d > - sub $0xffff, %r8d > - jz L(first16bytes) > - lea -16(%rsi), %rsi > - lea -16(%rdi), %rdi > - mov %r8d, %edx > -L(first16bytes): > - add %rax, %rsi > -L(less16bytes): > -# ifndef USE_AS_WMEMCMP > - test %dl, %dl > - jz L(next_24_bytes) > - > - test $0x01, %dl > - jnz L(Byte16) > - > - test $0x02, %dl > - jnz L(Byte17) > - > - test $0x04, %dl > - jnz L(Byte18) > - > - test $0x08, %dl > - jnz L(Byte19) > - > - test $0x10, %dl > - jnz L(Byte20) > - > - test $0x20, %dl > - jnz L(Byte21) > - > - test $0x40, %dl > - jnz L(Byte22) > - > - movzbl -9(%rdi), %eax > - movzbl -9(%rsi), %edx > - sub %edx, %eax > - ret > - > - .p2align 4 > -L(Byte16): > - movzbl -16(%rdi), %eax > - movzbl -16(%rsi), %edx > - sub %edx, %eax > - ret > - > - .p2align 4 > -L(Byte17): > - movzbl -15(%rdi), %eax > - movzbl -15(%rsi), %edx > - sub %edx, %eax > - ret > - > - .p2align 4 > -L(Byte18): > - movzbl -14(%rdi), %eax > - movzbl -14(%rsi), %edx > - sub %edx, %eax > - ret > - > - .p2align 4 > -L(Byte19): > - movzbl -13(%rdi), %eax > - movzbl -13(%rsi), %edx > - sub %edx, %eax > - ret > - > - .p2align 4 > -L(Byte20): > - movzbl -12(%rdi), %eax > - movzbl -12(%rsi), %edx > - sub %edx, %eax > - ret > - > - .p2align 4 > -L(Byte21): > - movzbl -11(%rdi), %eax > - movzbl -11(%rsi), %edx > - sub %edx, %eax > - ret > - > - .p2align 4 > -L(Byte22): > - movzbl -10(%rdi), %eax > - movzbl -10(%rsi), %edx > - sub %edx, %eax > - ret > - > - .p2align 4 > -L(next_24_bytes): > - lea 8(%rdi), %rdi > - lea 8(%rsi), %rsi > - test $0x01, %dh > - jnz L(Byte16) > - > - test $0x02, %dh > - jnz L(Byte17) > - > - test $0x04, %dh > - jnz L(Byte18) > - > - test $0x08, %dh > - jnz L(Byte19) > - > - test $0x10, %dh > - jnz L(Byte20) > - > - test $0x20, %dh > - jnz L(Byte21) > - > - test $0x40, %dh > - jnz L(Byte22) > - > - movzbl -9(%rdi), %eax > - movzbl -9(%rsi), %edx > - sub %edx, %eax > - ret > -# else > -/* special for wmemcmp */ > - xor %eax, %eax > - test %dl, %dl > - jz L(next_two_double_words) > - and $15, %dl > - jz L(second_double_word) > - mov -16(%rdi), %eax > - cmp -16(%rsi), %eax > - jne L(find_diff) > - ret > - > - .p2align 4 > -L(second_double_word): > - mov -12(%rdi), %eax > - cmp -12(%rsi), %eax > - jne L(find_diff) > - ret > - > - .p2align 4 > -L(next_two_double_words): > - and $15, %dh > - jz L(fourth_double_word) > - mov -8(%rdi), %eax > - cmp -8(%rsi), %eax > - jne L(find_diff) > - ret > - > - .p2align 4 > -L(fourth_double_word): > - mov -4(%rdi), %eax > - cmp -4(%rsi), %eax > - jne L(find_diff) > - ret > -# endif > - > - .p2align 4 > -L(less48bytes): > - cmp $8, %ecx > - jae L(more8bytes) > - cmp $0, %ecx > - je L(0bytes) > -# ifndef USE_AS_WMEMCMP > - cmp $1, %ecx > - je L(1bytes) > - cmp $2, %ecx > - je L(2bytes) > - cmp $3, %ecx > - je L(3bytes) > - cmp $4, %ecx > - je L(4bytes) > - cmp $5, %ecx > - je L(5bytes) > - cmp $6, %ecx > - je L(6bytes) > - jmp L(7bytes) > -# else > - jmp L(4bytes) > -# endif > - > - .p2align 4 > -L(more8bytes): > - cmp $16, %ecx > - jae L(more16bytes) > - cmp $8, %ecx > - je L(8bytes) > -# ifndef USE_AS_WMEMCMP > - cmp $9, %ecx > - je L(9bytes) > - cmp $10, %ecx > - je L(10bytes) > - cmp $11, %ecx > - je L(11bytes) > - cmp $12, %ecx > - je L(12bytes) > - cmp $13, %ecx > - je L(13bytes) > - cmp $14, %ecx > - je L(14bytes) > - jmp L(15bytes) > -# else > - jmp L(12bytes) > -# endif > - > - .p2align 4 > -L(more16bytes): > - cmp $24, %ecx > - jae L(more24bytes) > - cmp $16, %ecx > - je L(16bytes) > -# ifndef USE_AS_WMEMCMP > - cmp $17, %ecx > - je L(17bytes) > - cmp $18, %ecx > - je L(18bytes) > - cmp $19, %ecx > - je L(19bytes) > - cmp $20, %ecx > - je L(20bytes) > - cmp $21, %ecx > - je L(21bytes) > - cmp $22, %ecx > - je L(22bytes) > - jmp L(23bytes) > -# else > - jmp L(20bytes) > -# endif > - > - .p2align 4 > -L(more24bytes): > - cmp $32, %ecx > - jae L(more32bytes) > - cmp $24, %ecx > - je L(24bytes) > -# ifndef USE_AS_WMEMCMP > - cmp $25, %ecx > - je L(25bytes) > - cmp $26, %ecx > - je L(26bytes) > - cmp $27, %ecx > - je L(27bytes) > - cmp $28, %ecx > - je L(28bytes) > - cmp $29, %ecx > - je L(29bytes) > - cmp $30, %ecx > - je L(30bytes) > - jmp L(31bytes) > -# else > - jmp L(28bytes) > -# endif > - > - .p2align 4 > -L(more32bytes): > - cmp $40, %ecx > - jae L(more40bytes) > - cmp $32, %ecx > - je L(32bytes) > -# ifndef USE_AS_WMEMCMP > - cmp $33, %ecx > - je L(33bytes) > - cmp $34, %ecx > - je L(34bytes) > - cmp $35, %ecx > - je L(35bytes) > - cmp $36, %ecx > - je L(36bytes) > - cmp $37, %ecx > - je L(37bytes) > - cmp $38, %ecx > - je L(38bytes) > - jmp L(39bytes) > -# else > - jmp L(36bytes) > -# endif > - > - .p2align 4 > -L(more40bytes): > - cmp $40, %ecx > - je L(40bytes) > -# ifndef USE_AS_WMEMCMP > - cmp $41, %ecx > - je L(41bytes) > - cmp $42, %ecx > - je L(42bytes) > - cmp $43, %ecx > - je L(43bytes) > - cmp $44, %ecx > - je L(44bytes) > - cmp $45, %ecx > - je L(45bytes) > - cmp $46, %ecx > - je L(46bytes) > - jmp L(47bytes) > - > - .p2align 4 > -L(44bytes): > - movl -44(%rdi), %eax > - movl -44(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(40bytes): > - movl -40(%rdi), %eax > - movl -40(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(36bytes): > - movl -36(%rdi), %eax > - movl -36(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(32bytes): > - movl -32(%rdi), %eax > - movl -32(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(28bytes): > - movl -28(%rdi), %eax > - movl -28(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(24bytes): > - movl -24(%rdi), %eax > - movl -24(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(20bytes): > - movl -20(%rdi), %eax > - movl -20(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(16bytes): > - movl -16(%rdi), %eax > - movl -16(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(12bytes): > - movl -12(%rdi), %eax > - movl -12(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(8bytes): > - movl -8(%rdi), %eax > - movl -8(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(4bytes): > - movl -4(%rdi), %eax > - movl -4(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(0bytes): > - xor %eax, %eax > - ret > -# else > - .p2align 4 > -L(44bytes): > - movl -44(%rdi), %eax > - cmp -44(%rsi), %eax > - jne L(find_diff) > -L(40bytes): > - movl -40(%rdi), %eax > - cmp -40(%rsi), %eax > - jne L(find_diff) > -L(36bytes): > - movl -36(%rdi), %eax > - cmp -36(%rsi), %eax > - jne L(find_diff) > -L(32bytes): > - movl -32(%rdi), %eax > - cmp -32(%rsi), %eax > - jne L(find_diff) > -L(28bytes): > - movl -28(%rdi), %eax > - cmp -28(%rsi), %eax > - jne L(find_diff) > -L(24bytes): > - movl -24(%rdi), %eax > - cmp -24(%rsi), %eax > - jne L(find_diff) > -L(20bytes): > - movl -20(%rdi), %eax > - cmp -20(%rsi), %eax > - jne L(find_diff) > -L(16bytes): > - movl -16(%rdi), %eax > - cmp -16(%rsi), %eax > - jne L(find_diff) > -L(12bytes): > - movl -12(%rdi), %eax > - cmp -12(%rsi), %eax > - jne L(find_diff) > -L(8bytes): > - movl -8(%rdi), %eax > - cmp -8(%rsi), %eax > - jne L(find_diff) > -L(4bytes): > - movl -4(%rdi), %eax > - cmp -4(%rsi), %eax > - jne L(find_diff) > -L(0bytes): > - xor %eax, %eax > - ret > -# endif > - > -# ifndef USE_AS_WMEMCMP > - .p2align 4 > -L(45bytes): > - movl -45(%rdi), %eax > - movl -45(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(41bytes): > - movl -41(%rdi), %eax > - movl -41(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(37bytes): > - movl -37(%rdi), %eax > - movl -37(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(33bytes): > - movl -33(%rdi), %eax > - movl -33(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(29bytes): > - movl -29(%rdi), %eax > - movl -29(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(25bytes): > - movl -25(%rdi), %eax > - movl -25(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(21bytes): > - movl -21(%rdi), %eax > - movl -21(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(17bytes): > - movl -17(%rdi), %eax > - movl -17(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(13bytes): > - movl -13(%rdi), %eax > - movl -13(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(9bytes): > - movl -9(%rdi), %eax > - movl -9(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(5bytes): > - movl -5(%rdi), %eax > - movl -5(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(1bytes): > - movzbl -1(%rdi), %eax > - cmpb -1(%rsi), %al > - jne L(set) > - xor %eax, %eax > - ret > - > - .p2align 4 > -L(46bytes): > - movl -46(%rdi), %eax > - movl -46(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(42bytes): > - movl -42(%rdi), %eax > - movl -42(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(38bytes): > - movl -38(%rdi), %eax > - movl -38(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(34bytes): > - movl -34(%rdi), %eax > - movl -34(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(30bytes): > - movl -30(%rdi), %eax > - movl -30(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(26bytes): > - movl -26(%rdi), %eax > - movl -26(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(22bytes): > - movl -22(%rdi), %eax > - movl -22(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(18bytes): > - movl -18(%rdi), %eax > - movl -18(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(14bytes): > - movl -14(%rdi), %eax > - movl -14(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(10bytes): > - movl -10(%rdi), %eax > - movl -10(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(6bytes): > - movl -6(%rdi), %eax > - movl -6(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(2bytes): > - movzwl -2(%rdi), %eax > - movzwl -2(%rsi), %ecx > - cmpb %cl, %al > - jne L(set) > - cmp %ecx, %eax > - jne L(set) > - xor %eax, %eax > - ret > - > - .p2align 4 > -L(47bytes): > - movl -47(%rdi), %eax > - movl -47(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(43bytes): > - movl -43(%rdi), %eax > - movl -43(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(39bytes): > - movl -39(%rdi), %eax > - movl -39(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(35bytes): > - movl -35(%rdi), %eax > - movl -35(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(31bytes): > - movl -31(%rdi), %eax > - movl -31(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(27bytes): > - movl -27(%rdi), %eax > - movl -27(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(23bytes): > - movl -23(%rdi), %eax > - movl -23(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(19bytes): > - movl -19(%rdi), %eax > - movl -19(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(15bytes): > - movl -15(%rdi), %eax > - movl -15(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(11bytes): > - movl -11(%rdi), %eax > - movl -11(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(7bytes): > - movl -7(%rdi), %eax > - movl -7(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(3bytes): > - movzwl -3(%rdi), %eax > - movzwl -3(%rsi), %ecx > - cmpb %cl, %al > - jne L(set) > - cmp %ecx, %eax > - jne L(set) > - movzbl -1(%rdi), %eax > - cmpb -1(%rsi), %al > - jne L(set) > - xor %eax, %eax > - ret > - > - .p2align 4 > -L(find_diff): > - cmpb %cl, %al > - jne L(set) > - cmpw %cx, %ax > - jne L(set) > - shr $16, %eax > - shr $16, %ecx > - cmpb %cl, %al > - jne L(set) > - > -/* We get there only if we already know there is a > -difference. */ > - > - cmp %ecx, %eax > -L(set): > - sbb %eax, %eax > - sbb $-1, %eax > - ret > -# else > - > -/* for wmemcmp */ > - .p2align 4 > -L(find_diff): > - mov $1, %eax > - jg L(find_diff_bigger) > - neg %eax > - ret > - > - .p2align 4 > -L(find_diff_bigger): > - ret > -# endif > - > - .p2align 4 > -L(equal): > - xor %eax, %eax > - ret > - > -END (MEMCMP) > -#endif > diff --git a/sysdeps/x86_64/multiarch/wmemcmp-ssse3.S b/sysdeps/x86_64/multiarch/wmemcmp-ssse3.S > deleted file mode 100644 > index a41ef95fc1..0000000000 > --- a/sysdeps/x86_64/multiarch/wmemcmp-ssse3.S > +++ /dev/null > @@ -1,4 +0,0 @@ > -#define USE_AS_WMEMCMP 1 > -#define MEMCMP __wmemcmp_ssse3 > - > -#include "memcmp-ssse3.S" > -- > 2.25.1 > LGTM. Reviewed-by: H.J. Lu Thanks. -- H.J.